ppocr-lite 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ppocr_lite/__init__.py +19 -0
- ppocr_lite/classification.py +64 -0
- ppocr_lite/detection.py +417 -0
- ppocr_lite/engine.py +286 -0
- ppocr_lite/models.py +133 -0
- ppocr_lite/recognition.py +112 -0
- ppocr_lite/utils.py +108 -0
- ppocr_lite-0.1.0.dist-info/METADATA +129 -0
- ppocr_lite-0.1.0.dist-info/RECORD +12 -0
- ppocr_lite-0.1.0.dist-info/WHEEL +5 -0
- ppocr_lite-0.1.0.dist-info/licenses/LICENSE.md +617 -0
- ppocr_lite-0.1.0.dist-info/top_level.txt +1 -0
ppocr_lite/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""ppocr_lite – a lightweight PP-OCR runtime built on onnxruntime + numpy + PIL.
|
|
2
|
+
|
|
3
|
+
No OpenCV, no deep-learning framework required.
|
|
4
|
+
|
|
5
|
+
Typical usage::
|
|
6
|
+
|
|
7
|
+
from ppocr_lite import PPOCRLite
|
|
8
|
+
|
|
9
|
+
ocr = PPOCRLite() # auto-downloads models on first run
|
|
10
|
+
results = ocr("screenshot.png")
|
|
11
|
+
for text, score, box in results:
|
|
12
|
+
print(f"{score:.2f} {text} {box}")
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from .engine import PPOCRLite
|
|
16
|
+
from .models import ModelConfig
|
|
17
|
+
|
|
18
|
+
__all__ = ["PPOCRLite", "ModelConfig"]
|
|
19
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Text direction classifier (0° vs 180°).
|
|
2
|
+
|
|
3
|
+
Used to flip upside-down text regions before recognition.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import math
|
|
9
|
+
from typing import List, Tuple
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from PIL import Image
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ClsPreProcess:
    """Prepare text crops for the direction classifier.

    Each crop is resized to height 48 (aspect-preserving, width capped at
    192), normalised to [-1, 1], converted to CHW, and right-padded with
    zeros to a fixed (3, 48, 192) tensor.

    Normalisation: (x/255 − 0.5)/0.5 ≡ x × (1/127.5) − 1
    """

    HEIGHT = 48
    WIDTH = 192

    def __call__(self, imgs: List[np.ndarray]) -> np.ndarray:
        """Stack preprocessed crops into an (N, 3, 48, 192) float32 batch."""
        samples = [self._process(crop) for crop in imgs]
        return np.stack(samples, axis=0).astype(np.float32)  # (N, 3, 48, W)

    def _process(self, img: np.ndarray) -> np.ndarray:
        """Resize one H×W×3 crop, normalise, transpose, and pad the width."""
        h, w = img.shape[:2]
        # Preserve aspect ratio, but never exceed the model's input width.
        new_w = min(self.WIDTH, int(math.ceil(self.HEIGHT * w / h)))
        resized = Image.fromarray(img).resize((new_w, self.HEIGHT), Image.BILINEAR)
        # Fused normalise: (x/255 − 0.5)/0.5 == x/127.5 − 1
        scaled = np.asarray(resized, dtype=np.float32) * (1.0 / 127.5) - 1.0
        chw = scaled.transpose(2, 0, 1)  # HWC → CHW
        canvas = np.zeros((3, self.HEIGHT, self.WIDTH), dtype=np.float32)
        canvas[:, :, :new_w] = chw
        return canvas
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def apply_cls(
    imgs: List[np.ndarray],
    preds: np.ndarray,
    thresh: float = 0.9,
) -> List[np.ndarray]:
    """Rotate images predicted as 180° back to upright.

    Parameters
    ----------
    imgs:
        Cropped text-region images (H×W×3 uint8).
    preds:
        Classifier output, shape (N, 2). Class 0 = 0°, class 1 = 180°.
    thresh:
        Confidence threshold; below this the image is left unchanged.
    """
    upright: List[np.ndarray] = []
    for crop, prob in zip(imgs, preds):
        cls_idx = int(prob.argmax())
        confidence = float(prob[cls_idx])
        # Only flip when the classifier is confident the crop is upside-down.
        flip = cls_idx == 1 and confidence >= thresh
        upright.append(crop[::-1, ::-1, :] if flip else crop)
    return upright
|
ppocr_lite/detection.py
ADDED
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import List, Tuple
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from PIL import Image
|
|
7
|
+
|
|
8
|
+
from ppocr_lite.utils import log_perf
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# ---------------------------------------------------------------------------
|
|
12
|
+
# Pre-processing
|
|
13
|
+
# ---------------------------------------------------------------------------
|
|
14
|
+
|
|
15
|
+
class DetPreProcess:
    """Resize → normalize → NCHW float32 batch for the DB detector.

    Normalisation: (x/255 − 0.5) / 0.5 ≡ x × (1/127.5) − 1

    The result is made C-contiguous here so FastONNXRunner never needs
    an extra copy.
    """

    def __init__(self, limit_side_len: int = 960, limit_type: str = "max") -> None:
        # limit_type "max": shrink so the longest side fits limit_side_len.
        # limit_type "min": grow so the shortest side reaches limit_side_len.
        self.limit_side_len = limit_side_len
        self.limit_type = limit_type

    def __call__(self, img: np.ndarray) -> np.ndarray:
        """Preprocess an H×W×3 uint8 image into a (1, 3, H', W') float32 batch."""
        h, w = img.shape[:2]
        ratio = self._ratio(h, w)
        # Snap both sides to multiples of 32 (the DB backbone stride).
        # Clamp to at least 32: plain rounding would otherwise yield a
        # zero-sized target (and a crash) for very small inputs where
        # side * ratio < 16.
        new_h = max(32, int(round(h * ratio / 32) * 32))
        new_w = max(32, int(round(w * ratio / 32) * 32))
        if new_h <= 0 or new_w <= 0:  # defensive; unreachable after clamping
            raise ValueError(f"Invalid resize target ({new_w}×{new_h})")

        pil = Image.fromarray(img).resize((new_w, new_h), Image.BILINEAR)
        # Fused normalise: (x/255 − 0.5)/0.5 == x/127.5 − 1
        arr = np.asarray(pil, dtype=np.float32) * (1.0 / 127.5) - 1.0
        # Transpose HWC→CHW, add batch dim, and force C-contiguous in one shot
        return np.ascontiguousarray(arr.transpose(2, 0, 1)[np.newaxis])

    def _ratio(self, h: int, w: int) -> float:
        """Scale factor honouring limit_side_len; 1.0 when already within limits."""
        lim = self.limit_side_len
        side = min(h, w) if self.limit_type == "min" else max(h, w)
        if self.limit_type == "min" and side < lim:
            return lim / side
        if self.limit_type == "max" and side > lim:
            return lim / side
        return 1.0
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _auto_limit(max_side: int) -> int:
|
|
53
|
+
"""Longest-side cap for the DB detector input (used with limit_type='max').
|
|
54
|
+
|
|
55
|
+
Keeps detector tensors at a manageable size without upscaling.
|
|
56
|
+
The engine chooses limit_type='min' separately for small images.
|
|
57
|
+
"""
|
|
58
|
+
if max_side <= 1280:
|
|
59
|
+
return 1280
|
|
60
|
+
if max_side <= 1920:
|
|
61
|
+
return 1920
|
|
62
|
+
return 2560
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
# Post-processing (DB – Differentiable Binarisation)
|
|
67
|
+
# ---------------------------------------------------------------------------
|
|
68
|
+
|
|
69
|
+
class DBPostProcess:
    """Convert the DB probability map into oriented quad boxes.

    Fast path optimized for screenshot / UI text:
    - connected-component slices instead of repeated full-image scans
    - axis-aligned bounding rectangles (O(N) min/max instead of PCA)
    - optional SciPy acceleration if available
    """

    def __init__(
        self,
        thresh: float = 0.3,
        box_thresh: float = 0.5,
        max_candidates: int = 1000,
        unclip_ratio: float = 1.6,
        min_size: int = 3,
        max_points_for_box: int = 512,
    ) -> None:
        # thresh: binarisation threshold applied to the probability map.
        self.thresh = thresh
        # box_thresh: minimum mean probability inside a candidate box.
        self.box_thresh = box_thresh
        # max_candidates: cap on the number of components considered.
        self.max_candidates = max_candidates
        # unclip_ratio: outward expansion factor passed to _unclip_quad.
        self.unclip_ratio = unclip_ratio
        # min_size: minimum short side (in probability-map pixels) to keep.
        self.min_size = min_size
        # max_points_for_box: subsample cap before fitting the rectangle.
        self.max_points_for_box = max_points_for_box

    def __call__(
        self,
        pred: np.ndarray,  # (1, 1, H, W) float32
        orig_shape: Tuple[int, int],  # (orig_h, orig_w)
    ) -> Tuple[np.ndarray, List[float]]:
        """Return (boxes, scores): int32 quads (N, 4, 2) in original-image
        coordinates, plus the mean probability per box."""
        prob = pred[0, 0]  # (H, W)
        mask = prob > self.thresh

        # slight dilation equivalent: max-pool with 2×2 kernel
        mask = _dilate2x2_bool(mask)

        orig_h, orig_w = orig_shape
        map_h, map_w = mask.shape

        components = _find_components(mask)

        boxes: List[np.ndarray] = []
        scores: List[float] = []

        for comp in components[:self.max_candidates]:
            pts = comp["pts"]  # global coords (N, 2) float32
            if pts.shape[0] < 4:
                continue

            # Subsample big components: the bounding rect of a stride-k
            # subset is a close approximation and far cheaper to compute.
            if pts.shape[0] > self.max_points_for_box:
                step = max(1, pts.shape[0] // self.max_points_for_box)
                pts = pts[::step]

            # Axis-aligned bounding rect: exact for horizontal screen text and
            # ~5× faster than PCA (no eigendecomposition).
            # Swap to _pca_rect_quad if you need rotated-text support.
            box, sside = _axis_aligned_rect(pts)
            if sside < self.min_size:
                continue

            # Mean probability over the box rejects weak/noisy detections.
            score = _box_score_fast(prob, box)
            if score < self.box_thresh:
                continue

            # Expand the quad outward, then re-fit and re-check its size.
            box = _unclip_quad(box, self.unclip_ratio)
            box, sside = _axis_aligned_rect(box)
            if sside < self.min_size + 2:
                continue

            # scale back to original image coordinates
            box[:, 0] = np.clip(np.round(box[:, 0] / map_w * orig_w), 0, orig_w - 1)
            box[:, 1] = np.clip(np.round(box[:, 1] / map_h * orig_h), 0, orig_h - 1)

            boxes.append(box.astype(np.int32))
            scores.append(float(score))

        if not boxes:
            return np.empty((0, 4, 2), dtype=np.int32), []

        boxes_arr = np.stack(boxes, axis=0)
        boxes_arr, scores = _filter_boxes(boxes_arr, scores, orig_h, orig_w)
        return boxes_arr, scores
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# ---------------------------------------------------------------------------
|
|
154
|
+
# Pure-numpy image processing primitives (cv2 replacements)
|
|
155
|
+
# ---------------------------------------------------------------------------
|
|
156
|
+
|
|
157
|
+
def _dilate2x2_bool(mask: np.ndarray) -> np.ndarray:
|
|
158
|
+
"""Equivalent of cv2.dilate with a 2×2 all-ones kernel."""
|
|
159
|
+
out = mask.copy()
|
|
160
|
+
out[:-1, :] |= mask[1:, :]
|
|
161
|
+
out[:, :-1] |= mask[:, 1:]
|
|
162
|
+
out[:-1, :-1] |= mask[1:, 1:]
|
|
163
|
+
return out
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
try:
|
|
167
|
+
import scipy.ndimage as scipy_ndimage
|
|
168
|
+
except ImportError:
|
|
169
|
+
scipy_ndimage = None
|
|
170
|
+
|
|
171
|
+
def _find_components(mask: np.ndarray):
    """Return connected components with point clouds in global coords."""
    if scipy_ndimage is None:
        # Pure-numpy fallback (coarse: labels on a downscaled mask).
        with log_perf("_label_numpy"):
            labeled, n = _label_numpy(mask)
        with log_perf("_find_objects_numpy"):
            objs = _find_objects_numpy(labeled, n)
    else:
        # SciPy does labeling + slice extraction in C.
        with log_perf("scipy_ndimage_label.nd_label"):
            labeled, n = scipy_ndimage.label(mask)
        with log_perf("scipy_ndimage_label.find_objects"):
            objs = scipy_ndimage.find_objects(labeled)
    return _components_from_labeled(labeled, objs, n)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _components_from_labeled(labeled: np.ndarray, objs, n: int):
|
|
188
|
+
comps = []
|
|
189
|
+
for i in range(1, n + 1):
|
|
190
|
+
sl = objs[i - 1] if i - 1 < len(objs) else None
|
|
191
|
+
if sl is None:
|
|
192
|
+
continue
|
|
193
|
+
ysl, xsl = sl
|
|
194
|
+
roi = labeled[ysl, xsl] == i
|
|
195
|
+
if roi.sum() < 4:
|
|
196
|
+
continue
|
|
197
|
+
ys, xs = np.nonzero(roi)
|
|
198
|
+
xs = xs + xsl.start
|
|
199
|
+
ys = ys + ysl.start
|
|
200
|
+
pts = np.empty((xs.size, 2), dtype=np.float32)
|
|
201
|
+
pts[:, 0] = xs
|
|
202
|
+
pts[:, 1] = ys
|
|
203
|
+
comps.append({"label": i, "slice": sl, "pts": pts})
|
|
204
|
+
return comps
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _find_objects_numpy(labeled: np.ndarray, n: int):
|
|
208
|
+
"""Cheap find_objects replacement for fallback labeling."""
|
|
209
|
+
objs = [None] * n
|
|
210
|
+
if n == 0:
|
|
211
|
+
return objs
|
|
212
|
+
ys, xs = np.nonzero(labeled)
|
|
213
|
+
vals = labeled[ys, xs]
|
|
214
|
+
mins_y = np.full(n + 1, labeled.shape[0], dtype=np.int32)
|
|
215
|
+
mins_x = np.full(n + 1, labeled.shape[1], dtype=np.int32)
|
|
216
|
+
maxs_y = np.full(n + 1, -1, dtype=np.int32)
|
|
217
|
+
maxs_x = np.full(n + 1, -1, dtype=np.int32)
|
|
218
|
+
np.minimum.at(mins_y, vals, ys)
|
|
219
|
+
np.minimum.at(mins_x, vals, xs)
|
|
220
|
+
np.maximum.at(maxs_y, vals, ys)
|
|
221
|
+
np.maximum.at(maxs_x, vals, xs)
|
|
222
|
+
for i in range(1, n + 1):
|
|
223
|
+
if maxs_y[i] >= 0:
|
|
224
|
+
objs[i - 1] = (
|
|
225
|
+
slice(int(mins_y[i]), int(maxs_y[i]) + 1),
|
|
226
|
+
slice(int(mins_x[i]), int(maxs_x[i]) + 1),
|
|
227
|
+
)
|
|
228
|
+
return objs
|
|
229
|
+
|
|
230
|
+
def downscale(mask, factor=2):
    """Downscale a 2D binary mask by integer factor using block reduction."""
    h, w = mask.shape
    bh, bw = h // factor, w // factor
    # Crop to an exact multiple of factor, view as blocks, and OR-reduce
    # each block (any set pixel in a block sets the output pixel).
    blocks = mask[: bh * factor, : bw * factor].reshape(bh, factor, bw, factor)
    return blocks.max(axis=(1, 3))
|
|
238
|
+
|
|
239
|
+
def upscale(labels_small, factor=2, original_shape=None):
    """Upscale labels back using nearest-neighbor replication."""
    grown = np.repeat(labels_small, factor, axis=0)
    grown = np.repeat(grown, factor, axis=1)
    if original_shape:
        # Trim any overshoot back to the requested shape.
        grown = grown[: original_shape[0], : original_shape[1]]
    return grown
|
|
245
|
+
|
|
246
|
+
def label_numpy(mask):
    """Minimal 4-connectivity labeling."""
    h, w = mask.shape
    labels = np.zeros((h, w), dtype=np.int32)
    current = 0
    # Row-major scan for seeds; each unvisited foreground pixel starts a
    # new component, flood-filled with an explicit DFS stack.
    for seed_y in range(h):
        for seed_x in range(w):
            if labels[seed_y, seed_x] or not mask[seed_y, seed_x]:
                continue
            current += 1
            labels[seed_y, seed_x] = current
            frontier = [(seed_y, seed_x)]
            while frontier:
                py, px = frontier.pop()
                for ny, nx in ((py - 1, px), (py + 1, px), (py, px - 1), (py, px + 1)):
                    if 0 <= ny < h and 0 <= nx < w and mask[ny, nx] and labels[ny, nx] == 0:
                        labels[ny, nx] = current
                        frontier.append((ny, nx))
    return labels, current
|
|
273
|
+
|
|
274
|
+
def _label_numpy(mask, factor=2):
    """Coarse connected-component labeling: downscale → label → upscale.

    Trades exactness for speed: labels are computed on a block-reduced
    mask and replicated back, so component boundaries are blocky.

    Fix: `upscale` can only crop, never pad — when the mask dimensions are
    not multiples of *factor* the replicated map came back smaller than
    `mask`, silently dropping the last row(s)/column(s). Zero-pad the
    result to `mask.shape` so the label map always aligns with the mask.
    """
    small = downscale(mask, factor=factor)
    labels_small, n = label_numpy(small)
    labels_up = upscale(labels_small, factor=factor, original_shape=mask.shape)
    if labels_up.shape != mask.shape:
        padded = np.zeros(mask.shape, dtype=labels_up.dtype)
        padded[: labels_up.shape[0], : labels_up.shape[1]] = labels_up
        labels_up = padded
    return labels_up, n
|
|
280
|
+
|
|
281
|
+
# NOTE: a full-resolution `_label_numpy` variant (flood fill on the raw mask,
# no downscale/upscale) used to live here as dead, commented-out code. It was
# removed; recover it from version control if exact per-pixel labeling is
# ever needed again. `label_numpy` above implements the same algorithm.
|
|
310
|
+
|
|
311
|
+
# ---------------------------------------------------------------------------
|
|
312
|
+
# Geometry
|
|
313
|
+
# ---------------------------------------------------------------------------
|
|
314
|
+
|
|
315
|
+
def _axis_aligned_rect(pts: np.ndarray) -> Tuple[np.ndarray, float]:
|
|
316
|
+
"""Fast axis-aligned bounding rectangle.
|
|
317
|
+
|
|
318
|
+
Exact for horizontal screen / UI text and ~5× faster than _pca_rect_quad
|
|
319
|
+
because it only needs min/max instead of a covariance eigendecomposition.
|
|
320
|
+
|
|
321
|
+
Returns:
|
|
322
|
+
box: (4, 2) float32 quad in TL→TR→BR→BL order
|
|
323
|
+
short_side: float
|
|
324
|
+
"""
|
|
325
|
+
mn = pts.min(axis=0)
|
|
326
|
+
mx = pts.max(axis=0)
|
|
327
|
+
box = np.array(
|
|
328
|
+
[[mn[0], mn[1]], [mx[0], mn[1]], [mx[0], mx[1]], [mn[0], mx[1]]],
|
|
329
|
+
dtype=np.float32,
|
|
330
|
+
)
|
|
331
|
+
return box, float(min(mx[0] - mn[0], mx[1] - mn[1]))
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def _pca_rect_quad(pts: np.ndarray) -> Tuple[np.ndarray, float]:
    """PCA-based oriented rectangle — retained for rotated-text use-cases.

    Use _axis_aligned_rect for screenshots (faster, equally accurate there).

    Returns:
        box: (4, 2) float32 quad in TL→TR→BR→BL order
        short_side: float
    """
    pts = pts.astype(np.float32, copy=False)
    # Degenerate single-point cloud: emit a unit square anchored at it.
    if pts.shape[0] == 1:
        px, py = pts[0]
        unit = [[px, py], [px + 1, py], [px + 1, py + 1], [px, py + 1]]
        return np.array(unit, dtype=np.float32), 1.0

    mean = pts.mean(axis=0)
    rel = pts - mean
    cov = rel.T @ rel / max(len(pts), 1)
    eigvals, eigvecs = np.linalg.eigh(cov)
    # eigh returns ascending eigenvalues; put the principal axis first.
    eigvecs = eigvecs[:, np.argsort(eigvals)[::-1]]

    rotated = rel @ eigvecs
    lo = rotated.min(axis=0)
    hi = rotated.max(axis=0)
    corners = np.array(
        [[lo[0], lo[1]], [hi[0], lo[1]], [hi[0], hi[1]], [lo[0], hi[1]]],
        dtype=np.float32,
    )
    # Rotate corners back into image space and canonicalise the ordering.
    quad = _order_quad(corners @ eigvecs.T + mean)
    width = float(np.linalg.norm(quad[0] - quad[1]))
    height = float(np.linalg.norm(quad[1] - quad[2]))
    return quad, min(width, height)
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def _order_quad(box: np.ndarray) -> np.ndarray:
|
|
366
|
+
"""Sort 4 points into TL, TR, BR, BL order."""
|
|
367
|
+
box = np.asarray(box, dtype=np.float32)
|
|
368
|
+
x_sort = box[np.argsort(box[:, 0])]
|
|
369
|
+
left = x_sort[:2][np.argsort(x_sort[:2, 1])]
|
|
370
|
+
right = x_sort[2:][np.argsort(x_sort[2:, 1])]
|
|
371
|
+
return np.array([left[0], right[0], right[1], left[1]], dtype=np.float32)
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def _box_score_fast(prob: np.ndarray, box: np.ndarray) -> float:
|
|
375
|
+
"""Fast bounding-box mean score."""
|
|
376
|
+
h, w = prob.shape
|
|
377
|
+
xs = np.clip(box[:, 0].astype(np.int32), 0, w - 1)
|
|
378
|
+
ys = np.clip(box[:, 1].astype(np.int32), 0, h - 1)
|
|
379
|
+
x0, x1 = xs.min(), xs.max()
|
|
380
|
+
y0, y1 = ys.min(), ys.max()
|
|
381
|
+
if x0 >= x1 or y0 >= y1:
|
|
382
|
+
return 0.0
|
|
383
|
+
return float(prob[y0:y1 + 1, x0:x1 + 1].mean())
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def _unclip_quad(box: np.ndarray, ratio: float) -> np.ndarray:
|
|
387
|
+
"""Expand a quad outward by *ratio* using polygon area/perimeter."""
|
|
388
|
+
box = np.asarray(box, dtype=np.float32)
|
|
389
|
+
x, y = box[:, 0], box[:, 1]
|
|
390
|
+
area = 0.5 * abs(float(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1))))
|
|
391
|
+
perimeter = float(np.sum(np.linalg.norm(np.diff(box, axis=0, append=box[:1]), axis=1)))
|
|
392
|
+
if perimeter < 1e-6:
|
|
393
|
+
return box
|
|
394
|
+
distance = area * ratio / perimeter
|
|
395
|
+
center = box.mean(axis=0)
|
|
396
|
+
vecs = box - center
|
|
397
|
+
norms = np.linalg.norm(vecs, axis=1, keepdims=True).clip(min=1e-6)
|
|
398
|
+
return (box + vecs / norms * distance).astype(np.float32)
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def _filter_boxes(
    boxes: np.ndarray, scores: List[float], img_h: int, img_w: int
) -> Tuple[np.ndarray, List[float]]:
    """Clip quads to the image bounds and drop degenerate (≤3 px) boxes."""
    out_boxes: List[np.ndarray] = []
    out_scores: List[float] = []
    for quad, score in zip(boxes, scores):
        quad = _order_quad(quad)
        quad[:, 0] = np.clip(quad[:, 0], 0, img_w - 1)
        quad[:, 1] = np.clip(quad[:, 1], 0, img_h - 1)
        width = float(np.linalg.norm(quad[0] - quad[1]))
        height = float(np.linalg.norm(quad[1] - quad[2]))
        if width <= 3 or height <= 3:
            continue
        out_boxes.append(quad.astype(np.int32))
        out_scores.append(float(score))
    if not out_boxes:
        return np.empty((0, 4, 2), dtype=np.int32), []
    return np.stack(out_boxes, axis=0), out_scores
|