ppocr-lite 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ppocr_lite/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ """ppocr_lite – a lightweight PP-OCR runtime built on onnxruntime + numpy + PIL.
2
+
3
+ No OpenCV, no deep-learning framework required.
4
+
5
+ Typical usage::
6
+
7
+ from ppocr_lite import PPOCRLite
8
+
9
+ ocr = PPOCRLite() # auto-downloads models on first run
10
+ results = ocr("screenshot.png")
11
+ for text, score, box in results:
12
+ print(f"{score:.2f} {text} {box}")
13
+ """
14
+
15
+ from .engine import PPOCRLite
16
+ from .models import ModelConfig
17
+
18
+ __all__ = ["PPOCRLite", "ModelConfig"]
19
+ __version__ = "0.1.0"
@@ -0,0 +1,64 @@
1
+ """Text direction classifier (0° vs 180°).
2
+
3
+ Used to flip upside-down text regions before recognition.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import math
9
+ from typing import List, Tuple
10
+
11
+ import numpy as np
12
+ from PIL import Image
13
+
14
+
15
class ClsPreProcess:
    """Prepare crops for the direction classifier: (C, 48, 192) in [-1, 1].

    Normalisation: (x/255 − 0.5)/0.5 ≡ x × (1/127.5) − 1
    """

    HEIGHT = 48
    WIDTH = 192

    def __call__(self, imgs: List[np.ndarray]) -> np.ndarray:
        # Stack the individually processed crops into one NCHW batch.
        processed = [self._process(crop) for crop in imgs]
        return np.stack(processed, axis=0).astype(np.float32)  # (N, 3, 48, W)

    def _process(self, img: np.ndarray) -> np.ndarray:
        height, width = img.shape[:2]
        # Preserve aspect ratio, but never exceed the fixed model width.
        scaled_w = min(self.WIDTH, int(math.ceil(self.HEIGHT * width / height)))
        resized = Image.fromarray(img).resize((scaled_w, self.HEIGHT), Image.BILINEAR)
        # Fused normalise: (x/255 − 0.5)/0.5 == x/127.5 − 1
        chw = (np.asarray(resized, dtype=np.float32) * (1.0 / 127.5) - 1.0).transpose(2, 0, 1)
        # Zero-pad on the right so every crop has the same (3, 48, 192) shape.
        canvas = np.zeros((3, self.HEIGHT, self.WIDTH), dtype=np.float32)
        canvas[:, :, :scaled_w] = chw
        return canvas
38
+
39
+
40
def apply_cls(
    imgs: List[np.ndarray],
    preds: np.ndarray,
    thresh: float = 0.9,
) -> List[np.ndarray]:
    """Rotate images predicted as 180° back to upright.

    Parameters
    ----------
    imgs:
        Cropped text-region images (H×W×3 uint8).
    preds:
        Classifier output, shape (N, 2). Class 0 = 0°, class 1 = 180°.
    thresh:
        Confidence threshold; below this the image is left unchanged.
    """
    upright: List[np.ndarray] = []
    for crop, prob in zip(imgs, preds):
        cls_idx = int(prob.argmax())
        confidence = float(prob[cls_idx])
        # Only flip when the classifier is confident the crop is upside-down.
        if cls_idx == 1 and confidence >= thresh:
            crop = crop[::-1, ::-1, :]  # rotate 180°
        upright.append(crop)
    return upright
@@ -0,0 +1,417 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Tuple
4
+
5
+ import numpy as np
6
+ from PIL import Image
7
+
8
+ from ppocr_lite.utils import log_perf
9
+
10
+
11
+ # ---------------------------------------------------------------------------
12
+ # Pre-processing
13
+ # ---------------------------------------------------------------------------
14
+
15
class DetPreProcess:
    """Resize → normalize → NCHW float32 batch for the DB detector.

    Normalisation: (x/255 − 0.5) / 0.5 ≡ x × (1/127.5) − 1

    The result is made C-contiguous here so FastONNXRunner never needs
    an extra copy.
    """

    def __init__(self, limit_side_len: int = 960, limit_type: str = "max") -> None:
        # "max": shrink large images so the longest side fits the limit.
        # "min": grow small images so the shortest side reaches the limit.
        self.limit_side_len = limit_side_len
        self.limit_type = limit_type

    def __call__(self, img: np.ndarray) -> np.ndarray:
        h, w = img.shape[:2]
        scale = self._ratio(h, w)
        # DB expects both sides to be multiples of 32.
        target_h = int(round(h * scale / 32) * 32)
        target_w = int(round(w * scale / 32) * 32)
        if target_h <= 0 or target_w <= 0:
            raise ValueError(f"Invalid resize target ({target_w}×{target_h})")

        resized = Image.fromarray(img).resize((target_w, target_h), Image.BILINEAR)
        # Fused normalise: (x/255 − 0.5)/0.5 == x/127.5 − 1
        norm = np.asarray(resized, dtype=np.float32) * (1.0 / 127.5) - 1.0
        # Transpose HWC→CHW, add batch dim, and force C-contiguous in one shot
        return np.ascontiguousarray(norm.transpose(2, 0, 1)[np.newaxis])

    def _ratio(self, h: int, w: int) -> float:
        limit = self.limit_side_len
        if self.limit_type == "min":
            side = min(h, w)
            if side < limit:
                return limit / side
        elif self.limit_type == "max":
            side = max(h, w)
            if side > limit:
                return limit / side
        # Any other limit_type (or a side already within bounds): no scaling.
        return 1.0
50
+
51
+
52
+ def _auto_limit(max_side: int) -> int:
53
+ """Longest-side cap for the DB detector input (used with limit_type='max').
54
+
55
+ Keeps detector tensors at a manageable size without upscaling.
56
+ The engine chooses limit_type='min' separately for small images.
57
+ """
58
+ if max_side <= 1280:
59
+ return 1280
60
+ if max_side <= 1920:
61
+ return 1920
62
+ return 2560
63
+
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Post-processing (DB – Differentiable Binarisation)
67
+ # ---------------------------------------------------------------------------
68
+
69
class DBPostProcess:
    """Convert the DB probability map into oriented quad boxes.

    Fast path optimized for screenshot / UI text:
      - connected-component slices instead of repeated full-image scans
      - axis-aligned bounding rectangles (O(N) min/max instead of PCA)
      - optional SciPy acceleration if available
    """

    def __init__(
        self,
        thresh: float = 0.3,            # binarisation threshold for the probability map
        box_thresh: float = 0.5,        # minimum mean score to keep a candidate box
        max_candidates: int = 1000,     # cap on connected components examined per image
        unclip_ratio: float = 1.6,      # outward expansion applied to accepted quads
        min_size: int = 3,              # minimum short side (in map pixels) of a box
        max_points_for_box: int = 512,  # subsample larger point clouds before fitting
    ) -> None:
        self.thresh = thresh
        self.box_thresh = box_thresh
        self.max_candidates = max_candidates
        self.unclip_ratio = unclip_ratio
        self.min_size = min_size
        self.max_points_for_box = max_points_for_box

    def __call__(
        self,
        pred: np.ndarray,             # (1, 1, H, W) float32
        orig_shape: Tuple[int, int],  # (orig_h, orig_w)
    ) -> Tuple[np.ndarray, List[float]]:
        """Turn one probability map into (boxes, scores) in original-image coords.

        Returns an (N, 4, 2) int32 array of quads and a parallel list of
        per-box confidence scores (mean probability over the box region).
        """
        prob = pred[0, 0]  # (H, W)
        mask = prob > self.thresh

        # slight dilation equivalent: max-pool with 2×2 kernel
        mask = _dilate2x2_bool(mask)

        orig_h, orig_w = orig_shape
        map_h, map_w = mask.shape

        components = _find_components(mask)

        boxes: List[np.ndarray] = []
        scores: List[float] = []

        for comp in components[:self.max_candidates]:
            pts = comp["pts"]  # global coords (N, 2) float32
            if pts.shape[0] < 4:
                continue

            # Stride-subsample huge clouds: min/max extents are barely
            # affected but the rectangle fit gets much cheaper.
            if pts.shape[0] > self.max_points_for_box:
                step = max(1, pts.shape[0] // self.max_points_for_box)
                pts = pts[::step]

            # Axis-aligned bounding rect: exact for horizontal screen text and
            # ~5× faster than PCA (no eigendecomposition).
            # Swap to _pca_rect_quad if you need rotated-text support.
            box, sside = _axis_aligned_rect(pts)
            if sside < self.min_size:
                continue

            # Score before unclipping so the mean reflects the text core.
            score = _box_score_fast(prob, box)
            if score < self.box_thresh:
                continue

            # Expand outward, then re-fit and re-check the (stricter) size gate.
            box = _unclip_quad(box, self.unclip_ratio)
            box, sside = _axis_aligned_rect(box)
            if sside < self.min_size + 2:
                continue

            # scale back to original image coordinates
            box[:, 0] = np.clip(np.round(box[:, 0] / map_w * orig_w), 0, orig_w - 1)
            box[:, 1] = np.clip(np.round(box[:, 1] / map_h * orig_h), 0, orig_h - 1)

            boxes.append(box.astype(np.int32))
            scores.append(float(score))

        if not boxes:
            return np.empty((0, 4, 2), dtype=np.int32), []

        boxes_arr = np.stack(boxes, axis=0)
        # Final pass: order corners, clip to image bounds, drop tiny boxes.
        boxes_arr, scores = _filter_boxes(boxes_arr, scores, orig_h, orig_w)
        return boxes_arr, scores
151
+
152
+
153
+ # ---------------------------------------------------------------------------
154
+ # Pure-numpy image processing primitives (cv2 replacements)
155
+ # ---------------------------------------------------------------------------
156
+
157
+ def _dilate2x2_bool(mask: np.ndarray) -> np.ndarray:
158
+ """Equivalent of cv2.dilate with a 2×2 all-ones kernel."""
159
+ out = mask.copy()
160
+ out[:-1, :] |= mask[1:, :]
161
+ out[:, :-1] |= mask[:, 1:]
162
+ out[:-1, :-1] |= mask[1:, 1:]
163
+ return out
164
+
165
+
166
+ try:
167
+ import scipy.ndimage as scipy_ndimage
168
+ except ImportError:
169
+ scipy_ndimage = None
170
+
171
def _find_components(mask: np.ndarray):
    """Return connected components with point clouds in global coords."""
    if scipy_ndimage is not None:
        # SciPy path: C-accelerated exact labeling + slice lookup.
        with log_perf("scipy_ndimage_label.nd_label"):
            labeled, n = scipy_ndimage.label(mask)
        with log_perf("scipy_ndimage_label.find_objects"):
            objs = scipy_ndimage.find_objects(labeled)
    else:
        # Pure-numpy fallback (approximate: labels at reduced resolution).
        with log_perf("_label_numpy"):
            labeled, n = _label_numpy(mask)
        with log_perf("_find_objects_numpy"):
            objs = _find_objects_numpy(labeled, n)
    return _components_from_labeled(labeled, objs, n)
185
+
186
+
187
+ def _components_from_labeled(labeled: np.ndarray, objs, n: int):
188
+ comps = []
189
+ for i in range(1, n + 1):
190
+ sl = objs[i - 1] if i - 1 < len(objs) else None
191
+ if sl is None:
192
+ continue
193
+ ysl, xsl = sl
194
+ roi = labeled[ysl, xsl] == i
195
+ if roi.sum() < 4:
196
+ continue
197
+ ys, xs = np.nonzero(roi)
198
+ xs = xs + xsl.start
199
+ ys = ys + ysl.start
200
+ pts = np.empty((xs.size, 2), dtype=np.float32)
201
+ pts[:, 0] = xs
202
+ pts[:, 1] = ys
203
+ comps.append({"label": i, "slice": sl, "pts": pts})
204
+ return comps
205
+
206
+
207
+ def _find_objects_numpy(labeled: np.ndarray, n: int):
208
+ """Cheap find_objects replacement for fallback labeling."""
209
+ objs = [None] * n
210
+ if n == 0:
211
+ return objs
212
+ ys, xs = np.nonzero(labeled)
213
+ vals = labeled[ys, xs]
214
+ mins_y = np.full(n + 1, labeled.shape[0], dtype=np.int32)
215
+ mins_x = np.full(n + 1, labeled.shape[1], dtype=np.int32)
216
+ maxs_y = np.full(n + 1, -1, dtype=np.int32)
217
+ maxs_x = np.full(n + 1, -1, dtype=np.int32)
218
+ np.minimum.at(mins_y, vals, ys)
219
+ np.minimum.at(mins_x, vals, xs)
220
+ np.maximum.at(maxs_y, vals, ys)
221
+ np.maximum.at(maxs_x, vals, xs)
222
+ for i in range(1, n + 1):
223
+ if maxs_y[i] >= 0:
224
+ objs[i - 1] = (
225
+ slice(int(mins_y[i]), int(maxs_y[i]) + 1),
226
+ slice(int(mins_x[i]), int(maxs_x[i]) + 1),
227
+ )
228
+ return objs
229
+
230
def downscale(mask, factor=2):
    """Downscale a 2D binary mask by integer *factor* using block reduction.

    Each factor×factor block collapses to its max, so a block is "on" if any
    pixel inside it is on. The mask is zero-padded up to the next multiple of
    *factor* first: the previous truncating slice silently dropped foreground
    pixels in the last partial row/column of blocks whenever the mask size
    was not a multiple of *factor*.

    Returns an array of shape (ceil(h/factor), ceil(w/factor)).
    """
    h, w = mask.shape
    new_h = -(-h // factor)  # ceil division
    new_w = -(-w // factor)
    pad_h = new_h * factor - h
    pad_w = new_w * factor - w
    if pad_h or pad_w:
        # Pad with zeros (background) so edge pixels survive the reduction.
        mask = np.pad(mask, ((0, pad_h), (0, pad_w)))
    # Reshape into blocks and take max (if any pixel is 1, block is 1).
    blocks = mask.reshape(new_h, factor, new_w, factor)
    return blocks.max(axis=(1, 3))
238
+
239
def upscale(labels_small, factor=2, original_shape=None):
    """Upscale labels back using nearest-neighbor replication."""
    # Replicate each label cell into a factor×factor block.
    grown = np.repeat(labels_small, factor, axis=0)
    grown = np.repeat(grown, factor, axis=1)
    # Trim any overhang so the result matches the pre-downscale shape.
    if original_shape:
        grown = grown[: original_shape[0], : original_shape[1]]
    return grown
245
+
246
def label_numpy(mask):
    """Minimal 4-connectivity labeling."""
    h, w = mask.shape
    labels = np.zeros((h, w), dtype=np.int32)
    current = 0
    # Scan in raster order; every unlabeled foreground pixel seeds a
    # new component which is filled with an explicit DFS stack.
    for sy in range(h):
        for sx in range(w):
            if not mask[sy, sx] or labels[sy, sx] != 0:
                continue
            current += 1
            labels[sy, sx] = current
            stack = [(sy, sx)]
            while stack:
                cy, cx = stack.pop()
                for ny, nx in ((cy - 1, cx), (cy + 1, cx), (cy, cx - 1), (cy, cx + 1)):
                    if 0 <= ny < h and 0 <= nx < w and mask[ny, nx] and labels[ny, nx] == 0:
                        labels[ny, nx] = current
                        stack.append((ny, nx))
    return labels, current
273
+
274
def _label_numpy(mask, factor=2):
    """Label at reduced resolution: downscale → flood fill → upscale.

    Trades exactness for speed: components closer than *factor* pixels can
    land in the same reduced-resolution block and merge.
    """
    reduced = downscale(mask, factor=factor)
    labels_small, count = label_numpy(reduced)
    labels_full = upscale(labels_small, factor=factor, original_shape=mask.shape)
    return labels_full, count
280
+
281
+ '''
282
+ def _label_numpy(mask: np.ndarray):
283
+ """Minimal 4-connectivity connected-component labeling fallback."""
284
+ h, w = mask.shape
285
+ labels = np.zeros((h, w), dtype=np.int32)
286
+ label = 0
287
+ for y in range(h):
288
+ for x in range(w):
289
+ if not mask[y, x] or labels[y, x] != 0:
290
+ continue
291
+ label += 1
292
+ stack = [(y, x)]
293
+ labels[y, x] = label
294
+ while stack:
295
+ cy, cx = stack.pop()
296
+ if cy > 0 and mask[cy - 1, cx] and labels[cy - 1, cx] == 0:
297
+ labels[cy - 1, cx] = label
298
+ stack.append((cy - 1, cx))
299
+ if cy + 1 < h and mask[cy + 1, cx] and labels[cy + 1, cx] == 0:
300
+ labels[cy + 1, cx] = label
301
+ stack.append((cy + 1, cx))
302
+ if cx > 0 and mask[cy, cx - 1] and labels[cy, cx - 1] == 0:
303
+ labels[cy, cx - 1] = label
304
+ stack.append((cy, cx - 1))
305
+ if cx + 1 < w and mask[cy, cx + 1] and labels[cy, cx + 1] == 0:
306
+ labels[cy, cx + 1] = label
307
+ stack.append((cy, cx + 1))
308
+ return labels, label
309
+ '''
310
+
311
+ # ---------------------------------------------------------------------------
312
+ # Geometry
313
+ # ---------------------------------------------------------------------------
314
+
315
+ def _axis_aligned_rect(pts: np.ndarray) -> Tuple[np.ndarray, float]:
316
+ """Fast axis-aligned bounding rectangle.
317
+
318
+ Exact for horizontal screen / UI text and ~5× faster than _pca_rect_quad
319
+ because it only needs min/max instead of a covariance eigendecomposition.
320
+
321
+ Returns:
322
+ box: (4, 2) float32 quad in TL→TR→BR→BL order
323
+ short_side: float
324
+ """
325
+ mn = pts.min(axis=0)
326
+ mx = pts.max(axis=0)
327
+ box = np.array(
328
+ [[mn[0], mn[1]], [mx[0], mn[1]], [mx[0], mx[1]], [mn[0], mx[1]]],
329
+ dtype=np.float32,
330
+ )
331
+ return box, float(min(mx[0] - mn[0], mx[1] - mn[1]))
332
+
333
+
334
def _pca_rect_quad(pts: np.ndarray) -> Tuple[np.ndarray, float]:
    """PCA-based oriented rectangle — retained for rotated-text use-cases.

    Use _axis_aligned_rect for screenshots (faster, equally accurate there).

    Returns:
        box: (4, 2) float32 quad in TL→TR→BR→BL order
        short_side: float
    """
    pts = pts.astype(np.float32, copy=False)
    # Degenerate single point: emit a unit square anchored at the point.
    if pts.shape[0] == 1:
        x, y = pts[0]
        unit = np.array([[x, y], [x+1, y], [x+1, y+1], [x, y+1]], dtype=np.float32)
        return unit, 1.0

    mean = pts.mean(axis=0)
    deltas = pts - mean
    # 2×2 covariance; eigh returns ascending eigenvalues, so reorder
    # the eigenvectors to put the dominant direction first.
    cov = deltas.T @ deltas / max(len(pts), 1)
    eigvals, eigvecs = np.linalg.eigh(cov)
    eigvecs = eigvecs[:, np.argsort(eigvals)[::-1]]

    # Bounding rectangle in the principal-axis frame, then rotate back.
    proj = deltas @ eigvecs
    lo = proj.min(axis=0)
    hi = proj.max(axis=0)
    frame_corners = np.array(
        [[lo[0], lo[1]], [hi[0], lo[1]], [hi[0], hi[1]], [lo[0], hi[1]]],
        dtype=np.float32,
    )
    quad = _order_quad(frame_corners @ eigvecs.T + mean)
    side_a = float(np.linalg.norm(quad[0] - quad[1]))
    side_b = float(np.linalg.norm(quad[1] - quad[2]))
    return quad, min(side_a, side_b)
363
+
364
+
365
+ def _order_quad(box: np.ndarray) -> np.ndarray:
366
+ """Sort 4 points into TL, TR, BR, BL order."""
367
+ box = np.asarray(box, dtype=np.float32)
368
+ x_sort = box[np.argsort(box[:, 0])]
369
+ left = x_sort[:2][np.argsort(x_sort[:2, 1])]
370
+ right = x_sort[2:][np.argsort(x_sort[2:, 1])]
371
+ return np.array([left[0], right[0], right[1], left[1]], dtype=np.float32)
372
+
373
+
374
+ def _box_score_fast(prob: np.ndarray, box: np.ndarray) -> float:
375
+ """Fast bounding-box mean score."""
376
+ h, w = prob.shape
377
+ xs = np.clip(box[:, 0].astype(np.int32), 0, w - 1)
378
+ ys = np.clip(box[:, 1].astype(np.int32), 0, h - 1)
379
+ x0, x1 = xs.min(), xs.max()
380
+ y0, y1 = ys.min(), ys.max()
381
+ if x0 >= x1 or y0 >= y1:
382
+ return 0.0
383
+ return float(prob[y0:y1 + 1, x0:x1 + 1].mean())
384
+
385
+
386
+ def _unclip_quad(box: np.ndarray, ratio: float) -> np.ndarray:
387
+ """Expand a quad outward by *ratio* using polygon area/perimeter."""
388
+ box = np.asarray(box, dtype=np.float32)
389
+ x, y = box[:, 0], box[:, 1]
390
+ area = 0.5 * abs(float(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1))))
391
+ perimeter = float(np.sum(np.linalg.norm(np.diff(box, axis=0, append=box[:1]), axis=1)))
392
+ if perimeter < 1e-6:
393
+ return box
394
+ distance = area * ratio / perimeter
395
+ center = box.mean(axis=0)
396
+ vecs = box - center
397
+ norms = np.linalg.norm(vecs, axis=1, keepdims=True).clip(min=1e-6)
398
+ return (box + vecs / norms * distance).astype(np.float32)
399
+
400
+
401
def _filter_boxes(
    boxes: np.ndarray, scores: List[float], img_h: int, img_w: int
) -> Tuple[np.ndarray, List[float]]:
    """Re-order corners, clip to image bounds, and drop near-degenerate boxes."""
    final_boxes: List[np.ndarray] = []
    final_scores: List[float] = []
    for quad, score in zip(boxes, scores):
        quad = _order_quad(quad)
        # Clamp every corner inside the image.
        quad[:, 0] = np.clip(quad[:, 0], 0, img_w - 1)
        quad[:, 1] = np.clip(quad[:, 1], 0, img_h - 1)
        width = float(np.linalg.norm(quad[0] - quad[1]))
        height = float(np.linalg.norm(quad[1] - quad[2]))
        if width <= 3 or height <= 3:
            continue  # too small to hold legible text
        final_boxes.append(quad.astype(np.int32))
        final_scores.append(float(score))
    if not final_boxes:
        return np.empty((0, 4, 2), dtype=np.int32), []
    return np.stack(final_boxes, axis=0), final_scores