screenshot-vision-algorithm 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. screenshot_vision_algorithm/__init__.py +48 -0
  2. screenshot_vision_algorithm/_config.py +61 -0
  3. screenshot_vision_algorithm/android/__init__.py +1 -0
  4. screenshot_vision_algorithm/android/wechat/__init__.py +1 -0
  5. screenshot_vision_algorithm/android/wechat/algorithms/__init__.py +0 -0
  6. screenshot_vision_algorithm/android/wechat/algorithms/avatar_column.py +209 -0
  7. screenshot_vision_algorithm/android/wechat/algorithms/badge_detection.py +275 -0
  8. screenshot_vision_algorithm/android/wechat/algorithms/card_bbox.py +1000 -0
  9. screenshot_vision_algorithm/android/wechat/algorithms/phash_utils.py +267 -0
  10. screenshot_vision_algorithm/android/wechat/algorithms/speaker_band.py +290 -0
  11. screenshot_vision_algorithm/android/wechat/algorithms/template_matching.py +2163 -0
  12. screenshot_vision_algorithm/android/wechat/algorithms/title_ocr.py +143 -0
  13. screenshot_vision_algorithm/android/wechat/merge/__init__.py +0 -0
  14. screenshot_vision_algorithm/android/wechat/merge/multipage.py +157 -0
  15. screenshot_vision_algorithm/android/wechat/ocr/__init__.py +0 -0
  16. screenshot_vision_algorithm/android/wechat/ocr/avatar_guard.py +434 -0
  17. screenshot_vision_algorithm/android/wechat/ocr/badge_ocr.py +232 -0
  18. screenshot_vision_algorithm/android/wechat/ocr/nickname_binding.py +1888 -0
  19. screenshot_vision_algorithm/android/wechat/ocr/text_ocr_adapter.py +625 -0
  20. screenshot_vision_algorithm/android/wechat/profiles/__init__.py +0 -0
  21. screenshot_vision_algorithm/android/wechat/profiles/android.py +53 -0
  22. screenshot_vision_algorithm/android/wechat/profiles/harmony.py +10 -0
  23. screenshot_vision_algorithm/android/wechat/profiles/ios.py +53 -0
  24. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/chat_back_chevron.png +0 -0
  25. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/chat_input_emoji_smile.png +0 -0
  26. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/chat_input_plus.png +0 -0
  27. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/chat_input_voice.png +0 -0
  28. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/chat_title_more_dots.png +0 -0
  29. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/favorite_label.png +0 -0
  30. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/new_messages_hint_suffix.png +0 -0
  31. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/unread_divider_hint.png +0 -0
  32. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/unread_divider_hint_v2_textonly.png +0 -0
  33. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/wechat_note_header.png +0 -0
  34. screenshot_vision_algorithm/android/xhs/__init__.py +4 -0
  35. screenshot_vision_algorithm/android/zhihu/__init__.py +4 -0
  36. screenshot_vision_algorithm/png_utils.py +86 -0
  37. screenshot_vision_algorithm-0.3.0.dist-info/METADATA +425 -0
  38. screenshot_vision_algorithm-0.3.0.dist-info/RECORD +40 -0
  39. screenshot_vision_algorithm-0.3.0.dist-info/WHEEL +5 -0
  40. screenshot_vision_algorithm-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,625 @@
1
+ """Processor-side text-only OCR adapter (d2-4).
2
+
3
+ Day 2 scope (this module):
4
+
5
+ Thin, engine-agnostic wrapper around PaddleOCR's text detection +
6
+ recognition. Feeds a single PNG (plus its ``Screenshot`` metadata)
7
+ through one pass and returns a flat list of **raw** ``TextBlock``
8
+ entries — ``(text, bbox_xyxy, confidence)`` — in reading order,
9
+ with bounding boxes mapped back to the **original** image
10
+ coordinate system.
11
+
12
+ Explicit non-goals (by design, per ADR v0.3 + §2.5.2 phase split):
13
+
14
+ - **Text normalization** (NFC / full-width ↔ half-width) is
15
+ deferred to the downstream text pipeline (d5
16
+ ``resume_text_merger`` for resume pages, or the nickname OCR
17
+ minimal pipeline in d3). Running it here would duplicate
18
+ ``lite_text_normalizer.normalize_business_text`` and cross
19
+ the scripts/venv ↔ backend/venv boundary for a function
20
+ whose output shape is a simple str. The raw OCR output must
21
+ survive untouched to the phase-4 / phase-5 consumers.
22
+ - **Nickname/speaker attribution**: ADR §2.3 phase 2
23
+ (NicknameBoundaryService) reads from this adapter's output;
24
+ it is NOT the adapter's job.
25
+ - **Resume page stitching / n-gram dedup**: d5
26
+ ``resume_text_merger`` consumes a list of ``OcrPageResult``
27
+ objects across one ``resume_group_id``; this module only
28
+ returns the per-page shape.
29
+ - **PP-Structure / layout analysis**: ADR v0.3 §4.3.0 permanent
30
+ project boundary — this adapter is **text-only** by contract;
31
+ PaddleOCR's ``PaddleOCR.predict()`` (or 2.x ``.ocr()``) is
32
+ the sole entry point used.
33
+ - **Head-avatar pHash / geometry**: ADR v0.3 permanently
34
+ discarded; the adapter does not expose any head-specific
35
+ API.
36
+
37
+ Scale policy (ADR §6.4.4):
38
+
39
+ The collector stamps ``screenshots[*].ocr_scale_hint`` at
40
+ capture time — ``1600`` for ``chat_message`` type,
41
+ ``1280`` for ``resume_detail`` type. The adapter treats this
42
+ as an **upper bound on the long-edge pre-OCR**: if the raw
43
+ image's long-edge exceeds the hint, it's downscaled with
44
+ ``cv2.INTER_AREA``; otherwise the image is passed through
45
+ unresized. Detection bounding boxes coming back from PaddleOCR
46
+ are in the *resized* coordinate space and are rescaled to
47
+ original coords before leaving the adapter so that downstream
48
+ consumers (e.g. ``side_hint_ratio`` cross-check in phase 2)
49
+ can reason in the same frame as the metadata.
50
+
51
+ Engine indirection:
52
+
53
+ ``OcrEngine`` is a Protocol with a single
54
+ ``detect_and_recognize(bgr) -> list[RawOcrItem]`` method. The
55
+ real implementation ``PaddleOcrEngine`` lazy-imports PaddleOCR
56
+ so Windows collector venvs and CI containers without PaddlePaddle
57
+ installed can still import this module (they'd just fail to
58
+ construct a real engine). Unit tests inject a ``FakeOcrEngine``
59
+ (``dummy_engine`` helper in the tests) that returns synthetic
60
+ items; that keeps d2-4 test runs offline and lets d2-5's mock
61
+ coverage target the adapter independently of Paddle's model
62
+ download / CPU startup cost.
63
+
64
+ Output contract:
65
+
66
+ ``OcrPageResult`` carries enough information for the scanner /
67
+ CLI to record session-level OCR stats (wall time, block count,
68
+ char count) AND for phase 2 nickname detection to operate on the
69
+ bboxes. It is a plain dataclass (``asdict``-safe) so JSON reports
70
+ stay trivial.
71
+
72
+ References:
73
+ OCR ADR §2.5.2 phase 1 Preprocess / phase 2 NicknameBoundary
74
+ OCR ADR §6.4.4 ACTION_TYPE_MAP (ocr_scale_hint canonical values)
75
+ OCR ADR §4.3.0 project vision-layer boundary (v0.3)
76
+ backend/app/business/wx_match_business/paddle_ocr_subprocess.py
77
+ (API 冒烟 / 消费者预加载用的子进程批 OCR;与本 adapter 的 Paddle 3.x
78
+ ``predict`` 用法对齐,但 venv 边界独立)
79
+ """
80
+
81
+ from __future__ import annotations
82
+
83
+ import time
84
+ from dataclasses import dataclass, field
85
+ from pathlib import Path
86
+ from typing import TYPE_CHECKING, Callable, Optional, Protocol, runtime_checkable
87
+
88
+ from collector_phone_android_contract import Screenshot
89
+ from loguru import logger
90
+
91
+ if TYPE_CHECKING: # avoid a hard numpy import at module-load time
92
+ import numpy as np
93
+
94
#: Lowest recognition confidence a block may have and still be kept after
#: PaddleOCR inference (PRD §6 block level; same 0.7 convention as the §11
#: sidecar). Kept aligned with
#: ``WxMatchSettings.wx_match_ocr_confidence_threshold`` and
#: ``paddle_ocr_subprocess``.
DEFAULT_CONFIDENCE_THRESHOLD = 0.7

#: Row tolerance used when ordering blocks for reading: boxes whose ``y1``
#: values are within this many pixels are meant to count as one row and are
#: then ordered by ``x1``. This mirrors how a person reads chat bubbles whose
#: tops are offset by a few pixels.
READING_ORDER_ROW_TOLERANCE_PX = 6

#: When the vertical gap between two adjacent OCR blocks exceeds this many
#: pixels, a new paragraph starts (PRD §6 paragraph number; independent of
#: the line number).
PARAGRAPH_BREAK_MIN_GAP_PX = 32
106
+
107
+
108
+ # ============================================================================
109
+ # Raw engine I/O
110
+ # ============================================================================
111
+
112
+
113
@dataclass(frozen=True)
class RawOcrItem:
    """A single engine detection, before any filtering or coordinate mapping.

    Attributes:
        text: the recognized string exactly as the engine produced it.
        bbox_quad: polygon corners as ``(x, y)`` pairs, expressed in the
            coordinate space of the **resized** image handed to the
            engine. PaddleOCR emits a clockwise quad starting at the
            top-left corner, but corner order is irrelevant to the
            adapter because it collapses the polygon with
            ``quad_to_xyxy``.
        confidence: recognition score in ``[0, 1]``.
    """

    text: str
    bbox_quad: tuple[tuple[float, float], ...]
    confidence: float
130
+
131
+
132
@runtime_checkable
class OcrEngine(Protocol):
    """Structural contract for OCR back-ends: BGR image in, raw items out."""

    name: str

    def detect_and_recognize(self, image_bgr: "np.ndarray") -> list[RawOcrItem]:
        """Detect and recognize text in ``image_bgr``.

        The adapter applies ``ocr_scale_hint`` before calling this
        method, so implementations MUST NOT resize the image again:
        an internal resize would invalidate the mapping of bounding
        boxes back to original coordinates.
        """
        ...
147
+
148
+
149
+ # ============================================================================
150
+ # Adapter output shape
151
+ # ============================================================================
152
+
153
+
154
@dataclass
class TextBlock:
    """A kept OCR block with an axis-aligned box in **original** image coords.

    ``bbox_xyxy`` holds ``(x1, y1, x2, y2)`` where ``x1 <= x2`` and
    ``y1 <= y2``. The values are rounded integers because downstream
    consumers always treat bboxes as pixel indices.

    ``line_read_index`` is the 1-based position of the block in the page's
    reading order (PRD §6 line/block order). ``paragraph_read_index`` is
    the 1-based paragraph number, where paragraphs are split at large
    vertical gaps. Both default to ``0``.
    """

    text: str
    bbox_xyxy: tuple[int, int, int, int]
    confidence: float
    line_read_index: int = 0
    paragraph_read_index: int = 0
171
+
172
+
173
@dataclass
class OcrPageResult:
    """OCR outcome for one screenshot, ready for the scanner/CLI to store.

    ``text_blocks`` arrives sorted in reading order and ``raw_full_text``
    is those blocks' texts joined with newlines in the same order. The
    text is *raw*: normalization is the caller's responsibility (see the
    module docstring, "Explicit non-goals").
    """

    screenshot_id: str
    scale_hint: int
    original_size: tuple[int, int]
    processed_size: tuple[int, int]
    resized_applied: bool
    text_blocks: list[TextBlock]
    raw_full_text: str
    engine_name: str
    wall_ms: float
    filtered_out_count: int = 0
    confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD
    #: Set when the adapter hit a soft failure and returned an empty
    #: ``text_blocks`` instead of raising (``process_page`` stores
    #: ``engine_error:<ExceptionType>`` when the engine call fails). It
    #: lets the scanner tell "really no text" apart from "OCR could not
    #: run" without turning the latter into a hard error.
    soft_error: Optional[str] = None
199
+
200
+
201
+ # ============================================================================
202
+ # Errors
203
+ # ============================================================================
204
+
205
+
206
class TextOcrAdapterError(Exception):
    """Signals that no usable ``OcrPageResult`` could be produced.

    Intended for programmer / environment problems (for example the
    engine is not installed when expected, or the PNG file is
    unreadable). Ordinary run-time outcomes such as "zero hits" or
    "everything filtered for low confidence" are NOT errors: they yield a
    valid ``OcrPageResult`` whose ``text_blocks`` is empty, which keeps
    the scanner's terminal-status logic simple.

    Attributes:
        error_code: machine-readable code supplied by the raiser.
    """

    def __init__(self, message: str, *, error_code: str) -> None:
        # ``message`` becomes the exception text; the code rides along as
        # an attribute for callers that branch on it.
        super().__init__(message)
        self.error_code = error_code
219
+
220
+
221
+ # ============================================================================
222
+ # Geometry helpers (pure, easy to unit-test)
223
+ # ============================================================================
224
+
225
+
226
def resize_long_edge_to(
    image_bgr: "np.ndarray",
    long_edge: int,
) -> tuple["np.ndarray", float]:
    """Downscale ``image_bgr`` so its long edge is at most ``long_edge``.

    Args:
        image_bgr: image array whose first two dimensions are
            ``(height, width)``.
        long_edge: upper bound for the longer side, in pixels. A missing
            or non-positive bound (``None``, ``0``, negative) is treated
            as "no bound" and leaves the image untouched; previously such
            a value collapsed the image to 1x1 and produced a zero or
            negative ratio.

    Returns:
        ``(image, scale_ratio)``. ``scale_ratio`` is
        ``long_edge / old_long_edge`` when a resize happened and ``1.0``
        otherwise (in which case ``image`` is the input object itself).
        Callers use ``scale_ratio`` to map bboxes back to original
        coordinates.
    """
    h, w = image_bgr.shape[:2]
    original_long = max(h, w)
    if long_edge is None or long_edge <= 0 or original_long <= long_edge:
        return image_bgr, 1.0

    # Imported only on the path that resizes, so the pass-through case
    # works even where OpenCV is not installed.
    import cv2

    ratio = long_edge / original_long
    # Guard against a zero-sized side when the short edge is tiny.
    new_w = max(1, int(round(w * ratio)))
    new_h = max(1, int(round(h * ratio)))
    return (
        cv2.resize(image_bgr, (new_w, new_h), interpolation=cv2.INTER_AREA),
        ratio,
    )
250
+
251
+
252
def quad_to_xyxy(
    quad: tuple[tuple[float, float], ...],
) -> tuple[float, float, float, float]:
    """Collapse a corner polygon into its enclosing ``(x1, y1, x2, y2)`` box.

    The result is the min/max envelope of the corners. That is
    intentional: the downstream ``NicknameBoundaryService`` works with
    axis-aligned boxes (font-size ratio / y-gap / left-right alignment),
    so keeping the rotated quad would cost something no consumer uses.
    """
    left = min(corner[0] for corner in quad)
    top = min(corner[1] for corner in quad)
    right = max(corner[0] for corner in quad)
    bottom = max(corner[1] for corner in quad)
    return (left, top, right, bottom)
265
+
266
+
267
def scale_bbox(
    bbox_xyxy: tuple[float, float, float, float],
    inv_ratio: float,
) -> tuple[int, int, int, int]:
    """Multiply an ``xyxy`` box by ``inv_ratio`` and snap it to int pixels.

    ``inv_ratio`` is ``1 / scale_ratio`` for the ``scale_ratio`` returned
    by ``resize_long_edge_to``, i.e. it maps "resized" coordinates back
    to "original" ones. Each coordinate is rounded with ``round`` and
    floored at zero.
    """
    left, top, right, bottom = bbox_xyxy

    def to_pixel(value: float) -> int:
        return max(0, int(round(value * inv_ratio)))

    return (to_pixel(left), to_pixel(top), to_pixel(right), to_pixel(bottom))
283
+
284
+
285
def assign_paragraph_read_index(blocks: list[TextBlock]) -> list[TextBlock]:
    """Number already-sorted blocks by paragraph.

    Walks ``blocks`` in order and starts a new paragraph whenever the gap
    between the previous block's bottom edge and the current block's top
    edge is larger than ``PARAGRAPH_BREAK_MIN_GAP_PX``. Numbering is
    1-based. The inputs are not mutated: a new list of new ``TextBlock``
    objects is returned, except that an empty input is returned as-is.
    """
    if not blocks:
        return blocks

    numbered: list[TextBlock] = []
    paragraph_no = 1
    last_bottom: Optional[int] = None
    for block in blocks:
        top = block.bbox_xyxy[1]
        # The first block never opens a new paragraph: there is nothing
        # above it to measure a gap against.
        if last_bottom is not None:
            if top - last_bottom > PARAGRAPH_BREAK_MIN_GAP_PX:
                paragraph_no += 1
        numbered.append(
            TextBlock(
                text=block.text,
                bbox_xyxy=block.bbox_xyxy,
                confidence=block.confidence,
                line_read_index=block.line_read_index,
                paragraph_read_index=paragraph_no,
            )
        )
        last_bottom = block.bbox_xyxy[3]
    return numbered
307
+
308
+
309
def sort_reading_order(
    blocks: list[TextBlock],
    *,
    row_tolerance_px: Optional[int] = None,
) -> list[TextBlock]:
    """Order blocks top-to-bottom, then left-to-right within a row.

    Blocks are first ordered by ``y1``. A row starts at its topmost block
    (the anchor) and takes in every following block whose ``y1`` is less
    than the tolerance below that anchor; each row is then ordered by
    ``x1``. Measuring from the row's own anchor, rather than cutting the
    page into fixed ``y1 // tolerance`` bands, avoids separating two boxes
    that are a pixel apart but happen to straddle a band edge (for
    example ``y1 == 5`` and ``y1 == 6`` with a tolerance of 6).

    Args:
        blocks: objects exposing ``bbox_xyxy`` as ``(x1, y1, x2, y2)``.
        row_tolerance_px: row tolerance in pixels. ``None`` (default)
            uses ``READING_ORDER_ROW_TOLERANCE_PX``. Values below 1 are
            raised to 1 so that only equal ``y1`` values share a row.

    Returns:
        A new list; the input is not modified. Both sorts are stable, so
        blocks that tie keep their relative input order.
    """
    tolerance = (
        READING_ORDER_ROW_TOLERANCE_PX
        if row_tolerance_px is None
        else row_tolerance_px
    )
    tolerance = max(1, tolerance)

    def top_of(block: TextBlock) -> int:
        return block.bbox_xyxy[1]

    def left_of(block: TextBlock) -> int:
        return block.bbox_xyxy[0]

    ordered: list[TextBlock] = []
    row: list[TextBlock] = []
    anchor_top = 0
    for block in sorted(blocks, key=top_of):
        # Close the current row once a block falls outside the tolerance
        # measured from the row's first (topmost) block.
        if row and top_of(block) - anchor_top >= tolerance:
            ordered.extend(sorted(row, key=left_of))
            row = []
        if not row:
            anchor_top = top_of(block)
        row.append(block)
    ordered.extend(sorted(row, key=left_of))
    return ordered
324
+
325
+
326
+ # ============================================================================
327
+ # Real PaddleOCR engine (lazy-loaded)
328
+ # ============================================================================
329
+
330
+
331
class PaddleOcrEngine:
    """Text-only OCR engine backed by PaddleOCR 3.x.

    The underlying ``PaddleOCR`` object is built lazily on the first
    ``detect_and_recognize`` call and cached on the instance, so creating
    a ``PaddleOcrEngine`` does not import ``paddleocr``. Bringing the real
    engine up is expensive (model download on first run, ~30 s startup
    even on cached models), so callers are expected to create **one**
    instance and reuse it across the screenshots of a session.

    ``use_server_model=False`` (default) matches the ADR v0.3 deployment:
    PP-OCRv5 *mobile* on CPU. With ``True`` no model names are passed, so
    PaddleOCR's own defaults apply; the ``server`` variant is reserved
    for a later GPU rollout.

    ``rec_batch`` defaults to 6, mirroring
    ``app.core.config.Settings.paddle_ocr_rec_batch``. It is forwarded as
    ``text_recognition_batch_size``.
    """

    name = "paddleocr_v3_text_only"

    def __init__(
        self,
        *,
        use_server_model: bool = False,
        rec_batch: int = 6,
        max_long_edge_pre_ocr: int = 2048,
    ) -> None:
        """Store the configuration; the heavy engine is created on first use.

        Args:
            use_server_model: when ``False``, pin the PP-OCRv5 mobile
                detection and recognition models.
            rec_batch: recognition batch size passed to PaddleOCR.
            max_long_edge_pre_ocr: value for PaddleOCR's
                ``text_det_limit_side_len`` (limit type ``"max"``).
        """
        self._use_server_model = use_server_model
        self._rec_batch = rec_batch
        # Give PaddleOCR's internal det_limit a ceiling generous enough to
        # never re-resize images the adapter has already sized to
        # ``ocr_scale_hint``. The adapter is the single source of truth
        # for scale policy.
        self._max_long_edge_pre_ocr = max_long_edge_pre_ocr
        self._ocr: Optional[object] = None

    def _ensure_engine(self) -> object:
        """Return the cached ``PaddleOCR`` instance, creating it if needed."""
        if self._ocr is not None:
            return self._ocr
        from paddleocr import PaddleOCR  # noqa: PLC0415 — lazy

        kwargs: dict[str, object] = {
            "use_textline_orientation": True,
            # Doc preprocessing (orientation classify + UVDoc unwarping)
            # defaults to ON in PaddleOCR 3.x. UVDoc "rectifies" flat phone
            # screenshots, warping bbox coordinates non-linearly along y
            # (measured -25px top → +90px bottom on 720x1612), which breaks
            # card/nickname spatial binding downstream. Screenshots are
            # always flat and upright — disable both.
            "use_doc_orientation_classify": False,
            "use_doc_unwarping": False,
            "lang": "ch",
            "device": "cpu",
            "text_recognition_batch_size": self._rec_batch,
            "text_det_limit_type": "max",
            "text_det_limit_side_len": self._max_long_edge_pre_ocr,
        }
        if not self._use_server_model:
            kwargs["text_detection_model_name"] = "PP-OCRv5_mobile_det"
            kwargs["text_recognition_model_name"] = "PP-OCRv5_mobile_rec"
        self._ocr = PaddleOCR(**kwargs)
        return self._ocr

    @staticmethod
    def _as_list(value: object) -> list:
        """Return ``value`` as a plain list; ``None`` becomes ``[]``.

        Converting first makes later emptiness checks safe when a result
        field is a NumPy array, whose truth value is ambiguous and raises
        ``ValueError`` if used directly with ``or`` / ``if``.
        """
        return [] if value is None else list(value)  # type: ignore[call-overload]

    def detect_and_recognize(self, image_bgr: "np.ndarray") -> list[RawOcrItem]:
        """Run PaddleOCR on one image and return its detections.

        Only the first result of ``predict`` is used (one image is
        submitted). Polygons come from ``rec_polys`` and fall back to
        ``dt_polys`` when that field is missing or empty. An entry is
        skipped when it has no polygon or the polygon cannot be
        normalized; a missing score is recorded as ``0.0``.
        """
        ocr = self._ensure_engine()
        results = list(ocr.predict([image_bgr]))  # type: ignore[attr-defined]
        if not results:
            return []
        page = results[0]
        texts: list[str] = self._as_list(page.get("rec_texts"))
        scores: list[float] = self._as_list(page.get("rec_scores"))
        polys = self._as_list(page.get("rec_polys")) or self._as_list(
            page.get("dt_polys")
        )

        items: list[RawOcrItem] = []
        for i, text in enumerate(texts):
            score = float(scores[i]) if i < len(scores) else 0.0
            poly = polys[i] if i < len(polys) else None
            if poly is None:
                continue
            quad = _normalize_poly(poly)
            if quad is None:
                continue
            items.append(RawOcrItem(text=text, bbox_quad=quad, confidence=score))
        return items
412
+
413
+
414
+ def _normalize_poly(poly: object) -> Optional[tuple[tuple[float, float], ...]]:
415
+ """Coerce numpy / list-of-lists poly into a plain tuple of (x, y)."""
416
+ try:
417
+ iterable = list(poly) # type: ignore[arg-type]
418
+ except TypeError:
419
+ return None
420
+ corners: list[tuple[float, float]] = []
421
+ for pt in iterable:
422
+ try:
423
+ x, y = float(pt[0]), float(pt[1])
424
+ except (TypeError, IndexError, ValueError):
425
+ return None
426
+ corners.append((x, y))
427
+ if len(corners) < 3:
428
+ return None
429
+ return tuple(corners)
430
+
431
+
432
+ # ============================================================================
433
+ # Adapter
434
+ # ============================================================================
435
+
436
+
437
class TextOcrAdapter:
    """Run single screenshots through an injected OCR engine.

    Typical single-session usage (scanner/CLI)::

        engine = PaddleOcrEngine()
        adapter = TextOcrAdapter(engine=engine)
        for shot in metadata.screenshots:
            png = resolver.resolve(session_dir, shot)
            result = adapter.process_page(png, shot)
            # hand result to phase-2 / phase-5 consumers
    """

    def __init__(
        self,
        engine: OcrEngine,
        *,
        confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
        image_loader: Optional[Callable[[Path], "np.ndarray"]] = None,
    ) -> None:
        """Bind the engine, the confidence cut-off and the image loader.

        Args:
            engine: object implementing ``detect_and_recognize``.
            confidence_threshold: items scoring below this are dropped.
            image_loader: callable turning a path into an image array;
                falls back to ``_default_image_loader`` when falsy.
        """
        self.engine = engine
        self.confidence_threshold = confidence_threshold
        self._load_image = image_loader or _default_image_loader

    def process_page(
        self,
        png_path: Path,
        screenshot: Screenshot,
    ) -> OcrPageResult:
        """OCR one screenshot and return its blocks in original coordinates.

        The image is loaded, downscaled to ``screenshot.ocr_scale_hint``
        when larger, and passed to the engine. Kept items are mapped back
        to original pixels, clamped to the image bounds, sorted into
        reading order and numbered by line and paragraph.

        Raises:
            FileNotFoundError: re-raised unchanged from the image loader.
            TextOcrAdapterError: when the loader raises anything else or
                returns ``None`` (``error_code="ocr_image_decode_error"``).
                This is a **non-retryable** failure for the scanner,
                because redoing OCR on the same broken bytes won't help.

        An engine exception is NOT raised: it is logged and reported via
        ``soft_error`` on an otherwise empty result. Zero-hit and
        all-low-confidence pages likewise return a valid result with
        empty ``text_blocks`` and ``filtered_out_count`` set.
        """
        started_at = time.perf_counter()
        try:
            image_bgr = self._load_image(png_path)
        except FileNotFoundError:
            raise
        except Exception as e:  # decode failure surfaces as adapter error
            raise TextOcrAdapterError(
                f"failed to decode image at {png_path!s}: {e!r}",
                error_code="ocr_image_decode_error",
            ) from e
        if image_bgr is None:
            raise TextOcrAdapterError(
                f"cv2.imdecode returned None for {png_path!s}",
                error_code="ocr_image_decode_error",
            )

        original_h, original_w = image_bgr.shape[:2]
        resized_bgr, scale_ratio = resize_long_edge_to(
            image_bgr, screenshot.ocr_scale_hint
        )
        processed_h, processed_w = resized_bgr.shape[:2]
        resized_applied = scale_ratio != 1.0

        def engine_label() -> str:
            # Engines without a ``name`` attribute are labelled by class.
            return getattr(self.engine, "name", type(self.engine).__name__)

        def build_result(
            text_blocks: list[TextBlock],
            raw_full_text: str,
            filtered_out_count: int,
            soft_error: Optional[str] = None,
        ) -> OcrPageResult:
            # Wall time is taken at the moment the result is assembled.
            wall_ms = (time.perf_counter() - started_at) * 1000.0
            return OcrPageResult(
                screenshot_id=screenshot.screenshot_id,
                scale_hint=screenshot.ocr_scale_hint,
                original_size=(original_w, original_h),
                processed_size=(processed_w, processed_h),
                resized_applied=resized_applied,
                text_blocks=text_blocks,
                raw_full_text=raw_full_text,
                engine_name=engine_label(),
                wall_ms=wall_ms,
                filtered_out_count=filtered_out_count,
                confidence_threshold=self.confidence_threshold,
                soft_error=soft_error,
            )

        try:
            raw_items = self.engine.detect_and_recognize(resized_bgr)
        except Exception as e:
            logger.warning(
                "processor.text_ocr_engine_error",
                extra={
                    "screenshot_id": screenshot.screenshot_id,
                    "engine": engine_label(),
                    "error": str(e),
                },
            )
            return build_result(
                [], "", 0, soft_error=f"engine_error:{type(e).__name__}"
            )

        inv_ratio = 1.0 / scale_ratio if scale_ratio != 0 else 1.0
        kept: list[TextBlock] = []
        dropped = 0
        for item in raw_items:
            if item.confidence < self.confidence_threshold:
                dropped += 1
                continue
            sx1, sy1, sx2, sy2 = scale_bbox(quad_to_xyxy(item.bbox_quad), inv_ratio)
            # Clamp to original image bounds — shields downstream
            # consumers from off-by-one rounding overshoot.
            x1 = min(max(0, sx1), original_w)
            y1 = min(max(0, sy1), original_h)
            x2 = min(max(0, sx2), original_w)
            y2 = min(max(0, sy2), original_h)
            # Boxes that end up with no area count as filtered out.
            if not (x1 < x2 and y1 < y2):
                dropped += 1
                continue
            kept.append(
                TextBlock(
                    text=item.text,
                    bbox_xyxy=(x1, y1, x2, y2),
                    confidence=item.confidence,
                    line_read_index=0,
                )
            )

        ordered = sort_reading_order(kept)
        # Line numbers are 1-based positions in reading order.
        numbered = [
            TextBlock(
                text=block.text,
                bbox_xyxy=block.bbox_xyxy,
                confidence=block.confidence,
                line_read_index=position,
                paragraph_read_index=0,
            )
            for position, block in enumerate(ordered, start=1)
        ]
        numbered = assign_paragraph_read_index(numbered)
        return build_result(
            numbered,
            "\n".join(block.text for block in numbered),
            dropped,
        )
586
+
587
+
588
+ # ============================================================================
589
+ # Default image loader
590
+ # ============================================================================
591
+
592
+
593
def _default_image_loader(png_path: Path) -> "np.ndarray":
    """Read a PNG from disk and decode it with ``cv2.imdecode``.

    The file is read as bytes and decoded with ``cv2.IMREAD_COLOR``.
    ``imdecode`` is used instead of ``imread`` so that non-ASCII paths on
    Windows still work — ``imread`` chokes on Unicode paths.

    Raises:
        FileNotFoundError: when ``png_path`` does not exist.
    """
    import cv2
    import numpy as np

    if not png_path.exists():
        raise FileNotFoundError(f"PNG not found: {png_path!s}")
    encoded = np.frombuffer(png_path.read_bytes(), dtype=np.uint8)
    return cv2.imdecode(encoded, cv2.IMREAD_COLOR)
608
+
609
+
610
# Public API of the module. ``PARAGRAPH_BREAK_MIN_GAP_PX`` is exported
# alongside the other two tuning constants so that
# ``from ... import *`` exposes all of them consistently.
__all__ = [
    # constants
    "DEFAULT_CONFIDENCE_THRESHOLD",
    "PARAGRAPH_BREAK_MIN_GAP_PX",
    "READING_ORDER_ROW_TOLERANCE_PX",
    # types
    "OcrEngine",
    "OcrPageResult",
    "PaddleOcrEngine",
    "RawOcrItem",
    "TextBlock",
    "TextOcrAdapter",
    "TextOcrAdapterError",
    # helpers
    "assign_paragraph_read_index",
    "quad_to_xyxy",
    "resize_long_edge_to",
    "scale_bbox",
    "sort_reading_order",
]
@@ -0,0 +1,53 @@
1
+ """Android WeChat profile: thresholds, layout constants, and template paths.
2
+
3
+ All constants are anchored to a 1080px-wide baseline screen.
4
+ Scale factor ``scale_w = device_screen_width / 1080`` at runtime.
5
+ """
6
+
7
# ── Card bbox detection (§9) ────────────────────────────────────────
TOP_BAR_BOT_RATIO = 0.10
BOT_BAR_TOP_RATIO = 0.93
AVATAR_COLUMN_WIDTH_BASELINE = 108  # px at the 1080 baseline
MIN_CARD_X_GAP_BASELINE = 260  # smallest avatar-right-edge → vline distance
VLINE_MIN_SEG_RATIO = 0.08  # shortest continuous mid-variance segment / zone height
CARD_HSPAN_MIN_RATIO = 0.5  # hline must span more than 50% of card_w
HLINE_NEAR_VLINE_LIMIT_BASELINE = 100  # px at the 1080 baseline
ZONE_MIN_HEIGHT = 80  # smallest zone / bubble height (px)
ZONE_MIN_SEG = 10  # smallest raw segment height before clamping
LABEL_INTERSECT_MARGIN = 5  # tolerance for tag × hline intersection (px)
GAP_MERGE_MAX_DIST = 30  # threshold for merging / extending gaps (px)

# ── Template matching thresholds (§6 §11) ──────────────────────────
CORE_THRESHOLD = 0.80  # hard gate: favorite_label / note_header / chevron
AUX_THRESHOLD = 0.75  # unread_divider / new_messages_hint

# ── Speaker band (§7) ───────────────────────────────────────────────
CHAT_FIRST_BAND_TOP_EXTEND_BASELINE = 100  # top extension of the first band (px @ 1080)
CHAT_BAND_FIRST_AVATAR_TOP_GAP_REJECT_BASELINE = 30
CHAT_COMPOSER_RESERVE_BOTTOM_BASELINE = 160
CHAT_TITLE_BAR_TO_CONTENT_OFFSET_BASELINE = 4
CHAT_SIDE_AVATAR_COLUMN_WIDTH_BASELINE = 108
CHAT_SIDE_AVATAR_HOUGH_MIN_DIST_BASELINE = 68
CHAT_SIDE_AVATAR_MIN_R_BASELINE = 14
CHAT_SIDE_AVATAR_MAX_R_BASELINE = 54

# ── Avatar Hough parameters (§6 §7) ─────────────────────────────────
LIST_HOUGH_MIN_DIST_BASELINE = 117
LIST_HOUGH_MIN_R_BASELINE = 36
LIST_HOUGH_MAX_R_BASELINE = 80
LIST_YMIN_GAP_BASELINE = 118
LIST_SAME_ICON_DY_MAX_BASELINE = 18
NICKNAME_AVATAR_BIND_MAX_DY_BASELINE = 140
AVATAR_ROI_LEFT_BASELINE = 0
AVATAR_ROI_RIGHT_BASELINE = 290
AVATAR_MEDIAN_X_HALF_WIDTH_BASELINE = 20

# ── Card click & scroll (§11) ───────────────────────────────────────
FAVORITE_LABEL_TEMPLATE_W_BASELINE = 70
FAVORITE_LABEL_TEMPLATE_H_BASELINE = 60
FAVORITE_TO_CARD_TOP_OFFSET_BASELINE = 421
FAVORITE_TAIL_OFFSET_BASELINE = 60
REFERENCE_RESUME_CARD_TOP_GAP_BASELINE = 542

# ── Avatar side ─────────────────────────────────────────────────────
AVATAR_SIDE = "left"  # sender avatars sit on the left in Android WeChat
@@ -0,0 +1,10 @@
1
+ """HarmonyOS WeChat profile (placeholder — not yet available).
2
+
3
+ Placeholder for future HarmonyOS WeChat support.
4
+ None of the profile constants are implemented yet; importing this module raises ``NotImplementedError``.
5
+ """
6
+
7
# Fail at import time: this placeholder module defines no constants, so
# any attempt to load it is reported immediately with a pointer to the
# platforms named in the message.
raise NotImplementedError(
    "HarmonyOS WeChat profile is not yet implemented. "
    "Use Platform.ANDROID or Platform.IOS instead."
)
+ )