screenshot-vision-algorithm 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- screenshot_vision_algorithm/__init__.py +48 -0
- screenshot_vision_algorithm/_config.py +61 -0
- screenshot_vision_algorithm/android/__init__.py +1 -0
- screenshot_vision_algorithm/android/wechat/__init__.py +1 -0
- screenshot_vision_algorithm/android/wechat/algorithms/__init__.py +0 -0
- screenshot_vision_algorithm/android/wechat/algorithms/avatar_column.py +209 -0
- screenshot_vision_algorithm/android/wechat/algorithms/badge_detection.py +275 -0
- screenshot_vision_algorithm/android/wechat/algorithms/card_bbox.py +1000 -0
- screenshot_vision_algorithm/android/wechat/algorithms/phash_utils.py +267 -0
- screenshot_vision_algorithm/android/wechat/algorithms/speaker_band.py +290 -0
- screenshot_vision_algorithm/android/wechat/algorithms/template_matching.py +2163 -0
- screenshot_vision_algorithm/android/wechat/algorithms/title_ocr.py +143 -0
- screenshot_vision_algorithm/android/wechat/merge/__init__.py +0 -0
- screenshot_vision_algorithm/android/wechat/merge/multipage.py +157 -0
- screenshot_vision_algorithm/android/wechat/ocr/__init__.py +0 -0
- screenshot_vision_algorithm/android/wechat/ocr/avatar_guard.py +434 -0
- screenshot_vision_algorithm/android/wechat/ocr/badge_ocr.py +232 -0
- screenshot_vision_algorithm/android/wechat/ocr/nickname_binding.py +1888 -0
- screenshot_vision_algorithm/android/wechat/ocr/text_ocr_adapter.py +625 -0
- screenshot_vision_algorithm/android/wechat/profiles/__init__.py +0 -0
- screenshot_vision_algorithm/android/wechat/profiles/android.py +53 -0
- screenshot_vision_algorithm/android/wechat/profiles/harmony.py +10 -0
- screenshot_vision_algorithm/android/wechat/profiles/ios.py +53 -0
- screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/chat_back_chevron.png +0 -0
- screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/chat_input_emoji_smile.png +0 -0
- screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/chat_input_plus.png +0 -0
- screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/chat_input_voice.png +0 -0
- screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/chat_title_more_dots.png +0 -0
- screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/favorite_label.png +0 -0
- screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/new_messages_hint_suffix.png +0 -0
- screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/unread_divider_hint.png +0 -0
- screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/unread_divider_hint_v2_textonly.png +0 -0
- screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/wechat_note_header.png +0 -0
- screenshot_vision_algorithm/android/xhs/__init__.py +4 -0
- screenshot_vision_algorithm/android/zhihu/__init__.py +4 -0
- screenshot_vision_algorithm/png_utils.py +86 -0
- screenshot_vision_algorithm-0.3.0.dist-info/METADATA +425 -0
- screenshot_vision_algorithm-0.3.0.dist-info/RECORD +40 -0
- screenshot_vision_algorithm-0.3.0.dist-info/WHEEL +5 -0
- screenshot_vision_algorithm-0.3.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,625 @@
|
|
|
1
|
+
"""Processor-side text-only OCR adapter (d2-4).
|
|
2
|
+
|
|
3
|
+
Day 2 scope (this module):
|
|
4
|
+
|
|
5
|
+
Thin, engine-agnostic wrapper around PaddleOCR's text detection +
|
|
6
|
+
recognition. Feeds a single PNG (plus its ``Screenshot`` metadata)
|
|
7
|
+
through one pass and returns a flat list of **raw** ``TextBlock``
|
|
8
|
+
entries — ``(text, bbox_xyxy, confidence)`` — in reading order,
|
|
9
|
+
with bounding boxes mapped back to the **original** image
|
|
10
|
+
coordinate system.
|
|
11
|
+
|
|
12
|
+
Explicit non-goals (by design, per ADR v0.3 + §2.5.2 phase split):
|
|
13
|
+
|
|
14
|
+
- **Text normalization** (NFC / full-width ↔ half-width) is
|
|
15
|
+
deferred to the downstream text pipeline (d5
|
|
16
|
+
``resume_text_merger`` for resume pages, or the nickname OCR
|
|
17
|
+
minimal pipeline in d3). Running it here would duplicate
|
|
18
|
+
``lite_text_normalizer.normalize_business_text`` and cross
|
|
19
|
+
the scripts/venv ↔ backend/venv boundary for a function
|
|
20
|
+
whose output shape is a simple str. The raw OCR output must
|
|
21
|
+
survive untouched to the phase-4 / phase-5 consumers.
|
|
22
|
+
- **Nickname/speaker attribution**: ADR §2.3 phase 2
|
|
23
|
+
(NicknameBoundaryService) reads from this adapter's output;
|
|
24
|
+
it is NOT the adapter's job.
|
|
25
|
+
- **Resume page stitching / n-gram dedup**: d5
|
|
26
|
+
``resume_text_merger`` consumes a list of ``OcrPageResult``
|
|
27
|
+
objects across one ``resume_group_id``; this module only
|
|
28
|
+
returns the per-page shape.
|
|
29
|
+
- **PP-Structure / layout analysis**: ADR v0.3 §4.3.0 permanent
|
|
30
|
+
project boundary — this adapter is **text-only** by contract;
|
|
31
|
+
PaddleOCR's ``PaddleOCR.predict()`` (or 2.x ``.ocr()``) is
|
|
32
|
+
the sole entry point used.
|
|
33
|
+
- **Head-avatar pHash / geometry**: ADR v0.3 permanently
|
|
34
|
+
discarded; the adapter does not expose any head-specific
|
|
35
|
+
API.
|
|
36
|
+
|
|
37
|
+
Scale policy (ADR §6.4.4):
|
|
38
|
+
|
|
39
|
+
The collector stamps ``screenshots[*].ocr_scale_hint`` at
|
|
40
|
+
capture time — ``1600`` for ``chat_message`` type,
|
|
41
|
+
``1280`` for ``resume_detail`` type. The adapter treats this
|
|
42
|
+
as an **upper bound on the long-edge pre-OCR**: if the raw
|
|
43
|
+
image's long-edge exceeds the hint, it's downscaled with
|
|
44
|
+
``cv2.INTER_AREA``; otherwise the image is passed through
|
|
45
|
+
unresized. Detection bounding boxes coming back from PaddleOCR
|
|
46
|
+
are in the *resized* coordinate space and are rescaled to
|
|
47
|
+
original coords before leaving the adapter so that downstream
|
|
48
|
+
consumers (e.g. ``side_hint_ratio`` cross-check in phase 2)
|
|
49
|
+
can reason in the same frame as the metadata.
|
|
50
|
+
|
|
51
|
+
Engine indirection:
|
|
52
|
+
|
|
53
|
+
``OcrEngine`` is a Protocol with a single
|
|
54
|
+
``detect_and_recognize(bgr) -> list[RawOcrItem]`` method. The
|
|
55
|
+
real implementation ``PaddleOcrEngine`` lazy-imports PaddleOCR
|
|
56
|
+
so Windows collector venvs and CI containers without PaddlePaddle
|
|
57
|
+
installed can still import this module (they'd just fail to
|
|
58
|
+
construct a real engine). Unit tests inject a ``FakeOcrEngine``
|
|
59
|
+
(``dummy_engine`` helper in the tests) that returns synthetic
|
|
60
|
+
items; that keeps d2-4 test runs offline and lets d2-5's mock
|
|
61
|
+
coverage target the adapter independently of Paddle's model
|
|
62
|
+
download / CPU startup cost.
|
|
63
|
+
|
|
64
|
+
Output contract:
|
|
65
|
+
|
|
66
|
+
``OcrPageResult`` carries enough information for the scanner /
|
|
67
|
+
CLI to record session-level OCR stats (wall time, block count,
|
|
68
|
+
char count) AND for phase 2 nickname detection to operate on the
|
|
69
|
+
bboxes. It is a plain dataclass (``asdict``-safe) so JSON reports
|
|
70
|
+
stay trivial.
|
|
71
|
+
|
|
72
|
+
References:
|
|
73
|
+
OCR ADR §2.5.2 phase 1 Preprocess / phase 2 NicknameBoundary
|
|
74
|
+
OCR ADR §6.4.4 ACTION_TYPE_MAP (ocr_scale_hint canonical values)
|
|
75
|
+
OCR ADR §4.3.0 project vision-layer boundary (v0.3)
|
|
76
|
+
backend/app/business/wx_match_business/paddle_ocr_subprocess.py
|
|
77
|
+
(subprocess batch OCR used for API smoke tests / consumer preloading; aligned with this adapter's Paddle 3.x
|
|
78
|
+
``predict`` usage, but with an independent venv boundary)
|
|
79
|
+
"""
|
|
80
|
+
|
|
81
|
+
from __future__ import annotations
|
|
82
|
+
|
|
83
|
+
import time
|
|
84
|
+
from dataclasses import dataclass, field
|
|
85
|
+
from pathlib import Path
|
|
86
|
+
from typing import TYPE_CHECKING, Callable, Optional, Protocol, runtime_checkable
|
|
87
|
+
|
|
88
|
+
from collector_phone_android_contract import Screenshot
|
|
89
|
+
from loguru import logger
|
|
90
|
+
|
|
91
|
+
if TYPE_CHECKING: # avoid a hard numpy import at module-load time
|
|
92
|
+
import numpy as np
|
|
93
|
+
|
|
94
|
+
#: Minimum confidence kept after PaddleOCR inference (PRD §6 block level;
#: consistent with the 0.7 criterion of the §11 sidecar).
#: Aligned with ``WxMatchSettings.wx_match_ocr_confidence_threshold`` and
#: ``paddle_ocr_subprocess``.
DEFAULT_CONFIDENCE_THRESHOLD = 0.7

#: Reading-order tie-break tolerance: two bboxes whose y1 differs by
#: less than this many px are treated as "same row" and then sorted
#: by x1. Intent: match how a human reads chat bubbles where two
#: adjacent messages sometimes have slightly offset top pixels.
READING_ORDER_ROW_TOLERANCE_PX = 6

#: If the vertical gap between adjacent OCR blocks exceeds this value, a
#: new paragraph starts (PRD §6 paragraph number; independent of the line
#: number).
PARAGRAPH_BREAK_MIN_GAP_PX = 32
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# ============================================================================
|
|
109
|
+
# Raw engine I/O
|
|
110
|
+
# ============================================================================
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@dataclass(frozen=True)
class RawOcrItem:
    """A single engine detection, before any filtering or coordinate mapping.

    Attributes:
        text: the recognized string exactly as the engine produced it.
        bbox_quad: corner points ``(x, y)`` expressed in the coordinate
            space of the **resized** image. PaddleOCR emits a clockwise
            quad beginning at the top-left corner, but the adapter never
            depends on that ordering: ``quad_to_xyxy`` reduces the quad
            to its axis-aligned envelope.
        confidence: recognition score, a scalar in ``[0, 1]``.
    """

    text: str
    bbox_quad: tuple[tuple[float, float], ...]
    confidence: float
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
@runtime_checkable
class OcrEngine(Protocol):
    """Smallest contract an OCR backend must satisfy.

    An engine exposes a ``name`` and one method that turns a BGR image
    into a list of raw detections.
    """

    name: str

    def detect_and_recognize(self, image_bgr: "np.ndarray") -> "list[RawOcrItem]":
        """Detect and recognize text in ``image_bgr``.

        The adapter applies ``ocr_scale_hint`` before calling this
        method, so an implementation MUST NOT resize the image again:
        a second, engine-internal resize would invalidate the adapter's
        back-projection of bounding boxes to original coordinates.
        """
        ...
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# ============================================================================
|
|
150
|
+
# Adapter output shape
|
|
151
|
+
# ============================================================================
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@dataclass
class TextBlock:
    """One confidence-filtered, axis-aligned block in **original** image coords.

    ``bbox_xyxy`` holds ``(x1, y1, x2, y2)`` where ``x1 <= x2`` and
    ``y1 <= y2``. The values are rounded integers, since downstream
    consumers always treat bounding boxes as pixel indices.

    ``line_read_index``: position of the block in this page's reading
    order (PRD §6 line/block order; 1-based).
    ``paragraph_read_index``: paragraph ordinal, where paragraphs are
    split at large vertical gaps (1-based).
    """

    text: str
    bbox_xyxy: tuple[int, int, int, int]
    confidence: float
    line_read_index: int = 0
    paragraph_read_index: int = 0
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
@dataclass
class OcrPageResult:
    """OCR outcome for a single screenshot, ready for the scanner/CLI to store.

    ``text_blocks`` arrives pre-sorted in reading order, and
    ``raw_full_text`` is the text of those same blocks joined with
    newlines. The text is *raw*: normalization is the caller's
    responsibility (see "Explicit non-goals" in the module docstring).
    """

    screenshot_id: str
    scale_hint: int
    original_size: tuple[int, int]
    processed_size: tuple[int, int]
    resized_applied: bool
    text_blocks: list[TextBlock]
    raw_full_text: str
    engine_name: str
    wall_ms: float
    filtered_out_count: int = 0
    confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD
    #: Non-empty when the adapter hit a soft failure and therefore
    #: returned an empty ``text_blocks``. ``TextOcrAdapter.process_page``
    #: sets it to ``"engine_error:<ExceptionType>"`` when the engine call
    #: raises. The scanner uses this to tell "really no text" apart from
    #: "OCR could not run" without turning it into a hard error.
    soft_error: Optional[str] = None
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# ============================================================================
|
|
202
|
+
# Errors
|
|
203
|
+
# ============================================================================
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
class TextOcrAdapterError(Exception):
    """Signal that no usable ``OcrPageResult`` could be produced.

    Intended for programmer or environment problems (for example an
    engine that is expected but not installed, or an unreadable PNG).
    Ordinary run-time outcomes such as "zero hits" or "everything was
    filtered for low confidence" are not errors: they yield a valid
    ``OcrPageResult`` whose ``text_blocks`` is empty, which keeps the
    scanner's terminal-status logic simple.

    Attributes:
        error_code: machine-readable code supplied by the raiser.
    """

    def __init__(self, message: str, *, error_code: str) -> None:
        # The human-readable message goes to ``Exception``; the code is
        # kept separately so callers can branch on it.
        super().__init__(message)
        self.error_code = error_code
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
# ============================================================================
|
|
222
|
+
# Geometry helpers (pure, easy to unit-test)
|
|
223
|
+
# ============================================================================
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def resize_long_edge_to(
    image_bgr: "np.ndarray",
    long_edge: int,
) -> tuple["np.ndarray", float]:
    """Downscale so the long edge equals ``long_edge``; never upscale.

    Args:
        image_bgr: image array whose first two dimensions are
            ``(height, width)``.
        long_edge: upper bound for the longer image side, in pixels. A
            non-positive value carries no usable bound and is treated
            as "do not resize".

    Returns:
        ``(image, scale_ratio)``. When no resize is needed the input
        array itself is returned with ``scale_ratio == 1.0``. Otherwise
        ``scale_ratio`` is ``long_edge / old_long_edge`` and the caller
        uses it to map bboxes back to original coordinates.
    """
    h, w = image_bgr.shape[:2]
    original_long = max(h, w)
    # Guard the degenerate bound first: without it, ``long_edge <= 0``
    # would shrink the image to about 1x1 and return a ratio of 0 (or a
    # negative one), which cannot be inverted for bbox back-projection.
    if long_edge <= 0 or original_long <= long_edge:
        return image_bgr, 1.0

    # Imported only on the path that actually resizes, so the
    # pass-through case does not require OpenCV.
    import cv2

    ratio = long_edge / original_long
    # Each side is rounded independently and kept at >= 1 px so a very
    # thin image never collapses to a zero-sized dimension.
    new_w = max(1, int(round(w * ratio)))
    new_h = max(1, int(round(h * ratio)))
    resized = cv2.resize(image_bgr, (new_w, new_h), interpolation=cv2.INTER_AREA)
    return resized, ratio
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def quad_to_xyxy(
    quad: tuple[tuple[float, float], ...],
) -> tuple[float, float, float, float]:
    """Reduce a corner polygon to its axis-aligned ``(x1, y1, x2, y2)`` box.

    The result is the min/max envelope of the corners. That is a
    deliberate simplification: the downstream
    ``NicknameBoundaryService`` works with axis-aligned boxes
    (font-size ratio / y-gap / left-right alignment), so keeping the
    rotated quad would add cost without any consumer.
    """
    x_coords: list[float] = []
    y_coords: list[float] = []
    for corner in quad:
        x_coords.append(corner[0])
        y_coords.append(corner[1])
    left, right = min(x_coords), max(x_coords)
    top, bottom = min(y_coords), max(y_coords)
    return (left, top, right, bottom)
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def scale_bbox(
    bbox_xyxy: tuple[float, float, float, float],
    inv_ratio: float,
) -> tuple[int, int, int, int]:
    """Multiply an ``xyxy`` box by ``inv_ratio`` and snap it to integer pixels.

    ``inv_ratio`` is the reciprocal of the ``scale_ratio`` returned by
    ``resize_long_edge_to``, i.e. it converts in the
    "resized → original" direction. Each coordinate is rounded with the
    built-in ``round`` and floored at 0.
    """

    def to_pixel(value: float) -> int:
        # Negative results are pulled up to 0.
        return max(0, int(round(value * inv_ratio)))

    left, top, right, bottom = bbox_xyxy
    return (to_pixel(left), to_pixel(top), to_pixel(right), to_pixel(bottom))
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def assign_paragraph_read_index(blocks: list[TextBlock]) -> list[TextBlock]:
    """Number already-sorted blocks by paragraph.

    A new paragraph starts whenever the gap between a block's top edge
    and the previous block's bottom edge exceeds
    ``PARAGRAPH_BREAK_MIN_GAP_PX``. Numbering is 1-based. The input
    blocks are not mutated: copies carrying the paragraph number are
    returned. An empty input is returned unchanged.
    """
    if not blocks:
        return blocks

    numbered: list[TextBlock] = []
    paragraph_no = 1
    last_bottom: Optional[int] = None
    for block in blocks:
        top = block.bbox_xyxy[1]
        # The first block has no predecessor and always opens paragraph 1.
        if last_bottom is not None:
            if top - last_bottom > PARAGRAPH_BREAK_MIN_GAP_PX:
                paragraph_no += 1
        numbered.append(
            TextBlock(
                text=block.text,
                bbox_xyxy=block.bbox_xyxy,
                confidence=block.confidence,
                line_read_index=block.line_read_index,
                paragraph_read_index=paragraph_no,
            )
        )
        last_bottom = block.bbox_xyxy[3]
    return numbered
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def sort_reading_order(
    blocks: "list[TextBlock]",
    *,
    row_tolerance_px: Optional[int] = None,
) -> "list[TextBlock]":
    """Sort blocks top-to-bottom, then left-to-right within a row.

    Rows are built greedily over the blocks ordered by top edge: the
    topmost unassigned block anchors a row, and every following block
    whose ``y1`` is less than ``row_tolerance_px`` below that anchor
    joins the same row. Each row is then ordered by ``x1``.

    Anchoring on an actual block (instead of bucketing ``y1`` on a fixed
    ``y1 // tolerance`` grid) means two blocks whose tops differ by a
    pixel or two are never split merely because they straddle a grid
    line (e.g. ``y1 == 5`` and ``y1 == 6`` with a 6 px tolerance).

    Args:
        blocks: blocks to order; only ``bbox_xyxy`` is read.
        row_tolerance_px: row tolerance in pixels. ``None`` (default)
            uses ``READING_ORDER_ROW_TOLERANCE_PX``.

    Returns:
        A new list; the input list is not modified. Sorting is stable,
        so blocks with identical ``(y1, x1)`` keep their input order.
    """
    if row_tolerance_px is None:
        row_tolerance_px = READING_ORDER_ROW_TOLERANCE_PX

    def left_edge(block: "TextBlock") -> int:
        return block.bbox_xyxy[0]

    by_top = sorted(blocks, key=lambda b: (b.bbox_xyxy[1], b.bbox_xyxy[0]))

    ordered: "list[TextBlock]" = []
    row: "list[TextBlock]" = []
    row_anchor_y = 0
    for block in by_top:
        top = block.bbox_xyxy[1]
        # Close the current row once this block starts at or beyond the
        # tolerance below the row's anchor.
        if row and top - row_anchor_y >= row_tolerance_px:
            ordered.extend(sorted(row, key=left_edge))
            row = []
        if not row:
            row_anchor_y = top
        row.append(block)
    ordered.extend(sorted(row, key=left_edge))
    return ordered
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
# ============================================================================
|
|
327
|
+
# Real PaddleOCR engine (lazy-loaded)
|
|
328
|
+
# ============================================================================
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
class PaddleOcrEngine:
    """Text-only engine backed by PaddleOCR 3.x.

    Building the underlying ``PaddleOCR`` object is costly (the original
    author notes a model download on first run and roughly 30 s of
    startup even with cached models), so create **one** engine and reuse
    it for every screenshot in a session. The ``PaddleOCR`` object
    itself is created lazily on first use.

    ``use_server_model=False`` (default) selects the PP-OCRv5 *mobile*
    detection/recognition models, matching the ADR v0.3 CPU deployment;
    the ``server`` variant is reserved for a later GPU rollout.

    ``rec_batch`` defaults to 6, mirroring
    ``app.core.config.Settings.paddle_ocr_rec_batch``. It only affects
    throughput, not the shape of the adapter's output.
    """

    name = "paddleocr_v3_text_only"

    def __init__(
        self,
        *,
        use_server_model: bool = False,
        rec_batch: int = 6,
        max_long_edge_pre_ocr: int = 2048,
    ) -> None:
        self._use_server_model = use_server_model
        self._rec_batch = rec_batch
        # Ceiling handed to PaddleOCR's own detection resize limit. It
        # is meant to be generous enough that PaddleOCR never re-resizes
        # an image the adapter already sized to ``ocr_scale_hint``; the
        # adapter stays the single source of truth for scale policy.
        self._max_long_edge_pre_ocr = max_long_edge_pre_ocr
        self._ocr: object = None

    def _ensure_engine(self) -> object:
        """Return the cached ``PaddleOCR`` instance, creating it on first call."""
        if self._ocr is None:
            from paddleocr import PaddleOCR  # noqa: PLC0415 — lazy

            options: dict[str, object] = {
                "use_textline_orientation": True,
                # Document preprocessing (orientation classification +
                # UVDoc unwarping) is ON by default in PaddleOCR 3.x.
                # UVDoc "rectifies" flat phone screenshots and warps
                # bbox coordinates non-linearly along y (measured
                # -25px at the top to +90px at the bottom on 720x1612),
                # breaking card/nickname spatial binding downstream.
                # Screenshots are always flat and upright, so both
                # steps are disabled.
                "use_doc_orientation_classify": False,
                "use_doc_unwarping": False,
                "lang": "ch",
                "device": "cpu",
                "text_recognition_batch_size": self._rec_batch,
                "text_det_limit_type": "max",
                "text_det_limit_side_len": self._max_long_edge_pre_ocr,
            }
            if not self._use_server_model:
                options["text_detection_model_name"] = "PP-OCRv5_mobile_det"
                options["text_recognition_model_name"] = "PP-OCRv5_mobile_rec"
            self._ocr = PaddleOCR(**options)
        return self._ocr

    def detect_and_recognize(self, image_bgr: "np.ndarray") -> list[RawOcrItem]:
        """Run one ``predict`` pass and convert the first result to raw items.

        Texts are paired with scores and polygons by index. A text
        without a score gets confidence ``0.0``; a text without a usable
        polygon is dropped.
        """
        predictions = list(self._ensure_engine().predict([image_bgr]))  # type: ignore[attr-defined]
        if not predictions:
            return []

        page = predictions[0]
        texts: list[str] = list(page.get("rec_texts", []) or [])
        scores: list[float] = list(page.get("rec_scores", []) or [])
        # Prefer the recognition polygons; fall back to detection ones.
        polys = page.get("rec_polys") or page.get("dt_polys") or []

        found: list[RawOcrItem] = []
        for idx, text in enumerate(texts):
            confidence = float(scores[idx]) if idx < len(scores) else 0.0
            if idx >= len(polys):
                continue
            poly = polys[idx]
            if poly is None:
                continue
            quad = _normalize_poly(poly)
            if quad is not None:
                found.append(RawOcrItem(text=text, bbox_quad=quad, confidence=confidence))
        return found
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def _normalize_poly(poly: object) -> Optional[tuple[tuple[float, float], ...]]:
    """Convert a numpy or list-of-lists polygon to a plain tuple of ``(x, y)`` floats.

    Returns ``None`` when ``poly`` is not iterable, when any point cannot
    be read as two numbers, or when fewer than three corners result.
    """
    try:
        points = list(poly)  # type: ignore[arg-type]
    except TypeError:
        return None

    try:
        # One malformed point invalidates the whole polygon.
        corners = tuple((float(pt[0]), float(pt[1])) for pt in points)
    except (TypeError, IndexError, ValueError):
        return None

    return corners if len(corners) >= 3 else None
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
# ============================================================================
|
|
433
|
+
# Adapter
|
|
434
|
+
# ============================================================================
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
class TextOcrAdapter:
    """Run single screenshots through an injected OCR engine.

    Typical single-session usage (scanner/CLI):

        engine = PaddleOcrEngine()
        adapter = TextOcrAdapter(engine=engine)
        for shot in metadata.screenshots:
            png = resolver.resolve(session_dir, shot)
            result = adapter.process_page(png, shot)
            # hand result to phase-2 / phase-5 consumers
    """

    def __init__(
        self,
        engine: OcrEngine,
        *,
        confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
        image_loader: Optional[Callable[[Path], "np.ndarray"]] = None,
    ) -> None:
        self.engine = engine
        self.confidence_threshold = confidence_threshold
        # Tests can inject a loader; otherwise the cv2-based default is used.
        self._load_image = image_loader or _default_image_loader

    def process_page(
        self,
        png_path: Path,
        screenshot: Screenshot,
    ) -> OcrPageResult:
        """OCR one screenshot and return its blocks in original coordinates.

        Steps: load the image, downscale it to ``screenshot.ocr_scale_hint``
        if needed, call the engine, drop low-confidence and degenerate
        boxes, map boxes back to original coordinates, then sort and
        number the surviving blocks.

        Raises:
            FileNotFoundError: propagated unchanged from the image loader.
            TextOcrAdapterError: when the image cannot be decoded (the
                loader raised or returned ``None``). The
                ``error_code`` is ``ocr_image_decode_error`` so the
                scanner can mark the session row ``error`` with that
                code. This is a **non-retryable** failure, because
                repeating OCR on the same broken bytes will not help.

        Zero-hit and all-low-confidence pages are NOT raised: they
        return a valid ``OcrPageResult`` with empty ``text_blocks`` and
        ``filtered_out_count`` set, so Admin can tell them apart. An
        exception from the engine is logged and reported through
        ``soft_error`` on an otherwise empty result.
        """
        started = time.perf_counter()
        try:
            image_bgr = self._load_image(png_path)
        except FileNotFoundError:
            raise
        except Exception as e:  # decode failure surfaces as adapter error
            raise TextOcrAdapterError(
                f"failed to decode image at {png_path!s}: {e!r}",
                error_code="ocr_image_decode_error",
            ) from e
        if image_bgr is None:
            raise TextOcrAdapterError(
                f"cv2.imdecode returned None for {png_path!s}",
                error_code="ocr_image_decode_error",
            )

        original_h, original_w = image_bgr.shape[:2]
        resized_bgr, scale_ratio = resize_long_edge_to(
            image_bgr, screenshot.ocr_scale_hint
        )
        processed_h, processed_w = resized_bgr.shape[:2]
        resized_applied = scale_ratio != 1.0

        def build_result(
            text_blocks: list[TextBlock],
            raw_full_text: str,
            filtered_out_count: int,
            soft_error: Optional[str] = None,
        ) -> OcrPageResult:
            # Shared by the engine-failure and success paths; wall time
            # is sampled right before the result object is assembled.
            wall_ms = (time.perf_counter() - started) * 1000.0
            return OcrPageResult(
                screenshot_id=screenshot.screenshot_id,
                scale_hint=screenshot.ocr_scale_hint,
                original_size=(original_w, original_h),
                processed_size=(processed_w, processed_h),
                resized_applied=resized_applied,
                text_blocks=text_blocks,
                raw_full_text=raw_full_text,
                engine_name=getattr(self.engine, "name", type(self.engine).__name__),
                wall_ms=wall_ms,
                filtered_out_count=filtered_out_count,
                confidence_threshold=self.confidence_threshold,
                soft_error=soft_error,
            )

        try:
            raw_items = self.engine.detect_and_recognize(resized_bgr)
        except Exception as e:
            # NOTE(review): ``logger`` is loguru, not stdlib logging;
            # confirm that the ``extra=`` keyword lands where the log
            # sinks expect it.
            logger.warning(
                "processor.text_ocr_engine_error",
                extra={
                    "screenshot_id": screenshot.screenshot_id,
                    "engine": getattr(self.engine, "name", type(self.engine).__name__),
                    "error": str(e),
                },
            )
            return build_result([], "", 0, f"engine_error:{type(e).__name__}")

        def clamp(value: int, upper: int) -> int:
            return min(max(0, value), upper)

        # Guard against a zero ratio so the inversion cannot divide by zero.
        inv_ratio = 1.0 / scale_ratio if scale_ratio != 0 else 1.0
        kept: list[TextBlock] = []
        dropped = 0
        for item in raw_items:
            if item.confidence < self.confidence_threshold:
                dropped += 1
                continue
            x1, y1, x2, y2 = scale_bbox(quad_to_xyxy(item.bbox_quad), inv_ratio)
            # Clamp to original image bounds — shields downstream
            # consumers from off-by-one rounding overshoot.
            x1, x2 = clamp(x1, original_w), clamp(x2, original_w)
            y1, y2 = clamp(y1, original_h), clamp(y2, original_h)
            # Boxes with no area after clamping count as filtered out.
            if x2 <= x1 or y2 <= y1:
                dropped += 1
                continue
            kept.append(
                TextBlock(
                    text=item.text,
                    bbox_xyxy=(x1, y1, x2, y2),
                    confidence=item.confidence,
                    line_read_index=0,
                )
            )

        # Reading order first, then 1-based line numbers, then paragraphs.
        numbered = [
            TextBlock(
                text=blk.text,
                bbox_xyxy=blk.bbox_xyxy,
                confidence=blk.confidence,
                line_read_index=position,
                paragraph_read_index=0,
            )
            for position, blk in enumerate(sort_reading_order(kept), start=1)
        ]
        final_blocks = assign_paragraph_read_index(numbered)
        raw_full_text = "\n".join(blk.text for blk in final_blocks)
        return build_result(final_blocks, raw_full_text, dropped)
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
# ============================================================================
|
|
589
|
+
# Default image loader
|
|
590
|
+
# ============================================================================
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
def _default_image_loader(png_path: Path) -> "np.ndarray":
    """Read a PNG from disk and decode it to BGR with ``cv2.imdecode``.

    The file is read as bytes and decoded from memory rather than opened
    with ``cv2.imread``; the original author's rationale is that
    ``imread`` fails on non-ASCII (Unicode) paths on Windows.

    Raises:
        FileNotFoundError: if ``png_path`` does not exist.

    The decode result is returned as-is. ``TextOcrAdapter.process_page``
    checks it for ``None`` and reports that as a decode failure.
    """
    import cv2
    import numpy as np

    if not png_path.exists():
        raise FileNotFoundError(f"PNG not found: {png_path!s}")
    encoded = np.frombuffer(png_path.read_bytes(), dtype=np.uint8)
    return cv2.imdecode(encoded, cv2.IMREAD_COLOR)
|
|
608
|
+
|
|
609
|
+
|
|
610
|
+
# Public API of this module. ``PARAGRAPH_BREAK_MIN_GAP_PX`` is exported
# alongside the other tuning constants because the exported
# ``assign_paragraph_read_index`` depends on it.
__all__ = [
    "DEFAULT_CONFIDENCE_THRESHOLD",
    "READING_ORDER_ROW_TOLERANCE_PX",
    "PARAGRAPH_BREAK_MIN_GAP_PX",
    "OcrEngine",
    "OcrPageResult",
    "PaddleOcrEngine",
    "RawOcrItem",
    "TextBlock",
    "TextOcrAdapter",
    "TextOcrAdapterError",
    "quad_to_xyxy",
    "resize_long_edge_to",
    "scale_bbox",
    "sort_reading_order",
    "assign_paragraph_read_index",
]
|
|
File without changes
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Android WeChat profile: thresholds, layout constants, and template paths.

All constants are anchored to a 1080px-wide baseline screen.
Scale factor ``scale_w = device_screen_width / 1080`` at runtime.

NOTE(review): the summary mentions template paths, but this module only
defines numeric constants and ``AVATAR_SIDE`` — confirm where the
template paths live.
"""

# ── Card bbox detection (§9) ────────────────────────────────────────
TOP_BAR_BOT_RATIO = 0.10
BOT_BAR_TOP_RATIO = 0.93
AVATAR_COLUMN_WIDTH_BASELINE = 108  # px @ 1080
MIN_CARD_X_GAP_BASELINE = 260  # min distance from avatar right edge to vline
VLINE_MIN_SEG_RATIO = 0.08  # min continuous mid-variance segment / zone height
CARD_HSPAN_MIN_RATIO = 0.5  # hline span > 50% card_w
HLINE_NEAR_VLINE_LIMIT_BASELINE = 100  # px @ 1080
ZONE_MIN_HEIGHT = 80  # minimum zone / bubble height (px)
ZONE_MIN_SEG = 10  # minimum raw segment height before clamping
LABEL_INTERSECT_MARGIN = 5  # tag × hline intersect tolerance (px)
GAP_MERGE_MAX_DIST = 30  # gap merge / extend threshold (px)

# ── Template matching thresholds (§6 §11) ──────────────────────────
CORE_THRESHOLD = 0.80  # favorite_label / note_header / chevron hard gate
AUX_THRESHOLD = 0.75  # unread_divider / new_messages_hint

# ── Speaker band (§7) ───────────────────────────────────────────────
# Values below without their own comment are presumably px @ 1080 like
# the other ``*_BASELINE`` constants — TODO confirm against the consumers.
CHAT_FIRST_BAND_TOP_EXTEND_BASELINE = 100  # first band top extension (px @ 1080)
CHAT_BAND_FIRST_AVATAR_TOP_GAP_REJECT_BASELINE = 30
CHAT_COMPOSER_RESERVE_BOTTOM_BASELINE = 160
CHAT_TITLE_BAR_TO_CONTENT_OFFSET_BASELINE = 4
CHAT_SIDE_AVATAR_COLUMN_WIDTH_BASELINE = 108
CHAT_SIDE_AVATAR_HOUGH_MIN_DIST_BASELINE = 68
CHAT_SIDE_AVATAR_MIN_R_BASELINE = 14
CHAT_SIDE_AVATAR_MAX_R_BASELINE = 54

# ── Avatar Hough parameters (§6 §7) ─────────────────────────────────
# Presumably px @ 1080 as well — TODO confirm.
LIST_HOUGH_MIN_DIST_BASELINE = 117
LIST_HOUGH_MIN_R_BASELINE = 36
LIST_HOUGH_MAX_R_BASELINE = 80
LIST_YMIN_GAP_BASELINE = 118
LIST_SAME_ICON_DY_MAX_BASELINE = 18
NICKNAME_AVATAR_BIND_MAX_DY_BASELINE = 140
AVATAR_ROI_LEFT_BASELINE = 0
AVATAR_ROI_RIGHT_BASELINE = 290
AVATAR_MEDIAN_X_HALF_WIDTH_BASELINE = 20

# ── Card click & scroll (§11) ───────────────────────────────────────
# Presumably px @ 1080 as well — TODO confirm.
FAVORITE_LABEL_TEMPLATE_W_BASELINE = 70
FAVORITE_LABEL_TEMPLATE_H_BASELINE = 60
FAVORITE_TO_CARD_TOP_OFFSET_BASELINE = 421
FAVORITE_TAIL_OFFSET_BASELINE = 60
REFERENCE_RESUME_CARD_TOP_GAP_BASELINE = 542

# ── Avatar side ─────────────────────────────────────────────────────
AVATAR_SIDE = "left"  # Android WeChat places sender avatars on the left
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""HarmonyOS WeChat profile (placeholder — not yet available).

Placeholder for future HarmonyOS WeChat support.
No constants are defined yet: importing this module raises
``NotImplementedError``.
"""

# The raise below is a module-level statement, so it runs at import time
# and every import of this profile fails with the message below.
raise NotImplementedError(
    "HarmonyOS WeChat profile is not yet implemented. "
    "Use Platform.ANDROID or Platform.IOS instead."
)
|