screenshot-vision-algorithm 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. screenshot_vision_algorithm/__init__.py +48 -0
  2. screenshot_vision_algorithm/_config.py +61 -0
  3. screenshot_vision_algorithm/android/__init__.py +1 -0
  4. screenshot_vision_algorithm/android/wechat/__init__.py +1 -0
  5. screenshot_vision_algorithm/android/wechat/algorithms/__init__.py +0 -0
  6. screenshot_vision_algorithm/android/wechat/algorithms/avatar_column.py +209 -0
  7. screenshot_vision_algorithm/android/wechat/algorithms/badge_detection.py +275 -0
  8. screenshot_vision_algorithm/android/wechat/algorithms/card_bbox.py +1000 -0
  9. screenshot_vision_algorithm/android/wechat/algorithms/phash_utils.py +267 -0
  10. screenshot_vision_algorithm/android/wechat/algorithms/speaker_band.py +290 -0
  11. screenshot_vision_algorithm/android/wechat/algorithms/template_matching.py +2163 -0
  12. screenshot_vision_algorithm/android/wechat/algorithms/title_ocr.py +143 -0
  13. screenshot_vision_algorithm/android/wechat/merge/__init__.py +0 -0
  14. screenshot_vision_algorithm/android/wechat/merge/multipage.py +157 -0
  15. screenshot_vision_algorithm/android/wechat/ocr/__init__.py +0 -0
  16. screenshot_vision_algorithm/android/wechat/ocr/avatar_guard.py +434 -0
  17. screenshot_vision_algorithm/android/wechat/ocr/badge_ocr.py +232 -0
  18. screenshot_vision_algorithm/android/wechat/ocr/nickname_binding.py +1888 -0
  19. screenshot_vision_algorithm/android/wechat/ocr/text_ocr_adapter.py +625 -0
  20. screenshot_vision_algorithm/android/wechat/profiles/__init__.py +0 -0
  21. screenshot_vision_algorithm/android/wechat/profiles/android.py +53 -0
  22. screenshot_vision_algorithm/android/wechat/profiles/harmony.py +10 -0
  23. screenshot_vision_algorithm/android/wechat/profiles/ios.py +53 -0
  24. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/chat_back_chevron.png +0 -0
  25. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/chat_input_emoji_smile.png +0 -0
  26. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/chat_input_plus.png +0 -0
  27. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/chat_input_voice.png +0 -0
  28. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/chat_title_more_dots.png +0 -0
  29. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/favorite_label.png +0 -0
  30. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/new_messages_hint_suffix.png +0 -0
  31. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/unread_divider_hint.png +0 -0
  32. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/unread_divider_hint_v2_textonly.png +0 -0
  33. screenshot_vision_algorithm/android/wechat/templates/android/8.0.69/wechat_note_header.png +0 -0
  34. screenshot_vision_algorithm/android/xhs/__init__.py +4 -0
  35. screenshot_vision_algorithm/android/zhihu/__init__.py +4 -0
  36. screenshot_vision_algorithm/png_utils.py +86 -0
  37. screenshot_vision_algorithm-0.3.0.dist-info/METADATA +425 -0
  38. screenshot_vision_algorithm-0.3.0.dist-info/RECORD +40 -0
  39. screenshot_vision_algorithm-0.3.0.dist-info/WHEEL +5 -0
  40. screenshot_vision_algorithm-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1888 @@
1
+ """Session-scoped, simplest-viable speaker nickname extractor (d3-nickname-ocr-minimal, v0.5 P1-3 minimal).
2
+
3
+ Day 3 scope (this module):
4
+
5
+ Take one ``OcrPageResult`` (from ``text_ocr_adapter`` — d2-4) plus the
6
+ corresponding ``Screenshot`` metadata and return a
7
+ ``ChatSpeakerAttribution`` describing which OCR text blocks are
8
+ **nicknames** (group-chat bubble headers) and which bubbles belong
9
+ to the device owner (right-half-screen) vs. still-unknown speakers.
10
+
11
+ Replaces the ``speaker_nickname=stub='UNKNOWN'`` placeholder that
12
+ Thin Slice v1 left behind. **Session-internal only** — no cross-session
13
+ nickname-hash aggregation (that's P1-3' ``CrossSessionSpeakerService``,
14
+ scheduled post-launch).
15
+
16
+ **PRD 验收口径(防漂移)**:单帧 **昵称行 / 气泡块** 的产品边界、靠左/靠右发言之 **头像守卫** 硬门槛,以及「昵称不属于任何气泡、气泡内不包括昵称」等,以仓库根 **[`docs/requirements/product_requirement_document.md`](../../../docs/requirements/product_requirement_document.md) §九、7.(2)(3)** 为验收 SSOT;本模块启发式或 `nickname_avatar_guard` 行为与之不一致时,**以实现补丁对齐 PRD** 为准(ADR §2.5.2 术语镜像段亦引用该节)。
17
+
18
+ Explicit non-goals (by design, per ADR v0.3 / v0.5):
19
+
20
+ - **Cross-session aggregation**: P1-3' ``CrossSessionSpeakerService``
21
+ will handle ``sha256(platform|NFC(nickname)|phone_extracted|salt)[:16]``
22
+ hashing for order attribution; this module stops at
23
+ ``nickname=<raw-string>`` within one session.
24
+ - **Head-avatar pHash / geometry**: ADR v0.3 永久边界:**不做**头像 pHash;
25
+ 仅以 OCR 块 + bbox 归因;PRD 术语§5 另允许 **侧栏头像 Hough 纵向筒**
26
+ (`processor/chat_bubble_geometry_prd.py`,与 pHash **无关**)。
27
+ - **Bubble detection**: PaddleOCR returns text blocks, NOT speech bubble
28
+ outlines. We approximate bubble membership via "next text block(s)
29
+ below the nickname within ``bubble_vertical_gap_max_px`` on the same
30
+ screen-half". This is the simplest viable heuristic and covers the
31
+ 80% case (group chat with visible nicknames above each bubble).
32
+ - **Confidence-weighted voting across pages**: a nickname string is
33
+ returned as-is per page; no de-duping / normalization / merging
34
+ happens here. Downstream code (Admin ``/wx-ocr-stats`` page) does
35
+ its own roll-up (non-stub share %).
36
+ - **Text normalization**: raw OCR string is kept as-is (matches the
37
+ ``raw_full_text`` contract of ``text_ocr_adapter``). Downstream
38
+ ``lite_text_normalizer.normalize_business_text`` is the phase-4
39
+ consumer.
40
+
41
+ Contract surface (2 dataclasses + 1 function):
42
+
43
+ ``extract_nicknames(
44
+ ocr_result, screenshot, config=None, png_path=None
45
+ ) -> ChatSpeakerAttribution``
46
+
47
+ The function accepts any ``OcrPageResult`` (``screenshot.type ==
48
+ "chat_message"`` expected by caller; we assert it) and returns a
49
+ frozen ``ChatSpeakerAttribution``. All geometry heuristics live in
50
+ ``NicknameOcrConfig`` — overridable for fixture / test isolation.
51
+
52
+ Heuristic summary (all adjustable via ``NicknameOcrConfig``):
53
+
54
+ STEP 0 (PRD §7(3)):仅对相邻且均已标为 ``bubble_text`` 的块尝试合并——
55
+ **同一半屏**内「水平连续」(同水平带 + 横向间隙小)的 Paddle 碎片合成一条气泡块;
56
+ (同气泡多行竖叠合并易与相邻消息混淆,留给昵称认领链处理。)
57
+ 再进入昵称认领。
58
+
59
+ STEP 1: Pre-filter each text block to one of
60
+ "nickname_candidate" / "bubble_text" / "drop".
61
+
62
+ ``nickname_candidate`` requires:
63
+ (a) ``confidence >= config.min_confidence``
64
+ (b) ``len(text_stripped) <= config.nickname_max_chars``
65
+ (c) text does NOT start with any prefix in
66
+ ``config.system_prefix_drops`` (``@``, ``撤回``,
67
+ ``系统消息``, …)
68
+ (d) text is NOT a pure ``HH:MM`` time stamp
69
+ (e) text is NOT a pure digit run (phone / day number)
70
+ (f) bbox x-center is on the LEFT half-screen
71
+ (``x_center < original_width * config.screen_midline_ratio``)
72
+ (g) bbox height is ``<= median_text_height *
73
+ config.nickname_max_char_height_ratio``
74
+ (nicknames are visually smaller than bubble body text in
75
+ the WeChat group-chat layout)
76
+
77
+ ``bubble_text``: the leftover blocks (may be left-half or
78
+ right-half bubbles).
79
+
80
+ ``drop``: low-confidence blocks (already filtered by
81
+ ``TextOcrAdapter`` upstream; we keep the gate here for
82
+ tests that inject raw fake blocks bypassing the adapter).
83
+
84
+ STEP 2: For each ``nickname_candidate``, claim ``bubble_text``
85
+ blocks immediately below it on the LEFT half screen, within
86
+ ``bubble_vertical_gap_max_px`` vertical pixels, stopping when
87
+ we hit the next ``nickname_candidate`` or walk past the gap.
88
+
89
+ STEP 3: Classify each remaining ``bubble_text``:
90
+ - right-half bubble → ``owner``(``wo``)
91
+ - left-half orphan → 归因链结束后 **继承**同侧上方最近昵称;无可继承则
92
+ ``speaker_display=找不到归属昵称``(参见 PRD 九§7(4)(5))。
93
+
94
+ STEP 4(PRD §7(5)+(5')):``SpeakerBodySegment`` 层按 **整行占位关键词**剔除后,
95
+ ``len(body.strip()) < min_bubble_body_chars`` 的段落丢弃;
96
+ ``total_bubbles`` 等计数以_finalize 过滤后为准。
97
+
98
+ STEP 5(术语§5):在提供 ``png_path`` 且启用 ``NicknameOcrConfig.use_prd_chat_vertical_bands``
99
+ 时加载 ``chat_bubble_geometry_prd`` 纵向筒——昵称仅能认领与其 **同属一筒(含 ±12px@1080)**
100
+ 的左侧正文块;检出无头像则将整帧正文归为「找不到归属昵称」单段。
101
+
102
+ Fields on ``ChatSpeakerAttribution`` designed so Admin roll-up can
103
+ compute "non-stub share %" via ``bubbles_with_nickname / total_bubbles``.
104
+
105
+ References:
106
+ OCR ADR v0.5 §2.3 speaker_nickname (phase 2 NicknameBoundaryService)
107
+ OCR ADR v0.3 §4.3.0 project vision-layer boundary
108
+ session-handoff-20260429.mdc Day 3 d3-nickname-ocr-minimal row
109
+ scripts/wx_match/processor/text_ocr_adapter.py
110
+ ``OcrPageResult`` / ``TextBlock`` / reading order contract
111
+ """
112
+
113
+ from __future__ import annotations
114
+
115
+ import re
116
+ from dataclasses import dataclass, field
117
+ from dataclasses import replace as dataclass_replace
118
+ from pathlib import Path
119
+ from statistics import median
120
+ from typing import TYPE_CHECKING, Any, Optional, Tuple
121
+
122
+ from screenshot_vision_algorithm.android.wechat.ocr.text_ocr_adapter import OcrPageResult, TextBlock
123
+
124
+ if TYPE_CHECKING:
125
+ from collector.contracts import Screenshot
126
+
127
# Display key used in the PRD speaker JSON for right-side bubbles, i.e. the
# party holding the device.
_OWNER_SPEAKER_LABEL = "wo"

# PRD ch.9 §7(4): segment-level placeholder display key for a left-side
# segment that could not be attributed to any nickname.
SPEAKER_DISPLAY_UNATTRIBUTED = "找不到归属昵称"
132
+
133
+
134
+ # ============================================================================
135
+ # Config
136
+ # ============================================================================
137
+
138
+
139
@dataclass(frozen=True)
class NicknameOcrConfig:
    """Single home for every heuristic threshold used by nickname extraction.

    Defaults are tuned for the ``edb1a89f`` Thin Slice v1 fixture
    (1080x2248 real-device captures, WeChat 8.0.69, default-light theme).
    The class is frozen and holds plain values only, so tests can override
    any knob freely without I/O or logging dependencies.
    """

    # Blocks below this confidence are skipped. Defence-in-depth on top of
    # ``TextOcrAdapter.confidence_threshold``; aligned with the block-level
    # 0.7 of PRD §6 / §11.
    min_confidence: float = 0.7

    # WeChat group-chat nickname length cap; longer strings are almost
    # certainly bubble body text.
    nickname_max_chars: int = 30

    # A block whose x-center is < ``original_width * ratio`` counts as being
    # on the left half of the screen.
    screen_midline_ratio: float = 0.5

    # Nickname bbox height must be at most ``median_height * ratio``.
    #
    # 0.85 is calibrated for real WeChat captures: nicknames render in a
    # visibly smaller font than bubble body text (typically ~30px vs. ~45px
    # on a 1080-wide device), which cuts false positives once the frame has
    # at least one bubble-body block anchoring the median.
    #
    # Known failure mode: on degenerate synthetic inputs where every block
    # has the same height, every block fails the bound (median == self) and
    # no nickname is reported. That is accepted: such a frame carries no
    # speaker signal, callers see ``nicknames=()`` and fall through to the
    # owner/unknown classification. Tests override this knob to explore
    # boundary behaviour.
    nickname_max_char_height_ratio: float = 0.85

    # Max vertical distance (bubble top minus nickname bottom) within which a
    # ``bubble_text`` block is claimed by the nickname above it.
    bubble_vertical_gap_max_px: int = 120

    # PRD §7(5) "L": a bubble body segment whose stripped length is below
    # this value is discarded (not counted as a segment or in the counters).
    min_bubble_body_chars: int = 12

    # PRD §7(3): vertical-center tolerance (px) for adjacent OCR blocks to be
    # considered part of the same horizontal band.
    bubble_fragment_same_line_y_tol_px: int = 18

    # PRD §7(3): max gap between horizontally adjacent fragments. Per the
    # original note, the effective threshold is the larger of this value and
    # one proportional to ``median_h``.
    bubble_fragment_horizontal_gap_max_px: int = 36

    # Avatar-column ROI cropping and texture/shape thresholds
    # (see ``nickname_avatar_guard.avatar_roi_pass``).
    avatar_gutter_px: int = 10
    avatar_min_left_roi_width_px: int = 40
    avatar_laplacian_var_min: float = 48.0
    avatar_rgb_std_min: float = 4.0
    avatar_roi_extend_up_px: int = 80
    avatar_roi_extend_down_px: int = 4
    avatar_roi_extend_up_row_mul: float = 2.5
    avatar_roi_narrow_up_px: int = 4
    avatar_roi_narrow_down_px: int = 4
    avatar_edge_sig_cols_min: float = 15.0
    avatar_edge_right_left_ratio_min: float = 1.2

    # The left edge of a nickname bbox must satisfy
    # ``x1 < original_width * ratio`` (rules out centered system/time rows).
    nickname_max_x1_ratio: float = 0.30

    # When the line ends with an 11-digit mobile number, the glyph-height cap
    # is relaxed to ``median_h * ratio``.
    nickname_phone_height_ratio: float = 1.08

    # When True, and a PNG path is supplied and cv2 is available, enable the
    # PRD glossary §5 avatar-timeline vertical bands for nickname claiming.
    use_prd_chat_vertical_bands: bool = True

    # PRD ch.9 7(2): without ``png_path``, or when the guard does not pass,
    # a row must NOT be recorded as a bindable nickname row.
    require_avatar_guard_for_nickname: bool = True

    # PRD ch.9 7(2): ``avatar_roi_pass`` uses
    # ``nickname_avatar_guard.nickname_row_passes_prd_avatar_guard``
    # (which requires a Hough anchor).
    use_prd_strict_avatar_guard: bool = True

    # PRD ch.9 7(3): upper bound on nickname bbox width, in px at the
    # 1080-wide design width (scaled by ``width / 1080``); anything wider is
    # treated as bubble body text.
    nickname_max_bbox_width_px: int = 220

    # Tags that exclude a block from nickname consideration even when every
    # geometric criterion passes. The original note calls these
    # "prefix / substring" tags.
    # NOTE(review): the only matcher visible in this file,
    # ``_starts_with_system_prefix``, does prefix matching — confirm whether
    # substring matching is applied anywhere.
    system_prefix_drops: tuple[str, ...] = (
        "@",
        "撤回了一条消息",
        "系统消息",
        "邀请",
        "加入了群聊",
        "已加入群聊",
        "[系统提示]",
        "以上为历史消息",
        "仅群主",
        "以下为新消息",
    )
232
+
233
+
234
# Prefix drop-list dedicated to the attribution display-nickname path.
#
# Unlike ``NicknameOcrConfig.system_prefix_drops`` it deliberately omits "@",
# so that a WeCom (enterprise WeChat) "@suffix" block is not mistaken for an
# @mention as a whole.
#
# Business rule: the attribution nickname is the raw OCR text (neighbouring
# blocks concatenated in reading order); only unambiguous system placeholder
# rows are excluded.
_ATTRIBUTION_PREFIX_DROPS: tuple[str, ...] = (
    "撤回了一条消息",
    "系统消息",
    "邀请",
    "加入了群聊",
    "已加入群聊",
    "[系统提示]",
    "以上为历史消息",
    "仅群主",
)
248
+
249
+
250
+ # ============================================================================
251
+ # Output shape
252
+ # ============================================================================
253
+
254
+
255
@dataclass(frozen=True)
class NicknameExtraction:
    """A detected nickname together with the bubble text blocks it owns.

    ``following_block_indices`` holds indices into
    ``OcrPageResult.text_blocks`` (reading order). It is empty when the
    nickname was detected but no bubble body was close enough to claim; such
    "naked nickname lines" are still reported so Admin can surface them for
    QA.

    ``nickname_block_index`` is the index of the nickname line itself in
    ``text_blocks`` (reading order); it is used to emit
    ``SpeakerBodySegment`` entries in top-to-bottom order.
    """

    nickname: str
    bbox_xyxy: tuple[int, int, int, int]
    confidence: float
    following_block_indices: tuple[int, ...] = field(default_factory=tuple)
    nickname_block_index: int = -1
    band_index: Optional[int] = None
276
+
277
+
278
@dataclass(frozen=True)
class SpeakerBodySegment:
    """One attributable stretch of body text on a single chat page.

    This is the raw material for the PRD §8 per-speaker JSON.
    """

    # Raw OCR nickname, the device-side label ``wo``, or an empty string
    # (left-side orphan bubble).
    speaker_display: str

    body_text: str
    orphan: bool

    # Original index of the OCR text block in the ``blocks`` list (-1 means
    # none). Used to infer which PRD §5 band the segment belongs to.
    block_index: int = -1
290
+
291
+
292
@dataclass(frozen=True)
class ResumeThumbBinding:
    """PRD §6(2): binds one resume card in a frame to the nickname owning it.

    The binding is based on the PRD §5 avatar y-top vertical bands
    (``chat_bubble_geometry_prd``): when the thumb bbox geometrically
    intersects a left-side bubble block already claimed by a nickname, that
    nickname is the speaker the thumb belongs to. See
    ``_resume_thumb_bindings_and_orphans``.
    """

    # [x1, y1, x2, y2] — fully aligned with the metadata ``source_thumb_bbox``.
    thumb_bbox: tuple[int, int, int, int]

    # Raw OCR nickname (same source as ``NicknameExtraction.nickname``).
    nickname: str
305
+
306
+
307
@dataclass(frozen=True)
class ChatSpeakerAttribution:
    """Session-internal speaker attribution result for ONE chat screenshot.

    The top-level counters give Admin ``/wx-ocr-stats`` enough to compute the
    per-session non-stub share:

        non_stub_share = bubbles_with_nickname / max(total_bubbles, 1)

    Zero-text frames (OCR soft error / everything low-confidence) come back
    with ``total_bubbles=0`` and empty ``nicknames``; callers must handle the
    divide-by-zero explicitly.

    ``resume_thumb_orphans``: PRD ch.9 §7(1)+§9(4) — whether each resume-card
    bbox given in the metadata intersects a left-side bubble that has been
    claimed by a nickname; cards that do not are counted as orphans (cannot
    be attributed to a nickname).

    ``resume_thumb_bindings``: PRD §6(2) — mapping of resume-card bbox to the
    owning nickname (left-side bubbles attributed via the PRD §5 avatar y-top
    vertical bands). The bridge layer feeds it directly into
    ``resolve_resume_card_speaker_binding`` and no longer uses the
    side-column heuristic fallback.
    """

    screenshot_id: str
    nicknames: tuple[NicknameExtraction, ...] = field(default_factory=tuple)
    total_bubbles: int = 0
    bubbles_with_nickname: int = 0
    bubbles_owner: int = 0
    bubbles_unknown: int = 0
    resume_thumb_orphans: int = 0
    # PRD §6(2): resume-card bbox -> nickname bindings for this frame,
    # validated through vertical-band membership.
    resume_thumb_bindings: tuple[ResumeThumbBinding, ...] = field(default_factory=tuple)
    # PRD §8: this page's body text split by nickname / device side / orphan
    # (merged later into per-speaker JSON work items).
    speaker_body_segments: tuple[SpeakerBodySegment, ...] = field(default_factory=tuple)
    # Debug only: full intermediate data of this frame's OCR attribution
    # (blocks / bands / ytops / bboxes). When not None it is written to
    # ``debug_session_derived.json``; can be switched off in production via
    # an environment variable.
    ocr_debug: Optional[dict[str, Any]] = None
342
+
343
+
344
+ # ============================================================================
345
+ # Internal helpers
346
+ # ============================================================================
347
+
348
+
349
# Text made up of digits only (anchored at both ends).
_PURE_DIGITS_RE = re.compile(r"^\d+$")

# Clock time: one or two digits, ":", two digits, optionally ":SS".
_PURE_TIME_RE = re.compile(r"^\d{1,2}:\d{2}(:\d{2})?$")

# Text that starts with a weekday token ("周一" ... "周日" / "周天"), optionally
# followed by a day-part word and clock digits.
# NOTE(review): everything after the weekday token is optional, so any text
# merely starting with e.g. "周一" matches — confirm this leniency is intended.
_WEEKDAY_TIME_RE = re.compile(
    r"^周[一二三四五六日天].*?(?:上午|下午|晚上|凌晨)?\s*\d{0,2}[::]?\d{0,2}"
)

# An 11-digit run beginning with "1" at the very end of the text
# (trailing whitespace tolerated).
_MOBILE_PHONE_SUFFIX_RE = re.compile(r"1\d{10}\s*$")
355
+
356
+
357
def _block_height(block: "TextBlock") -> int:
    """Return the bbox height of *block* (``y2 - y1``), clamped to be >= 0."""
    _, top, _, bottom = block.bbox_xyxy
    height = bottom - top
    return height if height > 0 else 0
360
+
361
+
362
def _block_x_center(block: "TextBlock") -> float:
    """Return the horizontal midpoint of the block's bbox."""
    left, _, right, _ = block.bbox_xyxy
    span_sum = left + right
    return span_sum / 2.0
365
+
366
+
367
def _block_top(block: "TextBlock") -> int:
    """Return ``y1``, the second component of the block's xyxy bbox."""
    bbox = block.bbox_xyxy
    return bbox[1]
369
+
370
+
371
def _block_bottom(block: "TextBlock") -> int:
    """Return ``y2``, the fourth component of the block's xyxy bbox."""
    bbox = block.bbox_xyxy
    return bbox[3]
373
+
374
+
375
def _is_pure_time(text: str) -> bool:
    """Return True when *text* matches ``_PURE_TIME_RE`` (a bare clock time)."""
    return _PURE_TIME_RE.match(text) is not None
377
+
378
+
379
def _is_pure_digits(text: str) -> bool:
    """Return True when *text* matches ``_PURE_DIGITS_RE`` (digits only)."""
    return _PURE_DIGITS_RE.match(text) is not None
381
+
382
+
383
def _is_chat_timestamp_or_divider(text: str) -> bool:
    """Tell whether *text* is a chat divider or a weekday-style timestamp row.

    Blank or missing text is never a match. Otherwise the stripped text
    matches when it contains the "以下为新消息" divider or when
    ``_WEEKDAY_TIME_RE`` matches at its start.
    """
    candidate = (text or "").strip()
    if not candidate:
        return False
    has_divider = "以下为新消息" in candidate
    return has_divider or bool(_WEEKDAY_TIME_RE.match(candidate))
392
+
393
+
394
def _is_favorite_card_label(text: str) -> bool:
    """Detect the WeChat card footer label "收藏" (Favorites).

    Product convention: "收藏" may never be used as a speaker nickname. The
    label is recognised either bare or wrapped in bracket/quote/whitespace
    characters.
    """
    label = (text or "").strip()
    return label == "收藏" or label.strip("「」[]()() \t") == "收藏"
401
+
402
+
403
def _has_mobile_phone_suffix(text: str) -> bool:
    """Return True when *text* ends with an 11-digit number starting with 1.

    ASCII spaces are removed before ``_MOBILE_PHONE_SUFFIX_RE`` is applied,
    so spaces OCR inserts inside the digit run do not prevent a match.
    """
    compact = (text or "").replace(" ", "")
    return _MOBILE_PHONE_SUFFIX_RE.search(compact) is not None
405
+
406
+
407
def _is_avatar_column_nickname_row(
    block: "TextBlock",
    original_width: int,
    config: NicknameOcrConfig,
    avatar_layout: Any = None,
) -> bool:
    """Decide whether *block* sits where a nickname row is expected.

    When a non-empty ``avatar_layout`` is given and the optional
    ``processor.left_avatar_column`` helper can be imported, the decision is
    delegated to ``nickname_bbox_in_avatar_column``. In every other case the
    fallback rule applies: the bbox left edge must be strictly left of
    ``original_width * config.nickname_max_x1_ratio``.
    """
    try:
        from processor.left_avatar_column import (
            nickname_bbox_in_avatar_column as column_check,
        )
    except ImportError:
        # Optional dependency missing: only the ratio fallback is available.
        column_check = None  # type: ignore[assignment,misc]

    layout_usable = (
        avatar_layout is not None
        and column_check is not None
        # A layout without an ``empty`` attribute is treated as empty.
        and not getattr(avatar_layout, "empty", True)
    )
    if layout_usable:
        return column_check(
            avatar_layout,
            block.bbox_xyxy,
            original_width=original_width,
            max_x1_ratio=config.nickname_max_x1_ratio,
        )

    left_edge = block.bbox_xyxy[0]
    limit = float(original_width) * float(config.nickname_max_x1_ratio)
    return float(left_edge) < limit
427
+
428
+
429
def _avatar_guard_kwargs(config: NicknameOcrConfig) -> dict:
    """Build the threshold keyword arguments forwarded to ``avatar_roi_pass``.

    Pixel sizes are passed through unchanged; the texture/edge thresholds
    are coerced to ``float``.
    """
    return dict(
        gutter_px=config.avatar_gutter_px,
        min_roi_width_px=config.avatar_min_left_roi_width_px,
        laplacian_min=float(config.avatar_laplacian_var_min),
        rgb_std_min=float(config.avatar_rgb_std_min),
        edge_sig_cols_min=float(config.avatar_edge_sig_cols_min),
        edge_right_left_ratio_min=float(config.avatar_edge_right_left_ratio_min),
    )
438
+
439
+
440
def _avatar_anchor_for_block(
    block: "TextBlock",
    avatar_layout: Any,
    *,
    scale_w: float,
) -> Any:
    """Look up the avatar anchor associated with a nickname block.

    Returns ``None`` when there is no layout, when the layout is empty (a
    layout without an ``empty`` attribute counts as empty), or when the
    optional ``processor.left_avatar_column`` module cannot be imported.
    Otherwise the result of ``find_avatar_anchor_for_nickname_bbox`` is
    returned as-is.
    """
    layout_missing = avatar_layout is None or getattr(avatar_layout, "empty", True)
    if layout_missing:
        return None
    try:
        from processor.left_avatar_column import (
            find_avatar_anchor_for_nickname_bbox as locate_anchor,
        )
    except ImportError:
        return None
    return locate_anchor(avatar_layout, block.bbox_xyxy, scale_w=scale_w)
457
+
458
+
459
def _load_left_avatar_layout(
    png_path: Optional[Path],
    screenshot: "Screenshot",
) -> Any:
    """Load the screenshot PNG and detect the left avatar-column layout.

    Best effort: returns ``None`` when no path is given, when the optional
    ``processor`` modules cannot be imported, or when the image fails to
    load / is empty. The width scale is taken relative to a 1080px design
    width.
    """
    if png_path is None:
        return None
    try:
        from processor.chat_bubble_geometry_prd import compute_chat_content_vertical_bounds
        from processor.left_avatar_column import detect_left_avatar_column_layout
        from processor.nickname_avatar_guard import load_png_bgr
    except ImportError:
        return None

    image = load_png_bgr(str(png_path))
    if image is None or getattr(image, "size", 0) == 0:
        return None

    native_w = screenshot.original_resolution.width
    native_h = screenshot.original_resolution.height
    width_scale = float(native_w) / 1080.0
    content_bounds = compute_chat_content_vertical_bounds(
        image, width_scale, screen_h=native_h
    )
    return detect_left_avatar_column_layout(
        image,
        width_scale,
        y_top=content_bounds.y_top,
        y_bottom_excl=content_bounds.y_bottom_excl,
    )
484
+
485
+
486
def _starts_with_system_prefix(text: str, drops: tuple[str, ...]) -> bool:
    """Return True when *text* begins with any of the prefixes in *drops*."""
    return any(text.startswith(prefix) for prefix in drops)
491
+
492
+
493
def _is_on_left_half(
    block: "TextBlock",
    original_width: int,
    ratio: float,
) -> bool:
    """Return True when the block's x-center lies strictly left of the midline.

    The midline is ``original_width * ratio``.
    """
    midline_x = original_width * ratio
    center_x = _block_x_center(block)
    return center_x < midline_x
499
+
500
+
501
def _vertical_center(block: "TextBlock") -> float:
    """Return the vertical midpoint between the block's top and bottom."""
    edge_sum = _block_top(block) + _block_bottom(block)
    return edge_sum / 2.0
503
+
504
+
505
def _xyxy_intersects(
    a: tuple[int, int, int, int],
    b: tuple[int, int, int, int],
) -> bool:
    """Return True when two xyxy boxes overlap.

    The comparisons are strict, so boxes that merely touch along an edge or
    at a corner count as intersecting.
    """
    a_left, a_top, a_right, a_bottom = a
    b_left, b_top, b_right, b_bottom = b
    overlap_x = not (a_right < b_left or b_right < a_left)
    overlap_y = not (a_bottom < b_top or b_bottom < a_top)
    return overlap_x and overlap_y
512
+
513
+
514
def _xyxy_union(
    a: tuple[int, int, int, int],
    b: tuple[int, int, int, int],
) -> tuple[int, int, int, int]:
    """Return the smallest xyxy box that contains both *a* and *b*."""
    left = min(a[0], b[0])
    top = min(a[1], b[1])
    right = max(a[2], b[2])
    bottom = max(a[3], b[3])
    return (left, top, right, bottom)
524
+
525
+
526
+
527
def _bubble_envelopes_from_blocks(
    blocks: list["TextBlock"],
    classes: list[str],
    *,
    original_width: int,
    config: NicknameOcrConfig,
    median_height: float,
) -> list[tuple[int, int, int, int]]:
    """Group consecutive ``bubble_text`` blocks into envelope bboxes.

    (Per the original note: a nickname row must not intersect these
    envelopes.)

    Blocks are scanned in list order. A run starts at a ``bubble_text``
    block and keeps absorbing the next block while that block is also
    ``bubble_text`` and either

    * ``_mergeable_bubble_fragments`` reports the pair as mergeable
      (returns something other than ``None``), or
    * both blocks are on the same screen half and the vertical gap between
      them is within ``0 .. config.bubble_vertical_gap_max_px``.

    Each run contributes the union of its member bboxes.
    """

    def _extends_envelope(upper: "TextBlock", lower: "TextBlock") -> bool:
        # All three signals are computed up front, then combined.
        fragment_merge = _mergeable_bubble_fragments(
            upper,
            lower,
            original_width=original_width,
            median_h=median_height,
            config=config,
        )
        upper_is_left = _is_on_left_half(
            upper, original_width, config.screen_midline_ratio
        )
        lower_is_left = _is_on_left_half(
            lower, original_width, config.screen_midline_ratio
        )
        vertical_gap = _block_top(lower) - _block_bottom(upper)
        if fragment_merge is not None:
            return True
        return (
            upper_is_left == lower_is_left
            and 0 <= vertical_gap <= int(config.bubble_vertical_gap_max_px)
        )

    result: list[tuple[int, int, int, int]] = []
    total = len(blocks)
    start = 0
    while start < total:
        if classes[start] != "bubble_text":
            start += 1
            continue
        envelope = blocks[start].bbox_xyxy
        cursor = start + 1
        while (
            cursor < total
            and classes[cursor] == "bubble_text"
            and _extends_envelope(blocks[cursor - 1], blocks[cursor])
        ):
            envelope = _xyxy_union(envelope, blocks[cursor].bbox_xyxy)
            cursor += 1
        result.append(envelope)
        # The block that ended the run (if any) seeds the next envelope.
        start = cursor
    return result
571
+
572
+
573
def _apply_prd_nickname_bubble_separation(
    blocks: list["TextBlock"],
    classes: list[str],
    *,
    original_width: int,
    config: NicknameOcrConfig,
    median_height: float,
) -> None:
    """PRD ch.9 7(3) nickname/bubble separation — disabled.

    Intentionally a no-op: per the original note this rule was replaced by
    bbox filtering in the bridge layer. The signature is kept so existing
    call sites continue to work; no argument is read or modified.
    """
    return None
583
+
584
+
585
def _nickname_passes_avatar_guard(
    *,
    bgr_guard: Any,
    nick_block: "TextBlock",
    avatar_layout: Any,
    scale_w: float,
    original_width: int,
    config: NicknameOcrConfig,
) -> bool:
    """PRD ch.9 7(2): image-based gate for a nickname row (either side).

    When no image is available (``bgr_guard is None``) or the optional
    ``processor.nickname_avatar_guard`` module cannot be imported, the
    row passes only if ``config.require_avatar_guard_for_nickname`` is off.
    Otherwise the verdict of ``avatar_roi_pass`` is returned, called with
    the nickname bbox, the side of the screen the block is on, its avatar
    anchor (if any) and the thresholds from ``_avatar_guard_kwargs``.
    """
    if bgr_guard is None:
        return not config.require_avatar_guard_for_nickname
    try:
        from processor.nickname_avatar_guard import avatar_roi_pass
    except ImportError:
        return not config.require_avatar_guard_for_nickname

    left_x, top_y, right_x, bottom_y = nick_block.bbox_xyxy
    on_left = _is_on_left_half(
        nick_block, original_width, config.screen_midline_ratio
    )
    side = "left" if on_left else "right"
    anchor = _avatar_anchor_for_block(nick_block, avatar_layout, scale_w=scale_w)
    verdict = avatar_roi_pass(
        bgr_guard,
        nickname_x1=int(left_x),
        nickname_x2=int(right_x),
        y_top=int(top_y),
        y_bottom=int(bottom_y),
        avatar_anchor=anchor,
        prd_strict=bool(config.use_prd_strict_avatar_guard),
        side=side,
        **_avatar_guard_kwargs(config),
    )
    return bool(verdict)
620
+
621
+
622
# PRD §7(5): whole-line placeholder keywords removed from bubble bodies
# ("[文件]" is deliberately NOT in the set).
# Added for PRD §6(4): also exclude "收藏" and "以下为新消息".
_PLACEHOLDER_EXACT_LINES: frozenset[str] = frozenset(
    {
        "收藏",
        "以下为新消息",
        "[语音]",
        "[表情]",
        "[动画表情]",
        "[图片]",
        "[视频]",
        "[位置]",
        "[链接]",
        "[红包]",
        "[转账]",
        "[撤回了一条消息]",
    }
)
638
+
639
+
640
def _strip_placeholder_lines_from_body(text: str) -> str:
    """Remove blank lines and whole-line placeholders from a bubble body.

    A line is dropped when, after stripping, it is empty or is exactly one
    of ``_PLACEHOLDER_EXACT_LINES``. Surviving lines keep their leading
    whitespace but lose trailing whitespace; the joined result is stripped.
    """
    if not text.strip():
        return ""
    survivors = [
        line.rstrip()
        for line in text.split("\n")
        if line.strip() and line.strip() not in _PLACEHOLDER_EXACT_LINES
    ]
    return "\n".join(survivors).strip()
652
+
653
+
654
def _apply_resume_thumb_block_mask(
    blocks: list["TextBlock"],
    classes: list[str],
    screenshot: "Screenshot",
    *,
    original_width: int = 1080,
    screen_midline_ratio: float = 0.45,
) -> None:
    """§7: force-drop OCR blocks that overlap a resume-note thumbnail bbox.

    Only blocks that are NOT on the left half are dropped (per the original
    note, the resume card body sits on the right). Left-half speaker bubbles
    keep their class so later nickname claiming and thumb binding can still
    use them. ``classes`` is modified in place; nothing is returned.

    Thumbnail boxes are read from ``screenshot.resume_thumb_bboxes`` when
    present; entries that are empty or have fewer than 4 components are
    ignored.
    """
    raw_boxes = getattr(screenshot, "resume_thumb_bboxes", None) or []
    thumb_boxes: list[tuple[int, int, int, int]] = [
        (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
        for box in raw_boxes
        if box and len(box) >= 4
    ]
    if not thumb_boxes:
        return

    for index, candidate in enumerate(blocks):
        bbox = candidate.bbox_xyxy
        overlaps_thumb = any(_xyxy_intersects(bbox, thumb) for thumb in thumb_boxes)
        if not overlaps_thumb:
            continue
        if _is_on_left_half(candidate, original_width, screen_midline_ratio):
            continue
        classes[index] = "drop"
679
+
680
+
681
def _load_prd_layout_optional(
    png_path: Optional[Path],
    screenshot: "Screenshot",
    *,
    config: NicknameOcrConfig,
) -> Any:
    """Build the PRD speaker vertical bands for a screenshot, if possible.

    Best effort: returns ``None`` when no PNG path is given, when
    ``config.use_prd_chat_vertical_bands`` is off, when the optional
    ``processor`` modules cannot be imported, or when the image fails to
    load / is empty. Otherwise the left and right avatar y-tops are
    detected, merged into one timeline, and turned into bands. The width
    scale is taken relative to a 1080px design width.
    """
    bands_enabled = config.use_prd_chat_vertical_bands if png_path is not None else False
    if not bands_enabled:
        return None
    try:
        from processor.chat_bubble_geometry_prd import (
            build_prd_speaker_vertical_bands,
            compute_chat_content_vertical_bounds,
            detect_chat_side_avatar_ytops,
            merge_avatar_ytops_time_order,
        )
        from processor.nickname_avatar_guard import load_png_bgr
    except ImportError:
        return None

    image = load_png_bgr(str(png_path))
    if image is None or getattr(image, "size", 0) == 0:
        return None

    native_h = screenshot.original_resolution.height
    native_w = screenshot.original_resolution.width
    width_scale = float(native_w) / 1080.0
    content_bounds = compute_chat_content_vertical_bounds(
        image, width_scale, screen_h=native_h
    )
    left_ytops, right_ytops = detect_chat_side_avatar_ytops(
        image, width_scale, content_bounds
    )
    avatar_timeline = merge_avatar_ytops_time_order(left_ytops, right_ytops)
    return build_prd_speaker_vertical_bands(
        avatar_timeline, content_bounds, scale_w=width_scale
    )
709
+
710
+
711
def _finalize_band_speaker_lookup(
    block_index: int,
    blocks: Optional[list["TextBlock"]],
    layout_opt: Any,
    band_nick: dict[int, str],
) -> str:
    """Return the nickname of the first band overlapping a block, or ``""``.

    The block's vertical extent is tested against each ``(y0, y1)`` pair in
    ``layout_opt.speaker_bands``; only the first overlapping band is
    considered. An unknown or out-of-range ``block_index`` yields ``""``.
    """
    if not band_nick or blocks is None or layout_opt is None:
        return ""
    # Guard against stale indices instead of raising IndexError.
    if not 0 <= block_index < len(blocks):
        return ""
    bbox = blocks[block_index].bbox_xyxy
    block_top, block_bottom = float(bbox[1]), float(bbox[3])
    for band_ix, (band_y0, band_y1) in enumerate(
        getattr(layout_opt, "speaker_bands", ())
    ):
        if block_bottom > float(band_y0) and block_top < float(band_y1):
            return band_nick.get(band_ix, "")
    return ""


def _finalize_prd_attribution(
    attr: ChatSpeakerAttribution,
    *,
    config: NicknameOcrConfig,
    blocks: Optional[list["TextBlock"]] = None,
    layout_opt: Any = None,
    scale_w: float = 1.0,
) -> ChatSpeakerAttribution:
    """§7 post-processing of a raw attribution.

    Steps, in order:

    0. Pre-compute ``band_index -> first nickname`` (only when both
       ``layout_opt`` and ``blocks`` are given) as the orphan fallback.
    1. Strip whole-line placeholders from each segment body, then discard
       segments whose stripped body is empty or shorter than
       ``config.min_bubble_body_chars``.
    2. Resolve left-side orphans (``orphan`` set and blank display): inherit
       the nearest nickname seen above; failing that (PRD §5) use the
       nickname of the band the segment's block falls into; failing that,
       label the segment ``SPEAKER_DISPLAY_UNATTRIBUTED``. Owner segments
       and already-attributed segments pass through unchanged.
    3. Recompute all bubble counters from the surviving segments.

    Every field of ``attr`` that is not recomputed here (including
    ``ocr_debug``) is carried over to the result. ``scale_w`` is accepted
    for interface compatibility and is not used.
    """
    # 0) band_index -> nickname of the first speaker found in that band.
    band_nick: dict[int, str] = {}
    if layout_opt is not None and blocks is not None:
        for nick in attr.nicknames:
            if nick.band_index is not None:
                band_nick.setdefault(nick.band_index, nick.nickname.strip())

    # 1) Placeholder removal + minimum body length gate.
    min_chars = max(0, int(config.min_bubble_body_chars))
    pruned: list[SpeakerBodySegment] = []
    for seg in attr.speaker_body_segments:
        body = _strip_placeholder_lines_from_body(seg.body_text)
        body_len = len(body.strip())
        if body_len == 0 or body_len < min_chars:
            continue
        pruned.append(dataclass_replace(seg, body_text=body))

    # 2) Left-side orphan resolution (right-side owner segments untouched).
    non_nickname_labels = (_OWNER_SPEAKER_LABEL, SPEAKER_DISPLAY_UNATTRIBUTED)
    last_left = ""
    resolved: list[SpeakerBodySegment] = []
    for seg in pruned:
        if seg.speaker_display == _OWNER_SPEAKER_LABEL:
            resolved.append(seg)
            continue

        display = (seg.speaker_display or "").strip()
        if not (seg.orphan and not display):
            resolved.append(seg)
            # Remember real nicknames so later orphans can inherit them.
            if display and display not in non_nickname_labels:
                last_left = display
            continue

        inherited = last_left
        if not inherited and seg.block_index >= 0:
            # PRD §5: a block is attributed to the speaker of the band it enters.
            inherited = _finalize_band_speaker_lookup(
                seg.block_index, blocks, layout_opt, band_nick
            )
        if inherited:
            resolved.append(
                dataclass_replace(seg, speaker_display=inherited, orphan=False)
            )
            last_left = inherited
        else:
            resolved.append(
                dataclass_replace(
                    seg,
                    speaker_display=SPEAKER_DISPLAY_UNATTRIBUTED,
                    orphan=True,
                )
            )

    # 3) Counters reflect the filtered, resolved segments only.
    bubbles_owner = 0
    bubbles_unknown = 0
    bubbles_with_nickname = 0
    for seg in resolved:
        if seg.speaker_display == _OWNER_SPEAKER_LABEL:
            bubbles_owner += 1
        final_display = (seg.speaker_display or "").strip()
        if final_display == SPEAKER_DISPLAY_UNATTRIBUTED:
            bubbles_unknown += 1
        if (
            not seg.orphan
            and final_display
            and final_display not in non_nickname_labels
        ):
            bubbles_with_nickname += 1

    # ``replace`` keeps untouched fields (ocr_debug, resume_thumb_*, ...).
    return dataclass_replace(
        attr,
        total_bubbles=len(resolved),
        bubbles_with_nickname=bubbles_with_nickname,
        bubbles_owner=bubbles_owner,
        bubbles_unknown=bubbles_unknown,
        speaker_body_segments=tuple(resolved),
    )
838
+
839
+
840
def _resume_thumb_bindings_and_orphans(
    screenshot: "Screenshot",
    blocks: list["TextBlock"],
    classes: list[str],
    claimed_indices: set[int],
    nicknames: tuple[NicknameExtraction, ...],
    *,
    original_width: int,
    config: NicknameOcrConfig,
    layout_opt: Any = None,
    scale_w: float = 1.0,
    image_size: tuple[int, int] | None = None,
) -> tuple[tuple[ResumeThumbBinding, ...], int, set[int]]:
    """Bind each resume-card bbox to a nickname and count unbound cards (PRD §7(2)).

    For every card in ``screenshot.resume_thumb_bboxes``:

    A. Pick the card's speaker band: the last band in
       ``layout_opt.speaker_bands`` whose y-range intersects the card.
    B. Among text blocks that intersect that band (and do not intersect the
       band below it), skip system/time/label lines and blocks lying inside
       the card, then take the block with the smallest bottom edge.
    C. If B found nothing, fall back to the acceptable block whose bottom edge
       is the closest one above the card top.  When the card has a band, the
       fallback is rejected if that block ends more than ``50 * scale_w`` px
       above the band top.  An accepted fallback block is recorded in
       ``excluded_indices``.

    Args:
        screenshot: Source of ``resume_thumb_bboxes`` and, when rescaling,
            ``original_resolution``.
        blocks: OCR text blocks searched for nickname lines.
        classes: Not read here; kept for interface compatibility.
        claimed_indices: Not read here; kept for interface compatibility.
        nicknames: Not read here; kept for interface compatibility.
        original_width: Not read here; kept for interface compatibility.
        config: Not read here; kept for interface compatibility.
        layout_opt: Optional layout object providing ``speaker_bands``.
        scale_w: Scale factor applied to the pixel tolerances.
        image_size: ``(width, height)`` of the OCR image.  When given, card
            bboxes are rescaled from ``original_resolution`` to this size.

    Returns:
        ``(bindings, orphan_count, excluded_indices)``.  ``excluded_indices``
        holds the block indices consumed by step C; the caller is expected to
        mark those blocks as ``drop``.
    """
    raws = getattr(screenshot, "resume_thumb_bboxes", None) or []
    if not raws:
        return (), 0, set()

    # Rescale card bboxes from the phone's native resolution to OCR image space.
    if image_size is not None:
        img_w, img_h = float(image_size[0]), float(image_size[1])
        native_w = float(screenshot.original_resolution.width)
        native_h = float(screenshot.original_resolution.height)
        sx = img_w / native_w if native_w > 0 else 1.0
        sy = img_h / native_h if native_h > 0 else 1.0
        if abs(sx - 1.0) > 0.0001 or abs(sy - 1.0) > 0.0001:
            raws = [
                [raw[0] * sx, raw[1] * sy, raw[2] * sx, raw[3] * sy]
                for raw in raws
                if raw and len(raw) >= 4
            ]

    # OCR and card bboxes each jitter by a few pixels, so an in-card title row
    # can start slightly above the card top.  A strict containment test would
    # treat it as outside text and bind it as a nickname.  Real nickname rows
    # sit entirely above the card, so this tolerance does not affect them.
    inside_tol = 4.0 * max(scale_w, 1.0)

    def _completely_inside(block_bbox: tuple, card_bbox: tuple) -> bool:
        bx1, by1, bx2, by2 = block_bbox
        cx1, cy1, cx2, cy2 = card_bbox
        # Test the block's centre x rather than both edges: a long line may
        # stick out past the card's right side and still belong to the card.
        center_x = (bx1 + bx2) / 2.0
        return (
            cx1 <= center_x <= cx2
            and by1 >= cy1 - inside_tol
            and by2 <= cy2 + inside_tol
        )

    def _should_skip_block(b: "TextBlock") -> bool:
        t = (b.text or "").strip()
        if not t:
            return True
        if t == "收藏" or t == "以下为新消息":
            return True
        # NOTE(review): both prefixes below read identically in this view;
        # presumably one used a full-width colon - confirm against the repo.
        if t.startswith("姓名:") or t.startswith("姓名:"):
            return True
        # "N new messages" system hints, including the OCR fragment "N条".
        if re.fullmatch(r"\d+条", t) or "条新消息" in t:
            return True
        if _is_pure_time(t) or _is_pure_digits(t) or _is_chat_timestamp_or_divider(t):
            return True
        if _is_favorite_card_label(t):
            return True
        return False

    # Fetch the bands once; every step below uses the same sequence.
    speaker_bands = (
        getattr(layout_opt, "speaker_bands", ()) if layout_opt is not None else ()
    )

    bindings: list[ResumeThumbBinding] = []
    orphan = 0
    excluded_indices: set[int] = set()

    for raw in raws:
        if not raw or len(raw) < 4:
            continue
        tbb = (int(raw[0]), int(raw[1]), int(raw[2]), int(raw[3]))
        card_top, card_bottom = float(tbb[1]), float(tbb[3])

        # Step A: the last intersecting band wins, so a card straddling two
        # bands is assigned to the lower one.
        band_idx: int | None = None
        for bi, (b0, b1) in enumerate(speaker_bands):
            if card_bottom > float(b0) and card_top < float(b1):
                band_idx = bi

        matched_nickname: str | None = None

        # Step B: look for the nickname line inside the card's band.
        if band_idx is not None:
            band_y0 = float(speaker_bands[band_idx][0])
            band_y1 = float(speaker_bands[band_idx][1])
            below_band: tuple[float, float] | None = None
            if band_idx + 1 < len(speaker_bands):
                nb0, nb1 = speaker_bands[band_idx + 1]
                below_band = (float(nb0), float(nb1))

            best_y = float("inf")
            for b in blocks:
                bb = b.bbox_xyxy
                by1, by2 = float(bb[1]), float(bb[3])

                # Rule (0): the block must intersect the current band.
                if not (by2 > band_y0 and by1 < band_y1):
                    continue
                # PRD §8 rules (1) and (2): a block intersecting the band
                # below is skipped whether or not it also touches the next
                # card.  Rule (3), touching only the next card, keeps it.
                if (
                    below_band is not None
                    and by2 > below_band[0]
                    and by1 < below_band[1]
                ):
                    continue
                if _should_skip_block(b):
                    continue
                # Text lying inside the card is card body, not a nickname.
                if _completely_inside(bb, tbb):
                    continue
                # Keep the candidate whose bottom edge is highest on screen.
                if by2 < best_y:
                    best_y = by2
                    matched_nickname = (b.text or "").strip()

        # Step C: fall back to the nearest acceptable line above the card.
        if matched_nickname is None:
            best_i: int | None = None
            best_bottom = -1.0
            for i, b in enumerate(blocks):
                block_bottom = float(b.bbox_xyxy[3])
                if block_bottom >= card_top:
                    continue
                if _should_skip_block(b):
                    continue
                if block_bottom > best_bottom:
                    best_bottom = block_bottom
                    best_i = i

            if best_i is not None:
                use_as_nickname = True
                if band_idx is not None:
                    # Positive gap means the block ends above the band top.
                    gap = float(speaker_bands[band_idx][0]) - best_bottom
                    if gap > 50.0 * float(scale_w):
                        use_as_nickname = False
                if use_as_nickname:
                    matched_nickname = (blocks[best_i].text or "").strip()
                    # PRD §7(2): this line is removed from its region's content.
                    excluded_indices.add(best_i)

        if matched_nickname:
            bindings.append(ResumeThumbBinding(thumb_bbox=tbb, nickname=matched_nickname))
        else:
            orphan += 1

    return tuple(bindings), orphan, excluded_indices
1077
+
1078
+
1079
def _merge_attrib_line_from_seed(
    blocks: list["TextBlock"],
    seed_idx: int,
    *,
    original_width: int,
    cfg: NicknameOcrConfig,
    median_height: float,
) -> Optional[Tuple[str, tuple[int, int, int, int]]]:
    """Merge the nickname row that starts at ``seed_idx`` in reading order.

    Walks forward from the seed block and stops at the first block that is
    empty, below ``cfg.min_confidence``, not on the left half, more than 30 px
    off the seed's vertical centre, a time/digit/divider/favourite-label line,
    a dropped system prefix, or (after the seed) taller than
    ``1.12 * median_height``.

    Returns:
        ``(text, bbox)`` with the stripped texts concatenated and the union
        bbox as ints, or ``None`` when nothing was collected.
    """
    row_tolerance_px = 30
    seed_center_y = _vertical_center(blocks[seed_idx])

    row: list["TextBlock"] = []
    for pos in range(seed_idx, len(blocks)):
        cand = blocks[pos]
        text = cand.text.strip()
        if not text or cand.confidence < cfg.min_confidence:
            break
        if not _is_on_left_half(cand, original_width, cfg.screen_midline_ratio):
            break
        if abs(_vertical_center(cand) - seed_center_y) > row_tolerance_px:
            break
        if (
            _is_pure_time(text)
            or _is_pure_digits(text)
            or _is_chat_timestamp_or_divider(text)
            or _is_favorite_card_label(text)
            or _starts_with_system_prefix(text, _ATTRIBUTION_PREFIX_DROPS)
        ):
            break
        # The height test applies to followers only, and only when the page
        # median height is above 2.0.
        if (
            pos > seed_idx
            and median_height > 2.0
            and _block_height(cand) > median_height * 1.12
        ):
            break
        row.append(cand)

    if not row:
        return None

    boxes = [member.bbox_xyxy for member in row]
    left = min(box[0] for box in boxes)
    top = min(box[1] for box in boxes)
    right = max(box[2] for box in boxes)
    bottom = max(box[3] for box in boxes)

    merged_text = "".join(member.text.strip() for member in row)
    if not merged_text:
        return None
    return merged_text, (int(left), int(top), int(right), int(bottom))
1120
+
1121
+
1122
def _iter_first_attrib_seed_indices(
    blocks: list["TextBlock"],
    *,
    original_width: int,
    cfg: NicknameOcrConfig,
    median_height: float,
    avatar_layout: Any = None,
) -> list[int]:
    """List, in reading order, the block indices usable as first-attribution seeds.

    A block qualifies when its text is non-empty, its confidence reaches
    ``cfg.min_confidence``, it is not a time/digit/divider/favourite-label
    line or a dropped system prefix, it lies on the left half, and it passes
    ``_is_avatar_column_nickname_row``.  ``median_height`` is not read here.
    """

    def _qualifies(candidate: "TextBlock") -> bool:
        text = candidate.text.strip()
        if not text or candidate.confidence < cfg.min_confidence:
            return False
        if (
            _is_pure_time(text)
            or _is_pure_digits(text)
            or _is_chat_timestamp_or_divider(text)
        ):
            return False
        if _is_favorite_card_label(text):
            return False
        if _starts_with_system_prefix(text, _ATTRIBUTION_PREFIX_DROPS):
            return False
        if not _is_on_left_half(candidate, original_width, cfg.screen_midline_ratio):
            return False
        return bool(
            _is_avatar_column_nickname_row(candidate, original_width, cfg, avatar_layout)
        )

    return [pos for pos, candidate in enumerate(blocks) if _qualifies(candidate)]
1148
+
1149
+
1150
def first_attrib_verbatim_display_line_and_bbox(
    ocr_result: "OcrPageResult",
    screenshot: "Screenshot",
    *,
    config: Optional[NicknameOcrConfig] = None,
    png_path: Optional["Path"] = None,
) -> Optional[Tuple[str, tuple[int, int, int, int]]]:
    """Return the first left-column nickname line that passes the avatar guard.

    Geometric seed candidates are visited in reading order.  Each is merged
    into a row, and the first row whose seed block passes
    ``_nickname_passes_avatar_guard`` is returned as ``(text, bbox)``.
    Returns ``None`` when the page has no blocks or no candidate passes.

    Raises:
        AssertionError: if ``screenshot.type`` is not ``"chat_message"`` or
            the two ``screenshot_id`` values differ.
    """
    cfg = config if config else NicknameOcrConfig()
    assert screenshot.type == "chat_message", (
        "first_attrib_verbatim_display_line expects chat_message type; got "
        f"{screenshot.type!r}"
    )
    assert ocr_result.screenshot_id == screenshot.screenshot_id, (
        f"screenshot_id mismatch ocr_result={ocr_result.screenshot_id!r} vs "
        f"screenshot={screenshot.screenshot_id!r}"
    )

    page_blocks = list(ocr_result.text_blocks)
    if not page_blocks:
        return None

    width = screenshot.original_resolution.width
    positive_heights = [h for h in map(_block_height, page_blocks) if h > 0]
    typical_height = float(median(positive_heights)) if positive_heights else 1.0

    avatar_layout = _load_left_avatar_layout(png_path, screenshot)
    scale_w = float(width) / 1080.0

    guard_image = None
    if png_path is not None:
        # NOTE(review): the loader comes from ``processor.*`` and an
        # ImportError is swallowed, leaving the guard with no image - confirm
        # this import path is still valid for this package.
        try:
            from processor.nickname_avatar_guard import load_png_bgr

            guard_image = load_png_bgr(str(png_path))
        except ImportError:
            guard_image = None

    seed_positions = _iter_first_attrib_seed_indices(
        page_blocks,
        original_width=width,
        cfg=cfg,
        median_height=typical_height,
        avatar_layout=avatar_layout,
    )
    for seed_pos in seed_positions:
        row = _merge_attrib_line_from_seed(
            page_blocks,
            seed_pos,
            original_width=width,
            cfg=cfg,
            median_height=typical_height,
        )
        if row is None:
            continue
        row_text, row_bbox = row
        if _nickname_passes_avatar_guard(
            bgr_guard=guard_image,
            nick_block=page_blocks[seed_pos],
            avatar_layout=avatar_layout,
            scale_w=scale_w,
            original_width=width,
            config=cfg,
        ):
            return row_text, row_bbox

    return None
1216
+
1217
+
1218
def first_attrib_verbatim_display_line(
    ocr_result: "OcrPageResult",
    screenshot: "Screenshot",
    *,
    config: Optional[NicknameOcrConfig] = None,
    png_path: Optional["Path"] = None,
) -> Optional[str]:
    """Return only the text of the first left-column attribution nickname line.

    Thin wrapper over ``first_attrib_verbatim_display_line_and_bbox`` that
    discards the bbox.  Returns ``None`` when no line is found.
    """
    located = first_attrib_verbatim_display_line_and_bbox(
        ocr_result,
        screenshot,
        config=config,
        png_path=png_path,
    )
    if not located:
        return None
    return located[0]
1234
+
1235
+
1236
def _classify(
    block: "TextBlock",
    *,
    original_width: int,
    config: NicknameOcrConfig,
    median_height: float,
    avatar_layout: Any = None,
) -> str:
    """Classify one OCR block as ``"nickname_candidate"``, ``"bubble_text"`` or ``"drop"``.

    ``"drop"`` covers empty text, confidence below ``config.min_confidence``,
    and time/digit/divider/favourite-label lines.  A nickname candidate is a
    short left-half line without a system prefix that passes
    ``_is_avatar_column_nickname_row``.  Everything else is bubble text.
    ``median_height`` is not read here.
    """
    text = block.text.strip()
    if not text:
        return "drop"
    if block.confidence < config.min_confidence:
        return "drop"

    noise_checks = (
        _is_pure_time,
        _is_pure_digits,
        _is_chat_timestamp_or_divider,
        _is_favorite_card_label,
    )
    if any(check(text) for check in noise_checks):
        return "drop"

    on_left = _is_on_left_half(block, original_width, config.screen_midline_ratio)

    # Each failed nickname precondition falls through to bubble text.
    if len(text) > config.nickname_max_chars:
        return "bubble_text"
    if _starts_with_system_prefix(text, config.system_prefix_drops):
        return "bubble_text"
    if not on_left:
        return "bubble_text"
    if _is_avatar_column_nickname_row(block, original_width, config, avatar_layout):
        return "nickname_candidate"
    return "bubble_text"
1271
+
1272
+
1273
def _merge_two_text_blocks(a: TextBlock, b: TextBlock, *, joiner: str) -> TextBlock:
    """Fuse two OCR blocks into one covering both bboxes.

    The stripped texts are joined with a newline when ``joiner`` is ``"\\n"``
    and with nothing for any other ``joiner``.  When ``a`` has no text the
    result is just ``b``'s text.  The confidence is the lower of the two.
    """
    a_left, a_top, a_right, a_bottom = a.bbox_xyxy
    b_left, b_top, b_right, b_bottom = b.bbox_xyxy
    union_box = (
        int(min(a_left, b_left)),
        int(min(a_top, b_top)),
        int(max(a_right, b_right)),
        int(max(a_bottom, b_bottom)),
    )

    head = a.text.strip()
    tail = b.text.strip()
    if not head:
        merged_text = tail
    else:
        separator = "\n" if joiner == "\n" else ""
        merged_text = head + separator + tail

    # NOTE(review): ``or`` treats a read index of 0 the same as a missing one
    # and falls back to ``b``'s value - confirm that is intended.
    return TextBlock(
        text=merged_text,
        bbox_xyxy=union_box,
        confidence=min(a.confidence, b.confidence),
        line_read_index=a.line_read_index or b.line_read_index,
        paragraph_read_index=a.paragraph_read_index or b.paragraph_read_index,
    )
1291
+
1292
+
1293
def _horizontal_gap_left_to_right(
    a_xyxy: tuple[int, int, int, int],
    b_xyxy: tuple[int, int, int, int],
) -> int:
    """Return the horizontal gap from ``a``'s right edge to ``b``'s left edge.

    The gap is non-negative when ``a`` ends at or before ``b`` starts
    (``a.x2 <= b.x1``); in every other arrangement the result is ``0``.
    """
    gap = b_xyxy[0] - a_xyxy[2]
    return gap if gap >= 0 else 0
1300
+
1301
+
1302
def _vertical_axis_overlap_ratio(
    a_xyxy: tuple[int, int, int, int],
    b_xyxy: tuple[int, int, int, int],
) -> float:
    """Return the shared y-extent of two boxes relative to the shorter box.

    The result is ``0.0`` when the y-ranges do not overlap.  Box heights are
    clamped to at least 1 before dividing.
    """
    a_top, a_bottom = a_xyxy[1], a_xyxy[3]
    b_top, b_bottom = b_xyxy[1], b_xyxy[3]

    shared = min(a_bottom, b_bottom) - max(a_top, b_top)
    if shared <= 0:
        return 0.0

    shorter = min(max(1, a_bottom - a_top), max(1, b_bottom - b_top))
    return float(shared) / float(shorter)
1311
+
1312
+
1313
def _mergeable_bubble_fragments(
    a: TextBlock,
    b: TextBlock,
    *,
    original_width: int,
    median_h: float,
    config: NicknameOcrConfig,
) -> Optional[str]:
    """Decide whether two bubble fragments belong on one line (PRD §7(3)).

    Returns ``""`` (concatenate horizontally) when both blocks are on the same
    screen half, both reach ``config.min_confidence``, their vertical centres
    are within ``config.bubble_fragment_same_line_y_tol_px``, the horizontal
    gap is within the larger of ``config.bubble_fragment_horizontal_gap_max_px``
    and ``1.15 * median_h``, and they either overlap vertically by at least
    0.22 or have a gap of 0.  Returns ``None`` otherwise.  This implementation
    never returns a newline joiner.
    """
    a_on_left = _is_on_left_half(a, original_width, config.screen_midline_ratio)
    b_on_left = _is_on_left_half(b, original_width, config.screen_midline_ratio)
    if a_on_left != b_on_left:
        return None
    if any(frag.confidence < config.min_confidence for frag in (a, b)):
        return None

    same_line_tol = float(config.bubble_fragment_same_line_y_tol_px)
    center_distance = abs(_vertical_center(a) - _vertical_center(b))
    if not (center_distance <= same_line_tol):
        return None

    gap = _horizontal_gap_left_to_right(a.bbox_xyxy, b.bbox_xyxy)
    gap_limit = max(
        float(config.bubble_fragment_horizontal_gap_max_px),
        median_h * 1.15,
    )
    if not (gap <= gap_limit):
        return None
    if _vertical_axis_overlap_ratio(a.bbox_xyxy, b.bbox_xyxy) >= 0.22 or gap == 0:
        return ""
    return None
1346
def _prd_merge_adjacent_bubble_text_blocks(
    blocks: list[TextBlock],
    *,
    original_width: int,
    config: NicknameOcrConfig,
) -> list[TextBlock]:
    """Merge consecutive OCR fragments of one bubble before classification (PRD §7(3)).

    Only runs of adjacent ``bubble_text`` blocks are merged, and only while
    ``_mergeable_bubble_fragments`` approves the next fragment.  All other
    blocks are passed through in order.  A list with fewer than two blocks is
    returned as the same object.
    """
    if len(blocks) < 2:
        return blocks

    positive_heights = [h for h in map(_block_height, blocks) if h > 0]
    median_h = float(median(positive_heights)) if positive_heights else 1.0

    # NOTE(review): classification here runs without an avatar layout, so it
    # may differ from a later pass that supplies one - confirm intended.
    kinds = [
        _classify(
            blk,
            original_width=original_width,
            config=config,
            median_height=median_h,
        )
        for blk in blocks
    ]

    merged: list[TextBlock] = []
    pending: Optional[TextBlock] = None  # bubble run being accumulated
    for blk, kind in zip(blocks, kinds):
        if kind != "bubble_text":
            if pending is not None:
                merged.append(pending)
                pending = None
            merged.append(blk)
            continue
        if pending is None:
            pending = blk
            continue
        joiner = _mergeable_bubble_fragments(
            pending,
            blk,
            original_width=original_width,
            median_h=median_h,
            config=config,
        )
        if joiner is None:
            # Not the same line: close the run and start a new one here.
            merged.append(pending)
            pending = blk
        else:
            pending = _merge_two_text_blocks(pending, blk, joiner=joiner)

    if pending is not None:
        merged.append(pending)
    return merged
1393
+
1394
+
1395
def _join_segment_body_lines(blocks: list["TextBlock"], indices: tuple[int, ...]) -> str:
    """Join the stripped texts of ``blocks[i]`` for ``i`` in ``indices`` with newlines.

    Blocks whose stripped text is empty contribute no line.
    """
    lines: list[str] = []
    for idx in indices:
        stripped = blocks[idx].text.strip()
        if stripped:
            lines.append(stripped)
    return "\n".join(lines)
1398
+
1399
+
1400
def _speaker_body_segments_from_layout(
    blocks: list["TextBlock"],
    classes: list[str],
    nicknames: tuple[NicknameExtraction, ...],
    claimed_indices: set[int],
    *,
    original_width: int,
    config: NicknameOcrConfig,
) -> tuple[SpeakerBodySegment, ...]:
    """Build this page's ``SpeakerBodySegment`` tuple in block-index order.

    Each nickname yields one segment holding its followers' text; the
    nickname block itself is not part of the body.  Every unclaimed
    ``bubble_text`` block yields its own segment: an orphan with an empty
    speaker when it is on the left half, otherwise an owner segment.  Bodies
    shorter than ``config.min_bubble_body_chars`` are skipped.
    """
    shortest_body = max(0, int(config.min_bubble_body_chars))
    keyed: list[tuple[int, SpeakerBodySegment]] = []

    # Segments led by a nickname.
    for nick in nicknames:
        follower_lines = (blocks[i].text.strip() for i in nick.following_block_indices)
        body = "\n".join(line for line in follower_lines if line)
        trimmed = body.strip()
        if not trimmed or len(trimmed) < shortest_body:
            continue
        anchor = int(nick.nickname_block_index)
        if anchor < 0:
            continue
        segment = SpeakerBodySegment(
            speaker_display=nick.nickname.strip(),
            body_text=body,
            orphan=False,
            block_index=anchor,
        )
        keyed.append((anchor, segment))

    # Bubble text that no nickname claimed.
    for pos, kind in enumerate(classes):
        if kind != "bubble_text" or pos in claimed_indices:
            continue
        text = blocks[pos].text.strip()
        if len(text) < shortest_body:
            continue
        on_left = bool(
            _is_on_left_half(blocks[pos], original_width, config.screen_midline_ratio)
        )
        segment = SpeakerBodySegment(
            speaker_display="" if on_left else _OWNER_SPEAKER_LABEL,
            body_text=text,
            orphan=on_left,
            block_index=pos,
        )
        keyed.append((pos, segment))

    keyed.sort(key=lambda pair: pair[0])
    return tuple(segment for _, segment in keyed)
1470
+
1471
+
1472
+ # ============================================================================
1473
+ # Public API
1474
+ # ============================================================================
1475
+
1476
+
1477
def _build_ocr_debug_dict(
    screenshot: "Screenshot",
    blocks: list["TextBlock"],
    classes: list[str],
    layout_opt: Any,
    scale_w: float,
    image_size: tuple[int, int],
) -> dict[str, Any]:
    """Collect the per-frame OCR attribution debug data as a plain dict.

    All coordinates are expressed in OCR image space (``image_size``).
    Values taken from ``screenshot`` (click coordinates, resume-card bboxes)
    are in ``screenshot.original_resolution`` space and are rescaled when the
    two sizes differ.  The ``scale_w`` argument is not read; the reported
    ``"scale_w"`` is derived from the image width.
    """
    img_w, img_h = int(image_size[0]), int(image_size[1])
    native_w = float(screenshot.original_resolution.width)
    native_h = float(screenshot.original_resolution.height)
    sx = img_w / native_w if native_w > 0 else 1.0
    sy = img_h / native_h if native_h > 0 else 1.0
    needs_rescale = abs(sx - 1.0) > 0.0001 or abs(sy - 1.0) > 0.0001

    def _to_ocr_space(raw: Any) -> list[int]:
        # Metadata bboxes are rescaled only when the sizes actually differ.
        if needs_rescale:
            return [
                int(raw[0] * sx),
                int(raw[1] * sy),
                int(raw[2] * sx),
                int(raw[3] * sy),
            ]
        return [int(value) for value in raw]

    click_box: Any = None
    click_ctx = getattr(screenshot, "click_context", None)
    if click_ctx is not None:
        click_coords = getattr(click_ctx, "click_coords", None)
        if click_coords and len(click_coords) >= 2:
            click_x = int(click_coords[0] * sx)
            click_y = int(click_coords[1] * sy)
            click_box = [click_x, click_y, click_x + 2, click_y + 2]

    card_boxes = [
        _to_ocr_space(raw)
        for raw in (getattr(screenshot, "resume_thumb_bboxes", None) or [])
        if raw and len(raw) >= 4
    ]

    # Empirically, resume cards are normally 260-279 px tall (87% of samples,
    # median 273).
    debug: dict[str, Any] = {
        "screen_size": {"width": img_w, "height": img_h},
        "scale_w": round(float(img_w) / 1080.0, 4),
        "user_click_area_scaling": click_box,
        "resume_thumb_bboxes_scaling": card_boxes,
    }

    bands: Any = ()
    if layout_opt is None:
        # Without a layout only the block list carries information.
        debug["orphan_top_band"] = None
        debug["speaker_bands"] = []
        debug["content_bounds"] = None
        debug["no_avatars_all_orphan"] = True
    else:
        bands = getattr(layout_opt, "speaker_bands", ())
        top_band = layout_opt.orphan_top_band
        debug["orphan_top_band"] = (
            None if top_band is None else [int(top_band[0]), int(top_band[1])]
        )
        debug["speaker_bands"] = [[int(b0), int(b1)] for b0, b1 in bands]
        bounds = layout_opt.content_bounds
        debug["content_bounds"] = {
            "y_top": int(bounds.y_top),
            "y_bottom_excl": int(bounds.y_bottom_excl),
        }
        debug["no_avatars_all_orphan"] = bool(
            getattr(layout_opt, "no_avatars_all_orphan", False)
        )

    def _band_of(bbox: Any) -> Any:
        # Index of the first band whose y-range intersects the block.
        if layout_opt is None:
            return None
        top, bottom = float(bbox[1]), float(bbox[3])
        for band_no, (b0, b1) in enumerate(bands):
            if bottom > float(b0) and top < float(b1):
                return band_no
        return None

    debug["blocks"] = [
        {
            "bbox_xyxy": [int(value) for value in blk.bbox_xyxy],
            "text": (blk.text or "").strip(),
            "confidence": round(float(blk.confidence), 4),
            "class": classes[pos] if pos < len(classes) else "unknown",
            "band": _band_of(blk.bbox_xyxy),
        }
        for pos, blk in enumerate(blocks)
    ]
    return debug
1575
+
1576
+
1577
+ def extract_nicknames(
1578
+ ocr_result: "OcrPageResult",
1579
+ screenshot: "Screenshot",
1580
+ *,
1581
+ config: Optional[NicknameOcrConfig] = None,
1582
+ png_path: Optional[Path] = None,
1583
+ ) -> ChatSpeakerAttribution:
1584
+ """Simplest viable nickname extraction from one chat-frame OCR result.
1585
+
1586
+ Preconditions:
1587
+ - ``screenshot.type == "chat_message"`` (caller must enforce;
1588
+ we assert).
1589
+ - ``ocr_result.screenshot_id == screenshot.screenshot_id``
1590
+ (we assert — defensive: catches accidental list mismatches
1591
+ in the caller's loop).
1592
+
1593
+ Returns:
1594
+ A frozen :class:`ChatSpeakerAttribution`(计数与段落经 PRD §7 后处理,见
1595
+ :func:`_finalize_prd_attribution`)。
1596
+
1597
+ Side effects / I/O:
1598
+ - 当 ``NicknameOcrConfig.use_prd_chat_vertical_bands`` 为真且提供
1599
+ ``png_path`` 时会读 PNG 做术语§5 纵向筒分段(否则退化为 OCR 间隙启发式)。
1600
+ - 仍为单帧纯函数语义:不写外部存储、不写日志。
1601
+ """
1602
+ assert screenshot.type == "chat_message", (
1603
+ f"extract_nicknames expects chat_message type; got "
1604
+ f"{screenshot.type!r} for screenshot_id={screenshot.screenshot_id!r}"
1605
+ )
1606
+ assert ocr_result.screenshot_id == screenshot.screenshot_id, (
1607
+ f"screenshot_id mismatch: ocr_result={ocr_result.screenshot_id!r} "
1608
+ f"vs screenshot={screenshot.screenshot_id!r}"
1609
+ )
1610
+
1611
+ cfg = config or NicknameOcrConfig()
1612
+ blocks = list(ocr_result.text_blocks)
1613
+
1614
+ if not blocks:
1615
+ return ChatSpeakerAttribution(
1616
+ screenshot_id=screenshot.screenshot_id,
1617
+ resume_thumb_orphans=0,
1618
+ speaker_body_segments=(),
1619
+ )
1620
+
1621
+ original_width = screenshot.original_resolution.width
1622
+ scale_w = float(original_width) / 1080.0
1623
+ layout_opt = _load_prd_layout_optional(png_path, screenshot, config=cfg)
1624
+ avatar_layout = _load_left_avatar_layout(png_path, screenshot)
1625
+
1626
+ # ── y 位置门控:丢弃聊天内容区上界以上 / 下界以下的 OCR 文本块 ──
1627
+ if layout_opt is not None:
1628
+ cb = layout_opt.content_bounds
1629
+ blocks = [
1630
+ b
1631
+ for b in blocks
1632
+ if cb.y_top <= _vertical_center(b) < cb.y_bottom_excl
1633
+ ]
1634
+ if not blocks:
1635
+ return ChatSpeakerAttribution(
1636
+ screenshot_id=screenshot.screenshot_id,
1637
+ resume_thumb_orphans=0,
1638
+ speaker_body_segments=(),
1639
+ )
1640
+
1641
+ blocks = _prd_merge_adjacent_bubble_text_blocks(
1642
+ blocks,
1643
+ original_width=original_width,
1644
+ config=cfg,
1645
+ )
1646
+ if not blocks:
1647
+ return ChatSpeakerAttribution(
1648
+ screenshot_id=screenshot.screenshot_id,
1649
+ resume_thumb_orphans=0,
1650
+ speaker_body_segments=(),
1651
+ )
1652
+
1653
+ heights = [_block_height(b) for b in blocks if _block_height(b) > 0]
1654
+ median_height = float(median(heights)) if heights else 1.0
1655
+
1656
+ classes = [
1657
+ _classify(
1658
+ b,
1659
+ original_width=original_width,
1660
+ config=cfg,
1661
+ median_height=median_height,
1662
+ avatar_layout=avatar_layout,
1663
+ )
1664
+ for b in blocks
1665
+ ]
1666
+ _apply_resume_thumb_block_mask(
1667
+ blocks, classes, screenshot,
1668
+ original_width=original_width,
1669
+ screen_midline_ratio=cfg.screen_midline_ratio,
1670
+ )
1671
+ _apply_prd_nickname_bubble_separation(
1672
+ blocks,
1673
+ classes,
1674
+ original_width=original_width,
1675
+ config=cfg,
1676
+ median_height=median_height,
1677
+ )
1678
+
1679
+ if layout_opt is not None and getattr(layout_opt, "no_avatars_all_orphan", False):
1680
+ bodies: list[str] = []
1681
+ for i, b in enumerate(blocks):
1682
+ if classes[i] == "drop":
1683
+ continue
1684
+ if b.confidence < cfg.min_confidence:
1685
+ continue
1686
+ stripped = _strip_placeholder_lines_from_body(b.text.strip())
1687
+ if stripped:
1688
+ bodies.append(stripped)
1689
+ merged = "\n".join(bodies).strip()
1690
+ claimed_indices: set[int] = set()
1691
+ thumb_bindings, thumb_o, _thumb_excluded = _resume_thumb_bindings_and_orphans(
1692
+ screenshot,
1693
+ blocks,
1694
+ classes,
1695
+ claimed_indices,
1696
+ (),
1697
+ original_width=original_width,
1698
+ config=cfg,
1699
+ layout_opt=layout_opt,
1700
+ scale_w=scale_w,
1701
+ image_size=ocr_result.original_size,
1702
+ )
1703
+ for ei in _thumb_excluded:
1704
+ if classes[ei] != "drop":
1705
+ classes[ei] = "drop"
1706
+ if not merged:
1707
+ debug_d = _build_ocr_debug_dict(
1708
+ screenshot, blocks, classes, layout_opt, scale_w,
1709
+ ocr_result.original_size,
1710
+ )
1711
+ return ChatSpeakerAttribution(
1712
+ screenshot_id=screenshot.screenshot_id,
1713
+ resume_thumb_orphans=thumb_o,
1714
+ resume_thumb_bindings=thumb_bindings,
1715
+ speaker_body_segments=(),
1716
+ ocr_debug=debug_d,
1717
+ )
1718
+ unattributed_seg = SpeakerBodySegment(
1719
+ speaker_display=SPEAKER_DISPLAY_UNATTRIBUTED,
1720
+ body_text=merged,
1721
+ orphan=True,
1722
+ )
1723
+ no_av_attr = ChatSpeakerAttribution(
1724
+ screenshot_id=screenshot.screenshot_id,
1725
+ nicknames=(),
1726
+ total_bubbles=0,
1727
+ bubbles_with_nickname=0,
1728
+ bubbles_owner=0,
1729
+ bubbles_unknown=0,
1730
+ resume_thumb_orphans=thumb_o,
1731
+ resume_thumb_bindings=thumb_bindings,
1732
+ speaker_body_segments=(unattributed_seg,),
1733
+ )
1734
+ debug_d = _build_ocr_debug_dict(
1735
+ screenshot, blocks, classes, layout_opt, scale_w,
1736
+ ocr_result.original_size,
1737
+ )
1738
+ no_av_attr_with_debug = dataclass_replace(no_av_attr, ocr_debug=debug_d)
1739
+ result = _finalize_prd_attribution(no_av_attr_with_debug, config=cfg)
1740
+ # _finalize_prd_attribution 内部构造新实例,debug 需显式带过
1741
+ return dataclass_replace(result, ocr_debug=debug_d)
1742
+
1743
+ layout_active = (
1744
+ layout_opt is not None and not getattr(layout_opt, "no_avatars_all_orphan", False)
1745
+ )
1746
+ nicknames_list: list[NicknameExtraction] = []
1747
+ claimed_indices = set[int]()
1748
+ bgr_guard = None
1749
+ if png_path is not None:
1750
+ try:
1751
+ from processor.nickname_avatar_guard import load_png_bgr
1752
+
1753
+ bgr_guard = load_png_bgr(str(png_path))
1754
+ except ImportError:
1755
+ bgr_guard = None
1756
+
1757
+ for idx, klass in enumerate(classes):
1758
+ if klass != "nickname_candidate":
1759
+ continue
1760
+ nick_block = blocks[idx]
1761
+ if not _nickname_passes_avatar_guard(
1762
+ bgr_guard=bgr_guard,
1763
+ nick_block=nick_block,
1764
+ avatar_layout=avatar_layout,
1765
+ scale_w=scale_w,
1766
+ original_width=original_width,
1767
+ config=cfg,
1768
+ ):
1769
+ continue
1770
+ nick_bottom = _block_bottom(nick_block)
1771
+ bi_n: Optional[int] = None
1772
+ if layout_active:
1773
+ _sb_for_nick = getattr(layout_opt, "speaker_bands", ())
1774
+ _ny1, _ny2 = float(nick_block.bbox_xyxy[1]), float(nick_block.bbox_xyxy[3])
1775
+ for _idx, (_b0, _b1) in enumerate(_sb_for_nick):
1776
+ if _ny2 > float(_b0) and _ny1 < float(_b1):
1777
+ bi_n = _idx
1778
+ break
1779
+ followers: list[int] = []
1780
+ for j in range(idx + 1, len(blocks)):
1781
+ if j in claimed_indices:
1782
+ continue
1783
+ if classes[j] == "nickname_candidate":
1784
+ break
1785
+ if classes[j] != "bubble_text":
1786
+ continue
1787
+ bj = blocks[j]
1788
+ if layout_active and bi_n is not None and bi_n >= 0:
1789
+ _sb = getattr(layout_opt, "speaker_bands", ())
1790
+ if bi_n < len(_sb):
1791
+ _band_y0, _band_y1 = float(_sb[bi_n][0]), float(_sb[bi_n][1])
1792
+ _bj_top, _bj_bot = float(bj.bbox_xyxy[1]), float(bj.bbox_xyxy[3])
1793
+ if not (_bj_bot > _band_y0 and _bj_top < _band_y1):
1794
+ break
1795
+ if not _is_on_left_half(bj, original_width, cfg.screen_midline_ratio):
1796
+ break
1797
+ gap = _block_top(bj) - nick_bottom
1798
+ if gap < 0 or gap > cfg.bubble_vertical_gap_max_px:
1799
+ break
1800
+ followers.append(j)
1801
+ claimed_indices.add(j)
1802
+ nick_bottom = _block_bottom(bj)
1803
+
1804
+ nicknames_list.append(
1805
+ NicknameExtraction(
1806
+ nickname=nick_block.text.strip(),
1807
+ bbox_xyxy=nick_block.bbox_xyxy,
1808
+ confidence=nick_block.confidence,
1809
+ following_block_indices=tuple(followers),
1810
+ nickname_block_index=idx,
1811
+ band_index=bi_n,
1812
+ ),
1813
+ )
1814
+
1815
+ thumb_bindings, thumb_orphans, thumb_excluded = _resume_thumb_bindings_and_orphans(
1816
+ screenshot,
1817
+ blocks,
1818
+ classes,
1819
+ claimed_indices,
1820
+ nicknames_list,
1821
+ original_width=original_width,
1822
+ config=cfg,
1823
+ layout_opt=layout_opt,
1824
+ scale_w=scale_w,
1825
+ image_size=ocr_result.original_size,
1826
+ )
1827
+ # PRD §7(2): blocks "removed" in Step C are marked as drop
1828
+ for ei in thumb_excluded:
1829
+ if classes[ei] != "drop":
1830
+ classes[ei] = "drop"
1831
+
1832
+ body_segments = _speaker_body_segments_from_layout(
1833
+ blocks,
1834
+ classes,
1835
+ tuple(nicknames_list),
1836
+ claimed_indices,
1837
+ original_width=original_width,
1838
+ config=cfg,
1839
+ )
1840
+ bubbles_with_nickname = sum(
1841
+ 1
1842
+ for s in body_segments
1843
+ if (not s.orphan)
1844
+ and (s.speaker_display or "").strip()
1845
+ and s.speaker_display != _OWNER_SPEAKER_LABEL
1846
+ )
1847
+ bubbles_owner = sum(
1848
+ 1 for s in body_segments if s.speaker_display == _OWNER_SPEAKER_LABEL
1849
+ )
1850
+ bubbles_unknown = sum(1 for s in body_segments if s.orphan)
1851
+ total_bubbles = len(body_segments)
1852
+
1853
+ base_attr = ChatSpeakerAttribution(
1854
+ screenshot_id=screenshot.screenshot_id,
1855
+ nicknames=tuple(nicknames_list),
1856
+ total_bubbles=total_bubbles,
1857
+ bubbles_with_nickname=bubbles_with_nickname,
1858
+ bubbles_owner=bubbles_owner,
1859
+ bubbles_unknown=bubbles_unknown,
1860
+ resume_thumb_orphans=thumb_orphans,
1861
+ resume_thumb_bindings=thumb_bindings,
1862
+ speaker_body_segments=body_segments,
1863
+ )
1864
+ debug_d = _build_ocr_debug_dict(
1865
+ screenshot, blocks, classes, layout_opt, scale_w,
1866
+ ocr_result.original_size,
1867
+ )
1868
+ result = _finalize_prd_attribution(
1869
+ base_attr,
1870
+ config=cfg,
1871
+ blocks=blocks,
1872
+ layout_opt=layout_opt if layout_active else None,
1873
+ scale_w=scale_w,
1874
+ )
1875
+ return dataclass_replace(result, ocr_debug=debug_d)
1876
+
1877
+
1878
+ __all__ = [  # Explicit public API: the names exported by a star-import of this module.
1879
+ "NicknameOcrConfig",
1880
+ "NicknameExtraction",
1881
+ "ResumeThumbBinding",
1882
+ "SpeakerBodySegment",
1883
+ "ChatSpeakerAttribution",
1884
+ "SPEAKER_DISPLAY_UNATTRIBUTED",
1885
+ "extract_nicknames",
1886
+ "first_attrib_verbatim_display_line",
1887
+ "first_attrib_verbatim_display_line_and_bbox",
1888
+ ]