PyPI - screenshot-vision-algorithm - Versions diffs - 0.3.0__py3-none-any.whl - Mend

screenshot-vision-algorithm 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

screenshot_vision_algorithm/android/wechat/algorithms/phash_utils.py ADDED Viewed

@@ -0,0 +1,267 @@
+"""Perceptual hashing utilities for the collector's "stuck at bottom" detector.
+``chat_content_scroll_down`` / ``detail_content_scroll_down``（capture_action）对齐 PRD **内容 scroll_down**
+（手指上滑）；术语 SSOT：``docs/requirements/product_requirement_document.md`` 开篇。
+When ``chat_content_scroll_down`` (or ``detail_content_scroll_down``) has already scrolled past
+the last new message / end of the resume, subsequent swipes produce near-
+identical frames. Comparing raw pixels fails under:
+    - WeChat's top-bar pulse animations (notification icons, battery, time)
+    - Keyboard / system insets that briefly animate
+    - Subpixel anti-aliasing on the same static content across screencap calls
+Perceptual hashing is robust to those. We use the **dHash** family (difference
+hash) because:
+    - It's cheap (grayscale + 9x8 downscale + 64 bit-compare)
+    - It handles minor pixel shifts better than aHash
+    - It's plenty for our "is this the same screen?" binary decision
+Two reference points in the collector (DD section 2.4.x D):
+    - ``chat_content_scroll_down``   ROI y: 13% ~ 92%   (exclude top status bar +
+                                                 bottom home indicator)
+    - ``detail_content_scroll_down`` ROI y: 13% ~ 95%   (note page has no home
+                                                 indicator; keep more)
+This module is stdlib + numpy + cv2 only — no pip dependency on
+``imagehash`` so the collector venv stays lean.
+Aligned with:
+    DD section 2.3 finalize trigger (phash duplicate → finalize)
+    DD section 2.4.x D phash ROI contract
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Literal
+import cv2
+import numpy as np
+#: ROI for chat-page pHash comparison (DD §2.4.x D): (start, end) as fractions
+#: of the image height; the crop keeps rows int(h * start) .. int(h * end).
+CHAT_ROI = (0.13, 0.92)
+#: ROI for note-page (detail) pHash comparison (DD §2.4.x D), same encoding
+#: as ``CHAT_ROI``.
+DETAIL_ROI = (0.13, 0.95)
+#: dHash dimensions: the frame is resized to 9 columns x 8 rows, and comparing
+#: adjacent columns yields 8 * 8 = 64 bits.
+DHASH_W = 9
+DHASH_H = 8
+#: Default similarity threshold for "same frame".
+#: Hamming distance ≤ 5 bits (out of 64) means ≥ 92% match — enough to be
+#: robust to top-bar icon jitter while still catching "scroll jammed at
+#: bottom" cases. Callers can dial down for stricter dup detection on
+#: detail pages where content variability is higher.
+DEFAULT_DUP_HAMMING_THRESHOLD = 5
+#: docs/adr/manual_labeling.md, Step 1(2) C — "similarity ≥ 95% on 3
+#: consecutive comparisons". On a 64-bit dHash this corresponds to
+#: Hamming ≤ 3 ((64 − 3) / 64 ≈ 95.3%).
+MANUAL_LABELING_SIM95_MAX_HAMMING = 3
+#: Same ADR paragraph — consecutive similar pulls required before treating as anchored.
+MANUAL_LABELING_SIM95_STREAK_REQUIRED = 3
+#: manual_labeling.md, Step 2.(2) B / Step 3 — "identical phash" is implemented
+#: as perceptual equivalence (aligned with the 95% rule of Step 1(2), to reduce
+#: false negatives caused by status-bar jitter).
+MANUAL_LABELING_PHASH_IDENTICAL_MAX_HAMMING = MANUAL_LABELING_SIM95_MAX_HAMMING
+def manual_labeling_phash_matches(a: int, b: int) -> bool:
+    """Report whether two dHash values count as "identical" for manual_labeling stops.
+    The hashes match when their Hamming distance is at most
+    ``MANUAL_LABELING_PHASH_IDENTICAL_MAX_HAMMING``.
+    """
+    differing_bits = hamming_distance(a, b)
+    allowed_bits = MANUAL_LABELING_PHASH_IDENTICAL_MAX_HAMMING
+    return differing_bits <= allowed_bits
+@dataclass(frozen=True)
+class PHashResult:
+    """Immutable perceptual hash of one frame plus the rectangle that was hashed.
+    ``value`` is the hash packed into an int (most significant bit first).
+    ``roi_x1`` / ``roi_y1`` / ``roi_x2`` / ``roi_y2`` record the absolute
+    rectangle that was hashed, so callers can log it and spot a
+    mis-configured ROI instead of silently breaking duplicate detection.
+    """
+    value: int
+    roi_x1: int
+    roi_y1: int
+    roi_x2: int
+    roi_y2: int
+    def hex(self) -> str:
+        """Return ``value`` as lowercase hex, zero-padded to 16 digits."""
+        return format(self.value, "016x")
+def crop_roi(
+    bgr: np.ndarray,
+    roi_kind: Literal["chat", "detail"],
+) -> tuple[np.ndarray, tuple[int, int, int, int]]:
+    """Cut out the horizontal strip used for perceptual-hash comparison.
+    Only the Y axis is trimmed; the strip always spans the full image width.
+    The vertical limits come from ``CHAT_ROI`` or ``DETAIL_ROI`` (fractions of
+    the image height, truncated to whole rows).
+    Returns:
+        A copy of the cropped rows and the absolute ``(x1, y1, x2, y2)``
+        rectangle, with ``x1 == 0`` and ``x2`` equal to the image width.
+    Raises:
+        ValueError: if ``roi_kind`` is neither ``"chat"`` nor ``"detail"``,
+            or if the image is so short that the strip has no rows.
+    """
+    if roi_kind not in ("chat", "detail"):
+        raise ValueError(f"roi_kind must be 'chat' or 'detail', got {roi_kind!r}")
+    top_ratio, bottom_ratio = CHAT_ROI if roi_kind == "chat" else DETAIL_ROI
+    height, width = bgr.shape[:2]
+    y1 = int(height * top_ratio)
+    y2 = int(height * bottom_ratio)
+    if y1 >= y2:
+        raise ValueError(
+            f"degenerate ROI: y1={y1} y2={y2} for image h={height}; "
+            f"check roi_kind={roi_kind!r} and image dims"
+        )
+    strip = bgr[y1:y2, :, :].copy()
+    rect = (0, y1, width, y2)
+    return strip, rect
+def compute_dhash(
+    bgr: np.ndarray,
+    roi_kind: Literal["chat", "detail"],
+) -> PHashResult:
+    """Compute the dHash of the ROI-cropped frame.
+    Algorithm:
+        1. Crop to the ROI (see :func:`crop_roi`).
+        2. Convert the crop from BGR to grayscale.
+        3. Resize to ``DHASH_W`` columns x ``DHASH_H`` rows with INTER_AREA.
+        4. For each row, bit ``j`` is ``px[j + 1] > px[j]``, i.e. every pixel
+           is compared with its left neighbour (``DHASH_W - 1`` bits per row).
+        5. Pack the bits row by row into one int, first bit most significant.
+    Results are not cached.
+    Returns:
+        A :class:`PHashResult` with the packed hash and the absolute ROI
+        rectangle reported by :func:`crop_roi`.
+    Raises:
+        ValueError: propagated from :func:`crop_roi` for an unknown
+            ``roi_kind`` or a degenerate ROI.
+    """
+    roi, (x1, y1, x2, y2) = crop_roi(bgr, roi_kind)
+    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
+    resized = cv2.resize(
+        gray, (DHASH_W, DHASH_H), interpolation=cv2.INTER_AREA,
+    )
+    # Compare each column with the one to its left: shape (DHASH_H, DHASH_W - 1).
+    bits = resized[:, 1:] > resized[:, :-1]
+    flat = bits.ravel()
+    # packbits is MSB-first and zero-pads the last byte on the right; shifting
+    # the padding back out keeps the value equal to a bit-by-bit
+    # ``(value << 1) | bit`` fold for any hash size.
+    packed = np.packbits(flat)
+    value = int.from_bytes(packed.tobytes(), "big") >> (-flat.size % 8)
+    return PHashResult(
+        value=value,
+        roi_x1=x1, roi_y1=y1, roi_x2=x2, roi_y2=y2,
+    )
+def hamming_distance(a: int, b: int) -> int:
+    """Return the number of bit positions in which ``a`` and ``b`` differ."""
+    differing = a ^ b
+    # int.bit_count is the stdlib popcount (Python 3.10+).
+    return int.bit_count(differing)
+def is_duplicate(
+    a: PHashResult,
+    b: PHashResult,
+    threshold: int = DEFAULT_DUP_HAMMING_THRESHOLD,
+) -> bool:
+    """Return True when the two hashes are at most ``threshold`` bits apart.
+    This is the "same frame" decision used for finalize.
+    """
+    bits_apart = hamming_distance(a.value, b.value)
+    return bits_apart <= threshold
+class ScrollDuplicationTracker:
+    """Count consecutive near-identical frames and flag a stuck scroll.
+    Each call to :meth:`observe` hashes the frame and compares it with the
+    frame from the previous call. ``is_stuck`` becomes True once
+    ``required_consecutive`` comparisons in a row were within ``threshold``
+    bits (DD §2.3 CC3).
+    Usage (inside the flow's scroll loop)::
+        tracker = ScrollDuplicationTracker(roi_kind="chat", required_consecutive=2)
+        for i in range(max_chat_content_scroll_down):
+            do_scroll()
+            frame = screencap()
+            if tracker.observe(decode(frame)).is_stuck:
+                # Early finalize (CC3): the next scroll would land on the
+                # same frame again, so capturing more is pointless.
+                break
+    With ``required_consecutive=2``: the first frame only seeds the state,
+    a matching second frame gives a streak of 1, and a matching third frame
+    gives a streak of 2, which reports ``is_stuck``. Any non-matching frame
+    resets the streak to 0.
+    State is cleared only by an explicit :meth:`reset`, so a flow that
+    switches phases (chat loop → resume subflow → chat loop) should call it
+    to avoid carrying stale state across phases.
+    """
+    @dataclass(frozen=True)
+    class Observation:
+        """Outcome of one :meth:`observe` call.
+        ``distance`` is the Hamming distance to the previous frame, or -1
+        for the first frame after construction or :meth:`reset`.
+        """
+        phash: "PHashResult"
+        distance: int
+        consecutive_duplicates: int
+        is_stuck: bool
+    def __init__(
+        self,
+        roi_kind: Literal["chat", "detail"],
+        *,
+        threshold: int = DEFAULT_DUP_HAMMING_THRESHOLD,
+        required_consecutive: int = 2,
+    ) -> None:
+        """Store the configuration and start with no previous frame.
+        Raises:
+            ValueError: if ``required_consecutive`` is below 1.
+        """
+        if required_consecutive < 1:
+            raise ValueError("required_consecutive must be >= 1")
+        self.roi_kind = roi_kind
+        self.threshold = threshold
+        self.required_consecutive = required_consecutive
+        self._last: PHashResult | None = None
+        self._consecutive: int = 0
+    def reset(self) -> None:
+        """Forget the previous frame and clear the duplicate streak."""
+        self._consecutive = 0
+        self._last = None
+    def observe(self, bgr: np.ndarray) -> "ScrollDuplicationTracker.Observation":
+        """Hash ``bgr``, compare it with the previous frame and update the streak."""
+        current = compute_dhash(bgr, self.roi_kind)
+        previous = self._last
+        if previous is None:
+            # First frame: nothing to compare against yet.
+            self._last = current
+            return self.Observation(
+                phash=current,
+                distance=-1,
+                consecutive_duplicates=0,
+                is_stuck=False,
+            )
+        gap = hamming_distance(current.value, previous.value)
+        self._last = current
+        self._consecutive = self._consecutive + 1 if gap <= self.threshold else 0
+        return self.Observation(
+            phash=current,
+            distance=gap,
+            consecutive_duplicates=self._consecutive,
+            is_stuck=self._consecutive >= self.required_consecutive,
+        )
+#: Names exported by ``from <this module> import *``: the ROI / threshold
+#: constants, the hashing helpers and the tracker class defined in this module.
+__all__ = [
+    "CHAT_ROI",
+    "DETAIL_ROI",
+    "DHASH_W",
+    "DHASH_H",
+    "DEFAULT_DUP_HAMMING_THRESHOLD",
+    "MANUAL_LABELING_PHASH_IDENTICAL_MAX_HAMMING",
+    "MANUAL_LABELING_SIM95_MAX_HAMMING",
+    "MANUAL_LABELING_SIM95_STREAK_REQUIRED",
+    "manual_labeling_phash_matches",
+    "PHashResult",
+    "crop_roi",
+    "compute_dhash",
+    "hamming_distance",
+    "is_duplicate",
+    "ScrollDuplicationTracker",
+]

screenshot_vision_algorithm/android/wechat/algorithms/speaker_band.py ADDED Viewed

@@ -0,0 +1,290 @@
+"""PRD 术语「5. 群聊天聊天气泡（y）坐标」— 纵向筒几何（头像时间轴）。
+不识别气泡圆角矢量；仅以 **左/右侧发言头像** 顶的 y（时间序合并）划出
+发言人竖直筒 ``[y_n, y_{n+1})``，供 ``nickname_ocr_simple`` 在筒内切段。
+常量均为 **1080 设计宽度基准**，调用方乘以 ``scale_w = width/1080``。
+"""
+from __future__ import annotations
+import numpy as np
+from dataclasses import dataclass
+from typing import Literal, Optional
+from loguru import logger
+from screenshot_vision_algorithm.android.wechat.algorithms.template_matching import (
+    detect_chat_back_chevron,
+    detect_chat_title_more_dots,
+    detect_wechat_main_bottom_tab_bar_four_columns,
+    detect_wechat_main_title_bottom_y,
+)
+#: Orphan threshold for the gap between the first avatar's top y ("1-y_top")
+#: and the top edge of the chat content (baseline px @ 1080).
+CHAT_BAND_FIRST_AVATAR_TOP_GAP_REJECT_BASELINE = 30
+#: Upward extension of the first speaker band (one avatar height, to absorb
+#: geometric error); see PRD §5 addendum 2.
+CHAT_FIRST_BAND_TOP_EXTEND_BASELINE = 100
+#: Space reserved at the bottom for the input (composer) area; same lineage
+#: as the main-list scanner (baseline px).
+CHAT_COMPOSER_RESERVE_BOTTOM_BASELINE = 160
+#: Empirical offset from the bottom of the group / one-to-one chat title bar
+#: to the first chat content (baseline px @ 1080).
+CHAT_TITLE_BAR_TO_CONTENT_OFFSET_BASELINE = 4
+#: Empirical offset from the bottom of the WeChat title to the first
+#: conversation row; reuses the list page's constant family (defined in
+#: ``template_matching``).
+from screenshot_vision_algorithm.android.wechat.algorithms.template_matching import WECHAT_TITLE_TO_FIRST_ROW_OFFSET_BASELINE
+# Left / right avatar column ROI on the chat page (baseline width 1080).
+CHAT_SIDE_AVATAR_COLUMN_WIDTH_BASELINE = 108
+CHAT_SIDE_AVATAR_HOUGH_MIN_DIST_BASELINE = 68
+CHAT_SIDE_AVATAR_MIN_R_BASELINE = 14
+CHAT_SIDE_AVATAR_MAX_R_BASELINE = 54
+# cv2 is optional at import time; functions below check ``cv2 is None`` and
+# return empty results when OpenCV is not installed.
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+# Which avatar column an entry was detected in.
+Side = Literal["left", "right"]
+@dataclass(frozen=True)
+class ChatVerticalBounds:
+    """Vertical extent of the visible chat area.
+    Excludes the title bar / status area at the top and the input bar at
+    the bottom.
+    """
+    # First row of chat content.
+    y_top: int
+    # Exclusive end of the content range: content spans [y_top, y_bottom_excl).
+    y_bottom_excl: int
+@dataclass(frozen=True)
+class AvatarTimelineEntry:
+    """One detected avatar on the merged timeline: its y value and its column."""
+    # y coordinate reported by the avatar detector for this entry.
+    y_top: int
+    # Column the avatar was detected in ("left" or "right").
+    side: Side
+@dataclass(frozen=True)
+class PrdChatVerticalLayout:
+    """Vertical layout of a single group-chat screenshot per PRD §5."""
+    content_bounds: ChatVerticalBounds
+    #: Top-gap orphan band ``[orphan_y0, orphan_y1)``, produced when
+    #: ``(first avatar y_top - content top)`` exceeds the threshold; None otherwise.
+    orphan_top_band: Optional[tuple[int, int]]
+    #: Speaker bands ``[y_n, y_{n+1})`` in timeline order; they do not overlap
+    #: the orphan band.
+    speaker_bands: tuple[tuple[int, int], ...]
+    #: True when no avatar was detected at all: the whole page should be
+    #: recorded as "no owning nickname found".
+    no_avatars_all_orphan: bool
+def _sw(scale_w: float) -> float:
+    """Return ``scale_w`` as a float, raised to 1e-6 when it is smaller."""
+    floor = 1e-6
+    value = float(scale_w)
+    return floor if floor > value else value
+def compute_chat_content_vertical_bounds(
+    screen_bgr: np.ndarray,
+    scale_w: float,
+    *,
+    screen_h: int,
+) -> ChatVerticalBounds:
+    """Estimate the top and bottom limits of the chat content from page chrome.
+    The top anchor is chosen in this order:
+        1. Chat page: both the back chevron and the "more" dots were found;
+           use the lower of their bottom edges plus a small offset.
+        2. Main conversation list: the WeChat title and the four-tab bottom
+           bar were found; use the title bottom plus the list-row offset.
+        3. Fallback: 280 baseline px scaled by ``scale_w``.
+    The bottom limit reserves ``CHAT_COMPOSER_RESERVE_BOTTOM_BASELINE``
+    (scaled) above ``screen_h``, and is never less than 8 px below the top.
+    """
+    scale = _sw(scale_w)
+    height = int(screen_h)
+    chevron = detect_chat_back_chevron(screen_bgr, scale_w)
+    dots = detect_chat_title_more_dots(screen_bgr, scale_w)
+    if chevron is None or dots is None:
+        # Not a chat page: try the main conversation list anchors.
+        title_y = detect_wechat_main_title_bottom_y(screen_bgr, scale_w)
+        on_main_list = title_y is not None and detect_wechat_main_bottom_tab_bar_four_columns(
+            screen_bgr, scale_w,
+        )
+        if on_main_list:
+            raw_top = float(title_y) + WECHAT_TITLE_TO_FIRST_ROW_OFFSET_BASELINE * scale
+        else:
+            # Fallback: same order of magnitude as the first list row's top.
+            raw_top = 280 * scale
+    else:
+        # Title bar bottom = lower of the two icon bottoms, plus a small gap.
+        title_bar_bottom = max(chevron.y + chevron.h, dots.y + dots.h)
+        raw_top = float(title_bar_bottom) + CHAT_TITLE_BAR_TO_CONTENT_OFFSET_BASELINE * scale
+    top = int(round(raw_top))
+    top = max(0, min(height - 8, top))
+    composer_reserve = int(round(CHAT_COMPOSER_RESERVE_BOTTOM_BASELINE * scale))
+    bottom_excl = max(top + 8, height - composer_reserve)
+    return ChatVerticalBounds(y_top=top, y_bottom_excl=bottom_excl)
+def _hough_ytops_in_column(
+    gray_full: np.ndarray,
+    *,
+    x1: int,
+    x2: int,
+    y1: int,
+    y2: int,
+    scale_w: float,
+) -> list[int]:
+    """Find circles in one column strip and return their y positions, sorted.
+    The strip ``gray_full[y1:y2, x1:x2]`` is blurred and passed to
+    ``cv2.HoughCircles``; radius and spacing limits are the
+    ``CHAT_SIDE_AVATAR_*`` baselines scaled by ``scale_w``.
+    Returns:
+        Absolute y values (``y1`` + circle centre y) in ascending order.
+        Empty when cv2 is unavailable, the strip is too small (height <= 4
+        or width <= 8), or no circle is found.
+    NOTE(review): the values are circle centres, not top edges, although
+    the name says "ytops" — confirm what callers expect.
+    """
+    if cv2 is None or y2 <= y1 + 4 or x2 <= x1 + 8:
+        return []
+    crop = gray_full[y1:y2, x1:x2]
+    if crop.size == 0:
+        return []
+    blur = cv2.GaussianBlur(crop, (5, 5), 0)
+    sw = _sw(scale_w)
+    circles = cv2.HoughCircles(
+        blur,
+        cv2.HOUGH_GRADIENT,
+        dp=1.2,
+        minDist=int(round(CHAT_SIDE_AVATAR_HOUGH_MIN_DIST_BASELINE * sw)),
+        param1=90,
+        param2=20,
+        minRadius=max(8, int(round(CHAT_SIDE_AVATAR_MIN_R_BASELINE * sw))),
+        maxRadius=int(round(CHAT_SIDE_AVATAR_MAX_R_BASELINE * sw)),
+    )
+    if circles is None:
+        return []
+    # HoughCircles encodes each circle as (x, y, radius); the centre y is
+    # column 1. Reading column 2 would return the radius instead.
+    center_ys = circles[0][:, 1]
+    return sorted(
+        int(y1 + float(cy))
+        for cy in center_ys
+        if y1 <= y1 + cy < y2
+    )
+def detect_chat_side_avatar_ytops(
+    screen_bgr: np.ndarray,
+    scale_w: float,
+    bounds: ChatVerticalBounds,
+) -> tuple[list[int], list[int]]:
+    """Return avatar y values for the left and right columns of a chat page.
+    The left column prefers ``processor.left_avatar_column`` (shared with the
+    list page). If that import, or anything inside the ``try``, raises
+    ImportError, it falls back to Hough detection on the leftmost column
+    strip. The right column always uses Hough detection on the rightmost
+    strip. Both searches are limited to ``bounds`` clipped to the image.
+    Returns ``([], [])`` when cv2 is unavailable.
+    NOTE(review): ``processor`` is not part of this package's import path as
+    seen here — confirm it is importable where this runs.
+    """
+    if cv2 is None:
+        return [], []
+    img_h, img_w = screen_bgr.shape[:2]
+    scale = _sw(scale_w)
+    gray = cv2.cvtColor(screen_bgr, cv2.COLOR_BGR2GRAY)
+    top = max(0, bounds.y_top)
+    bottom = min(img_h, bounds.y_bottom_excl)
+    strip_w = int(round(CHAT_SIDE_AVATAR_COLUMN_WIDTH_BASELINE * scale))
+    try:
+        from processor.left_avatar_column import (
+            detect_left_avatar_column_layout,
+            left_avatar_ytops,
+        )
+        left_layout = detect_left_avatar_column_layout(
+            screen_bgr,
+            scale,
+            y_top=top,
+            y_bottom_excl=bottom,
+        )
+        left = left_avatar_ytops(left_layout)
+    except ImportError:
+        left = _hough_ytops_in_column(
+            gray, x1=0, x2=min(img_w, strip_w), y1=top, y2=bottom, scale_w=scale_w,
+        )
+    right = _hough_ytops_in_column(
+        gray, x1=max(0, img_w - strip_w), x2=img_w, y1=top, y2=bottom, scale_w=scale_w,
+    )
+    return left, right
+def merge_avatar_ytops_time_order(
+    left_ytops: list[int],
+    right_ytops: list[int],
+) -> tuple[AvatarTimelineEntry, ...]:
+    """Merge left and right avatar y values into one timeline sorted by y.
+    Entries with exactly the same y are ordered left before right; no
+    tolerance is applied here.
+    """
+    entries = [AvatarTimelineEntry(y_top=int(y), side="left") for y in left_ytops]
+    entries += [AvatarTimelineEntry(y_top=int(y), side="right") for y in right_ytops]
+    ordered = sorted(entries, key=lambda entry: (entry.y_top, int(entry.side != "left")))
+    return tuple(ordered)
+def build_prd_speaker_vertical_bands(
+    timeline: tuple[AvatarTimelineEntry, ...],
+    bounds: ChatVerticalBounds,
+    *,
+    scale_w: float,
+    first_avatar_gap_reject_baseline: int = CHAT_BAND_FIRST_AVATAR_TOP_GAP_REJECT_BASELINE,
+    first_band_top_extend_baseline: int = CHAT_FIRST_BAND_TOP_EXTEND_BASELINE,
+) -> PrdChatVerticalLayout:
+    """Build the PRD §5 layout: optional top-gap orphan plus half-open speaker bands.
+    Steps:
+        1. No avatars: return a layout flagged ``no_avatars_all_orphan``.
+        2. If the first avatar sits more than the (scaled) gap threshold
+           below the content top, the gap becomes the orphan band.
+        3. Avatars closer than 24 baseline px (scaled) to the previously
+           kept one are treated as repeat detections and dropped.
+        4. Each kept avatar starts a band that ends at the next kept avatar,
+           or at the content bottom for the last one. The first band is
+           extended upward by ``first_band_top_extend_baseline`` (scaled)
+           per PRD §5 addendum 2. Bands are clamped to the content bounds
+           and are at least 1 px tall.
+        5. The orphan band is shrunk, or dropped, so it does not overlap
+           the extended first band.
+    ``timeline`` is taken to be sorted by y already; its first entry is
+    used as the first avatar.
+    """
+    scale = _sw(scale_w)
+    gap_limit = int(round(int(first_avatar_gap_reject_baseline) * scale))
+    content_top = int(bounds.y_top)
+    content_bottom = int(bounds.y_bottom_excl)
+    if not timeline:
+        return PrdChatVerticalLayout(
+            content_bounds=bounds,
+            orphan_top_band=None,
+            speaker_bands=(),
+            no_avatars_all_orphan=True,
+        )
+    first_avatar_y = int(timeline[0].y_top)
+    orphan: Optional[tuple[int, int]] = (
+        (content_top, first_avatar_y)
+        if first_avatar_y - content_top > gap_limit
+        else None
+    )
+    # De-duplicate along the timeline: starts too close to the last kept one
+    # are treated as the same avatar detected twice.
+    min_gap = int(round(24 * scale))
+    starts: list[int] = []
+    for entry in timeline:
+        candidate = int(entry.y_top)
+        if starts and candidate - starts[-1] < min_gap:
+            continue
+        starts.append(candidate)
+    extend_up = int(round(int(first_band_top_extend_baseline) * scale))
+    ends = starts[1:] + [content_bottom]
+    bands: list[tuple[int, int]] = []
+    for index, (start, end) in enumerate(zip(starts, ends)):
+        if index == 0:
+            # Only the first band is moved upward (PRD §5 addendum 2).
+            start -= extend_up
+        band_top = max(content_top, min(start, content_bottom - 1))
+        band_bottom = max(band_top + 1, min(end, content_bottom))
+        bands.append((band_top, band_bottom))
+    # After the upward extension the first band may reach into the orphan;
+    # trim the orphan to end where the first band starts, or drop it.
+    if orphan is not None and bands:
+        orphan_y0, orphan_y1 = orphan
+        first_start = bands[0][0]
+        if first_start <= orphan_y0:
+            orphan = None
+        elif first_start < orphan_y1:
+            orphan = (orphan_y0, first_start)
+    return PrdChatVerticalLayout(
+        content_bounds=bounds,
+        orphan_top_band=orphan,
+        speaker_bands=tuple(bands),
+        no_avatars_all_orphan=False,
+    )
+#: Names exported by ``from <this module> import *``. Only the four
+#: entry-point functions are listed; the dataclasses and constants defined
+#: in this module are not.
+__all__ = [
+    "build_prd_speaker_vertical_bands",
+    "compute_chat_content_vertical_bounds",
+    "detect_chat_side_avatar_ytops",
+    "merge_avatar_ytops_time_order",
+]