npm - delimit-cli - Versions diffs - 4.5.6 → 4.5.7 - Mend

delimit-cli 4.5.6 → 4.5.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/gateway/ai/scan_bridge/__init__.py +39 -0
package/gateway/ai/scan_bridge/bridge.py +473 -0
package/gateway/ai/scan_bridge/dedup.py +335 -0
package/gateway/ai/scan_bridge/digest.py +151 -0
package/package.json +1 -1

package/gateway/ai/scan_bridge/__init__.py ADDED Viewed

@@ -0,0 +1,39 @@
+"""LED-1264: scan → strategy-ledger auto-promote bridge.
+Pure consumer of ``~/.delimit/social_targets.jsonl`` (the existing
+``delimit_social_target`` output). Promotes a tightly-gated subset of
+strategic signals into the strategy ledger so the founder reviews them
+via a daily digest instead of inbox-spam pings.
+Panel decision (UNANIMOUS R3, 2026-05-07): tight guards
+(strategic + confidence ≥ 0.85 + dedup against open / 60-day-closed),
+P2 priority (review, not auto-action), one daily digest email.
+Public entry points:
+- :func:`bridge.promote_recent_signals` — main work function
+- :func:`digest.build_daily_digest` — assemble last-24h digest text
+- :func:`bridge.backfill_from` — one-time idempotent backfill walker
+The bridge is invoked by ``scripts/scan_bridge_cron.py`` on a 6-hour
+crontab cadence (founder applies manually). Direct in-process calls to
+``ai.ledger_manager.add_item`` — no MCP subprocess.
+"""
+from ai.scan_bridge.bridge import (
+    backfill_from,
+    promote_recent_signals,
+)
+from ai.scan_bridge.dedup import (
+    extract_topic_fingerprint,
+    is_duplicate,
+)
+from ai.scan_bridge.digest import build_daily_digest
+__all__ = [
+    "backfill_from",
+    "build_daily_digest",
+    "extract_topic_fingerprint",
+    "is_duplicate",
+    "promote_recent_signals",
+]

package/gateway/ai/scan_bridge/bridge.py ADDED Viewed

@@ -0,0 +1,473 @@
+"""LED-1264 scan-bridge — promotion engine.
+Reads ``~/.delimit/social_targets.jsonl`` (the existing
+``delimit_social_target`` output), filters to the tight panel-locked
+gate, runs dedup against the strategy ledger, and promotes survivors
+via direct in-process ``ledger_manager.add_item`` calls.
+State / cursor:
+    ``~/.delimit/scan_bridge_cursor.json`` records the most-recent
+    ``first_seen`` value we've already processed. Subsequent runs only
+    consider lines newer than that. Idempotent — re-running the cron
+    on the same JSONL is a no-op.
+Promotions log:
+    ``~/.delimit/scan_bridge_promotions.jsonl`` records every successful
+    promotion (item_id, signal_fingerprint, ts) so the daily digest can
+    assemble the last-24h batch without re-walking the ledger.
+"""
+from __future__ import annotations
+import json
+import logging
+import os
+from contextlib import contextmanager
+from dataclasses import dataclass
+from datetime import datetime, date, timedelta, timezone
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+from ai.scan_bridge.dedup import (
+    _candidate_strategy_items,
+    extract_topic_fingerprint,
+    is_duplicate,
+)
+logger = logging.getLogger("delimit.ai.scan_bridge.bridge")
+TARGETS_FILE = Path.home() / ".delimit" / "social_targets.jsonl"
+CURSOR_FILE = Path.home() / ".delimit" / "scan_bridge_cursor.json"
+PROMOTIONS_LOG = Path.home() / ".delimit" / "scan_bridge_promotions.jsonl"
+def _confidence_floor() -> float:
+    """Resolve the active confidence floor (env-overridable per directive)."""
+    raw = os.environ.get("DELIMIT_SCAN_PROMO_CONFIDENCE", "")
+    if not raw:
+        return 0.85
+    try:
+        v = float(raw)
+        if 0.0 <= v <= 1.0:
+            return v
+    except (TypeError, ValueError):
+        pass
+    return 0.85
+# ── Cursor I/O ────────────────────────────────────────────────────────
+def _load_cursor() -> Optional[str]:
+    """Return the most-recent ``first_seen`` we've already processed."""
+    if not CURSOR_FILE.exists():
+        return None
+    try:
+        data = json.loads(CURSOR_FILE.read_text())
+        v = data.get("last_seen_at")
+        return str(v) if v else None
+    except (OSError, ValueError, json.JSONDecodeError):
+        return None
+def _save_cursor(last_seen_at: str) -> None:
+    try:
+        CURSOR_FILE.parent.mkdir(parents=True, exist_ok=True)
+        CURSOR_FILE.write_text(json.dumps({"last_seen_at": last_seen_at}))
+    except OSError:  # pragma: no cover — best-effort
+        logger.warning("scan_bridge: failed to persist cursor")
+def _log_promotion(record: Dict[str, Any]) -> None:
+    try:
+        PROMOTIONS_LOG.parent.mkdir(parents=True, exist_ok=True)
+        with PROMOTIONS_LOG.open("a", encoding="utf-8") as fh:
+            fh.write(json.dumps(record) + "\n")
+    except OSError:  # pragma: no cover — best-effort
+        pass
+# ── Filtering ─────────────────────────────────────────────────────────
+@dataclass
+class _FilterStats:
+    """Mutable per-run counters for how queued signals were handled."""
+    # Signals newer than the cutoff that entered the processing loop.
+    considered: int = 0
+    # Dropped because ``classification`` was not "strategic".
+    rejected_classification: int = 0
+    # Dropped because ``confidence`` was below the active floor.
+    rejected_confidence: int = 0
+    # Dropped because an in-window strategy item already covers the topic.
+    rejected_dedup: int = 0
+    # Written to the ledger (or, in a dry run, would have been written).
+    promoted: int = 0
+def _passes_strict_gate(
+    signal: Dict[str, Any],
+    *,
+    confidence_floor: float,
+    stats: _FilterStats,
+) -> Tuple[bool, str]:
+    """Return ``(passes, reason)``. ``reason`` is "" on pass."""
+    classification = (signal.get("classification") or "").strip().lower()
+    if classification != "strategic":
+        stats.rejected_classification += 1
+        return False, f"classification={classification or 'missing'}"
+    try:
+        confidence = float(signal.get("confidence") or 0.0)
+    except (TypeError, ValueError):
+        confidence = 0.0
+    if confidence < confidence_floor:
+        stats.rejected_confidence += 1
+        return False, f"confidence={confidence:.2f}<{confidence_floor:.2f}"
+    return True, ""
+# ── Promotion path ────────────────────────────────────────────────────
+def _build_title(signal: Dict[str, Any]) -> str:
+    snippet = (signal.get("content_snippet") or "").strip()
+    # If the snippet starts with a "[TAG] head" prefix the tag + head
+    # makes the most readable title. Otherwise fall back to the first
+    # 80 chars of the snippet.
+    if snippet.startswith("["):
+        head = snippet.split("\n", 1)[0]
+        if len(head) > 120:
+            head = head[:117] + "..."
+        return f"STRATEGIC: {head}"
+    if len(snippet) > 100:
+        snippet = snippet[:97] + "..."
+    return f"STRATEGIC: {snippet}" if snippet else "STRATEGIC: (no snippet)"
+def _build_item(signal: Dict[str, Any]) -> Dict[str, Any]:
+    platform = signal.get("platform") or ""
+    canonical_url = signal.get("canonical_url") or ""
+    snippet = (signal.get("content_snippet") or "")[:280]
+    confidence = float(signal.get("confidence") or 0.0)
+    first_seen = signal.get("first_seen") or ""
+    source_id = signal.get("source_id") or signal.get("fingerprint") or ""
+    fingerprint_set = sorted(extract_topic_fingerprint(signal))
+    description = (
+        f"Auto-promoted from {platform} signal at {confidence:.2f}: "
+        f"{snippet}\n\nURL: {canonical_url or '(none)'}"
+    )
+    context_text = (
+        f"Captured by delimit_social_target on {first_seen}. "
+        "Panel-approved auto-promote (LED-1264) per deliberation 2026-05-07. "
+        "Founder reviews via daily digest."
+    )
+    return {
+        "title": _build_title(signal),
+        "ledger": "strategy",
+        "type": "strategy",
+        "priority": "P2",
+        "description": description,
+        "context": context_text,
+        "tags": ["auto_promoted", "scan_bridge", platform] if platform else ["auto_promoted", "scan_bridge"],
+        "source": "scan_bridge_auto",
+        "metadata_signal_ref": {
+            "platform": platform,
+            "source_id": source_id,
+            "fingerprint": fingerprint_set,
+            "first_seen": first_seen,
+            "confidence": confidence,
+            "canonical_url": canonical_url,
+        },
+    }
+@contextmanager
+def _signal_promote_bypass():
+    """Set ``_DELIMIT_SIGNAL_PROMOTED_BY`` so the LED-877 guard treats
+    this as the explicit promote path. Defensive against future source
+    name changes — guard currently allows ``scan_bridge_auto`` since it
+    doesn't start with the sensed prefixes, but this future-proofs.
+    """
+    key = "_DELIMIT_SIGNAL_PROMOTED_BY"
+    prev = os.environ.get(key)
+    os.environ[key] = "scan_bridge:LED-1264"
+    try:
+        yield
+    finally:
+        if prev is None:
+            os.environ.pop(key, None)
+        else:
+            os.environ[key] = prev
+def _add_to_strategy_ledger(item: Dict[str, Any]) -> Dict[str, Any]:
+    """Direct in-process call to ``ledger_manager.add_item``.
+    The ledger module currently doesn't accept a ``metadata`` kwarg, so
+    we splice signal_ref into the description as a fenced JSON block AND
+    embed the fingerprint tokens into the tags list. Future ledger
+    schema enhancements that add a metadata column should swap this in
+    without changing the caller surface.
+    """
+    from ai.ledger_manager import add_item
+    signal_ref = item.pop("metadata_signal_ref", {})
+    fp_tokens = signal_ref.get("fingerprint") or []
+    fingerprint_tags = [f"fp:{t}" for t in fp_tokens][:8]  # cap to keep tag list sane
+    # Append fenced JSON to description so tools that read raw description
+    # can recover the signal_ref structurally; the dedup module already
+    # falls back to extracting fingerprints from description text when
+    # the structured field is missing, so this is also recoverable.
+    sref_block = "\n\nsignal_ref:\n```json\n" + json.dumps(signal_ref, ensure_ascii=False, sort_keys=True) + "\n```"
+    item["description"] = item.get("description", "") + sref_block
+    item["tags"] = list(item.get("tags") or []) + fingerprint_tags
+    with _signal_promote_bypass():
+        return add_item(**item)
+# ── Public API ────────────────────────────────────────────────────────
+def _iter_signals(targets_file: Path = TARGETS_FILE) -> Iterable[Dict[str, Any]]:
+    if not targets_file.exists():
+        return
+    try:
+        with targets_file.open("r", encoding="utf-8") as fh:
+            for line in fh:
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    yield json.loads(line)
+                except (json.JSONDecodeError, ValueError):
+                    continue
+    except OSError as exc:  # pragma: no cover
+        logger.warning("scan_bridge: failed to read %s: %s", targets_file, exc)
+def _normalize_first_seen(value: Any) -> str:
+    """Return a comparable string. Empty string sorts before anything."""
+    if not value:
+        return ""
+    return str(value)
+def promote_recent_signals(
+    since: Optional[datetime] = None,
+    *,
+    dry_run: bool = False,
+    targets_file: Optional[Path] = None,
+    confidence_floor: Optional[float] = None,
+    candidates: Optional[Iterable[Dict[str, Any]]] = None,
+) -> Dict[str, Any]:
+    """Process scanned signals from ``targets_file`` and promote
+    survivors of the strict gate to the strategy ledger.
+    Signals newer than the cutoff are processed newest-first. Each one
+    must pass the classification/confidence gate and the dedup check
+    before it is written to the ledger and recorded in the promotions
+    log. The persisted cursor is advanced at the end of a non-dry run.
+    Parameters
+    ----------
+    since:
+        Optional cutoff. When given, the LATER of ``since`` and the
+        persisted cursor is used. When omitted, the persisted cursor is
+        used, falling back to 24h ago when no cursor exists.
+    dry_run:
+        When True no ledger writes, promotion-log writes or cursor
+        updates happen; the response still contains the would-be
+        promotions for audit / preview.
+    targets_file:
+        Override the default ``social_targets.jsonl`` path (test hook).
+    confidence_floor:
+        Override the env-resolved floor (test hook).
+    candidates:
+        Override the strategy-ledger candidate list for dedup (test
+        hook). When omitted the live list is fetched once per run via
+        ``_candidate_strategy_items``.
+    Returns
+    -------
+    dict with keys: ``stats`` (counter dict), ``promoted`` (list of
+    dicts with item_id, signal_fingerprint, title, snippet, confidence,
+    platform, canonical_url, first_seen), ``cursor_advanced_to`` (new
+    cursor value or None), ``since``, ``dry_run``, ``confidence_floor``.
+    """
+    targets_file = targets_file or TARGETS_FILE
+    floor = confidence_floor if confidence_floor is not None else _confidence_floor()
+    cursor_value = _load_cursor()
+    # NOTE(review): the cutoff, the cursor and each ``first_seen`` are
+    # compared as plain strings below. This assumes every timestamp uses
+    # the same ISO-8601 layout and UTC offset spelling — confirm against
+    # the scanner's output format.
+    if since is not None:
+        # Caller-supplied since: take the LATER of since vs cursor so we
+        # never reprocess a row we've already promoted.
+        since_iso = since.astimezone(timezone.utc).isoformat()
+        if cursor_value and cursor_value > since_iso:
+            since_iso = cursor_value
+    else:
+        if cursor_value:
+            since_iso = cursor_value
+        else:
+            since_iso = (datetime.now(timezone.utc) - timedelta(hours=24)).isoformat()
+    stats = _FilterStats()
+    promoted: List[Dict[str, Any]] = []
+    max_seen = since_iso
+    # Resolve candidates ONCE per run for performance — production calls
+    # don't pass it; we hand the live list to is_duplicate as a static
+    # snapshot so 1000 signals don't trigger 1000 ledger walks.
+    if candidates is None:
+        snapshot = list(_candidate_strategy_items(window_days=60))
+    else:
+        snapshot = list(candidates)
+    # We'll mutate snapshot during the run so an early-batch promotion
+    # blocks a later-batch duplicate within the same invocation.
+    live_snapshot: List[Dict[str, Any]] = list(snapshot)
+    # Process newest-first within the batch so when two signals about
+    # the same topic appear (e.g. oasdiff v1.15.0-beta + v1.15.2), the
+    # MOST RECENT version wins. The earlier versions then dedup against
+    # the newer item — which is what the founder wants in the digest.
+    # We still advance the cursor to the max first_seen across the run
+    # so the next call only considers genuinely-new rows.
+    # Each queued entry is a (first_seen string, signal dict) pair.
+    queued: List[Tuple[str, Dict[str, Any]]] = []
+    for signal in _iter_signals(targets_file):
+        first_seen = _normalize_first_seen(signal.get("first_seen"))
+        # A signal with no first_seen normalises to "" and is always
+        # skipped here, since "" never sorts after the cutoff.
+        if first_seen <= since_iso:
+            continue
+        queued.append((first_seen, signal))
+    queued.sort(key=lambda pair: pair[0], reverse=True)
+    for first_seen, signal in queued:
+        stats.considered += 1
+        # The high-water mark covers every considered signal, including
+        # ones that are later rejected or fail to write.
+        if first_seen > max_seen:
+            max_seen = first_seen
+        passes, reason = _passes_strict_gate(
+            signal, confidence_floor=floor, stats=stats
+        )
+        if not passes:
+            continue
+        match = is_duplicate(signal, window_days=60, candidates=live_snapshot)
+        if match is not None:
+            stats.rejected_dedup += 1
+            continue
+        if dry_run:
+            stats.promoted += 1
+            promoted.append({
+                "item_id": "DRY-RUN",
+                "signal_fingerprint": signal.get("fingerprint"),
+                "title": _build_title(signal),
+                "snippet": (signal.get("content_snippet") or "")[:200],
+                "confidence": signal.get("confidence"),
+                "platform": signal.get("platform"),
+                "canonical_url": signal.get("canonical_url"),
+                "first_seen": first_seen,
+            })
+            # Mirror within-batch dedup behaviour even in dry-run so the
+            # preview count matches what a real run would write. Build a
+            # synthetic ledger-shaped item carrying the signal's
+            # fingerprint tokens.
+            tokens = sorted(extract_topic_fingerprint(signal))
+            now_iso = datetime.now(timezone.utc).isoformat()
+            live_snapshot.append({
+                "id": "DRY-RUN",
+                "status": "open",
+                "title": _build_title(signal),
+                "description": (signal.get("content_snippet") or ""),
+                "context": "",
+                "tags": [],
+                "created_at": now_iso,
+                "updated_at": now_iso,
+                "metadata": {"signal_ref": {"fingerprint": tokens}},
+            })
+            continue
+        item = _build_item(signal)
+        # Capture the signal_ref before _add_to_strategy_ledger pops it
+        # off the item dict — we need it for the within-batch snapshot
+        # append below so subsequent signals can dedup against this one.
+        captured_signal_ref = item.get("metadata_signal_ref") or {}
+        try:
+            result = _add_to_strategy_ledger(item)
+        except Exception as exc:
+            # NOTE(review): ``max_seen`` already includes this signal, so
+            # the cursor still advances past it and a failed write is not
+            # retried on the next run — confirm this is intended.
+            logger.exception("scan_bridge: ledger add failed for %s", signal.get("fingerprint"))
+            continue
+        # NOTE(review): presumably add_item returns {"added": {"id": ...}};
+        # a response without it is still counted as promoted, with an
+        # empty item_id — verify against ledger_manager.
+        added = result.get("added") or {}
+        item_id = added.get("id") or ""
+        stats.promoted += 1
+        record = {
+            "ts": datetime.now(timezone.utc).isoformat(),
+            "item_id": item_id,
+            "signal_fingerprint": signal.get("fingerprint"),
+            "title": item["title"],
+            "platform": signal.get("platform"),
+            "confidence": signal.get("confidence"),
+            "canonical_url": signal.get("canonical_url"),
+            "first_seen": first_seen,
+        }
+        _log_promotion(record)
+        promoted.append({
+            "item_id": item_id,
+            "signal_fingerprint": signal.get("fingerprint"),
+            "title": item["title"],
+            "snippet": (signal.get("content_snippet") or "")[:200],
+            "confidence": signal.get("confidence"),
+            "platform": signal.get("platform"),
+            "canonical_url": signal.get("canonical_url"),
+            "first_seen": first_seen,
+        })
+        # Add the freshly-promoted item to the in-memory snapshot so any
+        # later-but-similar signal in the same batch is correctly
+        # de-duplicated. ``item`` was modified in place by
+        # _add_to_strategy_ledger, so description and tags here already
+        # include the signal_ref block and the fp: tags.
+        now_iso = datetime.now(timezone.utc).isoformat()
+        live_snapshot.append({
+            "id": item_id,
+            "status": "open",
+            "title": item["title"],
+            "description": item["description"],
+            "context": item.get("context", ""),
+            "tags": item.get("tags") or [],
+            "created_at": now_iso,
+            "updated_at": now_iso,
+            "metadata": {"signal_ref": captured_signal_ref},
+        })
+    # Advance cursor on success — only when not a dry-run.
+    if not dry_run and max_seen and max_seen != since_iso:
+        _save_cursor(max_seen)
+    return {
+        "stats": {
+            "considered": stats.considered,
+            "rejected_classification": stats.rejected_classification,
+            "rejected_confidence": stats.rejected_confidence,
+            "rejected_dedup": stats.rejected_dedup,
+            "promoted": stats.promoted,
+        },
+        "promoted": promoted,
+        "cursor_advanced_to": max_seen if (not dry_run and max_seen != since_iso) else None,
+        "since": since_iso,
+        "dry_run": dry_run,
+        "confidence_floor": floor,
+    }
+def backfill_from(
+    start_date: date,
+    *,
+    dry_run: bool = False,
+    targets_file: Optional[Path] = None,
+    candidates: Optional[Iterable[Dict[str, Any]]] = None,
+) -> Dict[str, Any]:
+    """Walk ``targets_file`` from ``start_date`` (UTC) forward and
+    promote everything that passes the strict gate.
+    Idempotent — leverages the same cursor as ``promote_recent_signals``
+    so re-running on the same range is a no-op (or a delta-only run if
+    the file has grown).
+    Per the directive: surface the candidate counts so the founder sees
+    how much real signal was captured but never promoted before this
+    bridge existed.
+    """
+    since_dt = datetime.combine(start_date, datetime.min.time(), tzinfo=timezone.utc)
+    return promote_recent_signals(
+        since=since_dt,
+        dry_run=dry_run,
+        targets_file=targets_file,
+        candidates=candidates,
+    )

package/gateway/ai/scan_bridge/dedup.py ADDED Viewed

@@ -0,0 +1,335 @@
+"""LED-1264: scan-bridge dedup — fingerprint a signal and check the ledger.
+Two-stage dedup:
+1. Extract a topic fingerprint from the signal — domain/orbit signal
+   terms (reuse ``social_capability.fit_floor._extract_topic_fingerprint``
+   if available), plus the canonical_url host + first significant path
+   segment, plus the leading bracket-prefixed tag (e.g. ``[COMPETITOR
+   RELEASE]``) which is a strong topic signal in our scan corpus.
+2. Look the fingerprint up against the strategy ledger inside a
+   60-day window (any status — open, done, cancelled, blocked,
+   archived). If ANY active or recently-closed item matches, skip
+   promotion. Per the directive: 60% recall is fine; cost of missing
+   a duplicate is one founder-reviewed P2 item.
+Skipped duplicates are logged to ``~/.delimit/scan_bridge_dedup.jsonl``
+so the founder can audit what the bridge filtered out.
+"""
+from __future__ import annotations
+import json
+import re
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Any, Dict, Iterable, Optional, Set
+from urllib.parse import urlparse
+DEDUP_LOG = Path.home() / ".delimit" / "scan_bridge_dedup.jsonl"
+# Bracket-prefix tags carried by the scanner (e.g. "[COMPETITOR RELEASE]
+# oasdiff …" or "[VENDOR NEWS] …"). These are strong topic signals — when
+# present we lift them into the fingerprint as a single canonical token
+# so two scans of "oasdiff v1.15.1" + "oasdiff v1.15.2" both share the
+# "competitor_release:oasdiff" key.
+_BRACKET_PREFIX_RE = re.compile(r"^\s*\[([^\]]{1,40})\]\s*([^\s:.]{1,80})", re.IGNORECASE)
+# A trivial path-segment splitter; we just want the first non-empty
+# significant chunk (e.g. "oasdiff" from /oasdiff/oasdiff/releases/tag/...).
+_SIGNIFICANT_PATH_RE = re.compile(r"[A-Za-z0-9][A-Za-z0-9_\-.]{1,}")
+def _domain_orbit_terms(text: str) -> Set[str]:
+    """Best-effort import of fit_floor's topic extractor.
+    fit_floor extracts the union of matched Delimit-domain + orbit
+    signal terms. If the import fails for any reason (test isolation,
+    refactor) we fall back to an empty set — the URL/bracket terms
+    below are still load-bearing on their own.
+    """
+    try:
+        from ai.social_capability.fit_floor import _extract_topic_fingerprint
+    except Exception:  # pragma: no cover — tolerant fallback
+        return set()
+    try:
+        return set(_extract_topic_fingerprint(text or ""))
+    except Exception:  # pragma: no cover
+        return set()
+def _bracket_prefix_token(snippet: str) -> Optional[str]:
+    """Extract a "<tag>:<head_word>" canonical token from a bracketed
+    snippet header. Returns None when the snippet doesn't start with
+    a recognisable bracket tag.
+    """
+    if not snippet:
+        return None
+    m = _BRACKET_PREFIX_RE.match(snippet)
+    if not m:
+        return None
+    tag = re.sub(r"\s+", "_", m.group(1).strip().lower())
+    head = m.group(2).strip().lower()
+    if not tag or not head:
+        return None
+    return f"{tag}:{head}"
+def _url_terms(canonical_url: str) -> Set[str]:
+    """Return host + first significant path segment as canonical tokens."""
+    if not canonical_url:
+        return set()
+    try:
+        p = urlparse(canonical_url)
+    except Exception:
+        return set()
+    out: Set[str] = set()
+    host = (p.netloc or "").lower().lstrip("www.")
+    if host:
+        out.add(f"host:{host}")
+    # Pull first 1-2 significant path segments. For github.com the first
+    # is the org and the second is the repo — both useful as dedup keys.
+    segments = [s for s in (p.path or "").split("/") if s]
+    for seg in segments[:2]:
+        m = _SIGNIFICANT_PATH_RE.search(seg)
+        if m:
+            out.add(f"seg:{m.group(0).lower()}")
+    return out
+def extract_topic_fingerprint(signal: Dict[str, Any]) -> Set[str]:
+    """Return the dedup fingerprint set for a single scanned signal.
+    The fingerprint is a SET of canonical tokens. Two signals are
+    considered overlapping when their fingerprint sets share at least
+    one token. Per the directive: don't be too clever; 60% recall is
+    fine.
+    """
+    snippet = signal.get("content_snippet") or ""
+    canonical_url = signal.get("canonical_url") or ""
+    rationale = signal.get("rationale") or ""
+    tokens: Set[str] = set()
+    tokens.update(_domain_orbit_terms(f"{snippet}\n{rationale}"))
+    tokens.update(_url_terms(canonical_url))
+    bracket = _bracket_prefix_token(snippet)
+    if bracket:
+        tokens.add(bracket)
+    return tokens
+# ── Ledger lookup ─────────────────────────────────────────────────────
+def _parse_iso(value: Optional[str]) -> Optional[datetime]:
+    if not value:
+        return None
+    try:
+        dt = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
+    except (TypeError, ValueError):
+        return None
+    if dt.tzinfo is None:
+        dt = dt.replace(tzinfo=timezone.utc)
+    return dt
+def _item_fingerprint_tokens(item: Dict[str, Any]) -> Set[str]:
+    """Recover a fingerprint token set from a stored ledger item.
+    Auto-promoted items carry their fingerprint in
+    ``metadata.signal_ref.fingerprint`` as a serialised list. Older /
+    hand-added items don't, so we fall back to extracting on-the-fly
+    from title + description + tags + context — the same fields a
+    reasonable founder would have written about the same topic.
+    """
+    metadata = item.get("metadata") or {}
+    signal_ref = metadata.get("signal_ref") or {}
+    stored = signal_ref.get("fingerprint")
+    if isinstance(stored, list) and stored:
+        return {str(t).lower() for t in stored if t}
+    if isinstance(stored, str) and stored:
+        # Comma-separated fallback shape.
+        return {p.strip().lower() for p in stored.split(",") if p.strip()}
+    # Fallback: synthesise a fingerprint from the human text in the item.
+    parts = [
+        item.get("title") or "",
+        item.get("description") or "",
+        item.get("context") or "",
+    ]
+    tags = item.get("tags") or []
+    if isinstance(tags, list):
+        parts.append(" ".join(str(t) for t in tags))
+    text = "\n".join(p for p in parts if p)
+    fake_signal = {"content_snippet": text, "canonical_url": "", "rationale": ""}
+    return extract_topic_fingerprint(fake_signal)
+def _within_window(item: Dict[str, Any], window_days: int, now: datetime) -> bool:
+    """Item is in-window if either created_at OR updated_at is within
+    ``window_days`` of ``now``.
+    """
+    cutoff = now - timedelta(days=window_days)
+    for field in ("updated_at", "created_at"):
+        ts = _parse_iso(item.get(field))
+        if ts and ts >= cutoff:
+            return True
+    return False
+def _candidate_strategy_items(window_days: int = 60) -> Iterable[Dict[str, Any]]:
+    """Yield strategy items in the dedup window.
+    Imports ``ai.ledger_manager.list_items`` lazily so test patches
+    targeting that symbol take effect at call time.
+    """
+    try:
+        from ai.ledger_manager import list_items
+    except Exception:  # pragma: no cover
+        return iter(())
+    now = datetime.now(timezone.utc)
+    out: list = []
+    cursor: Optional[str] = None
+    seen_ids: Set[str] = set()
+    # Walk pages defensively — most ledgers have <500 strategy items, but
+    # paginate if needed.
+    for _ in range(20):  # hard cap on pages, prevents accidental infinite loop
+        resp = list_items(
+            ledger="strategy",
+            limit=500,
+            cursor=cursor,
+            sort="updated_at",
+            order="desc",
+        )
+        items = (resp.get("items") or {}).get("strategy") or []
+        if not items:
+            break
+        for item in items:
+            iid = item.get("id") or ""
+            if iid and iid in seen_ids:
+                continue
+            if iid:
+                seen_ids.add(iid)
+            if _within_window(item, window_days, now):
+                out.append(item)
+        cursor = resp.get("next_cursor")
+        if not cursor:
+            break
+    return out
+def _log_dedup(signal: Dict[str, Any], match: Dict[str, Any], reason: str) -> None:
+    try:
+        DEDUP_LOG.parent.mkdir(parents=True, exist_ok=True)
+        with DEDUP_LOG.open("a", encoding="utf-8") as fh:
+            fh.write(json.dumps({
+                "ts": datetime.now(timezone.utc).isoformat(),
+                "signal_fingerprint_id": signal.get("fingerprint"),
+                "platform": signal.get("platform"),
+                "canonical_url": signal.get("canonical_url"),
+                "snippet_head": (signal.get("content_snippet") or "")[:160],
+                "matched_item_id": match.get("id"),
+                "matched_item_title": (match.get("title") or "")[:160],
+                "matched_item_status": match.get("status"),
+                "reason": reason,
+            }) + "\n")
+    except OSError:  # pragma: no cover — best-effort
+        pass
+def _is_strong_match(shared: Set[str], sig_tokens: Set[str]) -> bool:
+    """Return True when the shared-token set is specific enough to
+    claim two signals are about the same topic.
+    Strict rule (chosen after empirical scan-corpus tuning, see
+    LED-1264 memo): a true dedup match requires a SPECIFIC token —
+    either a bracket-prefix token (``competitor_release:oasdiff``,
+    ``vendor_news:cursor``, ``outreach_state_change:logto-io``) or a
+    ``seg:<repo>`` URL path segment. Generic orbit terms ("mcp",
+    "claude code", "cursor"), tech-context words, and bare host tokens
+    are NOT enough on their own. A signal where two of those overlap
+    but neither has a specific identifier is two different things
+    that happen to live in the same ecosystem; we want them as
+    separate ledger items.
+    Per the directive: "don't be too clever — 60% recall on duplicates
+    is fine; the cost of missing a duplicate is one founder-reviewed
+    P2 ledger item, not a catastrophe." This rule errs toward
+    promoting (more recall on the no-dedup decision).
+    """
+    if not shared:
+        return False
+    # Bracket-prefix tokens win — they're tightly scoped (vendor name
+    # baked in). Excludes host: and seg: which use the same `:` syntax
+    # but live in their own buckets below.
+    if any(":" in t and not t.startswith("host:") and not t.startswith("seg:") for t in shared):
+        return True
+    # Specific repo segments win — same repo across two signals is a
+    # real dedup. seg: tokens carry the repo name post-host (e.g. for
+    # github.com/oasdiff/oasdiff we extract seg:oasdiff). When two
+    # signals share that, they're about the same project.
+    if any(t.startswith("seg:") for t in shared):
+        return True
+    return False
+def is_duplicate(
+    signal: Dict[str, Any],
+    *,
+    window_days: int = 60,
+    candidates: Optional[Iterable[Dict[str, Any]]] = None,
+) -> Optional[Dict[str, Any]]:
+    """Return the matching ledger item dict if ``signal`` collides with
+    an existing strategy item inside the window; ``None`` otherwise.
+    The match rule is intentionally specific — sharing only "mcp" or
+    "host:github.com" between two signals isn't enough overlap to call
+    them duplicates (that's most of the scan corpus). See
+    :func:`_is_strong_match` for the exact rule.
+    Parameters
+    ----------
+    signal:
+        Raw scan target dict (the JSONL line shape from
+        ``social_targets.jsonl``).
+    window_days:
+        Age window for "recently closed" items. Default 60 — per the
+        directive, avoid re-raising things we explicitly chose not to act
+        on within the last 60 days.
+    candidates:
+        Optional iterable of strategy items to check against. Tests pass
+        an explicit list. Production callers omit it and we fetch from
+        the live ledger.
+    """
+    sig_tokens = extract_topic_fingerprint(signal)
+    if not sig_tokens:
+        # No tokens at all means we can't make a useful dedup judgement.
+        # Treat as non-duplicate; the tight confidence floor is the main
+        # quality gate.
+        return None
+    items = list(candidates) if candidates is not None else list(
+        _candidate_strategy_items(window_days=window_days)
+    )
+    now = datetime.now(timezone.utc)
+    for item in items:
+        # When candidates were supplied explicitly we still respect the
+        # window so unit tests can assert window behaviour without
+        # re-implementing the date filter.
+        if candidates is not None and not _within_window(item, window_days, now):
+            continue
+        item_tokens = _item_fingerprint_tokens(item)
+        if not item_tokens:
+            continue
+        shared = sig_tokens & item_tokens
+        if not _is_strong_match(shared, sig_tokens):
+            continue
+        reason = "open_match" if (item.get("status") == "open") else "recent_match"
+        _log_dedup(signal, item, reason)
+        return item
+    return None

package/gateway/ai/scan_bridge/digest.py ADDED Viewed

@@ -0,0 +1,151 @@
+"""LED-1264 daily digest assembler.
+Reads ``~/.delimit/scan_bridge_promotions.jsonl`` and assembles ONE
+email-ready digest of the last 24h of promotions. On a zero-signal day
+the digest has ``count == 0`` and an empty subject/body so the caller
+can skip sending — silent days are fine per the directive.
+The digest text is intentionally plain — no markdown, no html — so
+the same string can be used as an email body or a Slack message
+without re-formatting.
+"""
+from __future__ import annotations
+import json
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+PROMOTIONS_LOG = Path.home() / ".delimit" / "scan_bridge_promotions.jsonl"  # default log read by build_daily_digest when log_path is omitted
+def _parse_iso(value: Optional[str]) -> Optional[datetime]:
+    if not value:
+        return None
+    try:
+        dt = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
+    except (TypeError, ValueError):
+        return None
+    if dt.tzinfo is None:
+        dt = dt.replace(tzinfo=timezone.utc)
+    return dt
+def _load_promotions(log_path: Path) -> List[Dict[str, Any]]:
+    if not log_path.exists():
+        return []
+    out: List[Dict[str, Any]] = []
+    try:
+        with log_path.open("r", encoding="utf-8") as fh:
+            for line in fh:
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    out.append(json.loads(line))
+                except (ValueError, json.JSONDecodeError):
+                    continue
+    except OSError:
+        return []
+    return out
+def _filter_window(
+    promotions: List[Dict[str, Any]], since: datetime
+) -> List[Dict[str, Any]]:
+    out: List[Dict[str, Any]] = []
+    for p in promotions:
+        ts = _parse_iso(p.get("ts"))
+        if ts and ts >= since:
+            out.append(p)
+    return out
+def build_daily_digest(
+    *,
+    now: Optional[datetime] = None,
+    window_hours: int = 24,
+    log_path: Optional[Path] = None,
+) -> Dict[str, Any]:
+    """Assemble the last-N-hour promotion digest.
+    Returns
+    -------
+    dict::
+        {
+          "subject": "Delimit scan-bridge — N strategic items (last 24h)",
+          "body":    "<plain text body>",
+          "count":   N,
+          "since":   ISO datetime,
+          "items":   [...promotion rows...],
+        }
+    When ``count == 0`` the subject and body are empty strings so the
+    caller can short-circuit ("no email on silent days") without having
+    to re-check ``count``.
+    """
+    now = now or datetime.now(timezone.utc)
+    since = now - timedelta(hours=window_hours)
+    log_path = log_path or PROMOTIONS_LOG
+    promotions = _load_promotions(log_path)
+    items = _filter_window(promotions, since)
+    items.sort(key=lambda p: p.get("ts") or "", reverse=True)
+    if not items:
+        return {
+            "subject": "",
+            "body": "",
+            "count": 0,
+            "since": since.isoformat(),
+            "items": [],
+        }
+    lines: List[str] = []
+    lines.append(
+        f"Delimit scan-bridge auto-promoted {len(items)} strategic signal(s) "
+        f"to the strategy ledger in the last {window_hours}h."
+    )
+    lines.append("")
+    lines.append(
+        "All items are P2 (review, not auto-action). Reply with item id + "
+        "decision (escalate, archive, defer) or open the ledger to triage."
+    )
+    lines.append("")
+    lines.append("─" * 70)
+    for p in items:
+        title = p.get("title") or "(no title)"
+        item_id = p.get("item_id") or "(unassigned)"
+        confidence = p.get("confidence")
+        platform = p.get("platform") or "?"
+        url = p.get("canonical_url") or ""
+        first_seen = p.get("first_seen") or ""
+        try:
+            conf_str = f"{float(confidence):.2f}" if confidence is not None else "?"
+        except (TypeError, ValueError):
+            conf_str = str(confidence)
+        lines.append(f"[{item_id}] {title}")
+        lines.append(f"    platform={platform} confidence={conf_str} first_seen={first_seen}")
+        if url:
+            lines.append(f"    {url}")
+        lines.append("")
+    lines.append("─" * 70)
+    lines.append("")
+    lines.append(
+        "Source: ~/.delimit/scan_bridge_promotions.jsonl. "
+        "Skipped duplicates: ~/.delimit/scan_bridge_dedup.jsonl. "
+        "Tune via DELIMIT_SCAN_PROMO_CONFIDENCE (default 0.85)."
+    )
+    body = "\n".join(lines)
+    subject = f"Delimit scan-bridge — {len(items)} strategic item(s) (last {window_hours}h)"
+    return {
+        "subject": subject,
+        "body": body,
+        "count": len(items),
+        "since": since.isoformat(),
+        "items": items,
+    }

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "delimit-cli",
   "mcpName": "io.github.delimit-ai/delimit-mcp-server",
-  "version": "4.5.6",
+  "version": "4.5.7",
   "description": "Unify Claude Code, Codex, Cursor, and Gemini CLI with persistent context, governance, and multi-model debate.",
   "main": "index.js",
   "files": [