delimit-cli 4.5.6 → 4.5.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,39 @@
1
+ """LED-1264: scan → strategy-ledger auto-promote bridge.
2
+
3
+ Pure consumer of ``~/.delimit/social_targets.jsonl`` (the existing
4
+ ``delimit_social_target`` output). Promotes a tightly-gated subset of
5
+ strategic signals into the strategy ledger so the founder reviews them
6
+ via a daily digest instead of inbox-spam pings.
7
+
8
+ Panel decision (UNANIMOUS R3, 2026-05-07): tight guards
9
+ (strategic + confidence ≥ 0.85 + dedup against open / 60-day-closed),
10
+ P2 priority (review, not auto-action), one daily digest email.
11
+
12
+ Public entry points:
13
+
14
+ - :func:`bridge.promote_recent_signals` — main work function
15
+ - :func:`digest.build_daily_digest` — assemble last-24h digest text
16
+ - :func:`bridge.backfill_from` — one-time idempotent backfill walker
17
+
18
+ The bridge is invoked by ``scripts/scan_bridge_cron.py`` on a 6-hour
19
+ crontab cadence (founder applies manually). Direct in-process calls to
20
+ ``ai.ledger_manager.add_item`` — no MCP subprocess.
21
+ """
22
+
23
+ from ai.scan_bridge.bridge import (
24
+ backfill_from,
25
+ promote_recent_signals,
26
+ )
27
+ from ai.scan_bridge.dedup import (
28
+ extract_topic_fingerprint,
29
+ is_duplicate,
30
+ )
31
+ from ai.scan_bridge.digest import build_daily_digest
32
+
33
+ __all__ = [
34
+ "backfill_from",
35
+ "build_daily_digest",
36
+ "extract_topic_fingerprint",
37
+ "is_duplicate",
38
+ "promote_recent_signals",
39
+ ]
@@ -0,0 +1,473 @@
1
+ """LED-1264 scan-bridge — promotion engine.
2
+
3
+ Reads ``~/.delimit/social_targets.jsonl`` (the existing
4
+ ``delimit_social_target`` output), filters to the tight panel-locked
5
+ gate, runs dedup against the strategy ledger, and promotes survivors
6
+ via direct in-process ``ledger_manager.add_item`` calls.
7
+
8
+ State / cursor:
9
+ ``~/.delimit/scan_bridge_cursor.json`` records the most-recent
10
+ ``first_seen`` value we've already processed. Subsequent runs only
11
+ consider lines newer than that. Idempotent — re-running the cron
12
+ on the same JSONL is a no-op.
13
+
14
+ Promotions log:
15
+ ``~/.delimit/scan_bridge_promotions.jsonl`` records every successful
16
+ promotion (item_id, signal_fingerprint, ts) so the daily digest can
17
+ assemble the last-24h batch without re-walking the ledger.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import json
23
+ import logging
24
+ import os
25
+ from contextlib import contextmanager
26
+ from dataclasses import dataclass
27
+ from datetime import datetime, date, timedelta, timezone
28
+ from pathlib import Path
29
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
30
+
31
+ from ai.scan_bridge.dedup import (
32
+ _candidate_strategy_items,
33
+ extract_topic_fingerprint,
34
+ is_duplicate,
35
+ )
36
+
37
+ logger = logging.getLogger("delimit.ai.scan_bridge.bridge")
38
+
39
+ TARGETS_FILE = Path.home() / ".delimit" / "social_targets.jsonl"
40
+ CURSOR_FILE = Path.home() / ".delimit" / "scan_bridge_cursor.json"
41
+ PROMOTIONS_LOG = Path.home() / ".delimit" / "scan_bridge_promotions.jsonl"
42
+
43
+
44
+ def _confidence_floor() -> float:
45
+ """Resolve the active confidence floor (env-overridable per directive)."""
46
+ raw = os.environ.get("DELIMIT_SCAN_PROMO_CONFIDENCE", "")
47
+ if not raw:
48
+ return 0.85
49
+ try:
50
+ v = float(raw)
51
+ if 0.0 <= v <= 1.0:
52
+ return v
53
+ except (TypeError, ValueError):
54
+ pass
55
+ return 0.85
56
+
57
+
58
+ # ── Cursor I/O ────────────────────────────────────────────────────────
59
+
60
+
61
def _load_cursor() -> Optional[str]:
    """Return the most-recent ``first_seen`` we've already processed.

    Returns ``None`` when the cursor file is missing, unreadable, not valid
    JSON, not a JSON object, or has no truthy ``last_seen_at`` value.
    """
    try:
        # EAFP: a missing file surfaces as FileNotFoundError (an OSError),
        # so there is no window between an exists() check and the read.
        raw = CURSOR_FILE.read_text()
    except (OSError, ValueError):
        # ValueError covers UnicodeDecodeError from a corrupt file.
        return None
    try:
        data = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    if not isinstance(data, dict):
        # Valid JSON of the wrong shape (e.g. ``[]``) is a corrupt cursor,
        # not a crash.
        return None
    value = data.get("last_seen_at")
    return str(value) if value else None
71
+
72
+
73
def _save_cursor(last_seen_at: str) -> None:
    """Persist ``last_seen_at`` to the cursor file as ``{"last_seen_at": ...}``.

    Best-effort: the parent directory is created when missing, and an
    ``OSError`` is logged as a warning instead of being raised.
    """
    try:
        cursor_dir = CURSOR_FILE.parent
        cursor_dir.mkdir(parents=True, exist_ok=True)
        serialized = json.dumps({"last_seen_at": last_seen_at})
        CURSOR_FILE.write_text(serialized)
    except OSError:  # pragma: no cover — best-effort
        logger.warning("scan_bridge: failed to persist cursor")
79
+
80
+
81
def _log_promotion(record: Dict[str, Any]) -> None:
    """Append ``record`` as a single JSON line to the promotions log.

    Best-effort: the parent directory is created when missing, and any
    ``OSError`` while creating or writing is silently ignored.
    """
    try:
        PROMOTIONS_LOG.parent.mkdir(parents=True, exist_ok=True)
        with PROMOTIONS_LOG.open("a", encoding="utf-8") as log_handle:
            json_line = json.dumps(record)
            log_handle.write(json_line + "\n")
    except OSError:  # pragma: no cover — best-effort
        pass
88
+
89
+
90
+ # ── Filtering ─────────────────────────────────────────────────────────
91
+
92
+
93
+ @dataclass
94
+ class _FilterStats:
95
+ considered: int = 0
96
+ rejected_classification: int = 0
97
+ rejected_confidence: int = 0
98
+ rejected_dedup: int = 0
99
+ promoted: int = 0
100
+
101
+
102
+ def _passes_strict_gate(
103
+ signal: Dict[str, Any],
104
+ *,
105
+ confidence_floor: float,
106
+ stats: _FilterStats,
107
+ ) -> Tuple[bool, str]:
108
+ """Return ``(passes, reason)``. ``reason`` is "" on pass."""
109
+ classification = (signal.get("classification") or "").strip().lower()
110
+ if classification != "strategic":
111
+ stats.rejected_classification += 1
112
+ return False, f"classification={classification or 'missing'}"
113
+ try:
114
+ confidence = float(signal.get("confidence") or 0.0)
115
+ except (TypeError, ValueError):
116
+ confidence = 0.0
117
+ if confidence < confidence_floor:
118
+ stats.rejected_confidence += 1
119
+ return False, f"confidence={confidence:.2f}<{confidence_floor:.2f}"
120
+ return True, ""
121
+
122
+
123
+ # ── Promotion path ────────────────────────────────────────────────────
124
+
125
+
126
+ def _build_title(signal: Dict[str, Any]) -> str:
127
+ snippet = (signal.get("content_snippet") or "").strip()
128
+ # If the snippet starts with a "[TAG] head" prefix the tag + head
129
+ # makes the most readable title. Otherwise fall back to the first
130
+ # 80 chars of the snippet.
131
+ if snippet.startswith("["):
132
+ head = snippet.split("\n", 1)[0]
133
+ if len(head) > 120:
134
+ head = head[:117] + "..."
135
+ return f"STRATEGIC: {head}"
136
+ if len(snippet) > 100:
137
+ snippet = snippet[:97] + "..."
138
+ return f"STRATEGIC: {snippet}" if snippet else "STRATEGIC: (no snippet)"
139
+
140
+
141
def _build_item(signal: Dict[str, Any]) -> Dict[str, Any]:
    """Translate a scanned ``signal`` into the dict passed to the ledger.

    The result carries the ledger fields (title, ledger, type, priority,
    description, context, tags, source) plus a ``metadata_signal_ref``
    entry holding the signal's provenance and its sorted topic-fingerprint
    tokens.

    Parameters
    ----------
    signal:
        Raw scan-target dict.

    Returns
    -------
    A new dict; ``signal`` itself is not modified.
    """
    platform = signal.get("platform") or ""
    canonical_url = signal.get("canonical_url") or ""
    snippet = (signal.get("content_snippet") or "")[:280]
    # Same tolerant parse as the promotion gate: an unparseable confidence
    # becomes 0.0 rather than raising out of the promotion loop.
    try:
        confidence = float(signal.get("confidence") or 0.0)
    except (TypeError, ValueError):
        confidence = 0.0
    first_seen = signal.get("first_seen") or ""
    source_id = signal.get("source_id") or signal.get("fingerprint") or ""

    # Sorted so the stored fingerprint is deterministic for a given signal.
    fingerprint_set = sorted(extract_topic_fingerprint(signal))

    description = (
        f"Auto-promoted from {platform} signal at {confidence:.2f}: "
        f"{snippet}\n\nURL: {canonical_url or '(none)'}"
    )
    context_text = (
        f"Captured by delimit_social_target on {first_seen}. "
        "Panel-approved auto-promote (LED-1264) per deliberation 2026-05-07. "
        "Founder reviews via daily digest."
    )

    tags = ["auto_promoted", "scan_bridge"]
    if platform:
        tags.append(platform)

    return {
        "title": _build_title(signal),
        "ledger": "strategy",
        "type": "strategy",
        "priority": "P2",
        "description": description,
        "context": context_text,
        "tags": tags,
        "source": "scan_bridge_auto",
        "metadata_signal_ref": {
            "platform": platform,
            "source_id": source_id,
            "fingerprint": fingerprint_set,
            "first_seen": first_seen,
            "confidence": confidence,
            "canonical_url": canonical_url,
        },
    }
179
+
180
+
181
+ @contextmanager
182
+ def _signal_promote_bypass():
183
+ """Set ``_DELIMIT_SIGNAL_PROMOTED_BY`` so the LED-877 guard treats
184
+ this as the explicit promote path. Defensive against future source
185
+ name changes — guard currently allows ``scan_bridge_auto`` since it
186
+ doesn't start with the sensed prefixes, but this future-proofs.
187
+ """
188
+ key = "_DELIMIT_SIGNAL_PROMOTED_BY"
189
+ prev = os.environ.get(key)
190
+ os.environ[key] = "scan_bridge:LED-1264"
191
+ try:
192
+ yield
193
+ finally:
194
+ if prev is None:
195
+ os.environ.pop(key, None)
196
+ else:
197
+ os.environ[key] = prev
198
+
199
+
200
def _add_to_strategy_ledger(item: Dict[str, Any]) -> Dict[str, Any]:
    """Write ``item`` to the ledger via an in-process ``add_item`` call.

    ``item`` is modified in place before the call:

    * ``metadata_signal_ref`` is removed, because (per the original note)
      the ledger does not accept a ``metadata`` kwarg;
    * the removed signal_ref is appended to ``description`` as a fenced
      JSON block so it stays structurally recoverable;
    * up to eight ``fp:<token>`` tags built from the signal_ref fingerprint
      are appended to ``tags``.

    Returns whatever ``add_item`` returns.
    """
    from ai.ledger_manager import add_item

    signal_ref = item.pop("metadata_signal_ref", {})
    tokens = signal_ref.get("fingerprint") or []
    # Cap the fingerprint tags so the tag list stays manageable.
    fingerprint_tags = [f"fp:{token}" for token in tokens][:8]

    encoded_ref = json.dumps(signal_ref, ensure_ascii=False, sort_keys=True)
    fenced_ref = "\n\nsignal_ref:\n```json\n" + encoded_ref + "\n```"
    item["description"] = item.get("description", "") + fenced_ref
    item["tags"] = [*(item.get("tags") or []), *fingerprint_tags]

    with _signal_promote_bypass():
        return add_item(**item)
225
+
226
+
227
+ # ── Public API ────────────────────────────────────────────────────────
228
+
229
+
230
def _iter_signals(targets_file: Path = TARGETS_FILE) -> Iterable[Dict[str, Any]]:
    """Yield each JSON-object line of ``targets_file`` as a dict.

    Blank lines, lines that are not valid JSON, and lines whose JSON value
    is not an object are skipped. A missing file yields nothing; an
    ``OSError`` while reading is logged as a warning and ends iteration.
    """
    if not targets_file.exists():
        return
    try:
        with targets_file.open("r", encoding="utf-8") as fh:
            for raw_line in fh:
                stripped = raw_line.strip()
                if not stripped:
                    continue
                try:
                    record = json.loads(stripped)
                except ValueError:  # json.JSONDecodeError is a ValueError subclass
                    continue
                # Consumers call ``.get`` on every row, so a bare list,
                # number or string would crash them — drop it here.
                if isinstance(record, dict):
                    yield record
    except OSError as exc:  # pragma: no cover
        logger.warning("scan_bridge: failed to read %s: %s", targets_file, exc)
245
+
246
+
247
+ def _normalize_first_seen(value: Any) -> str:
248
+ """Return a comparable string. Empty string sorts before anything."""
249
+ if not value:
250
+ return ""
251
+ return str(value)
252
+
253
+
254
def promote_recent_signals(
    since: Optional[datetime] = None,
    *,
    dry_run: bool = False,
    targets_file: Optional[Path] = None,
    confidence_floor: Optional[float] = None,
    candidates: Optional[Iterable[Dict[str, Any]]] = None,
) -> Dict[str, Any]:
    """Process scanned signals from ``targets_file`` and promote
    survivors of the strict gate to the strategy ledger.

    Parameters
    ----------
    since:
        Optional cutoff. The later of ``since`` and the persisted cursor
        is used; with neither, the cutoff is 24h ago.
    dry_run:
        When True no ledger writes happen and the cursor is not advanced;
        the response still contains the would-be promotions for audit /
        preview.
    targets_file:
        Override the default ``social_targets.jsonl`` path (test hook).
    confidence_floor:
        Override the env-resolved floor (test hook).
    candidates:
        Override the strategy-ledger candidate list for dedup (test
        hook). When omitted the live ledger is read once per run.

    Returns
    -------
    dict with keys: ``stats``, ``promoted`` (list of {item_id,
    signal_fingerprint, title, snippet, confidence, platform,
    canonical_url, first_seen}), ``failed`` (number of signals whose
    processing raised), ``cursor_advanced_to``, ``since``, ``dry_run``,
    ``confidence_floor``.

    Notes
    -----
    A signal whose processing raises (malformed field types, ledger write
    error) is logged, counted in ``failed`` and skipped, so one bad row
    cannot abort the run and leave the cursor stuck behind it. The cursor
    still advances past failed signals; they are not retried.
    """
    targets_file = targets_file or TARGETS_FILE
    floor = confidence_floor if confidence_floor is not None else _confidence_floor()

    # NOTE(review): cursor and ``first_seen`` values are compared as plain
    # strings. That is only chronological if the scanner writes ISO-8601
    # UTC timestamps in one consistent format — confirm against the writer.
    cursor_value = _load_cursor()
    if since is not None:
        # Caller-supplied since: take the LATER of since vs cursor so we
        # never reprocess a row we've already promoted.
        since_iso = since.astimezone(timezone.utc).isoformat()
        if cursor_value and cursor_value > since_iso:
            since_iso = cursor_value
    elif cursor_value:
        since_iso = cursor_value
    else:
        since_iso = (datetime.now(timezone.utc) - timedelta(hours=24)).isoformat()

    stats = _FilterStats()
    promoted: List[Dict[str, Any]] = []
    failed = 0
    max_seen = since_iso

    # Resolve candidates ONCE per run so N signals don't trigger N ledger
    # walks. The list is extended during the run so an early promotion
    # blocks a later duplicate within the same invocation.
    if candidates is None:
        live_snapshot: List[Dict[str, Any]] = list(_candidate_strategy_items(window_days=60))
    else:
        live_snapshot = list(candidates)

    # Process newest-first so that when several signals cover the same
    # topic the MOST RECENT one is promoted and older ones dedup against it.
    queued: List[Tuple[str, Dict[str, Any]]] = []
    for signal in _iter_signals(targets_file):
        if not isinstance(signal, dict):
            # Valid JSON that is not an object cannot be a signal.
            continue
        first_seen = _normalize_first_seen(signal.get("first_seen"))
        if first_seen <= since_iso:
            continue
        queued.append((first_seen, signal))
    queued.sort(key=lambda pair: pair[0], reverse=True)

    for first_seen, signal in queued:
        stats.considered += 1
        if first_seen > max_seen:
            max_seen = first_seen

        item: Dict[str, Any] = {}
        captured_signal_ref: Dict[str, Any] = {}
        tokens: List[str] = []
        result: Any = None
        try:
            passes, _reason = _passes_strict_gate(
                signal, confidence_floor=floor, stats=stats
            )
            if not passes:
                continue

            if is_duplicate(signal, window_days=60, candidates=live_snapshot) is not None:
                stats.rejected_dedup += 1
                continue

            title = _build_title(signal)
            snippet = signal.get("content_snippet") or ""
            preview = snippet[:200]
            if dry_run:
                tokens = sorted(extract_topic_fingerprint(signal))
            else:
                item = _build_item(signal)
                # Capture the signal_ref before _add_to_strategy_ledger
                # pops it off the item dict; the snapshot entry needs it.
                captured_signal_ref = item.get("metadata_signal_ref") or {}
                result = _add_to_strategy_ledger(item)
        except Exception:
            # Top-level boundary for a single signal: log with traceback,
            # count it, and move on to the next row.
            failed += 1
            logger.exception(
                "scan_bridge: promotion failed for %s", signal.get("fingerprint")
            )
            continue

        now_iso = datetime.now(timezone.utc).isoformat()
        if dry_run:
            item_id = "DRY-RUN"
            # Mirror within-batch dedup in dry-run so the preview count
            # matches what a real run would write.
            snapshot_entry = {
                "id": "DRY-RUN",
                "status": "open",
                "title": title,
                "description": snippet,
                "context": "",
                "tags": [],
                "created_at": now_iso,
                "updated_at": now_iso,
                "metadata": {"signal_ref": {"fingerprint": tokens}},
            }
        else:
            added = result.get("added") if isinstance(result, dict) else None
            if not isinstance(added, dict):
                added = {}
            item_id = added.get("id") or ""
            _log_promotion({
                "ts": now_iso,
                "item_id": item_id,
                "signal_fingerprint": signal.get("fingerprint"),
                "title": title,
                "platform": signal.get("platform"),
                "confidence": signal.get("confidence"),
                "canonical_url": signal.get("canonical_url"),
                "first_seen": first_seen,
            })
            # ``item`` was mutated by _add_to_strategy_ledger, so its
            # description and tags here are the values sent to the ledger.
            snapshot_entry = {
                "id": item_id,
                "status": "open",
                "title": title,
                "description": item["description"],
                "context": item.get("context", ""),
                "tags": item.get("tags") or [],
                "created_at": now_iso,
                "updated_at": now_iso,
                "metadata": {"signal_ref": captured_signal_ref},
            }

        stats.promoted += 1
        promoted.append({
            "item_id": item_id,
            "signal_fingerprint": signal.get("fingerprint"),
            "title": title,
            "snippet": preview,
            "confidence": signal.get("confidence"),
            "platform": signal.get("platform"),
            "canonical_url": signal.get("canonical_url"),
            "first_seen": first_seen,
        })
        live_snapshot.append(snapshot_entry)

    # Advance the cursor only for real runs that saw something newer.
    cursor_moved = not dry_run and bool(max_seen) and max_seen != since_iso
    if cursor_moved:
        _save_cursor(max_seen)

    return {
        "stats": {
            "considered": stats.considered,
            "rejected_classification": stats.rejected_classification,
            "rejected_confidence": stats.rejected_confidence,
            "rejected_dedup": stats.rejected_dedup,
            "promoted": stats.promoted,
        },
        "promoted": promoted,
        "failed": failed,
        "cursor_advanced_to": max_seen if (not dry_run and max_seen != since_iso) else None,
        "since": since_iso,
        "dry_run": dry_run,
        "confidence_floor": floor,
    }
447
+
448
+
449
def backfill_from(
    start_date: date,
    *,
    dry_run: bool = False,
    targets_file: Optional[Path] = None,
    candidates: Optional[Iterable[Dict[str, Any]]] = None,
) -> Dict[str, Any]:
    """Promote gate-passing signals seen since ``start_date`` (UTC midnight).

    This is a thin wrapper: ``start_date`` becomes a UTC-midnight datetime
    and everything else is delegated to ``promote_recent_signals``. Because
    that function shares its persisted cursor, a repeat call over the same
    range only looks at rows newer than the cursor.

    Parameters
    ----------
    start_date:
        First calendar day (UTC) to consider.
    dry_run, targets_file, candidates:
        Forwarded unchanged to ``promote_recent_signals``.

    Returns
    -------
    The result dict of ``promote_recent_signals``, including the stats
    counters for the run.
    """
    midnight = datetime.min.time()
    window_start = datetime.combine(start_date, midnight, tzinfo=timezone.utc)
    forwarded = {
        "dry_run": dry_run,
        "targets_file": targets_file,
        "candidates": candidates,
    }
    return promote_recent_signals(since=window_start, **forwarded)
@@ -0,0 +1,335 @@
1
+ """LED-1264: scan-bridge dedup — fingerprint a signal and check the ledger.
2
+
3
+ Two-stage dedup:
4
+
5
+ 1. Extract a topic fingerprint from the signal — domain/orbit signal
6
+ terms (reuse ``social_capability.fit_floor._extract_topic_fingerprint``
7
+ if available), plus the canonical_url host + first significant path
8
+ segment, plus the leading bracket-prefixed tag (e.g. ``[COMPETITOR
9
+ RELEASE]``) which is a strong topic signal in our scan corpus.
10
+
11
+ 2. Look the fingerprint up against the strategy ledger inside a
12
+ 60-day window (any status — open, done, cancelled, blocked,
13
+ archived). If ANY active or recently-closed item matches, skip
14
+ promotion. Per the directive: 60% recall is fine; cost of missing
15
+ a duplicate is one founder-reviewed P2 item.
16
+
17
+ Skipped duplicates are logged to ``~/.delimit/scan_bridge_dedup.jsonl``
18
+ so the founder can audit what the bridge filtered out.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import json
24
+ import re
25
+ from datetime import datetime, timedelta, timezone
26
+ from pathlib import Path
27
+ from typing import Any, Dict, Iterable, Optional, Set
28
+ from urllib.parse import urlparse
29
+
30
+ DEDUP_LOG = Path.home() / ".delimit" / "scan_bridge_dedup.jsonl"
31
+
32
+ # Bracket-prefix tags carried by the scanner (e.g. "[COMPETITOR RELEASE]
33
+ # oasdiff …" or "[VENDOR NEWS] …"). These are strong topic signals — when
34
+ # present we lift them into the fingerprint as a single canonical token
35
+ # so two scans of "oasdiff v1.15.1" + "oasdiff v1.15.2" both share the
36
+ # "competitor_release:oasdiff" key.
37
+ _BRACKET_PREFIX_RE = re.compile(r"^\s*\[([^\]]{1,40})\]\s*([^\s:.]{1,80})", re.IGNORECASE)
38
+
39
+ # A trivial path-segment splitter; we just want the first non-empty
40
+ # significant chunk (e.g. "oasdiff" from /oasdiff/oasdiff/releases/tag/...).
41
+ _SIGNIFICANT_PATH_RE = re.compile(r"[A-Za-z0-9][A-Za-z0-9_\-.]{1,}")
42
+
43
+
44
+ def _domain_orbit_terms(text: str) -> Set[str]:
45
+ """Best-effort import of fit_floor's topic extractor.
46
+
47
+ fit_floor extracts the union of matched Delimit-domain + orbit
48
+ signal terms. If the import fails for any reason (test isolation,
49
+ refactor) we fall back to an empty set — the URL/bracket terms
50
+ below are still load-bearing on their own.
51
+ """
52
+ try:
53
+ from ai.social_capability.fit_floor import _extract_topic_fingerprint
54
+ except Exception: # pragma: no cover — tolerant fallback
55
+ return set()
56
+ try:
57
+ return set(_extract_topic_fingerprint(text or ""))
58
+ except Exception: # pragma: no cover
59
+ return set()
60
+
61
+
62
+ def _bracket_prefix_token(snippet: str) -> Optional[str]:
63
+ """Extract a "<tag>:<head_word>" canonical token from a bracketed
64
+ snippet header. Returns None when the snippet doesn't start with
65
+ a recognisable bracket tag.
66
+ """
67
+ if not snippet:
68
+ return None
69
+ m = _BRACKET_PREFIX_RE.match(snippet)
70
+ if not m:
71
+ return None
72
+ tag = re.sub(r"\s+", "_", m.group(1).strip().lower())
73
+ head = m.group(2).strip().lower()
74
+ if not tag or not head:
75
+ return None
76
+ return f"{tag}:{head}"
77
+
78
+
79
+ def _url_terms(canonical_url: str) -> Set[str]:
80
+ """Return host + first significant path segment as canonical tokens."""
81
+ if not canonical_url:
82
+ return set()
83
+ try:
84
+ p = urlparse(canonical_url)
85
+ except Exception:
86
+ return set()
87
+ out: Set[str] = set()
88
+ host = (p.netloc or "").lower().lstrip("www.")
89
+ if host:
90
+ out.add(f"host:{host}")
91
+ # Pull first 1-2 significant path segments. For github.com the first
92
+ # is the org and the second is the repo — both useful as dedup keys.
93
+ segments = [s for s in (p.path or "").split("/") if s]
94
+ for seg in segments[:2]:
95
+ m = _SIGNIFICANT_PATH_RE.search(seg)
96
+ if m:
97
+ out.add(f"seg:{m.group(0).lower()}")
98
+ return out
99
+
100
+
101
def extract_topic_fingerprint(signal: Dict[str, Any]) -> Set[str]:
    """Compute the dedup fingerprint of a scanned signal as a token set.

    The set is the union of:

    * domain/orbit terms found in ``content_snippet`` + ``rationale``;
    * host / path-segment tokens from ``canonical_url``;
    * the bracket-prefix token of ``content_snippet``, when it has one.

    Missing fields are treated as empty strings.
    """
    snippet = signal.get("content_snippet") or ""
    canonical_url = signal.get("canonical_url") or ""
    rationale = signal.get("rationale") or ""

    fingerprint: Set[str] = set(_domain_orbit_terms(f"{snippet}\n{rationale}"))
    fingerprint |= _url_terms(canonical_url)
    bracket_token = _bracket_prefix_token(snippet)
    if bracket_token:
        fingerprint.add(bracket_token)
    return fingerprint
120
+
121
+
122
+ # ── Ledger lookup ─────────────────────────────────────────────────────
123
+
124
+
125
+ def _parse_iso(value: Optional[str]) -> Optional[datetime]:
126
+ if not value:
127
+ return None
128
+ try:
129
+ dt = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
130
+ except (TypeError, ValueError):
131
+ return None
132
+ if dt.tzinfo is None:
133
+ dt = dt.replace(tzinfo=timezone.utc)
134
+ return dt
135
+
136
+
137
+ def _item_fingerprint_tokens(item: Dict[str, Any]) -> Set[str]:
138
+ """Recover a fingerprint token set from a stored ledger item.
139
+
140
+ Auto-promoted items carry their fingerprint in
141
+ ``metadata.signal_ref.fingerprint`` as a serialised list. Older /
142
+ hand-added items don't, so we fall back to extracting on-the-fly
143
+ from title + description + tags + context — the same fields a
144
+ reasonable founder would have written about the same topic.
145
+ """
146
+ metadata = item.get("metadata") or {}
147
+ signal_ref = metadata.get("signal_ref") or {}
148
+ stored = signal_ref.get("fingerprint")
149
+ if isinstance(stored, list) and stored:
150
+ return {str(t).lower() for t in stored if t}
151
+ if isinstance(stored, str) and stored:
152
+ # Comma-separated fallback shape.
153
+ return {p.strip().lower() for p in stored.split(",") if p.strip()}
154
+
155
+ # Fallback: synthesise a fingerprint from the human text in the item.
156
+ parts = [
157
+ item.get("title") or "",
158
+ item.get("description") or "",
159
+ item.get("context") or "",
160
+ ]
161
+ tags = item.get("tags") or []
162
+ if isinstance(tags, list):
163
+ parts.append(" ".join(str(t) for t in tags))
164
+ text = "\n".join(p for p in parts if p)
165
+ fake_signal = {"content_snippet": text, "canonical_url": "", "rationale": ""}
166
+ return extract_topic_fingerprint(fake_signal)
167
+
168
+
169
def _within_window(item: Dict[str, Any], window_days: int, now: datetime) -> bool:
    """Report whether ``item`` was created or updated recently enough.

    True when ``updated_at`` or ``created_at`` parses to a timestamp no
    older than ``now - window_days``. Missing or unparseable timestamps
    never qualify.
    """
    oldest_allowed = now - timedelta(days=window_days)
    # Lazy generator: ``created_at`` is only parsed if ``updated_at`` fails.
    stamps = (_parse_iso(item.get(name)) for name in ("updated_at", "created_at"))
    return any(stamp is not None and stamp >= oldest_allowed for stamp in stamps)
179
+
180
+
181
+ def _candidate_strategy_items(window_days: int = 60) -> Iterable[Dict[str, Any]]:
182
+ """Yield strategy items in the dedup window.
183
+
184
+ Imports ``ai.ledger_manager.list_items`` lazily so test patches
185
+ targeting that symbol take effect at call time.
186
+ """
187
+ try:
188
+ from ai.ledger_manager import list_items
189
+ except Exception: # pragma: no cover
190
+ return iter(())
191
+ now = datetime.now(timezone.utc)
192
+ out: list = []
193
+ cursor: Optional[str] = None
194
+ seen_ids: Set[str] = set()
195
+ # Walk pages defensively — most ledgers have <500 strategy items, but
196
+ # paginate if needed.
197
+ for _ in range(20): # hard cap on pages, prevents accidental infinite loop
198
+ resp = list_items(
199
+ ledger="strategy",
200
+ limit=500,
201
+ cursor=cursor,
202
+ sort="updated_at",
203
+ order="desc",
204
+ )
205
+ items = (resp.get("items") or {}).get("strategy") or []
206
+ if not items:
207
+ break
208
+ for item in items:
209
+ iid = item.get("id") or ""
210
+ if iid and iid in seen_ids:
211
+ continue
212
+ if iid:
213
+ seen_ids.add(iid)
214
+ if _within_window(item, window_days, now):
215
+ out.append(item)
216
+ cursor = resp.get("next_cursor")
217
+ if not cursor:
218
+ break
219
+ return out
220
+
221
+
222
def _log_dedup(signal: Dict[str, Any], match: Dict[str, Any], reason: str) -> None:
    """Append an audit record for a skipped duplicate to ``DEDUP_LOG``.

    The record is one JSON line with the signal's identifying fields, the
    matched ledger item's id / title / status, and ``reason``. Snippet and
    title are truncated to 160 characters. Best-effort: any ``OSError`` is
    silently ignored.
    """
    try:
        DEDUP_LOG.parent.mkdir(parents=True, exist_ok=True)
        with DEDUP_LOG.open("a", encoding="utf-8") as audit_log:
            snippet_text = signal.get("content_snippet") or ""
            matched_title = match.get("title") or ""
            entry = {
                "ts": datetime.now(timezone.utc).isoformat(),
                "signal_fingerprint_id": signal.get("fingerprint"),
                "platform": signal.get("platform"),
                "canonical_url": signal.get("canonical_url"),
                "snippet_head": snippet_text[:160],
                "matched_item_id": match.get("id"),
                "matched_item_title": matched_title[:160],
                "matched_item_status": match.get("status"),
                "reason": reason,
            }
            audit_log.write(json.dumps(entry) + "\n")
    except OSError:  # pragma: no cover — best-effort
        pass
239
+
240
+
241
+ def _is_strong_match(shared: Set[str], sig_tokens: Set[str]) -> bool:
242
+ """Return True when the shared-token set is specific enough to
243
+ claim two signals are about the same topic.
244
+
245
+ Strict rule (chosen after empirical scan-corpus tuning, see
246
+ LED-1264 memo): a true dedup match requires a SPECIFIC token —
247
+ either a bracket-prefix token (``competitor_release:oasdiff``,
248
+ ``vendor_news:cursor``, ``outreach_state_change:logto-io``) or a
249
+ ``seg:<repo>`` URL path segment. Generic orbit terms ("mcp",
250
+ "claude code", "cursor"), tech-context words, and bare host tokens
251
+ are NOT enough on their own. A signal where two of those overlap
252
+ but neither has a specific identifier is two different things
253
+ that happen to live in the same ecosystem; we want them as
254
+ separate ledger items.
255
+
256
+ Per the directive: "don't be too clever — 60% recall on duplicates
257
+ is fine; the cost of missing a duplicate is one founder-reviewed
258
+ P2 ledger item, not a catastrophe." This rule errs toward
259
+ promoting (more recall on the no-dedup decision).
260
+ """
261
+ if not shared:
262
+ return False
263
+
264
+ # Bracket-prefix tokens win — they're tightly scoped (vendor name
265
+ # baked in). Excludes host: and seg: which use the same `:` syntax
266
+ # but live in their own buckets below.
267
+ if any(":" in t and not t.startswith("host:") and not t.startswith("seg:") for t in shared):
268
+ return True
269
+
270
+ # Specific repo segments win — same repo across two signals is a
271
+ # real dedup. seg: tokens carry the repo name post-host (e.g. for
272
+ # github.com/oasdiff/oasdiff we extract seg:oasdiff). When two
273
+ # signals share that, they're about the same project.
274
+ if any(t.startswith("seg:") for t in shared):
275
+ return True
276
+
277
+ return False
278
+
279
+
280
def is_duplicate(
    signal: Dict[str, Any],
    *,
    window_days: int = 60,
    candidates: Optional[Iterable[Dict[str, Any]]] = None,
) -> Optional[Dict[str, Any]]:
    """Find the strategy-ledger item that ``signal`` duplicates, if any.

    Two fingerprints collide only when their shared tokens include a
    specific one, as judged by :func:`_is_strong_match`. Sharing a generic
    term or a bare host is not enough.

    Parameters
    ----------
    signal:
        Raw scan target dict (the JSONL line shape from
        ``social_targets.jsonl``).
    window_days:
        Age window for candidate items. Default 60.
    candidates:
        Optional iterable of strategy items to check against. When given,
        each item is still filtered by the window. When omitted, items are
        fetched from the live ledger, which applies the window itself.

    Returns
    -------
    The first matching item dict, or ``None``. A signal with an empty
    fingerprint is never reported as a duplicate. Each match is also
    written to the dedup audit log.
    """
    sig_tokens = extract_topic_fingerprint(signal)
    if not sig_tokens:
        # Nothing to compare on; the confidence floor stays the main gate.
        return None

    caller_supplied = candidates is not None
    if caller_supplied:
        pool = list(candidates)
    else:
        pool = list(_candidate_strategy_items(window_days=window_days))

    reference_time = datetime.now(timezone.utc)
    for entry in pool:
        # Explicit candidates are not pre-filtered, so apply the window here.
        if caller_supplied and not _within_window(entry, window_days, reference_time):
            continue
        entry_tokens = _item_fingerprint_tokens(entry)
        if not entry_tokens:
            continue
        overlap = sig_tokens & entry_tokens
        if _is_strong_match(overlap, sig_tokens):
            reason = "open_match" if (entry.get("status") == "open") else "recent_match"
            _log_dedup(signal, entry, reason)
            return entry
    return None
@@ -0,0 +1,151 @@
1
+ """LED-1264 daily digest assembler.
2
+
3
+ Reads ``~/.delimit/scan_bridge_promotions.jsonl`` and assembles ONE
4
+ email-ready digest of the last 24h of promotions. Returns a dict whose
5
+ subject/body are empty (``count == 0``) on a zero-signal day so the caller
6
+ can skip sending — silent days are fine per the directive.
7
+
8
+ The digest text is intentionally plain — no markdown, no html — so
9
+ the same string can be used as an email body or a Slack message
10
+ without re-formatting.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ from datetime import datetime, timedelta, timezone
17
+ from pathlib import Path
18
+ from typing import Any, Dict, List, Optional
19
+
20
+ PROMOTIONS_LOG = Path.home() / ".delimit" / "scan_bridge_promotions.jsonl"
21
+
22
+
23
+ def _parse_iso(value: Optional[str]) -> Optional[datetime]:
24
+ if not value:
25
+ return None
26
+ try:
27
+ dt = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
28
+ except (TypeError, ValueError):
29
+ return None
30
+ if dt.tzinfo is None:
31
+ dt = dt.replace(tzinfo=timezone.utc)
32
+ return dt
33
+
34
+
35
+ def _load_promotions(log_path: Path) -> List[Dict[str, Any]]:
36
+ if not log_path.exists():
37
+ return []
38
+ out: List[Dict[str, Any]] = []
39
+ try:
40
+ with log_path.open("r", encoding="utf-8") as fh:
41
+ for line in fh:
42
+ line = line.strip()
43
+ if not line:
44
+ continue
45
+ try:
46
+ out.append(json.loads(line))
47
+ except (ValueError, json.JSONDecodeError):
48
+ continue
49
+ except OSError:
50
+ return []
51
+ return out
52
+
53
+
54
def _filter_window(
    promotions: List[Dict[str, Any]], since: datetime
) -> List[Dict[str, Any]]:
    """Return the promotions whose ``ts`` parses and is at or after ``since``.

    Rows with a missing or unparsable ``ts`` are dropped. Input order is
    preserved.
    """
    stamped = ((row, _parse_iso(row.get("ts"))) for row in promotions)
    return [row for row, stamp in stamped if stamp and stamp >= since]
63
+
64
+
65
def _format_promotion(p: Dict[str, Any]) -> List[str]:
    """Render one promotion row as the plain-text lines of its digest entry."""
    title = p.get("title") or "(no title)"
    item_id = p.get("item_id") or "(unassigned)"
    confidence = p.get("confidence")
    platform = p.get("platform") or "?"
    url = p.get("canonical_url") or ""
    first_seen = p.get("first_seen") or ""
    try:
        conf_str = f"{float(confidence):.2f}" if confidence is not None else "?"
    except (TypeError, ValueError):
        # Non-numeric confidence: show it verbatim rather than drop the row.
        conf_str = str(confidence)
    entry = [
        f"[{item_id}] {title}",
        f" platform={platform} confidence={conf_str} first_seen={first_seen}",
    ]
    if url:
        entry.append(f" {url}")
    entry.append("")
    return entry


def build_daily_digest(
    *,
    now: Optional[datetime] = None,
    window_hours: int = 24,
    log_path: Optional[Path] = None,
) -> Dict[str, Any]:
    """Assemble the last-N-hour promotion digest.

    Parameters
    ----------
    now:
        End of the reporting window. Defaults to the current UTC time. A
        naive datetime is interpreted as UTC so it can be compared with
        the timezone-aware timestamps parsed from the log.
    window_hours:
        Length of the reporting window in hours.
    log_path:
        Promotions JSONL file to read. Defaults to ``PROMOTIONS_LOG``.

    Returns
    -------
    dict::

        {
            "subject": "Delimit scan-bridge — N strategic item(s) (last 24h)",
            "body": "<plain text body>",
            "count": N,
            "since": ISO datetime,
            "items": [...promotion rows, newest first...],
        }

    When ``count == 0`` the subject and body are empty strings so the
    caller can short-circuit ("no email on silent days") without having
    to re-check ``count``.
    """
    if now is None:
        now = datetime.now(timezone.utc)
    elif now.tzinfo is None:
        # Aware/naive comparisons raise TypeError; assume UTC for naive input.
        now = now.replace(tzinfo=timezone.utc)
    since = now - timedelta(hours=window_hours)
    log_path = log_path or PROMOTIONS_LOG

    items = _filter_window(_load_promotions(log_path), since)
    # Sort on the parsed instant, not the raw string: "…Z" and "…+02:00"
    # do not order chronologically when compared as text.
    oldest = datetime.min.replace(tzinfo=timezone.utc)
    items.sort(key=lambda p: _parse_iso(p.get("ts")) or oldest, reverse=True)

    if not items:
        return {
            "subject": "",
            "body": "",
            "count": 0,
            "since": since.isoformat(),
            "items": [],
        }

    rule = "─" * 70
    lines: List[str] = [
        f"Delimit scan-bridge auto-promoted {len(items)} strategic signal(s) "
        f"to the strategy ledger in the last {window_hours}h.",
        "",
        "All items are P2 (review, not auto-action). Reply with item id + "
        "decision (escalate, archive, defer) or open the ledger to triage.",
        "",
        rule,
    ]
    for p in items:
        lines.extend(_format_promotion(p))
    lines.extend(
        [
            rule,
            "",
            "Source: ~/.delimit/scan_bridge_promotions.jsonl. "
            "Skipped duplicates: ~/.delimit/scan_bridge_dedup.jsonl. "
            "Tune via DELIMIT_SCAN_PROMO_CONFIDENCE (default 0.85).",
        ]
    )

    body = "\n".join(lines)
    subject = f"Delimit scan-bridge — {len(items)} strategic item(s) (last {window_hours}h)"
    return {
        "subject": subject,
        "body": body,
        "count": len(items),
        "since": since.isoformat(),
        "items": items,
    }
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "delimit-cli",
3
3
  "mcpName": "io.github.delimit-ai/delimit-mcp-server",
4
- "version": "4.5.6",
4
+ "version": "4.5.7",
5
5
  "description": "Unify Claude Code, Codex, Cursor, and Gemini CLI with persistent context, governance, and multi-model debate.",
6
6
  "main": "index.js",
7
7
  "files": [