delimit-cli 4.5.6 → 4.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,335 @@
1
+ """LED-1264: scan-bridge dedup — fingerprint a signal and check the ledger.
2
+
3
+ Two-stage dedup:
4
+
5
+ 1. Extract a topic fingerprint from the signal — domain/orbit signal
6
+ terms (reuse ``social_capability.fit_floor._extract_topic_fingerprint``
7
+ if available), plus the canonical_url host + first significant path
8
+ segment, plus the leading bracket-prefixed tag (e.g. ``[COMPETITOR
9
+ RELEASE]``) which is a strong topic signal in our scan corpus.
10
+
11
+ 2. Look the fingerprint up against the strategy ledger inside a
12
+ 60-day window (any status — open, done, cancelled, blocked,
13
+ archived). If ANY active or recently-closed item matches, skip
14
+ promotion. Per the directive: 60% recall is fine; cost of missing
15
+ a duplicate is one founder-reviewed P2 item.
16
+
17
+ Skipped duplicates are logged to ``~/.delimit/scan_bridge_dedup.jsonl``
18
+ so the founder can audit what the bridge filtered out.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import json
24
+ import re
25
+ from datetime import datetime, timedelta, timezone
26
+ from pathlib import Path
27
+ from typing import Any, Dict, Iterable, Optional, Set
28
+ from urllib.parse import urlparse
29
+
30
+ DEDUP_LOG = Path.home() / ".delimit" / "scan_bridge_dedup.jsonl"
31
+
32
+ # Bracket-prefix tags carried by the scanner (e.g. "[COMPETITOR RELEASE]
33
+ # oasdiff …" or "[VENDOR NEWS] …"). These are strong topic signals — when
34
+ # present we lift them into the fingerprint as a single canonical token
35
+ # so two scans of "oasdiff v1.15.1" + "oasdiff v1.15.2" both share the
36
+ # "competitor_release:oasdiff" key.
37
+ _BRACKET_PREFIX_RE = re.compile(r"^\s*\[([^\]]{1,40})\]\s*([^\s:.]{1,80})", re.IGNORECASE)
38
+
39
+ # A trivial path-segment splitter; we just want the first non-empty
40
+ # significant chunk (e.g. "oasdiff" from /oasdiff/oasdiff/releases/tag/...).
41
+ _SIGNIFICANT_PATH_RE = re.compile(r"[A-Za-z0-9][A-Za-z0-9_\-.]{1,}")
42
+
43
+
44
+ def _domain_orbit_terms(text: str) -> Set[str]:
45
+ """Best-effort import of fit_floor's topic extractor.
46
+
47
+ fit_floor extracts the union of matched Delimit-domain + orbit
48
+ signal terms. If the import fails for any reason (test isolation,
49
+ refactor) we fall back to an empty set — the URL/bracket terms
50
+ below are still load-bearing on their own.
51
+ """
52
+ try:
53
+ from ai.social_capability.fit_floor import _extract_topic_fingerprint
54
+ except Exception: # pragma: no cover — tolerant fallback
55
+ return set()
56
+ try:
57
+ return set(_extract_topic_fingerprint(text or ""))
58
+ except Exception: # pragma: no cover
59
+ return set()
60
+
61
+
62
+ def _bracket_prefix_token(snippet: str) -> Optional[str]:
63
+ """Extract a "<tag>:<head_word>" canonical token from a bracketed
64
+ snippet header. Returns None when the snippet doesn't start with
65
+ a recognisable bracket tag.
66
+ """
67
+ if not snippet:
68
+ return None
69
+ m = _BRACKET_PREFIX_RE.match(snippet)
70
+ if not m:
71
+ return None
72
+ tag = re.sub(r"\s+", "_", m.group(1).strip().lower())
73
+ head = m.group(2).strip().lower()
74
+ if not tag or not head:
75
+ return None
76
+ return f"{tag}:{head}"
77
+
78
+
79
+ def _url_terms(canonical_url: str) -> Set[str]:
80
+ """Return host + first significant path segment as canonical tokens."""
81
+ if not canonical_url:
82
+ return set()
83
+ try:
84
+ p = urlparse(canonical_url)
85
+ except Exception:
86
+ return set()
87
+ out: Set[str] = set()
88
+ host = (p.netloc or "").lower().lstrip("www.")
89
+ if host:
90
+ out.add(f"host:{host}")
91
+ # Pull first 1-2 significant path segments. For github.com the first
92
+ # is the org and the second is the repo — both useful as dedup keys.
93
+ segments = [s for s in (p.path or "").split("/") if s]
94
+ for seg in segments[:2]:
95
+ m = _SIGNIFICANT_PATH_RE.search(seg)
96
+ if m:
97
+ out.add(f"seg:{m.group(0).lower()}")
98
+ return out
99
+
100
+
101
def extract_topic_fingerprint(signal: Dict[str, Any]) -> Set[str]:
    """Build the dedup fingerprint (a set of canonical tokens) for a signal.

    The set is the union of:

    * topic terms extracted from ``content_snippet`` + ``rationale``,
    * host / path tokens derived from ``canonical_url``,
    * the bracket-prefix token of ``content_snippet``, when it has one.

    Missing or falsy fields are treated as empty strings. Two signals
    overlap when their fingerprint sets share at least one token.
    """
    snippet = signal.get("content_snippet") or ""
    rationale = signal.get("rationale") or ""
    canonical_url = signal.get("canonical_url") or ""

    topic_terms = _domain_orbit_terms(f"{snippet}\n{rationale}")
    url_tokens = _url_terms(canonical_url)
    fingerprint: Set[str] = set().union(topic_terms, url_tokens)

    bracket_token = _bracket_prefix_token(snippet)
    if bracket_token:
        fingerprint.add(bracket_token)
    return fingerprint
120
+
121
+
122
+ # ── Ledger lookup ─────────────────────────────────────────────────────
123
+
124
+
125
+ def _parse_iso(value: Optional[str]) -> Optional[datetime]:
126
+ if not value:
127
+ return None
128
+ try:
129
+ dt = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
130
+ except (TypeError, ValueError):
131
+ return None
132
+ if dt.tzinfo is None:
133
+ dt = dt.replace(tzinfo=timezone.utc)
134
+ return dt
135
+
136
+
137
+ def _item_fingerprint_tokens(item: Dict[str, Any]) -> Set[str]:
138
+ """Recover a fingerprint token set from a stored ledger item.
139
+
140
+ Auto-promoted items carry their fingerprint in
141
+ ``metadata.signal_ref.fingerprint`` as a serialised list. Older /
142
+ hand-added items don't, so we fall back to extracting on-the-fly
143
+ from title + description + tags + context — the same fields a
144
+ reasonable founder would have written about the same topic.
145
+ """
146
+ metadata = item.get("metadata") or {}
147
+ signal_ref = metadata.get("signal_ref") or {}
148
+ stored = signal_ref.get("fingerprint")
149
+ if isinstance(stored, list) and stored:
150
+ return {str(t).lower() for t in stored if t}
151
+ if isinstance(stored, str) and stored:
152
+ # Comma-separated fallback shape.
153
+ return {p.strip().lower() for p in stored.split(",") if p.strip()}
154
+
155
+ # Fallback: synthesise a fingerprint from the human text in the item.
156
+ parts = [
157
+ item.get("title") or "",
158
+ item.get("description") or "",
159
+ item.get("context") or "",
160
+ ]
161
+ tags = item.get("tags") or []
162
+ if isinstance(tags, list):
163
+ parts.append(" ".join(str(t) for t in tags))
164
+ text = "\n".join(p for p in parts if p)
165
+ fake_signal = {"content_snippet": text, "canonical_url": "", "rationale": ""}
166
+ return extract_topic_fingerprint(fake_signal)
167
+
168
+
169
def _within_window(item: Dict[str, Any], window_days: int, now: datetime) -> bool:
    """Tell whether ``item`` was touched within ``window_days`` of ``now``.

    True when ``updated_at`` or ``created_at`` (checked in that order)
    parses to a timestamp at or after ``now - window_days``; False when
    neither field parses or both are older than the cutoff.
    """
    cutoff = now - timedelta(days=window_days)
    stamps = (_parse_iso(item.get(name)) for name in ("updated_at", "created_at"))
    return any(stamp and stamp >= cutoff for stamp in stamps)
179
+
180
+
181
+ def _candidate_strategy_items(window_days: int = 60) -> Iterable[Dict[str, Any]]:
182
+ """Yield strategy items in the dedup window.
183
+
184
+ Imports ``ai.ledger_manager.list_items`` lazily so test patches
185
+ targeting that symbol take effect at call time.
186
+ """
187
+ try:
188
+ from ai.ledger_manager import list_items
189
+ except Exception: # pragma: no cover
190
+ return iter(())
191
+ now = datetime.now(timezone.utc)
192
+ out: list = []
193
+ cursor: Optional[str] = None
194
+ seen_ids: Set[str] = set()
195
+ # Walk pages defensively — most ledgers have <500 strategy items, but
196
+ # paginate if needed.
197
+ for _ in range(20): # hard cap on pages, prevents accidental infinite loop
198
+ resp = list_items(
199
+ ledger="strategy",
200
+ limit=500,
201
+ cursor=cursor,
202
+ sort="updated_at",
203
+ order="desc",
204
+ )
205
+ items = (resp.get("items") or {}).get("strategy") or []
206
+ if not items:
207
+ break
208
+ for item in items:
209
+ iid = item.get("id") or ""
210
+ if iid and iid in seen_ids:
211
+ continue
212
+ if iid:
213
+ seen_ids.add(iid)
214
+ if _within_window(item, window_days, now):
215
+ out.append(item)
216
+ cursor = resp.get("next_cursor")
217
+ if not cursor:
218
+ break
219
+ return out
220
+
221
+
222
def _log_dedup(signal: Dict[str, Any], match: Dict[str, Any], reason: str) -> None:
    """Append one audit record about a skipped duplicate to ``DEDUP_LOG``.

    The record is a single JSON line holding the current UTC timestamp,
    identifying fields of ``signal`` (snippet truncated to 160 chars), the
    id / title (truncated to 160 chars) / status of the matched ledger
    item, and ``reason``. The log directory is created if missing.
    Logging is best-effort: any ``OSError`` is swallowed.
    """
    try:
        DEDUP_LOG.parent.mkdir(parents=True, exist_ok=True)
        with DEDUP_LOG.open("a", encoding="utf-8") as fh:
            record = {
                "ts": datetime.now(timezone.utc).isoformat(),
                "signal_fingerprint_id": signal.get("fingerprint"),
                "platform": signal.get("platform"),
                "canonical_url": signal.get("canonical_url"),
                "snippet_head": (signal.get("content_snippet") or "")[:160],
                "matched_item_id": match.get("id"),
                "matched_item_title": (match.get("title") or "")[:160],
                "matched_item_status": match.get("status"),
                "reason": reason,
            }
            fh.write(f"{json.dumps(record)}\n")
    except OSError:  # pragma: no cover — best-effort
        pass
239
+
240
+
241
+ def _is_strong_match(shared: Set[str], sig_tokens: Set[str]) -> bool:
242
+ """Return True when the shared-token set is specific enough to
243
+ claim two signals are about the same topic.
244
+
245
+ Strict rule (chosen after empirical scan-corpus tuning, see
246
+ LED-1264 memo): a true dedup match requires a SPECIFIC token —
247
+ either a bracket-prefix token (``competitor_release:oasdiff``,
248
+ ``vendor_news:cursor``, ``outreach_state_change:logto-io``) or a
249
+ ``seg:<repo>`` URL path segment. Generic orbit terms ("mcp",
250
+ "claude code", "cursor"), tech-context words, and bare host tokens
251
+ are NOT enough on their own. A signal where two of those overlap
252
+ but neither has a specific identifier is two different things
253
+ that happen to live in the same ecosystem; we want them as
254
+ separate ledger items.
255
+
256
+ Per the directive: "don't be too clever — 60% recall on duplicates
257
+ is fine; the cost of missing a duplicate is one founder-reviewed
258
+ P2 ledger item, not a catastrophe." This rule errs toward
259
+ promoting (more recall on the no-dedup decision).
260
+ """
261
+ if not shared:
262
+ return False
263
+
264
+ # Bracket-prefix tokens win — they're tightly scoped (vendor name
265
+ # baked in). Excludes host: and seg: which use the same `:` syntax
266
+ # but live in their own buckets below.
267
+ if any(":" in t and not t.startswith("host:") and not t.startswith("seg:") for t in shared):
268
+ return True
269
+
270
+ # Specific repo segments win — same repo across two signals is a
271
+ # real dedup. seg: tokens carry the repo name post-host (e.g. for
272
+ # github.com/oasdiff/oasdiff we extract seg:oasdiff). When two
273
+ # signals share that, they're about the same project.
274
+ if any(t.startswith("seg:") for t in shared):
275
+ return True
276
+
277
+ return False
278
+
279
+
280
def is_duplicate(
    signal: Dict[str, Any],
    *,
    window_days: int = 60,
    candidates: Optional[Iterable[Dict[str, Any]]] = None,
) -> Optional[Dict[str, Any]]:
    """Find the first in-window strategy item that ``signal`` duplicates.

    Returns the matching ledger item dict, or ``None`` when nothing
    collides. The overlap rule is deliberately specific — sharing only a
    generic term or a bare host token is not a match; see
    :func:`_is_strong_match` for the exact rule. Every match is recorded
    through ``_log_dedup`` with reason ``"open_match"`` (item status is
    ``"open"``) or ``"recent_match"`` (any other status).

    Parameters
    ----------
    signal:
        Raw scan target dict (the JSONL line shape from
        ``social_targets.jsonl``).
    window_days:
        Age window for candidate items. Default 60.
    candidates:
        Optional iterable of strategy items to check against. When given,
        each item is still filtered by the window here; when omitted, the
        items are fetched via ``_candidate_strategy_items``.
    """
    sig_tokens = extract_topic_fingerprint(signal)
    if not sig_tokens:
        # Without any tokens there is nothing to compare; treat the signal
        # as a non-duplicate.
        return None

    caller_supplied = candidates is not None
    if caller_supplied:
        pool = list(candidates)
    else:
        pool = list(_candidate_strategy_items(window_days=window_days))

    now = datetime.now(timezone.utc)
    for entry in pool:
        # Explicit candidates have not been date-filtered yet, so apply the
        # window here; fetched candidates were filtered while being fetched.
        if caller_supplied and not _within_window(entry, window_days, now):
            continue
        entry_tokens = _item_fingerprint_tokens(entry)
        if entry_tokens and _is_strong_match(sig_tokens & entry_tokens, sig_tokens):
            label = "open_match" if entry.get("status") == "open" else "recent_match"
            _log_dedup(signal, entry, label)
            return entry
    return None
@@ -0,0 +1,151 @@
1
+ """LED-1264 daily digest assembler.
2
+
3
+ Reads ``~/.delimit/scan_bridge_promotions.jsonl`` and assembles ONE
4
+ email-ready digest of the last 24h of promotions. On a zero-signal day
5
+ the digest comes back with ``count == 0`` and an empty subject/body so the
6
+ caller can skip sending — silent days are fine per the directive.
7
+
8
+ The digest text is intentionally plain — no markdown, no html — so
9
+ the same string can be used as an email body or a Slack message
10
+ without re-formatting.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ from datetime import datetime, timedelta, timezone
17
+ from pathlib import Path
18
+ from typing import Any, Dict, List, Optional
19
+
20
+ PROMOTIONS_LOG = Path.home() / ".delimit" / "scan_bridge_promotions.jsonl"
21
+
22
+
23
+ def _parse_iso(value: Optional[str]) -> Optional[datetime]:
24
+ if not value:
25
+ return None
26
+ try:
27
+ dt = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
28
+ except (TypeError, ValueError):
29
+ return None
30
+ if dt.tzinfo is None:
31
+ dt = dt.replace(tzinfo=timezone.utc)
32
+ return dt
33
+
34
+
35
+ def _load_promotions(log_path: Path) -> List[Dict[str, Any]]:
36
+ if not log_path.exists():
37
+ return []
38
+ out: List[Dict[str, Any]] = []
39
+ try:
40
+ with log_path.open("r", encoding="utf-8") as fh:
41
+ for line in fh:
42
+ line = line.strip()
43
+ if not line:
44
+ continue
45
+ try:
46
+ out.append(json.loads(line))
47
+ except (ValueError, json.JSONDecodeError):
48
+ continue
49
+ except OSError:
50
+ return []
51
+ return out
52
+
53
+
54
+ def _filter_window(
55
+ promotions: List[Dict[str, Any]], since: datetime
56
+ ) -> List[Dict[str, Any]]:
57
+ out: List[Dict[str, Any]] = []
58
+ for p in promotions:
59
+ ts = _parse_iso(p.get("ts"))
60
+ if ts and ts >= since:
61
+ out.append(p)
62
+ return out
63
+
64
+
65
def build_daily_digest(
    *,
    now: Optional[datetime] = None,
    window_hours: int = 24,
    log_path: Optional[Path] = None,
) -> Dict[str, Any]:
    """Assemble the last-N-hour promotion digest.

    Parameters
    ----------
    now:
        End of the reporting window. Defaults to the current UTC time. A
        naive datetime is interpreted as UTC so it can be compared with
        the timezone-aware timestamps parsed from the log.
    window_hours:
        Length of the reporting window in hours.
    log_path:
        Promotions JSONL file to read. Defaults to ``PROMOTIONS_LOG``.

    Returns
    -------
    dict::

        {
            "subject": "Delimit scan-bridge — N strategic item(s) (last 24h)",
            "body": "<plain text body>",
            "count": N,
            "since": ISO datetime,
            "items": [...promotion rows, newest first...],
        }

    When ``count == 0`` the subject and body are empty strings so the
    caller can short-circuit ("no email on silent days") without having
    to re-check ``count``.
    """
    now = now or datetime.now(timezone.utc)
    if now.tzinfo is None:
        # Parsed log timestamps are always timezone-aware; comparing them
        # with a naive ``now`` would raise TypeError.
        now = now.replace(tzinfo=timezone.utc)
    since = now - timedelta(hours=window_hours)
    log_path = log_path or PROMOTIONS_LOG

    promotions = _load_promotions(log_path)
    items = _filter_window(promotions, since)
    # Order by the parsed instant rather than the raw string: string order
    # is wrong when rows mix UTC offsets or "Z" / "+00:00" spellings.
    items.sort(key=lambda p: _parse_iso(p.get("ts")) or since, reverse=True)

    if not items:
        return {
            "subject": "",
            "body": "",
            "count": 0,
            "since": since.isoformat(),
            "items": [],
        }

    lines: List[str] = []
    lines.append(
        f"Delimit scan-bridge auto-promoted {len(items)} strategic signal(s) "
        f"to the strategy ledger in the last {window_hours}h."
    )
    lines.append("")
    lines.append(
        "All items are P2 (review, not auto-action). Reply with item id + "
        "decision (escalate, archive, defer) or open the ledger to triage."
    )
    lines.append("")
    lines.append("─" * 70)
    for p in items:
        title = p.get("title") or "(no title)"
        item_id = p.get("item_id") or "(unassigned)"
        confidence = p.get("confidence")
        platform = p.get("platform") or "?"
        url = p.get("canonical_url") or ""
        first_seen = p.get("first_seen") or ""
        try:
            conf_str = f"{float(confidence):.2f}" if confidence is not None else "?"
        except (TypeError, ValueError):
            # Non-numeric confidence: show it verbatim instead of failing.
            conf_str = str(confidence)
        lines.append(f"[{item_id}] {title}")
        lines.append(f" platform={platform} confidence={conf_str} first_seen={first_seen}")
        if url:
            lines.append(f" {url}")
        lines.append("")

    lines.append("─" * 70)
    lines.append("")
    lines.append(
        "Source: ~/.delimit/scan_bridge_promotions.jsonl. "
        "Skipped duplicates: ~/.delimit/scan_bridge_dedup.jsonl. "
        "Tune via DELIMIT_SCAN_PROMO_CONFIDENCE (default 0.85)."
    )

    body = "\n".join(lines)
    subject = f"Delimit scan-bridge — {len(items)} strategic item(s) (last {window_hours}h)"
    return {
        "subject": subject,
        "body": body,
        "count": len(items),
        "since": since.isoformat(),
        "items": items,
    }
@@ -59,6 +59,7 @@ allowed_claims:
59
59
  - id: diff_engine
60
60
  surface_name: "27 breaking-change types"
61
61
  description: "Deterministic diff engine for OpenAPI spec changes."
62
+ evidence_link: https://delimit.ai/docs/changes
62
63
  - id: github_action
63
64
  surface_name: "delimit-ai/delimit-action GitHub Action"
64
65
  description: "On Marketplace, breaking-change detection on PRs."
@@ -26,6 +26,7 @@ from __future__ import annotations
26
26
 
27
27
  import json
28
28
  import logging
29
+ import os
29
30
  import shlex
30
31
  import subprocess
31
32
  import time
@@ -94,16 +95,24 @@ ACTION_SPEC: Dict[str, Dict[str, Any]] = {
94
95
  }
95
96
 
96
97
 
97
- # LED-988: allowlist for propose_pr. Any repo path NOT in this set is
98
- # rejected at runtime regardless of whether the caller claimed validation
98
+ # LED-988 + LED-1258: allowlist for propose_pr. Any repo path NOT in this set
99
+ # is rejected at runtime regardless of whether the caller claimed validation
99
100
  # passed. Path-traversal-safe (resolved then checked against canonical).
100
- PROPOSE_PR_ALLOWED_REPOS = frozenset({
101
- "/home/delimit/delimit-gateway",
102
- "/home/delimit/delimit-ui",
103
- "/home/delimit/delimit-action",
104
- "/home/delimit/npm-delimit",
105
- "/root/governance-framework",
106
- })
101
+ #
102
+ # Loaded from the DELIMIT_PROPOSE_PR_REPOS env var (comma-separated absolute
103
+ # paths), NOT hardcoded — hardcoding developer-machine paths in shipped source
104
+ # both leaks the dev directory layout to customers AND makes the allowlist
105
+ # dead-code on customer machines (their paths won't match). Empty / unset env
106
+ # var = empty allowlist = propose_pr fails closed for all repo paths.
107
+
108
+ def _load_propose_pr_allowed_repos() -> frozenset:
109
+ raw = os.environ.get("DELIMIT_PROPOSE_PR_REPOS", "").strip()
110
+ if not raw:
111
+ return frozenset()
112
+ return frozenset(p.strip() for p in raw.split(",") if p.strip())
113
+
114
+
115
+ PROPOSE_PR_ALLOWED_REPOS = _load_propose_pr_allowed_repos()
107
116
  # Any branch created by propose_pr must carry this prefix so human branches
108
117
  # are never clobbered and PRs are obviously agent-authored at a glance.
109
118
  PROPOSE_PR_BRANCH_PREFIX = "delimit/"
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "delimit-cli",
3
3
  "mcpName": "io.github.delimit-ai/delimit-mcp-server",
4
- "version": "4.5.6",
4
+ "version": "4.5.8",
5
5
  "description": "Unify Claude Code, Codex, Cursor, and Gemini CLI with persistent context, governance, and multi-model debate.",
6
6
  "main": "index.js",
7
7
  "files": [