delimit-cli 4.5.13 → 4.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/CHANGELOG.md +48 -0
  2. package/README.md +9 -8
  3. package/bin/delimit-cli.js +179 -4
  4. package/bin/delimit-setup.js +46 -6
  5. package/gateway/ai/_compile_status.py +154 -0
  6. package/gateway/ai/agent_dispatch.py +41 -0
  7. package/gateway/ai/backends/git_health.py +175 -0
  8. package/gateway/ai/backends/tools_infra.py +163 -10
  9. package/gateway/ai/cli_contract.py +185 -0
  10. package/gateway/ai/daemon.py +10 -0
  11. package/gateway/ai/daily_digest.py +1 -2
  12. package/gateway/ai/delimit_daemon.py +67 -0
  13. package/gateway/ai/dispatch_gate.py +399 -0
  14. package/gateway/ai/governance.py +181 -0
  15. package/gateway/ai/heartbeat.py +290 -0
  16. package/gateway/ai/hot_reload.py +1 -2
  17. package/gateway/ai/led193_daemon/executor.py +9 -0
  18. package/gateway/ai/ledger_manager.py +90 -4
  19. package/gateway/ai/ledger_proof.py +127 -0
  20. package/gateway/ai/license.py +132 -47
  21. package/gateway/ai/license_core.cpython-310-x86_64-linux-gnu.so +0 -0
  22. package/gateway/ai/license_core.pyi +1 -1
  23. package/gateway/ai/notify.py +39 -0
  24. package/gateway/ai/outreach_loop_daemon.py +349 -0
  25. package/gateway/ai/outreach_substantive.py +1437 -0
  26. package/gateway/ai/pro_tools.yaml +167 -0
  27. package/gateway/ai/reaper.py +70 -0
  28. package/gateway/ai/reddit_scanner.py +17 -6
  29. package/gateway/ai/sensing/schema.py +1 -1
  30. package/gateway/ai/sensing/signal_store.py +0 -1
  31. package/gateway/ai/server.py +5490 -1602
  32. package/gateway/ai/social_capability/fit_floor.py +114 -12
  33. package/gateway/ai/social_queue.py +166 -10
  34. package/gateway/ai/tdqs_lint.py +611 -0
  35. package/gateway/ai/tenant_auth.py +329 -0
  36. package/gateway/ai/tenant_data.py +339 -0
  37. package/gateway/ai/tenant_paths.py +150 -0
  38. package/gateway/ai/usage_allowlist.py +198 -0
  39. package/gateway/ai/workers/base.py +2 -2
  40. package/gateway/ai/workers/executor.py +32 -3
  41. package/gateway/ai/workers/outreach_drafter.py +0 -1
  42. package/gateway/ai/workers/pr_drafter.py +0 -1
  43. package/gateway/ai/x_ranker.py +12 -2
  44. package/gateway/core/json_schema_diff.py +25 -1
  45. package/lib/auth-signin.js +136 -0
  46. package/lib/auth-signout.js +169 -0
  47. package/lib/delimit-template.js +11 -0
  48. package/lib/migration-2092-banner.js +213 -0
  49. package/package.json +5 -2
  50. package/server.json +4 -4
  51. package/scripts/build-license-core.sh +0 -85
  52. package/scripts/security-check.sh +0 -66
  53. package/scripts/test-license-core-so.sh +0 -107
@@ -0,0 +1,1437 @@
1
+ """Substantive-outreach payload, gate, and dispatch (LED-2214b).
2
+
3
+ Implements the autonomous-github-outreach architecture ratified by the
4
+ 2026-05-11 deliberation (A1 + Codex payload amendment, B3 + Claude reg-O
5
+ target-side veto, C1 single-responsibility daemon). Transcript stored
6
+ privately.
7
+
8
+ The three SHIFT-1 holes this module closes:
9
+
10
+ * **Empty-payload dispatch** — the old generic ``outreach`` task type
11
+ could be dispatched on a bare "engage user" target with no evidence
12
+ anchor. Twenty-nine LEDs (LED-915–965) had to be bulk-cancelled in
13
+ 2026-05 because of this class of failure. The dataclass enforces
14
+ required evidence fields at construction time, so empty-payload
15
+ dispatch is structurally impossible.
16
+ * **Reg-O / banking veto** — a perfectly substantive bug report on a
17
+ banking-fintech repo still violates SHIFT-1 (KYC would deanonymize
18
+ the operating account). ``is_banking_adjacent`` runs at both the scanner layer
19
+ (impossible-by-construction) and the submit-time gate (defense in
20
+ depth) so a regulator-adjacent target never reaches dispatch and
21
+ never reaches submission.
22
+ * **Covert commercial outreach** — even with a substantive technical
23
+ anchor, the agent might leak "btw try delimit-cli". The content gate
24
+ rejects forbidden phrases including our own product names, and
25
+ requires at least one concrete technical anchor (commit hash, spec
26
+ path, issue number, or CVE) before allowing submission.
27
+
28
+ Public surface:
29
+
30
+ * :class:`SubstantiveCandidate` — typed payload schema for dispatch.
31
+ * :func:`is_banking_adjacent` — reg-O / fintech / banking classifier.
32
+ * :func:`extract_technical_anchors` — anchor extraction for content gate.
33
+ * :func:`check_substantive_content` — content-shape gate.
34
+ * :func:`evaluate_substantive_payload` — composite gate (target then content).
35
+ * :func:`build_candidate_from_github_target` — scanner-level constructor.
36
+ * :func:`dispatch_substantive_outreach` — wraps :func:`dispatch_task`
37
+ with task_type='outreach_substantive' and the typed payload.
38
+
39
+ Not part of this module: the daemon (:mod:`ai.outreach_loop_daemon`)
40
+ that ticks scanner → file ledger → dispatch.
41
+ """
42
+
43
+ from __future__ import annotations
44
+
45
+ import json as _json
46
+ import logging
47
+ import os as _os
48
+ import re
49
+ import subprocess as _subprocess
50
+ import time as _time
51
+ from dataclasses import asdict, dataclass, field
52
+ from pathlib import Path as _Path
53
+ from typing import Any, Dict, List, Optional, Tuple
54
+
55
+ logger = logging.getLogger("delimit.ai.outreach_substantive")
56
+
57
+
58
+ # ---------------------------------------------------------------------------
59
+ # LED-2266: env-configurable thresholds for the outreach gate stack.
60
+ #
61
+ # Each defense layer has a default value chosen during initial deployment
62
+ # (PR #179 anti-spam, PR #180 engagement-floor). Operators can tune any
63
+ # of them via env var without code changes — useful for trying tighter
64
+ # thresholds on a new venture, or loosening when scanner yield is low.
65
+ #
66
+ # Defaults are conservative: they reproduce the PR-as-shipped behavior
67
+ # when no env var is set. The lookup helpers below are the single source
68
+ # of truth — module constants below resolve through them at import time
69
+ # so each threshold is documented in one place.
70
+ # ---------------------------------------------------------------------------
71
+
72
+
73
def _env_int(name: str, default: int, minimum: int = 0) -> int:
    """Resolve an integer threshold from the environment.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset, blank,
            not an integer, or below ``minimum``.
        minimum: Lowest accepted override; anything smaller is rejected
            so an operator cannot silently disable a defense with a
            zero or negative value.

    Returns:
        The parsed override when it is valid, otherwise ``default``.

    A WARNING is logged whenever an override is rejected, and whenever
    an accepted override differs from ``default``.
    """
    text = _os.environ.get(name, "").strip()
    if text == "":
        return default

    try:
        parsed = int(text)
    except ValueError:
        parsed = None

    if parsed is None:
        logger.warning(
            "config: %s=%r is not an integer — using default=%d", name, text, default,
        )
        return default

    if parsed < minimum:
        logger.warning(
            "config: %s=%d below floor %d — using default=%d",
            name, parsed, minimum, default,
        )
        return default

    if parsed == default:
        # Explicitly set to the default: nothing worth reporting.
        return parsed

    logger.warning("config: %s overridden default=%d -> %d", name, default, parsed)
    return parsed
100
+
101
+
102
+ # ---------------------------------------------------------------------------
103
+ # Constants — keep these auditable. Edits require panel deliberation per
104
+ # the CLAUDE.md SHIFT-1 constitutional binding.
105
+ # ---------------------------------------------------------------------------
106
+
107
# The only actions a substantive candidate may propose. Validated both by
# SubstantiveCandidate.__post_init__ and by check_substantive_content.
PROPOSED_ACTIONS = ("comment", "issue", "pr")

# CLAUDE.md SHIFT-1 HARD VETO. KYC will deanonymize the operating account
# on any of these target classes regardless of brand cover, so the target
# never enters the dispatch queue. Keyword match runs over the repo name +
# description + topics; any hit blocks the target.
#
# Conservative by design — false positives cost zero (we just don't
# engage), false negatives risk constitutional violation.
#
# Entries are lowercase because is_banking_adjacent compares them against
# a lowercased haystack with a plain ``kw in haystack`` substring test.
# NOTE(review): because the test is substring-based, the trailing-space
# entries only guard the right-hand edge — "ach " also hits "each " and
# "approach ", and "swift " hits prose about the Swift language. Confirm
# that this level of over-blocking is intended.
BANKING_ADJACENT_KEYWORDS: Tuple[str, ...] = (
    # Direct
    "bank", "banking", "credit-union", "credit union",
    # Brokerage / capital markets
    "broker", "brokerage", "securities", "custodian", "custody",
    "clearinghouse", "clearing-house", "settlement",
    # Payments / cards
    "payment", "payments", "card-issuer", "card issuer", "issuer-processor",
    "acquirer", "merchant-acquirer", "interchange", "ach ", "swift ",
    # Lending
    "lender", "lending", "mortgage", "underwriting", "underwrite",
    # Insurance (reg-adjacent under McCarran-Ferguson)
    "insurance", "insurer", "reinsurer", "underwriter",
    # Crypto-fiat onramps (FinCEN-regulated MSBs)
    "msb", "money-services-business", "money services business",
    "onramp", "off-ramp", "fiat-onramp",
    # Wealth / advisors (RIA / IAR regulated)
    "wealth-management", "wealth management", "registered investment",
    "ria-firm", "broker-dealer", "broker dealer",
    # Compliance / AML / KYC vendors (likely reg-O downstream)
    "aml-platform", "kyc-platform", "kyc-provider", "kyc provider",
    "bsa-aml", "sanctions-screening", "ofac-screening",
    # Regulator-adjacent
    "regulator", "regulatory-reporting", "fr-y-9c", "call-report",
    "fdic", "occ-supervised", "frb-supervised", "finra", "sec-registered",
    # Reg-O specifically
    "reg-o", "regulation-o", "regulation o", "regulation-w",
    # Stablecoins / fintech with clear bank rails
    "stablecoin", "neobank", "challenger-bank", "core-banking",
    "core banking", "ledger-banking", "open-banking",
)
147
+
148
# Self-references and commercial phrasing the agent must never emit on
# a third-party repo. Per panel verdict + Codex amendment, we ban our
# own product names too — substantive contributions stand on technical
# merit alone, not on naming the upstream tool.
#
# Matching is case-insensitive: the content gate lowercases the body
# before comparing, so every entry here must be lowercase. Word-boundary
# handling matters for the product names (e.g. "delimit" must not flag
# "delimited" or "delimiter"); those live in FORBIDDEN_PRODUCT_TOKENS
# and are matched by _hits_forbidden_product_token with ``\b`` anchors.
FORBIDDEN_PHRASES: Tuple[str, ...] = (
    # Commercial framing
    "we built", "we made", "we created", "we developed", "we ship",
    "our tool", "our product", "our cli", "our service", "our platform",
    "you should try", "you might try", "you may want to try",
    "you could try", "give it a try", "give us a try",
    "check out our", "have a look at our", "take a look at our",
    "btw try", "btw, try", "by the way try",
    # Generic non-substantive
    "thanks for the project", "great project", "love the project",
    "interesting project",
)

# Word-boundary product names. Ban "delimit" and "delimit-cli" as
# standalone tokens; don't false-positive on "delimited" or "delimiter".
# NOTE(review): "-" and "." are non-word characters, so ``\bdelimit\b``
# already matches inside "delimit-cli" and "delimit.ai"; it also matches
# the ordinary English verb ("characters that delimit fields"). Confirm
# that blocking the verb is acceptable.
FORBIDDEN_PRODUCT_TOKENS: Tuple[str, ...] = (
    "delimit", "delimit-cli", "delimit.ai", "delimitdev",
)

# Minimum content length below which a body cannot be substantive
# regardless of anchors. Calibrated to "two-sentence bug report".
# Compared against len(body), i.e. characters, not words.
MIN_BODY_LENGTH = 200
177
+
178
# Patterns for technical-anchor extraction. At least one must hit.
#
# Commit hash: a standalone run of 7-40 hex characters. The two
# lookaheads require at least one hex LETTER and at least one DIGIT in
# the run. Without them any long decimal number (phone number, date
# stamp such as 20260511, numeric id) and any all-letter hex word
# ("defaced", "effaced") counted as a commit hash, which let bodies with
# no real anchor satisfy the substantive-content gate. The trade-off is
# that the rare abbreviated hash made only of digits or only of letters
# is no longer recognised; citing the full 40-character hash avoids it.
_COMMIT_HASH_RE = re.compile(
    r"\b(?=[0-9a-f]*[a-f])(?=[0-9a-f]*[0-9])[0-9a-f]{7,40}\b",
    re.IGNORECASE,
)
# Issue / PR reference such as "#1234" (1-7 digits).
_ISSUE_REF_RE = re.compile(r"#\d{1,7}\b")
# CVE identifier such as "CVE-2024-12345".
_CVE_RE = re.compile(r"\bCVE-\d{4}-\d{4,7}\b", re.IGNORECASE)
# API spec file whose name starts with openapi / swagger / asyncapi and
# ends in .yaml / .yml / .json, optionally preceded by a directory path.
# The match includes the single leading whitespace or backtick character
# (when present); callers strip it.
_SPEC_PATH_RE = re.compile(
    r"(?:^|[\s`])(?:[A-Za-z0-9_\-/\.]+/)?(?:openapi|swagger|asyncapi)"
    r"[\w\-/]*\.(?:ya?ml|json)\b",
    re.IGNORECASE,
)
# Any path-like token ending in a recognised source / config extension.
# Case-sensitive, so "README.MD" does not match. As above, the match
# includes the leading delimiter character.
_FILE_PATH_RE = re.compile(
    r"(?:^|[\s`])[A-Za-z0-9_\-/.]+\.(?:py|ts|tsx|js|jsx|go|rs|java|"
    r"rb|c|cc|cpp|h|md|ya?ml|json|toml|proto)\b"
)
191
+
192
+
193
+ # ---------------------------------------------------------------------------
194
+ # Payload schema
195
+ # ---------------------------------------------------------------------------
196
+
197
+
198
@dataclass(frozen=True)
class SubstantiveCandidate:
    """Typed dispatch payload for substantive github outreach.

    The dataclass is ``frozen=True`` (immutable) and ``__post_init__``
    validates every required field, so a partially populated
    ``SubstantiveCandidate`` cannot be constructed — construction either
    succeeds with a complete payload or raises ``ValueError``.

    Fields:
        repo: ``owner/name`` of the target repository. Required; both
            the owner and the name part must be non-empty.
        category: One of ``pain_thread``, ``adoption_lead``,
            ``competitor_user``, ``own_repo_activity``. Required.
        target_artifact: Canonical URL of the artifact we'd act on
            (the issue, the PR, the repo root, etc.). Required.
        evidence_refs: Non-empty collection of concrete technical
            anchors extracted from the target — issue numbers, commit
            hashes, spec paths, CVE IDs. Any iterable is accepted and
            stored as a tuple; a single string is treated as one
            reference. Empty collections and blank entries raise.
        proposed_action: One of ``comment``, ``issue``, ``pr``.
        subcategory: Optional finer-grained label (e.g.
            ``openapi_spec``). Allowed to be empty.
        venture: Sourcing venture (e.g. ``delimit``). Default ``delimit``.
        fingerprint: Scanner fingerprint for idempotency. Optional.

    Raises:
        ValueError: If any required field is missing or invalid.
    """

    repo: str
    category: str
    target_artifact: str
    evidence_refs: Tuple[str, ...]
    proposed_action: str
    subcategory: str = ""
    venture: str = "delimit"
    fingerprint: str = ""

    def __post_init__(self):
        # Normalise BEFORE validating. Frozen dataclasses do not coerce,
        # so we go through object.__setattr__. Doing this first matters:
        # a generator is always truthy, so validating the raw value would
        # let an empty generator through, and a bare string would later
        # be exploded into single characters by tuple().
        refs = self.evidence_refs
        if refs is None:
            refs = ()
        elif isinstance(refs, str):
            refs = (refs,)
        elif not isinstance(refs, tuple):
            refs = tuple(refs)
        object.__setattr__(self, "evidence_refs", refs)

        owner, sep, name = (
            self.repo.partition("/") if isinstance(self.repo, str) else ("", "", "")
        )
        if not sep or not owner.strip() or not name.strip():
            raise ValueError(
                f"SubstantiveCandidate.repo must be 'owner/name', got {self.repo!r}"
            )
        if self.category not in {
            "pain_thread", "adoption_lead", "competitor_user", "own_repo_activity",
        }:
            raise ValueError(
                f"SubstantiveCandidate.category invalid: {self.category!r}"
            )
        if not self.target_artifact:
            raise ValueError("SubstantiveCandidate.target_artifact is required")
        if not refs:
            raise ValueError(
                "SubstantiveCandidate.evidence_refs cannot be empty — "
                "empty-payload dispatch is structurally forbidden (LED-2214b)"
            )
        # A collection holding only "" is still an empty payload in
        # practice, so blank entries are rejected as well.
        if any(ref is None or not str(ref).strip() for ref in refs):
            raise ValueError(
                "SubstantiveCandidate.evidence_refs contains a blank entry — "
                "empty-payload dispatch is structurally forbidden (LED-2214b)"
            )
        if self.proposed_action not in PROPOSED_ACTIONS:
            raise ValueError(
                f"SubstantiveCandidate.proposed_action must be one of "
                f"{PROPOSED_ACTIONS}, got {self.proposed_action!r}"
            )

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy with ``evidence_refs`` as a list."""
        d = asdict(self)
        d["evidence_refs"] = list(self.evidence_refs)
        return d
268
+
269
+
270
+ # ---------------------------------------------------------------------------
271
+ # Reg-O / banking target-side veto
272
+ # ---------------------------------------------------------------------------
273
+
274
+
275
def is_banking_adjacent(target: Dict[str, Any]) -> Tuple[bool, str]:
    """Return ``(is_adjacent, matched)`` for a scanner target.

    Three passes run in order; the first hit wins and ``matched`` names
    what fired so the logged reason is actionable:

    1. **Keyword pass** — every ``BANKING_ADJACENT_KEYWORDS`` entry is
       tested as a case-insensitive substring of the concatenated
       ``canonical_url``, ``rationale``, ``content_snippet``,
       ``repo_topics``, ``repo_description``, ``repo`` and ``source_id``
       values. ``matched`` is the keyword itself.
    2. **Known-org pass** — the owner of the target (``author``, the
       owner part of ``repo``, the first path segment of a github
       ``canonical_url``) is compared, lowercased, for exact equality
       with ``_KNOWN_REGULATED_ORGS``. Names such as ``jpmorganchase``
       or ``goldmansachs`` contain none of the keywords above and carry
       no digits for the typo-squat pass, so without this step the
       genuine regulated org would pass the veto. ``matched`` is
       ``regulated_org:<owner>``.
    3. **Typo-squat pass (LED-2265)** — delegates to
       ``_is_typosquat_impersonation`` for digit-for-letter
       impersonation of a regulated entity. ``matched`` is
       ``typosquat:<org>``.

    Callers should treat any True return as a hard veto — no override
    path exists at the scanner layer, by design.

    Args:
        target: Scanner target dict. Missing keys are ignored; list
            values contribute each element to the keyword haystack.

    Returns:
        ``(True, matched)`` on a veto, otherwise ``(False, "")``.
    """
    haystack_parts: List[str] = []
    for key in (
        "canonical_url", "rationale", "content_snippet",
        "repo_topics", "repo_description", "repo", "source_id",
    ):
        value = target.get(key)
        if isinstance(value, list):
            haystack_parts.extend(str(v) for v in value)
        elif value is not None:
            haystack_parts.append(str(value))
    haystack = " ".join(haystack_parts).lower()
    for kw in BANKING_ADJACENT_KEYWORDS:
        if kw in haystack:
            return True, kw

    # Known-org pass: exact owner match only, so an unrelated name that
    # merely contains a short entry ("ing", "visa") is not vetoed here.
    owners: List[str] = []
    author = target.get("author")
    if isinstance(author, str):
        owners.append(author)
    repo_value = target.get("repo")
    for repo in repo_value if isinstance(repo_value, list) else [repo_value]:
        if isinstance(repo, str) and "/" in repo:
            owners.append(repo.split("/", 1)[0])
    url = target.get("canonical_url")
    if isinstance(url, str):
        m = re.match(r"^https?://github\.com/([^/?#]+)", url)
        if m:
            owners.append(m.group(1))
    for owner in owners:
        owner_key = owner.strip().lower()
        if owner_key and owner_key in _KNOWN_REGULATED_ORGS:
            return True, f"regulated_org:{owner_key}"

    # LED-2265: typo-squat impersonation of known regulated orgs.
    typosquat = _is_typosquat_impersonation(target)
    if typosquat:
        return True, f"typosquat:{typosquat}"

    return False, ""
319
+
320
+
321
# LED-2265: known-regulated-entity org names. Used by the typo-squat
# impersonation check below. Names are lowercased. Conservative list —
# false positives cost zero (we just don't engage), false negatives risk
# substantive engagement with a malicious impersonator.
#
# The typo-squat check compares against an alphanumeric-only form of the
# candidate, so the hyphenated spellings here ("credit-suisse",
# "interactive-brokers") can never match there; their unhyphenated twins
# do that job.
# NOTE(review): matching is by substring, so very short entries ("ing",
# "occ", "pnc", "ubs") fire on any digit-bearing name that happens to
# contain them (e.g. "logging2"). Confirm this breadth is intended.
_KNOWN_REGULATED_ORGS: Tuple[str, ...] = (
    # Tier-1 US banks
    "jpmorgan", "jpmorganchase", "chase", "goldman", "goldmansachs",
    "morganstanley", "citi", "citigroup", "citibank",
    "bankofamerica", "bofa", "wellsfargo", "usbank", "pnc", "truist",
    "capitalone",
    # Foreign G-SIBs
    "hsbc", "barclays", "deutschebank", "credit-suisse", "creditsuisse",
    "ubs", "santander", "bnpparibas", "societegenerale", "ing", "lloyds",
    # US clearing / capital markets
    "blackrock", "vanguard", "fidelity", "schwab", "interactive-brokers",
    "interactivebrokers", "nyse", "nasdaq",
    # Crypto / fintech with bank rails
    "coinbase", "kraken", "circle", "tether", "binance",
    # Card networks
    "visa", "mastercard", "amex", "americanexpress",
    # Regulators
    "fdic", "occ", "frb", "federalreserve", "finra", "secgov",
)


# LED-2265: simple homoglyph map for digit-for-letter substitutions.
# Keys are digits commonly used as letter substitutes; values are the
# letter they impersonate. Asymmetric on purpose (we transform a
# candidate username INTO a likely impersonated name, then compare).
# "1" is ambiguous (it can stand in for "i" or "l"); this table holds the
# "i" reading and _is_typosquat_impersonation additionally tries "l".
_HOMOGLYPH_DIGITS: Dict[str, str] = {
    "0": "o", "1": "i", "3": "e", "4": "a", "5": "s", "7": "t",
}


def _normalize_for_typosquat(name: str) -> str:
    """Reduce a name to the form compared against ``_KNOWN_REGULATED_ORGS``.

    Steps: lowercase, drop every character that is not ``a-z`` / ``0-9``,
    then replace each digit found in ``_HOMOGLYPH_DIGITS`` with the
    letter it impersonates. Digits without a mapping are kept.

    Example: ``JPM0RCHASE`` → ``jpm0rchase`` (lowercased) →
    ``jpmorchase`` (``0`` read as ``o``).
    """
    alphanum = re.sub(r"[^a-z0-9]", "", name.lower())
    return "".join(_HOMOGLYPH_DIGITS.get(c, c) for c in alphanum)


def _is_typosquat_impersonation(target: Dict[str, Any]) -> str:
    """Return the impersonated org name, or ``""`` if none is suspected.

    Candidate names are collected from ``author``, from the owner and
    repo segments of a github ``canonical_url``, and from the owner and
    repo segments of a ``github:<kind>:owner/name`` ``fingerprint``.
    Only candidates containing a digit are examined; each is normalised
    and reported if any ``_KNOWN_REGULATED_ORGS`` entry appears in it as
    a substring.

    The digit ``1`` is tried under both readings: first as ``i`` (the
    ``_HOMOGLYPH_DIGITS`` mapping, so ``c1t1group`` is caught), then as
    ``l`` (so ``go1dman`` and ``we11sfargo`` are caught). The ``i``
    reading is checked against the whole list first so results for names
    it already matched are unchanged.

    Args:
        target: Scanner target dict; missing or non-string fields are
            ignored.

    Returns:
        The first matching entry of ``_KNOWN_REGULATED_ORGS``, else ``""``.
    """
    # Collect the candidate name parts: author (github username) and the
    # owner/name segments of the canonical_url and fingerprint.
    candidates: List[str] = []
    author = target.get("author") or ""
    if isinstance(author, str) and author:
        candidates.append(author)
    url = target.get("canonical_url") or ""
    if isinstance(url, str) and url:
        m = re.match(r"^https?://github\.com/([^/]+)/([^/?#]+)", url)
        if m:
            candidates.append(m.group(1))  # org/user
            candidates.append(m.group(2))  # repo name
    fp = target.get("fingerprint") or ""
    if isinstance(fp, str) and fp:
        m = re.match(r"^github:[^:]+:([^/:]+)(?:/([^:]+))?", fp)
        if m:
            candidates.append(m.group(1))
            if m.group(2):
                candidates.append(m.group(2))

    for cand in candidates:
        # Only digit-bearing candidates can be homoglyph typosquats; a
        # pure-letter name gives no positive evidence of impersonation
        # intent, so it is left to the other veto passes.
        if not any(c.isdigit() for c in cand):
            continue
        readings = (
            _normalize_for_typosquat(cand),                     # "1" -> "i"
            _normalize_for_typosquat(cand.replace("1", "l")),   # "1" -> "l"
        )
        for normalized in readings:
            if not normalized:
                continue
            for org in _KNOWN_REGULATED_ORGS:
                if org in normalized:
                    return org

    return ""
414
+
415
+
416
+ # ---------------------------------------------------------------------------
417
+ # Technical-anchor extraction + content gate
418
+ # ---------------------------------------------------------------------------
419
+
420
+
421
def extract_technical_anchors(text: str) -> Dict[str, List[str]]:
    """Extract all technical anchors found in ``text``.

    Args:
        text: Draft body or target text to scan. Empty / falsy input
            yields empty lists for every key.

    Returns:
        Dict with keys ``commits``, ``issues``, ``cves``, ``spec_paths``
        and ``file_paths``, each a list of matched strings in order of
        appearance (duplicates preserved). An empty list means nothing
        of that type was found; a non-empty list under any key is
        sufficient to satisfy the substantive-content gate.

    Spec paths are matched explicitly (openapi/swagger/asyncapi) and
    are also captured by the broader file-path regex, but the spec
    list is the load-bearing signal for adoption-lead targets.
    """
    if not text:
        return {"commits": [], "issues": [], "cves": [], "spec_paths": [], "file_paths": []}

    def _clean(match: str) -> str:
        # The path regexes include the single delimiter that precedes the
        # path: any whitespace character or a backtick. Stripping only
        # "` " left a leading newline/tab on paths that start a line.
        return match.strip().strip("`")

    return {
        "commits": _COMMIT_HASH_RE.findall(text),
        "issues": _ISSUE_REF_RE.findall(text),
        "cves": _CVE_RE.findall(text),
        "spec_paths": [_clean(m) for m in _SPEC_PATH_RE.findall(text)],
        "file_paths": [_clean(m) for m in _FILE_PATH_RE.findall(text)],
    }
442
+
443
+
444
def _hits_forbidden_product_token(text_lower: str) -> Optional[str]:
    """Find the first banned product name that occurs as a whole word.

    Tokens are tried in ``FORBIDDEN_PRODUCT_TOKENS`` order, each wrapped
    in ``\\b`` anchors so that longer words merely containing a token do
    not count. The caller is expected to pass already-lowercased text.

    Returns:
        The matching token, or ``None`` when no token is present.
    """
    matches = (
        candidate
        for candidate in FORBIDDEN_PRODUCT_TOKENS
        if re.search(r"\b" + re.escape(candidate) + r"\b", text_lower)
    )
    return next(matches, None)
451
+
452
+
453
def check_substantive_content(
    body: str,
    proposed_action: str,
) -> Dict[str, Any]:
    """Validate a draft body against the SHIFT-1 content rules.

    Order of checks (load-bearing — do not reorder without panel
    deliberation):

    0. Hard stops — a non-string / blank body or an unknown
       ``proposed_action`` returns immediately.
    1. Length floor — bodies shorter than ``MIN_BODY_LENGTH`` block.
    2. Forbidden product tokens — bans our own names (defends against
       "btw try delimit-cli" class).
    3. Forbidden commercial phrases — bans the broader "we built /
       our tool / you should try" class. A phrase only counts when it
       starts at a word boundary, so "your tool" or "your service" in an
       ordinary bug report is not mistaken for "our tool" / "our
       service". Suffixes still match ("our tools", "we shipped").
    4. Technical anchor — must have at least one commit hash, issue
       ref, CVE, spec path, or file path. Without an anchor the body
       is "thanks for the project" by definition.

    Checks 1-4 all run, so ``violations`` lists every problem found.

    The function does NOT enforce target-side reg-O veto — that lives
    at :func:`is_banking_adjacent`, called separately by
    :func:`evaluate_substantive_payload`. Splitting them keeps the
    failure modes distinguishable in logs and ledger entries.

    Args:
        body: Draft text that would be submitted.
        proposed_action: Must be one of ``PROPOSED_ACTIONS``.

    Returns:
        Dict with keys ``verdict`` (``"allow"`` | ``"block"``),
        ``reason`` (``"ok"`` or the slug of the first violation),
        ``violations`` (list of ``"slug: detail"`` strings), ``anchors``
        (the extracted-anchors dict; empty on a hard stop).
    """
    if not isinstance(body, str) or not body.strip():
        return {
            "verdict": "block",
            "reason": "empty_body",
            "violations": ["body is empty"],
            "anchors": {},
        }
    if proposed_action not in PROPOSED_ACTIONS:
        return {
            "verdict": "block",
            "reason": "invalid_proposed_action",
            "violations": [f"proposed_action must be one of {PROPOSED_ACTIONS}"],
            "anchors": {},
        }

    violations: List[str] = []
    if len(body) < MIN_BODY_LENGTH:
        # Carries a "slug:" prefix like every other violation so that
        # ``reason`` below is a stable code instead of free text that
        # changes with the body length.
        violations.append(
            f"body_too_short: body length {len(body)} < "
            f"MIN_BODY_LENGTH={MIN_BODY_LENGTH}"
        )

    body_lower = body.lower()
    product_hit = _hits_forbidden_product_token(body_lower)
    if product_hit:
        violations.append(f"forbidden_product_token: {product_hit!r}")
    for phrase in FORBIDDEN_PHRASES:
        # (?<!\w): the phrase must not be the tail of a longer word.
        if re.search(r"(?<!\w)" + re.escape(phrase), body_lower):
            violations.append(f"forbidden_phrase: {phrase!r}")

    anchors = extract_technical_anchors(body)
    has_anchor = any(anchors.values())
    if not has_anchor:
        violations.append(
            "no_technical_anchor: body must cite a commit hash, "
            "issue number, CVE, spec path, or source file path"
        )

    if violations:
        return {
            "verdict": "block",
            "reason": violations[0].split(":")[0],
            "violations": violations,
            "anchors": anchors,
        }
    return {
        "verdict": "allow",
        "reason": "ok",
        "violations": [],
        "anchors": anchors,
    }
530
+
531
+
532
+ # ---------------------------------------------------------------------------
533
+ # Composite gate: target-side veto BEFORE content
534
+ # ---------------------------------------------------------------------------
535
+
536
+
537
def _merge_scan_values(existing: Any, extra: Any) -> Any:
    """Union two scan-field values, keeping order and dropping duplicates.

    Returns a single value when only one distinct value remains,
    otherwise a list (which the target-side keyword scan reads element
    by element).
    """
    def _as_list(value: Any) -> List[Any]:
        if value is None or value == "":
            return []
        if isinstance(value, (list, tuple, set, frozenset)):
            return list(value)
        return [value]

    combined = _as_list(existing)
    for item in _as_list(extra):
        if item not in combined:
            combined.append(item)
    return combined[0] if len(combined) == 1 else combined


def evaluate_substantive_payload(
    body: str,
    proposed_action: str,
    target: Optional[Dict[str, Any]] = None,
    repo: str = "",
    repo_description: str = "",
    repo_topics: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Full pre-submit gate: reg-O target veto, then content shape.

    Per the 2026-05-11 panel verdict + Claude's reg-O target-side veto
    amendment: target classification is checked FIRST. A perfectly
    substantive bug report on a banking-adjacent repo still violates
    SHIFT-1, so the gate refuses regardless of content quality.

    Callers can pass either:
      * a full ``target`` dict (forwarded to :func:`is_banking_adjacent`),
      * or the discrete ``repo`` / ``repo_description`` / ``repo_topics``
        fields, which we wrap in a synthetic target,
      * or both, in which case the discrete fields are ADDED to the
        target's own values. They never replace them: replacing would
        let a clean-looking argument hide a banking-adjacent description
        or topic already present on ``target``.

    The caller's ``target`` dict is never mutated.

    Returns:
        Dict with ``verdict``, ``reason``, ``violations``, ``anchors``,
        and ``stage`` (``"target"`` or ``"content"``) indicating where
        the gate fired.
    """
    if target is None:
        target = {
            "repo": repo,
            "repo_description": repo_description,
            "repo_topics": repo_topics or [],
        }
    elif repo or repo_description or repo_topics:
        # Caller passed both — merge, keyword scan looks at union.
        merged = dict(target)
        for key, extra in (
            ("repo", repo),
            ("repo_description", repo_description),
            ("repo_topics", repo_topics),
        ):
            if extra:
                merged[key] = _merge_scan_values(target.get(key), extra)
        target = merged

    adjacent, matched = is_banking_adjacent(target)
    if adjacent:
        return {
            "verdict": "block",
            "reason": "banking_adjacent_target",
            "violations": [f"banking_adjacent_target: matched keyword {matched!r}"],
            "anchors": {},
            "stage": "target",
        }

    content_result = check_substantive_content(body, proposed_action)
    content_result["stage"] = "content"
    return content_result
590
+
591
+
592
+ # ---------------------------------------------------------------------------
593
+ # Scanner-level constructor
594
+ # ---------------------------------------------------------------------------
595
+
596
+
597
# ``github:<kind>:owner/name[:...]`` scanner fingerprint; group 1 is
# ``owner/name``.
_FINGERPRINT_REPO_RE = re.compile(
    r"^github:(?:issue|repo|fork|star|outreach):([^:]+/[^:]+)(?::|$)"
)
# ``https://github.com/owner/name[/...]`` URL; group 1 is ``owner/name``.
_URL_REPO_RE = re.compile(
    r"^https?://github\.com/([^/]+/[^/]+?)(?:/|$|#|\?)"
)


def _repo_from_target(target: Dict[str, Any]) -> str:
    """Derive ``owner/name`` for a scanner target.

    Sources are tried in order: an explicit ``repo`` value containing
    ``/``, then the ``fingerprint``, then the ``canonical_url``.

    Fields that are missing, ``None`` or not strings are skipped. The
    previous ``target.get(key, "")`` form only covered a *missing* key,
    so a target carrying ``"fingerprint": None`` raised ``TypeError``
    inside ``re.match``.

    Returns:
        The ``owner/name`` string, or ``""`` when none can be derived.
    """
    repo = target.get("repo")
    if isinstance(repo, str):
        repo = repo.strip()
        if "/" in repo:
            return repo

    for key, pattern in (
        ("fingerprint", _FINGERPRINT_REPO_RE),
        ("canonical_url", _URL_REPO_RE),
    ):
        value = target.get(key)
        if not isinstance(value, str):
            continue
        m = pattern.match(value)
        if m:
            return m.group(1)
    return ""
618
+
619
+
620
# Maps each candidate category to an action name. The keys are the four
# categories SubstantiveCandidate accepts and every value is a member of
# PROPOSED_ACTIONS. The consumer is outside this view; presumably it
# supplies the default proposed_action when a candidate is built from a
# scanner target — TODO confirm.
_CATEGORY_TO_ACTION = {
    "pain_thread": "comment",
    "adoption_lead": "issue",
    "competitor_user": "comment",
    "own_repo_activity": "comment",
}
626
+
627
+
628
+ # ---------------------------------------------------------------------------
629
+ # Issue-body fetch + cache (LED-2214b followup)
630
+ #
631
+ # The scanner truncates issue bodies to 200 chars before they reach the
632
+ # substantive gate (see ai/social_target.py:_scan_github phase 2). 200
633
+ # chars covers the title + opening summary but almost always strips the
634
+ # tail where anchors live — stack traces, file paths in error messages,
635
+ # references to other issues/commits. Result: every issue target gets
636
+ # rejected as no-anchor even when the issue body is anchor-rich.
637
+ #
638
+ # This block fetches the FULL issue body + first N comments via gh CLI
639
+ # when the snippet-derived extraction comes up empty. Per-issue 7-day
640
+ # disk cache; daily tick at max_dispatch=3 means worst-case ~3 API calls
641
+ # per day after cache warms.
642
+ # ---------------------------------------------------------------------------
643
+
644
# Directory holding one JSON cache file per fetched issue, under the
# current user's home directory (resolved once, at import time).
_ISSUE_BODY_CACHE_DIR = _Path.home() / ".delimit" / "cache" / "outreach_issue_bodies"
# LED-2266: env-overridable via DELIMIT_OUTREACH_ISSUE_BODY_CACHE_TTL_S.
# Default 7 days. Minimum 60s (don't disable caching outright; would
# spam the github api on every tick). Also resolved at import time, so a
# changed env var only takes effect when the module is next imported.
_ISSUE_BODY_CACHE_TTL_S = _env_int(
    "DELIMIT_OUTREACH_ISSUE_BODY_CACHE_TTL_S", 7 * 24 * 3600, minimum=60,
)
# Presumably the "first N comments" fetched alongside the issue body —
# the fetch code is outside this view; TODO confirm.
_ISSUE_COMMENTS_FETCH_LIMIT = 5
# Presumably the timeout, in seconds, for each gh CLI call — TODO confirm.
_GH_API_TIMEOUT_S = 30
# Exact ``github:issue:owner/name:N`` fingerprint; group 1 is the repo,
# group 2 the issue number. Anchored at both ends, so fingerprints of any
# other kind or with trailing segments do not match.
_ISSUE_FP_RE = re.compile(r"^github:issue:([^/:]+/[^/:]+):(\d+)$")
654
+
655
+
656
def _issue_fp_parts(fingerprint: str) -> Optional[Tuple[str, int]]:
    """Split a ``github:issue:owner/name:N`` fingerprint into its parts.

    Returns:
        ``(repo, issue_number)`` for an issue fingerprint. ``None`` for
        anything else (including empty / ``None`` input), which callers
        can read as "no issue body to fetch".
    """
    match = _ISSUE_FP_RE.match(fingerprint if fingerprint else "")
    if match is None:
        return None
    repo, number_text = match.group(1), match.group(2)
    try:
        number = int(number_text)
    except (TypeError, ValueError):
        return None
    return repo, number
669
+
670
+
671
def _issue_cache_path(repo: str, number: int) -> _Path:
    """Return the cache file location for one issue.

    Every ``/`` in ``repo`` becomes ``__`` so the repository name turns
    into a single flat file name inside ``_ISSUE_BODY_CACHE_DIR``.
    """
    flattened = "__".join(repo.split("/"))
    filename = f"{flattened}_{number}.json"
    return _ISSUE_BODY_CACHE_DIR / filename
674
+
675
+
676
def _read_cached_issue_body(repo: str, number: int) -> Optional[str]:
    """Return the cached full text for an issue, or ``None``.

    ``None`` is returned when the cache file is missing or unreadable,
    is not valid JSON, is valid JSON but not an object, has no numeric
    ``ts``, is older than ``_ISSUE_BODY_CACHE_TTL_S`` seconds, or holds
    a non-string ``body``. A corrupt cache therefore never raises.
    """
    cache_file = _issue_cache_path(repo, number)
    try:
        # A missing file raises FileNotFoundError (an OSError); undecodable
        # bytes and malformed JSON both raise ValueError subclasses.
        data = _json.loads(cache_file.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return None
    # json.loads also succeeds on lists, strings and numbers; calling
    # .get on those raised AttributeError instead of treating the file
    # as corrupt.
    if not isinstance(data, dict):
        return None
    ts = data.get("ts")
    if not isinstance(ts, (int, float)) or _time.time() - ts > _ISSUE_BODY_CACHE_TTL_S:
        return None
    body = data.get("body")
    return body if isinstance(body, str) else None
690
+
691
+
692
def _write_cached_issue_body(repo: str, number: int, body: str) -> None:
    """Store ``body`` in the issue-body cache together with the current time.

    Best-effort: an ``OSError`` from creating the directory or writing the
    file is logged as a warning and otherwise ignored.
    """
    try:
        _ISSUE_BODY_CACHE_DIR.mkdir(parents=True, exist_ok=True)
        destination = _issue_cache_path(repo, number)
        serialized = _json.dumps({"ts": _time.time(), "body": body})
        destination.write_text(serialized)
    except OSError as exc:
        logger.warning(
            "issue-body cache write failed for %s#%d: %s", repo, number, exc,
        )
703
+
704
+
705
# Kill-switch file written when a rate-limit signature is detected.
_RATE_LIMIT_KILL_FILE = _Path.home().joinpath(".delimit", "outreach_pause")
# Lower-case substrings searched for in the lower-cased stderr of ``gh api``.
_RATE_LIMIT_SIGNATURES = (
    "rate limit",
    "rate-limit",
    "secondary rate",
    "403",
    "abuse detection",
    "too many requests",
)
710
+
711
+
712
def _maybe_halt_on_rate_limit(endpoint: str, stderr: str) -> None:
    """Write the outreach kill-switch file when ``stderr`` looks rate-limited.

    LED-2214b followup — defensive halt when github signals rate limit /
    abuse-detection / forbidden. If the lower-cased ``stderr`` contains any
    entry of ``_RATE_LIMIT_SIGNATURES``, ``_RATE_LIMIT_KILL_FILE`` is
    (over)written with a UTC timestamp, the endpoint and the first 400
    characters of ``stderr``, and an error is logged. Per the original
    design notes, the daemon's pre-import kill-switch check then
    short-circuits subsequent ticks until the file is removed, protecting
    the account from escalation (warning -> hard block -> ban).

    Best-effort: a failure to write the file is logged and swallowed.

    NOTE(review): earlier documentation says this also sends an ntfy
    (priority=5); no notification is issued from this function — confirm
    it happens elsewhere.
    """
    if not stderr:
        return
    haystack = stderr.lower()
    for signature in _RATE_LIMIT_SIGNATURES:
        if signature in haystack:
            break
    else:
        # No rate-limit / abuse / forbidden marker in the output.
        return
    try:
        _RATE_LIMIT_KILL_FILE.parent.mkdir(parents=True, exist_ok=True)
        halted_at = _time.strftime('%Y-%m-%dT%H:%M:%SZ', _time.gmtime())
        _RATE_LIMIT_KILL_FILE.write_text(
            f"halted by _maybe_halt_on_rate_limit at "
            f"{halted_at}\n"
            f"endpoint: {endpoint}\n"
            f"stderr: {stderr[:400]}\n"
        )
        logger.error(
            "outreach RATE LIMIT detected — wrote kill-switch %s "
            "(endpoint=%s)", _RATE_LIMIT_KILL_FILE, endpoint,
        )
    except OSError as exc:
        logger.error(
            "outreach rate-limit halt failed to write kill-switch: %s", exc,
        )
744
+
745
+
746
def _gh_api_call(endpoint: str) -> Any:
    """Call ``gh api <endpoint>`` and return parsed JSON or None on failure.

    Local copy of the same idiom in ai.social_target — duplicated to keep
    this module importable without pulling in the much larger
    social_target dependency graph.

    Failure modes that yield None:
      * the subprocess cannot be started or times out (missing or
        non-executable ``gh``, any other ``OSError``, invalid arguments or
        undecodable output raising ``ValueError``);
      * ``gh`` exits non-zero — in that case stderr is first passed to
        ``_maybe_halt_on_rate_limit`` so a rate-limit signature writes the
        kill-switch file before returning;
      * stdout is not valid JSON.
    """
    try:
        proc = _subprocess.run(
            ["gh", "api", endpoint],
            capture_output=True,
            text=True,
            timeout=_GH_API_TIMEOUT_S,
        )
    except (_subprocess.TimeoutExpired, OSError, ValueError) as exc:
        # OSError covers FileNotFoundError (gh not installed) as well as
        # PermissionError and friends; ValueError covers bad arguments and
        # text-decoding failures. All of them mean "no data", never a crash.
        logger.warning("gh api %s failed: %s", endpoint, exc)
        return None
    if proc.returncode != 0:
        # LED-2214b followup: halt the outreach daemon on rate-limit
        # signatures BEFORE returning. Defense in depth against escalating
        # github enforcement (warn -> block -> ban).
        _maybe_halt_on_rate_limit(endpoint, proc.stderr or "")
        logger.info(
            "gh api %s returned %d: %s",
            endpoint, proc.returncode, (proc.stderr or "")[:160],
        )
        return None
    try:
        return _json.loads(proc.stdout)
    except ValueError as exc:
        logger.warning("gh api %s returned non-JSON: %s", endpoint, exc)
        return None
782
+
783
+
784
+ # ---------------------------------------------------------------------------
785
+ # Engagement-floor check (LED-2214b followup, found 2026-05-17 when first
786
+ # autonomous engagement landed on a same-day-created 0-star 4-follower
787
+ # personal scratchpad). Substantive content gate passed (anchors were
788
+ # valid) but engagement value was near zero — no readership, no community.
789
+ #
790
+ # This block fetches lightweight repo metadata (1 gh api call, 7-day
791
+ # cached) and enforces a stars + age + not-archived + not-fork floor
792
+ # BEFORE the anchor check. Sits parallel to the existing repo-search
793
+ # filter in ai/social_target.py:_scan_github line 2024 ("stars == 0 and
794
+ # no description: continue") which only catches REPO targets — issue
795
+ # targets bypass it entirely, which was the gap.
796
+ #
797
+ # Fail-closed: if we can't fetch the metadata, we DON'T engage. Better
798
+ # to skip a real target than spam a maintainer on stale / missing data.
799
+ # ---------------------------------------------------------------------------
800
+
801
# Directory holding one JSON cache file per repository's metadata.
_REPO_META_CACHE_DIR = _Path.home().joinpath(
    ".delimit", "cache", "outreach_repo_meta",
)
# LED-2266: engagement-floor thresholds, each overridable via environment.
# The defaults reproduce the behavior shipped in PR #180. The minimums
# enforce sanity: a zero or negative value would silently disable the gate.
_REPO_META_CACHE_TTL_S = _env_int(
    "DELIMIT_OUTREACH_REPO_META_CACHE_TTL_S",
    7 * 24 * 3600,
    minimum=60,
)
_MIN_REPO_STARS = _env_int(
    "DELIMIT_OUTREACH_MIN_STARS",
    50,
    minimum=1,
)
_MIN_REPO_AGE_DAYS = _env_int(
    "DELIMIT_OUTREACH_MIN_AGE_DAYS",
    30,
    minimum=1,
)
810
+
811
+
812
def _repo_meta_cache_path(repo: str) -> _Path:
    """Return the repo-metadata cache file for ``repo``.

    Slashes in ``repo`` are flattened to ``__`` to form a single file name
    inside ``_REPO_META_CACHE_DIR``.
    """
    flattened = "__".join(repo.split("/"))
    return _REPO_META_CACHE_DIR.joinpath(f"{flattened}.json")
815
+
816
+
817
def _read_cached_repo_meta(repo: str) -> Optional[Dict[str, Any]]:
    """Return cached repo metadata, or None if missing/expired/corrupt.

    A cache entry is a JSON object
    ``{"_cached_ts": <epoch seconds>, "meta": {...}}``. None is returned when
    the file cannot be read or parsed, when the payload is not a JSON
    object, when ``_cached_ts`` is not numeric or is older than
    ``_REPO_META_CACHE_TTL_S``, or when ``meta`` is not a dict.
    """
    cache_file = _repo_meta_cache_path(repo)
    try:
        # A missing file raises FileNotFoundError (an OSError), so no
        # separate exists() probe is needed.
        data = _json.loads(cache_file.read_text())
    except (OSError, ValueError):
        return None
    if not isinstance(data, dict):
        # Valid JSON that is not an object would otherwise crash on .get().
        return None
    ts = data.get("_cached_ts")
    if not isinstance(ts, (int, float)) or _time.time() - ts > _REPO_META_CACHE_TTL_S:
        return None
    meta = data.get("meta")
    return meta if isinstance(meta, dict) else None
830
+
831
+
832
def _write_cached_repo_meta(repo: str, meta: Dict[str, Any]) -> None:
    """Store ``meta`` in the repo-metadata cache with the current time.

    Best-effort: an ``OSError`` while creating the directory or writing the
    file is logged as a warning and otherwise ignored.
    """
    try:
        _REPO_META_CACHE_DIR.mkdir(parents=True, exist_ok=True)
        destination = _repo_meta_cache_path(repo)
        serialized = _json.dumps({"_cached_ts": _time.time(), "meta": meta})
        destination.write_text(serialized)
    except OSError as exc:
        logger.warning("repo-meta cache write failed for %s: %s", repo, exc)
840
+
841
+
842
def fetch_repo_metadata(repo: str) -> Optional[Dict[str, Any]]:
    """Return lightweight metadata for ``repo`` via ``gh api repos/{repo}``.

    A fresh cache entry is returned as-is. Otherwise the API is queried and,
    on success, a dict with the keys ``stargazers_count``, ``forks_count``,
    ``open_issues_count``, ``created_at``, ``pushed_at``, ``archived``,
    ``fork``, ``description`` and ``owner_login`` is cached and returned.
    Returns None when the API call does not produce a JSON object; the
    caller is expected to fail closed in that case.
    """
    hit = _read_cached_repo_meta(repo)
    if hit is not None:
        return hit

    payload = _gh_api_call(f"repos/{repo}")
    if not isinstance(payload, dict):
        # Failures are deliberately not cached — the repo may be reachable
        # on the next attempt.
        return None

    owner = payload.get("owner") or {}
    if isinstance(owner, dict):
        owner_login = owner.get("login", "")
    else:
        owner_login = ""

    meta: Dict[str, Any] = {}
    for counter in ("stargazers_count", "forks_count", "open_issues_count"):
        meta[counter] = payload.get(counter, 0)
    for stamp in ("created_at", "pushed_at"):
        meta[stamp] = payload.get(stamp, "")
    for flag in ("archived", "fork"):
        meta[flag] = bool(payload.get(flag, False))
    meta["description"] = payload.get("description") or ""
    # LED-2214b followup: the owner login lets the engagement floor veto
    # owner-authored issues / PRs, which are mostly internal chore/release
    # artifacts with near-zero engagement value.
    meta["owner_login"] = owner_login

    _write_cached_repo_meta(repo, meta)
    return meta
872
+
873
+
874
# LED-2214b followup: per-issue state cache. This is lighter than
# fetch_issue_full_text (which pulls body + comments) because only the state
# field is needed. It is a separate cache with a shorter TTL because issue
# state changes more often than repo metadata.
# 6 hours: short enough to catch "opened then closed the same day".
_ISSUE_STATE_CACHE_TTL_S = 6 * 60 * 60
878
+
879
+
880
def _issue_state_cache_path(repo: str, number: int) -> _Path:
    """Return the issue-state cache file for ``repo`` / ``number``.

    State files live next to the issue-body cache files in
    ``_ISSUE_BODY_CACHE_DIR`` and are distinguished by a ``__state`` suffix.
    """
    flattened = "__".join(repo.split("/"))
    return _ISSUE_BODY_CACHE_DIR.joinpath(f"{flattened}_{number}__state.json")
883
+
884
+
885
def _read_cached_issue_state(repo: str, number: int) -> Optional[str]:
    """Return the cached issue state, or None if missing/expired/corrupt.

    A cache entry is a JSON object
    ``{"_cached_ts": <epoch seconds>, "state": <str>}``. None is returned
    when the file cannot be read or parsed, when the payload is not a JSON
    object, when ``_cached_ts`` is not numeric or is older than
    ``_ISSUE_STATE_CACHE_TTL_S``, or when ``state`` is not a string.
    """
    cf = _issue_state_cache_path(repo, number)
    try:
        # A missing file raises FileNotFoundError (an OSError), so no
        # separate exists() probe is needed.
        data = _json.loads(cf.read_text())
    except (OSError, ValueError):
        return None
    if not isinstance(data, dict):
        # Valid JSON that is not an object would otherwise crash on .get().
        return None
    ts = data.get("_cached_ts")
    if not isinstance(ts, (int, float)) or _time.time() - ts > _ISSUE_STATE_CACHE_TTL_S:
        return None
    state = data.get("state")
    return state if isinstance(state, str) else None
898
+
899
+
900
def _write_cached_issue_state(repo: str, number: int, state: str) -> None:
    """Store ``state`` in the issue-state cache with the current time.

    Best-effort: an ``OSError`` while creating the directory or writing the
    file is logged as a warning and otherwise ignored.
    """
    try:
        _ISSUE_BODY_CACHE_DIR.mkdir(parents=True, exist_ok=True)
        destination = _issue_state_cache_path(repo, number)
        serialized = _json.dumps({"_cached_ts": _time.time(), "state": state})
        destination.write_text(serialized)
    except OSError as exc:
        logger.warning(
            "issue-state cache write failed for %s#%d: %s", repo, number, exc,
        )
910
+
911
+
912
def fetch_issue_state(repo: str, number: int) -> Optional[str]:
    """Return the github issue/PR state for ``repo``#``number``.

    A fresh cache entry is returned as-is; otherwise
    ``repos/{repo}/issues/{number}`` is queried and a non-empty string
    ``state`` (e.g. 'open' / 'closed') is cached and returned. Returns None
    when the fetch fails or the response carries no usable state; callers
    are expected to treat None as "don't engage" (fail-closed).
    """
    hit = _read_cached_issue_state(repo, number)
    if hit is not None:
        return hit

    payload = _gh_api_call(f"repos/{repo}/issues/{number}")
    state = payload.get("state") if isinstance(payload, dict) else None
    if not (isinstance(state, str) and state):
        return None

    _write_cached_issue_state(repo, number, state)
    return state
927
+
928
+
929
def _repo_age_days(created_at: str) -> Optional[float]:
    """Return the age in days of a UTC ISO-8601 timestamp, or None.

    ``created_at`` is expected in the ``YYYY-MM-DDTHH:MM:SS[.fff][Z]`` form
    and is interpreted as UTC. Returns None for an empty, non-string, or
    unparseable value.
    """
    import calendar  # local import: keeps this block self-contained

    if not created_at or not isinstance(created_at, str):
        return None
    try:
        # Strip fractional seconds + Z suffix
        clean = created_at.replace("Z", "").split(".")[0]
        # timegm() converts a UTC struct_time straight to epoch seconds.
        # The previous ``mktime(...) - time.timezone`` idiom goes through
        # local time and is an hour off whenever DST applies to that date.
        epoch = calendar.timegm(_time.strptime(clean, "%Y-%m-%dT%H:%M:%S"))
    except (ValueError, TypeError):
        return None
    return (_time.time() - epoch) / 86400.0
940
+
941
+
942
def check_engagement_floor(repo: str) -> Tuple[bool, str]:
    """Decide whether ``repo`` clears the engagement-worthiness floor.

    Returns ``(passes, reason)``. ``reason`` is ``"ok"`` on success,
    otherwise a short tag for the caller to log: ``no_metadata``,
    ``archived``, ``fork``, ``stars<50:3`` or ``age_days<30:0.4`` (the
    numbers reflect ``_MIN_REPO_STARS`` / ``_MIN_REPO_AGE_DAYS`` and the
    observed value). Checks run in that order and stop at the first failure.
    """
    meta = fetch_repo_metadata(repo)
    if meta is None:
        # Fail closed: no metadata means no engagement.
        return False, "no_metadata"

    for flag in ("archived", "fork"):
        if meta.get(flag):
            return False, flag

    star_count = meta.get("stargazers_count", 0) or 0
    if star_count < _MIN_REPO_STARS:
        return False, f"stars<{_MIN_REPO_STARS}:{star_count}"

    age_days = _repo_age_days(meta.get("created_at", ""))
    # NOTE(review): an unparseable/missing created_at (age_days is None)
    # passes the age floor, i.e. this particular check fails open —
    # confirm that is intended.
    too_young = age_days is not None and age_days < _MIN_REPO_AGE_DAYS
    if too_young:
        return False, f"age_days<{_MIN_REPO_AGE_DAYS}:{age_days:.1f}"

    return True, "ok"
964
+
965
+
966
def fetch_issue_full_text(repo: str, number: int) -> str:
    """Fetch issue body + first N comments concatenated.

    A fresh cache entry is returned as-is. Otherwise the issue and up to
    ``_ISSUE_COMMENTS_FETCH_LIMIT`` comments are fetched via ``gh api`` and
    their non-empty string bodies are joined with blank lines.

    Returns "" when the issue itself cannot be fetched — the caller treats
    an empty string as 'no anchors available', which correctly blocks
    dispatch (defense in depth; we never accidentally dispatch on a target
    whose substantive evidence we couldn't actually fetch).

    Only a complete result (issue AND comments call both succeeded) is
    written to the cache. Failures are not cached, mirroring
    ``fetch_repo_metadata``: a transient timeout or rate-limit response
    must not pin an empty or partial text for the whole cache TTL.

    Public surface (no underscore prefix) so tests + callers can
    monkeypatch without depending on the private cache helpers.
    """
    cached = _read_cached_issue_body(repo, number)
    if cached is not None:
        return cached

    issue = _gh_api_call(f"repos/{repo}/issues/{number}")
    if not isinstance(issue, dict):
        # Do not negative-cache: the next attempt may succeed.
        return ""

    parts: List[str] = []
    body = issue.get("body")
    if isinstance(body, str) and body:
        parts.append(body)

    comments = _gh_api_call(
        f"repos/{repo}/issues/{number}/comments?per_page={_ISSUE_COMMENTS_FETCH_LIMIT}"
    )
    comments_fetched = isinstance(comments, list)
    if comments_fetched:
        for c in comments[:_ISSUE_COMMENTS_FETCH_LIMIT]:
            if isinstance(c, dict):
                cb = c.get("body")
                if isinstance(cb, str) and cb:
                    parts.append(cb)

    full = "\n\n".join(parts)
    if comments_fetched:
        # Persist only complete results; a body-only text produced by a
        # failed comments call is returned but left uncached.
        _write_cached_issue_body(repo, number, full)
    return full
1003
+
1004
+
1005
+ # ---------------------------------------------------------------------------
1006
+ # Anti-spam — protect the operating account from github enforcement
1007
+ # ---------------------------------------------------------------------------
1008
+ #
1009
+ # Three hard limits on top of the per-tick spam firewall
1010
+ # (DEFAULT_MAX_DISPATCH=3) in the daemon:
1011
+ #
1012
+ # 1. Per-repo cooldown: don't dispatch on a repo we already dispatched
1013
+ # to within the last _DISPATCH_COOLDOWN_DAYS days. Avoids the
1014
+ # "scanner finds 3 issues on the SAME repo in one tick + we
1015
+ # engage on all of them = swarm" failure mode.
1016
+ # 2. Per-day global cap: refuse dispatch once we've crossed
1017
+ # _MAX_DISPATCHES_PER_DAY in the rolling 24-hour window. Catches
1018
+ # multiple-tick scenarios (manual run + scheduled run + retry)
1019
+ # that would multiply the per-tick cap.
1020
+ # 3. Halt on rate-limit (in _gh_api_call): if gh api returns 403/429,
1021
+ # write the kill-switch file and ntfy. GitHub typically warns
1022
+ # before banning; respecting that warning protects the account.
1023
+ #
1024
+ # The dispatch log at _DISPATCH_LOG is the source of truth for #1 and #2.
1025
+ # It's append-only JSONL; each successful dispatch_substantive_outreach
1026
+ # call writes one line.
1027
+
1028
# Append-only JSONL log with one line per recorded dispatch.
_DISPATCH_LOG = _Path.home().joinpath(
    ".delimit", "state", "outreach-dispatch-log.jsonl",
)
# LED-2266: anti-spam thresholds, each overridable via environment (PR #179
# follow-up, panel-flagged). The defaults reproduce the shipped behavior.
# minimum=1 enforces sanity — zero would silently disable the protection.
_DISPATCH_COOLDOWN_DAYS = _env_int(
    "DELIMIT_OUTREACH_COOLDOWN_DAYS",
    7,
    minimum=1,
)
_MAX_DISPATCHES_PER_DAY = _env_int(
    "DELIMIT_OUTREACH_MAX_PER_DAY",
    5,
    minimum=1,
)
1034
+
1035
+
1036
def _read_dispatch_log() -> List[Dict[str, Any]]:
    """Return all dispatch log entries (newest first).

    The log is JSONL. Blank lines, lines that are not valid JSON, and lines
    whose JSON value is not an object are skipped. Entries are ordered by
    their ``ts`` field (compared as text), descending.

    Best-effort — never raises: a missing file yields ``[]`` silently, and
    an unreadable or undecodable file yields ``[]`` after a warning.
    """
    try:
        raw = _DISPATCH_LOG.read_text()
    except FileNotFoundError:
        # No dispatch has been recorded yet.
        return []
    except (OSError, ValueError) as exc:
        # ValueError covers UnicodeDecodeError from a corrupted file.
        logger.warning("dispatch log read failed: %s", exc)
        return []

    out: List[Dict[str, Any]] = []
    for line in raw.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            record = _json.loads(line)
        except ValueError:
            continue
        # A bare number/string/list is valid JSON but not a log entry;
        # keeping it would break every ``entry.get(...)`` consumer.
        if isinstance(record, dict):
            out.append(record)
    # Coerce the key to str so a null or numeric ``ts`` cannot make the
    # sort raise TypeError on mixed types.
    out.sort(key=lambda r: str(r.get("ts") or ""), reverse=True)
    return out
1056
+
1057
+
1058
def _record_dispatch(repo: str, fingerprint: str, category: str) -> None:
    """Append one JSON line describing a dispatch to ``_DISPATCH_LOG``.

    The entry holds a UTC ``ts`` plus ``repo``, ``fingerprint`` and
    ``category``. Best-effort: an ``OSError`` is logged as a warning and
    swallowed, because a dispatch must not crash just because logging broke.
    """
    try:
        _DISPATCH_LOG.parent.mkdir(parents=True, exist_ok=True)
        stamp = _time.strftime("%Y-%m-%dT%H:%M:%SZ", _time.gmtime())
        record = {
            "ts": stamp,
            "repo": repo,
            "fingerprint": fingerprint,
            "category": category,
        }
        with _DISPATCH_LOG.open("a") as handle:
            handle.write(f"{_json.dumps(record)}\n")
    except OSError as exc:
        logger.warning("dispatch log write failed: %s", exc)
1073
+
1074
+
1075
def _check_per_repo_cooldown(repo: str, now: float | None = None) -> Optional[str]:
    """Return cooldown-expiry ISO string if repo is in cooldown, else None.

    Scans the dispatch log for an entry whose ``repo`` matches ``repo``
    (case-insensitive, surrounding whitespace ignored) and whose ``ts`` is
    within the last ``_DISPATCH_COOLDOWN_DAYS`` days. The expiry returned is
    that entry's timestamp plus the cooldown, formatted as UTC. Entries that
    are not dicts or whose ``ts`` cannot be parsed are skipped.

    `now` is overridable for tests. Defaults to current UTC epoch.
    """
    import calendar  # local import: keeps this block self-contained

    if not repo:
        return None
    if now is None:
        now = _time.time()
    cooldown_s = _DISPATCH_COOLDOWN_DAYS * 86400
    cutoff = now - cooldown_s
    wanted = repo.strip().lower()  # loop-invariant, normalise once
    for entry in _read_dispatch_log():
        if not isinstance(entry, dict):
            continue
        # str() guards against a non-string repo field in a damaged log.
        if str(entry.get("repo") or "").strip().lower() != wanted:
            continue
        ts = entry.get("ts", "")
        try:
            # Log timestamps are UTC; timegm() converts without passing
            # through local time, so the result is not skewed by DST (the
            # former ``mktime(...) - time.timezone`` idiom was).
            entry_epoch = calendar.timegm(_time.strptime(ts, "%Y-%m-%dT%H:%M:%SZ"))
        except (ValueError, TypeError):
            continue
        if entry_epoch >= cutoff:
            # Compute cooldown-expiry as entry_ts + cooldown_days
            expires_epoch = entry_epoch + cooldown_s
            return _time.strftime("%Y-%m-%dT%H:%M:%SZ", _time.gmtime(expires_epoch))
    return None
1098
+
1099
+
1100
def _check_per_day_cap(now: float | None = None) -> int:
    """Return count of dispatches in the rolling 24h window.

    Counts dispatch-log entries whose ``ts`` is no older than 24 hours
    before ``now`` (default: the current time). Entries that are not dicts
    or whose ``ts`` cannot be parsed are ignored. The caller compares the
    result against ``_MAX_DISPATCHES_PER_DAY``.
    """
    import calendar  # local import: keeps this block self-contained

    if now is None:
        now = _time.time()
    cutoff = now - 86400
    count = 0
    for entry in _read_dispatch_log():
        if not isinstance(entry, dict):
            continue
        ts = entry.get("ts", "")
        try:
            # Log timestamps are UTC; timegm() avoids the DST-dependent
            # one-hour skew of ``mktime(...) - time.timezone``.
            entry_epoch = calendar.timegm(_time.strptime(ts, "%Y-%m-%dT%H:%M:%SZ"))
        except (ValueError, TypeError):
            continue
        if entry_epoch >= cutoff:
            count += 1
    return count
1116
+
1117
+
1118
def build_candidate_from_github_target(
    target: Dict[str, Any],
    category: str,
    subcategory: str = "",
) -> Optional[SubstantiveCandidate]:
    """Turn a scanned github target into a :class:`SubstantiveCandidate`.

    Returns None — it does *not* raise — whenever the target cannot yield a
    substantive payload. That is the structural-impossibility guarantee:
    a caller that receives None must NOT dispatch.

    The gates run in this order, each one returning None on failure:
      1. Banking-adjacent classification (SHIFT-1 hard veto). Running the
         reg-O check here as well as at submit time means such targets
         never reach the agent prompt (defense in depth).
      2. Repo cannot be derived from the target.
      3. Category is not in the mapped action table.
      4. Engagement floor (see ``check_engagement_floor``).
      5. Target author equals the repo owner.
      6. For issue fingerprints: state unverifiable or not ``open``.
      7. Per-repo cooldown is active.
      8. Rolling per-day dispatch cap is reached.
      9. No technical anchor in snippet + rationale (and, for issue
         targets, in the fetched issue text).
     10. No target artifact, or candidate construction rejects the fields.
    """
    anchor_keys = ("issues", "spec_paths", "cves", "commits", "file_paths")

    is_adjacent, veto_match = is_banking_adjacent(target)
    if is_adjacent:
        logger.info(
            "build_candidate: banking-adjacent veto fingerprint=%s matched=%s",
            target.get("fingerprint"), veto_match,
        )
        return None

    repo = _repo_from_target(target)
    if not repo:
        logger.info(
            "build_candidate: repo unresolved fingerprint=%s url=%s",
            target.get("fingerprint"), target.get("canonical_url"),
        )
        return None

    if category not in _CATEGORY_TO_ACTION:
        logger.info("build_candidate: unmapped category=%s", category)
        return None

    # LED-2214b followup (founder's Niklas-Flaig observation 2026-05-17):
    # the engagement floor comes BEFORE anchor extraction + body fetch so no
    # per-issue API cost is paid for a 0-star personal scratchpad. The
    # repo-search filter in social_target only covers repo targets; issue
    # targets bypassed it entirely, which is the gap this closes.
    passes_floor, floor_tag = check_engagement_floor(repo)
    if not passes_floor:
        logger.info(
            "build_candidate: engagement floor fingerprint=%s repo=%s reason=%s",
            target.get("fingerprint"), repo, floor_tag,
        )
        return None

    # LED-2214b followup (2026-05-17 audit-queue observation): owner-authored
    # items (chore PRs, dev→main promotions, internal scout reports) carry
    # near-zero engagement value. The metadata was already fetched for the
    # floor check, so this lookup is cheap.
    meta = fetch_repo_metadata(repo)
    if meta is not None:
        owner = (meta.get("owner_login") or "").strip().lower()
        author = (target.get("author") or "").strip().lower()
        # A non-empty owner equal to the author implies a non-empty author.
        if owner and author == owner:
            logger.info(
                "build_candidate: owner-authored target fingerprint=%s "
                "author=%s == owner=%s",
                target.get("fingerprint"), author, owner,
            )
            return None

    # LED-2214b followup (2026-05-17 audit-queue observation): engaging on a
    # CLOSED thread is noise — the decision is already made. Cheap state
    # check before the body fetch. Only issue targets have a state.
    issue_ref = _issue_fp_parts(target.get("fingerprint", ""))
    if issue_ref is not None:
        issue_repo, issue_number = issue_ref
        state = fetch_issue_state(issue_repo, issue_number)
        if state is None:
            # Fail-closed: can't verify the issue is live → skip
            logger.info(
                "build_candidate: issue state unverifiable fingerprint=%s",
                target.get("fingerprint"),
            )
            return None
        if state != "open":
            logger.info(
                "build_candidate: issue state=%s (not open) fingerprint=%s",
                state, target.get("fingerprint"),
            )
            return None

    # LED-2214b followup — anti-spam protection for the operating account.
    # These run AFTER the veto / repo / category checks (so rejected targets
    # that would never dispatch are not considered) but BEFORE anchor
    # extraction + body fetch (so re-targeting a recently engaged repo does
    # not cost an issue-body fetch).
    cooldown_until = _check_per_repo_cooldown(repo)
    if cooldown_until:
        logger.info(
            "build_candidate: per-repo cooldown fingerprint=%s repo=%s "
            "expires=%s",
            target.get("fingerprint"), repo, cooldown_until,
        )
        return None

    dispatched_today = _check_per_day_cap()
    if dispatched_today >= _MAX_DISPATCHES_PER_DAY:
        logger.warning(
            "build_candidate: per-day cap hit fingerprint=%s "
            "today_count=%d cap=%d",
            target.get("fingerprint"), dispatched_today, _MAX_DISPATCHES_PER_DAY,
        )
        return None

    snippet = target.get("content_snippet", "") or ""
    rationale = target.get("rationale", "") or ""
    anchors = extract_technical_anchors(f"{snippet}\n{rationale}")

    # LED-2214b followup: when the snippet yields no anchors AND this is an
    # issue target, fetch the full issue text and re-extract. The scanner
    # truncates issue bodies, which tends to cut off the part where anchors
    # live. A failed fetch returns "" and leaves anchors unchanged, so the
    # target stays blocked.
    body_ref = _issue_fp_parts(target.get("fingerprint", ""))
    needs_body_fetch = body_ref is not None and not any(
        anchors.get(name) for name in anchor_keys
    )
    if needs_body_fetch:
        full_text = fetch_issue_full_text(body_ref[0], body_ref[1])
        if full_text:
            anchors = extract_technical_anchors(
                f"{snippet}\n{rationale}\n{full_text}"
            )

    # Label every anchor as "<singular key>:<ref>" and drop duplicates
    # while keeping first-seen order.
    labels = [
        f"{key[:-1] if key.endswith('s') else key}:{ref}"
        for key in anchor_keys
        for ref in anchors.get(key, [])
    ]
    evidence_refs: List[str] = list(dict.fromkeys(labels))
    if not evidence_refs:
        logger.info(
            "build_candidate: no_technical_anchor fingerprint=%s category=%s "
            "(body_fetched=%s)",
            target.get("fingerprint"), category, needs_body_fetch,
        )
        return None

    target_artifact = target.get("canonical_url") or target.get("fingerprint", "")
    if not target_artifact:
        return None

    try:
        candidate = SubstantiveCandidate(
            repo=repo,
            category=category,
            target_artifact=target_artifact,
            evidence_refs=tuple(evidence_refs),
            proposed_action=_CATEGORY_TO_ACTION[category],
            subcategory=subcategory or "",
            venture=target.get("venture", "delimit"),
            fingerprint=target.get("fingerprint", "") or "",
        )
    except ValueError as exc:
        logger.warning(
            "build_candidate: construction failed for fingerprint=%s: %s",
            target.get("fingerprint"), exc,
        )
        return None
    return candidate
1296
+
1297
+
1298
+ # ---------------------------------------------------------------------------
1299
+ # Dispatch wrapper
1300
+ # ---------------------------------------------------------------------------
1301
+
1302
+
1303
# Task type passed to dispatch_task for substantive github outreach; kept
# distinct from the legacy ``outreach`` type.
OUTREACH_SUBSTANTIVE_TASK_TYPE = "outreach_substantive"
1304
+
1305
+
1306
def dispatch_substantive_outreach(
    candidate: SubstantiveCandidate,
    target: Dict[str, Any],
    ledger_item_id: str = "",
) -> Dict[str, Any]:
    """Dispatch one substantive outreach task for a validated candidate.

    The payload is the :class:`SubstantiveCandidate`, whose construction has
    already enforced that every required evidence field is present. The
    task is sent with task_type ``outreach_substantive``, which is distinct
    from the legacy ``outreach`` type (still used by the reddit / x
    branches) so that a regression dispatching a non-substantive github
    task on the old type cannot silently route to the new agent.

    The agent picking up the task is expected to call
    ``delimit_substantive_content_check`` BEFORE submitting any draft body,
    and ``delimit_external_pr_check`` BEFORE submitting when the action is
    ``pr``. Those gates live in :mod:`ai.server`.

    When the dispatch result carries a ``task_id``, the task is linked to
    ``ledger_item_id`` (if given; best-effort) and the dispatch is recorded
    in the dispatch log used by the per-repo cooldown and per-day cap.

    Returns the result of ``dispatch_task`` unchanged.

    Raises:
        TypeError: if ``candidate`` is not a SubstantiveCandidate.
    """
    if not isinstance(candidate, SubstantiveCandidate):
        # Belt-and-suspenders: the dataclass cannot be built without its
        # required fields, but a caller could still hand over a stray dict.
        # Refuse rather than coerce.
        raise TypeError(
            "dispatch_substantive_outreach requires a SubstantiveCandidate "
            f"instance, got {type(candidate).__name__}"
        )

    # Late-bound import: keeps the foundation module light and the
    # cyclic-import surface clean.
    from ai.agent_dispatch import dispatch_task, link_ledger_item

    wants_pr = candidate.proposed_action == "pr"

    constraints = [
        "no-deploy",
        "no-secrets",
        "no-destructive",
        "shift-1-quiet-attraction",
        "must-call-delimit_substantive_content_check-before-submit",
    ]
    tools_needed = [
        "delimit_substantive_content_check",
        "delimit_sensor_github_issue",
    ]
    if wants_pr:
        # PR actions carry one extra mandatory gate and the tool for it.
        constraints.append("must-call-delimit_external_pr_check-before-submit")
        tools_needed.append("delimit_external_pr_check")

    variables: Dict[str, Any] = {
        "candidate": candidate.to_dict(),
        "venture": candidate.venture,
        "repo": candidate.repo,
        "category": candidate.category,
        "subcategory": candidate.subcategory,
        "target_artifact": candidate.target_artifact,
        "evidence_refs": list(candidate.evidence_refs),
        "proposed_action": candidate.proposed_action,
        "source_url": target.get("canonical_url", ""),
        "source_fingerprint": candidate.fingerprint,
        "author": target.get("author", ""),
        "rationale": target.get("rationale", ""),
    }

    title = "".join((
        f"[{candidate.venture.upper()}] Substantive {candidate.proposed_action} ",
        f"on {candidate.repo} ({candidate.category})",
    ))

    description = "".join((
        "Substantive-outreach task (LED-2214b architecture).\n",
        f"Repo: {candidate.repo}\n",
        f"Category: {candidate.category}",
        f"{' / ' + candidate.subcategory if candidate.subcategory else ''}\n",
        f"Action: {candidate.proposed_action}\n",
        f"Target: {candidate.target_artifact}\n",
        f"Evidence: {', '.join(candidate.evidence_refs)}\n",
        "\n",
        "SHIFT-1 constraints:\n",
        " - Pseudonymous account only; no founder identity.\n",
        " - Real technical contribution only. No 'we built' / 'our tool' / "
        "'btw try' framing. Never name our own product in the body.\n",
        " - delimit_substantive_content_check is MANDATORY pre-submit.\n",
        " - delimit_external_pr_check is MANDATORY when proposed_action='pr'.\n",
    ))

    context = "".join((
        "Substantive autonomous outreach via the LED-2214b architecture. ",
        "The pseudonymous-substantive-contribution carve-out (CLAUDE.md SHIFT-1, ",
        "2026-05-04) permits this provided the activity is a genuine technical ",
        "contribution. The pre-submit gate stack enforces that. If the gate ",
        "blocks, file the rejection reason on the linked ledger item and stop.",
    ))

    # Prefer the fingerprint as the de-duplication key; fall back to the
    # repo + artifact pair when the candidate has none.
    if candidate.fingerprint:
        external_key = f"outreach_substantive:{candidate.fingerprint}"
    else:
        external_key = (
            f"outreach_substantive:{candidate.repo}:{candidate.target_artifact}"
        )

    result = dispatch_task(
        title=title,
        description=description,
        assignee="any",
        priority="P1",
        tools_needed=tools_needed,
        constraints=constraints,
        context=context,
        task_type=OUTREACH_SUBSTANTIVE_TASK_TYPE,
        venture=candidate.venture,
        variables=variables,
        external_key=external_key,
    )

    task_id = result.get("task_id", "")
    if task_id:
        if ledger_item_id:
            try:
                link_ledger_item(task_id, ledger_item_id)
            except Exception as exc:  # link is best-effort
                logger.warning(
                    "dispatch_substantive_outreach: link_ledger_item failed "
                    "task=%s ledger=%s err=%s",
                    task_id, ledger_item_id, exc,
                )

        # LED-2214b followup — record the dispatch so later
        # build_candidate calls see it through _check_per_repo_cooldown /
        # _check_per_day_cap. The recorder itself swallows disk failures.
        _record_dispatch(
            repo=candidate.repo,
            fingerprint=candidate.fingerprint,
            category=candidate.category,
        )

    return result