delimit-cli 4.5.13 → 4.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +48 -0
- package/README.md +9 -8
- package/bin/delimit-cli.js +179 -4
- package/bin/delimit-setup.js +46 -6
- package/gateway/ai/_compile_status.py +154 -0
- package/gateway/ai/agent_dispatch.py +41 -0
- package/gateway/ai/backends/git_health.py +175 -0
- package/gateway/ai/backends/tools_infra.py +163 -10
- package/gateway/ai/cli_contract.py +185 -0
- package/gateway/ai/daemon.py +10 -0
- package/gateway/ai/daily_digest.py +1 -2
- package/gateway/ai/delimit_daemon.py +67 -0
- package/gateway/ai/dispatch_gate.py +399 -0
- package/gateway/ai/governance.py +181 -0
- package/gateway/ai/heartbeat.py +290 -0
- package/gateway/ai/hot_reload.py +1 -2
- package/gateway/ai/led193_daemon/executor.py +9 -0
- package/gateway/ai/ledger_manager.py +90 -4
- package/gateway/ai/ledger_proof.py +127 -0
- package/gateway/ai/license.py +132 -47
- package/gateway/ai/license_core.cpython-310-x86_64-linux-gnu.so +0 -0
- package/gateway/ai/license_core.pyi +1 -1
- package/gateway/ai/notify.py +39 -0
- package/gateway/ai/outreach_loop_daemon.py +349 -0
- package/gateway/ai/outreach_substantive.py +1437 -0
- package/gateway/ai/pro_tools.yaml +167 -0
- package/gateway/ai/reaper.py +70 -0
- package/gateway/ai/reddit_scanner.py +17 -6
- package/gateway/ai/sensing/schema.py +1 -1
- package/gateway/ai/sensing/signal_store.py +0 -1
- package/gateway/ai/server.py +5490 -1602
- package/gateway/ai/social_capability/fit_floor.py +114 -12
- package/gateway/ai/social_queue.py +166 -10
- package/gateway/ai/tdqs_lint.py +611 -0
- package/gateway/ai/tenant_auth.py +329 -0
- package/gateway/ai/tenant_data.py +339 -0
- package/gateway/ai/tenant_paths.py +150 -0
- package/gateway/ai/usage_allowlist.py +198 -0
- package/gateway/ai/workers/base.py +2 -2
- package/gateway/ai/workers/executor.py +32 -3
- package/gateway/ai/workers/outreach_drafter.py +0 -1
- package/gateway/ai/workers/pr_drafter.py +0 -1
- package/gateway/ai/x_ranker.py +12 -2
- package/gateway/core/json_schema_diff.py +25 -1
- package/lib/auth-signin.js +136 -0
- package/lib/auth-signout.js +169 -0
- package/lib/delimit-template.js +11 -0
- package/lib/migration-2092-banner.js +213 -0
- package/package.json +5 -2
- package/server.json +4 -4
- package/scripts/build-license-core.sh +0 -85
- package/scripts/security-check.sh +0 -66
- package/scripts/test-license-core-so.sh +0 -107
|
@@ -0,0 +1,1437 @@
|
|
|
1
|
+
"""Substantive-outreach payload, gate, and dispatch (LED-2214b).
|
|
2
|
+
|
|
3
|
+
Implements the autonomous-github-outreach architecture ratified by the
|
|
4
|
+
2026-05-11 deliberation (A1 + Codex payload amendment, B3 + Claude reg-O
|
|
5
|
+
target-side veto, C1 single-responsibility daemon). Transcript stored
|
|
6
|
+
privately.
|
|
7
|
+
|
|
8
|
+
The three SHIFT-1 holes this module closes:
|
|
9
|
+
|
|
10
|
+
* **Empty-payload dispatch** — the old generic ``outreach`` task type
|
|
11
|
+
could be dispatched on a bare "engage user" target with no evidence
|
|
12
|
+
anchor. Twenty-nine LEDs (LED-915–965) had to be bulk-cancelled in
|
|
13
|
+
2026-05 because of this class of failure. The dataclass enforces
|
|
14
|
+
required evidence fields at construction time, so empty-payload
|
|
15
|
+
dispatch is structurally impossible.
|
|
16
|
+
* **Reg-O / banking veto** — a perfectly substantive bug report on a
|
|
17
|
+
banking-fintech repo still violates SHIFT-1 (KYC would deanonymize
|
|
18
|
+
the operating account). ``is_banking_adjacent`` runs at both the scanner layer
|
|
19
|
+
(impossible-by-construction) and the submit-time gate (defense in
|
|
20
|
+
depth) so a regulator-adjacent target never reaches dispatch and
|
|
21
|
+
never reaches submission.
|
|
22
|
+
* **Covert commercial outreach** — even with a substantive technical
|
|
23
|
+
anchor, the agent might leak "btw try delimit-cli". The content gate
|
|
24
|
+
rejects forbidden phrases including our own product names, and
|
|
25
|
+
requires at least one concrete technical anchor (commit hash, spec
|
|
26
|
+
path, issue number, or CVE) before allowing submission.
|
|
27
|
+
|
|
28
|
+
Public surface:
|
|
29
|
+
|
|
30
|
+
* :class:`SubstantiveCandidate` — typed payload schema for dispatch.
|
|
31
|
+
* :func:`is_banking_adjacent` — reg-O / fintech / banking classifier.
|
|
32
|
+
* :func:`extract_technical_anchors` — anchor extraction for content gate.
|
|
33
|
+
* :func:`check_substantive_content` — content-shape gate.
|
|
34
|
+
* :func:`evaluate_substantive_payload` — composite gate (target then content).
|
|
35
|
+
* :func:`build_candidate_from_github_target` — scanner-level constructor.
|
|
36
|
+
* :func:`dispatch_substantive_outreach` — wraps :func:`dispatch_task`
|
|
37
|
+
with task_type='outreach_substantive' and the typed payload.
|
|
38
|
+
|
|
39
|
+
Not part of this module: the daemon (:mod:`ai.outreach_loop_daemon`)
|
|
40
|
+
that ticks scanner → file ledger → dispatch.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
from __future__ import annotations
|
|
44
|
+
|
|
45
|
+
import json as _json
|
|
46
|
+
import logging
|
|
47
|
+
import os as _os
|
|
48
|
+
import re
|
|
49
|
+
import subprocess as _subprocess
|
|
50
|
+
import time as _time
|
|
51
|
+
from dataclasses import asdict, dataclass, field
|
|
52
|
+
from pathlib import Path as _Path
|
|
53
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
54
|
+
|
|
55
|
+
logger = logging.getLogger("delimit.ai.outreach_substantive")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# ---------------------------------------------------------------------------
|
|
59
|
+
# LED-2266: env-configurable thresholds for the outreach gate stack.
|
|
60
|
+
#
|
|
61
|
+
# Each defense layer has a default value chosen during initial deployment
|
|
62
|
+
# (PR #179 anti-spam, PR #180 engagement-floor). Operators can tune any
|
|
63
|
+
# of them via env var without code changes — useful for trying tighter
|
|
64
|
+
# thresholds on a new venture, or loosening when scanner yield is low.
|
|
65
|
+
#
|
|
66
|
+
# Defaults are conservative: they reproduce the PR-as-shipped behavior
|
|
67
|
+
# when no env var is set. The lookup helpers below are the single source
|
|
68
|
+
# of truth — module constants below resolve through them at import time
|
|
69
|
+
# so each threshold is documented in one place.
|
|
70
|
+
# ---------------------------------------------------------------------------
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _env_int(name: str, default: int, minimum: int = 0) -> int:
    """Resolve an integer threshold from the environment variable ``name``.

    Returns ``default`` when the variable is unset, blank, not parseable by
    ``int()``, or smaller than ``minimum``. A WARNING is logged whenever a
    value is rejected and whenever an accepted value differs from
    ``default``, so the effective configuration shows up in the logs.
    """
    text = _os.environ.get(name, "").strip()
    if text == "":
        return default

    try:
        parsed = int(text)
    except ValueError:
        logger.warning(
            "config: %s=%r is not an integer — using default=%d", name, text, default,
        )
        return default

    # A value under the floor is treated like an invalid one: fall back to
    # the default rather than clamping to ``minimum``.
    if parsed < minimum:
        logger.warning(
            "config: %s=%d below floor %d — using default=%d",
            name, parsed, minimum, default,
        )
        return default

    if parsed == default:
        return parsed
    logger.warning("config: %s overridden default=%d -> %d", name, default, parsed)
    return parsed
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ---------------------------------------------------------------------------
|
|
103
|
+
# Constants — keep these auditable. Edits require panel deliberation per
|
|
104
|
+
# the CLAUDE.md SHIFT-1 constitutional binding.
|
|
105
|
+
# ---------------------------------------------------------------------------
|
|
106
|
+
|
|
107
|
+
# The only actions a substantive-outreach payload may propose.
PROPOSED_ACTIONS = ("comment", "issue", "pr")

# CLAUDE.md SHIFT-1 HARD VETO list. A target whose scanned text contains any
# of these keywords never enters the dispatch queue: KYC would deanonymize
# the operating account on these target classes regardless of brand cover.
#
# The list is deliberately over-inclusive. A false positive only means we do
# not engage; a false negative risks a constitutional violation.
#
# Order matters: the matcher reports the FIRST keyword that hits.
#
# NOTE(review): "ach " and "swift " carry a trailing space as a crude
# end-of-word marker. With plain substring matching they also hit ordinary
# words ending in "ach" ("each ", "approach ") and any mention of the Swift
# language followed by a space — confirm that yield loss is acceptable.
BANKING_ADJACENT_KEYWORDS: Tuple[str, ...] = (
    # --- Direct ---
    "bank",
    "banking",
    "credit-union",
    "credit union",
    # --- Brokerage / capital markets ---
    "broker",
    "brokerage",
    "securities",
    "custodian",
    "custody",
    "clearinghouse",
    "clearing-house",
    "settlement",
    # --- Payments / cards ---
    "payment",
    "payments",
    "card-issuer",
    "card issuer",
    "issuer-processor",
    "acquirer",
    "merchant-acquirer",
    "interchange",
    "ach ",
    "swift ",
    # --- Lending ---
    "lender",
    "lending",
    "mortgage",
    "underwriting",
    "underwrite",
    # --- Insurance (reg-adjacent under McCarran-Ferguson) ---
    "insurance",
    "insurer",
    "reinsurer",
    "underwriter",
    # --- Crypto-fiat onramps (FinCEN-regulated MSBs) ---
    "msb",
    "money-services-business",
    "money services business",
    "onramp",
    "off-ramp",
    "fiat-onramp",
    # --- Wealth / advisors (RIA / IAR regulated) ---
    "wealth-management",
    "wealth management",
    "registered investment",
    "ria-firm",
    "broker-dealer",
    "broker dealer",
    # --- Compliance / AML / KYC vendors (likely reg-O downstream) ---
    "aml-platform",
    "kyc-platform",
    "kyc-provider",
    "kyc provider",
    "bsa-aml",
    "sanctions-screening",
    "ofac-screening",
    # --- Regulator-adjacent ---
    "regulator",
    "regulatory-reporting",
    "fr-y-9c",
    "call-report",
    "fdic",
    "occ-supervised",
    "frb-supervised",
    "finra",
    "sec-registered",
    # --- Reg-O specifically ---
    "reg-o",
    "regulation-o",
    "regulation o",
    "regulation-w",
    # --- Stablecoins / fintech with clear bank rails ---
    "stablecoin",
    "neobank",
    "challenger-bank",
    "core-banking",
    "core banking",
    "ledger-banking",
    "open-banking",
)
|
|
147
|
+
|
|
148
|
+
# Phrases the agent must never emit on a third-party repo: commercial
# framing plus contentless praise. Per panel verdict + Codex amendment, a
# substantive contribution stands on technical merit alone, not on naming
# the upstream tool.
#
# Entries are lowercase and contain single spaces only; the content gate
# compares them against a lowercased body using substring matching.
FORBIDDEN_PHRASES: Tuple[str, ...] = (
    # --- Commercial framing ---
    "we built",
    "we made",
    "we created",
    "we developed",
    "we ship",
    "our tool",
    "our product",
    "our cli",
    "our service",
    "our platform",
    "you should try",
    "you might try",
    "you may want to try",
    "you could try",
    "give it a try",
    "give us a try",
    "check out our",
    "have a look at our",
    "take a look at our",
    "btw try",
    "btw, try",
    "by the way try",
    # --- Generic non-substantive ---
    "thanks for the project",
    "great project",
    "love the project",
    "interesting project",
)

# Our own product names. Unlike the phrases above these are matched as
# standalone tokens (word boundaries on both sides), so "delimited" and
# "delimiter" do not trip the gate.
FORBIDDEN_PRODUCT_TOKENS: Tuple[str, ...] = (
    "delimit",
    "delimit-cli",
    "delimit.ai",
    "delimitdev",
)
|
|
173
|
+
|
|
174
|
+
# Length floor (in characters) for a draft body. Anything shorter cannot be
# substantive regardless of anchors; calibrated to "two-sentence bug report".
MIN_BODY_LENGTH = 200

# Technical-anchor patterns. The content gate needs at least one of them to
# hit somewhere in the body.

# 7-40 hex characters as a standalone word, any letter case.
# NOTE(review): this also accepts purely decimal runs of 7+ digits (dates,
# ticket ids) and all-letter hex-looking words — confirm that is intended.
_COMMIT_HASH_RE = re.compile(r"\b[0-9a-f]{7,40}\b", flags=re.IGNORECASE)

# "#" followed by 1-7 digits.
_ISSUE_REF_RE = re.compile(r"#\d{1,7}\b")

# CVE identifiers, any letter case.
_CVE_RE = re.compile(r"\bCVE-\d{4}-\d{4,7}\b", flags=re.IGNORECASE)

# An openapi / swagger / asyncapi file with a yaml, yml or json extension,
# optionally preceded by a directory path. The match includes the single
# leading whitespace or backtick character when one is present.
_SPEC_PATH_RE = re.compile(
    r"(?:^|[\s`])(?:[A-Za-z0-9_\-/\.]+/)?"
    r"(?:openapi|swagger|asyncapi)[\w\-/]*"
    r"\.(?:ya?ml|json)\b",
    flags=re.IGNORECASE,
)

# Any path ending in one of the listed source/config extensions. Case
# sensitive. As above, the match may start with one whitespace/backtick.
_FILE_PATH_RE = re.compile(
    r"(?:^|[\s`])[A-Za-z0-9_\-/.]+"
    r"\.(?:py|ts|tsx|js|jsx|go|rs|java|rb|c|cc|cpp|h|md|ya?ml|json|toml|proto)\b"
)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
# ---------------------------------------------------------------------------
|
|
194
|
+
# Payload schema
|
|
195
|
+
# ---------------------------------------------------------------------------
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
@dataclass(frozen=True)
class SubstantiveCandidate:
    """Typed, immutable dispatch payload for substantive github outreach.

    Every required field is validated in ``__post_init__``, so a partially
    populated candidate cannot exist: the scanner builds one of these or
    nothing, and the dispatcher refuses to fire on anything else.

    Fields:
        repo: ``owner/name`` of the target repository. Both halves must be
            non-empty. Required.
        category: One of ``pain_thread``, ``adoption_lead``,
            ``competitor_user``, ``own_repo_activity``. Required.
        target_artifact: Canonical URL of the artifact we'd act on
            (the issue, the PR, the repo root, etc.). Required.
        evidence_refs: Concrete technical anchors extracted from the
            target — issue numbers, commit hashes, spec paths, CVE IDs.
            Any iterable is accepted and stored as a tuple; a single
            string is treated as one reference. At least one reference
            must be non-blank.
        proposed_action: One of ``comment``, ``issue``, ``pr``.
        subcategory: Optional finer-grained label (e.g. ``openapi_spec``).
        venture: Sourcing venture. Defaults to ``delimit``.
        fingerprint: Scanner fingerprint for idempotency. Optional.

    Raises:
        ValueError: if any required field is missing or invalid.
    """

    repo: str
    category: str
    target_artifact: str
    evidence_refs: Tuple[str, ...]
    proposed_action: str
    subcategory: str = ""
    venture: str = "delimit"
    fingerprint: str = ""

    def __post_init__(self):
        # A bare "/" or "owner/" contains a slash but is not a repository.
        repo_text = self.repo if isinstance(self.repo, str) else ""
        owner, _, name = repo_text.partition("/")
        if not owner.strip() or not name.strip():
            raise ValueError(
                f"SubstantiveCandidate.repo must be 'owner/name', got {self.repo!r}"
            )
        if self.category not in {
            "pain_thread", "adoption_lead", "competitor_user", "own_repo_activity",
        }:
            raise ValueError(
                f"SubstantiveCandidate.category invalid: {self.category!r}"
            )
        if not self.target_artifact:
            raise ValueError("SubstantiveCandidate.target_artifact is required")

        # Normalise BEFORE validating. Truthiness alone is not enough: an
        # exhausted iterator is truthy yet yields nothing, and tuple("#12")
        # would explode a lone string into single characters.
        raw_refs = self.evidence_refs
        if raw_refs is None:
            refs: Tuple[Any, ...] = ()
        elif isinstance(raw_refs, str):
            refs = (raw_refs,)
        else:
            refs = tuple(raw_refs)
        if not any(str(ref).strip() for ref in refs):
            raise ValueError(
                "SubstantiveCandidate.evidence_refs cannot be empty — "
                "empty-payload dispatch is structurally forbidden (LED-2214b)"
            )
        # Frozen dataclasses block normal assignment; go through object.
        object.__setattr__(self, "evidence_refs", refs)

        if self.proposed_action not in PROPOSED_ACTIONS:
            raise ValueError(
                f"SubstantiveCandidate.proposed_action must be one of "
                f"{PROPOSED_ACTIONS}, got {self.proposed_action!r}"
            )

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict copy with ``evidence_refs`` as a list."""
        d = asdict(self)
        d["evidence_refs"] = list(self.evidence_refs)
        return d
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
# ---------------------------------------------------------------------------
|
|
271
|
+
# Reg-O / banking target-side veto
|
|
272
|
+
# ---------------------------------------------------------------------------
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def is_banking_adjacent(target: Dict[str, Any]) -> Tuple[bool, str]:
    """Return ``(is_adjacent, matched)`` for a scanner target dict.

    Two passes, first hit wins:

    1. Keyword pass. The values stored under ``canonical_url``,
       ``rationale``, ``content_snippet``, ``repo_topics``,
       ``repo_description``, ``repo`` and ``source_id`` are stringified
       (list/tuple values element by element), joined with single spaces
       and lowercased. The first entry of ``BANKING_ADJACENT_KEYWORDS``
       found as a substring is returned as ``matched``. Only the keyword
       is reported, not the field it was found in.
    2. LED-2265 typo-squat pass, delegated to
       :func:`_is_typosquat_impersonation`. A hit is returned as
       ``"typosquat:<org>"``, where ``<org>`` is the known-regulated name
       that helper matched.

    When nothing matches the result is ``(False, "")``.

    Callers should treat any True return as a hard veto — no override path
    exists at the scanner layer, by design.
    """
    scanned_keys = (
        "canonical_url", "rationale", "content_snippet",
        "repo_topics", "repo_description", "repo", "source_id",
    )
    parts: List[str] = []
    for key in scanned_keys:
        value = target.get(key)
        if isinstance(value, (list, tuple)):
            parts.extend(str(item) for item in value)
        elif value is not None:
            parts.append(str(value))

    # Some keywords ("ach ", "swift ") end in a space as an end-of-word
    # marker. Pad the haystack so they can also match when they are the very
    # last word; otherwise "... settles over ACH" would slip through.
    haystack = " ".join(parts).lower() + " "
    for keyword in BANKING_ADJACENT_KEYWORDS:
        if keyword in haystack:
            return True, keyword

    # LED-2265: typo-squat impersonation of known regulated orgs.
    impersonated = _is_typosquat_impersonation(target)
    if impersonated:
        return True, f"typosquat:{impersonated}"

    return False, ""
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
# LED-2265: known-regulated-entity org names used by the typo-squat check
# below. Names are lowercase. Conservative list — false positives cost zero
# (we just don't engage), false negatives risk substantive engagement with a
# malicious impersonator.
#
# Order matters: the first name found inside a normalised candidate wins.
# The hyphenated spellings can never match, because candidates are stripped
# of non-alphanumerics before comparison; the hyphen-less spellings next to
# them do the work.
_KNOWN_REGULATED_ORGS: Tuple[str, ...] = (
    # Tier-1 US banks
    "jpmorgan", "jpmorganchase", "chase", "goldman", "goldmansachs",
    "morganstanley", "citi", "citigroup", "citibank",
    "bankofamerica", "bofa", "wellsfargo", "usbank", "pnc", "truist",
    "capitalone",
    # Foreign G-SIBs
    "hsbc", "barclays", "deutschebank", "credit-suisse", "creditsuisse",
    "ubs", "santander", "bnpparibas", "societegenerale", "ing", "lloyds",
    # US clearing / capital markets
    "blackrock", "vanguard", "fidelity", "schwab", "interactive-brokers",
    "interactivebrokers", "nyse", "nasdaq",
    # Crypto / fintech with bank rails
    "coinbase", "kraken", "circle", "tether", "binance",
    # Card networks
    "visa", "mastercard", "amex", "americanexpress",
    # Regulators
    "fdic", "occ", "frb", "federalreserve", "finra", "secgov",
)


# LED-2265: digit-for-letter homoglyph map. Keys are digits commonly used as
# letter substitutes; values are the letter they impersonate. One-directional
# on purpose: a candidate name is rewritten INTO the name it may be
# impersonating, then compared against the list above.
_HOMOGLYPH_DIGITS: Dict[str, str] = {
    "0": "o", "1": "i", "3": "e", "4": "a", "5": "s", "7": "t",
}


def _normalize_for_typosquat(name: str) -> str:
    """Fold ``name`` into the letters-only form used for comparison.

    Steps: lowercase, drop every character outside ``[a-z0-9]``, then
    replace each digit that has an entry in ``_HOMOGLYPH_DIGITS`` with its
    letter. Example: ``JPM0RCHASE`` -> ``jpm0rchase`` -> ``jpmorchase``.
    Digits without a mapping (2, 6, 8, 9) are kept as-is.
    """
    alphanum = re.sub(r"[^a-z0-9]", "", name.lower())
    return "".join(_HOMOGLYPH_DIGITS.get(c, c) for c in alphanum)


def _is_typosquat_impersonation(target: Dict[str, Any]) -> str:
    """Return the impersonated org name, or ``""`` when nothing is suspected.

    Candidate names are collected from:

    * ``author`` (a string),
    * the owner and repo segments of a ``https://github.com/<owner>/<repo>``
      ``canonical_url``,
    * the owner / repo segments of a ``github:<kind>:<owner>[/<repo>]``
      ``fingerprint``,
    * the ``/``-separated segments of ``repo`` (a string or a list/tuple of
      strings), so targets that carry only a repo slug are covered too.

    Only candidates containing at least one digit are examined: a pure-letter
    name carries no positive evidence of homoglyph substitution. Each such
    candidate is normalised and flagged when any ``_KNOWN_REGULATED_ORGS``
    entry occurs inside it as a substring.

    ``1`` is visually ambiguous between ``i`` and ``l``. The homoglyph table
    maps it to ``i``; a second variant with every ``1`` read as ``l`` is
    tried afterwards so that e.g. ``go1dman`` and ``b1ackrock`` are caught.
    Names that mix both readings of ``1`` are not covered.
    """
    candidates: List[str] = []

    author = target.get("author") or ""
    if isinstance(author, str) and author:
        candidates.append(author)

    url = target.get("canonical_url") or ""
    if isinstance(url, str) and url:
        m = re.match(r"^https?://github\.com/([^/]+)/([^/?#]+)", url)
        if m:
            candidates.append(m.group(1))  # org/user
            candidates.append(m.group(2))  # repo name

    fp = target.get("fingerprint") or ""
    if isinstance(fp, str) and fp:
        m = re.match(r"^github:[^:]+:([^/:]+)(?:/([^:]+))?", fp)
        if m:
            candidates.append(m.group(1))
            if m.group(2):
                candidates.append(m.group(2))

    repo_field = target.get("repo")
    repo_values = repo_field if isinstance(repo_field, (list, tuple)) else [repo_field]
    for repo_value in repo_values:
        if isinstance(repo_value, str):
            candidates.extend(seg for seg in repo_value.split("/") if seg)

    for cand in candidates:
        # Digits are the disambiguator; skip names without any.
        if not any(c.isdigit() for c in cand):
            continue
        # Table reading first, so a name the table alone resolves keeps
        # reporting the same org; the "1 as l" reading is the fallback.
        variants = (
            _normalize_for_typosquat(cand),
            _normalize_for_typosquat(cand.replace("1", "l")),
        )
        for normalized in variants:
            if not normalized:
                continue
            for org in _KNOWN_REGULATED_ORGS:
                if org in normalized:
                    return org

    return ""
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
# ---------------------------------------------------------------------------
|
|
417
|
+
# Technical-anchor extraction + content gate
|
|
418
|
+
# ---------------------------------------------------------------------------
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def extract_technical_anchors(text: str) -> Dict[str, List[str]]:
    """Extract every technical anchor found in ``text``.

    Returns a dict with the keys ``commits``, ``issues``, ``cves``,
    ``spec_paths`` and ``file_paths``, each holding the list of matches
    (duplicates kept). A falsy ``text`` yields five empty lists. A non-empty
    list under any key is enough to satisfy the substantive-content gate.

    Spec paths (openapi / swagger / asyncapi) are matched by their own
    pattern and are usually also picked up by the broader file-path pattern.

    Both path patterns may include the single whitespace or backtick
    character that precedes the path. That character is trimmed here — all
    whitespace, not just spaces, so a path at the start of a line does not
    come back with a leading newline or tab.
    """
    if not text:
        return {"commits": [], "issues": [], "cves": [], "spec_paths": [], "file_paths": []}

    def _trim(match: str) -> str:
        # Remove a wrapping backtick and any surrounding whitespace.
        return match.strip("`").strip()

    return {
        "commits": _COMMIT_HASH_RE.findall(text),
        "issues": _ISSUE_REF_RE.findall(text),
        "cves": _CVE_RE.findall(text),
        "spec_paths": [_trim(m) for m in _SPEC_PATH_RE.findall(text)],
        "file_paths": [_trim(m) for m in _FILE_PATH_RE.findall(text)],
    }
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def _hits_forbidden_product_token(text_lower: str) -> Optional[str]:
    """Return the first forbidden product token found as a whole word.

    Tokens are tried in ``FORBIDDEN_PRODUCT_TOKENS`` order, each wrapped in
    ``\\b`` word boundaries. ``None`` means no token matched. The tokens are
    lowercase, so the caller is expected to pass already-lowercased text.
    """
    return next(
        (
            name
            for name in FORBIDDEN_PRODUCT_TOKENS
            if re.search(rf"\b{re.escape(name)}\b", text_lower)
        ),
        None,
    )
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
def check_substantive_content(
    body: str,
    proposed_action: str,
) -> Dict[str, Any]:
    """Validate a draft body against the SHIFT-1 content rules.

    Order of checks (load-bearing — do not reorder without panel
    deliberation):

    0. Hard stops, returned immediately: a non-string or blank body
       (``empty_body``) and an unknown ``proposed_action``
       (``invalid_proposed_action``).
    1. Length floor — bodies shorter than ``MIN_BODY_LENGTH``.
    2. Forbidden product tokens — bans our own names (defends against the
       "btw try delimit-cli" class).
    3. Forbidden commercial phrases — the broader "we built / our tool /
       you should try" class. Whitespace runs in the body are collapsed to
       single spaces first, so a phrase broken by a line wrap or a double
       space is still caught.
    4. Technical anchor — at least one commit hash, issue ref, CVE, spec
       path, or file path must be present.

    Checks 1-4 all run; every failure is listed in ``violations``.

    The target-side reg-O veto is NOT enforced here — that lives in
    :func:`is_banking_adjacent`, called separately by
    :func:`evaluate_substantive_payload`.

    Returns:
        Dict with keys ``verdict`` (``"allow"`` | ``"block"``), ``reason``
        (``"ok"`` or the short code of the first violation:
        ``body_too_short``, ``forbidden_product_token``,
        ``forbidden_phrase``, ``no_technical_anchor``), ``violations``
        (list of human-readable strings) and ``anchors`` (the
        extracted-anchors dict; empty for the hard stops).
    """
    if not isinstance(body, str) or not body.strip():
        return {
            "verdict": "block",
            "reason": "empty_body",
            "violations": ["body is empty"],
            "anchors": {},
        }
    if proposed_action not in PROPOSED_ACTIONS:
        return {
            "verdict": "block",
            "reason": "invalid_proposed_action",
            "violations": [f"proposed_action must be one of {PROPOSED_ACTIONS}"],
            "anchors": {},
        }

    # (reason_code, message) pairs. The code is tracked separately so that
    # ``reason`` stays a stable identifier even for messages that embed
    # variable data, such as the measured body length.
    findings: List[Tuple[str, str]] = []

    if len(body) < MIN_BODY_LENGTH:
        findings.append((
            "body_too_short",
            f"body length {len(body)} < MIN_BODY_LENGTH={MIN_BODY_LENGTH}",
        ))

    body_lower = body.lower()
    product_hit = _hits_forbidden_product_token(body_lower)
    if product_hit:
        findings.append((
            "forbidden_product_token",
            f"forbidden_product_token: {product_hit!r}",
        ))

    # Phrases contain single spaces only; collapse whitespace so
    # "we\nbuilt" and "we  built" cannot slip past "we built".
    collapsed = " ".join(body_lower.split())
    for phrase in FORBIDDEN_PHRASES:
        if phrase in collapsed:
            findings.append(("forbidden_phrase", f"forbidden_phrase: {phrase!r}"))

    anchors = extract_technical_anchors(body)
    if not any(anchors.values()):
        findings.append((
            "no_technical_anchor",
            "no_technical_anchor: body must cite a commit hash, "
            "issue number, CVE, spec path, or source file path",
        ))

    if findings:
        return {
            "verdict": "block",
            "reason": findings[0][0],
            "violations": [message for _, message in findings],
            "anchors": anchors,
        }
    return {
        "verdict": "allow",
        "reason": "ok",
        "violations": [],
        "anchors": anchors,
    }
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
# ---------------------------------------------------------------------------
|
|
533
|
+
# Composite gate: target-side veto BEFORE content
|
|
534
|
+
# ---------------------------------------------------------------------------
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def _union_target_field(existing: Any, extra: Any) -> Any:
    """Combine a target's own field value with a caller-supplied one.

    Returns ``extra`` unchanged when the target has no value (or an equal
    one); otherwise returns a flat list holding both, so a keyword scan over
    the field sees the union instead of only the caller's value.
    """
    if not existing or existing == extra:
        return extra
    combined: List[Any] = []
    for value in (existing, extra):
        if isinstance(value, (list, tuple)):
            combined.extend(value)
        else:
            combined.append(value)
    return combined


def evaluate_substantive_payload(
    body: str,
    proposed_action: str,
    target: Optional[Dict[str, Any]] = None,
    repo: str = "",
    repo_description: str = "",
    repo_topics: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Full pre-submit gate: reg-O target veto first, then content shape.

    Per the 2026-05-11 panel verdict + Claude's reg-O target-side veto
    amendment, target classification is checked FIRST. A perfectly
    substantive bug report on a banking-adjacent repo still violates
    SHIFT-1, so the gate refuses regardless of content quality.

    Callers can pass either:
      * a full ``target`` dict (forwarded to :func:`is_banking_adjacent`),
      * the discrete ``repo`` / ``repo_description`` / ``repo_topics``
        fields, which are wrapped in a synthetic target,
      * or both. The discrete fields are then ADDED to a copy of ``target``;
        when both sides carry a value for the same key the two are kept
        side by side in a list, so the veto scan never loses text the
        target already had. The caller's ``target`` dict is not mutated.

    Returns:
        Dict with ``verdict``, ``reason``, ``violations``, ``anchors``,
        and ``stage`` (``"target"`` or ``"content"``) indicating which
        gate produced the result.
    """
    if target is None:
        target = {
            "repo": repo,
            "repo_description": repo_description,
            "repo_topics": repo_topics or [],
        }
    else:
        extras = {
            "repo": repo,
            "repo_description": repo_description,
            "repo_topics": repo_topics,
        }
        if any(extras.values()):
            # Overwriting here would be a false-negative hole: a clean
            # caller-supplied description must not mask a banking keyword
            # in the target's own description.
            merged = dict(target)
            for key, extra in extras.items():
                if extra:
                    merged[key] = _union_target_field(merged.get(key), extra)
            target = merged

    adjacent, matched = is_banking_adjacent(target)
    if adjacent:
        return {
            "verdict": "block",
            "reason": "banking_adjacent_target",
            "violations": [f"banking_adjacent_target: matched keyword {matched!r}"],
            "anchors": {},
            "stage": "target",
        }

    content_result = check_substantive_content(body, proposed_action)
    content_result["stage"] = "content"
    return content_result
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
# ---------------------------------------------------------------------------
|
|
593
|
+
# Scanner-level constructor
|
|
594
|
+
# ---------------------------------------------------------------------------
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
# ``github:<kind>:<owner>/<name>[:...]`` scanner fingerprints; group 1 is the
# ``owner/name`` slug.
_FINGERPRINT_REPO_RE = re.compile(
    r"^github:(?:issue|repo|fork|star|outreach):([^:]+/[^:]+)(?::|$)"
)
# ``https://github.com/<owner>/<name>`` URLs, optionally followed by a path,
# fragment or query; group 1 is the ``owner/name`` slug.
_URL_REPO_RE = re.compile(
    r"^https?://github\.com/([^/]+/[^/]+?)(?:/|$|#|\?)"
)


def _repo_from_target(target: Dict[str, Any]) -> str:
    """Best-effort ``owner/name`` slug for a scanner target, or ``""``.

    Sources are tried in order: the ``repo`` field (used when, once
    stripped, it contains a ``/``), then the ``fingerprint``, then the
    ``canonical_url``. A field that is missing, ``None`` or not a string is
    skipped instead of raising, so a sparse target simply falls through to
    the next source.
    """
    repo = target.get("repo")
    if isinstance(repo, str):
        repo = repo.strip()
        if repo and "/" in repo:
            return repo

    sources = (
        ("fingerprint", _FINGERPRINT_REPO_RE),
        ("canonical_url", _URL_REPO_RE),
    )
    for key, pattern in sources:
        value = target.get(key)
        if not isinstance(value, str) or not value:
            continue
        m = pattern.match(value)
        if m:
            return m.group(1)
    return ""
|
|
618
|
+
|
|
619
|
+
|
|
620
|
+
_CATEGORY_TO_ACTION = {
|
|
621
|
+
"pain_thread": "comment",
|
|
622
|
+
"adoption_lead": "issue",
|
|
623
|
+
"competitor_user": "comment",
|
|
624
|
+
"own_repo_activity": "comment",
|
|
625
|
+
}
|
|
626
|
+
|
|
627
|
+
|
|
628
|
+
# ---------------------------------------------------------------------------
|
|
629
|
+
# Issue-body fetch + cache (LED-2214b followup)
|
|
630
|
+
#
|
|
631
|
+
# The scanner truncates issue bodies to 200 chars before they reach the
|
|
632
|
+
# substantive gate (see ai/social_target.py:_scan_github phase 2). 200
|
|
633
|
+
# chars covers the title + opening summary but almost always strips the
|
|
634
|
+
# tail where anchors live — stack traces, file paths in error messages,
|
|
635
|
+
# references to other issues/commits. Result: every issue target gets
|
|
636
|
+
# rejected as no-anchor even when the issue body is anchor-rich.
|
|
637
|
+
#
|
|
638
|
+
# This block fetches the FULL issue body + first N comments via gh CLI
|
|
639
|
+
# when the snippet-derived extraction comes up empty. Per-issue 7-day
|
|
640
|
+
# disk cache; daily tick at max_dispatch=3 means worst-case ~3 API calls
|
|
641
|
+
# per day after cache warms.
|
|
642
|
+
# ---------------------------------------------------------------------------
|
|
643
|
+
|
|
644
|
+
# On-disk location of the per-issue full-text cache.
_ISSUE_BODY_CACHE_DIR = _Path.home().joinpath(
    ".delimit", "cache", "outreach_issue_bodies",
)
# LED-2266: env-overridable via DELIMIT_OUTREACH_ISSUE_BODY_CACHE_TTL_S.
# Default is 7 days; minimum=60 is passed so caching cannot be switched
# off outright (that would hit the GitHub API on every tick).
_ISSUE_BODY_CACHE_TTL_S = _env_int(
    "DELIMIT_OUTREACH_ISSUE_BODY_CACHE_TTL_S",
    7 * 24 * 3600,
    minimum=60,
)
# Number of comments requested (and kept) per issue.
_ISSUE_COMMENTS_FETCH_LIMIT = 5
# Timeout, in seconds, for each ``gh api`` subprocess call.
_GH_API_TIMEOUT_S = 30
|
|
653
|
+
_ISSUE_FP_RE = re.compile(r"^github:issue:([^/:]+/[^/:]+):(\d+)$")
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
def _issue_fp_parts(fingerprint: str) -> Optional[Tuple[str, int]]:
|
|
657
|
+
"""Extract (repo, issue_number) from a ``github:issue:owner/name:N`` fp.
|
|
658
|
+
|
|
659
|
+
Returns None for any non-issue fingerprint, so callers can use the
|
|
660
|
+
None return as the "skip body fetch" signal.
|
|
661
|
+
"""
|
|
662
|
+
m = _ISSUE_FP_RE.match(fingerprint or "")
|
|
663
|
+
if not m:
|
|
664
|
+
return None
|
|
665
|
+
try:
|
|
666
|
+
return m.group(1), int(m.group(2))
|
|
667
|
+
except (TypeError, ValueError):
|
|
668
|
+
return None
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
def _issue_cache_path(repo: str, number: int) -> _Path:
    """Return the cache file path for one issue's full text.

    Every ``/`` in ``repo`` becomes ``__`` so the name is a single path
    component: ``<owner>__<name>_<number>.json``.
    """
    flattened = "__".join(repo.split("/"))
    return _ISSUE_BODY_CACHE_DIR / f"{flattened}_{number}.json"
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
def _read_cached_issue_body(repo: str, number: int) -> Optional[str]:
    """Return the cached full text for an issue, or None if unusable.

    None is returned when the cache file is missing or unreadable, is
    not valid JSON, is valid JSON but not an object, has a missing or
    non-numeric ``ts``, is older than ``_ISSUE_BODY_CACHE_TTL_S``
    seconds, or has a non-string ``body``.
    """
    cache_file = _issue_cache_path(repo, number)
    try:
        # A missing file raises FileNotFoundError (an OSError); decode
        # errors and malformed JSON are both ValueError subclasses.
        data = _json.loads(cache_file.read_text())
    except (OSError, ValueError):
        return None
    if not isinstance(data, dict):
        # Valid JSON of the wrong shape (list, string, number, ...) is a
        # corrupt cache entry, not a crash.
        return None
    ts = data.get("ts")
    if not isinstance(ts, (int, float)) or _time.time() - ts > _ISSUE_BODY_CACHE_TTL_S:
        return None
    body = data.get("body")
    return body if isinstance(body, str) else None
|
|
690
|
+
|
|
691
|
+
|
|
692
|
+
def _write_cached_issue_body(repo: str, number: int, body: str) -> None:
    """Store ``body`` in the issue-body cache with the current timestamp.

    Best-effort: an ``OSError`` while creating the directory or writing
    the file is logged as a warning and swallowed.
    """
    try:
        _ISSUE_BODY_CACHE_DIR.mkdir(parents=True, exist_ok=True)
        payload = _json.dumps({"ts": _time.time(), "body": body})
        _issue_cache_path(repo, number).write_text(payload)
    except OSError as exc:
        logger.warning(
            "issue-body cache write failed for %s#%d: %s", repo, number, exc,
        )
|
|
703
|
+
|
|
704
|
+
|
|
705
|
+
# Kill-switch file written when GitHub appears to be throttling us.
_RATE_LIMIT_KILL_FILE = _Path.home() / ".delimit" / "outreach_pause"
# Lower-case substrings searched for in gh's stderr.
# NOTE(review): "403" is matched as a plain substring, so any stderr
# text containing those three characters triggers the halt.
_RATE_LIMIT_SIGNATURES = (
    "rate limit", "rate-limit", "secondary rate",
    "403", "abuse detection", "too many requests",
)


def _maybe_halt_on_rate_limit(endpoint: str, stderr: str) -> None:
    """Write the outreach kill-switch file if ``stderr`` looks throttled.

    LED-2214b followup. ``stderr`` is lower-cased and searched for each
    entry of ``_RATE_LIMIT_SIGNATURES``. On a hit, the kill-switch file
    is (over)written with a UTC timestamp, the endpoint and the first
    400 characters of stderr, and an error is logged. Nothing happens
    for empty stderr or when no signature matches.

    This function only writes the file and logs; it sends no
    notification. The file is presumably checked by the outreach daemon
    before each tick (that check is not visible here).

    Best-effort: an ``OSError`` while writing is logged, not raised.
    The intent is to stop future ticks from escalating a warning into a
    hard block or ban.
    """
    if not stderr:
        return
    lowered = stderr.lower()
    for signature in _RATE_LIMIT_SIGNATURES:
        if signature in lowered:
            break
    else:
        # No signature present: not a rate-limit style failure.
        return

    stamp = _time.strftime('%Y-%m-%dT%H:%M:%SZ', _time.gmtime())
    note = (
        f"halted by _maybe_halt_on_rate_limit at "
        f"{stamp}\n"
        f"endpoint: {endpoint}\n"
        f"stderr: {stderr[:400]}\n"
    )
    try:
        _RATE_LIMIT_KILL_FILE.parent.mkdir(parents=True, exist_ok=True)
        _RATE_LIMIT_KILL_FILE.write_text(note)
        logger.error(
            "outreach RATE LIMIT detected — wrote kill-switch %s "
            "(endpoint=%s)", _RATE_LIMIT_KILL_FILE, endpoint,
        )
    except OSError as exc:
        logger.error(
            "outreach rate-limit halt failed to write kill-switch: %s", exc,
        )
|
|
744
|
+
|
|
745
|
+
|
|
746
|
+
def _gh_api_call(endpoint: str) -> Any:
    """Run ``gh api <endpoint>`` and return the parsed JSON, or None.

    Local copy of the same idiom in ai.social_target — duplicated to keep
    this module importable without pulling in the much larger
    social_target dependency graph.

    None is returned when:

    * the subprocess times out or cannot be started (``gh`` missing,
      not executable, or any other ``OSError`` at spawn time),
    * ``gh`` exits non-zero, or
    * stdout is not valid JSON.

    On a non-zero exit, stderr is first passed to
    ``_maybe_halt_on_rate_limit``, which may write the outreach
    kill-switch file.
    """
    try:
        proc = _subprocess.run(
            ["gh", "api", endpoint],
            capture_output=True,
            text=True,
            timeout=_GH_API_TIMEOUT_S,
        )
    except (_subprocess.TimeoutExpired, OSError) as exc:
        # OSError covers FileNotFoundError (gh not installed) as well as
        # PermissionError and other spawn failures, so the documented
        # "None on failure" contract holds for all of them.
        logger.warning("gh api %s failed: %s", endpoint, exc)
        return None
    if proc.returncode != 0:
        # LED-2214b followup: halt the outreach daemon on rate-limit
        # signatures BEFORE returning. Defense in depth against escalating
        # github enforcement (warn -> block -> ban).
        _maybe_halt_on_rate_limit(endpoint, proc.stderr or "")
        logger.info(
            "gh api %s returned %d: %s",
            endpoint, proc.returncode, (proc.stderr or "")[:160],
        )
        return None
    try:
        return _json.loads(proc.stdout)
    except ValueError as exc:
        logger.warning("gh api %s returned non-JSON: %s", endpoint, exc)
        return None
|
|
782
|
+
|
|
783
|
+
|
|
784
|
+
# ---------------------------------------------------------------------------
|
|
785
|
+
# Engagement-floor check (LED-2214b followup, found 2026-05-17 when first
|
|
786
|
+
# autonomous engagement landed on a same-day-created 0-star 4-follower
|
|
787
|
+
# personal scratchpad). Substantive content gate passed (anchors were
|
|
788
|
+
# valid) but engagement value was near zero — no readership, no community.
|
|
789
|
+
#
|
|
790
|
+
# This block fetches lightweight repo metadata (1 gh api call, 7-day
|
|
791
|
+
# cached) and enforces a stars + age + not-archived + not-fork floor
|
|
792
|
+
# BEFORE the anchor check. Sits parallel to the existing repo-search
|
|
793
|
+
# filter in ai/social_target.py:_scan_github line 2024 ("stars == 0 and
|
|
794
|
+
# no description: continue") which only catches REPO targets — issue
|
|
795
|
+
# targets bypass it entirely, which was the gap.
|
|
796
|
+
#
|
|
797
|
+
# Fail-closed: if we can't fetch the metadata, we DON'T engage. Better
|
|
798
|
+
# to skip a real target than spam a maintainer on stale / missing data.
|
|
799
|
+
# ---------------------------------------------------------------------------
|
|
800
|
+
|
|
801
|
+
# On-disk location of the per-repo metadata cache.
_REPO_META_CACHE_DIR = _Path.home().joinpath(
    ".delimit", "cache", "outreach_repo_meta",
)
# LED-2266: env-overridable engagement-floor thresholds.
# Defaults reproduce PR #180 shipped behavior. Each call passes a
# positive ``minimum`` so zero or negative values cannot silently
# disable the gate.
_REPO_META_CACHE_TTL_S = _env_int(
    "DELIMIT_OUTREACH_REPO_META_CACHE_TTL_S",
    7 * 24 * 3600,
    minimum=60,
)
# Minimum star count a repo needs to pass the engagement floor.
_MIN_REPO_STARS = _env_int("DELIMIT_OUTREACH_MIN_STARS", 50, minimum=1)
# Minimum repo age, in days, to pass the engagement floor.
_MIN_REPO_AGE_DAYS = _env_int("DELIMIT_OUTREACH_MIN_AGE_DAYS", 30, minimum=1)
|
|
810
|
+
|
|
811
|
+
|
|
812
|
+
def _repo_meta_cache_path(repo: str) -> _Path:
    """Return the metadata cache file path for ``repo``.

    Every ``/`` in ``repo`` becomes ``__`` so the name is a single path
    component: ``<owner>__<name>.json``.
    """
    flattened = "__".join(repo.split("/"))
    return _REPO_META_CACHE_DIR / f"{flattened}.json"
|
|
815
|
+
|
|
816
|
+
|
|
817
|
+
def _read_cached_repo_meta(repo: str) -> Optional[Dict[str, Any]]:
    """Return the cached metadata dict for ``repo``, or None if unusable.

    None is returned when the cache file is missing or unreadable, is
    not valid JSON, is valid JSON but not an object, has a missing or
    non-numeric ``_cached_ts``, is older than ``_REPO_META_CACHE_TTL_S``
    seconds, or has a ``meta`` value that is not a dict.
    """
    cache_file = _repo_meta_cache_path(repo)
    try:
        # A missing file raises FileNotFoundError (an OSError); decode
        # errors and malformed JSON are both ValueError subclasses.
        data = _json.loads(cache_file.read_text())
    except (OSError, ValueError):
        return None
    if not isinstance(data, dict):
        # Valid JSON of the wrong shape is a corrupt entry, not a crash.
        return None
    ts = data.get("_cached_ts")
    if not isinstance(ts, (int, float)) or _time.time() - ts > _REPO_META_CACHE_TTL_S:
        return None
    meta = data.get("meta")
    return meta if isinstance(meta, dict) else None
|
|
830
|
+
|
|
831
|
+
|
|
832
|
+
def _write_cached_repo_meta(repo: str, meta: Dict[str, Any]) -> None:
    """Store ``meta`` in the repo-metadata cache with the current timestamp.

    Best-effort: an ``OSError`` while creating the directory or writing
    the file is logged as a warning and swallowed.
    """
    try:
        _REPO_META_CACHE_DIR.mkdir(parents=True, exist_ok=True)
        payload = _json.dumps({"_cached_ts": _time.time(), "meta": meta})
        _repo_meta_cache_path(repo).write_text(payload)
    except OSError as exc:
        logger.warning("repo-meta cache write failed for %s: %s", repo, exc)
|
|
840
|
+
|
|
841
|
+
|
|
842
|
+
def fetch_repo_metadata(repo: str) -> Optional[Dict[str, Any]]:
    """Return lightweight metadata for ``repo`` via ``gh api repos/{repo}``.

    A fresh cache entry is returned without any API call. Otherwise the
    API response is reduced to these keys, written to the cache and
    returned: ``stargazers_count``, ``forks_count``,
    ``open_issues_count``, ``created_at``, ``pushed_at``, ``archived``,
    ``fork``, ``description`` and ``owner_login``.

    Returns None when the API call does not yield a dict. Failures are
    never cached, so the next call retries (callers fail closed).
    """
    hit = _read_cached_repo_meta(repo)
    if hit is not None:
        return hit

    payload = _gh_api_call(f"repos/{repo}")
    if not isinstance(payload, dict):
        # Don't poison cache with None — repo may exist on next attempt
        return None

    owner = payload.get("owner") or {}
    # LED-2214b followup: owner login lets the engagement-floor veto
    # owner-authored issues / PRs.
    login = owner.get("login", "") if isinstance(owner, dict) else ""

    meta = {
        "stargazers_count": payload.get("stargazers_count", 0),
        "forks_count": payload.get("forks_count", 0),
        "open_issues_count": payload.get("open_issues_count", 0),
        "created_at": payload.get("created_at", ""),
        "pushed_at": payload.get("pushed_at", ""),
        "archived": bool(payload.get("archived", False)),
        "fork": bool(payload.get("fork", False)),
        "description": payload.get("description") or "",
        "owner_login": login,
    }
    _write_cached_repo_meta(repo, meta)
    return meta
|
|
872
|
+
|
|
873
|
+
|
|
874
|
+
# LED-2214b followup: per-issue state cache. Lighter than fetch_issue_full_text
|
|
875
|
+
# (which pulls body + comments) — we only need the state field. Separate cache
|
|
876
|
+
# because issue state changes more often than repo metadata, so shorter TTL.
|
|
877
|
+
# Issue-state cache lifetime: 6 hours, short enough to catch an issue
# that is opened and closed on the same day.
_ISSUE_STATE_CACHE_TTL_S = 6 * 3600


def _issue_state_cache_path(repo: str, number: int) -> _Path:
    """Return the state-cache file path for one issue.

    The file lives in the issue-body cache directory and is named
    ``<owner>__<name>_<number>__state.json`` (every ``/`` in ``repo``
    becomes ``__``).
    """
    flattened = "__".join(repo.split("/"))
    return _ISSUE_BODY_CACHE_DIR / f"{flattened}_{number}__state.json"
|
|
883
|
+
|
|
884
|
+
|
|
885
|
+
def _read_cached_issue_state(repo: str, number: int) -> Optional[str]:
    """Return the cached state string for an issue, or None if unusable.

    None is returned when the cache file is missing or unreadable, is
    not valid JSON, is valid JSON but not an object, has a missing or
    non-numeric ``_cached_ts``, is older than
    ``_ISSUE_STATE_CACHE_TTL_S`` seconds, or has a non-string ``state``.
    """
    cache_file = _issue_state_cache_path(repo, number)
    try:
        # A missing file raises FileNotFoundError (an OSError); decode
        # errors and malformed JSON are both ValueError subclasses.
        data = _json.loads(cache_file.read_text())
    except (OSError, ValueError):
        return None
    if not isinstance(data, dict):
        # Valid JSON of the wrong shape is a corrupt entry, not a crash.
        return None
    ts = data.get("_cached_ts")
    if not isinstance(ts, (int, float)) or _time.time() - ts > _ISSUE_STATE_CACHE_TTL_S:
        return None
    state = data.get("state")
    return state if isinstance(state, str) else None
|
|
898
|
+
|
|
899
|
+
|
|
900
|
+
def _write_cached_issue_state(repo: str, number: int, state: str) -> None:
    """Store ``state`` in the issue-state cache with the current timestamp.

    Best-effort: an ``OSError`` while creating the directory or writing
    the file is logged as a warning and swallowed.
    """
    try:
        _ISSUE_BODY_CACHE_DIR.mkdir(parents=True, exist_ok=True)
        payload = _json.dumps({"_cached_ts": _time.time(), "state": state})
        _issue_state_cache_path(repo, number).write_text(payload)
    except OSError as exc:
        logger.warning(
            "issue-state cache write failed for %s#%d: %s", repo, number, exc,
        )
|
|
910
|
+
|
|
911
|
+
|
|
912
|
+
def fetch_issue_state(repo: str, number: int) -> Optional[str]:
    """Return the ``state`` field of a GitHub issue/PR, or None.

    A fresh cache entry is returned without an API call. Otherwise
    ``repos/{repo}/issues/{number}`` is fetched; a non-empty string
    ``state`` is cached and returned. Any other outcome (failed call,
    missing or empty state) returns None and caches nothing.

    Fail-closed: callers should treat None as "cannot verify the target
    is live, do not engage".
    """
    hit = _read_cached_issue_state(repo, number)
    if hit is not None:
        return hit

    payload = _gh_api_call(f"repos/{repo}/issues/{number}")
    state = payload.get("state") if isinstance(payload, dict) else None
    if not (isinstance(state, str) and state):
        return None
    _write_cached_issue_state(repo, number, state)
    return state
|
|
927
|
+
|
|
928
|
+
|
|
929
|
+
def _repo_age_days(created_at: str) -> Optional[float]:
|
|
930
|
+
"""Parse ISO timestamp and return age in days. None on parse failure."""
|
|
931
|
+
if not created_at:
|
|
932
|
+
return None
|
|
933
|
+
try:
|
|
934
|
+
# Strip fractional seconds + Z suffix
|
|
935
|
+
clean = created_at.replace("Z", "").split(".")[0]
|
|
936
|
+
epoch = _time.mktime(_time.strptime(clean, "%Y-%m-%dT%H:%M:%S")) - _time.timezone
|
|
937
|
+
except (ValueError, TypeError):
|
|
938
|
+
return None
|
|
939
|
+
return (_time.time() - epoch) / 86400.0
|
|
940
|
+
|
|
941
|
+
|
|
942
|
+
def check_engagement_floor(repo: str) -> Tuple[bool, str]:
    """Decide whether ``repo`` clears the engagement-worthiness floor.

    Returns ``(passes, reason)``. ``reason`` is ``"ok"`` on success,
    otherwise a short tag for the caller to log: ``no_metadata``,
    ``archived``, ``fork``, ``stars<MIN:actual`` or
    ``age_days<MIN:actual``. Checks run in that order and the first
    failure wins. Thresholds are ``_MIN_REPO_STARS`` and
    ``_MIN_REPO_AGE_DAYS``.

    NOTE(review): when ``created_at`` cannot be parsed, the age check is
    skipped and the repo still passes.
    """
    meta = fetch_repo_metadata(repo)
    if meta is None:
        return False, "no_metadata"

    # The flag name doubles as the failure tag.
    for flag in ("archived", "fork"):
        if meta.get(flag):
            return False, flag

    stars = meta.get("stargazers_count", 0) or 0
    if stars < _MIN_REPO_STARS:
        return False, f"stars<{_MIN_REPO_STARS}:{stars}"

    age = _repo_age_days(meta.get("created_at", ""))
    if age is None or age >= _MIN_REPO_AGE_DAYS:
        return True, "ok"
    return False, f"age_days<{_MIN_REPO_AGE_DAYS}:{age:.1f}"
|
|
964
|
+
|
|
965
|
+
|
|
966
|
+
def fetch_issue_full_text(repo: str, number: int) -> str:
    """Fetch an issue's body plus its first comments, joined by blank lines.

    A fresh cache entry is returned without any API call. Otherwise the
    issue and up to ``_ISSUE_COMMENTS_FETCH_LIMIT`` comments are fetched
    through ``_gh_api_call``.

    Returns ``""`` when the issue fetch fails; the caller treats empty
    text as "no anchors available", which blocks dispatch.

    Only complete results are cached. A failed issue fetch, or a failed
    comments fetch, is not written to the cache, so a transient error
    (timeout, throttling) is retried on the next call instead of being
    pinned for the whole cache TTL. This matches ``fetch_repo_metadata``
    and ``fetch_issue_state``, which also never cache failures.

    Public surface (no underscore prefix) so tests + callers can
    monkeypatch without depending on the private cache helpers.
    """
    cached = _read_cached_issue_body(repo, number)
    if cached is not None:
        return cached

    issue = _gh_api_call(f"repos/{repo}/issues/{number}")
    if not isinstance(issue, dict):
        # Fetch failed: report "no text" but leave the cache untouched.
        return ""

    parts: List[str] = []
    body = issue.get("body")
    if isinstance(body, str) and body:
        parts.append(body)

    comments = _gh_api_call(
        f"repos/{repo}/issues/{number}/comments?per_page={_ISSUE_COMMENTS_FETCH_LIMIT}"
    )
    comments_ok = isinstance(comments, list)
    if comments_ok:
        for c in comments[:_ISSUE_COMMENTS_FETCH_LIMIT]:
            if isinstance(c, dict):
                cb = c.get("body")
                if isinstance(cb, str) and cb:
                    parts.append(cb)

    full = "\n\n".join(parts)
    if comments_ok:
        # Cache only when both fetches succeeded; a body-only partial
        # result is still returned but will be re-fetched next time.
        _write_cached_issue_body(repo, number, full)
    return full
|
|
1003
|
+
|
|
1004
|
+
|
|
1005
|
+
# ---------------------------------------------------------------------------
|
|
1006
|
+
# Anti-spam — protect the operating account from github enforcement
|
|
1007
|
+
# ---------------------------------------------------------------------------
|
|
1008
|
+
#
|
|
1009
|
+
# Three hard limits on top of the per-tick spam firewall
|
|
1010
|
+
# (DEFAULT_MAX_DISPATCH=3) in the daemon:
|
|
1011
|
+
#
|
|
1012
|
+
# 1. Per-repo cooldown: don't dispatch on a repo we already dispatched
|
|
1013
|
+
# to within the last _DISPATCH_COOLDOWN_DAYS days. Avoids the
|
|
1014
|
+
# "scanner finds 3 issues on the SAME repo in one tick + we
|
|
1015
|
+
# engage on all of them = swarm" failure mode.
|
|
1016
|
+
# 2. Per-day global cap: refuse dispatch once we've crossed
|
|
1017
|
+
# _MAX_DISPATCHES_PER_DAY in the rolling 24-hour window. Catches
|
|
1018
|
+
# multiple-tick scenarios (manual run + scheduled run + retry)
|
|
1019
|
+
# that would multiply the per-tick cap.
|
|
1020
|
+
# 3. Halt on rate-limit (in _gh_api_call): if gh api returns 403/429,
|
|
1021
|
+
# write the kill-switch file and ntfy. GitHub typically warns
|
|
1022
|
+
# before banning; respecting that warning protects the account.
|
|
1023
|
+
#
|
|
1024
|
+
# The dispatch log at _DISPATCH_LOG is the source of truth for #1 and #2.
|
|
1025
|
+
# It's append-only JSONL; each successful dispatch_substantive_outreach
|
|
1026
|
+
# call writes one line.
|
|
1027
|
+
|
|
1028
|
+
# Append-only JSONL log of dispatches; read by the cooldown and
# per-day-cap checks.
_DISPATCH_LOG = _Path.home().joinpath(
    ".delimit", "state", "outreach-dispatch-log.jsonl",
)
# LED-2266: env-overridable anti-spam thresholds (PR #179 follow-up
# panel-flagged). Defaults reproduce shipped behavior. minimum=1 is
# passed because zero would silently disable the spam protection.
_DISPATCH_COOLDOWN_DAYS = _env_int(
    "DELIMIT_OUTREACH_COOLDOWN_DAYS", 7, minimum=1,
)
_MAX_DISPATCHES_PER_DAY = _env_int(
    "DELIMIT_OUTREACH_MAX_PER_DAY", 5, minimum=1,
)
|
|
1034
|
+
|
|
1035
|
+
|
|
1036
|
+
def _read_dispatch_log() -> List[Dict[str, Any]]:
    """Return all dispatch-log entries, newest first.

    Reads ``_DISPATCH_LOG`` as JSONL. Blank lines, lines that are not
    valid JSON and lines whose JSON value is not an object are skipped.
    Entries are sorted by the string form of their ``ts`` field,
    descending.

    Best-effort and never raises: a missing file gives ``[]``; an
    unreadable or undecodable file logs a warning and gives ``[]``.
    """
    try:
        raw = _DISPATCH_LOG.read_text()
    except FileNotFoundError:
        # No dispatch has been recorded yet.
        return []
    except (OSError, ValueError) as exc:
        # ValueError covers UnicodeDecodeError from a corrupted file.
        logger.warning("dispatch log read failed: %s", exc)
        return []

    entries: List[Dict[str, Any]] = []
    for line in raw.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            record = _json.loads(line)
        except ValueError:
            continue
        if isinstance(record, dict):
            # Bare numbers/strings/lists are valid JSON but not log
            # entries; keeping them would break ``.get`` below and in
            # every caller.
            entries.append(record)

    # str(...) keeps the sort total even if some ``ts`` is missing, None
    # or not a string.
    entries.sort(key=lambda r: str(r.get("ts") or ""), reverse=True)
    return entries
|
|
1056
|
+
|
|
1057
|
+
|
|
1058
|
+
def _record_dispatch(repo: str, fingerprint: str, category: str) -> None:
    """Append one JSON line describing a dispatch to ``_DISPATCH_LOG``.

    The entry holds a UTC ``ts`` (``YYYY-MM-DDTHH:MM:SSZ``), ``repo``,
    ``fingerprint`` and ``category``. Best-effort: an ``OSError`` is
    logged as a warning and swallowed, so a logging failure cannot crash
    the dispatch.
    """
    try:
        _DISPATCH_LOG.parent.mkdir(parents=True, exist_ok=True)
        stamp = _time.strftime("%Y-%m-%dT%H:%M:%SZ", _time.gmtime())
        record = {
            "ts": stamp,
            "repo": repo,
            "fingerprint": fingerprint,
            "category": category,
        }
        serialized = _json.dumps(record) + "\n"
        with _DISPATCH_LOG.open("a") as handle:
            handle.write(serialized)
    except OSError as exc:
        logger.warning("dispatch log write failed: %s", exc)
|
|
1073
|
+
|
|
1074
|
+
|
|
1075
|
+
def _check_per_repo_cooldown(repo: str, now: float | None = None) -> Optional[str]:
    """Return the cooldown-expiry timestamp if ``repo`` is cooling down.

    Scans the dispatch log for an entry whose ``repo`` matches
    (case-insensitive, whitespace-trimmed) and whose ``ts`` lies within
    the last ``_DISPATCH_COOLDOWN_DAYS`` days. For the first such entry
    (the log is newest-first) the result is ``ts + cooldown`` formatted
    as ``YYYY-MM-DDTHH:MM:SSZ``. Returns None when ``repo`` is empty or
    no entry qualifies. Entries with an unparseable ``ts`` are skipped.

    `now` is overridable for tests. Defaults to current UTC epoch.

    Log timestamps are UTC and converted with ``calendar.timegm``. The
    previous ``mktime(...) - timezone`` shifted entries by an hour
    during local DST.
    """
    import calendar  # function-scope: only needed for the UTC conversion

    if not repo:
        return None
    if now is None:
        now = _time.time()
    window_s = _DISPATCH_COOLDOWN_DAYS * 86400
    cutoff = now - window_s
    wanted = repo.strip().lower()
    for entry in _read_dispatch_log():
        if (entry.get("repo") or "").strip().lower() != wanted:
            continue
        try:
            entry_epoch = calendar.timegm(
                _time.strptime(entry.get("ts", ""), "%Y-%m-%dT%H:%M:%SZ")
            )
        except (ValueError, TypeError):
            continue
        if entry_epoch >= cutoff:
            # Compute cooldown-expiry as entry_ts + cooldown_days
            return _time.strftime(
                "%Y-%m-%dT%H:%M:%SZ", _time.gmtime(entry_epoch + window_s),
            )
    return None
|
|
1098
|
+
|
|
1099
|
+
|
|
1100
|
+
def _check_per_day_cap(now: float | None = None) -> int:
    """Count dispatch-log entries inside the rolling 24-hour window.

    The caller compares the result with ``_MAX_DISPATCHES_PER_DAY``.
    Entries with an unparseable ``ts`` are not counted.

    `now` is overridable for tests. Defaults to the current epoch.

    Log timestamps are UTC and converted with ``calendar.timegm``. The
    previous ``mktime(...) - timezone`` shifted entries by an hour
    during local DST, moving the window edge.
    """
    import calendar  # function-scope: only needed for the UTC conversion

    if now is None:
        now = _time.time()
    cutoff = now - 86400
    count = 0
    for entry in _read_dispatch_log():
        try:
            entry_epoch = calendar.timegm(
                _time.strptime(entry.get("ts", ""), "%Y-%m-%dT%H:%M:%SZ")
            )
        except (ValueError, TypeError):
            continue
        if entry_epoch >= cutoff:
            count += 1
    return count
|
|
1116
|
+
|
|
1117
|
+
|
|
1118
|
+
def build_candidate_from_github_target(
    target: Dict[str, Any],
    category: str,
    subcategory: str = "",
) -> Optional[SubstantiveCandidate]:
    """Build a :class:`SubstantiveCandidate` from a scanner target, or None.

    The function returns None — it does not raise — whenever the target
    must not be dispatched; callers that get None must NOT dispatch.
    Gates run in this order and the first to fail wins:

    1. ``is_banking_adjacent`` flags the target (hard veto).
    2. No ``owner/name`` repo can be derived from the target.
    3. ``category`` is not a key of ``_CATEGORY_TO_ACTION``.
    4. The repo fails ``check_engagement_floor``.
    5. The target's ``author`` equals the repo owner's login.
    6. For issue fingerprints: the issue state cannot be fetched, or is
       not ``"open"``.
    7. The repo is inside its per-repo dispatch cooldown.
    8. The rolling 24-hour dispatch count has reached
       ``_MAX_DISPATCHES_PER_DAY``.
    9. No technical anchor can be extracted from snippet + rationale
       (plus the fetched issue text, for issue targets).
    10. Neither ``canonical_url`` nor ``fingerprint`` is available.
    11. ``SubstantiveCandidate`` construction raises ``ValueError``.

    The banking check runs here as well as at submit time so such
    targets never reach the agent prompt (defense in depth).
    """
    fingerprint = target.get("fingerprint")
    anchor_keys = ("issues", "spec_paths", "cves", "commits", "file_paths")

    # 1. Banking-adjacent veto.
    is_adjacent, keyword = is_banking_adjacent(target)
    if is_adjacent:
        logger.info(
            "build_candidate: banking-adjacent veto fingerprint=%s matched=%s",
            fingerprint, keyword,
        )
        return None

    # 2. Repo resolution.
    repo = _repo_from_target(target)
    if not repo:
        logger.info(
            "build_candidate: repo unresolved fingerprint=%s url=%s",
            fingerprint, target.get("canonical_url"),
        )
        return None

    # 3. Category must map to an action.
    if category not in _CATEGORY_TO_ACTION:
        logger.info("build_candidate: unmapped category=%s", category)
        return None

    # 4. Engagement floor (LED-2214b followup). Runs before anchor
    # extraction and the body fetch so no per-issue API cost is paid for
    # a repo that would be rejected anyway.
    floor_ok, floor_reason = check_engagement_floor(repo)
    if not floor_ok:
        logger.info(
            "build_candidate: engagement floor fingerprint=%s repo=%s reason=%s",
            fingerprint, repo, floor_reason,
        )
        return None

    # 5. Owner-authored veto (LED-2214b followup). The floor check above
    # has already fetched the metadata, so this lookup is normally
    # served from cache.
    repo_meta = fetch_repo_metadata(repo)
    if repo_meta is not None:
        owner_login = (repo_meta.get("owner_login") or "").strip().lower()
        target_author = (target.get("author") or "").strip().lower()
        # A non-empty owner equal to the author implies the author is
        # non-empty too.
        if owner_login and owner_login == target_author:
            logger.info(
                "build_candidate: owner-authored target fingerprint=%s "
                "author=%s == owner=%s",
                fingerprint, target_author, owner_login,
            )
            return None

    # 6. Issue must be verifiably open (LED-2214b followup). Only issue
    # fingerprints parse; other targets skip this gate.
    issue_ref = _issue_fp_parts(fingerprint)
    if issue_ref is not None:
        state = fetch_issue_state(issue_ref[0], issue_ref[1])
        if state is None:
            # Fail-closed: can't verify the issue is live → skip
            logger.info(
                "build_candidate: issue state unverifiable fingerprint=%s",
                fingerprint,
            )
            return None
        if state != "open":
            logger.info(
                "build_candidate: issue state=%s (not open) fingerprint=%s",
                state, fingerprint,
            )
            return None

    # 7. Per-repo cooldown (anti-spam, LED-2214b followup).
    cooldown_expires = _check_per_repo_cooldown(repo)
    if cooldown_expires:
        logger.info(
            "build_candidate: per-repo cooldown fingerprint=%s repo=%s "
            "expires=%s",
            fingerprint, repo, cooldown_expires,
        )
        return None

    # 8. Global rolling-24h cap.
    today_count = _check_per_day_cap()
    if today_count >= _MAX_DISPATCHES_PER_DAY:
        logger.warning(
            "build_candidate: per-day cap hit fingerprint=%s "
            "today_count=%d cap=%d",
            fingerprint, today_count, _MAX_DISPATCHES_PER_DAY,
        )
        return None

    # 9. Anchor extraction, first from the short scanner text.
    snippet = target.get("content_snippet", "") or ""
    rationale = target.get("rationale", "") or ""
    anchors = extract_technical_anchors(f"{snippet}\n{rationale}")

    # LED-2214b followup: if the snippet yielded no anchors and this is
    # an issue target, fetch the full issue text and re-extract. A failed
    # fetch returns "" and leaves ``anchors`` unchanged, so it still
    # blocks.
    needs_body_fetch = issue_ref is not None and not any(
        anchors.get(k) for k in anchor_keys
    )
    if needs_body_fetch:
        body = fetch_issue_full_text(issue_ref[0], issue_ref[1])
        if body:
            anchors = extract_technical_anchors(
                f"{snippet}\n{rationale}\n{body}"
            )

    # Flatten anchors into ``<singular-kind>:<ref>`` labels, keeping the
    # first occurrence of each.
    labels = [
        f"{key[:-1] if key.endswith('s') else key}:{ref}"
        for key in anchor_keys
        for ref in anchors.get(key, [])
    ]
    evidence_refs: List[str] = list(dict.fromkeys(labels))
    if not evidence_refs:
        logger.info(
            "build_candidate: no_technical_anchor fingerprint=%s category=%s "
            "(body_fetched=%s)",
            fingerprint, category, needs_body_fetch,
        )
        return None

    # 10. There must be something to point the agent at.
    target_artifact = target.get("canonical_url") or fingerprint
    if not target_artifact:
        return None

    # 11. The dataclass validates its own fields.
    try:
        return SubstantiveCandidate(
            repo=repo,
            category=category,
            target_artifact=target_artifact,
            evidence_refs=tuple(evidence_refs),
            proposed_action=_CATEGORY_TO_ACTION[category],
            subcategory=subcategory or "",
            venture=target.get("venture", "delimit"),
            fingerprint=fingerprint or "",
        )
    except ValueError as exc:
        logger.warning(
            "build_candidate: construction failed for fingerprint=%s: %s",
            fingerprint, exc,
        )
        return None
|
|
1296
|
+
|
|
1297
|
+
|
|
1298
|
+
# ---------------------------------------------------------------------------
|
|
1299
|
+
# Dispatch wrapper
|
|
1300
|
+
# ---------------------------------------------------------------------------
|
|
1301
|
+
|
|
1302
|
+
|
|
1303
|
+
# Task type used for substantive GitHub outreach dispatches.
OUTREACH_SUBSTANTIVE_TASK_TYPE = "outreach_substantive"
|
|
1304
|
+
|
|
1305
|
+
|
|
1306
|
+
def dispatch_substantive_outreach(
    candidate: SubstantiveCandidate,
    target: Dict[str, Any],
    ledger_item_id: str = "",
) -> Dict[str, Any]:
    """Dispatch a substantive outreach task — only fires on a real payload.

    The payload is the :class:`SubstantiveCandidate` — its construction
    has already enforced that every required evidence field is present.
    The task_type ``outreach_substantive`` is distinct from the legacy
    ``outreach`` type (which still serves reddit / x branches) so a
    regression that tries to dispatch a non-substantive github task on
    the old type does not silently route to the new agent.

    The agent that picks up this task is expected to call
    ``delimit_substantive_content_check`` BEFORE submitting any draft
    body, and ``delimit_external_pr_check`` BEFORE submitting if the
    action is ``pr``. Those gates live in :mod:`ai.server`.

    Args:
        candidate: The validated outreach payload. Must be a
            :class:`SubstantiveCandidate` instance; anything else is
            rejected rather than coerced.
        target: The source target mapping. Only ``canonical_url``,
            ``author`` and ``rationale`` are read from it here, each
            defaulting to ``""`` when absent.
        ledger_item_id: Optional ledger item to link to the new task.
            Linking is attempted only when this is non-empty and the
            dispatch result carries a ``task_id``.

    Returns:
        The result returned by ``dispatch_task``, unchanged.

    Raises:
        TypeError: If ``candidate`` is not a :class:`SubstantiveCandidate`.
    """
    if not isinstance(candidate, SubstantiveCandidate):
        # Belt-and-suspenders: the dataclass cannot be constructed
        # without the required fields, but a caller might still pass
        # a stray dict. Refuse rather than coerce.
        raise TypeError(
            "dispatch_substantive_outreach requires a SubstantiveCandidate "
            f"instance, got {type(candidate).__name__}"
        )

    # Late-bound import to keep the foundation module light and the
    # cyclic-import surface clean.
    from ai.agent_dispatch import dispatch_task, link_ledger_item

    constraints = [
        "no-deploy", "no-secrets", "no-destructive",
        "shift-1-quiet-attraction",
        "must-call-delimit_substantive_content_check-before-submit",
    ]
    tools_needed = [
        "delimit_substantive_content_check",
        "delimit_sensor_github_issue",
    ]
    # PR actions carry one extra mandatory gate, both as a constraint and
    # as a tool the agent needs.
    if candidate.proposed_action == "pr":
        constraints.append("must-call-delimit_external_pr_check-before-submit")
        tools_needed.append("delimit_external_pr_check")

    variables: Dict[str, Any] = {
        "candidate": candidate.to_dict(),
        "venture": candidate.venture,
        "repo": candidate.repo,
        "category": candidate.category,
        "subcategory": candidate.subcategory,
        "target_artifact": candidate.target_artifact,
        "evidence_refs": list(candidate.evidence_refs),
        "proposed_action": candidate.proposed_action,
        "source_url": target.get("canonical_url", ""),
        "source_fingerprint": candidate.fingerprint,
        "author": target.get("author", ""),
        "rationale": target.get("rationale", ""),
    }

    title = (
        f"[{candidate.venture.upper()}] Substantive {candidate.proposed_action} "
        f"on {candidate.repo} ({candidate.category})"
    )

    description = (
        "Substantive-outreach task (LED-2214b architecture).\n"
        f"Repo: {candidate.repo}\n"
        f"Category: {candidate.category}"
        f"{' / ' + candidate.subcategory if candidate.subcategory else ''}\n"
        f"Action: {candidate.proposed_action}\n"
        f"Target: {candidate.target_artifact}\n"
        f"Evidence: {', '.join(candidate.evidence_refs)}\n"
        "\n"
        "SHIFT-1 constraints:\n"
        " - Pseudonymous account only; no founder identity.\n"
        " - Real technical contribution only. No 'we built' / 'our tool' / "
        "'btw try' framing. Never name our own product in the body.\n"
        " - delimit_substantive_content_check is MANDATORY pre-submit.\n"
        " - delimit_external_pr_check is MANDATORY when proposed_action='pr'.\n"
    )

    context = (
        "Substantive autonomous outreach via the LED-2214b architecture. "
        "The pseudonymous-substantive-contribution carve-out (CLAUDE.md SHIFT-1, "
        "2026-05-04) permits this provided the activity is a genuine technical "
        "contribution. The pre-submit gate stack enforces that. If the gate "
        "blocks, file the rejection reason on the linked ledger item and stop."
    )

    result = dispatch_task(
        title=title,
        description=description,
        assignee="any",
        priority="P1",
        tools_needed=tools_needed,
        constraints=constraints,
        context=context,
        task_type=OUTREACH_SUBSTANTIVE_TASK_TYPE,
        venture=candidate.venture,
        variables=variables,
        # Prefer the fingerprint as the external key; fall back to
        # repo + target artifact when the candidate has no fingerprint.
        external_key=(
            f"outreach_substantive:{candidate.fingerprint}"
            if candidate.fingerprint
            else f"outreach_substantive:{candidate.repo}:{candidate.target_artifact}"
        ),
    )
    task_id = result.get("task_id", "")
    if task_id and ledger_item_id:
        try:
            link_ledger_item(task_id, ledger_item_id)
        except Exception as exc:  # link is best-effort
            logger.warning(
                "dispatch_substantive_outreach: link_ledger_item failed "
                "task=%s ledger=%s err=%s",
                task_id, ledger_item_id, exc,
            )

    # LED-2214b followup — record the dispatch for per-repo cooldown +
    # per-day cap. Append-only JSONL; subsequent build_candidate calls
    # read this log via _check_per_repo_cooldown / _check_per_day_cap.
    # Best-effort; logging failures must not crash a successful dispatch,
    # so any error raised while recording is logged and swallowed here.
    if task_id:
        try:
            _record_dispatch(
                repo=candidate.repo,
                fingerprint=candidate.fingerprint,
                category=candidate.category,
            )
        except Exception as exc:  # recording is best-effort
            logger.warning(
                "dispatch_substantive_outreach: _record_dispatch failed "
                "task=%s repo=%s err=%s",
                task_id, candidate.repo, exc,
            )

    return result
|