delimit-cli 4.6.0 → 4.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,8 +2,8 @@
2
2
 
3
3
  Implements the autonomous-github-outreach architecture ratified by the
4
4
  2026-05-11 deliberation (A1 + Codex payload amendment, B3 + Claude reg-O
5
- target-side veto, C1 single-responsibility daemon). Transcript:
6
- ``/home/delimit/delimit-private/deliberations/2026-05-11-autonomous-github-outreach-architecture.md``.
5
+ target-side veto, C1 single-responsibility daemon). Transcript stored
6
+ privately.
7
7
 
8
8
  The three SHIFT-1 holes this module closes:
9
9
 
@@ -42,14 +42,63 @@ that ticks scanner → file ledger → dispatch.
42
42
 
43
43
  from __future__ import annotations
44
44
 
45
+ import json as _json
45
46
  import logging
47
+ import os as _os
46
48
  import re
49
+ import subprocess as _subprocess
50
+ import time as _time
47
51
  from dataclasses import asdict, dataclass, field
52
+ from pathlib import Path as _Path
48
53
  from typing import Any, Dict, List, Optional, Tuple
49
54
 
50
55
  logger = logging.getLogger("delimit.ai.outreach_substantive")
51
56
 
52
57
 
58
+ # ---------------------------------------------------------------------------
59
+ # LED-2266: env-configurable thresholds for the outreach gate stack.
60
+ #
61
+ # Each defense layer has a default value chosen during initial deployment
62
+ # (PR #179 anti-spam, PR #180 engagement-floor). Operators can tune any
63
+ # of them via env var without code changes — useful for trying tighter
64
+ # thresholds on a new venture, or loosening when scanner yield is low.
65
+ #
66
+ # Defaults are conservative: they reproduce the PR-as-shipped behavior
67
+ # when no env var is set. The lookup helpers below are the single source
68
+ # of truth — module constants below resolve through them at import time
69
+ # so each threshold is documented in one place.
70
+ # ---------------------------------------------------------------------------
71
+
72
+
73
+ def _env_int(name: str, default: int, minimum: int = 0) -> int:
74
+ """Read an int env var; fall back to `default` on missing/invalid.
75
+
76
+ Enforces `minimum` (e.g. >=1 for caps) to reject zero/negative
77
+ overrides that would silently disable a defense. Logs at WARNING
78
+ when an override is applied OR rejected so operators can see what
79
+ the engine is actually using.
80
+ """
81
+ raw = _os.environ.get(name, "").strip()
82
+ if not raw:
83
+ return default
84
+ try:
85
+ value = int(raw)
86
+ except ValueError:
87
+ logger.warning(
88
+ "config: %s=%r is not an integer — using default=%d", name, raw, default,
89
+ )
90
+ return default
91
+ if value < minimum:
92
+ logger.warning(
93
+ "config: %s=%d below floor %d — using default=%d",
94
+ name, value, minimum, default,
95
+ )
96
+ return default
97
+ if value != default:
98
+ logger.warning("config: %s overridden default=%d -> %d", name, default, value)
99
+ return value
100
+
101
+
53
102
  # ---------------------------------------------------------------------------
54
103
  # Constants — keep these auditable. Edits require panel deliberation per
55
104
  # the CLAUDE.md SHIFT-1 constitutional binding.
@@ -232,10 +281,19 @@ def is_banking_adjacent(target: Dict[str, Any]) -> Tuple[bool, str]:
232
281
  + ``repo_description`` if present). Match is substring + case
233
282
  insensitive on the lowercased haystack.
234
283
 
284
+ LED-2265: also checks the org/username portion of the canonical URL
285
+ for typo-squat impersonation of known regulated entities (e.g.
286
+ ``JPM0RCHASE`` for ``jpmorgan``, ``g0ldman`` for ``goldman``). The
287
+ raw keyword pass above misses these because the user-facing string
288
+ isn't a banking-noun; the impersonation IS the signal. Defense in
289
+ depth — the substantive engagement path should never land on a
290
+ spoofed-bank account regardless of the repo's content topic.
291
+
235
292
  The first-match-wins return makes the logged reason actionable
236
- ("matched 'broker-dealer' in repo_description"). Callers should
237
- treat any True return as a hard veto — no override path exists at
238
- the scanner layer, by design.
293
+ ("matched 'broker-dealer' in repo_description" or "matched
294
+ typosquat:jpmorgan in author=JPM0RCHASE"). Callers should treat any
295
+ True return as a hard veto — no override path exists at the scanner
296
+ layer, by design.
239
297
  """
240
298
  haystack_parts: List[str] = []
241
299
  for key in (
@@ -251,9 +309,110 @@ def is_banking_adjacent(target: Dict[str, Any]) -> Tuple[bool, str]:
251
309
  for kw in BANKING_ADJACENT_KEYWORDS:
252
310
  if kw in haystack:
253
311
  return True, kw
312
+
313
+ # LED-2265: typo-squat impersonation of known regulated orgs.
314
+ typosquat = _is_typosquat_impersonation(target)
315
+ if typosquat:
316
+ return True, f"typosquat:{typosquat}"
317
+
254
318
  return False, ""
255
319
 
256
320
 
321
+ # LED-2265: known-regulated-entity org names. Used by the typo-squat
322
+ # impersonation check below. Names are lowercased and stored without
323
+ # common suffixes (`-bank`, `-chase`, etc.). Conservative list — false
324
+ # positives cost zero (we just don't engage), false negatives risk
325
+ # substantive engagement with a malicious impersonator.
326
+ _KNOWN_REGULATED_ORGS: Tuple[str, ...] = (
327
+ # Tier-1 US banks
328
+ "jpmorgan", "jpmorganchase", "chase", "goldman", "goldmansachs",
329
+ "morganstanley", "citi", "citigroup", "citibank",
330
+ "bankofamerica", "bofa", "wellsfargo", "usbank", "pnc", "truist",
331
+ "capitalone",
332
+ # Foreign G-SIBs
333
+ "hsbc", "barclays", "deutschebank", "credit-suisse", "creditsuisse",
334
+ "ubs", "santander", "bnpparibas", "societegenerale", "ing", "lloyds",
335
+ # US clearing / capital markets
336
+ "blackrock", "vanguard", "fidelity", "schwab", "interactive-brokers",
337
+ "interactivebrokers", "nyse", "nasdaq",
338
+ # Crypto / fintech with bank rails
339
+ "coinbase", "kraken", "circle", "tether", "binance",
340
+ # Card networks
341
+ "visa", "mastercard", "amex", "americanexpress",
342
+ # Regulators
343
+ "fdic", "occ", "frb", "federalreserve", "finra", "secgov",
344
+ )
345
+
346
+
347
+ # LED-2265: simple homoglyph map for digit-for-letter substitutions.
348
+ # Keys are digits commonly used as letter substitutes; values are the
349
+ # letter they impersonate. Asymmetric on purpose (we transform a
350
+ # candidate username INTO a likely impersonated name, then compare).
351
+ _HOMOGLYPH_DIGITS: Dict[str, str] = {
352
+ "0": "o", "1": "i", "3": "e", "4": "a", "5": "s", "7": "t",
353
+ }
354
+
355
+
356
+ def _normalize_for_typosquat(name: str) -> str:
357
+ """Lowercase + strip non-alphanumeric + map digits to letters via the
358
+ homoglyph table. ``JPM0RCHASE`` → ``jpmorchase`` (after step 1) →
359
+ ``jpmorchase`` (digits absent). Used both for the candidate org name
360
+ and as the comparison target — but the comparison list is built
361
+ from raw _KNOWN_REGULATED_ORGS (already letters only), so the
362
+ homoglyph step does the work."""
363
+ alphanum = re.sub(r"[^a-z0-9]", "", name.lower())
364
+ return "".join(_HOMOGLYPH_DIGITS.get(c, c) for c in alphanum)
365
+
366
+
367
+ def _is_typosquat_impersonation(target: Dict[str, Any]) -> str:
368
+ """Return the matched known-org name if the target's author/org/repo
369
+ appears to impersonate a regulated entity via digit-for-letter
370
+ substitution. Returns "" if no impersonation suspected.
371
+
372
+ Checks BOTH the github username AND the repo-name segment. Real
373
+ JPMorgan engagement would be ``jpmorganchase/<repo>`` — anything
374
+ matching the impersonation pattern that ISN'T the canonical org is
375
+ flagged.
376
+ """
377
+ # Collect the candidate name parts: author (github username) and the
378
+ # owner/name segment of the canonical_url.
379
+ candidates: List[str] = []
380
+ author = target.get("author") or ""
381
+ if isinstance(author, str) and author:
382
+ candidates.append(author)
383
+ url = target.get("canonical_url") or ""
384
+ if isinstance(url, str) and url:
385
+ m = re.match(r"^https?://github\.com/([^/]+)/([^/?#]+)", url)
386
+ if m:
387
+ candidates.append(m.group(1)) # org/user
388
+ candidates.append(m.group(2)) # repo name
389
+ fp = target.get("fingerprint") or ""
390
+ if isinstance(fp, str) and fp:
391
+ m = re.match(r"^github:[^:]+:([^/:]+)(?:/([^:]+))?", fp)
392
+ if m:
393
+ candidates.append(m.group(1))
394
+ if m.group(2):
395
+ candidates.append(m.group(2))
396
+
397
+ for cand in candidates:
398
+ # Only digit-bearing candidates can be homoglyph typosquats.
399
+ # A pure-letter username like ``goldman`` would either be the
400
+ # legit org (caught by BANKING_ADJACENT_KEYWORDS keyword pass)
401
+ # or some other case (e.g. ``goldman-recipes``) where we don't
402
+ # have positive evidence of impersonation intent. Digits are
403
+ # the disambiguator.
404
+ if not any(c.isdigit() for c in cand):
405
+ continue
406
+ normalized = _normalize_for_typosquat(cand)
407
+ if not normalized:
408
+ continue
409
+ for org in _KNOWN_REGULATED_ORGS:
410
+ if org in normalized:
411
+ return org
412
+
413
+ return ""
414
+
415
+
257
416
  # ---------------------------------------------------------------------------
258
417
  # Technical-anchor extraction + content gate
259
418
  # ---------------------------------------------------------------------------
@@ -466,6 +625,496 @@ _CATEGORY_TO_ACTION = {
466
625
  }
467
626
 
468
627
 
628
+ # ---------------------------------------------------------------------------
629
+ # Issue-body fetch + cache (LED-2214b followup)
630
+ #
631
+ # The scanner truncates issue bodies to 200 chars before they reach the
632
+ # substantive gate (see ai/social_target.py:_scan_github phase 2). 200
633
+ # chars covers the title + opening summary but almost always strips the
634
+ # tail where anchors live — stack traces, file paths in error messages,
635
+ # references to other issues/commits. Result: every issue target gets
636
+ # rejected as no-anchor even when the issue body is anchor-rich.
637
+ #
638
+ # This block fetches the FULL issue body + first N comments via gh CLI
639
+ # when the snippet-derived extraction comes up empty. Per-issue 7-day
640
+ # disk cache; daily tick at max_dispatch=3 means worst-case ~3 API calls
641
+ # per day after cache warms.
642
+ # ---------------------------------------------------------------------------
643
+
644
# Directory under the user's home where fetched issue text is cached, one
# JSON file per issue (see _issue_cache_path for the file naming).
_ISSUE_BODY_CACHE_DIR = _Path.home() / ".delimit" / "cache" / "outreach_issue_bodies"
# LED-2266: env-overridable via DELIMIT_OUTREACH_ISSUE_BODY_CACHE_TTL_S.
# Default 7 days. Minimum 60s (don't disable caching outright; would
# spam the github api on every tick).
# Resolved once, at import time; changing the env var afterwards has no
# effect on an already-imported module.
_ISSUE_BODY_CACHE_TTL_S = _env_int(
    "DELIMIT_OUTREACH_ISSUE_BODY_CACHE_TTL_S", 7 * 24 * 3600, minimum=60,
)
# Maximum number of comments requested (per_page) and kept per issue.
_ISSUE_COMMENTS_FETCH_LIMIT = 5
# Timeout, in seconds, passed to each `gh api` subprocess call.
_GH_API_TIMEOUT_S = 30
653
+ _ISSUE_FP_RE = re.compile(r"^github:issue:([^/:]+/[^/:]+):(\d+)$")
654
+
655
+
656
+ def _issue_fp_parts(fingerprint: str) -> Optional[Tuple[str, int]]:
657
+ """Extract (repo, issue_number) from a ``github:issue:owner/name:N`` fp.
658
+
659
+ Returns None for any non-issue fingerprint, so callers can use the
660
+ None return as the "skip body fetch" signal.
661
+ """
662
+ m = _ISSUE_FP_RE.match(fingerprint or "")
663
+ if not m:
664
+ return None
665
+ try:
666
+ return m.group(1), int(m.group(2))
667
+ except (TypeError, ValueError):
668
+ return None
669
+
670
+
671
def _issue_cache_path(repo: str, number: int) -> _Path:
    """Return the cache file path for issue ``number`` of ``repo``.

    Every ``/`` in ``repo`` becomes ``__`` so the slug is a single file
    name inside ``_ISSUE_BODY_CACHE_DIR``.
    """
    flattened = "__".join(repo.split("/"))
    return _ISSUE_BODY_CACHE_DIR.joinpath(f"{flattened}_{number}.json")
674
+
675
+
676
def _read_cached_issue_body(repo: str, number: int) -> Optional[str]:
    """Return the cached issue text, or None if missing/expired/corrupt.

    "Corrupt" covers an unreadable file, invalid JSON, JSON that is not an
    object, a missing or non-numeric ``ts``, and a non-string ``body``.
    An entry is expired once it is older than ``_ISSUE_BODY_CACHE_TTL_S``
    seconds. A cached empty string is a valid hit and is returned as "".
    """
    cache_file = _issue_cache_path(repo, number)
    try:
        # A missing file raises FileNotFoundError (an OSError), so no
        # separate exists() probe is needed.
        data = _json.loads(cache_file.read_text())
    except (OSError, ValueError):
        return None
    # Valid JSON that is not an object (e.g. a list or a bare number) would
    # otherwise raise AttributeError on .get() and crash the caller.
    if not isinstance(data, dict):
        return None
    ts = data.get("ts")
    if not isinstance(ts, (int, float)) or _time.time() - ts > _ISSUE_BODY_CACHE_TTL_S:
        return None
    body = data.get("body")
    return body if isinstance(body, str) else None
690
+
691
+
692
def _write_cached_issue_body(repo: str, number: int, body: str) -> None:
    """Store ``body`` in the issue cache, stamped with the current time.

    Best-effort: an OSError while creating the directory or writing the
    file is logged at WARNING and otherwise ignored.
    """
    try:
        _ISSUE_BODY_CACHE_DIR.mkdir(parents=True, exist_ok=True)
        destination = _issue_cache_path(repo, number)
        record = {"ts": _time.time(), "body": body}
        destination.write_text(_json.dumps(record))
    except OSError as exc:
        logger.warning(
            "issue-body cache write failed for %s#%d: %s", repo, number, exc,
        )
703
+
704
+
705
+ _RATE_LIMIT_KILL_FILE = _Path.home() / ".delimit" / "outreach_pause"
706
+ _RATE_LIMIT_SIGNATURES = (
707
+ "rate limit", "rate-limit", "secondary rate",
708
+ "403", "abuse detection", "too many requests",
709
+ )
710
+
711
+
712
+ def _maybe_halt_on_rate_limit(endpoint: str, stderr: str) -> None:
713
+ """LED-2214b followup — defensive halt when github signals rate
714
+ limit / abuse-detection / forbidden. Writes the kill-switch file
715
+ AND ntfys (priority=5). The daemon's pre-import kill-switch check
716
+ will then short-circuit subsequent ticks until the file is removed.
717
+
718
+ Best-effort: silent on any failure. The halt is defense in depth —
719
+ if it doesn't fire here, the rate limit's own retry-after backoff
720
+ handles the immediate request, but future ticks would still hit
721
+ the same limit. The halt-on-warning pattern protects the account
722
+ from escalation (warning -> hard block -> ban)."""
723
+ if not stderr:
724
+ return
725
+ sl = stderr.lower()
726
+ if not any(sig in sl for sig in _RATE_LIMIT_SIGNATURES):
727
+ return
728
+ try:
729
+ _RATE_LIMIT_KILL_FILE.parent.mkdir(parents=True, exist_ok=True)
730
+ _RATE_LIMIT_KILL_FILE.write_text(
731
+ f"halted by _maybe_halt_on_rate_limit at "
732
+ f"{_time.strftime('%Y-%m-%dT%H:%M:%SZ', _time.gmtime())}\n"
733
+ f"endpoint: {endpoint}\n"
734
+ f"stderr: {stderr[:400]}\n"
735
+ )
736
+ logger.error(
737
+ "outreach RATE LIMIT detected — wrote kill-switch %s "
738
+ "(endpoint=%s)", _RATE_LIMIT_KILL_FILE, endpoint,
739
+ )
740
+ except OSError as exc:
741
+ logger.error(
742
+ "outreach rate-limit halt failed to write kill-switch: %s", exc,
743
+ )
744
+
745
+
746
def _gh_api_call(endpoint: str) -> Any:
    """Call ``gh api <endpoint>`` and return parsed JSON or None on failure.

    Local copy of the same idiom in ai.social_target — duplicated to keep
    this module importable without pulling in the much larger
    social_target dependency graph.

    Args:
        endpoint: Path handed to ``gh api`` as a single argv element (no
            shell is involved).

    Returns:
        The decoded JSON value from stdout, or None when the process
        cannot be started, times out after ``_GH_API_TIMEOUT_S`` seconds,
        exits non-zero, or prints something that is not JSON.

    On a non-zero exit the captured stderr is handed to
    ``_maybe_halt_on_rate_limit`` before returning.
    """
    try:
        proc = _subprocess.run(
            ["gh", "api", endpoint],
            capture_output=True,
            text=True,
            timeout=_GH_API_TIMEOUT_S,
        )
    except (_subprocess.TimeoutExpired, OSError) as exc:
        # OSError covers FileNotFoundError (gh not installed) as well as
        # PermissionError and other launch failures, which previously
        # escaped and broke the "None on failure" contract.
        logger.warning("gh api %s failed: %s", endpoint, exc)
        return None
    if proc.returncode != 0:
        # LED-2214b followup: halt the outreach daemon on rate-limit
        # signatures BEFORE returning. Defense in depth against escalating
        # github enforcement (warn -> block -> ban).
        _maybe_halt_on_rate_limit(endpoint, proc.stderr or "")
        logger.info(
            "gh api %s returned %d: %s",
            endpoint, proc.returncode, (proc.stderr or "")[:160],
        )
        return None
    try:
        return _json.loads(proc.stdout)
    except ValueError as exc:
        logger.warning("gh api %s returned non-JSON: %s", endpoint, exc)
        return None
782
+
783
+
784
+ # ---------------------------------------------------------------------------
785
+ # Engagement-floor check (LED-2214b followup, found 2026-05-17 when first
786
+ # autonomous engagement landed on a same-day-created 0-star 4-follower
787
+ # personal scratchpad). Substantive content gate passed (anchors were
788
+ # valid) but engagement value was near zero — no readership, no community.
789
+ #
790
+ # This block fetches lightweight repo metadata (1 gh api call, 7-day
791
+ # cached) and enforces a stars + age + not-archived + not-fork floor
792
+ # BEFORE the anchor check. Sits parallel to the existing repo-search
793
+ # filter in ai/social_target.py:_scan_github line 2024 ("stars == 0 and
794
+ # no description: continue") which only catches REPO targets — issue
795
+ # targets bypass it entirely, which was the gap.
796
+ #
797
+ # Fail-closed: if we can't fetch the metadata, we DON'T engage. Better
798
+ # to skip a real target than spam a maintainer on stale / missing data.
799
+ # ---------------------------------------------------------------------------
800
+
801
# Directory under the user's home where repo metadata is cached, one JSON
# file per repo (see _repo_meta_cache_path for the file naming).
_REPO_META_CACHE_DIR = _Path.home() / ".delimit" / "cache" / "outreach_repo_meta"
# LED-2266: env-overridable engagement-floor thresholds.
# Defaults reproduce PR #180 shipped behavior. Floors enforce sanity
# (no zero or negative values that would silently disable the gate).
# All three are resolved once, at import time.
_REPO_META_CACHE_TTL_S = _env_int(
    "DELIMIT_OUTREACH_REPO_META_CACHE_TTL_S", 7 * 24 * 3600, minimum=60,
)
# Minimum stargazer count a repo needs to pass check_engagement_floor.
_MIN_REPO_STARS = _env_int("DELIMIT_OUTREACH_MIN_STARS", 50, minimum=1)
# Minimum repo age, in days, needed to pass check_engagement_floor.
_MIN_REPO_AGE_DAYS = _env_int("DELIMIT_OUTREACH_MIN_AGE_DAYS", 30, minimum=1)
810
+
811
+
812
def _repo_meta_cache_path(repo: str) -> _Path:
    """Return the metadata cache file path for ``repo``.

    Every ``/`` in ``repo`` becomes ``__`` so the slug is a single file
    name inside ``_REPO_META_CACHE_DIR``.
    """
    flattened = "__".join(repo.split("/"))
    return _REPO_META_CACHE_DIR.joinpath(f"{flattened}.json")
815
+
816
+
817
def _read_cached_repo_meta(repo: str) -> Optional[Dict[str, Any]]:
    """Return the cached metadata dict for ``repo``, or None.

    None is returned when the cache file is missing or unreadable, is not
    valid JSON, is JSON but not an object, has a missing or non-numeric
    ``_cached_ts``, is older than ``_REPO_META_CACHE_TTL_S`` seconds, or
    holds a ``meta`` value that is not a dict.
    """
    cache_file = _repo_meta_cache_path(repo)
    try:
        # A missing file raises FileNotFoundError (an OSError), so no
        # separate exists() probe is needed.
        data = _json.loads(cache_file.read_text())
    except (OSError, ValueError):
        return None
    # Guard against valid-but-wrong-shape JSON (list, number, ...), which
    # would otherwise raise AttributeError on .get().
    if not isinstance(data, dict):
        return None
    ts = data.get("_cached_ts")
    if not isinstance(ts, (int, float)) or _time.time() - ts > _REPO_META_CACHE_TTL_S:
        return None
    meta = data.get("meta")
    return meta if isinstance(meta, dict) else None
830
+
831
+
832
def _write_cached_repo_meta(repo: str, meta: Dict[str, Any]) -> None:
    """Store ``meta`` in the repo-metadata cache, stamped with the current
    time under ``_cached_ts``.

    Best-effort: an OSError while creating the directory or writing the
    file is logged at WARNING and otherwise ignored.
    """
    try:
        _REPO_META_CACHE_DIR.mkdir(parents=True, exist_ok=True)
        destination = _repo_meta_cache_path(repo)
        record = {"_cached_ts": _time.time(), "meta": meta}
        destination.write_text(_json.dumps(record))
    except OSError as exc:
        logger.warning("repo-meta cache write failed for %s: %s", repo, exc)
840
+
841
+
842
def fetch_repo_metadata(repo: str) -> Optional[Dict[str, Any]]:
    """Return lightweight metadata for ``repo`` via ``gh api repos/{repo}``.

    A fresh cache entry (lifetime ``_REPO_META_CACHE_TTL_S``) is returned
    without touching the network. Otherwise the API is queried and a
    trimmed dict is cached and returned, with these keys:
    stargazers_count / forks_count / open_issues_count / created_at /
    pushed_at / archived / fork / description / owner_login.

    Returns None when the API call does not yield a JSON object; nothing
    is cached in that case, so the next call retries. Callers fail closed.
    """
    hit = _read_cached_repo_meta(repo)
    if hit is not None:
        return hit

    payload = _gh_api_call(f"repos/{repo}")
    if not isinstance(payload, dict):
        # Don't poison cache with None — repo may exist on next attempt
        return None

    # LED-2214b followup: owner login lets the engagement-floor veto
    # owner-authored issues / PRs (mostly internal chore/release artifacts
    # with near-zero engagement value).
    owner = payload.get("owner") or {}
    if isinstance(owner, dict):
        owner_login = owner.get("login", "")
    else:
        owner_login = ""

    meta: Dict[str, Any] = {}
    for counter in ("stargazers_count", "forks_count", "open_issues_count"):
        meta[counter] = payload.get(counter, 0)
    for stamp in ("created_at", "pushed_at"):
        meta[stamp] = payload.get(stamp, "")
    for flag in ("archived", "fork"):
        meta[flag] = bool(payload.get(flag, False))
    meta["description"] = payload.get("description") or ""
    meta["owner_login"] = owner_login

    _write_cached_repo_meta(repo, meta)
    return meta
872
+
873
+
874
# LED-2214b followup: per-issue state cache. Lighter than
# fetch_issue_full_text (which pulls body + comments) — only the state field
# is needed. Kept separate because issue state changes more often than repo
# metadata, hence the shorter TTL.
_ISSUE_STATE_CACHE_TTL_S = 6 * 3600  # 6h: catches "open then closed same day"


def _issue_state_cache_path(repo: str, number: int) -> _Path:
    """Return the state-cache file path for issue ``number`` of ``repo``.

    State files live alongside the body cache in ``_ISSUE_BODY_CACHE_DIR``
    and are told apart by the ``__state.json`` suffix. Every ``/`` in
    ``repo`` becomes ``__``.
    """
    flattened = "__".join(repo.split("/"))
    return _ISSUE_BODY_CACHE_DIR.joinpath(f"{flattened}_{number}__state.json")
883
+
884
+
885
def _read_cached_issue_state(repo: str, number: int) -> Optional[str]:
    """Return the cached issue state string, or None.

    None is returned when the cache file is missing or unreadable, is not
    valid JSON, is JSON but not an object, has a missing or non-numeric
    ``_cached_ts``, is older than ``_ISSUE_STATE_CACHE_TTL_S`` seconds, or
    holds a ``state`` value that is not a string.
    """
    cf = _issue_state_cache_path(repo, number)
    try:
        # A missing file raises FileNotFoundError (an OSError), so no
        # separate exists() probe is needed.
        data = _json.loads(cf.read_text())
    except (OSError, ValueError):
        return None
    # Guard against valid-but-wrong-shape JSON (list, number, ...), which
    # would otherwise raise AttributeError on .get().
    if not isinstance(data, dict):
        return None
    ts = data.get("_cached_ts")
    if not isinstance(ts, (int, float)) or _time.time() - ts > _ISSUE_STATE_CACHE_TTL_S:
        return None
    state = data.get("state")
    return state if isinstance(state, str) else None
898
+
899
+
900
def _write_cached_issue_state(repo: str, number: int, state: str) -> None:
    """Store ``state`` in the issue-state cache, stamped with the current
    time under ``_cached_ts``.

    Best-effort: an OSError while creating the directory or writing the
    file is logged at WARNING and otherwise ignored.
    """
    try:
        _ISSUE_BODY_CACHE_DIR.mkdir(parents=True, exist_ok=True)
        destination = _issue_state_cache_path(repo, number)
        record = {"_cached_ts": _time.time(), "state": state}
        destination.write_text(_json.dumps(record))
    except OSError as exc:
        logger.warning(
            "issue-state cache write failed for %s#%d: %s", repo, number, exc,
        )
910
+
911
+
912
def fetch_issue_state(repo: str, number: int) -> Optional[str]:
    """Return the current state string of a github issue/PR, or None.

    A fresh cache entry (lifetime ``_ISSUE_STATE_CACHE_TTL_S``) is
    returned directly. Otherwise ``repos/{repo}/issues/{number}`` is
    queried; a non-empty string ``state`` is cached and returned.

    None means the state could not be determined (API failure, or a
    missing / empty / non-string ``state`` field). Nothing is cached in
    that case. Callers should treat None as "don't engage" (fail-closed).
    """
    remembered = _read_cached_issue_state(repo, number)
    if remembered is not None:
        return remembered

    payload = _gh_api_call(f"repos/{repo}/issues/{number}")
    if not isinstance(payload, dict):
        return None

    state = payload.get("state")
    if not isinstance(state, str) or not state:
        return None

    _write_cached_issue_state(repo, number, state)
    return state
927
+
928
+
929
+ def _repo_age_days(created_at: str) -> Optional[float]:
930
+ """Parse ISO timestamp and return age in days. None on parse failure."""
931
+ if not created_at:
932
+ return None
933
+ try:
934
+ # Strip fractional seconds + Z suffix
935
+ clean = created_at.replace("Z", "").split(".")[0]
936
+ epoch = _time.mktime(_time.strptime(clean, "%Y-%m-%dT%H:%M:%S")) - _time.timezone
937
+ except (ValueError, TypeError):
938
+ return None
939
+ return (_time.time() - epoch) / 86400.0
940
+
941
+
942
def check_engagement_floor(repo: str) -> Tuple[bool, str]:
    """Apply the engagement-worthiness floor to ``repo``.

    Checks run in this order, first failure wins: metadata available,
    not archived, not a fork, stars >= ``_MIN_REPO_STARS``, creation date
    known, age >= ``_MIN_REPO_AGE_DAYS``.

    Returns:
        ``(passes, reason)``. On success the reason is ``ok``. On failure
        it is a short tag for the caller to log: ``no_metadata`` /
        ``archived`` / ``fork`` / ``stars<50:3`` / ``age_unknown`` /
        ``age_days<30:0.4`` (numbers reflect the configured thresholds
        and the observed values).
    """
    meta = fetch_repo_metadata(repo)
    if meta is None:
        return False, "no_metadata"
    if meta.get("archived"):
        return False, "archived"
    if meta.get("fork"):
        return False, "fork"
    stars = meta.get("stargazers_count", 0) or 0
    if stars < _MIN_REPO_STARS:
        return False, f"stars<{_MIN_REPO_STARS}:{stars}"
    age = _repo_age_days(meta.get("created_at", ""))
    if age is None:
        # Fail closed: a missing or unparseable created_at used to skip
        # the age floor entirely, letting a repo of unknown age through a
        # gate that is meant to refuse on missing data.
        return False, "age_unknown"
    if age < _MIN_REPO_AGE_DAYS:
        return False, f"age_days<{_MIN_REPO_AGE_DAYS}:{age:.1f}"
    return True, "ok"
964
+
965
+
966
def fetch_issue_full_text(repo: str, number: int) -> str:
    """Fetch issue body + first N comments concatenated.

    The issue body and up to ``_ISSUE_COMMENTS_FETCH_LIMIT`` comment bodies
    are joined with blank lines. A fresh cache entry is returned without
    any API call.

    Returns "" when the issue itself cannot be fetched — the caller treats
    empty string as 'no anchors available', which blocks dispatch.

    Only complete results are cached. A failed issue fetch, or a failed
    comments fetch, leaves the cache untouched so the next call retries;
    caching those would pin a transient API error for the whole TTL.

    Public surface (no underscore prefix) so tests + callers can
    monkeypatch without depending on the private cache helpers.
    """
    cached = _read_cached_issue_body(repo, number)
    if cached is not None:
        return cached

    issue = _gh_api_call(f"repos/{repo}/issues/{number}")
    if not isinstance(issue, dict):
        # Don't poison the cache with "" — same policy as
        # fetch_repo_metadata; the issue may be fetchable on the next tick.
        return ""

    parts: List[str] = []
    body = issue.get("body")
    if isinstance(body, str) and body:
        parts.append(body)

    comments = _gh_api_call(
        f"repos/{repo}/issues/{number}/comments?per_page={_ISSUE_COMMENTS_FETCH_LIMIT}"
    )
    comments_fetched = isinstance(comments, list)
    if comments_fetched:
        for c in comments[:_ISSUE_COMMENTS_FETCH_LIMIT]:
            if isinstance(c, dict):
                cb = c.get("body")
                if isinstance(cb, str) and cb:
                    parts.append(cb)

    full = "\n\n".join(parts)
    if comments_fetched:
        _write_cached_issue_body(repo, number, full)
    return full
1003
+
1004
+
1005
+ # ---------------------------------------------------------------------------
1006
+ # Anti-spam — protect the operating account from github enforcement
1007
+ # ---------------------------------------------------------------------------
1008
+ #
1009
+ # Three hard limits on top of the per-tick spam firewall
1010
+ # (DEFAULT_MAX_DISPATCH=3) in the daemon:
1011
+ #
1012
+ # 1. Per-repo cooldown: don't dispatch on a repo we already dispatched
1013
+ # to within the last _DISPATCH_COOLDOWN_DAYS days. Avoids the
1014
+ # "scanner finds 3 issues on the SAME repo in one tick + we
1015
+ # engage on all of them = swarm" failure mode.
1016
+ # 2. Per-day global cap: refuse dispatch once we've crossed
1017
+ # _MAX_DISPATCHES_PER_DAY in the rolling 24-hour window. Catches
1018
+ # multiple-tick scenarios (manual run + scheduled run + retry)
1019
+ # that would multiply the per-tick cap.
1020
+ # 3. Halt on rate-limit (in _gh_api_call): if gh api returns 403/429,
1021
+ # write the kill-switch file and ntfy. GitHub typically warns
1022
+ # before banning; respecting that warning protects the account.
1023
+ #
1024
+ # The dispatch log at _DISPATCH_LOG is the source of truth for #1 and #2.
1025
+ # It's append-only JSONL; each successful dispatch_substantive_outreach
1026
+ # call writes one line.
1027
+
1028
# Append-only JSONL file; one line per recorded dispatch (see
# _record_dispatch for the entry shape).
_DISPATCH_LOG = _Path.home() / ".delimit" / "state" / "outreach-dispatch-log.jsonl"
# LED-2266: env-overridable anti-spam thresholds (PR #179 follow-up
# panel-flagged). Defaults reproduce shipped behavior. Floors enforce
# sanity (minimum=1 — zero would silently disable the spam protection).
# Both are resolved once, at import time.
# Days a repo stays in cooldown after a dispatch to it.
_DISPATCH_COOLDOWN_DAYS = _env_int("DELIMIT_OUTREACH_COOLDOWN_DAYS", 7, minimum=1)
# Ceiling the caller compares against the count from _check_per_day_cap.
_MAX_DISPATCHES_PER_DAY = _env_int("DELIMIT_OUTREACH_MAX_PER_DAY", 5, minimum=1)
1034
+
1035
+
1036
def _read_dispatch_log() -> List[Dict[str, Any]]:
    """Return all dispatch log entries, newest first.

    Reads ``_DISPATCH_LOG`` as JSONL. Blank lines, lines that are not valid
    JSON, and lines whose JSON value is not an object are skipped.
    Ordering is by the ``ts`` field compared as text, descending.

    Best-effort — never raises. Returns [] when the file is missing
    (silently) or cannot be read or decoded (logged at WARNING).
    """
    if not _DISPATCH_LOG.exists():
        return []
    try:
        raw = _DISPATCH_LOG.read_text()
    except (OSError, ValueError) as exc:
        # ValueError covers UnicodeDecodeError from undecodable bytes.
        logger.warning("dispatch log read failed: %s", exc)
        return []

    out: List[Dict[str, Any]] = []
    for line in raw.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            record = _json.loads(line)
        except ValueError:
            continue
        # A bare number/list/string is valid JSON but not a log entry;
        # keeping it would break the .get() calls here and in callers.
        if isinstance(record, dict):
            out.append(record)

    # str(...) keeps the sort total even if a hand-edited line carries a
    # non-string ts; mixed str/int keys would raise TypeError otherwise.
    out.sort(key=lambda r: str(r.get("ts") or ""), reverse=True)
    return out
1056
+
1057
+
1058
def _record_dispatch(repo: str, fingerprint: str, category: str) -> None:
    """Append one JSON line describing a dispatch to ``_DISPATCH_LOG``.

    The entry holds ``ts`` (current UTC time, ``%Y-%m-%dT%H:%M:%SZ``),
    ``repo``, ``fingerprint`` and ``category``.

    Best-effort: an OSError is logged at WARNING and otherwise ignored —
    dispatch must not crash because logging broke.
    """
    try:
        _DISPATCH_LOG.parent.mkdir(parents=True, exist_ok=True)
        stamp = _time.strftime("%Y-%m-%dT%H:%M:%SZ", _time.gmtime())
        record = dict(ts=stamp, repo=repo, fingerprint=fingerprint, category=category)
        serialized = _json.dumps(record)
        with _DISPATCH_LOG.open("a") as handle:
            handle.write(f"{serialized}\n")
    except OSError as exc:
        logger.warning("dispatch log write failed: %s", exc)
1073
+
1074
+
1075
def _check_per_repo_cooldown(repo: str, now: Optional[float] = None) -> Optional[str]:
    """Return cooldown-expiry ISO string if repo is in cooldown, else None.

    Args:
        repo: Repo slug; compared to log entries case-insensitively after
            stripping whitespace. An empty value is never in cooldown.
        now: Epoch seconds to evaluate against. Overridable for tests;
            defaults to the current time.

    Returns:
        ``%Y-%m-%dT%H:%M:%SZ`` (UTC) of the first matching log entry's
        timestamp plus ``_DISPATCH_COOLDOWN_DAYS``, when that entry falls
        inside the cooldown window; otherwise None. Entries with an
        unparseable ``ts`` are skipped.
    """
    import calendar as _calendar  # local: only needed for the UTC conversion

    if not repo:
        return None
    if now is None:
        now = _time.time()
    window_s = _DISPATCH_COOLDOWN_DAYS * 86400
    cutoff = now - window_s
    wanted = repo.strip().lower()
    for entry in _read_dispatch_log():
        logged_repo = entry.get("repo") or ""
        if not isinstance(logged_repo, str) or logged_repo.strip().lower() != wanted:
            continue
        ts = entry.get("ts", "")
        try:
            # Log timestamps are written in UTC, so convert with timegm().
            # ``mktime(...) - timezone`` is an hour off whenever the local
            # zone is in DST at that instant.
            entry_epoch = _calendar.timegm(_time.strptime(ts, "%Y-%m-%dT%H:%M:%SZ"))
        except (ValueError, TypeError):
            continue
        if entry_epoch >= cutoff:
            # Compute cooldown-expiry as entry_ts + cooldown_days
            expires_epoch = entry_epoch + window_s
            return _time.strftime("%Y-%m-%dT%H:%M:%SZ", _time.gmtime(expires_epoch))
    return None
1098
+
1099
+
1100
def _check_per_day_cap(now: Optional[float] = None) -> int:
    """Return count of dispatches in the rolling 24h window.

    Args:
        now: Epoch seconds marking the end of the window. Overridable for
            tests; defaults to the current time.

    Returns:
        Number of log entries whose ``ts`` is at or after ``now - 86400``.
        Entries with an unparseable ``ts`` are not counted. The caller
        compares the result against ``_MAX_DISPATCHES_PER_DAY``.
    """
    import calendar as _calendar  # local: only needed for the UTC conversion

    if now is None:
        now = _time.time()
    cutoff = now - 86400
    count = 0
    for entry in _read_dispatch_log():
        ts = entry.get("ts", "")
        try:
            # Log timestamps are written in UTC, so convert with timegm().
            # ``mktime(...) - timezone`` is an hour off whenever the local
            # zone is in DST at that instant, which shifts the 24h window.
            entry_epoch = _calendar.timegm(_time.strptime(ts, "%Y-%m-%dT%H:%M:%SZ"))
        except (ValueError, TypeError):
            continue
        if entry_epoch >= cutoff:
            count += 1
    return count
1116
+
1117
+
469
1118
  def build_candidate_from_github_target(
470
1119
  target: Dict[str, Any],
471
1120
  category: str,
@@ -507,9 +1156,108 @@ def build_candidate_from_github_target(
507
1156
  logger.info("build_candidate: unmapped category=%s", category)
508
1157
  return None
509
1158
 
1159
+ # LED-2214b followup (founder's Niklas-Flaig observation 2026-05-17):
1160
+ # engagement-floor check BEFORE the anchor extraction + body fetch so
1161
+ # we don't pay the per-issue API cost on a target that's a 0-star
1162
+ # personal scratchpad. Existing repo-search filter in social_target
1163
+ # catches `stars==0 AND no description` for repo targets only; issue
1164
+ # targets bypassed it entirely (the gap this closes).
1165
+ floor_ok, floor_reason = check_engagement_floor(repo)
1166
+ if not floor_ok:
1167
+ logger.info(
1168
+ "build_candidate: engagement floor fingerprint=%s repo=%s reason=%s",
1169
+ target.get("fingerprint"), repo, floor_reason,
1170
+ )
1171
+ return None
1172
+
1173
+ # LED-2214b followup (2026-05-17 audit-queue observation): 4 of 7
1174
+ # dispatched tasks today were owner-authored (chore PRs, dev→main
1175
+ # promotions, internal scout reports). Engagement value near zero —
1176
+ # the owner is doing their own work, not seeking community input.
1177
+ # Repo metadata fetch above already populated owner_login; compare
1178
+ # directly to target's author. Cheap check.
1179
+ repo_meta = fetch_repo_metadata(repo)
1180
+ if repo_meta is not None:
1181
+ owner_login = (repo_meta.get("owner_login") or "").strip().lower()
1182
+ target_author = (target.get("author") or "").strip().lower()
1183
+ if owner_login and target_author and owner_login == target_author:
1184
+ logger.info(
1185
+ "build_candidate: owner-authored target fingerprint=%s "
1186
+ "author=%s == owner=%s",
1187
+ target.get("fingerprint"), target_author, owner_login,
1188
+ )
1189
+ return None
1190
+
1191
+ # LED-2214b followup (2026-05-17 audit-queue observation): 3 of 7
1192
+ # dispatched tasks today were on CLOSED issues. Engaging on a closed
1193
+ # thread is noise — the decision is already made. Cheap state check
1194
+ # before paying the body-fetch cost. Only applies to issue targets;
1195
+ # repo targets don't have a state in this sense.
1196
+ fp_parts_state = _issue_fp_parts(target.get("fingerprint", ""))
1197
+ if fp_parts_state is not None:
1198
+ state = fetch_issue_state(fp_parts_state[0], fp_parts_state[1])
1199
+ if state is None:
1200
+ # Fail-closed: can't verify the issue is live → skip
1201
+ logger.info(
1202
+ "build_candidate: issue state unverifiable fingerprint=%s",
1203
+ target.get("fingerprint"),
1204
+ )
1205
+ return None
1206
+ if state != "open":
1207
+ logger.info(
1208
+ "build_candidate: issue state=%s (not open) fingerprint=%s",
1209
+ state, target.get("fingerprint"),
1210
+ )
1211
+ return None
1212
+
1213
+ # LED-2214b followup — anti-spam protection for the operating account.
1214
+ # These checks run AFTER the banking veto + repo-resolve + category
1215
+ # check (so we don't burden the dispatch log with rejected targets
1216
+ # that wouldn't have dispatched anyway) but BEFORE the anchor
1217
+ # extraction + body fetch (so cool-down catches re-targeting on
1218
+ # repos we recently engaged with without paying the API cost to
1219
+ # re-fetch their issue body).
1220
+
1221
+ cooldown_expires = _check_per_repo_cooldown(repo)
1222
+ if cooldown_expires:
1223
+ logger.info(
1224
+ "build_candidate: per-repo cooldown fingerprint=%s repo=%s "
1225
+ "expires=%s",
1226
+ target.get("fingerprint"), repo, cooldown_expires,
1227
+ )
1228
+ return None
1229
+
1230
+ today_count = _check_per_day_cap()
1231
+ if today_count >= _MAX_DISPATCHES_PER_DAY:
1232
+ logger.warning(
1233
+ "build_candidate: per-day cap hit fingerprint=%s "
1234
+ "today_count=%d cap=%d",
1235
+ target.get("fingerprint"), today_count, _MAX_DISPATCHES_PER_DAY,
1236
+ )
1237
+ return None
1238
+
510
1239
  snippet = target.get("content_snippet", "") or ""
511
1240
  rationale = target.get("rationale", "") or ""
512
1241
  anchors = extract_technical_anchors(f"{snippet}\n{rationale}")
1242
+
1243
+ # LED-2214b followup: if the snippet didn't yield anchors AND this is
1244
+ # an issue target, fetch the full issue body + first N comments and
1245
+ # re-extract. The scanner truncates issue bodies to 200 chars (see
1246
+ # ai/social_target.py:_scan_github phase 2) which almost always
1247
+ # strips the part where anchors live. Fetch is cached 7 days per
1248
+ # issue (see fetch_issue_full_text). On any fetch failure the
1249
+ # function returns "" which leaves anchors unchanged → still blocks.
1250
+ fp_parts = _issue_fp_parts(target.get("fingerprint", ""))
1251
+ needs_body_fetch = fp_parts is not None and not any(
1252
+ anchors.get(k) for k in ("issues", "spec_paths", "cves", "commits", "file_paths")
1253
+ )
1254
+ if needs_body_fetch:
1255
+ body = fetch_issue_full_text(fp_parts[0], fp_parts[1])
1256
+ if body:
1257
+ anchors = extract_technical_anchors(
1258
+ f"{snippet}\n{rationale}\n{body}"
1259
+ )
1260
+
513
1261
  evidence_refs: List[str] = []
514
1262
  for key in ("issues", "spec_paths", "cves", "commits", "file_paths"):
515
1263
  for ref in anchors.get(key, []):
@@ -518,8 +1266,9 @@ def build_candidate_from_github_target(
518
1266
  evidence_refs.append(label)
519
1267
  if not evidence_refs:
520
1268
  logger.info(
521
- "build_candidate: no_technical_anchor fingerprint=%s category=%s",
522
- target.get("fingerprint"), category,
1269
+ "build_candidate: no_technical_anchor fingerprint=%s category=%s "
1270
+ "(body_fetched=%s)",
1271
+ target.get("fingerprint"), category, needs_body_fetch,
523
1272
  )
524
1273
  return None
525
1274
 
@@ -673,4 +1422,16 @@ def dispatch_substantive_outreach(
673
1422
  "task=%s ledger=%s err=%s",
674
1423
  task_id, ledger_item_id, exc,
675
1424
  )
1425
+
1426
+ # LED-2214b followup — record the dispatch for per-repo cooldown +
1427
+ # per-day cap. Append-only JSONL; subsequent build_candidate calls
1428
+ # read this log via _check_per_repo_cooldown / _check_per_day_cap.
1429
+ # Best-effort; logging failures must not crash a successful dispatch.
1430
+ if task_id:
1431
+ _record_dispatch(
1432
+ repo=candidate.repo,
1433
+ fingerprint=candidate.fingerprint,
1434
+ category=candidate.category,
1435
+ )
1436
+
676
1437
  return result