delimit-cli 4.5.1 → 4.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/CHANGELOG.md +87 -0
  2. package/README.md +15 -5
  3. package/bin/delimit-cli.js +109 -24
  4. package/gateway/ai/content_engine.py +3 -4
  5. package/gateway/ai/inbox_classifier.py +215 -0
  6. package/gateway/ai/integrations/opensage_wrapper.py +4 -1
  7. package/gateway/ai/ledger_manager.py +218 -38
  8. package/gateway/ai/license.py +26 -0
  9. package/gateway/ai/notify.py +68 -3
  10. package/gateway/ai/reddit_proxy.py +93 -15
  11. package/gateway/ai/reddit_scanner.py +36 -18
  12. package/gateway/ai/remote_resolve.py +422 -0
  13. package/gateway/ai/server.py +301 -117
  14. package/gateway/ai/social_capability/__init__.py +6 -0
  15. package/gateway/ai/social_capability/capability_validator.py +367 -0
  16. package/gateway/ai/social_capability/current_capabilities.yaml +95 -0
  17. package/gateway/ai/social_capability/fit_floor.py +360 -0
  18. package/gateway/ai/social_queue.py +307 -0
  19. package/gateway/ai/supabase_sync.py +14 -2
  20. package/gateway/ai/swarm.py +29 -11
  21. package/gateway/ai/tui.py +6 -2
  22. package/gateway/ai/vendor_news/__init__.py +14 -0
  23. package/gateway/ai/vendor_news/drafter.py +562 -0
  24. package/gateway/ai/vendor_news/sensor.py +509 -0
  25. package/gateway/ai/vendor_news/watchlist.yaml +71 -0
  26. package/gateway/ai/x_ranker.py +417 -0
  27. package/lib/attest-mcp.js +487 -0
  28. package/lib/attest-telemetry.js +48 -0
  29. package/lib/delimit-home.js +35 -0
  30. package/lib/delimit-template.js +14 -0
  31. package/package.json +25 -3
  32. package/scripts/postinstall.js +89 -40
  33. package/adapters/codex-security.js +0 -64
  34. package/adapters/codex-skill.js +0 -78
  35. package/gateway/ai/content_grounding/__init__.py +0 -98
  36. package/gateway/ai/content_grounding/build.py +0 -350
  37. package/gateway/ai/content_grounding/consume.py +0 -280
  38. package/gateway/ai/content_grounding/features.py +0 -218
  39. package/gateway/ai/content_grounding/fixtures/fail/01_missing_evidence.json +0 -9
  40. package/gateway/ai/content_grounding/fixtures/fail/02_unknown_evidence_prefix.json +0 -9
  41. package/gateway/ai/content_grounding/fixtures/fail/03_banned_comparative.json +0 -17
  42. package/gateway/ai/content_grounding/fixtures/fail/04_banned_adoption.json +0 -17
  43. package/gateway/ai/content_grounding/fixtures/fail/05_aggregate_no_numeric.json +0 -17
  44. package/gateway/ai/content_grounding/fixtures/fail/06_unversioned_inference_rule.json +0 -18
  45. package/gateway/ai/content_grounding/fixtures/pass/01_feature_shipped.json +0 -18
  46. package/gateway/ai/content_grounding/fixtures/pass/02_aggregate_claim.json +0 -23
  47. package/gateway/ai/content_grounding/fixtures/pass/03_attestation.json +0 -16
  48. package/gateway/ai/content_grounding/schemas/claim.schema.json +0 -40
  49. package/gateway/ai/content_grounding/schemas/event.schema.json +0 -23
  50. package/gateway/ai/content_grounding/schemas.py +0 -276
  51. package/gateway/ai/content_grounding/telemetry.py +0 -221
  52. package/gateway/ai/inbox_drafts/__init__.py +0 -61
  53. package/gateway/ai/inbox_drafts/registry.py +0 -412
  54. package/gateway/ai/inbox_drafts/schema.py +0 -374
  55. package/gateway/ai/inbox_executor.py +0 -565
@@ -144,7 +144,54 @@ def _register_venture(info: Dict[str, str]):
144
144
  VENTURES_FILE.write_text(json.dumps(ventures, indent=2))
145
145
 
146
146
 
147
- CENTRAL_LEDGER_DIR = Path.home() / ".delimit" / "ledger"
147
+ # LED-1188 / Plan-C: env-aware home so DELIMIT_HOME / DELIMIT_NAMESPACE_ROOT
148
+ # overrides apply to the ledger paths same as everywhere else. Falls back
149
+ # to ~/.delimit when neither env var is set (back-compat with v4.5.1 and
150
+ # all prior versions).
151
+ def _delimit_home() -> Path:
152
+ for env_key in ("DELIMIT_HOME", "DELIMIT_NAMESPACE_ROOT"):
153
+ val = os.environ.get(env_key, "").strip()
154
+ if val:
155
+ return Path(val)
156
+ return Path.home() / ".delimit"
157
+
158
+
159
+ CENTRAL_LEDGER_DIR = _delimit_home() / "ledger"
160
+ LEDGER_V2_DIR = _delimit_home() / "ledger-v2"
161
+
162
+ # LED-1188 D3 (deliberation att_f86e1f51110e8ed6 follow-up, 2026-04-28):
163
+ # Plan-C migration partitions the central ledger into per-venture sub-ledgers
164
+ # under ledger-v2/<slug>/. The resolver below auto-detects which layout is
165
+ # present and reads from it. Slugs match the migration script's canonical
166
+ # names so a v4.5.2 install picks up an existing Plan-C-staged tree without
167
+ # requiring the swap to happen first.
168
+ _VENTURE_CANONICAL = {
169
+ "delimit-mcp": "delimit",
170
+ "delimit-action": "delimit",
171
+ "delimit-ui": "delimit",
172
+ "delimit-cli": "delimit", # npm package name
173
+ "delimit-gateway": "delimit", # gateway repo
174
+ ".delimit": "delimit",
175
+ "wirereport": "wire-report",
176
+ "stakeone": "stake-one",
177
+ }
178
+ _KNOWN_VENTURE_SLUGS = {
179
+ "delimit", "wire-report", "domainvested",
180
+ "livetube", "stake-one", "root", "unsorted",
181
+ }
182
+
183
+
184
+ def _canonical_venture_slug(name: str) -> Optional[str]:
185
+ """Map a detected venture name to a canonical sub-ledger slug.
186
+
187
+ Returns None when the name doesn't match any known venture; callers
188
+ treat that as "no per-venture sub-ledger, use the central layout."
189
+ """
190
+ if not name:
191
+ return None
192
+ n = name.lower().strip()
193
+ n = _VENTURE_CANONICAL.get(n, n)
194
+ return n if n in _KNOWN_VENTURE_SLUGS else None
148
195
 
149
196
 
150
197
  def _detect_model() -> str:
@@ -182,15 +229,45 @@ def _detect_model() -> str:
182
229
 
183
230
 
184
231
  def _project_ledger_dir(project_path: str = ".") -> Path:
185
- """Get the ledger directory ALWAYS uses central ~/.delimit/ledger/.
186
-
187
- Cross-model handoff fix: Codex and Gemini were writing to $PWD/.delimit/ledger/
188
- which caused ledger fragmentation. All models must use the same central location
189
- so Claude, Codex, and Gemini see the same items.
190
-
191
- The central ledger at ~/.delimit/ledger/ is the source of truth.
192
- Per-project .delimit/ dirs are for policies and config only, not ledger state.
232
+ """Resolve the ledger directory for a project, with Plan-C auto-detect.
233
+
234
+ Resolution order (LED-1188 D3, deliberation att_f86e1f51110e8ed6):
235
+ 1. Detect venture from project_path -> canonical slug (delimit,
236
+ wire-report, domainvested, livetube, stake-one).
237
+ 2. If LEDGER_V2_DIR / <slug> / operations.jsonl exists, return that
238
+ per-venture sub-ledger. (Plan-C staged but not yet swapped.)
239
+ 3. If CENTRAL_LEDGER_DIR / <slug> / operations.jsonl exists, return
240
+ that per-venture sub-ledger. (Plan-C swapped.)
241
+ 4. Fall back to CENTRAL_LEDGER_DIR (legacy single-file layout).
242
+
243
+ Cross-model handoff fix (still enforced): Codex and Gemini were writing to
244
+ $PWD/.delimit/ledger/ which caused ledger fragmentation. The central
245
+ ~/.delimit/ledger/ tree (or its Plan-C-partitioned form) remains the
246
+ single source of truth — per-project .delimit/ dirs are for policies and
247
+ config only.
193
248
  """
249
+ # Quick exit: legacy callers that pass venture="" / project="." and
250
+ # have no Plan-C tree on disk get the original single-file layout.
251
+ if not LEDGER_V2_DIR.exists() and not (CENTRAL_LEDGER_DIR / "delimit").exists():
252
+ return CENTRAL_LEDGER_DIR
253
+
254
+ info = _detect_venture(project_path)
255
+ slug = _canonical_venture_slug(info.get("name", ""))
256
+ if slug is None:
257
+ return CENTRAL_LEDGER_DIR
258
+
259
+ # Plan-C staged: ledger-v2/<slug>/
260
+ staged = LEDGER_V2_DIR / slug
261
+ if (staged / "operations.jsonl").exists():
262
+ return staged
263
+
264
+ # Plan-C swapped: ledger/<slug>/
265
+ swapped = CENTRAL_LEDGER_DIR / slug
266
+ if (swapped / "operations.jsonl").exists():
267
+ return swapped
268
+
269
+ # No partitioned tree for this venture — fall back to the central
270
+ # legacy layout (operations.jsonl + strategy.jsonl directly in ledger/).
194
271
  return CENTRAL_LEDGER_DIR
195
272
 
196
273
 
@@ -235,6 +312,90 @@ def _append(path: Path, entry: Dict) -> Dict:
235
312
  return entry
236
313
 
237
314
 
315
+ # ── LED-877 signal guard ─────────────────────────────────────────────
316
+
317
+
318
+ # Sources that originate from sensed observations (social/strategy scans).
319
+ # Centralized so the guard logic is in one place even if more prefixes are
320
+ # added later (e.g. github_sense, reddit_sense).
321
+ _SENSED_SOURCE_PREFIXES = ("social_scan", "social_strategy")
322
+
323
+
324
+ def _check_source_is_ledger_item(
325
+ source: str,
326
+ *,
327
+ purpose: str = "promote_to_ledger",
328
+ title: str = "",
329
+ ledger: str = "",
330
+ ) -> None:
331
+ """LED-877 signal guard.
332
+
333
+ Sensed observations (``source='social_scan:...'``) MUST NOT land in
334
+ the ledger by default — they belong in the intel signal store.
335
+
336
+ LED-216 Phase 1 split: callers can declare *why* they are checking.
337
+
338
+ ``purpose='promote_to_ledger'`` (default, original strict behavior)
339
+ Used by ``add_item`` and any code path that actually writes a
340
+ ledger row. Raises ``ValueError`` on a sensed source unless the
341
+ ``_DELIMIT_SIGNAL_PROMOTED_BY`` bypass env var is set (which is
342
+ the explicit promote-to-ledger path).
343
+
344
+ ``purpose='draft_only'``
345
+ Used by code paths that produce a reply draft from a sensed
346
+ observation but do NOT promote the signal to the ledger. Drafts
347
+ are an acceptable consumer of sensed sources, so the guard is a
348
+ no-op for this purpose.
349
+
350
+ Both purposes still respect ``DELIMIT_SIGNAL_GUARD=shadow`` for the
351
+ shadow-log fallback used during the LED-877 rollout.
352
+ """
353
+ _src_norm = (source or "").strip().lower()
354
+ if not any(_src_norm.startswith(p) for p in _SENSED_SOURCE_PREFIXES):
355
+ return # Not a sensed source; nothing to guard against.
356
+
357
+ if purpose == "draft_only":
358
+ # Drafts may legitimately reference a sensed observation. The
359
+ # guard exists to prevent ledger writes, not draft generation.
360
+ return
361
+
362
+ if purpose != "promote_to_ledger":
363
+ # Defensive: unknown purpose ⇒ default to strict behavior so a
364
+ # typo can't accidentally weaken the guard.
365
+ pass
366
+
367
+ _promoted_by = os.environ.get("_DELIMIT_SIGNAL_PROMOTED_BY", "")
368
+ if _promoted_by:
369
+ return # Explicit promote_to_ledger path; bypass authorized.
370
+
371
+ msg = (
372
+ f"LED-877 guard: source={source!r} is a sensed observation, not "
373
+ f"a ledger item. Use ai.sensing.signal_store.ingest() instead. "
374
+ f"Promote explicitly via promote_to_ledger(signal_id=...)."
375
+ )
376
+
377
+ _guard_mode = os.environ.get("DELIMIT_SIGNAL_GUARD", "enforce").lower()
378
+ if _guard_mode == "shadow":
379
+ try:
380
+ _shadow_log = Path.home() / ".delimit" / "logs" / "signal_guard_shadow.jsonl"
381
+ _shadow_log.parent.mkdir(parents=True, exist_ok=True)
382
+ with _shadow_log.open("a") as _f:
383
+ _f.write(json.dumps({
384
+ "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
385
+ "title": title,
386
+ "source": source,
387
+ "ledger": ledger,
388
+ "purpose": purpose,
389
+ "msg": msg,
390
+ }) + "\n")
391
+ except Exception:
392
+ pass
393
+ # fall through (shadow mode does not raise)
394
+ return
395
+
396
+ raise ValueError(msg)
397
+
398
+
238
399
  def add_item(
239
400
  title: str,
240
401
  ledger: str = "ops",
@@ -259,44 +420,63 @@ def add_item(
259
420
  observations cannot land in the ledger. Observations belong in the intel
260
421
  signal store (ai/sensing/signal_store.py). Bypass via env var for the
261
422
  promote_to_ledger path: _DELIMIT_SIGNAL_PROMOTED_BY=<who>.
423
+
424
+ LED-216 Phase 1: the guard is now reusable via
425
+ ``_check_source_is_ledger_item(..., purpose='draft_only')`` for code
426
+ paths that produce reply drafts from sensed observations without
427
+ promoting the underlying signal to the ledger.
262
428
  """
263
- _src_norm = (source or "").strip().lower()
264
- _promoted_by = os.environ.get("_DELIMIT_SIGNAL_PROMOTED_BY", "")
265
- _guard_mode = os.environ.get("DELIMIT_SIGNAL_GUARD", "enforce").lower()
266
- if _src_norm.startswith("social_scan") or _src_norm.startswith("social_strategy"):
267
- if not _promoted_by:
268
- msg = (
269
- f"LED-877 guard: source={source!r} is a sensed observation, not "
270
- f"a ledger item. Use ai.sensing.signal_store.ingest() instead. "
271
- f"Promote explicitly via promote_to_ledger(signal_id=...)."
272
- )
273
- if _guard_mode == "shadow":
274
- try:
275
- _shadow_log = Path.home() / ".delimit" / "logs" / "signal_guard_shadow.jsonl"
276
- _shadow_log.parent.mkdir(parents=True, exist_ok=True)
277
- with _shadow_log.open("a") as _f:
278
- _f.write(json.dumps({
279
- "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ"),
280
- "title": title,
281
- "source": source,
282
- "ledger": ledger,
283
- "msg": msg,
284
- }) + "\n")
285
- except Exception:
286
- pass
287
- # fall through
288
- else:
289
- raise ValueError(msg)
429
+ _check_source_is_ledger_item(
430
+ source,
431
+ purpose="promote_to_ledger",
432
+ title=title,
433
+ ledger=ledger,
434
+ )
290
435
 
291
436
  _ensure(project_path)
292
437
  venture = _detect_venture(project_path)
293
438
  ledger_dir = _project_ledger_dir(project_path)
294
439
  path = ledger_dir / ("strategy.jsonl" if ledger == "strategy" else "operations.jsonl")
295
440
 
441
+ # LED-824: ID-collision fix. The Plan-C resolver routes delimit-context
442
+ # queries to ledger-v2/<slug>/. Per-venture ID counters used to scan
443
+ # only the active sub-ledger, so newly-created items could collide with
444
+ # IDs already used in the legacy CENTRAL_LEDGER_DIR root files. Now we
445
+ # union all known IDs across (a) the resolved sub-ledger AND (b) every
446
+ # peer sub-ledger AND (c) the legacy root, then pick the next free.
296
447
  items = _read_ledger(path)
297
448
  prefix = "STR" if ledger == "strategy" else "LED"
298
- existing_ids = [i.get("id", "") for i in items if i.get("type") != "update"]
299
- num = len(existing_ids) + 1
449
+ existing_ids = {i.get("id", "") for i in items if i.get("type") != "update"}
450
+
451
+ # Union with all peer files in ledger-v2/* and the legacy root files,
452
+ # for both strategy and operations ledgers (an LED-N could collide
453
+ # whether it lives in operations or strategy in any sub-ledger).
454
+ filename = "strategy.jsonl" if ledger == "strategy" else "operations.jsonl"
455
+ candidate_paths: list[Path] = []
456
+ if LEDGER_V2_DIR.exists():
457
+ for sub in LEDGER_V2_DIR.iterdir():
458
+ if sub.is_dir():
459
+ candidate_paths.append(sub / filename)
460
+ candidate_paths.append(CENTRAL_LEDGER_DIR / filename)
461
+
462
+ for cand in candidate_paths:
463
+ if cand == path:
464
+ continue # already scanned
465
+ if not cand.exists():
466
+ continue
467
+ try:
468
+ for entry in _read_ledger(cand):
469
+ if entry.get("type") == "update":
470
+ continue
471
+ eid = entry.get("id", "")
472
+ if eid:
473
+ existing_ids.add(eid)
474
+ except Exception:
475
+ # Best-effort: a malformed peer file shouldn't block id assignment
476
+ continue
477
+
478
+ # Walk forward from len()+1 until we find a non-colliding slot.
479
+ num = len(items) + 1
300
480
  while f"{prefix}-{num:03d}" in existing_ids:
301
481
  num += 1
302
482
  item_id = f"{prefix}-{num:03d}"
@@ -227,3 +227,29 @@ except ImportError:
227
227
  LICENSE_FILE.parent.mkdir(parents=True, exist_ok=True)
228
228
  LICENSE_FILE.write_text(json.dumps(license_data, indent=2))
229
229
  return {"status": "activated", "tier": "pro", "message": "Activated (offline fallback). Will validate on next network access."}
230
+
231
+
232
+ # ─── LED-2060 (P1): test-mode license bypass ─────────────────────────────
233
+ # tests/conftest.py sets DELIMIT_TEST_MODE=1 at session start. Without this
234
+ # wrapper, every test that exercises a Pro tool got back a premium_required
235
+ # error and asserted-against-the-wrong-shape, blocking CI on every PR.
236
+ # Bypass is scoped: only active when the env var is explicitly set, only
237
+ # returns None (the "no gate" sentinel), and wraps both compiled-binary
238
+ # and fallback paths. Customers never hit this path because their
239
+ # environments don't set DELIMIT_TEST_MODE.
240
+ import os as _os
241
+
242
+ _original_require_premium = require_premium # type: ignore[has-type]
243
+ _original_is_premium = is_premium # type: ignore[has-type]
244
+
245
+
246
+ def require_premium(tool_name: str): # type: ignore[no-redef]
247
+ if _os.environ.get("DELIMIT_TEST_MODE") == "1":
248
+ return None
249
+ return _original_require_premium(tool_name)
250
+
251
+
252
+ def is_premium() -> bool: # type: ignore[no-redef]
253
+ if _os.environ.get("DELIMIT_TEST_MODE") == "1":
254
+ return True
255
+ return _original_is_premium()
@@ -158,6 +158,30 @@ def _record_notification(entry: Dict[str, Any]) -> None:
158
158
  logger.warning("Failed to record notification: %s", e)
159
159
 
160
160
 
161
+ _QUARANTINE_FILE = Path.home() / ".delimit" / "notifications_quarantine.jsonl"
162
+
163
+
164
+ def _quarantine_record(entry: Dict[str, Any]) -> None:
165
+ """Log a notification that was suppressed by the test-mode / skip-marker
166
+ guard in send_notification(). The would-be email is NOT delivered;
167
+ this file is for audit only.
168
+
169
+ Added 2026-05-01 after gateway pytest runs were repeatedly leaking
170
+ [Test] / [Test Subject] / [DELIMIT_TEST_MODE=1 skipped] emails into
171
+ the founder's real inbox via test paths that called send_notification
172
+ without stubbing.
173
+ """
174
+ import datetime as _dt
175
+ try:
176
+ _QUARANTINE_FILE.parent.mkdir(parents=True, exist_ok=True)
177
+ entry = {**entry, "ts": _dt.datetime.now(_dt.timezone.utc).isoformat()}
178
+ with open(_QUARANTINE_FILE, "a", encoding="utf-8") as f:
179
+ f.write(json.dumps(entry) + "\n")
180
+ except OSError:
181
+ # Quarantine log failure must not crash the caller.
182
+ pass
183
+
184
+
161
185
  def record_owner_action(entry: Dict[str, Any]) -> None:
162
186
  """Append an owner-action record for dashboard and async fanout."""
163
187
  try:
@@ -1041,9 +1065,17 @@ def _enforce_email_protocol(subject: str, message: str, event_type: str) -> tupl
1041
1065
  """Validate and fix email against the protocol. Returns (subject, message, warnings)."""
1042
1066
  warnings = []
1043
1067
 
1044
- # 1. Subject must have a valid prefix bracket
1045
- if not any(subject.startswith(p) for p in _VALID_SUBJECT_PREFIXES):
1046
- # Try to infer from event_type
1068
+ # 1. Subject must have SOME bracket prefix (e.g. [DONE], [POSTED], [FIX])
1069
+ # so the founder can triage on mobile.
1070
+ #
1071
+ # Founder-tone fix 2026-04-28: previously the validator hard-rejected any
1072
+ # bracket prefix not in _VALID_SUBJECT_PREFIXES and injected [INFO] in
1073
+ # front, producing subjects like "[INFO] [DONE] LED-2056 fixed". The
1074
+ # injected prefix overrode the caller's intent and bloated the subject.
1075
+ # Now any `[WORD]` prefix (uppercase short tag) is accepted as-is, and
1076
+ # we only inject when there's no bracket at all.
1077
+ _has_any_bracket_prefix = bool(_re.match(r"^\[[A-Z][A-Z0-9_-]{0,15}\]\s", subject))
1078
+ if not _has_any_bracket_prefix:
1047
1079
  # LED-969: customer-facing emails should not get bracket prefixes.
1048
1080
  # Any event_type starting with "customer_" is external-facing and
1049
1081
  # the subject should be sent as-is (clean, professional).
@@ -1135,6 +1167,39 @@ def send_notification(
1135
1167
  if not message:
1136
1168
  return {"error": "message is required"}
1137
1169
 
1170
+ # ── Contaminated-content guard ────────────────────────────────────
1171
+ # Every gateway pytest run was spamming the founder's real inbox via
1172
+ # tests that called send_notification without stubbing SMTP. Two
1173
+ # failure modes observed (2026-05-01):
1174
+ # 1. Bare test invocations (subject="Test", message="test")
1175
+ # 2. Social drafts where _call_model returned the
1176
+ # "[X skipped under DELIMIT_TEST_MODE=1 ...]" sentinel and the
1177
+ # sentinel string ended up as the draft body.
1178
+ # Either is a noise/leak event. Refuse to send; log to a quarantine
1179
+ # JSONL so the would-be content is auditable.
1180
+ #
1181
+ # Surgical match — only on the specific leaked shapes. Tests that
1182
+ # correctly mock smtplib.SMTP keep working (their mock fires inside
1183
+ # send_email, after this guard, and returns a fake delivered=True).
1184
+ if channel in ("email", "webhook", "slack", "telegram"):
1185
+ body = message or ""
1186
+ subj = subject or ""
1187
+ leak_match = (
1188
+ "skipped under DELIMIT_TEST_MODE" in body
1189
+ or "DELIMIT_TEST_MODE=1" in body
1190
+ or (subj.strip().lower() == "test" and body.strip().lower() == "test")
1191
+ or (subj.strip().lower() == "test subject" and body.strip().lower() == "test body")
1192
+ )
1193
+ if leak_match:
1194
+ _quarantine_record({
1195
+ "reason": "leaked_shape",
1196
+ "channel": channel,
1197
+ "subject": subj[:100],
1198
+ "event_type": event_type,
1199
+ "to": to,
1200
+ })
1201
+ return {"skipped": "leaked shape detected — not sent (audit: ~/.delimit/notifications_quarantine.jsonl)"}
1202
+
1138
1203
  # Enforce email protocol for all email notifications
1139
1204
  protocol_warnings = []
1140
1205
  if channel == "email":
@@ -1,6 +1,7 @@
1
1
  import json
2
2
  import logging
3
3
  import os
4
+ import time
4
5
  import urllib.parse
5
6
  import urllib.request
6
7
  from pathlib import Path
@@ -8,6 +9,62 @@ from typing import Any, Dict, List, Optional
8
9
 
9
10
  logger = logging.getLogger("delimit.ai.reddit_proxy")
10
11
 
12
+ # LED-2068: freshness ceiling. PullPush stopped ingesting around 2025-05-19;
13
+ # the residential proxy gets 403 from Reddit on datacenter IPs; direct fetch
14
+ # is blocked. ALL three tiers can return stale archive data on any given
15
+ # fetch, and stale data is worse than no data for engagement discovery
16
+ # (drafting against year-old threads burns trust). Default to a 14-day
17
+ # freshness ceiling — anything older is dropped before returning.
18
+ #
19
+ # Override via DELIMIT_REDDIT_MAX_AGE_DAYS (set to a large number to disable).
20
+ DEFAULT_MAX_AGE_DAYS = 14
21
+ TIER_PROXY = "proxy"
22
+ TIER_PULLPUSH = "pullpush"
23
+ TIER_DIRECT = "direct"
24
+
25
+
26
+ def _max_age_seconds() -> float:
27
+ raw = os.environ.get("DELIMIT_REDDIT_MAX_AGE_DAYS", "").strip()
28
+ if raw:
29
+ try:
30
+ return max(0.0, float(raw)) * 86400.0
31
+ except ValueError:
32
+ pass
33
+ return DEFAULT_MAX_AGE_DAYS * 86400.0
34
+
35
+
36
+ def _stamp_and_filter(posts: List[Dict[str, Any]], tier: str, subreddit: str) -> List[Dict[str, Any]]:
37
+ """Tag each post with _source_tier and drop anything older than the
38
+ freshness ceiling. Returns kept posts. Also logs the drop count for
39
+ debugging stale-archive regressions (LED-2068)."""
40
+ if not posts:
41
+ return []
42
+ now = time.time()
43
+ max_age = _max_age_seconds()
44
+ if max_age <= 0:
45
+ cutoff = 0.0
46
+ else:
47
+ cutoff = now - max_age
48
+ kept: List[Dict[str, Any]] = []
49
+ dropped = 0
50
+ for p in posts:
51
+ try:
52
+ created = float(p.get("created_utc") or 0)
53
+ except (TypeError, ValueError):
54
+ created = 0.0
55
+ if created and created >= cutoff:
56
+ p["_source_tier"] = tier
57
+ kept.append(p)
58
+ else:
59
+ dropped += 1
60
+ if dropped:
61
+ logger.info(
62
+ "reddit_proxy: dropped %d/%d stale post(s) from %s tier for r/%s "
63
+ "(freshness ceiling=%.1fd)",
64
+ dropped, len(posts), tier, subreddit, max_age / 86400.0,
65
+ )
66
+ return kept
67
+
11
68
  def _get_proxy_config() -> Dict[str, str]:
12
69
  """Load proxy config from private secrets or environment.
13
70
 
@@ -43,10 +100,12 @@ def _get_proxy_config() -> Dict[str, str]:
43
100
  def fetch_subreddit(subreddit: str, sort: str = "new", limit: int = 10) -> List[Dict[str, Any]]:
44
101
  """
45
102
  Fetch posts from a single subreddit with fallback chain.
46
- Returns standardized post dicts.
103
+ Returns standardized post dicts. Each post is tagged with _source_tier
104
+ indicating which fallback served it, and stale posts (older than the
105
+ freshness ceiling per LED-2068) are dropped before returning.
47
106
  """
48
107
  reddit_url = f"https://www.reddit.com/r/{subreddit}/{sort}.json?limit={limit}&raw_json=1"
49
-
108
+
50
109
  # 1. Try Local Proxy (Residential IP)
51
110
  proxy_cfg = _get_proxy_config()
52
111
  proxy_url = proxy_cfg.get("proxy_url")
@@ -62,29 +121,48 @@ def fetch_subreddit(subreddit: str, sort: str = "new", limit: int = 10) -> List[
62
121
  with urllib.request.urlopen(req, timeout=10) as resp:
63
122
  body = json.loads(resp.read().decode())
64
123
  children = body.get("data", {}).get("children", [])
65
- return [c.get("data", {}) for c in children if c.get("data")]
124
+ raw = [c.get("data", {}) for c in children if c.get("data")]
125
+ kept = _stamp_and_filter(raw, TIER_PROXY, subreddit)
126
+ if kept:
127
+ return kept
128
+ # If the proxy succeeded but returned only stale data, fall
129
+ # through to next tier rather than returning empty — gives
130
+ # us a chance to find fresh data elsewhere.
66
131
  except Exception as e:
67
132
  logger.debug(f"Local proxy failed for r/{subreddit}: {e}")
68
133
 
69
- # 2. Fallback: PullPush API (Public Archive)
134
+ # 3. Try Direct (often blocked on datacenter IPs, but fast when it works
135
+ # and is the only tier currently capable of serving fresh data — PullPush
136
+ # stopped ingesting ~2025-05-19, residential proxy 403s from datacenter).
137
+ # Direct moved AHEAD of PullPush in the chain post-LED-2068 because a
138
+ # blocked direct fetch is recoverable via fallback, while a successful
139
+ # PullPush serves stale archive that pollutes downstream classifiers.
70
140
  try:
71
- pp_url = f"https://api.pullpush.io/reddit/search/submission/?subreddit={subreddit}&size={limit}&sort=desc"
72
- req = urllib.request.Request(pp_url, headers={"User-Agent": "Delimit/1.0"})
73
- with urllib.request.urlopen(req, timeout=10) as resp:
141
+ req = urllib.request.Request(reddit_url, headers={"User-Agent": "Mozilla/5.0 (Delimit)"})
142
+ with urllib.request.urlopen(req, timeout=5) as resp:
74
143
  body = json.loads(resp.read().decode())
75
- return body.get("data", [])
144
+ children = body.get("data", {}).get("children", [])
145
+ raw = [c.get("data", {}) for c in children if c.get("data")]
146
+ kept = _stamp_and_filter(raw, TIER_DIRECT, subreddit)
147
+ if kept:
148
+ return kept
76
149
  except Exception as e:
77
- logger.debug(f"PullPush fallback failed for r/{subreddit}: {e}")
150
+ logger.debug(f"Direct fetch failed for r/{subreddit}: {e}")
78
151
 
79
- # 3. Fallback: Direct (Often blocked on servers)
152
+ # 2. Last-resort: PullPush archive. Currently stale (May 2025 ceiling)
153
+ # but the freshness filter will drop everything if so — leaves the door
154
+ # open for the day PullPush resumes ingesting fresh data.
80
155
  try:
81
- req = urllib.request.Request(reddit_url, headers={"User-Agent": "Mozilla/5.0 (Delimit)"})
82
- with urllib.request.urlopen(req, timeout=5) as resp:
156
+ pp_url = f"https://api.pullpush.io/reddit/search/submission/?subreddit={subreddit}&size={limit}&sort=desc"
157
+ req = urllib.request.Request(pp_url, headers={"User-Agent": "Delimit/1.0"})
158
+ with urllib.request.urlopen(req, timeout=10) as resp:
83
159
  body = json.loads(resp.read().decode())
84
- children = body.get("data", {}).get("children", [])
85
- return [c.get("data", {}) for c in children if c.get("data")]
160
+ raw = body.get("data", []) or []
161
+ kept = _stamp_and_filter(raw, TIER_PULLPUSH, subreddit)
162
+ if kept:
163
+ return kept
86
164
  except Exception as e:
87
- logger.warning(f"Direct fetch failed for r/{subreddit}: {e}")
165
+ logger.debug(f"PullPush fallback failed for r/{subreddit}: {e}")
88
166
 
89
167
  return []
90
168
 
@@ -96,7 +96,34 @@ _PAIN_TO_RELEVANCE: Dict[str, str] = {
96
96
  "cost": "new_opportunity", # pricing transparency / cost tracking
97
97
  }
98
98
 
99
- PROXY_URL = "http://127.0.0.1:4819/reddit-fetch"
99
+ def _load_proxy_url() -> str:
100
+ """Load proxy URL from the canonical reddit-proxy.json secrets file.
101
+
102
+ Single source of truth shared with ai.reddit_proxy. Falls back to the
103
+ canonical SSH-tunnel localhost endpoint if the secrets file is missing.
104
+
105
+ LED-2068b note: the residential proxy is reached via an SSH local-port-
106
+ forward — `127.0.0.1:4819/reddit-fetch` is the LOCAL endpoint of the
107
+ tunnel into the *actual* residential machine that performs the Reddit
108
+ fetch. There is also a local Python wrapper at `:8787/fetch` (systemd
109
+ unit `delimit-reddit-proxy.service`) — that one runs on this datacenter
110
+ VM and gets 403 from Reddit's anti-bot wall, so it serves nothing
111
+ useful. Do not change the default away from 4819 without first
112
+ confirming the SSH tunnel is no longer the canonical path.
113
+ """
114
+ try:
115
+ secrets_path = Path.home() / ".delimit" / "secrets" / "reddit-proxy.json"
116
+ if secrets_path.exists():
117
+ data = json.loads(secrets_path.read_text())
118
+ url = (data.get("proxy_url") or "").strip()
119
+ if url:
120
+ return url
121
+ except Exception:
122
+ pass
123
+ return "http://127.0.0.1:4819/reddit-fetch"
124
+
125
+
126
+ PROXY_URL = _load_proxy_url()
100
127
  SCANS_DIR = Path.home() / ".delimit" / "reddit_scans"
101
128
  VENTURES_CONFIG_PATH = Path.home() / ".delimit" / "social_target_ventures.json"
102
129
 
@@ -143,29 +170,20 @@ def _fetch_subreddit(
143
170
  The proxy endpoint expects a query parameter ``url`` containing the
144
171
  actual Reddit JSON URL. Returns a list of extracted post dicts.
145
172
  """
146
- reddit_url = f"https://www.reddit.com/r/{subreddit}/{sort}.json?limit={limit}&raw_json=1"
147
- fetch_url = f"{proxy_url}?url={urllib.request.quote(reddit_url, safe='')}"
148
-
149
- req = urllib.request.Request(
150
- fetch_url,
151
- headers={"User-Agent": "delimit-scanner/1.0", "Accept": "application/json"},
152
- )
153
-
173
+ # Delegate to ai.reddit_proxy.fetch_subreddit which has the canonical
174
+ # 3-tier fallback chain (residential proxy → direct → PullPush archive).
175
+ # Datacenter IPs get 403 from Reddit even with auth; the freshness filter
176
+ # in reddit_proxy drops stale-archive results so the scanner returns
177
+ # honest empty rather than fake old data.
178
+ from ai.reddit_proxy import fetch_subreddit as _proxy_fetch
154
179
  try:
155
- with urllib.request.urlopen(req, timeout=15) as resp:
156
- body = json.loads(resp.read().decode())
180
+ raw = _proxy_fetch(subreddit, sort=sort, limit=limit) or []
157
181
  except Exception as exc:
158
182
  logger.warning("Failed to fetch r/%s: %s", subreddit, exc)
159
183
  return []
160
184
 
161
- # Reddit returns {"data": {"children": [...]}}
162
- children = []
163
- if isinstance(body, dict):
164
- children = body.get("data", {}).get("children", [])
165
-
166
185
  posts: List[Dict[str, Any]] = []
167
- for child in children:
168
- d = child.get("data", {})
186
+ for d in raw:
169
187
  if not d:
170
188
  continue
171
189
  # Skip stickied