delimit-cli 4.1.53 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,24 +9,35 @@ from typing import Any, Dict, List, Optional
9
9
  logger = logging.getLogger("delimit.ai.reddit_proxy")
10
10
 
11
11
def _get_proxy_config() -> Dict[str, str]:
    """Load the Reddit proxy URL and bearer token.

    Sources, highest precedence first:

    1. Environment variables ``DELIMIT_REDDIT_PROXY`` and
       ``DELIMIT_REDDIT_PROXY_TOKEN``.
    2. ``~/.delimit/secrets/reddit-proxy.json`` (keys ``proxy_url`` and
       ``token``).

    Each field is resolved independently: a URL supplied through the
    environment can be paired with a token stored in the secrets file, and
    an environment token is never overridden by the file.

    The server-side proxy requires a bearer token (LED-988 follow-up).
    Clients without a token still populate ``proxy_url`` but will fail auth
    at the server unless the server is run without a token (not
    recommended).

    Returns:
        ``{"proxy_url": ..., "token": ...}``; a field is ``""`` when no
        source provides it.
    """
    config = {
        "proxy_url": os.environ.get("DELIMIT_REDDIT_PROXY") or "",
        "token": os.environ.get("DELIMIT_REDDIT_PROXY_TOKEN") or "",
    }
    # Nothing left to fill in, so skip the file read entirely.
    if config["proxy_url"] and config["token"]:
        return config

    secrets_path = Path.home() / ".delimit" / "secrets" / "reddit-proxy.json"
    if secrets_path.exists():
        try:
            secrets = json.loads(secrets_path.read_text())
        except (OSError, ValueError) as e:
            # Best effort: an unreadable or malformed file must not break
            # callers, which fall back to whatever the environment gave us.
            logger.debug(f"Failed to load reddit-proxy secrets: {e}")
        else:
            if isinstance(secrets, dict):
                for key in ("proxy_url", "token"):
                    # Only fill gaps; environment values win.
                    config[key] = config[key] or secrets.get(key) or ""

    return config
31
42
 
32
43
  def fetch_subreddit(subreddit: str, sort: str = "new", limit: int = 10) -> List[Dict[str, Any]]:
@@ -42,7 +53,11 @@ def fetch_subreddit(subreddit: str, sort: str = "new", limit: int = 10) -> List[
42
53
  if proxy_url:
43
54
  try:
44
55
  fetch_url = f"{proxy_url}?url={urllib.parse.quote(reddit_url, safe='')}"
45
- req = urllib.request.Request(fetch_url, headers={"User-Agent": "Delimit/1.0"})
56
+ headers = {"User-Agent": "Delimit/1.0"}
57
+ token = proxy_cfg.get("token", "")
58
+ if token:
59
+ headers["Authorization"] = f"Bearer {token}"
60
+ req = urllib.request.Request(fetch_url, headers=headers)
46
61
  with urllib.request.urlopen(req, timeout=10) as resp:
47
62
  body = json.loads(resp.read().decode())
48
63
  children = body.get("data", {}).get("children", [])
@@ -84,7 +99,11 @@ def fetch_thread(thread_id: str) -> Optional[Dict[str, Any]]:
84
99
  if proxy_url:
85
100
  try:
86
101
  fetch_url = f"{proxy_url}?url={urllib.parse.quote(reddit_url, safe='')}"
87
- req = urllib.request.Request(fetch_url, headers={"User-Agent": "Delimit/1.0"})
102
+ headers = {"User-Agent": "Delimit/1.0"}
103
+ token = proxy_cfg.get("token", "")
104
+ if token:
105
+ headers["Authorization"] = f"Bearer {token}"
106
+ req = urllib.request.Request(fetch_url, headers=headers)
88
107
  with urllib.request.urlopen(req, timeout=10) as resp:
89
108
  data = json.loads(resp.read().decode())
90
109
  if isinstance(data, list) and len(data) > 0:
@@ -0,0 +1,35 @@
1
+ """Signal sensing layer (LED-877).
2
+
3
+ Physically separates observational signals from the ledger. Signals are a
4
+ deliberation corpus, not a task queue — they must never be pulled by
5
+ build_loop as work. Import from ai.sensing.signal_store for ingest/query.
6
+ """
7
+
8
+ from ai.sensing.schema import Signal, ValidationError, normalize_url, fingerprint_of
9
+ from ai.sensing.signal_store import (
10
+ ingest,
11
+ query,
12
+ dedup_check,
13
+ age_out_to_warm,
14
+ freeze_cold,
15
+ promote_to_ledger,
16
+ SIGNALS_DIR,
17
+ HOT_WINDOW_DAYS,
18
+ WARM_WINDOW_DAYS,
19
+ )
20
+
21
+ __all__ = [
22
+ "Signal",
23
+ "ValidationError",
24
+ "normalize_url",
25
+ "fingerprint_of",
26
+ "ingest",
27
+ "query",
28
+ "dedup_check",
29
+ "age_out_to_warm",
30
+ "freeze_cold",
31
+ "promote_to_ledger",
32
+ "SIGNALS_DIR",
33
+ "HOT_WINDOW_DAYS",
34
+ "WARM_WINDOW_DAYS",
35
+ ]
@@ -0,0 +1,107 @@
1
+ """Signal schema + validation (LED-877).
2
+
3
+ A signal is an observation, not a commitment. Schema enforces enough metadata
4
+ for deliberation to work with, rejects empty-identity rows at ingest (killing
5
+ the LED-876 ghost-engage-task class of bug at its source).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import hashlib
11
+ import re
12
+ from dataclasses import dataclass, field, asdict
13
+ from typing import Any, Dict, List, Optional
14
+ from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode
15
+
16
+
17
class ValidationError(ValueError):
    """Signal rejected at ingest because it does not satisfy the schema."""
19
+
20
+
21
+ _UTM_RE = re.compile(r"^utm_")
22
+
23
+
24
def normalize_url(url: str) -> str:
    """Canonicalize a URL for storage and dedup.

    Lower-cases scheme and host, strips ``utm_*`` tracking parameters
    (matched case-insensitively), drops the fragment and path parameters,
    and removes trailing slashes from the path (an empty path becomes
    ``/``). All other query parameters are kept in their original order,
    including ones with blank values.

    Args:
        url: Raw URL; surrounding whitespace is ignored.

    Returns:
        The canonical URL. Empty input yields ``""``. Input that cannot be
        parsed, or that has no scheme, is returned stripped but otherwise
        untouched.
    """
    if not url:
        return ""
    stripped = url.strip()
    try:
        parts = urlparse(stripped)
    except ValueError:
        # urlparse signals malformed input (e.g. a broken IPv6 literal)
        # with ValueError; keep the text rather than lose the signal.
        return stripped
    if not parts.scheme:
        return stripped
    kept = [
        (key, value)
        # keep_blank_values: "?q=" is a real parameter, not noise to drop.
        for key, value in parse_qsl(parts.query, keep_blank_values=True)
        if not key.lower().startswith("utm_")
    ]
    path = parts.path.rstrip("/") or "/"
    return urlunparse(
        (parts.scheme.lower(), parts.netloc.lower(), path, "", urlencode(kept), "")
    )
40
+
41
+
42
def fingerprint_of(platform: str, canonical_url: str, author: str) -> str:
    """Return the stable dedup key for a signal.

    The key is the first 16 hex characters of the SHA-256 of
    ``platform|url|author``, where platform and author are lower-cased
    (``None`` treated as empty) and the URL is passed through
    ``normalize_url``.
    """
    components = (
        (platform or "").lower(),
        normalize_url(canonical_url),
        (author or "").lower(),
    )
    hasher = hashlib.sha256("|".join(components).encode("utf-8"))
    return hasher.hexdigest()[:16]
46
+
47
+
48
@dataclass
class Signal:
    """One observation sensed on an external platform.

    A usable signal needs a canonical_url plus at least one of author or
    content_snippet; rows weaker than that are rejected at ingest because
    deliberation cannot draw conclusions from a row with no identity.
    The dataclass itself does not enforce this; validate_and_normalize does.
    """

    # Identity
    fingerprint: str  # dedup key, see fingerprint_of
    platform: str
    canonical_url: str
    author: str = ""
    author_handle: str = ""
    # Content and timing
    content_snippet: str = ""  # validate_and_normalize caps this at 500 chars
    posted_at: str = ""
    ingested_at: str = ""  # filled by signal_store.ingest
    # Classification
    classification: str = "signal"
    relevance_score: float = 0.0
    themes: List[str] = field(default_factory=list)
    # Provenance
    raw_ref: str = ""
    id: str = ""  # assigned by signal_store.ingest

    def to_dict(self) -> Dict[str, Any]:
        """Return the signal as a plain dict (one key per field)."""
        return asdict(self)
73
+
74
+
75
def _as_text(value: Any) -> str:
    """Coerce a raw field to a stripped string; falsy values become ''."""
    if not value:
        return ""
    return str(value).strip()


def validate_and_normalize(raw: Dict[str, Any]) -> Signal:
    """Convert a raw target dict from social_target.py into a validated Signal.

    Field fallbacks: ``canonical_url`` falls back to ``url``,
    ``content_snippet`` to ``title`` (capped at 500 characters), and
    ``raw_ref`` to ``source_url`` and then to the canonical URL.

    Args:
        raw: Untrusted mapping of field name to value. Non-string values in
            text fields are converted with ``str()``.

    Returns:
        A Signal with ``fingerprint`` computed and ``ingested_at`` / ``id``
        left empty for signal_store.ingest to fill in.

    Raises:
        ValidationError: a mandatory field is missing, ``relevance_score``
            is not numeric, or ``themes`` is not a list-like value. Every
            malformed-input failure surfaces as this one type, so bugs show
            up loudly at ingest rather than producing empty-identity rows
            that pollute the corpus (the LED-876 failure mode).
    """
    platform = _as_text(raw.get("platform"))
    canonical_url = normalize_url(
        _as_text(raw.get("canonical_url")) or _as_text(raw.get("url"))
    )
    author = _as_text(raw.get("author"))
    content_snippet = (
        _as_text(raw.get("content_snippet")) or _as_text(raw.get("title"))
    )[:500]

    if not canonical_url:
        raise ValidationError("canonical_url is required")
    if not author and not content_snippet:
        raise ValidationError("at least one of author or content_snippet is required")
    if not platform:
        raise ValidationError("platform is required")

    raw_score = raw.get("relevance_score")
    try:
        relevance_score = float(raw_score or 0.0)
    except (TypeError, ValueError) as exc:
        raise ValidationError(
            f"relevance_score must be numeric, got {raw_score!r}"
        ) from exc

    raw_themes = raw.get("themes") or []
    if isinstance(raw_themes, str):
        # A bare string is one theme, not a sequence of characters.
        raw_themes = [raw_themes]
    try:
        themes = list(raw_themes)
    except TypeError as exc:
        raise ValidationError(f"themes must be a list, got {raw_themes!r}") from exc

    return Signal(
        fingerprint=fingerprint_of(platform, canonical_url, author),
        platform=platform,
        canonical_url=canonical_url,
        author=author,
        author_handle=_as_text(raw.get("author_handle")),
        content_snippet=content_snippet,
        posted_at=_as_text(raw.get("posted_at")),
        ingested_at="",  # filled by signal_store.ingest
        classification=_as_text(raw.get("classification")) or "signal",
        relevance_score=relevance_score,
        themes=themes,
        raw_ref=(
            _as_text(raw.get("raw_ref"))
            or _as_text(raw.get("source_url"))
            or canonical_url
        ),
    )
@@ -0,0 +1,348 @@
1
+ """Signal store (LED-877).
2
+
3
+ Physically separated from the ledger. Daily shards, append-only. Reuses
4
+ ~/.delimit/intel/ as the parent directory so intel_* tooling can already
5
+ query it via intel_dataset_list.
6
+
7
+ Consumers: delimit sense CLI, delimit_signals_query MCP tool (future).
8
+ NOT a consumer: build_loop, agent_dispatch, ledger_manager.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ import os
15
+ import time
16
+ import uuid
17
+ from collections import Counter, defaultdict
18
+ from datetime import datetime, timedelta, timezone
19
+ from pathlib import Path
20
+ from typing import Any, Dict, Iterable, List, Optional
21
+
22
+ from ai.sensing.schema import (
23
+ Signal,
24
+ ValidationError,
25
+ fingerprint_of,
26
+ normalize_url,
27
+ validate_and_normalize,
28
+ )
29
+
30
+
31
class StorageError(RuntimeError):
    """A signal could not be written to persistent storage."""
33
+
34
+
35
+ INTEL_DIR = Path.home() / ".delimit" / "intel"
36
+ SIGNALS_DIR = INTEL_DIR / "signals"
37
+ ARCHIVE_DIR = SIGNALS_DIR / "archive"
38
+ DEDUP_INDEX_PATH = SIGNALS_DIR / "_dedup_index.json"
39
+
40
+ HOT_WINDOW_DAYS = 7
41
+ WARM_WINDOW_DAYS = 30
42
+ MAX_SIGNALS_PER_AUTHOR_PER_DAY = 3
43
+
44
+
45
def _now() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(tz=timezone.utc)
47
+
48
+
49
def _today_shard_path(when: Optional[datetime] = None) -> Path:
    """Return the daily shard path (``YYYY-MM-DD.jsonl``) for ``when``.

    ``when`` defaults to the current UTC time.
    """
    moment = when if when else _now()
    shard_name = moment.date().isoformat() + ".jsonl"
    return SIGNALS_DIR / shard_name
52
+
53
+
54
def _ensure_dirs() -> None:
    """Create the signals and archive directories if they do not exist."""
    for directory in (SIGNALS_DIR, ARCHIVE_DIR):
        directory.mkdir(parents=True, exist_ok=True)
57
+
58
+
59
def _load_dedup_index() -> Dict[str, Dict[str, Any]]:
    """Load the fingerprint -> entry dedup index from disk.

    Returns:
        The index mapping, or ``{}`` when the file is missing, unreadable,
        not valid JSON/text, or does not contain a JSON object. Callers use
        ``.get`` on the result, so anything but a dict is treated as empty.
    """
    try:
        loaded = json.loads(DEDUP_INDEX_PATH.read_text())
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers
        # both JSONDecodeError and UnicodeDecodeError.
        return {}
    return loaded if isinstance(loaded, dict) else {}
66
+
67
+
68
def _save_dedup_index(index: Dict[str, Dict[str, Any]]) -> None:
    """Persist the dedup index.

    The JSON is written to a sibling ``.tmp`` file which then replaces the
    real index, so the index file is swapped in a single rename step.
    """
    _ensure_dirs()
    staging_path = DEDUP_INDEX_PATH.with_suffix(".tmp")
    serialized = json.dumps(index)
    staging_path.write_text(serialized)
    staging_path.replace(DEDUP_INDEX_PATH)
73
+
74
+
75
def dedup_check(fingerprint: str, window_days: int = WARM_WINDOW_DAYS) -> bool:
    """Return True if this fingerprint was ingested within ``window_days``.

    Args:
        fingerprint: Dedup key as produced by ``fingerprint_of``.
        window_days: Look-back window in days.

    Returns:
        False for an empty fingerprint, an unknown fingerprint, or an index
        entry whose ``ingested_at`` is missing or unparsable; otherwise
        whether the recorded ingest time falls inside the window.
    """
    if not fingerprint:
        return False
    entry = _load_dedup_index().get(fingerprint)
    if not isinstance(entry, dict):
        return False
    try:
        ingested = datetime.fromisoformat(entry.get("ingested_at") or "")
    except (TypeError, ValueError):
        # TypeError: the stored value is not a string.
        return False
    if ingested.tzinfo is None:
        # ingest() writes aware UTC timestamps; treat a naive one as UTC so
        # the comparison below cannot raise on mixed naive/aware operands.
        ingested = ingested.replace(tzinfo=timezone.utc)
    cutoff = _now() - timedelta(days=window_days)
    return ingested >= cutoff
89
+
90
+
91
def _author_count_today(author: str) -> int:
    """Count rows in today's shard whose author matches, case-insensitively.

    Returns 0 for an empty author, a missing shard, or a shard that cannot
    be read. Blank lines and lines that are not valid JSON are skipped.
    """
    if not author:
        return 0
    shard = _today_shard_path()
    if not shard.exists():
        return 0
    try:
        lines = shard.read_text().splitlines()
    except OSError:
        return 0

    wanted = author.lower()
    matches = 0
    for raw_line in lines:
        if not raw_line.strip():
            continue
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError:
            continue
        if (record.get("author") or "").lower() == wanted:
            matches += 1
    return matches
112
+
113
+
114
def ingest(raw: Dict[str, Any]) -> Dict[str, Any]:
    """Validate a raw target dict and append it to today's shard.

    Outcomes, by the ``status`` key of the returned dict:

    * ``"deduped"`` - the fingerprint is already in the dedup window; only
      the index entry's ``hit_count`` / ``last_seen_at`` are updated.
    * ``"rate_limited"`` - the author already has
      MAX_SIGNALS_PER_AUTHOR_PER_DAY rows in today's shard.
    * ``"ingested"`` - the signal was stored; the dict carries the stored
      signal (with ``id`` and ``ingested_at`` assigned) and the shard name.

    Raises:
        ValidationError: schema violation (caller decides whether to
            log-and-skip or propagate).
        StorageError: the shard file could not be appended to.
    """
    signal = validate_and_normalize(raw)
    fp = signal.fingerprint

    # Already seen recently: bump the counters, store nothing.
    if dedup_check(fp):
        dedup_index = _load_dedup_index()
        seen = dedup_index.get(fp, {})
        seen["hit_count"] = int(seen.get("hit_count", 1)) + 1
        seen["last_seen_at"] = _now().isoformat()
        dedup_index[fp] = seen
        _save_dedup_index(dedup_index)
        return {"status": "deduped", "fingerprint": fp, "hit_count": seen["hit_count"]}

    # Per-author daily cap.
    if _author_count_today(signal.author) >= MAX_SIGNALS_PER_AUTHOR_PER_DAY:
        return {
            "status": "rate_limited",
            "author": signal.author,
            "limit": MAX_SIGNALS_PER_AUTHOR_PER_DAY,
        }

    _ensure_dirs()
    stamp = _now()
    signal.ingested_at = stamp.isoformat()
    signal.id = f"SIG-{uuid.uuid4().hex[:10].upper()}"

    shard = _today_shard_path(stamp)
    try:
        with shard.open("a") as handle:
            handle.write(json.dumps(signal.to_dict()) + "\n")
    except OSError as exc:
        raise StorageError(f"failed to write signal shard {shard}: {exc}") from exc

    # Record the fingerprint only after the row is safely in the shard.
    dedup_index = _load_dedup_index()
    dedup_index[fp] = {
        "id": signal.id,
        "ingested_at": signal.ingested_at,
        "hit_count": 1,
        "shard": shard.name,
    }
    _save_dedup_index(dedup_index)

    return {"status": "ingested", "signal": signal.to_dict(), "shard": shard.name}
165
+
166
+
167
def _iter_shards(since_days: int = HOT_WINDOW_DAYS) -> Iterable[Path]:
    """Return daily shard paths dated within ``since_days``, newest first.

    Only ``*.jsonl`` files directly inside SIGNALS_DIR are considered.
    Files whose name starts with ``_`` or whose stem is not an ISO date are
    ignored. The cutoff is date-based: a shard is kept when its date is on
    or after (now - since_days).
    """
    if not SIGNALS_DIR.exists():
        return []
    oldest_allowed = (_now() - timedelta(days=since_days)).date()

    selected = []
    for candidate in SIGNALS_DIR.glob("*.jsonl"):
        if candidate.name.startswith("_"):
            continue
        try:
            candidate_date = datetime.fromisoformat(candidate.stem).date()
        except ValueError:
            continue
        if candidate_date < oldest_allowed:
            continue
        selected.append(candidate)
    selected.sort(reverse=True)
    return selected
182
+
183
+
184
def query(
    since_days: int = 1,
    platform: str = "",
    limit: int = 50,
) -> List[Dict[str, Any]]:
    """Return recent signals, newest first.

    Shards are visited newest day first, and rows inside a shard are read
    from the end of the file (shards are append-only, so the last line is
    the most recent). ``limit`` therefore keeps the newest rows.

    Args:
        since_days: Day-granular look-back; shards dated on or after
            (now - since_days) are read, so the default of 1 covers today
            and yesterday (the default `delimit sense` view).
        platform: Case-insensitive platform filter; empty means all.
        limit: Maximum number of rows; zero or negative yields ``[]``.

    Returns:
        A list of signal dicts. Blank lines, lines that are not valid JSON,
        JSON values that are not objects, and unreadable shards are skipped.
    """
    rows: List[Dict[str, Any]] = []
    if limit <= 0:
        return rows
    want_platform = (platform or "").strip().lower()
    for shard in _iter_shards(since_days):
        try:
            lines = shard.read_text().splitlines()
        except OSError:
            continue
        for line in reversed(lines):
            if not line.strip():
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(row, dict):
                continue
            if want_platform and (row.get("platform") or "").lower() != want_platform:
                continue
            rows.append(row)
            if len(rows) >= limit:
                return rows
    return rows
213
+
214
+
215
def age_out_to_warm(days: int = HOT_WINDOW_DAYS) -> int:
    """Report how many shards are older than ``days``; nothing is moved.

    Hot/warm separation is a query boundary, not a physical move: all
    shards stay in SIGNALS_DIR and query()'s since_days filter enforces the
    hot window. Files starting with ``_`` or without an ISO-date stem are
    not counted.
    """
    if not SIGNALS_DIR.exists():
        return 0
    boundary = (_now() - timedelta(days=days)).date()
    stale = 0
    for shard in SIGNALS_DIR.glob("*.jsonl"):
        if shard.name.startswith("_"):
            continue
        try:
            is_stale = datetime.fromisoformat(shard.stem).date() < boundary
        except ValueError:
            continue
        if is_stale:
            stale += 1
    return stale
236
+
237
+
238
def freeze_cold(month: str) -> str:
    """Move all daily shards of ``month`` into ``ARCHIVE_DIR/{month}.jsonl``.

    Shards are appended to the archive in date order and deleted once their
    content has been flushed to the archive file. A shard that cannot be
    read or written is left in place. Reruns append to the existing
    archive; note that a shard whose deletion failed will be appended again
    on the next run.

    Args:
        month: Month to archive, strictly ``YYYY-MM`` (ASCII digits, month
            01-12). Strictness matters because the value is used in a glob
            pattern and in the archive file name.

    Returns:
        The archive path as a string.

    Raises:
        ValidationError: ``month`` is not a valid ``YYYY-MM`` value.
    """
    well_formed = (
        isinstance(month, str)
        and len(month) == 7
        and month[4] == "-"
        and month.isascii()
        and month[:4].isdigit()
        and month[5:].isdigit()
        and 1 <= int(month[5:]) <= 12
    )
    if not well_formed:
        raise ValidationError(f"month must be YYYY-MM, got {month!r}")
    _ensure_dirs()
    archive_path = ARCHIVE_DIR / f"{month}.jsonl"
    with archive_path.open("a") as out:
        for path in sorted(SIGNALS_DIR.glob(f"{month}-*.jsonl")):
            try:
                content = path.read_text()
                if content and not content.endswith("\n"):
                    # Keep the next shard's first row on its own line.
                    content += "\n"
                out.write(content)
                # Hand the data to the OS before the source is deleted.
                out.flush()
            except OSError:
                continue
            try:
                path.unlink()
            except OSError:
                # Best effort: the rows are archived; the shard stays behind.
                pass
    return str(archive_path)
260
+
261
+
262
def promote_to_ledger(
    signal_id: str,
    ledger: str = "ops",
    priority: str = "P2",
    extra_tags: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Explicit founder-initiated promotion of a signal to a ledger item.

    This is the ONLY path from intel -> ledger. The item is written via
    ``ai.ledger_manager.add_item`` with source ``promoted_signal:<id>``.

    Args:
        signal_id: Id of a stored signal (``SIG-...``); looked up in shards
            within the warm window.
        ledger: Target ledger name.
        priority: Priority label for the new item.
        extra_tags: Additional tags appended after the default ones.

    Returns:
        Whatever ``add_item`` returns.

    Raises:
        ValidationError: no signal with ``signal_id`` was found.
    """
    signal = _find_signal(signal_id)
    if not signal:
        raise ValidationError(f"signal {signal_id} not found in hot shards")

    from ai.ledger_manager import add_item

    title = f"[{(signal.get('platform') or 'signal').upper()}] Promoted: {signal.get('author') or signal.get('canonical_url')}"
    description = (
        f"Promoted from intel store (signal {signal_id}).\n"
        f"URL: {signal.get('canonical_url', '')}\n"
        f"Author: {signal.get('author', '')}\n"
        f"Snippet: {(signal.get('content_snippet') or '')[:400]}\n"
        f"Posted: {signal.get('posted_at', '')}\n"
        f"Fingerprint: {signal.get('fingerprint', '')}"
    )
    tags = ["promoted-signal"]
    platform_tag = signal.get("platform")
    if platform_tag:
        # Avoid emitting an empty tag when the row has no platform.
        tags.append(platform_tag)
    if extra_tags:
        tags.extend(extra_tags)

    # NOTE(review): presumably the ledger guard reads this variable to let
    # promoted signals past the social_scan rejection - confirm against
    # ledger_manager. A value already set by the caller is kept for the call
    # and restored afterwards instead of being deleted.
    env_key = "_DELIMIT_SIGNAL_PROMOTED_BY"
    previous = os.environ.get(env_key)
    os.environ.setdefault(env_key, "founder")
    try:
        result = add_item(
            title=title,
            ledger=ledger,
            type="task",
            priority=priority,
            description=description,
            source=f"promoted_signal:{signal_id}",
            tags=tags,
            context=f"Promoted from signal {signal_id} for strategic action.",
        )
    finally:
        if previous is None:
            os.environ.pop(env_key, None)
        else:
            os.environ[env_key] = previous
    return result
309
+
310
+
311
def _find_signal(signal_id: str) -> Optional[Dict[str, Any]]:
    """Return the stored row whose ``id`` equals ``signal_id``, or None.

    Searches shards within WARM_WINDOW_DAYS, newest shard first. Unreadable
    shards, blank lines and lines that are not valid JSON are skipped.
    """
    if not signal_id:
        return None
    for shard in _iter_shards(since_days=WARM_WINDOW_DAYS):
        try:
            lines = shard.read_text().splitlines()
        except OSError:
            continue
        for text in lines:
            if not text.strip():
                continue
            try:
                record = json.loads(text)
            except json.JSONDecodeError:
                continue
            if record.get("id") == signal_id:
                return record
    return None
328
+
329
+
330
def digest(since_days: int = HOT_WINDOW_DAYS, top_n: int = 20) -> Dict[str, Any]:
    """Summarize recent signals by platform, author and theme.

    Args:
        since_days: Look-back window passed to ``query``.
        top_n: How many authors and themes to report.

    Returns:
        A dict with ``window_days``, ``total_signals`` (at most 1000 rows
        are read), ``top_platforms`` (up to 10), ``top_authors`` and
        ``top_themes`` as ``(value, count)`` pairs, and ``sample`` (the
        first five rows). Rows with a missing or empty platform/author are
        counted under ``"?"``.
    """
    rows = query(since_days=since_days, limit=1000)
    by_platform: Counter[str] = Counter()
    by_author: Counter[str] = Counter()
    by_theme: Counter[str] = Counter()
    for row in rows:
        # Stored rows always carry these keys, possibly empty, so a .get()
        # default alone would never apply; fall back on falsy values too.
        by_platform[row.get("platform") or "?"] += 1
        by_author[row.get("author") or "?"] += 1
        for theme in row.get("themes") or []:
            by_theme[theme] += 1
    return {
        "window_days": since_days,
        "total_signals": len(rows),
        "top_platforms": by_platform.most_common(10),
        "top_authors": by_author.most_common(top_n),
        "top_themes": by_theme.most_common(top_n),
        "sample": rows[:5],
    }