metatron-cli 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metatron/dedup.py ADDED
@@ -0,0 +1,210 @@
1
+ """Deduplication pipeline.
2
+
3
+ Two phases per refresh:
4
+
5
+ Phase A (cheap, per-item):
6
+ - L1: exact canonical-URL match → drop as duplicate.
7
+ - L2: normalized-title match (lowercase, punctuation stripped, common
8
+ newsroom prefixes removed) → attach to that existing article's cluster.
9
+
10
+ Phase B (batched, one LLM call):
11
+ - For every candidate that survived A, pair it with the recent-history
12
+ articles whose token overlap exceeds ``llm_threshold``.
13
+ - Send the union of (candidates + their candidate-history peers) to
14
+ Claude in a single batched prompt asking "which of these report the
15
+ same story?".
16
+ - Apply the returned clusters: each candidate either joins an
17
+ existing cluster (matched against history) or starts a new one
18
+ with another candidate.
19
+
20
+ This collapses dozens of per-pair CLI calls (each with ~5–10 s overhead)
21
+ into one prompt, while still giving the model rich pairwise context to
22
+ work with.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import hashlib
28
+ import logging
29
+ from dataclasses import dataclass
30
+ from typing import Any, Iterable
31
+
32
+ from metatron.llm import BatchJudge, ClusterItem
33
+ from metatron.normalize import canonicalize_url, jaccard, normalize_title, tokenize
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
@dataclass
class DedupConfig:
    """Tunable knobs for the batched (Phase B) dedup step."""

    # Minimum token overlap for a history pairing to be sent to the LLM.
    llm_threshold: float = 0.12
    # Cap on the candidate-history peers added per candidate.
    history_pair_cap: int = 5
42
+
43
+
44
@dataclass
class CheapDecision:
    """Outcome of the Phase A (cheap-layer) check for a single candidate."""

    # One of "duplicate", "cluster" or "candidate".
    status: str
    # Id of the history article that matched, when there was a match.
    canonical_article_id: str | None = None
    # Cluster id carried over from the matched history article, if any.
    cluster_id: str | None = None
    # Short human-readable explanation of the decision.
    reason: str = ""
52
+
53
+
54
def cheap_decide(
    candidate: dict[str, Any], history: list[dict[str, Any]]
) -> CheapDecision:
    """Classify one candidate using only the cheap Phase A layers.

    The URL layer is checked against the whole history before the title
    layer is consulted, so a URL hit always wins over a title hit.

    Returns:
        - status="duplicate": a history row has the same canonical URL.
        - status="cluster": a history row has the same normalized title.
        - status="candidate": neither layer matched; Phase B must decide.
    """
    url = candidate.get("canonical_url")
    if not url:
        url = canonicalize_url(candidate.get("source_url", ""))
    title_key = normalize_title(candidate.get("title", ""))

    # L1: exact canonical-URL match. Rows without a canonical URL never match.
    url_hit = next(
        (
            row
            for row in history
            if row.get("canonical_url") and row["canonical_url"] == url
        ),
        None,
    )
    if url_hit is not None:
        return CheapDecision(
            status="duplicate",
            canonical_article_id=url_hit["id"],
            cluster_id=url_hit.get("cluster_id"),
            reason="canonical url match",
        )

    # L2: normalized-title match. Skipped entirely when the candidate's
    # normalized title is empty, so blank titles never cluster together.
    if title_key:
        for row in history:
            if normalize_title(row.get("title", "")) == title_key:
                return CheapDecision(
                    status="cluster",
                    canonical_article_id=row["id"],
                    cluster_id=row.get("cluster_id"),
                    reason="normalized title match",
                )

    return CheapDecision(status="candidate", reason="needs LLM review")
88
+
89
+
90
@dataclass
class BatchPlan:
    """Everything the single batched LLM call needs as input.

    ``items`` holds every article the model will be shown: the new
    candidates plus the history articles picked as their token-overlap
    peers. ``candidate_ids`` names the subset of those ref_ids that belong
    to new (not yet inserted) candidates.
    """

    # All articles sent to the model, candidates and history peers alike.
    items: list[ClusterItem]
    # ref_ids of the entries in ``items`` that are new candidates.
    candidate_ids: set[str]
103
+
104
+
105
def build_batch_plan(
    candidates: list[dict[str, Any]],
    history: list[dict[str, Any]],
    *,
    config: DedupConfig | None = None,
) -> BatchPlan:
    """Choose the articles that go into the one batched LLM prompt.

    All candidates are always included. Each candidate additionally pulls
    in up to ``history_pair_cap`` history rows, best overlap first, whose
    token overlap with it is at least ``llm_threshold``. A history row
    appears only once no matter how many candidates selected it.

    Side effect: every candidate dict gets its generated ref id stored
    under the ``"_ref_id"`` key.
    """
    settings = config or DedupConfig()

    selected: dict[str, ClusterItem] = {}
    new_ids: set[str] = set()

    for article in candidates:
        ref = _candidate_ref_id(article)
        article["_ref_id"] = ref
        selected[ref] = _as_cluster_item(ref, article)
        new_ids.add(ref)

    if history:
        # History rows are tokenized lazily and at most once each.
        token_cache: dict[str, set[str]] = {}

        for article in candidates:
            article_tokens = tokenize(_lead_text(article))
            if not article_tokens:
                continue

            matches: list[tuple[float, dict[str, Any]]] = []
            for row in history:
                key = row["id"]
                if key not in token_cache:
                    token_cache[key] = tokenize(_lead_text(row))
                overlap = jaccard(article_tokens, token_cache[key])
                if overlap >= settings.llm_threshold:
                    matches.append((overlap, row))

            # Stable sort: rows with equal overlap keep their history order.
            ranked = sorted(matches, key=lambda pair: pair[0], reverse=True)
            for _, row in ranked[: settings.history_pair_cap]:
                key = row["id"]
                if key not in selected:
                    selected[key] = _as_cluster_item(key, row)

    return BatchPlan(items=list(selected.values()), candidate_ids=new_ids)
156
+
157
+
158
def run_batch(
    plan: BatchPlan, judge: BatchJudge
) -> dict[str, list[str]]:
    """Run the single LLM call and return candidate_id → [matched_ref_ids].

    For every group the judge returns, each candidate in the group is
    mapped to the other ref_ids of that group. Peers may be history rows
    or other candidates; the caller tells them apart.

    Each peer is listed at most once per candidate, in first-seen order,
    even when the model repeats an ID inside a group or puts the same
    candidate into several groups.

    Returns an empty dict when the judge is disabled, when the plan holds
    fewer than two items, or when the judge reports no groups.
    """
    if not judge.enabled or len(plan.items) < 2:
        return {}

    groups = judge.cluster(plan.items)
    if not groups:
        return {}

    out: dict[str, list[str]] = {}
    for group in groups:
        for cid in group.ref_ids:
            if cid not in plan.candidate_ids:
                continue
            peers = out.setdefault(cid, [])
            for rid in group.ref_ids:
                # Skip the candidate itself and anything already recorded.
                if rid != cid and rid not in peers:
                    peers.append(rid)
    return out
182
+
183
+
184
+ # ── helpers ──────────────────────────────────────────────────────────────
185
+
186
+
187
+ def _candidate_ref_id(candidate: dict[str, Any]) -> str:
188
+ """A stable, short ID for a not-yet-inserted candidate."""
189
+ seed = (
190
+ candidate.get("canonical_url", "")
191
+ or candidate.get("source_url", "")
192
+ or candidate.get("title", "")
193
+ )
194
+ return "C-" + hashlib.sha256(seed.encode("utf-8")).hexdigest()[:10]
195
+
196
+
197
def _as_cluster_item(ref_id: str, article: dict[str, Any]) -> ClusterItem:
    """Wrap an article dict as a ``ClusterItem`` carrying ``ref_id``.

    Missing or ``None`` title/source become empty strings; the summary is
    the article's lead text.
    """
    headline = article.get("title", "") or ""
    lead = _lead_text(article)
    outlet = article.get("source", "") or ""
    return ClusterItem(ref_id=ref_id, title=headline, summary=lead, source=outlet)
204
+
205
+
206
+ def _lead_text(article: dict[str, Any]) -> str:
207
+ title = article.get("title", "") or ""
208
+ summary = article.get("summary", "") or ""
209
+ body = article.get("body", "") or ""
210
+ return f"{title}\n{summary}\n{body[:400]}".strip()
metatron/fetcher.py ADDED
@@ -0,0 +1,147 @@
1
+ """Fetch a single RSS/Atom feed and produce article candidates.
2
+
3
+ Two-stage:
4
+ 1. ``parse_feed(url)`` — grab the feed XML, parse with feedparser, return
5
+ a list of lightweight item dicts (title, summary, source url, source
6
+ name, published).
7
+ 2. ``enrich_article(item)`` — fetch the article HTML and extract clean
8
+ body text with trafilatura. Returns the item dict with ``body`` set.
9
+
10
+ Body extraction is opt-in per article (the dedup pipeline calls it only
11
+ when the cheap layers can't decide). Bandwidth-conscious.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import logging
17
+ from dataclasses import dataclass
18
+ from datetime import datetime, timezone
19
+ from typing import Any
20
+
21
+ import feedparser
22
+ import requests
23
+ import trafilatura
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
@dataclass
class FetchedItem:
    """One feed entry, optionally enriched with extracted body text."""

    title: str
    source_url: str
    summary: str
    source: str
    published: datetime | None
    body: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the item as a plain dict; ``published`` becomes an ISO string or None."""
        if self.published:
            stamp = self.published.isoformat()
        else:
            stamp = None
        return dict(
            title=self.title,
            source_url=self.source_url,
            summary=self.summary,
            source=self.source,
            published=stamp,
            body=self.body,
        )
46
+
47
+
48
class FeedFetchError(Exception):
    """Signals that a feed could not be downloaded or could not be parsed."""
50
+
51
+
52
def parse_feed(url: str, *, source_name: str | None = None, timeout: float = 30.0) -> list[FetchedItem]:
    """Download one RSS/Atom feed and turn its entries into ``FetchedItem``s.

    The source label is ``source_name`` if given, else the feed's own
    title, else the host part of ``url``. Entries without a link are
    skipped; summaries have their HTML tags removed.

    Raises:
        FeedFetchError: on any HTTP/transport error, or when the parser
            flags the feed as malformed and produced no entries.
    """
    request_headers = {"User-Agent": "metatron/0.2 (+https://github.com/anthropic/metatron)"}
    try:
        response = requests.get(url, timeout=timeout, headers=request_headers)
        response.raise_for_status()
    except requests.RequestException as exc:
        raise FeedFetchError(f"HTTP error for {url}: {exc}") from exc

    feed = feedparser.parse(response.content)
    # A "bozo" feed that still yielded entries is accepted as-is.
    if feed.bozo and not feed.entries:
        raise FeedFetchError(f"Feed parse failed for {url}: {feed.bozo_exception}")

    if hasattr(feed, "feed"):
        feed_title = feed.feed.get("title", "")
    else:
        feed_title = ""
    source = source_name or feed_title or _host_of(url)

    items: list[FetchedItem] = []
    for entry in feed.entries:
        link = entry.get("link", "").strip()
        if not link:
            continue
        headline = (entry.get("title") or "").strip()
        raw_summary = (entry.get("summary") or entry.get("description") or "").strip()
        items.append(
            FetchedItem(
                title=headline,
                source_url=link,
                summary=_strip_html(raw_summary),
                source=source,
                published=_parse_entry_date(entry),
            )
        )
    return items
89
+
90
+
91
def enrich_article(item: FetchedItem, *, timeout: float = 15.0) -> FetchedItem:
    """Fetch the article page and extract clean body text.

    Args:
        item: The feed item to enrich; mutated in place and returned.
        timeout: Download timeout in seconds. It is applied through a
            private copy of trafilatura's configuration (which stores it
            as a whole number of seconds, minimum 1), so the library-wide
            default config is left untouched.

    Returns:
        The same ``item``. ``body`` is set when extraction produced text;
        on any failure it is left as-is (likely empty).

    We don't want one bad page to poison the whole poll loop, so the only
    surfacing here is a log line — the dedup pipeline gracefully handles
    missing bodies by leaning on title/summary.
    """
    try:
        # Imported here so the block needs no new file-level imports:
        # ``copy`` is stdlib and ``trafilatura.settings`` is part of the
        # trafilatura package this module already uses.
        from copy import deepcopy

        from trafilatura.settings import DEFAULT_CONFIG

        config = deepcopy(DEFAULT_CONFIG)
        # The config is parsed with getint(), so it must be an integer string.
        config["DEFAULT"]["DOWNLOAD_TIMEOUT"] = str(max(1, int(timeout)))

        downloaded = trafilatura.fetch_url(
            item.source_url, no_ssl=False, config=config
        )
        if not downloaded:
            return item
        text = trafilatura.extract(
            downloaded,
            include_comments=False,
            include_tables=False,
            favor_precision=True,
            no_fallback=False,
        )
        if text:
            item.body = text.strip()
    except Exception as e:
        # Deliberate best-effort: one bad page must not abort the caller.
        logger.debug("body extraction failed for %s: %s", item.source_url, e)
    return item
115
+
116
+
117
+ def _parse_entry_date(entry: Any) -> datetime | None:
118
+ for key in ("published_parsed", "updated_parsed"):
119
+ tm = entry.get(key)
120
+ if tm:
121
+ try:
122
+ return datetime(*tm[:6], tzinfo=timezone.utc)
123
+ except (TypeError, ValueError):
124
+ continue
125
+ return None
126
+
127
+
128
+ _TAG_RE = None
129
+
130
+
131
+ def _strip_html(text: str) -> str:
132
+ global _TAG_RE
133
+ import re
134
+
135
+ if _TAG_RE is None:
136
+ _TAG_RE = re.compile(r"<[^>]+>")
137
+ return _TAG_RE.sub("", text).strip()
138
+
139
+
140
+ def _host_of(url: str) -> str:
141
+ from urllib.parse import urlsplit
142
+
143
+ try:
144
+ host = urlsplit(url).hostname or ""
145
+ return host[4:] if host.startswith("www.") else host
146
+ except ValueError:
147
+ return ""
metatron/llm.py ADDED
@@ -0,0 +1,270 @@
1
+ """Claude CLI wrapper for the dedup tiebreaker.
2
+
3
+ The CLI overhead (~5-10s per `claude -p` invocation) makes per-pair LLM
4
+ calls infeasible at any real scale. So we batch: one prompt per refresh
5
+ asks Claude to identify clusters across the full candidate set + recent
6
+ history. The model returns ``{"groups": [[id, id, ...], ...]}`` where
7
+ each inner list groups article IDs that report the same story.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import logging
14
+ import os
15
+ import re
16
+ import select
17
+ import shutil
18
+ import subprocess
19
+ import time
20
+ from dataclasses import dataclass
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
@dataclass
class ClusterItem:
    """A single article as presented to the batch judge.

    ``ref_id`` is the identifier the model is asked to echo back. Callers
    choose it — usually the article's database id, or a fresh hash for a
    new candidate that is not in the DB yet.
    """

    ref_id: str
    title: str
    summary: str
    source: str
38
+
39
+
40
@dataclass
class ClusterGroup:
    """A set of ref_ids the model judged to report the same story."""

    ref_ids: list[str]
43
+
44
+
45
class BatchJudge:
    """Run `claude -p` once per refresh to cluster items by same-story."""

    DEFAULT_MODEL = "sonnet"
    # Idle window: kill the subprocess only if nothing has been written to
    # stdout/stderr for this many seconds. A productive call that runs for
    # 10 minutes is fine; a stuck call that's been silent for 2 minutes is not.
    DEFAULT_IDLE_TIMEOUT = 120.0

    def __init__(
        self,
        *,
        model: str | None = None,
        binary: str | None = None,
        idle_timeout: float | None = None,
    ) -> None:
        """Store the CLI settings; any falsy argument falls back to its default."""
        self.model = model or self.DEFAULT_MODEL
        self.binary = binary or "claude"
        self.idle_timeout = idle_timeout or self.DEFAULT_IDLE_TIMEOUT

    @property
    def enabled(self) -> bool:
        """True when the configured binary can be located on PATH."""
        return shutil.which(self.binary) is not None

    def cluster(self, items: list[ClusterItem]) -> list[ClusterGroup]:
        """Return groups of ref_ids that belong to the same story.

        Items not appearing in any returned group are singletons (their own
        cluster). On any failure (CLI missing or not startable, idle stall,
        non-zero exit, parse error) returns an empty list — caller should
        treat that as "no LLM groupings learned" and rely on the cheap
        layers only.
        """
        if not items or len(items) < 2:
            return []

        prompt = _format_batch_prompt(items)
        try:
            proc = subprocess.Popen(
                [
                    self.binary,
                    "-p",
                    "--model",
                    self.model,
                    "--output-format",
                    "json",
                    prompt,
                ],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=False,  # read bytes so partial UTF-8 mid-stream is fine
            )
        except FileNotFoundError:
            logger.warning("Claude CLI not found on PATH (looked for %s)", self.binary)
            return []
        except (OSError, ValueError) as e:
            # Other start-up failures: binary not executable, argument list
            # too long, or an argument Popen rejects (e.g. an embedded NUL
            # in article text). Honour the "empty list on any failure"
            # contract instead of letting these escape to the caller.
            logger.warning("Claude CLI could not be started (%s): %s", self.binary, e)
            return []

        stdout_chunks: list[bytes] = []
        stderr_chunks: list[bytes] = []
        idle_killed = False
        last_output = time.monotonic()

        # Stream the pipes, watching for inactivity. A productive call that
        # keeps producing output (even slowly) is allowed to run as long as
        # it needs; we only intervene when both pipes have been silent for
        # longer than `idle_timeout`.
        for chunk_bytes in _stream_until_idle(
            proc, stdout_chunks, stderr_chunks, self.idle_timeout
        ):
            if chunk_bytes:
                last_output = time.monotonic()
            elapsed_idle = time.monotonic() - last_output
            if elapsed_idle > self.idle_timeout:
                logger.warning(
                    "Claude CLI batch dedup idle for %.0fs — terminating",
                    elapsed_idle,
                )
                _terminate(proc)
                idle_killed = True
                break

        # Drain any remaining buffered output once the process is done.
        try:
            tail_out, tail_err = proc.communicate(timeout=5)
        except subprocess.TimeoutExpired:
            _terminate(proc)
            tail_out, tail_err = proc.communicate()
        if tail_out:
            stdout_chunks.append(tail_out)
        if tail_err:
            stderr_chunks.append(tail_err)

        if idle_killed:
            return []
        if proc.returncode != 0:
            stderr_text = b"".join(stderr_chunks).decode("utf-8", "replace")
            logger.warning(
                "Claude CLI batch dedup exited %d: %s",
                proc.returncode,
                stderr_text[:300],
            )
            return []

        stdout_text = b"".join(stdout_chunks).decode("utf-8", "replace")
        return _parse_cli_output(stdout_text)
148
+
149
+
150
+ def _stream_until_idle(
151
+ proc: subprocess.Popen,
152
+ stdout_chunks: list[bytes],
153
+ stderr_chunks: list[bytes],
154
+ idle_timeout: float,
155
+ ):
156
+ """Yield bytes-read events while the subprocess runs.
157
+
158
+ Yields ``b""`` on each tick so the caller can re-evaluate idle elapsed,
159
+ and yields the actual bytes when a pipe produced data. Exits when the
160
+ process terminates.
161
+ """
162
+ pipes = {proc.stdout: stdout_chunks, proc.stderr: stderr_chunks}
163
+ open_fds = [p for p in pipes if p is not None]
164
+ # Make non-blocking so reads can't hang past the select tick.
165
+ for p in open_fds:
166
+ os.set_blocking(p.fileno(), False)
167
+
168
+ tick = min(1.0, max(0.1, idle_timeout / 10))
169
+ while open_fds and proc.poll() is None:
170
+ ready, _, _ = select.select(open_fds, [], [], tick)
171
+ if not ready:
172
+ yield b""
173
+ continue
174
+ produced = b""
175
+ for p in ready:
176
+ try:
177
+ chunk = p.read()
178
+ except (OSError, ValueError):
179
+ chunk = None
180
+ if not chunk:
181
+ # EOF or transient empty — drop from poll set if truly closed.
182
+ if proc.poll() is not None:
183
+ open_fds = [x for x in open_fds if x is not p]
184
+ continue
185
+ pipes[p].append(chunk)
186
+ produced += chunk
187
+ yield produced
188
+
189
+
190
+ def _terminate(proc: subprocess.Popen) -> None:
191
+ """Best-effort shutdown of the CLI subprocess."""
192
+ if proc.poll() is not None:
193
+ return
194
+ proc.terminate()
195
+ try:
196
+ proc.wait(timeout=5)
197
+ except subprocess.TimeoutExpired:
198
+ proc.kill()
199
+
200
+
201
+ _BATCH_PROMPT_HEADER = """\
202
+ You are a news deduplication arbiter. Below is a numbered list of news
203
+ articles. Identify which articles report the SAME specific story — same
204
+ event, same announcement, same incident. Articles about similar topics
205
+ but different specific stories are NOT the same.
206
+
207
+ For each group of duplicates, list the IDs of every article in that
208
+ group. Articles not in any group are unique stories on their own; you
209
+ do not need to list them.
210
+
211
+ Respond with ONLY a JSON object on a single line. The "groups" key holds
212
+ an array of arrays; each inner array contains the IDs (as strings) of
213
+ articles in one same-story cluster. If you find no duplicates, return
214
+ {"groups": []}.
215
+
216
+ Example response:
217
+ {"groups": [["A1", "B3", "C7"], ["D2", "E5"]]}
218
+
219
+ Articles:
220
+ """
221
+
222
+
223
+ def _format_batch_prompt(items: list[ClusterItem]) -> str:
224
+ lines = [_BATCH_PROMPT_HEADER]
225
+ for it in items:
226
+ summary = (it.summary or "").replace("\n", " ").strip()[:300]
227
+ lines.append(
228
+ f"\n[{it.ref_id}] ({it.source}) {it.title}"
229
+ + (f"\n {summary}" if summary else "")
230
+ )
231
+ return "".join(lines)
232
+
233
+
234
+ _JSON_OBJ_RE = re.compile(r"\{.*\}", re.DOTALL)
235
+
236
+
237
+ def _parse_cli_output(stdout: str) -> list[ClusterGroup]:
238
+ if not stdout.strip():
239
+ return []
240
+ try:
241
+ envelope = json.loads(stdout.strip())
242
+ except json.JSONDecodeError:
243
+ return []
244
+ if envelope.get("is_error"):
245
+ logger.warning("Claude CLI returned error: %s", str(envelope.get("result", ""))[:300])
246
+ return []
247
+ result_text = envelope.get("result") or ""
248
+ return _parse_groups(result_text)
249
+
250
+
251
+ def _parse_groups(text: str) -> list[ClusterGroup]:
252
+ if not text:
253
+ return []
254
+ match = _JSON_OBJ_RE.search(text)
255
+ payload = match.group(0) if match else text
256
+ try:
257
+ data = json.loads(payload)
258
+ except json.JSONDecodeError:
259
+ logger.warning("Failed to parse model response as JSON: %s", text[:200])
260
+ return []
261
+ raw_groups = data.get("groups", [])
262
+ if not isinstance(raw_groups, list):
263
+ return []
264
+ result: list[ClusterGroup] = []
265
+ for group in raw_groups:
266
+ if isinstance(group, list) and len(group) >= 2:
267
+ ids = [str(x) for x in group if isinstance(x, (str, int))]
268
+ if len(ids) >= 2:
269
+ result.append(ClusterGroup(ref_ids=ids))
270
+ return result