metatron-cli 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metatron/__init__.py +51 -0
- metatron/api.py +290 -0
- metatron/cli.py +221 -0
- metatron/config.py +187 -0
- metatron/db.py +357 -0
- metatron/dedup.py +210 -0
- metatron/fetcher.py +147 -0
- metatron/llm.py +270 -0
- metatron/normalize.py +141 -0
- metatron/poller.py +325 -0
- metatron_cli-0.2.1.dist-info/METADATA +174 -0
- metatron_cli-0.2.1.dist-info/RECORD +16 -0
- metatron_cli-0.2.1.dist-info/WHEEL +5 -0
- metatron_cli-0.2.1.dist-info/entry_points.txt +2 -0
- metatron_cli-0.2.1.dist-info/licenses/LICENSE +21 -0
- metatron_cli-0.2.1.dist-info/top_level.txt +1 -0
metatron/dedup.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""Deduplication pipeline.
|
|
2
|
+
|
|
3
|
+
Two phases per refresh:
|
|
4
|
+
|
|
5
|
+
Phase A (cheap, per-item):
|
|
6
|
+
- L1: exact canonical-URL match → drop as duplicate.
|
|
7
|
+
- L2: normalized-title match (lowercase, punctuation stripped, common
|
|
8
|
+
newsroom prefixes removed) → attach to that existing article's cluster.
|
|
9
|
+
|
|
10
|
+
Phase B (batched, one LLM call):
|
|
11
|
+
- For every candidate that survived A, pair it with the recent-history
|
|
12
|
+
articles whose token overlap exceeds ``llm_threshold``.
|
|
13
|
+
- Send the union of (candidates + their candidate-history peers) to
|
|
14
|
+
Claude in a single batched prompt asking "which of these report the
|
|
15
|
+
same story?".
|
|
16
|
+
- Apply the returned clusters: each candidate either joins an
|
|
17
|
+
existing cluster (matched against history) or starts a new one
|
|
18
|
+
with another candidate.
|
|
19
|
+
|
|
20
|
+
This collapses dozens of per-pair CLI calls (each with ~5–10 s overhead)
|
|
21
|
+
into one prompt, while still giving the model rich pairwise context to
|
|
22
|
+
work with.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import hashlib
|
|
28
|
+
import logging
|
|
29
|
+
from dataclasses import dataclass
|
|
30
|
+
from typing import Any, Iterable
|
|
31
|
+
|
|
32
|
+
from metatron.llm import BatchJudge, ClusterItem
|
|
33
|
+
from metatron.normalize import canonicalize_url, jaccard, normalize_title, tokenize
|
|
34
|
+
|
|
35
|
+
logger = logging.getLogger(__name__)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class DedupConfig:
    """Tunables for choosing which history rows accompany a candidate to the LLM."""

    # Minimum token-overlap score a history row needs before it is paired
    # with a candidate and sent to the LLM.
    llm_threshold: float = 0.12
    # Upper bound on the number of history peers added for any one candidate.
    history_pair_cap: int = 5
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class CheapDecision:
    """Outcome of the Phase A (cheap) checks for a single candidate."""

    # One of "duplicate", "cluster" or "candidate".
    status: str
    # ``id`` of the history row that matched, when one did.
    canonical_article_id: str | None = None
    # ``cluster_id`` copied from the matching history row, if it carried one.
    cluster_id: str | None = None
    # Short human-readable explanation of why this status was chosen.
    reason: str = ""
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def cheap_decide(
    candidate: dict[str, Any], history: list[dict[str, Any]]
) -> CheapDecision:
    """Classify ``candidate`` against ``history`` using only the cheap layers.

    The URL layer is checked across the whole history before the title
    layer, so a canonical-URL hit anywhere wins over a title hit.

    Returns a ``CheapDecision`` whose status is:
    - "duplicate" when a history row has the same canonical URL;
    - "cluster" when a history row has the same normalized title;
    - "candidate" when neither layer matched and Phase B must decide.
    """
    url = candidate.get("canonical_url")
    if not url:
        # No precomputed canonical URL: derive one from the raw source URL.
        url = canonicalize_url(candidate.get("source_url", ""))
    title_key = normalize_title(candidate.get("title", ""))

    url_hit = next(
        (
            row
            for row in history
            if row.get("canonical_url") and row["canonical_url"] == url
        ),
        None,
    )
    if url_hit is not None:
        return CheapDecision(
            status="duplicate",
            canonical_article_id=url_hit["id"],
            cluster_id=url_hit.get("cluster_id"),
            reason="canonical url match",
        )

    # An empty normalized title must never match, so skip the layer entirely.
    if title_key:
        title_hit = next(
            (
                row
                for row in history
                if normalize_title(row.get("title", "")) == title_key
            ),
            None,
        )
        if title_hit is not None:
            return CheapDecision(
                status="cluster",
                canonical_article_id=title_hit["id"],
                cluster_id=title_hit.get("cluster_id"),
                reason="normalized title match",
            )

    return CheapDecision(status="candidate", reason="needs LLM review")
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@dataclass
class BatchPlan:
    """Everything the single batched LLM call needs.

    ``items`` holds every article the model will be shown: the new
    candidates plus the history rows picked as their token-overlap peers.
    ``candidate_ids`` names the ref_ids in ``items`` that belong to new
    candidates, which lets the caller tell them apart from history rows
    when the model's groups come back.
    """

    # Articles to show the model (candidates and selected history rows).
    items: list[ClusterItem]
    # ref_ids of the entries in ``items`` that are new candidates.
    candidate_ids: set[str]
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def build_batch_plan(
    candidates: list[dict[str, Any]],
    history: list[dict[str, Any]],
    *,
    config: DedupConfig | None = None,
) -> BatchPlan:
    """Assemble the articles that go into the one batched LLM prompt.

    All candidates are included. Each candidate additionally pulls in up
    to ``history_pair_cap`` history rows, highest token overlap first,
    whose overlap score is at least ``llm_threshold``. A history row
    appears once no matter how many candidates selected it.

    Side effect: every candidate dict gets a ``"_ref_id"`` key holding the
    ref_id under which it was added to the plan.
    """
    settings = config or DedupConfig()

    selected: dict[str, ClusterItem] = {}
    new_ids: set[str] = set()

    for article in candidates:
        ref = _candidate_ref_id(article)
        article["_ref_id"] = ref
        selected[ref] = _as_cluster_item(ref, article)
        new_ids.add(ref)

    if history:
        # Tokenize each history row at most once across all candidates.
        token_cache: dict[str, set[str]] = {}

        for article in candidates:
            tokens = tokenize(_lead_text(article))
            if not tokens:
                # Nothing to compare on; the candidate still goes to the LLM.
                continue

            ranked: list[tuple[float, dict[str, Any]]] = []
            for row in history:
                row_id = row["id"]
                if row_id not in token_cache:
                    token_cache[row_id] = tokenize(_lead_text(row))
                overlap = jaccard(tokens, token_cache[row_id])
                if overlap >= settings.llm_threshold:
                    ranked.append((overlap, row))

            best_first = sorted(ranked, key=lambda pair: pair[0], reverse=True)
            for _, row in best_first[: settings.history_pair_cap]:
                row_id = row["id"]
                if row_id not in selected:
                    selected[row_id] = _as_cluster_item(row_id, row)

    return BatchPlan(items=list(selected.values()), candidate_ids=new_ids)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def run_batch(
    plan: BatchPlan, judge: BatchJudge
) -> dict[str, list[str]]:
    """Run the single LLM call and return candidate_id → [peer ref_ids].

    For every group the judge returns, each candidate in the group is
    mapped to the other members of that group. Peers may be history rows
    or other candidates; the caller decides how to treat each kind.
    Peers are listed once each, in first-seen order, even if the model
    repeats an ID or places a candidate in more than one group.

    Returns an empty dict when the judge is disabled, when the plan holds
    fewer than two items, or when the judge produced no groups.
    """
    if not judge.enabled or len(plan.items) < 2:
        return {}

    groups = judge.cluster(plan.items)
    if not groups:
        return {}

    matches: dict[str, list[str]] = {}
    for group in groups:
        # The model may echo an ID twice; keep first occurrences only.
        members = list(dict.fromkeys(group.ref_ids))
        for cid in members:
            if cid not in plan.candidate_ids:
                continue
            peers = matches.setdefault(cid, [])
            for ref in members:
                if ref != cid and ref not in peers:
                    peers.append(ref)
    return matches
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# ── helpers ──────────────────────────────────────────────────────────────
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _candidate_ref_id(candidate: dict[str, Any]) -> str:
|
|
188
|
+
"""A stable, short ID for a not-yet-inserted candidate."""
|
|
189
|
+
seed = (
|
|
190
|
+
candidate.get("canonical_url", "")
|
|
191
|
+
or candidate.get("source_url", "")
|
|
192
|
+
or candidate.get("title", "")
|
|
193
|
+
)
|
|
194
|
+
return "C-" + hashlib.sha256(seed.encode("utf-8")).hexdigest()[:10]
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _as_cluster_item(ref_id: str, article: dict[str, Any]) -> ClusterItem:
    """Wrap an article dict as the ``ClusterItem`` shown to the judge.

    Missing or ``None`` ``title``/``source`` values become empty strings;
    the summary is the article's lead text.
    """
    headline = article.get("title", "") or ""
    lead = _lead_text(article)
    outlet = article.get("source", "") or ""
    return ClusterItem(ref_id=ref_id, title=headline, summary=lead, source=outlet)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _lead_text(article: dict[str, Any]) -> str:
|
|
207
|
+
title = article.get("title", "") or ""
|
|
208
|
+
summary = article.get("summary", "") or ""
|
|
209
|
+
body = article.get("body", "") or ""
|
|
210
|
+
return f"{title}\n{summary}\n{body[:400]}".strip()
|
metatron/fetcher.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""Fetch a single RSS/Atom feed and produce article candidates.
|
|
2
|
+
|
|
3
|
+
Two-stage:
|
|
4
|
+
1. ``parse_feed(url)`` — grab the feed XML, parse with feedparser, return
|
|
5
|
+
a list of lightweight ``FetchedItem`` records (title, summary, source url, source
|
|
6
|
+
name, published).
|
|
7
|
+
2. ``enrich_article(item)`` — fetch the article HTML and extract clean
|
|
8
|
+
body text with trafilatura. Returns the same item with ``body`` set.
|
|
9
|
+
|
|
10
|
+
Body extraction is opt-in per article (the dedup pipeline calls it only
|
|
11
|
+
when the cheap layers can't decide). Bandwidth-conscious.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import logging
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
import feedparser
|
|
22
|
+
import requests
|
|
23
|
+
import trafilatura
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class FetchedItem:
    """One feed entry, optionally enriched with the article body."""

    title: str
    source_url: str
    summary: str
    source: str
    published: datetime | None
    body: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the item as a plain dict, with ``published`` as ISO-8601 text or None."""
        stamp = None
        if self.published:
            stamp = self.published.isoformat()
        return dict(
            title=self.title,
            source_url=self.source_url,
            summary=self.summary,
            source=self.source,
            published=stamp,
            body=self.body,
        )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class FeedFetchError(Exception):
    """Signals that a feed could not be downloaded or could not be parsed."""
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def parse_feed(url: str, *, source_name: str | None = None, timeout: float = 30.0) -> list[FetchedItem]:
    """Fetch and parse one RSS/Atom feed.

    ``source_name`` overrides the source label; otherwise the feed's own
    title is used, falling back to the host name of ``url``. Entries
    without a link are skipped. Summaries have HTML tags stripped.

    Raises ``FeedFetchError`` when the HTTP request fails or returns an
    error status, or when the parser flags the feed as malformed and
    yields no entries.
    """
    try:
        resp = requests.get(
            url,
            timeout=timeout,
            headers={"User-Agent": "metatron/0.2 (+https://github.com/anthropic/metatron)"},
        )
        resp.raise_for_status()
    except requests.RequestException as e:
        raise FeedFetchError(f"HTTP error for {url}: {e}") from e

    parsed = feedparser.parse(resp.content)
    # A "bozo" feed that still produced entries is accepted as-is.
    if parsed.bozo and not parsed.entries:
        raise FeedFetchError(f"Feed parse failed for {url}: {parsed.bozo_exception}")

    feed_title = parsed.feed.get("title", "") if hasattr(parsed, "feed") else ""
    source = source_name or feed_title or _host_of(url)

    out: list[FetchedItem] = []
    for entry in parsed.entries:
        # Same None-safe pattern as title/summary below: a link that is
        # present but None must be skipped, not crash on ``.strip()``.
        link = (entry.get("link") or "").strip()
        if not link:
            continue
        title = (entry.get("title") or "").strip()
        summary = (entry.get("summary") or entry.get("description") or "").strip()
        out.append(
            FetchedItem(
                title=title,
                source_url=link,
                summary=_strip_html(summary),
                source=source,
                published=_parse_entry_date(entry),
            )
        )
    return out
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def enrich_article(item: FetchedItem, *, timeout: float = 15.0) -> FetchedItem:
    """Fetch the article page and extract clean body text.

    ``timeout`` is the download timeout in seconds; it is handed to
    trafilatura through its ``DOWNLOAD_TIMEOUT`` setting, which is an
    integer, so fractional values are truncated (minimum 1).

    On any failure, returns ``item`` with body left as-is (likely empty).
    One bad page must not poison the whole poll loop, so the only
    surfacing here is a debug log line; missing bodies are tolerated
    downstream because title and summary are still available.
    """
    try:
        # Function-scope imports: only this function needs them.
        from copy import deepcopy

        from trafilatura.settings import DEFAULT_CONFIG

        # Work on a copy so the library-wide default config is not mutated.
        # NOTE(review): trafilatura may reuse one connection pool across
        # calls, in which case the first call's timeout would win. Confirm
        # against the installed version.
        download_config = deepcopy(DEFAULT_CONFIG)
        download_config["DEFAULT"]["DOWNLOAD_TIMEOUT"] = str(max(1, int(timeout)))

        downloaded = trafilatura.fetch_url(
            item.source_url, no_ssl=False, config=download_config
        )
        if not downloaded:
            return item
        text = trafilatura.extract(
            downloaded,
            include_comments=False,
            include_tables=False,
            favor_precision=True,
            no_fallback=False,
        )
        if text:
            item.body = text.strip()
    except Exception as e:
        # Deliberately broad: this is a best-effort enrichment step.
        logger.debug("body extraction failed for %s: %s", item.source_url, e)
    return item
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _parse_entry_date(entry: Any) -> datetime | None:
|
|
118
|
+
for key in ("published_parsed", "updated_parsed"):
|
|
119
|
+
tm = entry.get(key)
|
|
120
|
+
if tm:
|
|
121
|
+
try:
|
|
122
|
+
return datetime(*tm[:6], tzinfo=timezone.utc)
|
|
123
|
+
except (TypeError, ValueError):
|
|
124
|
+
continue
|
|
125
|
+
return None
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
_TAG_RE = None
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _strip_html(text: str) -> str:
|
|
132
|
+
global _TAG_RE
|
|
133
|
+
import re
|
|
134
|
+
|
|
135
|
+
if _TAG_RE is None:
|
|
136
|
+
_TAG_RE = re.compile(r"<[^>]+>")
|
|
137
|
+
return _TAG_RE.sub("", text).strip()
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _host_of(url: str) -> str:
|
|
141
|
+
from urllib.parse import urlsplit
|
|
142
|
+
|
|
143
|
+
try:
|
|
144
|
+
host = urlsplit(url).hostname or ""
|
|
145
|
+
return host[4:] if host.startswith("www.") else host
|
|
146
|
+
except ValueError:
|
|
147
|
+
return ""
|
metatron/llm.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""Claude CLI wrapper for the dedup tiebreaker.
|
|
2
|
+
|
|
3
|
+
The CLI overhead (~5-10s per `claude -p` invocation) makes per-pair LLM
|
|
4
|
+
calls infeasible at any real scale. So we batch: one prompt per refresh
|
|
5
|
+
asks Claude to identify clusters across the full candidate set + recent
|
|
6
|
+
history. The model returns ``{"groups": [[id, id, ...], ...]}`` where
|
|
7
|
+
each inner list groups article IDs that report the same story.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import logging
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
import select
|
|
17
|
+
import shutil
|
|
18
|
+
import subprocess
|
|
19
|
+
import time
|
|
20
|
+
from dataclasses import dataclass
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class ClusterItem:
    """A single article as presented to the batch judge.

    ``ref_id`` is the identifier the model is asked to echo back in its
    groups: typically the article's database id, or a freshly computed
    hash for a new candidate that has no database row yet.
    """

    # Identifier echoed back by the model.
    ref_id: str
    # Article headline.
    title: str
    # Text printed under the headline in the prompt.
    summary: str
    # Name of the publishing source, shown in parentheses in the prompt.
    source: str
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class ClusterGroup:
    """One set of ref_ids the model judged to report the same story."""

    # Identifiers of the articles in this group.
    ref_ids: list[str]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class BatchJudge:
    """Run `claude -p` once per refresh to cluster items by same-story."""

    DEFAULT_MODEL = "sonnet"
    # Idle window: kill the subprocess only if nothing has been written to
    # stdout/stderr for this many seconds. A productive call that runs for
    # 10 minutes is fine; a stuck call that's been silent for 2 minutes is not.
    DEFAULT_IDLE_TIMEOUT = 120.0

    def __init__(
        self,
        *,
        model: str | None = None,
        binary: str | None = None,
        idle_timeout: float | None = None,
    ) -> None:
        """Store the CLI settings.

        Any falsy argument selects its default: ``DEFAULT_MODEL``, the
        ``"claude"`` binary, and ``DEFAULT_IDLE_TIMEOUT`` respectively
        (so an ``idle_timeout`` of 0 also means "use the default").
        """
        self.model = model or self.DEFAULT_MODEL
        self.binary = binary or "claude"
        self.idle_timeout = idle_timeout or self.DEFAULT_IDLE_TIMEOUT

    @property
    def enabled(self) -> bool:
        """True when the configured binary can be found on PATH."""
        return shutil.which(self.binary) is not None

    def cluster(self, items: list[ClusterItem]) -> list[ClusterGroup]:
        """Return groups of ref_ids that belong to the same story.

        Items not appearing in any returned group are singletons (their own
        cluster). On any failure (CLI missing or not launchable, idle
        stall, non-zero exit, parse error) returns an empty list; the
        caller should treat that as "no LLM groupings learned" and rely on
        the cheap layers only.
        """
        if not items or len(items) < 2:
            return []

        prompt = _format_batch_prompt(items)
        try:
            proc = subprocess.Popen(
                [
                    self.binary,
                    "-p",
                    "--model",
                    self.model,
                    "--output-format",
                    "json",
                    prompt,
                ],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=False,  # read bytes so partial UTF-8 mid-stream is fine
            )
        except FileNotFoundError:
            logger.warning("Claude CLI not found on PATH (looked for %s)", self.binary)
            return []
        except OSError as exc:
            # Other launch failures (binary not executable, prompt too long
            # for the argument list, ...) must also degrade to "no
            # groupings" instead of aborting the refresh.
            logger.warning("Claude CLI could not be started (%s): %s", self.binary, exc)
            return []

        stdout_chunks: list[bytes] = []
        stderr_chunks: list[bytes] = []
        idle_killed = False
        last_output = time.monotonic()

        # Stream the pipes, watching for inactivity. A productive call that
        # keeps producing output (even slowly) is allowed to run as long as
        # it needs; we only intervene when both pipes have been silent for
        # longer than `idle_timeout`.
        for chunk_bytes in _stream_until_idle(
            proc, stdout_chunks, stderr_chunks, self.idle_timeout
        ):
            if chunk_bytes:
                last_output = time.monotonic()
            elapsed_idle = time.monotonic() - last_output
            if elapsed_idle > self.idle_timeout:
                logger.warning(
                    "Claude CLI batch dedup idle for %.0fs — terminating",
                    elapsed_idle,
                )
                _terminate(proc)
                idle_killed = True
                break

        # Drain any remaining buffered output once the process is done.
        try:
            tail_out, tail_err = proc.communicate(timeout=5)
        except subprocess.TimeoutExpired:
            _terminate(proc)
            tail_out, tail_err = proc.communicate()
        if tail_out:
            stdout_chunks.append(tail_out)
        if tail_err:
            stderr_chunks.append(tail_err)

        if idle_killed:
            return []
        if proc.returncode != 0:
            stderr_text = b"".join(stderr_chunks).decode("utf-8", "replace")
            logger.warning(
                "Claude CLI batch dedup exited %d: %s",
                proc.returncode,
                stderr_text[:300],
            )
            return []

        stdout_text = b"".join(stdout_chunks).decode("utf-8", "replace")
        return _parse_cli_output(stdout_text)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _stream_until_idle(
|
|
151
|
+
proc: subprocess.Popen,
|
|
152
|
+
stdout_chunks: list[bytes],
|
|
153
|
+
stderr_chunks: list[bytes],
|
|
154
|
+
idle_timeout: float,
|
|
155
|
+
):
|
|
156
|
+
"""Yield bytes-read events while the subprocess runs.
|
|
157
|
+
|
|
158
|
+
Yields ``b""`` on each tick so the caller can re-evaluate idle elapsed,
|
|
159
|
+
and yields the actual bytes when a pipe produced data. Exits when the
|
|
160
|
+
process terminates.
|
|
161
|
+
"""
|
|
162
|
+
pipes = {proc.stdout: stdout_chunks, proc.stderr: stderr_chunks}
|
|
163
|
+
open_fds = [p for p in pipes if p is not None]
|
|
164
|
+
# Make non-blocking so reads can't hang past the select tick.
|
|
165
|
+
for p in open_fds:
|
|
166
|
+
os.set_blocking(p.fileno(), False)
|
|
167
|
+
|
|
168
|
+
tick = min(1.0, max(0.1, idle_timeout / 10))
|
|
169
|
+
while open_fds and proc.poll() is None:
|
|
170
|
+
ready, _, _ = select.select(open_fds, [], [], tick)
|
|
171
|
+
if not ready:
|
|
172
|
+
yield b""
|
|
173
|
+
continue
|
|
174
|
+
produced = b""
|
|
175
|
+
for p in ready:
|
|
176
|
+
try:
|
|
177
|
+
chunk = p.read()
|
|
178
|
+
except (OSError, ValueError):
|
|
179
|
+
chunk = None
|
|
180
|
+
if not chunk:
|
|
181
|
+
# EOF or transient empty — drop from poll set if truly closed.
|
|
182
|
+
if proc.poll() is not None:
|
|
183
|
+
open_fds = [x for x in open_fds if x is not p]
|
|
184
|
+
continue
|
|
185
|
+
pipes[p].append(chunk)
|
|
186
|
+
produced += chunk
|
|
187
|
+
yield produced
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _terminate(proc: subprocess.Popen) -> None:
|
|
191
|
+
"""Best-effort shutdown of the CLI subprocess."""
|
|
192
|
+
if proc.poll() is not None:
|
|
193
|
+
return
|
|
194
|
+
proc.terminate()
|
|
195
|
+
try:
|
|
196
|
+
proc.wait(timeout=5)
|
|
197
|
+
except subprocess.TimeoutExpired:
|
|
198
|
+
proc.kill()
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
_BATCH_PROMPT_HEADER = """\
|
|
202
|
+
You are a news deduplication arbiter. Below is a numbered list of news
|
|
203
|
+
articles. Identify which articles report the SAME specific story — same
|
|
204
|
+
event, same announcement, same incident. Articles about similar topics
|
|
205
|
+
but different specific stories are NOT the same.
|
|
206
|
+
|
|
207
|
+
For each group of duplicates, list the IDs of every article in that
|
|
208
|
+
group. Articles not in any group are unique stories on their own; you
|
|
209
|
+
do not need to list them.
|
|
210
|
+
|
|
211
|
+
Respond with ONLY a JSON object on a single line. The "groups" key holds
|
|
212
|
+
an array of arrays; each inner array contains the IDs (as strings) of
|
|
213
|
+
articles in one same-story cluster. If you find no duplicates, return
|
|
214
|
+
{"groups": []}.
|
|
215
|
+
|
|
216
|
+
Example response:
|
|
217
|
+
{"groups": [["A1", "B3", "C7"], ["D2", "E5"]]}
|
|
218
|
+
|
|
219
|
+
Articles:
|
|
220
|
+
"""
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _format_batch_prompt(items: list[ClusterItem]) -> str:
    """Build the full prompt: the fixed header followed by one entry per item.

    Each entry is ``[ref_id] (source) title``. When the item has a
    summary, it follows on its own line with newlines flattened to
    spaces and the text cut to 300 characters.
    """
    parts = [_BATCH_PROMPT_HEADER]
    for item in items:
        blurb = (item.summary or "").replace("\n", " ").strip()[:300]
        entry = f"\n[{item.ref_id}] ({item.source}) {item.title}"
        if blurb:
            entry += f"\n {blurb}"
        parts.append(entry)
    return "".join(parts)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
_JSON_OBJ_RE = re.compile(r"\{.*\}", re.DOTALL)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _parse_cli_output(stdout: str) -> list[ClusterGroup]:
|
|
238
|
+
if not stdout.strip():
|
|
239
|
+
return []
|
|
240
|
+
try:
|
|
241
|
+
envelope = json.loads(stdout.strip())
|
|
242
|
+
except json.JSONDecodeError:
|
|
243
|
+
return []
|
|
244
|
+
if envelope.get("is_error"):
|
|
245
|
+
logger.warning("Claude CLI returned error: %s", str(envelope.get("result", ""))[:300])
|
|
246
|
+
return []
|
|
247
|
+
result_text = envelope.get("result") or ""
|
|
248
|
+
return _parse_groups(result_text)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _parse_groups(text: str) -> list[ClusterGroup]:
|
|
252
|
+
if not text:
|
|
253
|
+
return []
|
|
254
|
+
match = _JSON_OBJ_RE.search(text)
|
|
255
|
+
payload = match.group(0) if match else text
|
|
256
|
+
try:
|
|
257
|
+
data = json.loads(payload)
|
|
258
|
+
except json.JSONDecodeError:
|
|
259
|
+
logger.warning("Failed to parse model response as JSON: %s", text[:200])
|
|
260
|
+
return []
|
|
261
|
+
raw_groups = data.get("groups", [])
|
|
262
|
+
if not isinstance(raw_groups, list):
|
|
263
|
+
return []
|
|
264
|
+
result: list[ClusterGroup] = []
|
|
265
|
+
for group in raw_groups:
|
|
266
|
+
if isinstance(group, list) and len(group) >= 2:
|
|
267
|
+
ids = [str(x) for x in group if isinstance(x, (str, int))]
|
|
268
|
+
if len(ids) >= 2:
|
|
269
|
+
result.append(ClusterGroup(ref_ids=ids))
|
|
270
|
+
return result
|