metatron-cli 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metatron/__init__.py +51 -0
- metatron/api.py +290 -0
- metatron/cli.py +221 -0
- metatron/config.py +187 -0
- metatron/db.py +357 -0
- metatron/dedup.py +210 -0
- metatron/fetcher.py +147 -0
- metatron/llm.py +270 -0
- metatron/normalize.py +141 -0
- metatron/poller.py +325 -0
- metatron_cli-0.2.1.dist-info/METADATA +174 -0
- metatron_cli-0.2.1.dist-info/RECORD +16 -0
- metatron_cli-0.2.1.dist-info/WHEEL +5 -0
- metatron_cli-0.2.1.dist-info/entry_points.txt +2 -0
- metatron_cli-0.2.1.dist-info/licenses/LICENSE +21 -0
- metatron_cli-0.2.1.dist-info/top_level.txt +1 -0
metatron/config.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
"""Metatron configuration — loaded from ~/.config/metatron/config.toml.
|
|
2
|
+
|
|
3
|
+
XDG-style config location, stdlib tomllib parsing, no .env files, no required
|
|
4
|
+
fields. Sensible defaults so the service runs out of the box; configure the
|
|
5
|
+
[llm] section to enable Sonnet-powered cross-outlet dedup, and the [api]
|
|
6
|
+
section to set a bearer token.
|
|
7
|
+
|
|
8
|
+
Resolution order for the config file path:
|
|
9
|
+
1. Explicit path passed to ``from_file``
|
|
10
|
+
2. ``METATRON_CONFIG`` env var (operator override only)
|
|
11
|
+
3. ``$XDG_CONFIG_HOME/metatron/config.toml``
|
|
12
|
+
4. ``~/.config/metatron/config.toml``
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import os
|
|
18
|
+
import tomllib
|
|
19
|
+
from dataclasses import dataclass, field
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ConfigError(Exception):
    """Raised when the config file is malformed or unreadable.

    ``MetatronConfig.from_file`` raises it chained (``from e``) to the
    underlying exception, so the original cause stays available on
    ``__cause__``.
    """
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
class ApiConfig:
    """HTTP API settings, populated from the ``[api]`` table of config.toml."""

    # Address the HTTP server binds to; the default is localhost-only.
    host: str = "127.0.0.1"
    # TCP port the HTTP server binds to.
    port: int = 8765
    # Bearer token clients must present. The config template describes an
    # empty value as "auth disabled (dev only)"; enforcement lives outside
    # this module.
    api_token: str = ""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass(frozen=True)
class DatabaseConfig:
    """SQLite path override, populated from the ``[database]`` table."""

    # Empty string means "no override". The config template documents the
    # fallback location as ~/.local/share/metatron/metatron.db.
    path: str = ""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass(frozen=True)
class LlmConfig:
    """Claude CLI wrapper for the dedup tiebreaker.

    Shells out to the locally-installed `claude` binary, which uses your
    Claude subscription. No API key is needed — the CLI handles auth on
    your behalf. ``enabled`` here flips on the tiebreaker entirely; ``model``
    is passed as ``claude -p --model <model>``.

    This dataclass only stores the settings (populated from the ``[llm]``
    table); the subprocess invocation itself is not part of this module.
    """

    # Master switch for the LLM tiebreaker.
    enabled: bool = True
    # Model name handed to the CLI's ``--model`` flag.
    model: str = "sonnet"
    # Name or path of the CLI executable to run.
    binary: str = "claude"
    # Kill the CLI subprocess only if stdout/stderr have been silent for
    # this many seconds. A productive call is allowed to run as long as
    # it needs — wall-clock timeouts waste tokens on perfectly fine calls.
    idle_timeout_seconds: float = 120.0
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass(frozen=True)
class PollerConfig:
    """Background poller settings, populated from the ``[poller]`` table."""

    # Master switch for background polling. The config template notes that
    # with polling disabled, feeds are fetched only on demand via /refresh.
    enabled: bool = True
    tick_seconds: int = 60  # how often the loop checks for due feeds
    # Default polling interval per feed; the config template describes it
    # as overridable per feed.
    default_feed_interval_seconds: int = 1800
    # Per-feed HTTP timeout (seconds) when polling.
    feed_timeout_seconds: int = 30
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dataclass(frozen=True)
class MetatronConfig:
    """Immutable configuration loaded from TOML.

    Every field has a default, so ``MetatronConfig()`` is a valid
    all-defaults configuration. Use :meth:`from_file` to load one from disk.
    """

    api: ApiConfig = field(default_factory=ApiConfig)
    database: DatabaseConfig = field(default_factory=DatabaseConfig)
    llm: LlmConfig = field(default_factory=LlmConfig)
    poller: PollerConfig = field(default_factory=PollerConfig)
    log_dir: Path = field(default_factory=lambda: Path("logs"))

    @staticmethod
    def default_config_path() -> Path:
        """Return ``$XDG_CONFIG_HOME/metatron/config.toml``.

        Falls back to ``~/.config`` when ``XDG_CONFIG_HOME`` is unset or empty.
        """
        xdg = os.environ.get("XDG_CONFIG_HOME")
        base = Path(xdg) if xdg else Path.home() / ".config"
        return base / "metatron" / "config.toml"

    @classmethod
    def from_file(cls, path: Path | None = None) -> MetatronConfig:
        """Load config from TOML. If path is None, follow the resolution order.

        Resolution order when ``path`` is None: the ``METATRON_CONFIG``
        environment variable, then :meth:`default_config_path`.

        Missing file is OK — returns an all-defaults config.

        Raises:
            ConfigError: the file cannot be read, is not valid TOML, or
                contains a section/value of the wrong type.
        """
        if path is None:
            override = os.environ.get("METATRON_CONFIG")
            path = Path(override) if override else cls.default_config_path()

        if not path.exists():
            return cls()

        try:
            with path.open("rb") as fh:
                data = tomllib.load(fh)
        except FileNotFoundError:
            # The file vanished between the exists() check and open();
            # treat it like any other missing file.
            return cls()
        except (OSError, tomllib.TOMLDecodeError) as e:
            raise ConfigError(f"Failed to read {path}: {e}") from e

        return cls._from_dict(data)

    @staticmethod
    def _table(data: dict, name: str) -> dict:
        """Return the ``[name]`` table of *data*, or ``{}`` if absent or empty.

        Raises:
            ConfigError: the key exists but is not a table (e.g. ``api = 5``).
        """
        table = data.get(name) or {}
        if not isinstance(table, dict):
            raise ConfigError(
                f"[{name}] must be a table, got {type(table).__name__}"
            )
        return table

    @staticmethod
    def _number(
        table: dict, table_name: str, key: str, default: int | float, cast: type
    ) -> int | float:
        """Read ``table[key]`` (or *default*) and convert it with *cast*.

        Raises:
            ConfigError: the value cannot be converted (wrong type, a
                non-numeric string, or ``inf``/``nan`` for an integer).
        """
        raw = table.get(key, default)
        try:
            return cast(raw)
        except (TypeError, ValueError, OverflowError) as e:
            raise ConfigError(
                f"Invalid value for [{table_name}].{key}: {raw!r}"
            ) from e

    @classmethod
    def _from_dict(cls, data: dict) -> MetatronConfig:
        """Build a config from parsed TOML, filling defaults for missing keys.

        Unknown tables and keys are ignored.

        Raises:
            ConfigError: a section is not a table or a numeric value cannot
                be converted.
        """
        api_section = cls._table(data, "api")
        db_section = cls._table(data, "database")
        llm_section = cls._table(data, "llm")
        poller_section = cls._table(data, "poller")
        logging_section = cls._table(data, "logging")

        return cls(
            api=ApiConfig(
                host=str(api_section.get("host", "127.0.0.1")),
                port=cls._number(api_section, "api", "port", 8765, int),
                api_token=str(api_section.get("api_token", "")),
            ),
            database=DatabaseConfig(path=str(db_section.get("path", ""))),
            llm=LlmConfig(
                enabled=bool(llm_section.get("enabled", True)),
                model=str(llm_section.get("model", "sonnet")),
                binary=str(llm_section.get("binary", "claude")),
                idle_timeout_seconds=cls._number(
                    llm_section, "llm", "idle_timeout_seconds", 120.0, float
                ),
            ),
            poller=PollerConfig(
                enabled=bool(poller_section.get("enabled", True)),
                tick_seconds=cls._number(
                    poller_section, "poller", "tick_seconds", 60, int
                ),
                default_feed_interval_seconds=cls._number(
                    poller_section,
                    "poller",
                    "default_feed_interval_seconds",
                    1800,
                    int,
                ),
                feed_timeout_seconds=cls._number(
                    poller_section, "poller", "feed_timeout_seconds", 30, int
                ),
            ),
            log_dir=Path(str(logging_section.get("dir", "logs"))),
        )
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
DEFAULT_CONFIG_TEMPLATE = """\
|
|
148
|
+
# Metatron configuration
|
|
149
|
+
# Location: ~/.config/metatron/config.toml
|
|
150
|
+
# Override with METATRON_CONFIG=/path/to/config.toml
|
|
151
|
+
|
|
152
|
+
[api]
|
|
153
|
+
# HTTP server bind. Default 127.0.0.1 (localhost-only).
|
|
154
|
+
host = "127.0.0.1"
|
|
155
|
+
port = 8765
|
|
156
|
+
# Bearer token clients must present. Leave empty to disable auth (dev only).
|
|
157
|
+
api_token = ""
|
|
158
|
+
|
|
159
|
+
[database]
|
|
160
|
+
# Override the default SQLite path. Empty = ~/.local/share/metatron/metatron.db
|
|
161
|
+
path = ""
|
|
162
|
+
|
|
163
|
+
[llm]
|
|
164
|
+
# Cross-outlet dedup tiebreaker via the local `claude` CLI (Claude Code).
|
|
165
|
+
# Uses your Claude subscription — no API key needed. Set enabled = false
|
|
166
|
+
# to skip the tiebreaker entirely (cheaper, but cross-outlet duplicates
|
|
167
|
+
# with different headlines will leak through).
|
|
168
|
+
enabled = true
|
|
169
|
+
model = "sonnet"
|
|
170
|
+
binary = "claude"
|
|
171
|
+
# Kill the CLI subprocess only if stdout has been silent this long.
|
|
172
|
+
# A productive call is never killed on wall-clock alone.
|
|
173
|
+
idle_timeout_seconds = 120.0
|
|
174
|
+
|
|
175
|
+
[poller]
|
|
176
|
+
# Background polling. Disable to fetch only on-demand via /refresh.
|
|
177
|
+
enabled = true
|
|
178
|
+
# How often the loop wakes up to check for due feeds.
|
|
179
|
+
tick_seconds = 60
|
|
180
|
+
# Default polling interval per feed (overridable per feed).
|
|
181
|
+
default_feed_interval_seconds = 1800
|
|
182
|
+
# Per-feed HTTP timeout when polling.
|
|
183
|
+
feed_timeout_seconds = 30
|
|
184
|
+
|
|
185
|
+
[logging]
|
|
186
|
+
dir = "logs"
|
|
187
|
+
"""
|
metatron/db.py
ADDED
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
"""SQLite schema and helpers for Metatron.
|
|
2
|
+
|
|
3
|
+
Single-file database under $XDG_STATE_HOME or $XDG_DATA_HOME when either is set, otherwise at ~/.local/share/metatron/metatron.db (by
|
|
4
|
+
default; override with [database].path in config.toml).
|
|
5
|
+
|
|
6
|
+
Schema:
|
|
7
|
+
projects(id, name, created_at)
|
|
8
|
+
feeds(id, project_id, url, name, category, enabled, poll_interval_seconds, last_polled, last_error)
|
|
9
|
+
articles(id, project_id, canonical_url, source_url, source, title, summary, body, published, fetched_at, cluster_id)
|
|
10
|
+
clusters(id, project_id, canonical_article_id, created_at)
|
|
11
|
+
|
|
12
|
+
A "cluster" is a deduplicated group of articles all covering the same story
|
|
13
|
+
across outlets. The canonical_article_id points at the first article in the
|
|
14
|
+
cluster (the one we keep showing). Members of a cluster have cluster_id set
|
|
15
|
+
to that cluster's id.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import os
|
|
21
|
+
import secrets
|
|
22
|
+
import sqlite3
|
|
23
|
+
import threading
|
|
24
|
+
from contextlib import contextmanager
|
|
25
|
+
from datetime import datetime, timezone
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
from typing import Any, Iterator
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def default_db_path() -> Path:
    """Return the default location of the SQLite database file.

    The first non-empty value of ``XDG_STATE_HOME`` and ``XDG_DATA_HOME``
    (checked in that order) is used as the base directory; when neither is
    set the base is ``~/.local/share``. The file itself is always
    ``metatron/metatron.db`` below that base.
    """
    for env_var in ("XDG_STATE_HOME", "XDG_DATA_HOME"):
        root = os.environ.get(env_var)
        if root:
            return Path(root, "metatron", "metatron.db")
    return Path.home().joinpath(".local", "share", "metatron", "metatron.db")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
_SCHEMA = """
|
|
37
|
+
CREATE TABLE IF NOT EXISTS projects (
|
|
38
|
+
id TEXT PRIMARY KEY,
|
|
39
|
+
name TEXT NOT NULL UNIQUE,
|
|
40
|
+
created_at TEXT NOT NULL
|
|
41
|
+
);
|
|
42
|
+
|
|
43
|
+
CREATE TABLE IF NOT EXISTS feeds (
|
|
44
|
+
id TEXT PRIMARY KEY,
|
|
45
|
+
project_id TEXT NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
|
|
46
|
+
url TEXT NOT NULL,
|
|
47
|
+
name TEXT NOT NULL,
|
|
48
|
+
category TEXT NOT NULL DEFAULT '',
|
|
49
|
+
enabled INTEGER NOT NULL DEFAULT 1,
|
|
50
|
+
poll_interval_seconds INTEGER NOT NULL DEFAULT 1800,
|
|
51
|
+
last_polled TEXT,
|
|
52
|
+
last_error TEXT,
|
|
53
|
+
UNIQUE(project_id, url)
|
|
54
|
+
);
|
|
55
|
+
|
|
56
|
+
CREATE TABLE IF NOT EXISTS clusters (
|
|
57
|
+
id TEXT PRIMARY KEY,
|
|
58
|
+
project_id TEXT NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
|
|
59
|
+
canonical_article_id TEXT NOT NULL,
|
|
60
|
+
created_at TEXT NOT NULL
|
|
61
|
+
);
|
|
62
|
+
|
|
63
|
+
CREATE TABLE IF NOT EXISTS articles (
|
|
64
|
+
id TEXT PRIMARY KEY,
|
|
65
|
+
project_id TEXT NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
|
|
66
|
+
canonical_url TEXT NOT NULL,
|
|
67
|
+
source_url TEXT NOT NULL,
|
|
68
|
+
source TEXT NOT NULL,
|
|
69
|
+
title TEXT NOT NULL,
|
|
70
|
+
summary TEXT NOT NULL DEFAULT '',
|
|
71
|
+
body TEXT NOT NULL DEFAULT '',
|
|
72
|
+
published TEXT,
|
|
73
|
+
fetched_at TEXT NOT NULL,
|
|
74
|
+
cluster_id TEXT REFERENCES clusters(id) ON DELETE SET NULL,
|
|
75
|
+
UNIQUE(project_id, canonical_url)
|
|
76
|
+
);
|
|
77
|
+
|
|
78
|
+
CREATE INDEX IF NOT EXISTS idx_articles_project_fetched ON articles(project_id, fetched_at);
|
|
79
|
+
CREATE INDEX IF NOT EXISTS idx_articles_cluster ON articles(cluster_id);
|
|
80
|
+
CREATE INDEX IF NOT EXISTS idx_feeds_project_enabled ON feeds(project_id, enabled);
|
|
81
|
+
"""
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class Database:
    """Thin wrapper around sqlite3 that opens one connection per operation.

    Every method goes through :meth:`connect`, which opens a fresh
    connection, commits on success and always closes it, so no connection
    object is ever shared between threads.

    SQLite handles concurrent reads fine. Writes happen from the API thread
    and the poller thread; we use SQLite's WAL mode and a brief busy timeout
    to handle contention.
    """

    def __init__(self, path: Path | None = None) -> None:
        """Open (creating if necessary) the database at *path*.

        Falls back to ``default_db_path()`` when *path* is None, creates the
        parent directory, applies the schema and switches the file to WAL
        journaling.
        """
        self.path = path or default_db_path()
        self.path.parent.mkdir(parents=True, exist_ok=True)
        # Not used by connect(); retained so the attribute keeps existing
        # for any external code that may look at it.
        self._local = threading.local()
        with self.connect() as conn:
            conn.executescript(_SCHEMA)
            conn.execute("PRAGMA journal_mode = WAL")
            conn.execute("PRAGMA foreign_keys = ON")

    @contextmanager
    def connect(self) -> Iterator[sqlite3.Connection]:
        """Yield a new connection; commit on clean exit, always close.

        Rows come back as ``sqlite3.Row`` and foreign-key enforcement is
        switched on for the connection. If the ``with`` body raises, the
        connection is closed without a commit, so pending changes are
        discarded.
        """
        conn = sqlite3.connect(self.path, timeout=10.0)
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA foreign_keys = ON")
        try:
            yield conn
            conn.commit()
        finally:
            conn.close()

    # ── projects ─────────────────────────────────────────────────────────
    def create_project(self, name: str) -> dict[str, Any]:
        """Insert a project and return its ``id``, ``name`` and ``created_at``.

        Raises ``sqlite3.IntegrityError`` if *name* is already taken
        (``projects.name`` is UNIQUE).
        """
        pid = _new_id()
        now = _now_iso()
        with self.connect() as conn:
            conn.execute(
                "INSERT INTO projects (id, name, created_at) VALUES (?, ?, ?)",
                (pid, name, now),
            )
        return {"id": pid, "name": name, "created_at": now}

    def list_projects(self) -> list[dict[str, Any]]:
        """Return all projects, oldest first."""
        with self.connect() as conn:
            rows = conn.execute(
                "SELECT id, name, created_at FROM projects ORDER BY created_at"
            ).fetchall()
        return [dict(r) for r in rows]

    def get_project(self, project_id: str) -> dict[str, Any] | None:
        """Return the project with this id, or ``None`` if there is none."""
        with self.connect() as conn:
            row = conn.execute(
                "SELECT id, name, created_at FROM projects WHERE id = ?",
                (project_id,),
            ).fetchone()
        return dict(row) if row else None

    def delete_project(self, project_id: str) -> bool:
        """Delete a project; return True if a row was removed.

        Feeds, clusters and articles of the project are removed by the
        schema's ``ON DELETE CASCADE`` rules.
        """
        with self.connect() as conn:
            cur = conn.execute("DELETE FROM projects WHERE id = ?", (project_id,))
        return cur.rowcount > 0

    # ── feeds ────────────────────────────────────────────────────────────
    def add_feed(
        self,
        project_id: str,
        url: str,
        name: str,
        category: str = "",
        poll_interval_seconds: int = 1800,
    ) -> dict[str, Any]:
        """Insert an enabled feed and return the stored row.

        Raises ``sqlite3.IntegrityError`` if the project does not exist or
        already has a feed with this URL.
        """
        fid = _new_id()
        with self.connect() as conn:
            conn.execute(
                """
                INSERT INTO feeds
                    (id, project_id, url, name, category, enabled, poll_interval_seconds)
                VALUES (?, ?, ?, ?, ?, 1, ?)
                """,
                (fid, project_id, url, name, category, poll_interval_seconds),
            )
        # Re-read after the commit so the result carries every column.
        return self.get_feed(fid)  # type: ignore[return-value]

    def get_feed(self, feed_id: str) -> dict[str, Any] | None:
        """Return the feed with this id, or ``None`` if there is none."""
        with self.connect() as conn:
            row = conn.execute(
                "SELECT * FROM feeds WHERE id = ?", (feed_id,)
            ).fetchone()
        return dict(row) if row else None

    def list_feeds(self, project_id: str) -> list[dict[str, Any]]:
        """Return every feed of a project (enabled or not), ordered by name."""
        with self.connect() as conn:
            rows = conn.execute(
                "SELECT * FROM feeds WHERE project_id = ? ORDER BY name",
                (project_id,),
            ).fetchall()
        return [dict(r) for r in rows]

    def list_due_feeds(self, now: datetime | None = None) -> list[dict[str, Any]]:
        """Feeds whose last_polled + poll_interval_seconds is in the past.

        Only enabled feeds are considered; a feed that has never been polled
        is always due. *now* defaults to the current UTC time.
        """
        now = now or datetime.now(timezone.utc)
        now_iso = now.isoformat()
        with self.connect() as conn:
            rows = conn.execute(
                """
                SELECT * FROM feeds
                WHERE enabled = 1
                  AND (last_polled IS NULL
                       OR datetime(last_polled, '+' || poll_interval_seconds || ' seconds')
                          <= datetime(?))
                """,
                (now_iso,),
            ).fetchall()
        return [dict(r) for r in rows]

    def delete_feed(self, feed_id: str) -> bool:
        """Delete a feed; return True if a row was removed."""
        with self.connect() as conn:
            cur = conn.execute("DELETE FROM feeds WHERE id = ?", (feed_id,))
        return cur.rowcount > 0

    def mark_feed_polled(
        self, feed_id: str, error: str | None = None
    ) -> None:
        """Stamp the feed with the current time and record *error*.

        Passing ``error=None`` clears any previously stored ``last_error``.
        """
        with self.connect() as conn:
            conn.execute(
                "UPDATE feeds SET last_polled = ?, last_error = ? WHERE id = ?",
                (_now_iso(), error, feed_id),
            )

    # ── articles + clusters ──────────────────────────────────────────────
    def get_article_by_canonical_url(
        self, project_id: str, canonical_url: str
    ) -> dict[str, Any] | None:
        """Return the project's article with this canonical URL, or ``None``."""
        with self.connect() as conn:
            row = conn.execute(
                "SELECT * FROM articles WHERE project_id = ? AND canonical_url = ?",
                (project_id, canonical_url),
            ).fetchone()
        return dict(row) if row else None

    def recent_articles_for_dedup(
        self, project_id: str, days: int = 7
    ) -> list[dict[str, Any]]:
        """Return the project's articles fetched within the last *days* days.

        Each row carries only the columns selected below (``id``, ``title``,
        ``summary``, ``body``, ``canonical_url``, ``cluster_id``).
        """
        with self.connect() as conn:
            rows = conn.execute(
                # fetched_at is stored as ISO-8601 with a 'T' separator and a
                # UTC offset, while datetime('now', ...) yields
                # 'YYYY-MM-DD HH:MM:SS'. Comparing the raw text would let
                # every article from the cutoff date through ('T' sorts after
                # ' '), so both sides are normalised with datetime().
                """
                SELECT id, title, summary, body, canonical_url, cluster_id
                FROM articles
                WHERE project_id = ?
                  AND datetime(fetched_at) >= datetime('now', ?)
                """,
                (project_id, f"-{days} days"),
            ).fetchall()
        return [dict(r) for r in rows]

    def insert_article(self, article: dict[str, Any]) -> dict[str, Any] | None:
        """Insert one article. Returns the inserted row, or ``None`` if a
        row with this ``(project_id, canonical_url)`` already exists. The
        race-safe behavior is important: both the background poller and
        an on-demand /refresh may attempt to insert the same item.

        Note that *article* is mutated: its ``"id"`` key is set to the id
        used for the insert (generated when missing or empty).
        """
        aid = article.get("id") or _new_id()
        article["id"] = aid
        with self.connect() as conn:
            cur = conn.execute(
                """
                INSERT INTO articles
                    (id, project_id, canonical_url, source_url, source, title,
                     summary, body, published, fetched_at, cluster_id)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(project_id, canonical_url) DO NOTHING
                """,
                (
                    aid,
                    article["project_id"],
                    article["canonical_url"],
                    article["source_url"],
                    article["source"],
                    article["title"],
                    article.get("summary", ""),
                    article.get("body", ""),
                    article.get("published"),
                    article.get("fetched_at") or _now_iso(),
                    article.get("cluster_id"),
                ),
            )
        # rowcount is 0 when ON CONFLICT ... DO NOTHING skipped the insert.
        if cur.rowcount == 0:
            return None
        return self.get_article(aid)

    def get_article(self, article_id: str) -> dict[str, Any] | None:
        """Return the article with this id, or ``None`` if there is none."""
        with self.connect() as conn:
            row = conn.execute(
                "SELECT * FROM articles WHERE id = ?", (article_id,)
            ).fetchone()
        return dict(row) if row else None

    def list_articles(
        self,
        project_id: str,
        since: str | None = None,
        limit: int = 100,
        deduped: bool = True,
    ) -> list[dict[str, Any]]:
        """List articles for a project, newest first.

        "Newest" is by ``published``, falling back to ``fetched_at`` when
        ``published`` is NULL. At most *limit* rows are returned.

        *since*, when given, is compared as text against the stored
        ``fetched_at`` value.
        NOTE(review): this assumes callers pass the same ISO-8601 form that
        ``fetched_at`` is stored in — confirm against the API layer.

        When ``deduped`` is True (default), returns only the canonical
        article from each cluster (or articles with no cluster, which means
        they are themselves canonical).
        """
        params: list[Any] = [project_id]
        where = "WHERE a.project_id = ?"
        if since:
            where += " AND a.fetched_at >= ?"
            params.append(since)
        if deduped:
            # Either no cluster (singleton — itself canonical) OR the row IS
            # the canonical of its cluster.
            where += (
                " AND (a.cluster_id IS NULL OR a.id = ("
                " SELECT canonical_article_id FROM clusters WHERE id = a.cluster_id"
                "))"
            )
        params.append(limit)
        with self.connect() as conn:
            # Only fixed SQL fragments are interpolated into the statement;
            # every caller-supplied value is bound through `params`.
            rows = conn.execute(
                f"""
                SELECT a.* FROM articles a
                {where}
                ORDER BY datetime(COALESCE(a.published, a.fetched_at)) DESC
                LIMIT ?
                """,
                params,
            ).fetchall()
        return [dict(r) for r in rows]

    def create_cluster(
        self, project_id: str, canonical_article_id: str
    ) -> str:
        """Create a cluster headed by *canonical_article_id*; return its id.

        This only inserts the cluster row; use :meth:`attach_to_cluster` to
        point articles at it.
        """
        cid = _new_id()
        with self.connect() as conn:
            conn.execute(
                """
                INSERT INTO clusters (id, project_id, canonical_article_id, created_at)
                VALUES (?, ?, ?, ?)
                """,
                (cid, project_id, canonical_article_id, _now_iso()),
            )
        return cid

    def attach_to_cluster(self, article_id: str, cluster_id: str) -> None:
        """Set the article's ``cluster_id``; a no-op for an unknown article id."""
        with self.connect() as conn:
            conn.execute(
                "UPDATE articles SET cluster_id = ? WHERE id = ?",
                (cluster_id, article_id),
            )

    def article_cluster_members(self, cluster_id: str) -> list[dict[str, Any]]:
        """Return ``id``/``source``/``title``/``source_url`` of every article
        in the cluster, ordered by ``fetched_at``.
        """
        with self.connect() as conn:
            rows = conn.execute(
                """
                SELECT id, source, title, source_url
                FROM articles
                WHERE cluster_id = ?
                ORDER BY fetched_at
                """,
                (cluster_id,),
            ).fetchall()
        return [dict(r) for r in rows]
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def _new_id() -> str:
|
|
353
|
+
return secrets.token_urlsafe(12)
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def _now_iso() -> str:
|
|
357
|
+
return datetime.now(timezone.utc).isoformat()
|