metatron-cli 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metatron/config.py ADDED
@@ -0,0 +1,187 @@
1
+ """Metatron configuration — loaded from ~/.config/metatron/config.toml.
2
+
3
+ XDG-style config location, stdlib tomllib parsing, no .env files, no required
4
+ fields. Sensible defaults so the service runs out of the box; use the
5
+ [llm] section to tune or disable Sonnet-powered cross-outlet dedup (enabled
6
+ by default), and the [api] section to set a bearer token.
7
+
8
+ Resolution order for the config file path:
9
+ 1. Explicit path passed to ``from_file``
10
+ 2. ``METATRON_CONFIG`` env var (operator override only)
11
+ 3. ``$XDG_CONFIG_HOME/metatron/config.toml``
12
+ 4. ``~/.config/metatron/config.toml``
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import os
18
+ import tomllib
19
+ from dataclasses import dataclass, field
20
+ from pathlib import Path
21
+
22
+
23
class ConfigError(Exception):
    """Signals that the config file could not be read or is malformed."""
25
+
26
+
27
@dataclass(frozen=True)
class ApiConfig:
    """Values of the ``[api]`` config table (HTTP API settings)."""

    # Bind address; the default keeps the server on localhost.
    host: str = "127.0.0.1"
    # TCP port to bind.
    port: int = 8765
    # Bearer token; the config template describes an empty value as
    # "auth disabled (dev only)".
    api_token: str = ""
34
+
35
+
36
@dataclass(frozen=True)
class DatabaseConfig:
    """Values of the ``[database]`` config table."""

    # SQLite file location; the config template documents the empty string
    # as "use the default path".
    path: str = ""
41
+
42
+
43
@dataclass(frozen=True)
class LlmConfig:
    """Settings for the Claude CLI wrapper used as the dedup tiebreaker.

    The tiebreaker shells out to a locally installed ``claude`` binary that
    authenticates through your Claude subscription, so no API key is
    configured here. ``enabled`` switches the tiebreaker on or off as a
    whole; ``model`` is forwarded as ``claude -p --model <model>``.
    """

    enabled: bool = True
    model: str = "sonnet"
    binary: str = "claude"
    # Seconds of stdout/stderr silence after which the CLI subprocess is
    # killed. This is deliberately an idle timeout rather than a wall-clock
    # one: a call that keeps producing output may run as long as it needs,
    # because cutting off a healthy call wastes the tokens already spent.
    idle_timeout_seconds: float = 120.0
60
+
61
+
62
@dataclass(frozen=True)
class PollerConfig:
    """Values of the ``[poller]`` config table (background polling)."""

    enabled: bool = True
    # Interval, in seconds, at which the loop checks for due feeds.
    tick_seconds: int = 60
    # Polling interval applied to a feed by default.
    default_feed_interval_seconds: int = 1800
    # Timeout, in seconds, for a single feed fetch.
    feed_timeout_seconds: int = 30
70
+
71
+
72
@dataclass(frozen=True)
class MetatronConfig:
    """Immutable configuration loaded from TOML.

    Each attribute mirrors one TOML table. A table or key that is absent
    from the file keeps the default declared on the corresponding section
    dataclass, so defaults live in exactly one place.
    """

    api: ApiConfig = field(default_factory=ApiConfig)
    database: DatabaseConfig = field(default_factory=DatabaseConfig)
    llm: LlmConfig = field(default_factory=LlmConfig)
    poller: PollerConfig = field(default_factory=PollerConfig)
    # Populated from the ``dir`` key of the ``[logging]`` table.
    log_dir: Path = field(default_factory=lambda: Path("logs"))

    @staticmethod
    def default_config_path() -> Path:
        """Return ``<config base>/metatron/config.toml``.

        The base is ``$XDG_CONFIG_HOME`` when that variable is set and
        non-empty, otherwise ``~/.config``.
        """
        xdg = os.environ.get("XDG_CONFIG_HOME")
        base = Path(xdg) if xdg else Path.home() / ".config"
        return base / "metatron" / "config.toml"

    @classmethod
    def from_file(cls, path: Path | None = None) -> MetatronConfig:
        """Load config from TOML. If path is None, follow the resolution order.

        Resolution order when ``path`` is None: the ``METATRON_CONFIG``
        environment variable (with a leading ``~`` expanded), then
        :meth:`default_config_path`.

        Returns:
            The parsed configuration, or an all-defaults instance when the
            file does not exist.

        Raises:
            ConfigError: the file exists but cannot be read, is not valid
                TOML, or contains a value of the wrong shape.
        """
        if path is None:
            override = os.environ.get("METATRON_CONFIG")
            # Environment values set outside a shell (service units, env
            # files) are not tilde-expanded for us.
            path = (
                Path(override).expanduser()
                if override
                else cls.default_config_path()
            )

        # Open first and react to the failure instead of checking
        # ``exists()`` beforehand: this avoids the check/open race and routes
        # permission problems through ConfigError as documented.
        try:
            with path.open("rb") as fh:
                data = tomllib.load(fh)
        except (FileNotFoundError, NotADirectoryError):
            # A missing file is fine: run on defaults.
            return cls()
        except (OSError, tomllib.TOMLDecodeError) as e:
            raise ConfigError(f"Failed to read {path}: {e}") from e

        return cls._from_dict(data)

    @classmethod
    def _from_dict(cls, data: dict) -> MetatronConfig:
        """Build a config from an already-parsed TOML mapping.

        Only keys present in ``data`` are passed to the section dataclasses;
        everything else keeps the dataclass default.

        Raises:
            ConfigError: a section is not a table or a value cannot be
                converted to the expected type.
        """
        api = cls._pick(
            data, "api", {"host": str, "port": int, "api_token": str}
        )
        database = cls._pick(data, "database", {"path": str})
        llm = cls._pick(
            data,
            "llm",
            {
                "enabled": cls._as_bool,
                "model": str,
                "binary": str,
                "idle_timeout_seconds": float,
            },
        )
        poller = cls._pick(
            data,
            "poller",
            {
                "enabled": cls._as_bool,
                "tick_seconds": int,
                "default_feed_interval_seconds": int,
                "feed_timeout_seconds": int,
            },
        )
        logging_section = cls._pick(data, "logging", {"dir": str})

        return cls(
            api=ApiConfig(**api),
            database=DatabaseConfig(**database),
            llm=LlmConfig(**llm),
            poller=PollerConfig(**poller),
            log_dir=Path(logging_section.get("dir", "logs")),
        )

    @staticmethod
    def _pick(data: dict, table: str, converters: dict) -> dict:
        """Extract and convert the known keys of one TOML table.

        ``converters`` maps each recognised key to a callable that coerces
        the raw value. Keys missing from the table are omitted from the
        result; unknown keys in the table are ignored.
        """
        section = data.get(table) or {}
        if not isinstance(section, dict):
            raise ConfigError(
                f"[{table}] must be a table, got {type(section).__name__}"
            )
        values = {}
        for key, convert in converters.items():
            if key not in section:
                continue
            try:
                values[key] = convert(section[key])
            except (TypeError, ValueError) as e:
                raise ConfigError(
                    f"Invalid value for [{table}].{key}: {e}"
                ) from e
        return values

    @staticmethod
    def _as_bool(value: object) -> bool:
        """Coerce a TOML value to bool without treating ``"false"`` as True.

        Plain ``bool(value)`` is truthy for any non-empty string, so a quoted
        boolean such as ``enabled = "false"`` would silently enable the
        feature. Strings are therefore matched against common spellings;
        anything unrecognised raises ValueError.
        """
        if isinstance(value, str):
            text = value.strip().lower()
            if text in ("true", "yes", "on", "1"):
                return True
            if text in ("false", "no", "off", "0", ""):
                return False
            raise ValueError(f"expected a boolean, got {value!r}")
        return bool(value)
145
+
146
+
147
# Commented starter contents for a config.toml. Every value shown here equals
# the default declared on the matching *Config dataclass in this module, so
# the template and the built-in defaults describe the same configuration.
DEFAULT_CONFIG_TEMPLATE = """\
# Metatron configuration
# Location: ~/.config/metatron/config.toml
# Override with METATRON_CONFIG=/path/to/config.toml

[api]
# HTTP server bind. Default 127.0.0.1 (localhost-only).
host = "127.0.0.1"
port = 8765
# Bearer token clients must present. Leave empty to disable auth (dev only).
api_token = ""

[database]
# Override the default SQLite path. Empty = ~/.local/share/metatron/metatron.db
path = ""

[llm]
# Cross-outlet dedup tiebreaker via the local `claude` CLI (Claude Code).
# Uses your Claude subscription — no API key needed. Set enabled = false
# to skip the tiebreaker entirely (cheaper, but cross-outlet duplicates
# with different headlines will leak through).
enabled = true
model = "sonnet"
binary = "claude"
# Kill the CLI subprocess only if stdout has been silent this long.
# A productive call is never killed on wall-clock alone.
idle_timeout_seconds = 120.0

[poller]
# Background polling. Disable to fetch only on-demand via /refresh.
enabled = true
# How often the loop wakes up to check for due feeds.
tick_seconds = 60
# Default polling interval per feed (overridable per feed).
default_feed_interval_seconds = 1800
# Per-feed HTTP timeout when polling.
feed_timeout_seconds = 30

[logging]
dir = "logs"
"""
metatron/db.py ADDED
@@ -0,0 +1,357 @@
1
+ """SQLite schema and helpers for Metatron.
2
+
3
+ Single-file database, by default under $XDG_STATE_HOME, else $XDG_DATA_HOME,
4
+ else ~/.local/share, at metatron/metatron.db (override with [database].path in config.toml).
5
+
6
+ Schema:
7
+ projects(id, name, created_at)
8
+ feeds(id, project_id, url, name, category, enabled, poll_interval_seconds, last_polled, last_error)
9
+ articles(id, project_id, canonical_url, source_url, source, title, summary, body, published, fetched_at, cluster_id)
10
+ clusters(id, project_id, canonical_article_id, created_at)
11
+
12
+ A "cluster" is a deduplicated group of articles all covering the same story
13
+ across outlets. The canonical_article_id points at the first article in the
14
+ cluster (the one we keep showing). Members of a cluster have cluster_id set
15
+ to that cluster's id.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import os
21
+ import secrets
22
+ import sqlite3
23
+ import threading
24
+ from contextlib import contextmanager
25
+ from datetime import datetime, timezone
26
+ from pathlib import Path
27
+ from typing import Any, Iterator
28
+
29
+
30
def default_db_path() -> Path:
    """Return the default location of the SQLite database file.

    The first non-empty value among ``$XDG_STATE_HOME`` and
    ``$XDG_DATA_HOME`` is used as the base directory; when neither is set
    the base is ``~/.local/share``. The file itself is always
    ``metatron/metatron.db`` beneath that base.
    """
    for variable in ("XDG_STATE_HOME", "XDG_DATA_HOME"):
        root = os.environ.get(variable)
        if root:
            return Path(root) / "metatron" / "metatron.db"
    return Path.home() / ".local" / "share" / "metatron" / "metatron.db"
34
+
35
+
36
# DDL executed via executescript() each time a Database is constructed. Every
# statement uses IF NOT EXISTS, so re-running it against an existing file is
# a no-op. `clusters` is declared before `articles` because
# articles.cluster_id references clusters(id); clusters.canonical_article_id
# is a plain TEXT column with no foreign-key constraint.
_SCHEMA = """
CREATE TABLE IF NOT EXISTS projects (
id TEXT PRIMARY KEY,
name TEXT NOT NULL UNIQUE,
created_at TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS feeds (
id TEXT PRIMARY KEY,
project_id TEXT NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
url TEXT NOT NULL,
name TEXT NOT NULL,
category TEXT NOT NULL DEFAULT '',
enabled INTEGER NOT NULL DEFAULT 1,
poll_interval_seconds INTEGER NOT NULL DEFAULT 1800,
last_polled TEXT,
last_error TEXT,
UNIQUE(project_id, url)
);

CREATE TABLE IF NOT EXISTS clusters (
id TEXT PRIMARY KEY,
project_id TEXT NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
canonical_article_id TEXT NOT NULL,
created_at TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS articles (
id TEXT PRIMARY KEY,
project_id TEXT NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
canonical_url TEXT NOT NULL,
source_url TEXT NOT NULL,
source TEXT NOT NULL,
title TEXT NOT NULL,
summary TEXT NOT NULL DEFAULT '',
body TEXT NOT NULL DEFAULT '',
published TEXT,
fetched_at TEXT NOT NULL,
cluster_id TEXT REFERENCES clusters(id) ON DELETE SET NULL,
UNIQUE(project_id, canonical_url)
);

CREATE INDEX IF NOT EXISTS idx_articles_project_fetched ON articles(project_id, fetched_at);
CREATE INDEX IF NOT EXISTS idx_articles_cluster ON articles(cluster_id);
CREATE INDEX IF NOT EXISTS idx_feeds_project_enabled ON feeds(project_id, enabled);
"""
82
+
83
+
84
class Database:
    """Thin wrapper around sqlite3 that opens one short-lived connection per call.

    Every method runs inside its own :meth:`connect` context, so no
    connection object is shared between threads. Writes happen from the API
    thread and the poller thread; the constructor switches the file to WAL
    journal mode and each connection uses a 10-second busy timeout to handle
    contention.
    """

    def __init__(self, path: Path | None = None) -> None:
        """Open (creating if needed) the database at ``path``.

        Args:
            path: SQLite file location; ``default_db_path()`` when omitted.
                Missing parent directories are created.
        """
        self.path = path or default_db_path()
        self.path.parent.mkdir(parents=True, exist_ok=True)
        # Not used by any method of this class; kept so the attribute
        # remains available to existing code that may reference it.
        self._local = threading.local()
        with self.connect() as conn:
            conn.executescript(_SCHEMA)
            conn.execute("PRAGMA journal_mode = WAL")
            conn.execute("PRAGMA foreign_keys = ON")

    @contextmanager
    def connect(self) -> Iterator[sqlite3.Connection]:
        """Yield a fresh connection; commit on success, always close.

        Rows come back as ``sqlite3.Row`` and foreign-key enforcement is
        switched on for the connection. If the body raises, ``commit`` is
        skipped and the connection is closed, discarding pending changes.
        """
        conn = sqlite3.connect(self.path, timeout=10.0)
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA foreign_keys = ON")
        try:
            yield conn
            conn.commit()
        finally:
            conn.close()

    # ── projects ─────────────────────────────────────────────────────────
    def create_project(self, name: str) -> dict[str, Any]:
        """Insert a project and return its ``id``, ``name`` and ``created_at``.

        ``projects.name`` is UNIQUE, so a duplicate name surfaces as the
        sqlite3 integrity error raised by the INSERT.
        """
        pid = _new_id()
        now = _now_iso()
        with self.connect() as conn:
            conn.execute(
                "INSERT INTO projects (id, name, created_at) VALUES (?, ?, ?)",
                (pid, name, now),
            )
        return {"id": pid, "name": name, "created_at": now}

    def list_projects(self) -> list[dict[str, Any]]:
        """Return all projects ordered by ``created_at``."""
        with self.connect() as conn:
            rows = conn.execute(
                "SELECT id, name, created_at FROM projects ORDER BY created_at"
            ).fetchall()
        return [dict(r) for r in rows]

    def get_project(self, project_id: str) -> dict[str, Any] | None:
        """Return the project with this id, or ``None`` if there is none."""
        with self.connect() as conn:
            row = conn.execute(
                "SELECT id, name, created_at FROM projects WHERE id = ?",
                (project_id,),
            ).fetchone()
        return dict(row) if row else None

    def delete_project(self, project_id: str) -> bool:
        """Delete a project; return True if a row was removed.

        Feeds, clusters and articles reference ``projects(id)`` with
        ``ON DELETE CASCADE``, so they are removed with the project.
        """
        with self.connect() as conn:
            cur = conn.execute("DELETE FROM projects WHERE id = ?", (project_id,))
        return cur.rowcount > 0

    # ── feeds ────────────────────────────────────────────────────────────
    def add_feed(
        self,
        project_id: str,
        url: str,
        name: str,
        category: str = "",
        poll_interval_seconds: int = 1800,
    ) -> dict[str, Any]:
        """Insert an enabled feed and return the stored row.

        ``(project_id, url)`` is UNIQUE; a duplicate surfaces as the sqlite3
        integrity error raised by the INSERT.
        """
        fid = _new_id()
        with self.connect() as conn:
            conn.execute(
                """
                INSERT INTO feeds
                    (id, project_id, url, name, category, enabled,
                     poll_interval_seconds)
                VALUES (?, ?, ?, ?, ?, 1, ?)
                """,
                (fid, project_id, url, name, category, poll_interval_seconds),
            )
        return self.get_feed(fid)  # type: ignore[return-value]

    def get_feed(self, feed_id: str) -> dict[str, Any] | None:
        """Return the feed with this id, or ``None`` if there is none."""
        with self.connect() as conn:
            row = conn.execute(
                "SELECT * FROM feeds WHERE id = ?", (feed_id,)
            ).fetchone()
        return dict(row) if row else None

    def list_feeds(self, project_id: str) -> list[dict[str, Any]]:
        """Return every feed of a project, ordered by name."""
        with self.connect() as conn:
            rows = conn.execute(
                "SELECT * FROM feeds WHERE project_id = ? ORDER BY name",
                (project_id,),
            ).fetchall()
        return [dict(r) for r in rows]

    def list_due_feeds(self, now: datetime | None = None) -> list[dict[str, Any]]:
        """Feeds whose last_polled + poll_interval_seconds is in the past.

        Only enabled feeds are considered; a feed that has never been polled
        is always due. ``now`` defaults to the current UTC time.
        """
        now = now or datetime.now(timezone.utc)
        now_iso = now.isoformat()
        with self.connect() as conn:
            rows = conn.execute(
                """
                SELECT * FROM feeds
                WHERE enabled = 1
                  AND (last_polled IS NULL
                       OR datetime(last_polled,
                                   '+' || poll_interval_seconds || ' seconds')
                          <= datetime(?))
                """,
                (now_iso,),
            ).fetchall()
        return [dict(r) for r in rows]

    def delete_feed(self, feed_id: str) -> bool:
        """Delete a feed; return True if a row was removed."""
        with self.connect() as conn:
            cur = conn.execute("DELETE FROM feeds WHERE id = ?", (feed_id,))
        return cur.rowcount > 0

    def mark_feed_polled(
        self, feed_id: str, error: str | None = None
    ) -> None:
        """Stamp ``last_polled`` with the current time and store ``error``.

        Passing ``error=None`` clears any previously recorded error.
        """
        with self.connect() as conn:
            conn.execute(
                "UPDATE feeds SET last_polled = ?, last_error = ? WHERE id = ?",
                (_now_iso(), error, feed_id),
            )

    # ── articles + clusters ──────────────────────────────────────────────
    def get_article_by_canonical_url(
        self, project_id: str, canonical_url: str
    ) -> dict[str, Any] | None:
        """Return the project's article with this canonical URL, or ``None``."""
        with self.connect() as conn:
            row = conn.execute(
                "SELECT * FROM articles WHERE project_id = ? AND canonical_url = ?",
                (project_id, canonical_url),
            ).fetchone()
        return dict(row) if row else None

    def recent_articles_for_dedup(
        self, project_id: str, days: int = 7
    ) -> list[dict[str, Any]]:
        """Return dedup-relevant columns of articles fetched in the last ``days`` days."""
        with self.connect() as conn:
            rows = conn.execute(
                # fetched_at is written by _now_iso() in ISO-8601 form with a
                # 'T' separator, whereas datetime('now', ...) renders with a
                # space. Comparing the raw text would treat every row from
                # the cutoff day as newer than the cutoff, so both sides are
                # normalised through datetime() first.
                """
                SELECT id, title, summary, body, canonical_url, cluster_id
                FROM articles
                WHERE project_id = ?
                  AND datetime(fetched_at) >= datetime('now', ?)
                """,
                (project_id, f"-{days} days"),
            ).fetchall()
        return [dict(r) for r in rows]

    def insert_article(self, article: dict[str, Any]) -> dict[str, Any] | None:
        """Insert one article. Returns the inserted row, or ``None`` if a
        row with this ``(project_id, canonical_url)`` already exists. The
        race-safe behavior is important: both the background poller and
        an on-demand /refresh may attempt to insert the same item.

        ``article`` must carry ``project_id``, ``canonical_url``,
        ``source_url``, ``source`` and ``title``. A missing ``id`` is
        generated and written back into the passed dict; a missing
        ``fetched_at`` defaults to the current time.
        """
        aid = article.get("id") or _new_id()
        article["id"] = aid
        with self.connect() as conn:
            cur = conn.execute(
                """
                INSERT INTO articles
                    (id, project_id, canonical_url, source_url, source, title,
                     summary, body, published, fetched_at, cluster_id)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(project_id, canonical_url) DO NOTHING
                """,
                (
                    aid,
                    article["project_id"],
                    article["canonical_url"],
                    article["source_url"],
                    article["source"],
                    article["title"],
                    # summary/body are NOT NULL columns; an explicit None in
                    # the dict must become '' rather than fail the insert.
                    article.get("summary") or "",
                    article.get("body") or "",
                    article.get("published"),
                    article.get("fetched_at") or _now_iso(),
                    article.get("cluster_id"),
                ),
            )
            if cur.rowcount == 0:
                # Conflict: another writer already stored this URL.
                return None
        return self.get_article(aid)

    def get_article(self, article_id: str) -> dict[str, Any] | None:
        """Return the article with this id, or ``None`` if there is none."""
        with self.connect() as conn:
            row = conn.execute(
                "SELECT * FROM articles WHERE id = ?", (article_id,)
            ).fetchone()
        return dict(row) if row else None

    def list_articles(
        self,
        project_id: str,
        since: str | None = None,
        limit: int = 100,
        deduped: bool = True,
    ) -> list[dict[str, Any]]:
        """List articles for a project, newest first.

        Ordering uses ``published`` when present and ``fetched_at``
        otherwise. ``since``, when given, is compared as text against
        ``fetched_at``.

        When ``deduped`` is True (default), returns only the canonical
        article from each cluster (or articles with no cluster, which means
        they are themselves canonical).
        """
        params: list[Any] = [project_id]
        where = "WHERE a.project_id = ?"
        if since:
            where += " AND a.fetched_at >= ?"
            params.append(since)
        if deduped:
            # Either no cluster (singleton — itself canonical) OR the row IS
            # the canonical of its cluster.
            where += (
                " AND (a.cluster_id IS NULL OR a.id = ("
                " SELECT canonical_article_id FROM clusters WHERE id = a.cluster_id"
                "))"
            )
        params.append(limit)
        with self.connect() as conn:
            # Only fixed SQL fragments are interpolated; values are bound.
            rows = conn.execute(
                f"""
                SELECT a.* FROM articles a
                {where}
                ORDER BY datetime(COALESCE(a.published, a.fetched_at)) DESC
                LIMIT ?
                """,
                params,
            ).fetchall()
        return [dict(r) for r in rows]

    def create_cluster(
        self, project_id: str, canonical_article_id: str
    ) -> str:
        """Insert a cluster row and return its new id.

        This does not set ``cluster_id`` on the canonical article; use
        :meth:`attach_to_cluster` for that.
        """
        cid = _new_id()
        with self.connect() as conn:
            conn.execute(
                """
                INSERT INTO clusters (id, project_id, canonical_article_id, created_at)
                VALUES (?, ?, ?, ?)
                """,
                (cid, project_id, canonical_article_id, _now_iso()),
            )
        return cid

    def attach_to_cluster(self, article_id: str, cluster_id: str) -> None:
        """Set ``cluster_id`` on the given article."""
        with self.connect() as conn:
            conn.execute(
                "UPDATE articles SET cluster_id = ? WHERE id = ?",
                (cluster_id, article_id),
            )

    def article_cluster_members(self, cluster_id: str) -> list[dict[str, Any]]:
        """Return id, source, title and source_url of a cluster's articles, oldest fetch first."""
        with self.connect() as conn:
            rows = conn.execute(
                """
                SELECT id, source, title, source_url
                FROM articles
                WHERE cluster_id = ?
                ORDER BY fetched_at
                """,
                (cluster_id,),
            ).fetchall()
        return [dict(r) for r in rows]
350
+
351
+
352
+ def _new_id() -> str:
353
+ return secrets.token_urlsafe(12)
354
+
355
+
356
+ def _now_iso() -> str:
357
+ return datetime.now(timezone.utc).isoformat()