metatron-cli 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metatron/__init__.py ADDED
@@ -0,0 +1,51 @@
1
+ """Metatron — multi-project RSS feed manager with cross-outlet deduplication."""
2
+
3
# Package version reported by `metatron --version`. It must track the released
# distribution version; this wheel is published as 0.2.1, so the previous
# "0.2.0" literal was stale.
__version__ = "0.2.1"
4
+
5
+ from metatron.config import (
6
+ ApiConfig,
7
+ ConfigError,
8
+ DatabaseConfig,
9
+ LlmConfig,
10
+ MetatronConfig,
11
+ PollerConfig,
12
+ )
13
+ from metatron.db import Database
14
+ from metatron.dedup import (
15
+ BatchPlan,
16
+ CheapDecision,
17
+ DedupConfig,
18
+ build_batch_plan,
19
+ cheap_decide,
20
+ run_batch,
21
+ )
22
+ from metatron.llm import BatchJudge, ClusterGroup, ClusterItem
23
+ from metatron.normalize import canonicalize_url, jaccard, normalize_title, tokenize
24
+ from metatron.poller import Poller, poll_feeds, refresh_project_now
25
+
26
# Public API of the package: the version string plus every name imported
# above from the config, db, dedup, llm, normalize and poller submodules.
__all__ = [
    "__version__",
    "ApiConfig",
    "BatchJudge",
    "BatchPlan",
    "CheapDecision",
    "ClusterGroup",
    "ClusterItem",
    "ConfigError",
    "Database",
    "DatabaseConfig",
    "DedupConfig",
    "LlmConfig",
    "MetatronConfig",
    "Poller",
    "PollerConfig",
    "build_batch_plan",
    "canonicalize_url",
    "cheap_decide",
    "jaccard",
    "normalize_title",
    "poll_feeds",
    "refresh_project_now",
    "run_batch",
    "tokenize",
]
metatron/api.py ADDED
@@ -0,0 +1,290 @@
1
+ """FastAPI HTTP surface for Metatron.
2
+
3
+ One purpose: manage feeds, expose deduplicated articles. No curation, no
4
+ personality, no opinions about what's interesting — that's for the caller.
5
+
6
+ Endpoints:
7
+ POST /projects — create a project
8
+ GET /projects — list projects
9
+ DELETE /projects/{project_id} — delete a project (cascades)
10
+ POST /projects/{project_id}/feeds — add a feed to a project
11
+ GET /projects/{project_id}/feeds — list feeds for a project
12
+ DELETE /feeds/{feed_id} — remove a feed
13
+ GET /projects/{project_id}/articles — list deduped articles (?since=, ?limit=)
14
+ GET /articles/{article_id} — fetch one article with body + cluster members
15
+ POST /projects/{project_id}/refresh — force a synchronous poll of this project's feeds
16
+ GET /health — liveness probe (no auth)
17
+
18
+ Authentication: bearer token from config [api].api_token. When the token is
19
+ empty, the API is open (dev convenience; configure a token before exposing).
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import asyncio
25
+ import logging
26
+ from datetime import datetime, timezone
27
+ from typing import Any
28
+
29
+ from fastapi import Depends, FastAPI, Header, HTTPException, Path as PathParam, Query
30
+ from pydantic import BaseModel, Field
31
+
32
+ from metatron.config import MetatronConfig
33
+ from metatron.db import Database
34
+ from metatron.dedup import DedupConfig
35
+ from metatron.llm import BatchJudge
36
+ from metatron.poller import Poller, refresh_project_now
37
+
38
+ logger = logging.getLogger("metatron.api")
39
+
40
+
41
+ # ── request / response shapes ────────────────────────────────────────────
42
+
43
+
44
# Request body for POST /projects.
class CreateProjectIn(BaseModel):
    # Project name; validation requires between 1 and 200 characters.
    name: str = Field(..., min_length=1, max_length=200)
46
+
47
+
48
# Response shape for a project; built directly from the row dicts returned by
# db.create_project / db.list_projects.
class ProjectOut(BaseModel):
    id: str
    name: str
    # Creation timestamp, carried as a string exactly as the database returns it.
    created_at: str
52
+
53
+
54
# Request body for POST /projects/{project_id}/feeds.
class AddFeedIn(BaseModel):
    # Feed URL; must be non-empty.
    url: str = Field(..., min_length=1)
    # Optional display name; when omitted the endpoint derives one from the
    # URL's host.
    name: str | None = None
    category: str = ""
    # Seconds between polls of this feed.
    # NOTE(review): no lower bound is enforced here, so 0 or negative values
    # are accepted — confirm the poller tolerates them.
    poll_interval_seconds: int = 1800
59
+
60
+
61
# Response shape for a feed; populated from a database row by _feed_to_out.
class FeedOut(BaseModel):
    id: str
    project_id: str
    url: str
    name: str
    category: str
    enabled: bool
    poll_interval_seconds: int
    # Both fields below may be absent/None in the underlying row.
    last_polled: str | None
    last_error: str | None
71
+
72
+
73
# Response shape for an article in list views; populated from a database row
# by _article_to_out.
class ArticleOut(BaseModel):
    id: str
    project_id: str
    canonical_url: str
    source_url: str
    source: str
    title: str
    summary: str
    # May be None when the row carries no publication value.
    published: str | None
    fetched_at: str
    # None when the article has no cluster_id in its row.
    cluster_id: str | None
84
+
85
+
86
# Response shape for GET /articles/{article_id}: every ArticleOut field plus
# the article body and the members of its cluster.
class ArticleDetailOut(ArticleOut):
    body: str
    # Rows returned by db.article_cluster_members; empty when the article has
    # no cluster_id.
    cluster_members: list[dict[str, Any]] = []
89
+
90
+
91
# Response shape for POST /projects/{project_id}/refresh; built by unpacking
# the stats mapping returned by refresh_project_now.
class RefreshOut(BaseModel):
    polled: int
    new_articles: int
    duplicates: int
    cluster_joins: int
    # Defaults to 0 when the stats mapping does not include it.
    feed_errors: int = 0
97
+
98
+
99
+ # ── factory ─────────────────────────────────────────────────────────────
100
+
101
+
102
def create_app(
    *,
    config: MetatronConfig,
    db: Database,
    judge: BatchJudge,
    poller: Poller | None = None,
) -> FastAPI:
    """Build the FastAPI app wired to the given dependencies.

    Args:
        config: Resolved configuration. ``config.api.api_token`` controls
            authentication (an empty token leaves the API open),
            ``config.poller.enabled`` decides whether the poller is started,
            and ``config.poller.feed_timeout_seconds`` is passed to on-demand
            refreshes.
        db: Database handle used by every endpoint.
        judge: Batch judge handed to on-demand refreshes; its ``enabled``
            flag is reported by ``/health``.
        poller: Optional background poller. When given, it is started on
            application startup (only if enabled in config) and stopped on
            shutdown.

    Returns:
        The configured ``FastAPI`` application.
    """
    import hmac  # local import: only needed for the constant-time token check

    app = FastAPI(title="Metatron", version="0.2.0")

    expected_token = config.api.api_token
    # Pre-encode once; compare_digest only accepts ASCII-only str, so bytes are
    # compared instead to stay safe with arbitrary header contents.
    expected_token_bytes = expected_token.encode("utf-8") if expected_token else b""
    # A 401 should carry a challenge telling the client which scheme to use.
    challenge = {"WWW-Authenticate": "Bearer"}

    def require_auth(authorization: str | None = Header(default=None)) -> None:
        """Reject the request with 401 unless it carries the configured bearer token."""
        if not expected_token:
            # No token configured: the API is intentionally open.
            return
        if not authorization or not authorization.startswith("Bearer "):
            raise HTTPException(
                status_code=401, detail="Missing bearer token", headers=challenge
            )
        token = authorization.removeprefix("Bearer ").strip()
        # Constant-time comparison so response timing does not leak how much of
        # a guessed token matched.
        if not hmac.compare_digest(token.encode("utf-8"), expected_token_bytes):
            raise HTTPException(
                status_code=401, detail="Invalid bearer token", headers=challenge
            )

    @app.on_event("startup")
    async def _startup() -> None:
        if poller is not None and config.poller.enabled:
            poller.start()

    @app.on_event("shutdown")
    async def _shutdown() -> None:
        if poller is not None:
            await poller.stop()

    @app.get("/health")
    async def health() -> dict[str, Any]:
        # Liveness probe; deliberately registered without the auth dependency.
        return {"status": "ok", "llm_enabled": judge.enabled}

    # ── projects ─────────────────────────────────────────────────────────
    @app.post(
        "/projects",
        response_model=ProjectOut,
        status_code=201,
        dependencies=[Depends(require_auth)],
    )
    async def create_project(payload: CreateProjectIn) -> ProjectOut:
        try:
            row = db.create_project(payload.name)
        except Exception as e:
            # NOTE(review): every database failure is reported as 409 here,
            # not only genuine conflicts.
            raise HTTPException(status_code=409, detail=f"Project create failed: {e}") from e
        return ProjectOut(**row)

    @app.get("/projects", response_model=list[ProjectOut], dependencies=[Depends(require_auth)])
    async def list_projects() -> list[ProjectOut]:
        return [ProjectOut(**r) for r in db.list_projects()]

    @app.delete("/projects/{project_id}", status_code=204, dependencies=[Depends(require_auth)])
    async def delete_project(project_id: str = PathParam(...)) -> None:
        if not db.delete_project(project_id):
            raise HTTPException(status_code=404, detail="Project not found")

    # ── feeds ────────────────────────────────────────────────────────────
    @app.post(
        "/projects/{project_id}/feeds",
        response_model=FeedOut,
        status_code=201,
        dependencies=[Depends(require_auth)],
    )
    async def add_feed(project_id: str, payload: AddFeedIn) -> FeedOut:
        if not db.get_project(project_id):
            raise HTTPException(status_code=404, detail="Project not found")
        # Fall back to the URL's host when the caller gives no display name.
        name = payload.name or _host_of(payload.url)
        try:
            row = db.add_feed(
                project_id=project_id,
                url=payload.url,
                name=name,
                category=payload.category,
                poll_interval_seconds=payload.poll_interval_seconds,
            )
        except Exception as e:
            raise HTTPException(status_code=409, detail=f"Feed add failed: {e}") from e
        return _feed_to_out(row)

    @app.get(
        "/projects/{project_id}/feeds",
        response_model=list[FeedOut],
        dependencies=[Depends(require_auth)],
    )
    async def list_feeds(project_id: str) -> list[FeedOut]:
        if not db.get_project(project_id):
            raise HTTPException(status_code=404, detail="Project not found")
        return [_feed_to_out(r) for r in db.list_feeds(project_id)]

    @app.delete("/feeds/{feed_id}", status_code=204, dependencies=[Depends(require_auth)])
    async def delete_feed(feed_id: str) -> None:
        if not db.delete_feed(feed_id):
            raise HTTPException(status_code=404, detail="Feed not found")

    # ── articles ─────────────────────────────────────────────────────────
    @app.get(
        "/projects/{project_id}/articles",
        response_model=list[ArticleOut],
        dependencies=[Depends(require_auth)],
    )
    async def list_articles(
        project_id: str,
        since: str | None = Query(default=None),
        limit: int = Query(default=50, ge=1, le=500),
    ) -> list[ArticleOut]:
        if not db.get_project(project_id):
            raise HTTPException(status_code=404, detail="Project not found")
        rows = db.list_articles(project_id, since=since, limit=limit, deduped=True)
        return [_article_to_out(r) for r in rows]

    @app.get(
        "/articles/{article_id}",
        response_model=ArticleDetailOut,
        dependencies=[Depends(require_auth)],
    )
    async def get_article(article_id: str) -> ArticleDetailOut:
        row = db.get_article(article_id)
        if not row:
            raise HTTPException(status_code=404, detail="Article not found")
        members: list[dict[str, Any]] = []
        if row.get("cluster_id"):
            members = db.article_cluster_members(row["cluster_id"])
        return ArticleDetailOut(
            **_article_to_out(row).model_dump(),
            # A NULL body must become "" (same treatment as summary in
            # _article_to_out); passing None would fail the `body: str` field.
            body=row.get("body") or "",
            cluster_members=members,
        )

    # ── on-demand refresh ────────────────────────────────────────────────
    @app.post(
        "/projects/{project_id}/refresh",
        response_model=RefreshOut,
        dependencies=[Depends(require_auth)],
    )
    async def refresh(project_id: str) -> RefreshOut:
        if not db.get_project(project_id):
            raise HTTPException(status_code=404, detail="Project not found")
        # refresh_project_now is a plain synchronous callable, so run it in the
        # default executor to keep the event loop responsive meanwhile.
        loop = asyncio.get_running_loop()
        stats = await loop.run_in_executor(
            None,
            refresh_project_now,
            project_id,
            db,
            judge,
            DedupConfig(),
            config.poller.feed_timeout_seconds,
        )
        return RefreshOut(**stats)

    return app
249
+
250
+
251
+ # ── helpers ─────────────────────────────────────────────────────────────
252
+
253
+
254
def _feed_to_out(row: dict[str, Any]) -> FeedOut:
    """Convert a feed row dict into the ``FeedOut`` response model.

    ``id``, ``project_id``, ``url`` and ``name`` are required keys; the
    remaining fields fall back to defaults when missing from the row.
    """
    # Required columns: a missing key raises KeyError.
    fields: dict[str, Any] = {
        key: row[key] for key in ("id", "project_id", "url", "name")
    }
    # Optional columns with their fallbacks.
    fields["category"] = row.get("category") or ""
    fields["enabled"] = bool(row.get("enabled", 1))
    fields["poll_interval_seconds"] = int(row.get("poll_interval_seconds", 1800))
    for nullable in ("last_polled", "last_error"):
        fields[nullable] = row.get(nullable)
    return FeedOut(**fields)
266
+
267
+
268
def _article_to_out(row: dict[str, Any]) -> ArticleOut:
    """Convert an article row dict into the ``ArticleOut`` response model.

    The identifying, URL, source, title and ``fetched_at`` keys are required;
    ``summary`` becomes ``""`` when missing or empty, and ``published`` /
    ``cluster_id`` become ``None`` when missing.
    """
    required = (
        "id",
        "project_id",
        "canonical_url",
        "source_url",
        "source",
        "title",
        "fetched_at",
    )
    # Required columns: a missing key raises KeyError.
    fields: dict[str, Any] = {key: row[key] for key in required}
    fields["summary"] = row.get("summary") or ""
    fields["published"] = row.get("published")
    fields["cluster_id"] = row.get("cluster_id")
    return ArticleOut(**fields)
281
+
282
+
283
def _host_of(url: str) -> str:
    """Return a short display name for *url*: its host without a leading ``www.``.

    When the URL has no parseable host the URL itself is used (also minus a
    leading ``www.``); when parsing raises ``ValueError`` the URL is returned
    unchanged.
    """
    from urllib.parse import urlsplit

    try:
        hostname = urlsplit(url).hostname
    except ValueError:
        return url
    label = hostname if hostname else url
    return label.removeprefix("www.")
metatron/cli.py ADDED
@@ -0,0 +1,221 @@
1
+ """Metatron CLI.
2
+
3
+ Subcommands:
4
+ metatron serve — run the HTTP API + background poller
5
+ metatron config init — write a starter config to ~/.config/metatron/config.toml
6
+ metatron config show — print the resolved config
7
+ metatron seed-feeds <project> <feeds.json>
8
+ — bulk-add feeds to a project from a JSON file
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import json
15
+ import logging
16
+ import os
17
+ import sys
18
+ from pathlib import Path
19
+
20
+ from metatron.config import (
21
+ DEFAULT_CONFIG_TEMPLATE,
22
+ ConfigError,
23
+ MetatronConfig,
24
+ )
25
+
26
+ logger = logging.getLogger("metatron.cli")
27
+
28
+
29
def _resolve_config_path() -> Path:
    """Return the config file path: ``$METATRON_CONFIG`` if set and non-empty,
    otherwise the default path reported by ``MetatronConfig``."""
    env_path = os.environ.get("METATRON_CONFIG")
    if env_path:
        return Path(env_path)
    return MetatronConfig.default_config_path()
32
+
33
+
34
def _cmd_config_init() -> int:
    """Write the starter config to the resolved config path.

    Returns 0 after writing the file, or 1 (with a message on stderr) when a
    config file already exists there; an existing file is never overwritten.
    """
    target = _resolve_config_path()
    if not target.exists():
        # Create missing parent directories before writing the template.
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(DEFAULT_CONFIG_TEMPLATE, encoding="utf-8")
        print(f"Wrote starter config to {target}")
        return 0
    print(f"Config already exists: {target}", file=sys.stderr)
    return 1
43
+
44
+
45
def _cmd_config_show() -> int:
    """Print a summary of the resolved configuration.

    Returns 0 on success, or 1 (with the error on stderr) when the config
    cannot be loaded.
    """
    cfg_path = _resolve_config_path()
    try:
        cfg = MetatronConfig.from_file()
    except ConfigError as err:
        print(f"Configuration error: {err}", file=sys.stderr)
        return 1

    api, llm, poller = cfg.api, cfg.llm, cfg.poller
    # Only report whether a token is set; the token itself is never printed.
    token_state = "yes" if api.api_token else "no (open)"
    llm_state = "enabled" if llm.enabled else "disabled"

    print(f"Config path: {cfg_path}")
    print(f"Exists: {cfg_path.exists()}")
    print(f"API host:port: {api.host}:{api.port}")
    print(f"API token set: {token_state}")
    print(f"LLM tiebreaker: {llm_state}")
    print(f"LLM model: {llm.model} (via {llm.binary} CLI)")
    print(f"Poller enabled: {poller.enabled}")
    print(f"Poller tick: {poller.tick_seconds}s")
    print(f"Feed interval: {poller.default_feed_interval_seconds}s")
    return 0
62
+
63
+
64
def _build_judge(config: MetatronConfig) -> "BatchJudge":
    """Build the batch judge described by ``config.llm``.

    With the LLM enabled this is a regular ``BatchJudge``. Otherwise it is a
    local subclass whose ``enabled`` property is False and whose ``cluster``
    always returns an empty list.
    """
    from metatron.llm import BatchJudge as _Judge

    llm = config.llm
    if llm.enabled:
        return _Judge(
            model=llm.model,
            binary=llm.binary,
            idle_timeout=llm.idle_timeout_seconds,
        )

    class _Disabled(_Judge):
        # Stand-in used when the LLM tiebreaker is switched off.
        @property
        def enabled(self) -> bool:  # type: ignore[override]
            return False

        def cluster(self, items):  # type: ignore[override]
            return []

    return _Disabled(model=llm.model, binary=llm.binary)
82
+
83
+
84
def _cmd_serve() -> int:
    """Run the HTTP API together with the background poller.

    Loads the config, builds the database, judge, poller and FastAPI app, then
    hands the app to uvicorn. Returns 1 when the config cannot be loaded, and
    0 once ``uvicorn.run`` returns.
    """
    import uvicorn

    from metatron.api import create_app
    from metatron.db import Database, default_db_path
    from metatron.llm import BatchJudge
    from metatron.poller import Poller

    try:
        cfg = MetatronConfig.from_file()
    except ConfigError as err:
        print(f"Configuration error: {err}", file=sys.stderr)
        return 1

    # An empty database path in the config means "use the default location".
    if cfg.database.path:
        database = Database(path=Path(cfg.database.path))
    else:
        database = Database(path=default_db_path())

    batch_judge = _build_judge(cfg)
    background_poller = Poller(
        db=database,
        judge=batch_judge,
        tick_seconds=cfg.poller.tick_seconds,
        feed_timeout_seconds=cfg.poller.feed_timeout_seconds,
    )
    application = create_app(
        config=cfg, db=database, judge=batch_judge, poller=background_poller
    )

    uvicorn.run(application, host=cfg.api.host, port=cfg.api.port, log_level="info")
    return 0
116
+
117
+
118
def _cmd_seed_feeds(project_name: str, feeds_path: str) -> int:
    """Bulk-add feeds from a JSON file to a project, creating the project if needed.

    The file may be either an object with a ``"feeds"`` list or a bare
    top-level list. Each entry is an object with a required ``"url"`` and
    optional ``"name"``, ``"category"`` and ``"poll_interval_seconds"`` keys.
    Entries that are not objects, lack a URL, or fail to insert are counted as
    skipped rather than aborting the run.

    Args:
        project_name: Name of the target project (created if missing).
        feeds_path: Path to the JSON file.

    Returns:
        0 once the entries have been processed (even if some were skipped);
        1 when the file is missing, is not valid JSON, contains no feeds, or
        the configuration cannot be loaded.
    """
    path = Path(feeds_path)
    if not path.exists():
        print(f"feeds file not found: {path}", file=sys.stderr)
        return 1
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        print(f"feeds file not valid JSON: {e}", file=sys.stderr)
        return 1

    # Accept both {"feeds": [...]} and a bare [...]. The type must be checked
    # before touching `.get`, otherwise a top-level list raises AttributeError.
    if isinstance(data, dict):
        feeds = data.get("feeds", [])
    elif isinstance(data, list):
        feeds = data
    else:
        feeds = []
    if not isinstance(feeds, list) or not feeds:
        print("no feeds found in input", file=sys.stderr)
        return 1

    # Imported only after the input has been validated, so bad input is
    # reported without loading the database layer.
    from metatron.db import Database, default_db_path

    try:
        config = MetatronConfig.from_file()
    except ConfigError as e:
        # Same handling as the other subcommands that load the config.
        print(f"Configuration error: {e}", file=sys.stderr)
        return 1
    db_path = Path(config.database.path) if config.database.path else default_db_path()
    db = Database(path=db_path)

    # Reuse an existing project with this name, otherwise create one.
    project = next(
        (p for p in db.list_projects() if p["name"] == project_name), None
    )
    if project is None:
        project = db.create_project(project_name)
    project_id = project["id"]

    added = 0
    skipped = 0
    for feed in feeds:
        # Non-object entries (e.g. bare strings) cannot carry a "url" key.
        url = feed.get("url") if isinstance(feed, dict) else None
        if not url:
            skipped += 1
            continue
        try:
            db.add_feed(
                project_id=project_id,
                url=url,
                name=feed.get("name") or url,
                category=feed.get("category", ""),
                poll_interval_seconds=int(
                    feed.get(
                        "poll_interval_seconds",
                        config.poller.default_feed_interval_seconds,
                    )
                ),
            )
            added += 1
        except Exception as e:
            # Best-effort bulk import: report the failing feed and keep going.
            print(f" skip {url}: {e}", file=sys.stderr)
            skipped += 1
    print(f"Project: {project_name} ({project_id})")
    print(f"Added: {added}")
    print(f"Skipped: {skipped}")
    return 0
178
+
179
+
180
def main(args: list[str] | None = None) -> int:
    """CLI entry point: configure logging, parse arguments, run the subcommand.

    Args:
        args: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The exit code of the chosen subcommand, or 0 after printing help when
        no subcommand was given.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    from metatron import __version__

    parser = argparse.ArgumentParser(
        description="Metatron — multi-project RSS feed manager with cross-outlet deduplication",
    )
    parser.add_argument("--version", action="version", version=f"metatron {__version__}")
    subparsers = parser.add_subparsers(dest="subcommand")

    subparsers.add_parser("serve", help="Run the HTTP API + background poller")

    config_parser = subparsers.add_parser("config", help="Config management")
    config_parser.add_argument("action", choices=["init", "show"])

    seed_parser = subparsers.add_parser(
        "seed-feeds", help="Bulk-add feeds to a project from JSON"
    )
    seed_parser.add_argument("project", help="Project name (created if missing)")
    seed_parser.add_argument("feeds_path", help="Path to a JSON file with feeds")

    parsed = parser.parse_args(args)
    command = parsed.subcommand

    if command == "serve":
        return _cmd_serve()
    if command == "seed-feeds":
        return _cmd_seed_feeds(parsed.project, parsed.feeds_path)
    if command == "config":
        # Dispatch on the config action instead of chaining ifs.
        config_actions = {"init": _cmd_config_init, "show": _cmd_config_show}
        handler = config_actions.get(parsed.action)
        if handler is not None:
            return handler()

    # No subcommand given: show usage and exit successfully.
    parser.print_help()
    return 0
219
+
220
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())