feedkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
feedkit/__init__.py ADDED
@@ -0,0 +1,8 @@
1
"""FeedKit — RSS/Atom feed collection with curated catalog."""

from feedkit.core import fetch_feed
from feedkit.catalog import search_catalog, get_catalog_stats
from feedkit.storage import FeedStore

# Public API re-exported at package level; keep in sync with the imports above.
__all__ = ["fetch_feed", "search_catalog", "get_catalog_stats", "FeedStore"]
__version__ = "0.1.0"
feedkit/__main__.py ADDED
@@ -0,0 +1,231 @@
1
+ """CLI entry point — python -m feedkit or `feedkit` command."""
2
+
3
+ import asyncio
4
+ import json
5
+ import sys
6
+
7
+ import click
8
+ from rich.console import Console
9
+ from rich.table import Table
10
+
11
+ from feedkit import __version__
12
+ from feedkit.catalog import get_catalog_stats, list_categories, search_catalog
13
+ from feedkit.core import collect, fetch_feed
14
+ from feedkit.storage import FeedStore
15
+
16
# Shared Rich console for all CLI output (tables and colored status lines).
console = Console()
17
+
18
+
19
def _get_store():
    """Return a fresh FeedStore instance for a single CLI command."""
    store = FeedStore()
    return store
21
+
22
+
23
@click.group()
@click.version_option(__version__, prog_name="feedkit")
def main():
    """FeedKit — RSS/Atom feed collection with curated catalog."""
    # Root command group; subcommands register themselves via @main.command().
27
+
28
+
29
@main.command()
@click.argument("query", default="")
@click.option("--category", "-c", default="", help="Filter by category")
@click.option("--language", "-l", default="", help="Filter by language (en, ko, ...)")
@click.option("--limit", "-n", default=20, help="Max results")
@click.option("--json-output", "-j", is_flag=True, help="JSON output")
def search(query, category, language, limit, json_output):
    """Search the built-in feed catalog."""
    matches = search_catalog(query, category=category, language=language, limit=limit)

    if json_output:
        # Machine-readable output for piping into other tools.
        payload = [
            {"url": m.url, "title": m.title, "category": m.category, "language": m.language}
            for m in matches
        ]
        click.echo(json.dumps(payload, ensure_ascii=False, indent=2))
        return

    table = Table(title=f"Catalog Search: {query or '(all)'} ({len(matches)} results)")
    table.add_column("Title", style="cyan", max_width=40)
    table.add_column("Category", style="green")
    table.add_column("Lang")
    table.add_column("URL", style="dim", max_width=50)
    for m in matches:
        table.add_row(m.title, m.category, m.language, m.url)
    console.print(table)
51
+
52
+
53
@main.command()
@click.argument("url")
@click.option("--category", "-c", default="", help="Category for this subscription")
@click.option("--title", "-t", default="", help="Title override")
def subscribe(url, category, title):
    """Subscribe to a feed."""
    store = _get_store()
    try:
        store.subscribe(url, title=title, category=category)
        console.print(f"[green]✓[/green] Subscribed to {url}")
    finally:
        # Always release the store, even if subscribe() raises —
        # the original leaked it on any exception.
        store.close()
63
+
64
+
65
@main.command()
@click.argument("url")
def unsubscribe(url):
    """Unsubscribe from a feed."""
    store = _get_store()
    try:
        store.unsubscribe(url)
        console.print(f"[yellow]✓[/yellow] Unsubscribed from {url}")
    finally:
        # Guarantee cleanup on failure; the original leaked the store on error.
        store.close()
73
+
74
+
75
@main.command("list")
def list_subs():
    """List all subscriptions."""
    store = _get_store()
    try:
        subs = store.list_subscriptions()

        if not subs:
            console.print("[dim]No subscriptions yet. Use `feedkit subscribe <url>` or `feedkit subscribe-catalog`.[/dim]")
        else:
            table = Table(title=f"Subscriptions ({len(subs)})")
            table.add_column("Title", style="cyan", max_width=35)
            table.add_column("Category", style="green")
            table.add_column("Fetched", justify="right")
            table.add_column("Errors", justify="right")
            for s in subs:
                # Fall back to a truncated URL when the feed has no title.
                table.add_row(s.title or s.feed_url[:35], s.category, str(s.fetch_count), str(s.error_count))
            console.print(table)
    finally:
        # Close the store even if listing/rendering raises (leak fix).
        store.close()
94
+
95
+
96
@main.command("subscribe-catalog")
@click.option("--category", "-c", required=True, help="Subscribe to all feeds in this category")
def subscribe_catalog(category):
    """Subscribe to all feeds in a catalog category."""
    feeds = search_catalog(category=category, limit=1000)
    if not feeds:
        console.print(f"[red]No feeds found in category '{category}'[/red]")
        return

    store = _get_store()
    try:
        for f in feeds:
            store.subscribe(f.url, title=f.title, category=f.category, language=f.language)
        console.print(f"[green]✓[/green] Subscribed to {len(feeds)} feeds in category '{category}'")
    finally:
        # Release the store even if a subscribe() call fails mid-loop (leak fix).
        store.close()
110
+
111
+
112
@main.command("collect")
@click.option("--category", "-c", default="", help="Only collect from this category")
@click.option("--concurrency", "-n", default=20, help="Max concurrent requests")
def collect_cmd(category, concurrency):
    """Collect articles from all subscribed feeds."""
    store = _get_store()
    try:
        sub_count = store.subscription_count()

        if sub_count == 0:
            console.print("[dim]No subscriptions. Use `feedkit subscribe` first.[/dim]")
            return

        console.print(f"Collecting from {sub_count} feeds...")

        try:
            result = asyncio.run(collect(store, category=category, concurrency=concurrency))
        except KeyboardInterrupt:
            # 130 = 128 + SIGINT, the conventional Ctrl-C exit status.
            sys.exit(130)

        console.print(f"[green]✓[/green] {result.feeds_ok}/{result.feeds_total} feeds OK, "
                      f"{result.new_articles} new articles, {result.duration_ms:.0f}ms")
        if result.errors:
            console.print(f"[yellow]{result.feeds_error} feeds failed[/yellow]")
    finally:
        # The original leaked the store on KeyboardInterrupt (sys.exit skipped
        # close()) and on any unexpected exception; finally guarantees cleanup.
        store.close()
138
+
139
+
140
@main.command()
@click.argument("query")
@click.option("--count", "-n", default=20, help="Max results")
def find(query, count):
    """Full-text search across collected articles."""
    store = _get_store()
    try:
        articles = store.search(query, count=count)

        if not articles:
            console.print("[dim]No matching articles found.[/dim]")
        else:
            table = Table(title=f"Search: '{query}' ({len(articles)} results)")
            table.add_column("Title", style="cyan", max_width=50)
            table.add_column("Published")
            table.add_column("URL", style="dim", max_width=50)
            for a in articles:
                table.add_row(a.title[:50], a.published or "", a.url[:50])
            console.print(table)
    finally:
        # Close the store even if search/rendering raises (leak fix).
        store.close()
160
+
161
+
162
@main.command()
@click.option("--count", "-n", default=20, help="Number of articles")
@click.option("--category", "-c", default="", help="Filter by category")
def latest(count, category):
    """Show latest collected articles."""
    store = _get_store()
    try:
        articles = store.get_latest(count=count, category=category)

        if not articles:
            console.print("[dim]No articles yet. Run `feedkit collect` first.[/dim]")
        else:
            table = Table(title=f"Latest Articles ({len(articles)})")
            table.add_column("Title", style="cyan", max_width=50)
            table.add_column("Published")
            table.add_column("Feed", style="dim", max_width=30)
            for a in articles:
                table.add_row(a.title[:50], a.published or "", a.feed_url[:30])
            console.print(table)
    finally:
        # Close the store even if the query or rendering raises (leak fix).
        store.close()
182
+
183
+
184
@main.command()
def stats():
    """Show catalog and subscription statistics."""
    # Catalog stats need no store and are printed first.
    cat_stats = get_catalog_stats()
    console.print(f"\n[bold]Catalog:[/bold] {cat_stats['total_feeds']} feeds")
    for cat, n in cat_stats["categories"].items():
        console.print(f" {cat}: {n}")

    store = _get_store()
    try:
        sub_count = store.subscription_count()
        art_count = store.article_count()
        console.print(f"\n[bold]Local:[/bold] {sub_count} subscriptions, {art_count} articles")
    finally:
        # Guarantee the store is released on failure (leak fix).
        store.close()
197
+
198
+
199
@main.command("categories")
def categories_cmd():
    """List available catalog categories, one per line."""
    for name in list_categories():
        console.print(f" {name}")
204
+
205
+
206
@main.command("import-opml")
@click.argument("path")
def import_opml_cmd(path):
    """Import feeds from an OPML file."""
    # Deferred import so OPML support is only loaded when used.
    from feedkit.opml import import_opml

    store = _get_store()
    try:
        count = import_opml(store, path)
        console.print(f"[green]✓[/green] Imported {count} feeds from {path}")
    finally:
        # Close the store even when the OPML file is missing/malformed (leak fix).
        store.close()
216
+
217
+
218
@main.command("export-opml")
@click.argument("path")
def export_opml_cmd(path):
    """Export subscriptions to an OPML file."""
    # Deferred import so OPML support is only loaded when used.
    from feedkit.opml import export_opml

    store = _get_store()
    try:
        count = export_opml(store, path)
        console.print(f"[green]✓[/green] Exported {count} feeds to {path}")
    finally:
        # Close the store even when writing the file fails (leak fix).
        store.close()
228
+
229
+
230
# Entry point for `python -m feedkit` and direct script execution.
if __name__ == "__main__":
    main()
feedkit/catalog.py ADDED
@@ -0,0 +1,104 @@
1
+ """Built-in curated feed catalog — 449 verified RSS/Atom feeds."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
# Curated catalog JSON bundled inside the package (feedkit/data/feeds.json).
_CATALOG_PATH = Path(__file__).parent / "data" / "feeds.json"
# Lazily-populated module-level cache; filled on first _load_catalog() call.
_catalog: list[dict] | None = None
14
+
15
+
16
@dataclass
class CatalogFeed:
    """A feed from the built-in catalog."""

    url: str          # feed URL (RSS/Atom endpoint)
    title: str        # human-readable feed title
    category: str     # top-level category, e.g. "technology"
    subcategory: str  # finer-grained grouping (may be empty)
    language: str     # language code, e.g. "en", "ko"
    domain: str       # site domain, used for substring search
26
+
27
+
28
def _load_catalog() -> list[dict]:
    """Load the bundled catalog, caching it at module level after first read."""
    global _catalog
    if _catalog is None:
        _catalog = json.loads(_CATALOG_PATH.read_text(encoding="utf-8"))
    return _catalog
34
+
35
+
36
def search_catalog(
    query: str = "",
    *,
    category: str = "",
    language: str = "",
    limit: int = 50,
) -> list[CatalogFeed]:
    """Search the built-in feed catalog.

    Args:
        query: Search by title or domain (case-insensitive substring match).
        category: Filter by category (e.g., "technology", "science", "finance").
        language: Filter by language (e.g., "en", "ko").
        limit: Maximum results to return; non-positive values return no results.
    """
    # Fix: the original checked the limit only AFTER appending, so limit=0
    # (or negative) still returned one result.
    if limit <= 0:
        return []

    catalog = _load_catalog()
    results: list[CatalogFeed] = []

    q = query.lower()
    for entry in catalog:
        if category and entry.get("category", "") != category:
            continue
        if language and entry.get("language", "") != language:
            continue
        if q:
            # Match against title, domain, or URL.
            haystacks = (
                entry.get("title", "").lower(),
                entry.get("domain", "").lower(),
                entry.get("url", "").lower(),
            )
            if not any(q in h for h in haystacks):
                continue

        results.append(CatalogFeed(
            url=entry["url"],
            title=entry.get("title", ""),
            category=entry.get("category", ""),
            subcategory=entry.get("subcategory", ""),
            language=entry.get("language", "en"),
            domain=entry.get("domain", ""),
        ))

        if len(results) >= limit:
            break

    return results
80
+
81
+
82
def get_catalog_stats() -> dict:
    """Get catalog statistics.

    Returns:
        Dict with the total feed count plus per-category and per-language
        counts, each sorted by key.
    """
    from collections import Counter

    catalog = _load_catalog()
    # Counter replaces the hand-rolled dict.get(k, 0) + 1 counting loops.
    categories = Counter(entry.get("category", "unknown") for entry in catalog)
    languages = Counter(entry.get("language", "unknown") for entry in catalog)

    return {
        "total_feeds": len(catalog),
        "categories": dict(sorted(categories.items())),
        "languages": dict(sorted(languages.items())),
    }
99
+
100
+
101
def list_categories() -> list[str]:
    """List all available categories."""
    names = set()
    for entry in _load_catalog():
        name = entry.get("category", "")
        if name:
            names.add(name)
    return sorted(names)
feedkit/core.py ADDED
@@ -0,0 +1,167 @@
1
+ """Core feed fetching logic — async HTTP + feedparser."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ import time
8
+ from dataclasses import dataclass, field
9
+ from datetime import datetime
10
+
11
+ import feedparser
12
+ import httpx
13
+
14
+ from feedkit.storage import Article, FeedStore
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
DEFAULT_TIMEOUT = 15.0      # seconds per HTTP request
DEFAULT_CONCURRENCY = 20    # max simultaneous feed fetches during collect()
USER_AGENT = "FeedKit/0.1.0 (+https://github.com/QuartzUnit/feedkit)"
21
+
22
+
23
@dataclass
class FeedEntry:
    """A single entry from a feed."""

    title: str = ""               # entry headline
    url: str = ""                 # link to the article
    summary: str = ""             # HTML-stripped, truncated summary
    published: str | None = None  # ISO-8601 timestamp, or None if unparsable
    author: str = ""              # author name as given by the feed
    feed_url: str = ""            # URL of the feed this entry came from
    feed_title: str = ""          # title of the source feed
    guid: str = ""                # dedup key: entry id, else link, else title
35
+
36
+
37
@dataclass
class CollectResult:
    """Result of a bulk feed collection."""

    feeds_total: int = 0    # feeds attempted
    feeds_ok: int = 0       # feeds fetched and parsed successfully
    feeds_error: int = 0    # feeds that raised during fetch/parse
    new_articles: int = 0   # articles newly saved to the store
    duration_ms: float = 0.0  # wall-clock duration of the whole run
    # One {"feed_url": ..., "error": ...} dict per failed feed.
    errors: list[dict] = field(default_factory=list)
47
+
48
+
49
async def fetch_feed(
    url: str,
    *,
    count: int = 50,
    timeout: float = DEFAULT_TIMEOUT,
) -> list[FeedEntry]:
    """Fetch and parse a single RSS/Atom feed.

    Args:
        url: Feed URL.
        count: Maximum entries to return.
        timeout: HTTP timeout in seconds.

    Raises:
        httpx.HTTPStatusError: On non-2xx responses.
        httpx.HTTPError: On transport-level failures (DNS, timeout, ...).
    """
    headers = {"User-Agent": USER_AGENT}

    async with httpx.AsyncClient(timeout=httpx.Timeout(timeout), headers=headers, follow_redirects=True) as client:
        response = await client.get(url)
        response.raise_for_status()

    # Fix: pass raw bytes instead of response.text so feedparser applies its
    # own encoding detection (XML declaration, BOM). response.text relies on
    # the HTTP charset header / httpx's guess, which mis-decodes feeds whose
    # header disagrees with the XML declaration.
    parsed = feedparser.parse(response.content)
    feed_title = parsed.feed.get("title", "")
    entries: list[FeedEntry] = []

    for entry in parsed.entries[:count]:
        # Stable dedup key: prefer the feed-supplied id, then link, then title.
        guid = entry.get("id") or entry.get("link") or entry.get("title", "")

        published = None
        # published_parsed is a struct_time when feedparser parsed the date;
        # guard against malformed tuples all the same.
        if getattr(entry, "published_parsed", None):
            try:
                published = datetime(*entry.published_parsed[:6]).isoformat()
            except (ValueError, TypeError):
                pass

        entries.append(FeedEntry(
            title=entry.get("title", ""),
            url=entry.get("link", ""),
            summary=_clean_summary(entry.get("summary", "")),
            published=published,
            author=entry.get("author", ""),
            feed_url=url,
            feed_title=feed_title,
            guid=guid,
        ))

    return entries
93
+
94
+
95
async def collect(
    store: FeedStore,
    *,
    category: str = "",
    concurrency: int = DEFAULT_CONCURRENCY,
    timeout: float = DEFAULT_TIMEOUT,
    count_per_feed: int = 50,
) -> CollectResult:
    """Collect articles from all subscribed feeds.

    Args:
        store: FeedStore instance.
        category: Only collect from this category (empty = all).
        concurrency: Max concurrent HTTP requests.
        timeout: Per-feed HTTP timeout.
        count_per_feed: Max entries per feed.
    """
    started = time.monotonic()

    subscriptions = store.list_subscriptions()
    if category:
        subscriptions = [sub for sub in subscriptions if sub.category == category]

    result = CollectResult(feeds_total=len(subscriptions))
    gate = asyncio.Semaphore(concurrency)

    async def _fetch_one(feed_url: str) -> tuple[str, list[FeedEntry] | None, str]:
        # Bound concurrency; swallow per-feed failures into the tuple so one
        # bad feed cannot abort the whole gather().
        async with gate:
            try:
                entries = await fetch_feed(feed_url, count=count_per_feed, timeout=timeout)
            except Exception as exc:
                return feed_url, None, str(exc)
            return feed_url, entries, ""

    outcomes = await asyncio.gather(*(_fetch_one(sub.feed_url) for sub in subscriptions))

    for feed_url, entries, error in outcomes:
        if entries is None:
            result.feeds_error += 1
            result.errors.append({"feed_url": feed_url, "error": error})
            store.update_fetch_status(feed_url, success=False, error=error)
            continue

        result.feeds_ok += 1
        articles = [
            Article(
                guid=e.guid,
                feed_url=e.feed_url,
                title=e.title,
                url=e.url,
                summary=e.summary,
                author=e.author,
                published=e.published,
            )
            for e in entries
        ]
        # save_articles returns how many were actually new (deduped by guid
        # in the store, presumably — behavior defined by FeedStore).
        result.new_articles += store.save_articles(articles)
        store.update_fetch_status(feed_url, success=True)

    result.duration_ms = (time.monotonic() - started) * 1000
    return result
157
+
158
+
159
+ def _clean_summary(text: str, max_len: int = 500) -> str:
160
+ """Strip HTML tags and truncate summary."""
161
+ import re
162
+
163
+ clean = re.sub(r"<[^>]+>", "", text)
164
+ clean = re.sub(r"\s+", " ", clean).strip()
165
+ if len(clean) > max_len:
166
+ clean = clean[:max_len] + "..."
167
+ return clean