newscli-tool 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
newscli/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """
2
+ newscli/__init__.py — news-cli package entry
3
+ """
4
+
5
+ __version__ = "1.0.0"
6
+ __all__ = ["__version__"]
newscli/aggregator.py ADDED
@@ -0,0 +1,263 @@
1
+ """
2
+ aggregator.py — News Aggregator
3
+
4
+ 统一调度多个 source,支持:
5
+ - 多 source 并行拉取(ThreadPoolExecutor)
6
+ - 每个 source 的 module 子模块指定
7
+ - source-specific 额外参数(category, language, node, subreddit 等)
8
+ - 统一 JSON 输出(完整 schema)
9
+ """
10
+
11
+ import concurrent.futures
12
+ from dataclasses import asdict
13
+ from typing import Optional
14
+ from newscli.sources import REGISTRY, NewsItem, SourceError, rss as rss_module
15
+
16
+
17
+ class NewsAggregator:
18
+ """多 source 统一聚合器"""
19
+
20
+ def __init__(
21
+ self,
22
+ sources: list[str] | None = None,
23
+ limit_per_source: int = 10,
24
+ ):
25
+ """
26
+ Args:
27
+ sources : source 名称列表(如 ["hackernews", "github"])
28
+ None = 所有注册 source(不含 rss)
29
+ limit_per_source: 每个 source 最大拉取条数
30
+ """
31
+ self.sources = sources or list(REGISTRY.keys())
32
+ self.limit_per_source = limit_per_source
33
+
34
+ def fetch(
35
+ self,
36
+ source_filter: str | None = None,
37
+ limit: int | None = None,
38
+ keyword: str | None = None,
39
+ params: dict | None = None,
40
+ enrich: bool = True, # 默认对 summary=null 的 item 拉取原文摘要
41
+ ) -> dict:
42
+ """
43
+ 并行拉取所有 source,返回统一格式。
44
+
45
+ Args:
46
+ source_filter : 逗号分隔的 source:module 对列表
47
+ 如 "hackernews:topstories,github:trending,v2ex:latest"
48
+ limit : 全局返回上限(None = 所有)
49
+ keyword : 关键词过滤(所有 source 生效)
50
+ params : source-specific 参数 dict
51
+ 格式:{"<source>": {"module": "...", "category": "..."}}
52
+
53
+ Returns:
54
+ {"ok": bool, "schema": str, "sources": {}, "items": [], "total": int, "errors": []}
55
+ items 每项 = NewsItem.to_dict(),即所有字段(含 None)
56
+ """
57
+ parsed = self._parse_filter(source_filter)
58
+ params = params or {}
59
+
60
+ # results key = source:module(如 "hackernews:topstories")
61
+ results: dict[str, list[dict]] = {}
62
+ errors: list[str] = []
63
+
64
+ def _fetch_one(name: str, module: str | None, extra: dict):
65
+ """执行单次 fetch,返回 (key, items_list, errors_list)"""
66
+ # 唯一 key:不同 module 的同一 source 不会互相覆盖
67
+ key = f"{name}:{module}" if module else name
68
+
69
+ # RSS 特殊处理(不注册到 REGISTRY)
70
+ if name == "rss":
71
+ src = rss_module.RSSSource(
72
+ source_key=extra.get("source_key"),
73
+ feed_url=extra.get("url"),
74
+ )
75
+ else:
76
+ src_cls = REGISTRY.get(name)
77
+ if not src_cls:
78
+ return key, [], [f"Unknown source: {name}"]
79
+ src = src_cls()
80
+
81
+ # global keyword filter applied at source.fetch() call
82
+ fetch_kwargs = {
83
+ "module": module,
84
+ "limit": self.limit_per_source,
85
+ "keyword": keyword, # may be None = no filtering
86
+ }
87
+ # per-source params override (from merged extra)
88
+ for k in ("category", "language", "node", "subreddit",
89
+ "start_time", "end_time", "url"):
90
+ if k in extra:
91
+ fetch_kwargs[k] = extra[k]
92
+ # per-source keyword (params level) takes precedence over global
93
+ if "keyword" in extra:
94
+ fetch_kwargs["keyword"] = extra["keyword"]
95
+
96
+ try:
97
+ items = src.fetch(**fetch_kwargs)
98
+ return key, [item.to_dict() for item in items], []
99
+ except SourceError as e:
100
+ return key, [], [str(e)]
101
+ except Exception as e:
102
+ return key, [], [f"{name} unexpected error: {e}"]
103
+
104
+ # 构建 work items
105
+ work: list[tuple] = []
106
+ if parsed:
107
+ for name, module, extra in parsed:
108
+ # CLI params 覆盖 hard-coded extra
109
+ sp = params.get(name, {})
110
+ merged_extra = {**extra, **sp}
111
+ work.append((name, module, merged_extra))
112
+ else:
113
+ for name in self.sources:
114
+ sp = params.get(name, {})
115
+ work.append((name, None, sp))
116
+
117
+ # 并行执行
118
+ with concurrent.futures.ThreadPoolExecutor(max_workers=min(len(work), 8)) as ex:
119
+ futures = {ex.submit(_fetch_one, *w): w for w in work}
120
+ for fut in concurrent.futures.as_completed(futures):
121
+ key, items, errs = fut.result()
122
+ if items:
123
+ results[key] = items
124
+ errors.extend(errs)
125
+
126
+ # 合并所有 items(按 source:module 分别存储,输出时全部扁平化)
127
+ all_items: list[dict] = []
128
+ for key, items in results.items():
129
+ all_items.extend(items)
130
+
131
+ # 跨 source 去重:标题相似度 ≥ 70%(仅多 source 时触发)
132
+ if len(results) > 1:
133
+ all_items = self._deduplicate(all_items)
134
+
135
+ # 可选:enrich — 对 summary=null 的 item 并发拉取原文 description
136
+ if enrich:
137
+ from .enrich import enrich_items
138
+ all_items = enrich_items(all_items)
139
+
140
+ if limit is not None:
141
+ all_items = all_items[:limit]
142
+
143
+ return {
144
+ "ok": True,
145
+ "schema": "NewsItem v1.0",
146
+ "sources": {k: len(v) for k, v in results.items()},
147
+ "items": all_items,
148
+ "total": len(all_items),
149
+ "errors": errors,
150
+ }
151
+
152
+ @staticmethod
153
+ def _deduplicate(items: list[dict]) -> list[dict]:
154
+ """
155
+ 跨 source 去重。相似度阈值 70%(标题 normalized 后比对)。
156
+ 保留第一条出现的item,移除后续相似项。
157
+ """
158
+ from urllib.parse import urlparse
159
+ def normalize_title(t: str) -> str:
160
+ """小写 + 去除标点 + strip()"""
161
+ import re
162
+ t = t.lower().strip()
163
+ t = re.sub(r'[^\w\s]', ' ', t)
164
+ t = re.sub(r'\s+', ' ', t).strip()
165
+ return t
166
+
167
+ def similarity(a: str, b: str) -> float:
168
+ """简单词集合 Jaccard 相似度"""
169
+ sa = set(a.split())
170
+ sb = set(b.split())
171
+ if not sa or not sb:
172
+ return 0.0
173
+ inter = len(sa & sb)
174
+ union = len(sa | sb)
175
+ return inter / union if union > 0 else 0.0
176
+
177
+ def item_key(item: dict) -> str:
178
+ domain = ""
179
+ if item.get("url"):
180
+ try:
181
+ domain = urlparse(item["url"]).netloc
182
+ except Exception:
183
+ pass
184
+ return f"{normalize_title(item['title'])}|{domain}"
185
+
186
+ seen: list[dict] = []
187
+ for item in items:
188
+ norm = normalize_title(item.get("title", ""))
189
+ if not norm:
190
+ seen.append(item)
191
+ continue
192
+ dup_idx = None
193
+ for i, s in enumerate(seen):
194
+ s_norm = normalize_title(s.get("title", ""))
195
+ # Different domain → different article, skip
196
+ s_domain = urlparse(s.get("url", "")).netloc or ""
197
+ item_domain = urlparse(item.get("url", "")).netloc or ""
198
+ if s_domain and item_domain and s_domain != item_domain:
199
+ continue
200
+ if similarity(norm, s_norm) >= 0.70:
201
+ dup_idx = i
202
+ break
203
+ if dup_idx is not None:
204
+ # 保留信息更丰富的那个(非 null 字段数量多的)
205
+ existing = seen[dup_idx]
206
+ existing_nulls = sum(1 for k, v in existing.items() if v is None and k != 'extra')
207
+ item_nulls = sum(1 for k, v in item.items() if v is None and k != 'extra')
208
+ if item_nulls < existing_nulls:
209
+ seen[dup_idx] = item
210
+ else:
211
+ seen.append(item)
212
+ return seen
213
+
214
+ @staticmethod
215
+ def _parse_filter(filter_str: str | None) -> list[tuple]:
216
+ """
217
+ 解析 'source:module,source2:module2' → [(name, module, extra_dict), ...]
218
+
219
+ 支持格式:
220
+ hackernews:topstories → ("hackernews", "topstories", {})
221
+ v2ex:node:python → ("v2ex", "node", {"node": "python"})
222
+ rss:bensbites → ("rss", "bensbites", {"source_key": "bensbites"})
223
+ github:trending:language=Python → ("github", "trending", {"language": "Python"})
224
+ zaker:category:category=technology → ("zaker", "category", {"category": "technology"})
225
+ reddit:r/technology → ("reddit", None, {"subreddit": "technology"})
226
+ """
227
+ if not filter_str:
228
+ return []
229
+ items = []
230
+ for part in filter_str.split("&"):
231
+ part = part.strip()
232
+ if not part:
233
+ continue
234
+ name, rest = part.split(":", 1) if ":" in part else (part, "")
235
+ name = name.strip()
236
+
237
+ extra = {}
238
+ tokens = rest.split(":") if rest else []
239
+ module = None
240
+ for token in tokens:
241
+ if "=" in token:
242
+ k, v = token.split("=", 1)
243
+ extra[k.strip()] = v.strip()
244
+ elif not module:
245
+ module = token
246
+
247
+ # Reddit r/<subreddit> shorthand
248
+ if name == "reddit" and module and module.startswith("r/"):
249
+ extra["subreddit"] = module[2:]
250
+ module = None
251
+
252
+ # RSS source_key shorthand
253
+ if name == "rss" and module and "=" not in module:
254
+ extra["source_key"] = module
255
+
256
+ items.append((name, module, extra))
257
+ return items
258
+
259
+
260
def fetch_all(keyword: str | None = None, limit: int | None = None) -> dict:
    """Convenience entry point: fetch every default source with one call.

    Builds a ``NewsAggregator`` with its default settings and forwards the
    optional keyword filter and global item cap to ``fetch``.
    """
    return NewsAggregator().fetch(limit=limit, keyword=keyword)
newscli/cli.py ADDED
@@ -0,0 +1,232 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ cli.py — News CLI 入口
4
+
5
+ 两种接口共存:
6
+ 1. flag 模式(--source / --limit / --json)— 机器友好
7
+ 2. 自然语言模式(get hackernews topstories 5 json)— 人类友好
8
+
9
+ 用法(flag):
10
+ python cli.py --source hackernews:topstories --limit 5
11
+ python cli.py --source all --json
12
+
13
+ 用法(自然语言):
14
+ python cli.py get hackernews topstories
15
+ python cli.py get hackernews topstories 5 json
16
+ python cli.py get hackernews topstories 5 and github trending 10
17
+ python cli.py get all 10 json noenrich
18
+ python cli.py list
19
+ python cli.py list hackernews
20
+ """
21
+
22
+ import argparse
23
+ import json
24
+ import sys
25
+ from .aggregator import NewsAggregator
26
+ from .sources import REGISTRY, rss as rss_module
27
+ from .parser import parseNL, ParseError, list_sources, list_source_modules
28
+
29
+
30
+ # ─── 输出格式器 ─────────────────────────────────────────────────
31
+
32
+ def _render_text(items: list[dict]) -> str:
33
+ """人类可读文本输出 — None 字段不显示"""
34
+ if not items:
35
+ return "📰 无内容"
36
+ lines = [f"📰 共 {len(items)} 条"]
37
+ lines.append("─" * 50)
38
+ for i, item in enumerate(items, 1):
39
+ # source + module badge
40
+ module = item.get("module") or ""
41
+ source = item.get("source") or "?"
42
+ badge = f"[{source}]" if not module else f"[{source}/{module}]"
43
+ heat = item.get("heat") or ""
44
+ time = item.get("time") or ""
45
+ meta = " | ".join(x for x in [heat, time] if x)
46
+ lines.append(f"\n[{i}] {badge} {meta}")
47
+ lines.append(f" {item.get('title', '')}")
48
+ url = item.get("url")
49
+ if url:
50
+ lines.append(f" 🔗 {url}")
51
+ summary = item.get("summary")
52
+ if summary:
53
+ lines.append(f" 📝 {summary[:150]}{'...' if len(summary) > 150 else ''}")
54
+ author = item.get("author")
55
+ if author:
56
+ lines.append(f" 👤 {author}")
57
+ return "\n".join(lines)
58
+
59
+
60
+ def _json_output(result: dict) -> str:
61
+ """JSON 输出 — 完整 schema,包含 None"""
62
+ return json.dumps(result, indent=2, ensure_ascii=False)
63
+
64
+
65
+ # ─── CLI 参数解析 ───────────────────────────────────────────────
66
+
67
+ def _build_parser() -> argparse.ArgumentParser:
68
+ p = argparse.ArgumentParser(
69
+ prog="news",
70
+ description="模块化新闻聚合 CLI — 支持 HN/GitHub/HuggingFace/ZAKER/V2EX/Reddit/RSS",
71
+ )
72
+ p.add_argument(
73
+ "--source", "-s", default="all",
74
+ help="source:module 格式,逗号分隔多源\n"
75
+ " hackernews:topstories|newest|ask|show|jobs\n"
76
+ " github:trending\n"
77
+ " huggingface:daily|trending\n"
78
+ " zaker:hot|category|search\n"
79
+ " v2ex:hot|latest|node:<name>\n"
80
+ " reddit:popular|hot|r/<subreddit>\n"
81
+ " rss:<preset>|custom\n"
82
+ " all = 所有注册 source(不含 rss)"
83
+ )
84
+ p.add_argument("--limit", "-n", type=int, default=10, help="每 source 最大条数")
85
+ p.add_argument(
86
+ "--keyword", "-k", default=None,
87
+ help="关键词过滤(逗号分隔多词,AND 匹配)"
88
+ )
89
+ p.add_argument(
90
+ "--params", "-p", default=None,
91
+ help="source-specific 参数,JSON 格式\n"
92
+ ' 如: {"zaker": {"category": "technology"}, "github": {"language": "Python"}}'
93
+ )
94
+ p.add_argument("--json", action="store_true", help="JSON 输出(供 agent 解析)")
95
+ p.add_argument(
96
+ "--enrich", "-e", action="store_true", default=True,
97
+ help="对 summary=null 的 item 并发拉取原文 og:description(curl,8s 超时,默认开启)"
98
+ )
99
+ p.add_argument(
100
+ "--no-enrich", dest="enrich", action="store_false",
101
+ help="禁用 --enrich,不拉取原文摘要"
102
+ )
103
+ p.add_argument(
104
+ "--modules", action="store_true",
105
+ help="列出所有 source 的可用 module"
106
+ )
107
+ return p
108
+
109
+
110
+ def _parse_params(params_str: str | None) -> dict:
111
+ """解析 --params JSON 字符串"""
112
+ if not params_str:
113
+ return {}
114
+ try:
115
+ return json.loads(params_str)
116
+ except Exception:
117
+ return {}
118
+
119
+
120
+ # ─── 主入口 ─────────────────────────────────────────────────────
121
+
122
def main() -> None:
    """CLI entry point: dispatch to natural-language mode or flag mode.

    When the first command-line argument does not start with ``-`` the whole
    argument list is joined into one string and handed to ``_run_nl``; the
    process then exits with status 0 or 1. Otherwise the argparse flags are
    parsed and a single aggregator fetch is run and printed.
    """
    argv = sys.argv

    # Natural-language mode: the first argument is not a flag.
    if len(argv) > 1 and not argv[1].startswith("-"):
        command_line = " ".join(argv[1:])
        try:
            outcome = _run_nl(command_line)
            sys.exit(0 if outcome["ok"] else 1)
        except ParseError as e:
            print(f"❗ 语法错误:{e}")
            print(f"提示:get hackernews topstories 5 / list hackernews")
            sys.exit(1)
        except Exception as e:
            print(f"❗ 执行出错:{e}")
            sys.exit(1)

    # Flag mode.
    args = _build_parser().parse_args()

    if args.modules:
        _print_modules()
        return

    # "all" means "no filter": the aggregator then uses its default sources.
    source_filter = None if args.source == "all" else args.source

    params = _parse_params(args.params)
    aggregator = NewsAggregator(limit_per_source=args.limit)
    result = aggregator.fetch(
        source_filter=source_filter,
        limit=None,
        keyword=args.keyword,
        params=params,
        enrich=args.enrich,
    )

    if args.json:
        print(_json_output(result))
        return

    print(_render_text(result["items"]))
    failures = result["errors"]
    if failures:
        print(f"\n⚠️ {len(failures)} 个 source 出错:")
        for failure in failures:
            print(f" - {failure}")
168
+
169
+
170
def _run_nl(raw: str) -> dict:
    """Execute one natural-language command and print its output.

    Args:
        raw: The command text, as passed to ``parseNL``.

    Returns:
        For a ``list`` command, ``{"ok": True, "items": []}``; otherwise the
        dict returned by ``NewsAggregator.fetch``.
    """
    parsed = parseNL(raw)

    # list mode: print sources or modules, nothing is fetched.
    if parsed.command == "list":
        target = parsed.list_target
        if target == "sources":
            print(list_sources())
        elif target == "modules":
            # Every module of every registered source.
            for source_name in REGISTRY.keys():
                print(list_source_modules(source_name))
                print()
        else:
            print(list_source_modules(target))
        return {"ok": True, "items": []}

    # fetch mode.
    spec = parsed.fetch
    # "all" means "no filter": the aggregator then uses its default sources.
    source_filter = None if spec["source_filter"] == "all" else spec["source_filter"]

    aggregator = NewsAggregator(limit_per_source=spec["limit"])
    agg_result = aggregator.fetch(
        source_filter=source_filter,
        limit=None,
        keyword=spec["keyword"],
        params=spec["params"],
        enrich=spec["enrich"],
    )

    if spec["output"] == "json":
        print(_json_output(agg_result))
        return agg_result

    print(_render_text(agg_result["items"]))
    failures = agg_result["errors"]
    if failures:
        print(f"\n⚠️ {len(failures)} 个 source 出错:")
        for failure in failures:
            print(f" - {failure}")

    return agg_result
209
+
210
+
211
def _print_modules() -> None:
    """Print a table of RSS presets followed by every registered source.

    RSS presets come from ``rss_module.PRESET_SOURCES``; the other rows show
    each source class's ``display_name`` and ``modules`` attributes.
    """
    catalog = dict(REGISTRY)
    catalog["rss"] = rss_module.RSSSource

    print(f"{'Source':<15} {'Display Name':<25} {'Modules'}")
    print("─" * 80)

    # RSS presets, one row per preset key.
    for preset_key, preset in sorted(rss_module.PRESET_SOURCES.items()):
        print(f"{'rss:'+preset_key:<15} {preset['name']:<25} (RSS preset)")

    print()
    for source_name, source_cls in sorted(catalog.items()):
        # The rss entry is already covered by the preset rows above.
        if source_name != "rss":
            module_list = ", ".join(source_cls.modules) if source_cls.modules else "(all)"
            print(f"{source_name:<15} {source_cls.display_name:<25} {module_list}")
229
+
230
+
231
+ if __name__ == "__main__":
232
+ main()
newscli/enrich.py ADDED
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ enrich.py — URL enrichment: fetch og:description / meta description via curl
4
+
5
+ 对 summary=null 的 item 并发拉取原文 description,填充 summary 字段。
6
+ 使用 curl(subprocess)绕过 Python requests 对某些站点的 TLS 超时问题。
7
+ """
8
+
9
+ import concurrent.futures
10
+ import re
11
+ import subprocess
12
+ from dataclasses import dataclass
13
+
14
+
15
+ @dataclass
16
+ class EnrichResult:
17
+ url: str
18
+ description: str | None # None = failed/not found
19
+ error: str | None
20
+
21
+
22
def fetch_description(url: str, timeout: int = 8) -> EnrichResult:
    """Fetch a page with curl and extract its og:description / description.

    The URL comes from third-party feeds, so only ``http``/``https`` URLs are
    fetched and curl is told to refuse any other protocol, including after
    redirects. This function does not raise: every failure is reported
    through the ``error`` field of the result.

    Args:
        url: Page URL.
        timeout: curl ``--max-time`` in seconds; the subprocess itself is
            given one extra second before it is killed.

    Returns:
        ``EnrichResult`` with ``description`` set on success, or
        ``description=None`` plus an ``error`` text on failure.
    """
    target = url.strip() if isinstance(url, str) else ""
    # Reject file://, ftp:// and option-like strings ("-o ...") before they
    # can reach the curl command line.
    if not target.lower().startswith(("http://", "https://")):
        return EnrichResult(url=url, description=None, error="unsupported URL scheme")

    try:
        result = subprocess.run(
            [
                "curl", "-s", "-L", "--max-time", str(timeout),
                # Redirects must not leave http/https either.
                "--proto", "=http,https", "--proto-redir", "=http,https",
                "-A", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "-H", "Accept: text/html,application/xhtml+xml,*/*",
                "-H", "Accept-Language: en-US,en;q=0.9",
                target,
            ],
            capture_output=True,
            text=True,
            # Pages are not always valid UTF-8; replace bad bytes instead of
            # failing the whole lookup on a decode error.
            encoding="utf-8",
            errors="replace",
            timeout=timeout + 1,  # process-level timeout slightly higher
        )
        description = _extract_description(result.stdout)
        error = None
        if description is None and result.returncode != 0:
            # Surface transport failures instead of reporting "no error".
            error = f"curl exited with code {result.returncode}"
        return EnrichResult(url=url, description=description, error=error)
    except subprocess.TimeoutExpired:
        return EnrichResult(url=url, description=None, error="timeout")
    except Exception as e:
        # Includes a missing curl binary (FileNotFoundError).
        return EnrichResult(url=url, description=None, error=str(e))
48
+
49
+
50
+ def _extract_description(html: str) -> str | None:
51
+ """从 HTML 中提取 og:description 或 meta description"""
52
+ patterns = [
53
+ # og:description
54
+ r'<meta[^>]+property=["\']og:description["\'][^>]+content=["\'](.*?)["\']',
55
+ r'<meta[^>]+content=["\'](.*?)["\'][^>]+property=["\']og:description["\']',
56
+ # meta description
57
+ r'<meta[^>]+name=["\']description["\'][^>]+content=["\'](.*?)["\']',
58
+ r'<meta[^>]+content=["\'](.*?)["\'][^>]+name=["\']description["\']',
59
+ ]
60
+ for pattern in patterns:
61
+ m = re.search(pattern, html, re.IGNORECASE)
62
+ if m:
63
+ text = m.group(1).strip()
64
+ # Clean HTML entities
65
+ text = text.replace("&amp;", "&").replace("&quot;", '"').replace("&#x27;", "'").replace("&lt;", "<").replace("&gt;", ">").replace("&#x2F;", "/").replace("&nbsp;", " ")
66
+ if text:
67
+ return text[:500]
68
+ return None
69
+
70
+
71
def enrich_items(items: list[dict], max_workers: int = 8) -> list[dict]:
    """Fill in missing summaries by fetching each item's page description.

    Only items whose ``summary`` is None and that have a truthy ``url`` are
    looked up, concurrently, through ``fetch_description``.

    Args:
        items: Item dicts.
        max_workers: Upper bound on the number of worker threads.

    Returns:
        The input list itself when nothing needs fetching; otherwise a new
        list in the same order where each item that got a description is a
        copy with ``summary`` set and every other item is the original dict.
    """
    # (position, url) for every item that needs a lookup.
    pending = [
        (pos, entry["url"])
        for pos, entry in enumerate(items)
        if entry.get("summary") is None and entry.get("url")
    ]
    if not pending:
        return items

    fetched: dict[int, str | None] = {}  # position -> description
    pool_size = min(max_workers, len(pending))
    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as pool:
        position_of = {
            pool.submit(fetch_description, link): pos for pos, link in pending
        }
        for finished in concurrent.futures.as_completed(position_of):
            pos = position_of[finished]
            try:
                fetched[pos] = finished.result().description
            except Exception:
                fetched[pos] = None

    # Merge back without disturbing the original order.
    return [
        {**entry, "summary": fetched[pos]} if fetched.get(pos) else entry
        for pos, entry in enumerate(items)
    ]