newscli-tool 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- newscli/__init__.py +6 -0
- newscli/aggregator.py +263 -0
- newscli/cli.py +232 -0
- newscli/enrich.py +112 -0
- newscli/parser.py +386 -0
- newscli/sources/__init__.py +29 -0
- newscli/sources/base.py +181 -0
- newscli/sources/devto.py +87 -0
- newscli/sources/github.py +77 -0
- newscli/sources/hackernews.py +94 -0
- newscli/sources/huggingface.py +95 -0
- newscli/sources/lobsters.py +77 -0
- newscli/sources/reddit.py +93 -0
- newscli/sources/rss.py +155 -0
- newscli/sources/v2ex.py +121 -0
- newscli/sources/zaker.py +140 -0
- newscli_tool-1.0.0.dist-info/METADATA +184 -0
- newscli_tool-1.0.0.dist-info/RECORD +21 -0
- newscli_tool-1.0.0.dist-info/WHEEL +5 -0
- newscli_tool-1.0.0.dist-info/entry_points.txt +2 -0
- newscli_tool-1.0.0.dist-info/top_level.txt +1 -0
newscli/__init__.py
ADDED
newscli/aggregator.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"""
|
|
2
|
+
aggregator.py — News Aggregator
|
|
3
|
+
|
|
4
|
+
统一调度多个 source,支持:
|
|
5
|
+
- 多 source 并行拉取(ThreadPoolExecutor)
|
|
6
|
+
- 每个 source 的 module 子模块指定
|
|
7
|
+
- source-specific 额外参数(category, language, node, subreddit 等)
|
|
8
|
+
- 统一 JSON 输出(完整 schema)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import concurrent.futures
|
|
12
|
+
from dataclasses import asdict
|
|
13
|
+
from typing import Optional
|
|
14
|
+
from newscli.sources import REGISTRY, NewsItem, SourceError, rss as rss_module
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class NewsAggregator:
|
|
18
|
+
"""多 source 统一聚合器"""
|
|
19
|
+
|
|
20
|
+
def __init__(
|
|
21
|
+
self,
|
|
22
|
+
sources: list[str] | None = None,
|
|
23
|
+
limit_per_source: int = 10,
|
|
24
|
+
):
|
|
25
|
+
"""
|
|
26
|
+
Args:
|
|
27
|
+
sources : source 名称列表(如 ["hackernews", "github"])
|
|
28
|
+
None = 所有注册 source(不含 rss)
|
|
29
|
+
limit_per_source: 每个 source 最大拉取条数
|
|
30
|
+
"""
|
|
31
|
+
self.sources = sources or list(REGISTRY.keys())
|
|
32
|
+
self.limit_per_source = limit_per_source
|
|
33
|
+
|
|
34
|
+
def fetch(
|
|
35
|
+
self,
|
|
36
|
+
source_filter: str | None = None,
|
|
37
|
+
limit: int | None = None,
|
|
38
|
+
keyword: str | None = None,
|
|
39
|
+
params: dict | None = None,
|
|
40
|
+
enrich: bool = True, # 默认对 summary=null 的 item 拉取原文摘要
|
|
41
|
+
) -> dict:
|
|
42
|
+
"""
|
|
43
|
+
并行拉取所有 source,返回统一格式。
|
|
44
|
+
|
|
45
|
+
Args:
|
|
46
|
+
source_filter : 逗号分隔的 source:module 对列表
|
|
47
|
+
如 "hackernews:topstories,github:trending,v2ex:latest"
|
|
48
|
+
limit : 全局返回上限(None = 所有)
|
|
49
|
+
keyword : 关键词过滤(所有 source 生效)
|
|
50
|
+
params : source-specific 参数 dict
|
|
51
|
+
格式:{"<source>": {"module": "...", "category": "..."}}
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
{"ok": bool, "schema": str, "sources": {}, "items": [], "total": int, "errors": []}
|
|
55
|
+
items 每项 = NewsItem.to_dict(),即所有字段(含 None)
|
|
56
|
+
"""
|
|
57
|
+
parsed = self._parse_filter(source_filter)
|
|
58
|
+
params = params or {}
|
|
59
|
+
|
|
60
|
+
# results key = source:module(如 "hackernews:topstories")
|
|
61
|
+
results: dict[str, list[dict]] = {}
|
|
62
|
+
errors: list[str] = []
|
|
63
|
+
|
|
64
|
+
def _fetch_one(name: str, module: str | None, extra: dict):
|
|
65
|
+
"""执行单次 fetch,返回 (key, items_list, errors_list)"""
|
|
66
|
+
# 唯一 key:不同 module 的同一 source 不会互相覆盖
|
|
67
|
+
key = f"{name}:{module}" if module else name
|
|
68
|
+
|
|
69
|
+
# RSS 特殊处理(不注册到 REGISTRY)
|
|
70
|
+
if name == "rss":
|
|
71
|
+
src = rss_module.RSSSource(
|
|
72
|
+
source_key=extra.get("source_key"),
|
|
73
|
+
feed_url=extra.get("url"),
|
|
74
|
+
)
|
|
75
|
+
else:
|
|
76
|
+
src_cls = REGISTRY.get(name)
|
|
77
|
+
if not src_cls:
|
|
78
|
+
return key, [], [f"Unknown source: {name}"]
|
|
79
|
+
src = src_cls()
|
|
80
|
+
|
|
81
|
+
# global keyword filter applied at source.fetch() call
|
|
82
|
+
fetch_kwargs = {
|
|
83
|
+
"module": module,
|
|
84
|
+
"limit": self.limit_per_source,
|
|
85
|
+
"keyword": keyword, # may be None = no filtering
|
|
86
|
+
}
|
|
87
|
+
# per-source params override (from merged extra)
|
|
88
|
+
for k in ("category", "language", "node", "subreddit",
|
|
89
|
+
"start_time", "end_time", "url"):
|
|
90
|
+
if k in extra:
|
|
91
|
+
fetch_kwargs[k] = extra[k]
|
|
92
|
+
# per-source keyword (params level) takes precedence over global
|
|
93
|
+
if "keyword" in extra:
|
|
94
|
+
fetch_kwargs["keyword"] = extra["keyword"]
|
|
95
|
+
|
|
96
|
+
try:
|
|
97
|
+
items = src.fetch(**fetch_kwargs)
|
|
98
|
+
return key, [item.to_dict() for item in items], []
|
|
99
|
+
except SourceError as e:
|
|
100
|
+
return key, [], [str(e)]
|
|
101
|
+
except Exception as e:
|
|
102
|
+
return key, [], [f"{name} unexpected error: {e}"]
|
|
103
|
+
|
|
104
|
+
# 构建 work items
|
|
105
|
+
work: list[tuple] = []
|
|
106
|
+
if parsed:
|
|
107
|
+
for name, module, extra in parsed:
|
|
108
|
+
# CLI params 覆盖 hard-coded extra
|
|
109
|
+
sp = params.get(name, {})
|
|
110
|
+
merged_extra = {**extra, **sp}
|
|
111
|
+
work.append((name, module, merged_extra))
|
|
112
|
+
else:
|
|
113
|
+
for name in self.sources:
|
|
114
|
+
sp = params.get(name, {})
|
|
115
|
+
work.append((name, None, sp))
|
|
116
|
+
|
|
117
|
+
# 并行执行
|
|
118
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=min(len(work), 8)) as ex:
|
|
119
|
+
futures = {ex.submit(_fetch_one, *w): w for w in work}
|
|
120
|
+
for fut in concurrent.futures.as_completed(futures):
|
|
121
|
+
key, items, errs = fut.result()
|
|
122
|
+
if items:
|
|
123
|
+
results[key] = items
|
|
124
|
+
errors.extend(errs)
|
|
125
|
+
|
|
126
|
+
# 合并所有 items(按 source:module 分别存储,输出时全部扁平化)
|
|
127
|
+
all_items: list[dict] = []
|
|
128
|
+
for key, items in results.items():
|
|
129
|
+
all_items.extend(items)
|
|
130
|
+
|
|
131
|
+
# 跨 source 去重:标题相似度 ≥ 70%(仅多 source 时触发)
|
|
132
|
+
if len(results) > 1:
|
|
133
|
+
all_items = self._deduplicate(all_items)
|
|
134
|
+
|
|
135
|
+
# 可选:enrich — 对 summary=null 的 item 并发拉取原文 description
|
|
136
|
+
if enrich:
|
|
137
|
+
from .enrich import enrich_items
|
|
138
|
+
all_items = enrich_items(all_items)
|
|
139
|
+
|
|
140
|
+
if limit is not None:
|
|
141
|
+
all_items = all_items[:limit]
|
|
142
|
+
|
|
143
|
+
return {
|
|
144
|
+
"ok": True,
|
|
145
|
+
"schema": "NewsItem v1.0",
|
|
146
|
+
"sources": {k: len(v) for k, v in results.items()},
|
|
147
|
+
"items": all_items,
|
|
148
|
+
"total": len(all_items),
|
|
149
|
+
"errors": errors,
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
@staticmethod
|
|
153
|
+
def _deduplicate(items: list[dict]) -> list[dict]:
|
|
154
|
+
"""
|
|
155
|
+
跨 source 去重。相似度阈值 70%(标题 normalized 后比对)。
|
|
156
|
+
保留第一条出现的item,移除后续相似项。
|
|
157
|
+
"""
|
|
158
|
+
from urllib.parse import urlparse
|
|
159
|
+
def normalize_title(t: str) -> str:
|
|
160
|
+
"""小写 + 去除标点 + strip()"""
|
|
161
|
+
import re
|
|
162
|
+
t = t.lower().strip()
|
|
163
|
+
t = re.sub(r'[^\w\s]', ' ', t)
|
|
164
|
+
t = re.sub(r'\s+', ' ', t).strip()
|
|
165
|
+
return t
|
|
166
|
+
|
|
167
|
+
def similarity(a: str, b: str) -> float:
|
|
168
|
+
"""简单词集合 Jaccard 相似度"""
|
|
169
|
+
sa = set(a.split())
|
|
170
|
+
sb = set(b.split())
|
|
171
|
+
if not sa or not sb:
|
|
172
|
+
return 0.0
|
|
173
|
+
inter = len(sa & sb)
|
|
174
|
+
union = len(sa | sb)
|
|
175
|
+
return inter / union if union > 0 else 0.0
|
|
176
|
+
|
|
177
|
+
def item_key(item: dict) -> str:
|
|
178
|
+
domain = ""
|
|
179
|
+
if item.get("url"):
|
|
180
|
+
try:
|
|
181
|
+
domain = urlparse(item["url"]).netloc
|
|
182
|
+
except Exception:
|
|
183
|
+
pass
|
|
184
|
+
return f"{normalize_title(item['title'])}|{domain}"
|
|
185
|
+
|
|
186
|
+
seen: list[dict] = []
|
|
187
|
+
for item in items:
|
|
188
|
+
norm = normalize_title(item.get("title", ""))
|
|
189
|
+
if not norm:
|
|
190
|
+
seen.append(item)
|
|
191
|
+
continue
|
|
192
|
+
dup_idx = None
|
|
193
|
+
for i, s in enumerate(seen):
|
|
194
|
+
s_norm = normalize_title(s.get("title", ""))
|
|
195
|
+
# Different domain → different article, skip
|
|
196
|
+
s_domain = urlparse(s.get("url", "")).netloc or ""
|
|
197
|
+
item_domain = urlparse(item.get("url", "")).netloc or ""
|
|
198
|
+
if s_domain and item_domain and s_domain != item_domain:
|
|
199
|
+
continue
|
|
200
|
+
if similarity(norm, s_norm) >= 0.70:
|
|
201
|
+
dup_idx = i
|
|
202
|
+
break
|
|
203
|
+
if dup_idx is not None:
|
|
204
|
+
# 保留信息更丰富的那个(非 null 字段数量多的)
|
|
205
|
+
existing = seen[dup_idx]
|
|
206
|
+
existing_nulls = sum(1 for k, v in existing.items() if v is None and k != 'extra')
|
|
207
|
+
item_nulls = sum(1 for k, v in item.items() if v is None and k != 'extra')
|
|
208
|
+
if item_nulls < existing_nulls:
|
|
209
|
+
seen[dup_idx] = item
|
|
210
|
+
else:
|
|
211
|
+
seen.append(item)
|
|
212
|
+
return seen
|
|
213
|
+
|
|
214
|
+
@staticmethod
|
|
215
|
+
def _parse_filter(filter_str: str | None) -> list[tuple]:
|
|
216
|
+
"""
|
|
217
|
+
解析 'source:module,source2:module2' → [(name, module, extra_dict), ...]
|
|
218
|
+
|
|
219
|
+
支持格式:
|
|
220
|
+
hackernews:topstories → ("hackernews", "topstories", {})
|
|
221
|
+
v2ex:node:python → ("v2ex", "node", {"node": "python"})
|
|
222
|
+
rss:bensbites → ("rss", "bensbites", {"source_key": "bensbites"})
|
|
223
|
+
github:trending:language=Python → ("github", "trending", {"language": "Python"})
|
|
224
|
+
zaker:category:category=technology → ("zaker", "category", {"category": "technology"})
|
|
225
|
+
reddit:r/technology → ("reddit", None, {"subreddit": "technology"})
|
|
226
|
+
"""
|
|
227
|
+
if not filter_str:
|
|
228
|
+
return []
|
|
229
|
+
items = []
|
|
230
|
+
for part in filter_str.split("&"):
|
|
231
|
+
part = part.strip()
|
|
232
|
+
if not part:
|
|
233
|
+
continue
|
|
234
|
+
name, rest = part.split(":", 1) if ":" in part else (part, "")
|
|
235
|
+
name = name.strip()
|
|
236
|
+
|
|
237
|
+
extra = {}
|
|
238
|
+
tokens = rest.split(":") if rest else []
|
|
239
|
+
module = None
|
|
240
|
+
for token in tokens:
|
|
241
|
+
if "=" in token:
|
|
242
|
+
k, v = token.split("=", 1)
|
|
243
|
+
extra[k.strip()] = v.strip()
|
|
244
|
+
elif not module:
|
|
245
|
+
module = token
|
|
246
|
+
|
|
247
|
+
# Reddit r/<subreddit> shorthand
|
|
248
|
+
if name == "reddit" and module and module.startswith("r/"):
|
|
249
|
+
extra["subreddit"] = module[2:]
|
|
250
|
+
module = None
|
|
251
|
+
|
|
252
|
+
# RSS source_key shorthand
|
|
253
|
+
if name == "rss" and module and "=" not in module:
|
|
254
|
+
extra["source_key"] = module
|
|
255
|
+
|
|
256
|
+
items.append((name, module, extra))
|
|
257
|
+
return items
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def fetch_all(keyword: str | None = None, limit: int | None = None) -> dict:
    """Convenience entry point: fetch from a default-configured aggregator.

    Args:
        keyword: Optional keyword filter forwarded to ``NewsAggregator.fetch``.
        limit: Optional global cap forwarded to ``NewsAggregator.fetch``.

    Returns:
        The result dict produced by ``NewsAggregator.fetch``.
    """
    return NewsAggregator().fetch(keyword=keyword, limit=limit)
|
newscli/cli.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
cli.py — News CLI 入口
|
|
4
|
+
|
|
5
|
+
两种接口共存:
|
|
6
|
+
1. flag 模式(--source / --limit / --json)— 机器友好
|
|
7
|
+
2. 自然语言模式(get hackernews topstories 5 json)— 人类友好
|
|
8
|
+
|
|
9
|
+
用法(flag):
|
|
10
|
+
python cli.py --source hackernews:topstories --limit 5
|
|
11
|
+
python cli.py --source all --json
|
|
12
|
+
|
|
13
|
+
用法(自然语言):
|
|
14
|
+
python cli.py get hackernews topstories
|
|
15
|
+
python cli.py get hackernews topstories 5 json
|
|
16
|
+
python cli.py get hackernews topstories 5 and github trending 10
|
|
17
|
+
python cli.py get all 10 json noenrich
|
|
18
|
+
python cli.py list
|
|
19
|
+
python cli.py list hackernews
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import argparse
|
|
23
|
+
import json
|
|
24
|
+
import sys
|
|
25
|
+
from .aggregator import NewsAggregator
|
|
26
|
+
from .sources import REGISTRY, rss as rss_module
|
|
27
|
+
from .parser import parseNL, ParseError, list_sources, list_source_modules
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ─── 输出格式器 ─────────────────────────────────────────────────
|
|
31
|
+
|
|
32
|
+
def _render_text(items: list[dict]) -> str:
|
|
33
|
+
"""人类可读文本输出 — None 字段不显示"""
|
|
34
|
+
if not items:
|
|
35
|
+
return "📰 无内容"
|
|
36
|
+
lines = [f"📰 共 {len(items)} 条"]
|
|
37
|
+
lines.append("─" * 50)
|
|
38
|
+
for i, item in enumerate(items, 1):
|
|
39
|
+
# source + module badge
|
|
40
|
+
module = item.get("module") or ""
|
|
41
|
+
source = item.get("source") or "?"
|
|
42
|
+
badge = f"[{source}]" if not module else f"[{source}/{module}]"
|
|
43
|
+
heat = item.get("heat") or ""
|
|
44
|
+
time = item.get("time") or ""
|
|
45
|
+
meta = " | ".join(x for x in [heat, time] if x)
|
|
46
|
+
lines.append(f"\n[{i}] {badge} {meta}")
|
|
47
|
+
lines.append(f" {item.get('title', '')}")
|
|
48
|
+
url = item.get("url")
|
|
49
|
+
if url:
|
|
50
|
+
lines.append(f" 🔗 {url}")
|
|
51
|
+
summary = item.get("summary")
|
|
52
|
+
if summary:
|
|
53
|
+
lines.append(f" 📝 {summary[:150]}{'...' if len(summary) > 150 else ''}")
|
|
54
|
+
author = item.get("author")
|
|
55
|
+
if author:
|
|
56
|
+
lines.append(f" 👤 {author}")
|
|
57
|
+
return "\n".join(lines)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _json_output(result: dict) -> str:
|
|
61
|
+
"""JSON 输出 — 完整 schema,包含 None"""
|
|
62
|
+
return json.dumps(result, indent=2, ensure_ascii=False)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# ─── CLI 参数解析 ───────────────────────────────────────────────
|
|
66
|
+
|
|
67
|
+
def _build_parser() -> argparse.ArgumentParser:
|
|
68
|
+
p = argparse.ArgumentParser(
|
|
69
|
+
prog="news",
|
|
70
|
+
description="模块化新闻聚合 CLI — 支持 HN/GitHub/HuggingFace/ZAKER/V2EX/Reddit/RSS",
|
|
71
|
+
)
|
|
72
|
+
p.add_argument(
|
|
73
|
+
"--source", "-s", default="all",
|
|
74
|
+
help="source:module 格式,逗号分隔多源\n"
|
|
75
|
+
" hackernews:topstories|newest|ask|show|jobs\n"
|
|
76
|
+
" github:trending\n"
|
|
77
|
+
" huggingface:daily|trending\n"
|
|
78
|
+
" zaker:hot|category|search\n"
|
|
79
|
+
" v2ex:hot|latest|node:<name>\n"
|
|
80
|
+
" reddit:popular|hot|r/<subreddit>\n"
|
|
81
|
+
" rss:<preset>|custom\n"
|
|
82
|
+
" all = 所有注册 source(不含 rss)"
|
|
83
|
+
)
|
|
84
|
+
p.add_argument("--limit", "-n", type=int, default=10, help="每 source 最大条数")
|
|
85
|
+
p.add_argument(
|
|
86
|
+
"--keyword", "-k", default=None,
|
|
87
|
+
help="关键词过滤(逗号分隔多词,AND 匹配)"
|
|
88
|
+
)
|
|
89
|
+
p.add_argument(
|
|
90
|
+
"--params", "-p", default=None,
|
|
91
|
+
help="source-specific 参数,JSON 格式\n"
|
|
92
|
+
' 如: {"zaker": {"category": "technology"}, "github": {"language": "Python"}}'
|
|
93
|
+
)
|
|
94
|
+
p.add_argument("--json", action="store_true", help="JSON 输出(供 agent 解析)")
|
|
95
|
+
p.add_argument(
|
|
96
|
+
"--enrich", "-e", action="store_true", default=True,
|
|
97
|
+
help="对 summary=null 的 item 并发拉取原文 og:description(curl,8s 超时,默认开启)"
|
|
98
|
+
)
|
|
99
|
+
p.add_argument(
|
|
100
|
+
"--no-enrich", dest="enrich", action="store_false",
|
|
101
|
+
help="禁用 --enrich,不拉取原文摘要"
|
|
102
|
+
)
|
|
103
|
+
p.add_argument(
|
|
104
|
+
"--modules", action="store_true",
|
|
105
|
+
help="列出所有 source 的可用 module"
|
|
106
|
+
)
|
|
107
|
+
return p
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _parse_params(params_str: str | None) -> dict:
|
|
111
|
+
"""解析 --params JSON 字符串"""
|
|
112
|
+
if not params_str:
|
|
113
|
+
return {}
|
|
114
|
+
try:
|
|
115
|
+
return json.loads(params_str)
|
|
116
|
+
except Exception:
|
|
117
|
+
return {}
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# ─── 主入口 ─────────────────────────────────────────────────────
|
|
121
|
+
|
|
122
|
+
def main() -> None:
    """CLI entry point: dispatch to natural-language mode or flag mode.

    When the first command-line argument does not start with ``-`` the whole
    argument list is joined and handed to ``_run_nl``; the process then exits
    with 0 or 1 depending on the result's ``ok`` field (1 on any error).
    Otherwise the arguments are parsed with ``_build_parser`` and the fetch
    result is printed as JSON or text.
    """
    cli_args = sys.argv[1:]

    # -- natural-language mode --
    if cli_args and not cli_args[0].startswith("-"):
        try:
            outcome = _run_nl(" ".join(cli_args))
            sys.exit(0 if outcome["ok"] else 1)
        except ParseError as err:
            print(f"❗ 语法错误:{err}")
            print("提示:get hackernews topstories 5 / list hackernews")
            sys.exit(1)
        except Exception as err:
            print(f"❗ 执行出错:{err}")
            sys.exit(1)

    # -- flag mode --
    args = _build_parser().parse_args()
    if args.modules:
        _print_modules()
        return

    # "all" means "no filter": the aggregator then uses its default sources.
    source_filter = None if args.source == "all" else args.source
    params = _parse_params(args.params)
    aggregator = NewsAggregator(limit_per_source=args.limit)
    result = aggregator.fetch(
        source_filter=source_filter,
        limit=None,
        keyword=args.keyword,
        params=params,
        enrich=args.enrich,
    )

    if args.json:
        print(_json_output(result))
        return

    print(_render_text(result["items"]))
    if result["errors"]:
        print(f"\n⚠️ {len(result['errors'])} 个 source 出错:")
        for message in result["errors"]:
            print(f" - {message}")
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _run_nl(raw: str) -> dict:
    """Execute one natural-language command and print its output.

    Args:
        raw: The command text, parsed with ``parseNL``.

    Returns:
        ``{"ok": True, "items": []}`` for ``list`` commands, otherwise the
        dict returned by ``NewsAggregator.fetch``.
    """
    parsed = parseNL(raw)

    # -- list mode --
    if parsed.command == "list":
        target = parsed.list_target
        if target == "sources":
            print(list_sources())
        elif target == "modules":
            # Every registered source, each followed by a blank line.
            for source_name in REGISTRY.keys():
                print(list_source_modules(source_name))
                print()
        else:
            print(list_source_modules(target))
        return {"ok": True, "items": []}

    # -- fetch mode --
    spec = parsed.fetch
    wanted = spec["source_filter"]
    source_filter = None if wanted == "all" else wanted

    aggregator = NewsAggregator(limit_per_source=spec["limit"])
    agg_result = aggregator.fetch(
        source_filter=source_filter,
        limit=None,
        keyword=spec["keyword"],
        params=spec["params"],
        enrich=spec["enrich"],
    )

    if spec["output"] == "json":
        print(_json_output(agg_result))
        return agg_result

    print(_render_text(agg_result["items"]))
    if agg_result["errors"]:
        print(f"\n⚠️ {len(agg_result['errors'])} 个 source 出错:")
        for message in agg_result["errors"]:
            print(f" - {message}")
    return agg_result
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _print_modules() -> None:
    """Print a table of RSS presets followed by every registered source.

    Each registered source row shows its name, ``display_name`` and the
    comma-joined ``modules`` (or ``(all)`` when the source declares none).
    """
    catalog = dict(REGISTRY)
    catalog["rss"] = rss_module.RSSSource

    print(f"{'Source':<15} {'Display Name':<25} {'Modules'}")
    print("─" * 80)

    # RSS presets, sorted by key
    presets = rss_module.PRESET_SOURCES
    for key in sorted(presets):
        info = presets[key]
        print(f"{'rss:'+key:<15} {info['name']:<25} (RSS preset)")

    print()
    # Registered sources; the rss entry is skipped because its presets are
    # already listed above.
    for name in sorted(catalog):
        if name == "rss":
            continue
        cls = catalog[name]
        if cls.modules:
            modules = ", ".join(cls.modules)
        else:
            modules = "(all)"
        print(f"{name:<15} {cls.display_name:<25} {modules}")
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
if __name__ == "__main__":
|
|
232
|
+
main()
|
newscli/enrich.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
enrich.py — URL enrichment: fetch og:description / meta description via curl
|
|
4
|
+
|
|
5
|
+
对 summary=null 的 item 并发拉取原文 description,填充 summary 字段。
|
|
6
|
+
使用 curl(subprocess)绕过 Python requests 对某些站点的 TLS 超时问题。
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import concurrent.futures
|
|
10
|
+
import re
|
|
11
|
+
import subprocess
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
|
|
16
|
+
class EnrichResult:
|
|
17
|
+
url: str
|
|
18
|
+
description: str | None # None = failed/not found
|
|
19
|
+
error: str | None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def fetch_description(url: str, timeout: int = 8) -> EnrichResult:
    """Fetch a page with curl and extract its og:description / meta description.

    The URL comes from third-party feeds, so the curl invocation is hardened:
    ``--proto``/``--proto-redir`` restrict the request and any redirects to
    http(s) (no ``file://`` etc.), and ``--`` ends option parsing so a value
    starting with ``-`` cannot be read as a curl option.

    Args:
        url: Page URL to fetch.
        timeout: curl ``--max-time`` in seconds; the subprocess itself is
            given one extra second before it is killed.

    Returns:
        An ``EnrichResult``.  ``description`` is ``None`` when nothing was
        found or the fetch failed; ``error`` then carries ``"timeout"``, the
        exception text, or the non-zero curl exit code.  This function does
        not raise.
    """
    try:
        result = subprocess.run(
            [
                "curl", "-s", "-L", "--max-time", str(timeout),
                "--proto", "=http,https",
                "--proto-redir", "=http,https",
                "-A", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "-H", "Accept: text/html,application/xhtml+xml,*/*",
                "-H", "Accept-Language: en-US,en;q=0.9",
                "--", url,
            ],
            capture_output=True,
            # Decode leniently: a single undecodable byte must not discard
            # the whole page (text=True would use the locale encoding and
            # raise on invalid input).
            encoding="utf-8",
            errors="replace",
            timeout=timeout + 1,  # process-level timeout slightly higher
        )
        # Even a failed transfer may have delivered the <head> section, so
        # extraction is always attempted on whatever was received.
        description = _extract_description(result.stdout or "")
        error = None
        if description is None and result.returncode != 0:
            error = f"curl exited with code {result.returncode}"
        return EnrichResult(url=url, description=description, error=error)
    except subprocess.TimeoutExpired:
        return EnrichResult(url=url, description=None, error="timeout")
    except Exception as e:
        # Best-effort enrichment: e.g. curl not installed.
        return EnrichResult(url=url, description=None, error=str(e))
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _extract_description(html: str) -> str | None:
|
|
51
|
+
"""从 HTML 中提取 og:description 或 meta description"""
|
|
52
|
+
patterns = [
|
|
53
|
+
# og:description
|
|
54
|
+
r'<meta[^>]+property=["\']og:description["\'][^>]+content=["\'](.*?)["\']',
|
|
55
|
+
r'<meta[^>]+content=["\'](.*?)["\'][^>]+property=["\']og:description["\']',
|
|
56
|
+
# meta description
|
|
57
|
+
r'<meta[^>]+name=["\']description["\'][^>]+content=["\'](.*?)["\']',
|
|
58
|
+
r'<meta[^>]+content=["\'](.*?)["\'][^>]+name=["\']description["\']',
|
|
59
|
+
]
|
|
60
|
+
for pattern in patterns:
|
|
61
|
+
m = re.search(pattern, html, re.IGNORECASE)
|
|
62
|
+
if m:
|
|
63
|
+
text = m.group(1).strip()
|
|
64
|
+
# Clean HTML entities
|
|
65
|
+
text = text.replace("&", "&").replace(""", '"').replace("'", "'").replace("<", "<").replace(">", ">").replace("/", "/").replace(" ", " ")
|
|
66
|
+
if text:
|
|
67
|
+
return text[:500]
|
|
68
|
+
return None
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def enrich_items(items: list[dict], max_workers: int = 8) -> list[dict]:
    """Fill missing summaries by fetching each item's page description.

    Only items whose ``summary`` is ``None`` and that have a truthy ``url``
    are looked up, concurrently, through ``fetch_description``.

    Args:
        items: Item dicts.
        max_workers: Upper bound on the number of worker threads.

    Returns:
        The input list itself when nothing needs fetching; otherwise a new
        list in the same order in which every item that received a
        description is replaced by a copy with ``summary`` set.
    """
    # index -> URL for every item that needs a lookup
    pending = {
        idx: entry["url"]
        for idx, entry in enumerate(items)
        if entry.get("summary") is None and entry.get("url")
    }
    if not pending:
        return items

    descriptions: dict[int, str | None] = {}  # index -> description
    pool_size = min(max_workers, len(pending))
    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as pool:
        jobs = {
            pool.submit(fetch_description, link): idx
            for idx, link in pending.items()
        }
        for job in concurrent.futures.as_completed(jobs):
            position = jobs[job]
            try:
                descriptions[position] = job.result().description
            except Exception:
                descriptions[position] = None

    # Merge back, leaving items without a result untouched.
    return [
        {**entry, "summary": descriptions[idx]} if descriptions.get(idx) else entry
        for idx, entry in enumerate(items)
    ]
|