newscli-tool 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- newscli_tool-1.0.0/PKG-INFO +184 -0
- newscli_tool-1.0.0/README.md +158 -0
- newscli_tool-1.0.0/newscli/__init__.py +6 -0
- newscli_tool-1.0.0/newscli/aggregator.py +263 -0
- newscli_tool-1.0.0/newscli/cli.py +232 -0
- newscli_tool-1.0.0/newscli/enrich.py +112 -0
- newscli_tool-1.0.0/newscli/parser.py +386 -0
- newscli_tool-1.0.0/newscli/sources/__init__.py +29 -0
- newscli_tool-1.0.0/newscli/sources/base.py +181 -0
- newscli_tool-1.0.0/newscli/sources/devto.py +87 -0
- newscli_tool-1.0.0/newscli/sources/github.py +77 -0
- newscli_tool-1.0.0/newscli/sources/hackernews.py +94 -0
- newscli_tool-1.0.0/newscli/sources/huggingface.py +95 -0
- newscli_tool-1.0.0/newscli/sources/lobsters.py +77 -0
- newscli_tool-1.0.0/newscli/sources/reddit.py +93 -0
- newscli_tool-1.0.0/newscli/sources/rss.py +155 -0
- newscli_tool-1.0.0/newscli/sources/v2ex.py +121 -0
- newscli_tool-1.0.0/newscli/sources/zaker.py +140 -0
- newscli_tool-1.0.0/newscli_tool.egg-info/PKG-INFO +184 -0
- newscli_tool-1.0.0/newscli_tool.egg-info/SOURCES.txt +24 -0
- newscli_tool-1.0.0/newscli_tool.egg-info/dependency_links.txt +1 -0
- newscli_tool-1.0.0/newscli_tool.egg-info/entry_points.txt +2 -0
- newscli_tool-1.0.0/newscli_tool.egg-info/requires.txt +3 -0
- newscli_tool-1.0.0/newscli_tool.egg-info/top_level.txt +1 -0
- newscli_tool-1.0.0/pyproject.toml +43 -0
- newscli_tool-1.0.0/setup.cfg +4 -0
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: newscli-tool
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Unified news aggregation CLI — 9 sources, 25+ modules, natural language DSL
|
|
5
|
+
Author-email: kzclaw <kzclaw@users.noreply.github.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/kzclaw/news-cli
|
|
8
|
+
Project-URL: Repository, https://github.com/kzclaw/news-cli
|
|
9
|
+
Project-URL: Issues, https://github.com/kzclaw/news-cli/issues
|
|
10
|
+
Keywords: news,hackernews,github,cli,aggregation,rss
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
Requires-Dist: requests>=2.28.0
|
|
24
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
25
|
+
Requires-Dist: lxml>=4.9.0
|
|
26
|
+
|
|
27
|
+
# newscli
|
|
28
|
+
|
|
29
|
+
**Unified news aggregation CLI** — pull from 9 sources in one command.
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
newscli get hackernews topstories 5
|
|
33
|
+
newscli get github trending 10 language python
|
|
34
|
+
newscli get all 15 json
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
[PyPI version](https://pypi.org/project/newscli-tool/)
|
|
38
|
+
[Python versions](https://pypi.org/project/newscli-tool/)
|
|
39
|
+
[License: MIT](LICENSE)
|
|
40
|
+
|
|
41
|
+
📄 [中文版](README_zh.md)
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Install
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
# From TestPyPI (works now, globally accessible)
|
|
49
|
+
pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ newscli-tool
|
|
50
|
+
|
|
51
|
+
# From GitHub (dev branch, latest)
|
|
52
|
+
pip install git+https://github.com/kzclaw/news-cli.git
|
|
53
|
+
|
|
54
|
+
# One-liner (any machine with curl + python3)
|
|
55
|
+
curl -sSL https://raw.githubusercontent.com/kzclaw/news-cli/main/install.sh | bash
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
After install, `newscli` is available in your terminal globally.
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Features
|
|
63
|
+
|
|
64
|
+
| Feature | Details |
|
|
65
|
+
|---------|---------|
|
|
66
|
+
| **9 sources** | Hacker News · GitHub Trending · Hugging Face · ZAKER · V2EX · Reddit · DEV.to · Lobsters · RSS |
|
|
67
|
+
| **25+ modules** | Each source has multiple views — topstories, trending, ask, show, jobs, by language, by node, by subreddit… |
|
|
68
|
+
| **Natural language DSL** | `get`, `list`, `看`, `拉` — no flags to remember |
|
|
69
|
+
| **URL enrichment** | Auto-fetches og:description for every item with `summary = null` |
|
|
70
|
+
| **Deduplication** | Cross-source Jaccard similarity, 70% threshold, keeps richer item |
|
|
71
|
+
| **JSON output** | `NewsItem v1.0` schema — always 9 fields, `null` means "source doesn't have it" |
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Usage
|
|
76
|
+
|
|
77
|
+
### Natural language mode
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
newscli get <source> <module> [limit] [json] [noenrich]
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
**Examples**
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
# Single source
|
|
87
|
+
newscli get hackernews topstories 5
|
|
88
|
+
newscli get github trending 10 language python
|
|
89
|
+
newscli get v2ex node python 10
|
|
90
|
+
newscli get reddit subreddit technology 10
|
|
91
|
+
newscli get huggingface daily 5
|
|
92
|
+
|
|
93
|
+
# Multi-source (AND)
|
|
94
|
+
newscli get hackernews topstories 5 and github trending 10 and reddit subreddit programming 5
|
|
95
|
+
|
|
96
|
+
# All sources
|
|
97
|
+
newscli get all 15 json
|
|
98
|
+
|
|
99
|
+
# JSON output (machine-readable)
|
|
100
|
+
newscli get hackernews topstories 5 json
|
|
101
|
+
newscli get all 3 json noenrich
|
|
102
|
+
|
|
103
|
+
# List available sources / modules
|
|
104
|
+
newscli list
|
|
105
|
+
newscli list github
|
|
106
|
+
newscli list v2ex
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Flag mode
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
newscli --source hackernews:topstories --limit 5 --json
|
|
113
|
+
newscli --source "hackernews:topstories&github:trending" --limit 5
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Source & Module Reference
|
|
119
|
+
|
|
120
|
+
| Source | Modules |
|
|
121
|
+
|--------|---------|
|
|
122
|
+
| `hackernews` | `topstories` · `new` · `ask` · `show` · `jobs` |
|
|
123
|
+
| `github` | `trending` · `trending-weekly` · `trending-monthly` · language parameter |
|
|
124
|
+
| `huggingface` | `daily` · `weekly` · `monthly` |
|
|
125
|
+
| `zaker` | `hot` · `news` · `search` · category parameter |
|
|
126
|
+
| `v2ex` | `hot` · `latest` · `node:<name>` |
|
|
127
|
+
| `reddit` | `subreddit:<name>` · `popular` · `hot` |
|
|
128
|
+
| `devto` | `latest` · `top` · `tags:<tag>` |
|
|
129
|
+
| `lobsters` | `newest` · `hot` · `top` · `upcoming` |
|
|
130
|
+
| `rss` | `feed:<url>` — any RSS/Atom feed by URL |
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## Output Schema
|
|
135
|
+
|
|
136
|
+
Every item follows `NewsItem v1.0` — null means "source doesn't provide this", never faked:
|
|
137
|
+
|
|
138
|
+
| Field | Type | Description |
|
|
139
|
+
|-------|------|-------------|
|
|
140
|
+
| `source` | `str` | Source name (e.g. `hackernews`) |
|
|
141
|
+
| `module` | `str` | Sub-module (e.g. `topstories`) |
|
|
142
|
+
| `title` | `str` | Item title |
|
|
143
|
+
| `url` | `str` | Link to item |
|
|
144
|
+
| `author` | `str\|null` | Author / submitter |
|
|
145
|
+
| `published_at` | `datetime\|null` | Publication time |
|
|
146
|
+
| `summary` | `str\|null` | Description or og:description |
|
|
147
|
+
| `score` | `int\|null` | Score / points (if available) |
|
|
148
|
+
| `comments` | `int\|null` | Comment count (if available) |
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## Architecture
|
|
153
|
+
|
|
154
|
+
```
|
|
155
|
+
newscli/
|
|
156
|
+
├── cli.py # Dual-mode entry: flags + NL DSL
|
|
157
|
+
├── aggregator.py # ThreadPoolExecutor dispatcher + deduplication
|
|
158
|
+
├── parser.py # NL DSL parser (no AI — pure rules)
|
|
159
|
+
├── enrich.py # Concurrent og:description fetcher
|
|
160
|
+
└── sources/
|
|
161
|
+
├── base.py # NewsSource ABC + NewsItem schema
|
|
162
|
+
├── hackernews.py
|
|
163
|
+
├── github.py
|
|
164
|
+
├── huggingface.py
|
|
165
|
+
├── zaker.py
|
|
166
|
+
├── v2ex.py
|
|
167
|
+
├── reddit.py
|
|
168
|
+
├── devto.py
|
|
169
|
+
├── lobsters.py
|
|
170
|
+
└── rss.py
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## Requirements
|
|
176
|
+
|
|
177
|
+
- Python 3.10+
|
|
178
|
+
- `requests` · `beautifulsoup4` · `lxml` (installed automatically)
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
## License
|
|
183
|
+
|
|
184
|
+
MIT · [kzclaw/news-cli](https://github.com/kzclaw/news-cli)
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# newscli
|
|
2
|
+
|
|
3
|
+
**Unified news aggregation CLI** — pull from 9 sources in one command.
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
newscli get hackernews topstories 5
|
|
7
|
+
newscli get github trending 10 language python
|
|
8
|
+
newscli get all 15 json
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
[PyPI version](https://pypi.org/project/newscli-tool/)
|
|
12
|
+
[Python versions](https://pypi.org/project/newscli-tool/)
|
|
13
|
+
[License: MIT](LICENSE)
|
|
14
|
+
|
|
15
|
+
📄 [中文版](README_zh.md)
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Install
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
# From TestPyPI (works now, globally accessible)
|
|
23
|
+
pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ newscli-tool
|
|
24
|
+
|
|
25
|
+
# From GitHub (dev branch, latest)
|
|
26
|
+
pip install git+https://github.com/kzclaw/news-cli.git
|
|
27
|
+
|
|
28
|
+
# One-liner (any machine with curl + python3)
|
|
29
|
+
curl -sSL https://raw.githubusercontent.com/kzclaw/news-cli/main/install.sh | bash
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
After install, `newscli` is available in your terminal globally.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Features
|
|
37
|
+
|
|
38
|
+
| Feature | Details |
|
|
39
|
+
|---------|---------|
|
|
40
|
+
| **9 sources** | Hacker News · GitHub Trending · Hugging Face · ZAKER · V2EX · Reddit · DEV.to · Lobsters · RSS |
|
|
41
|
+
| **25+ modules** | Each source has multiple views — topstories, trending, ask, show, jobs, by language, by node, by subreddit… |
|
|
42
|
+
| **Natural language DSL** | `get`, `list`, `看`, `拉` — no flags to remember |
|
|
43
|
+
| **URL enrichment** | Auto-fetches og:description for every item with `summary = null` |
|
|
44
|
+
| **Deduplication** | Cross-source Jaccard similarity, 70% threshold, keeps richer item |
|
|
45
|
+
| **JSON output** | `NewsItem v1.0` schema — always 9 fields, `null` means "source doesn't have it" |
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Usage
|
|
50
|
+
|
|
51
|
+
### Natural language mode
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
newscli get <source> <module> [limit] [json] [noenrich]
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
**Examples**
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
# Single source
|
|
61
|
+
newscli get hackernews topstories 5
|
|
62
|
+
newscli get github trending 10 language python
|
|
63
|
+
newscli get v2ex node python 10
|
|
64
|
+
newscli get reddit subreddit technology 10
|
|
65
|
+
newscli get huggingface daily 5
|
|
66
|
+
|
|
67
|
+
# Multi-source (AND)
|
|
68
|
+
newscli get hackernews topstories 5 and github trending 10 and reddit subreddit programming 5
|
|
69
|
+
|
|
70
|
+
# All sources
|
|
71
|
+
newscli get all 15 json
|
|
72
|
+
|
|
73
|
+
# JSON output (machine-readable)
|
|
74
|
+
newscli get hackernews topstories 5 json
|
|
75
|
+
newscli get all 3 json noenrich
|
|
76
|
+
|
|
77
|
+
# List available sources / modules
|
|
78
|
+
newscli list
|
|
79
|
+
newscli list github
|
|
80
|
+
newscli list v2ex
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Flag mode
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
newscli --source hackernews:topstories --limit 5 --json
|
|
87
|
+
newscli --source "hackernews:topstories&github:trending" --limit 5
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## Source & Module Reference
|
|
93
|
+
|
|
94
|
+
| Source | Modules |
|
|
95
|
+
|--------|---------|
|
|
96
|
+
| `hackernews` | `topstories` · `new` · `ask` · `show` · `jobs` |
|
|
97
|
+
| `github` | `trending` · `trending-weekly` · `trending-monthly` · language parameter |
|
|
98
|
+
| `huggingface` | `daily` · `weekly` · `monthly` |
|
|
99
|
+
| `zaker` | `hot` · `news` · `search` · category parameter |
|
|
100
|
+
| `v2ex` | `hot` · `latest` · `node:<name>` |
|
|
101
|
+
| `reddit` | `subreddit:<name>` · `popular` · `hot` |
|
|
102
|
+
| `devto` | `latest` · `top` · `tags:<tag>` |
|
|
103
|
+
| `lobsters` | `newest` · `hot` · `top` · `upcoming` |
|
|
104
|
+
| `rss` | `feed:<url>` — any RSS/Atom feed by URL |
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## Output Schema
|
|
109
|
+
|
|
110
|
+
Every item follows `NewsItem v1.0` — null means "source doesn't provide this", never faked:
|
|
111
|
+
|
|
112
|
+
| Field | Type | Description |
|
|
113
|
+
|-------|------|-------------|
|
|
114
|
+
| `source` | `str` | Source name (e.g. `hackernews`) |
|
|
115
|
+
| `module` | `str` | Sub-module (e.g. `topstories`) |
|
|
116
|
+
| `title` | `str` | Item title |
|
|
117
|
+
| `url` | `str` | Link to item |
|
|
118
|
+
| `author` | `str\|null` | Author / submitter |
|
|
119
|
+
| `published_at` | `datetime\|null` | Publication time |
|
|
120
|
+
| `summary` | `str\|null` | Description or og:description |
|
|
121
|
+
| `score` | `int\|null` | Score / points (if available) |
|
|
122
|
+
| `comments` | `int\|null` | Comment count (if available) |
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## Architecture
|
|
127
|
+
|
|
128
|
+
```
|
|
129
|
+
newscli/
|
|
130
|
+
├── cli.py # Dual-mode entry: flags + NL DSL
|
|
131
|
+
├── aggregator.py # ThreadPoolExecutor dispatcher + deduplication
|
|
132
|
+
├── parser.py # NL DSL parser (no AI — pure rules)
|
|
133
|
+
├── enrich.py # Concurrent og:description fetcher
|
|
134
|
+
└── sources/
|
|
135
|
+
├── base.py # NewsSource ABC + NewsItem schema
|
|
136
|
+
├── hackernews.py
|
|
137
|
+
├── github.py
|
|
138
|
+
├── huggingface.py
|
|
139
|
+
├── zaker.py
|
|
140
|
+
├── v2ex.py
|
|
141
|
+
├── reddit.py
|
|
142
|
+
├── devto.py
|
|
143
|
+
├── lobsters.py
|
|
144
|
+
└── rss.py
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Requirements
|
|
150
|
+
|
|
151
|
+
- Python 3.10+
|
|
152
|
+
- `requests` · `beautifulsoup4` · `lxml` (installed automatically)
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## License
|
|
157
|
+
|
|
158
|
+
MIT · [kzclaw/news-cli](https://github.com/kzclaw/news-cli)
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"""
|
|
2
|
+
aggregator.py — News Aggregator
|
|
3
|
+
|
|
4
|
+
统一调度多个 source,支持:
|
|
5
|
+
- 多 source 并行拉取(ThreadPoolExecutor)
|
|
6
|
+
- 每个 source 的 module 子模块指定
|
|
7
|
+
- source-specific 额外参数(category, language, node, subreddit 等)
|
|
8
|
+
- 统一 JSON 输出(完整 schema)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import concurrent.futures
|
|
12
|
+
from dataclasses import asdict
|
|
13
|
+
from typing import Optional
|
|
14
|
+
from newscli.sources import REGISTRY, NewsItem, SourceError, rss as rss_module
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class NewsAggregator:
|
|
18
|
+
"""多 source 统一聚合器"""
|
|
19
|
+
|
|
20
|
+
def __init__(
|
|
21
|
+
self,
|
|
22
|
+
sources: list[str] | None = None,
|
|
23
|
+
limit_per_source: int = 10,
|
|
24
|
+
):
|
|
25
|
+
"""
|
|
26
|
+
Args:
|
|
27
|
+
sources : source 名称列表(如 ["hackernews", "github"])
|
|
28
|
+
None = 所有注册 source(不含 rss)
|
|
29
|
+
limit_per_source: 每个 source 最大拉取条数
|
|
30
|
+
"""
|
|
31
|
+
self.sources = sources or list(REGISTRY.keys())
|
|
32
|
+
self.limit_per_source = limit_per_source
|
|
33
|
+
|
|
34
|
+
def fetch(
|
|
35
|
+
self,
|
|
36
|
+
source_filter: str | None = None,
|
|
37
|
+
limit: int | None = None,
|
|
38
|
+
keyword: str | None = None,
|
|
39
|
+
params: dict | None = None,
|
|
40
|
+
enrich: bool = True, # 默认对 summary=null 的 item 拉取原文摘要
|
|
41
|
+
) -> dict:
|
|
42
|
+
"""
|
|
43
|
+
并行拉取所有 source,返回统一格式。
|
|
44
|
+
|
|
45
|
+
Args:
|
|
46
|
+
source_filter : 逗号分隔的 source:module 对列表
|
|
47
|
+
如 "hackernews:topstories,github:trending,v2ex:latest"
|
|
48
|
+
limit : 全局返回上限(None = 所有)
|
|
49
|
+
keyword : 关键词过滤(所有 source 生效)
|
|
50
|
+
params : source-specific 参数 dict
|
|
51
|
+
格式:{"<source>": {"module": "...", "category": "..."}}
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
{"ok": bool, "schema": str, "sources": {}, "items": [], "total": int, "errors": []}
|
|
55
|
+
items 每项 = NewsItem.to_dict(),即所有字段(含 None)
|
|
56
|
+
"""
|
|
57
|
+
parsed = self._parse_filter(source_filter)
|
|
58
|
+
params = params or {}
|
|
59
|
+
|
|
60
|
+
# results key = source:module(如 "hackernews:topstories")
|
|
61
|
+
results: dict[str, list[dict]] = {}
|
|
62
|
+
errors: list[str] = []
|
|
63
|
+
|
|
64
|
+
def _fetch_one(name: str, module: str | None, extra: dict):
|
|
65
|
+
"""执行单次 fetch,返回 (key, items_list, errors_list)"""
|
|
66
|
+
# 唯一 key:不同 module 的同一 source 不会互相覆盖
|
|
67
|
+
key = f"{name}:{module}" if module else name
|
|
68
|
+
|
|
69
|
+
# RSS 特殊处理(不注册到 REGISTRY)
|
|
70
|
+
if name == "rss":
|
|
71
|
+
src = rss_module.RSSSource(
|
|
72
|
+
source_key=extra.get("source_key"),
|
|
73
|
+
feed_url=extra.get("url"),
|
|
74
|
+
)
|
|
75
|
+
else:
|
|
76
|
+
src_cls = REGISTRY.get(name)
|
|
77
|
+
if not src_cls:
|
|
78
|
+
return key, [], [f"Unknown source: {name}"]
|
|
79
|
+
src = src_cls()
|
|
80
|
+
|
|
81
|
+
# global keyword filter applied at source.fetch() call
|
|
82
|
+
fetch_kwargs = {
|
|
83
|
+
"module": module,
|
|
84
|
+
"limit": self.limit_per_source,
|
|
85
|
+
"keyword": keyword, # may be None = no filtering
|
|
86
|
+
}
|
|
87
|
+
# per-source params override (from merged extra)
|
|
88
|
+
for k in ("category", "language", "node", "subreddit",
|
|
89
|
+
"start_time", "end_time", "url"):
|
|
90
|
+
if k in extra:
|
|
91
|
+
fetch_kwargs[k] = extra[k]
|
|
92
|
+
# per-source keyword (params level) takes precedence over global
|
|
93
|
+
if "keyword" in extra:
|
|
94
|
+
fetch_kwargs["keyword"] = extra["keyword"]
|
|
95
|
+
|
|
96
|
+
try:
|
|
97
|
+
items = src.fetch(**fetch_kwargs)
|
|
98
|
+
return key, [item.to_dict() for item in items], []
|
|
99
|
+
except SourceError as e:
|
|
100
|
+
return key, [], [str(e)]
|
|
101
|
+
except Exception as e:
|
|
102
|
+
return key, [], [f"{name} unexpected error: {e}"]
|
|
103
|
+
|
|
104
|
+
# 构建 work items
|
|
105
|
+
work: list[tuple] = []
|
|
106
|
+
if parsed:
|
|
107
|
+
for name, module, extra in parsed:
|
|
108
|
+
# CLI params 覆盖 hard-coded extra
|
|
109
|
+
sp = params.get(name, {})
|
|
110
|
+
merged_extra = {**extra, **sp}
|
|
111
|
+
work.append((name, module, merged_extra))
|
|
112
|
+
else:
|
|
113
|
+
for name in self.sources:
|
|
114
|
+
sp = params.get(name, {})
|
|
115
|
+
work.append((name, None, sp))
|
|
116
|
+
|
|
117
|
+
# 并行执行
|
|
118
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=min(len(work), 8)) as ex:
|
|
119
|
+
futures = {ex.submit(_fetch_one, *w): w for w in work}
|
|
120
|
+
for fut in concurrent.futures.as_completed(futures):
|
|
121
|
+
key, items, errs = fut.result()
|
|
122
|
+
if items:
|
|
123
|
+
results[key] = items
|
|
124
|
+
errors.extend(errs)
|
|
125
|
+
|
|
126
|
+
# 合并所有 items(按 source:module 分别存储,输出时全部扁平化)
|
|
127
|
+
all_items: list[dict] = []
|
|
128
|
+
for key, items in results.items():
|
|
129
|
+
all_items.extend(items)
|
|
130
|
+
|
|
131
|
+
# 跨 source 去重:标题相似度 ≥ 70%(仅多 source 时触发)
|
|
132
|
+
if len(results) > 1:
|
|
133
|
+
all_items = self._deduplicate(all_items)
|
|
134
|
+
|
|
135
|
+
# 可选:enrich — 对 summary=null 的 item 并发拉取原文 description
|
|
136
|
+
if enrich:
|
|
137
|
+
from .enrich import enrich_items
|
|
138
|
+
all_items = enrich_items(all_items)
|
|
139
|
+
|
|
140
|
+
if limit is not None:
|
|
141
|
+
all_items = all_items[:limit]
|
|
142
|
+
|
|
143
|
+
return {
|
|
144
|
+
"ok": True,
|
|
145
|
+
"schema": "NewsItem v1.0",
|
|
146
|
+
"sources": {k: len(v) for k, v in results.items()},
|
|
147
|
+
"items": all_items,
|
|
148
|
+
"total": len(all_items),
|
|
149
|
+
"errors": errors,
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
@staticmethod
|
|
153
|
+
def _deduplicate(items: list[dict]) -> list[dict]:
|
|
154
|
+
"""
|
|
155
|
+
跨 source 去重。相似度阈值 70%(标题 normalized 后比对)。
|
|
156
|
+
保留第一条出现的item,移除后续相似项。
|
|
157
|
+
"""
|
|
158
|
+
from urllib.parse import urlparse
|
|
159
|
+
def normalize_title(t: str) -> str:
|
|
160
|
+
"""小写 + 去除标点 + strip()"""
|
|
161
|
+
import re
|
|
162
|
+
t = t.lower().strip()
|
|
163
|
+
t = re.sub(r'[^\w\s]', ' ', t)
|
|
164
|
+
t = re.sub(r'\s+', ' ', t).strip()
|
|
165
|
+
return t
|
|
166
|
+
|
|
167
|
+
def similarity(a: str, b: str) -> float:
|
|
168
|
+
"""简单词集合 Jaccard 相似度"""
|
|
169
|
+
sa = set(a.split())
|
|
170
|
+
sb = set(b.split())
|
|
171
|
+
if not sa or not sb:
|
|
172
|
+
return 0.0
|
|
173
|
+
inter = len(sa & sb)
|
|
174
|
+
union = len(sa | sb)
|
|
175
|
+
return inter / union if union > 0 else 0.0
|
|
176
|
+
|
|
177
|
+
def item_key(item: dict) -> str:
|
|
178
|
+
domain = ""
|
|
179
|
+
if item.get("url"):
|
|
180
|
+
try:
|
|
181
|
+
domain = urlparse(item["url"]).netloc
|
|
182
|
+
except Exception:
|
|
183
|
+
pass
|
|
184
|
+
return f"{normalize_title(item['title'])}|{domain}"
|
|
185
|
+
|
|
186
|
+
seen: list[dict] = []
|
|
187
|
+
for item in items:
|
|
188
|
+
norm = normalize_title(item.get("title", ""))
|
|
189
|
+
if not norm:
|
|
190
|
+
seen.append(item)
|
|
191
|
+
continue
|
|
192
|
+
dup_idx = None
|
|
193
|
+
for i, s in enumerate(seen):
|
|
194
|
+
s_norm = normalize_title(s.get("title", ""))
|
|
195
|
+
# Different domain → different article, skip
|
|
196
|
+
s_domain = urlparse(s.get("url", "")).netloc or ""
|
|
197
|
+
item_domain = urlparse(item.get("url", "")).netloc or ""
|
|
198
|
+
if s_domain and item_domain and s_domain != item_domain:
|
|
199
|
+
continue
|
|
200
|
+
if similarity(norm, s_norm) >= 0.70:
|
|
201
|
+
dup_idx = i
|
|
202
|
+
break
|
|
203
|
+
if dup_idx is not None:
|
|
204
|
+
# 保留信息更丰富的那个(非 null 字段数量多的)
|
|
205
|
+
existing = seen[dup_idx]
|
|
206
|
+
existing_nulls = sum(1 for k, v in existing.items() if v is None and k != 'extra')
|
|
207
|
+
item_nulls = sum(1 for k, v in item.items() if v is None and k != 'extra')
|
|
208
|
+
if item_nulls < existing_nulls:
|
|
209
|
+
seen[dup_idx] = item
|
|
210
|
+
else:
|
|
211
|
+
seen.append(item)
|
|
212
|
+
return seen
|
|
213
|
+
|
|
214
|
+
@staticmethod
|
|
215
|
+
def _parse_filter(filter_str: str | None) -> list[tuple]:
|
|
216
|
+
"""
|
|
217
|
+
解析 'source:module,source2:module2' → [(name, module, extra_dict), ...]
|
|
218
|
+
|
|
219
|
+
支持格式:
|
|
220
|
+
hackernews:topstories → ("hackernews", "topstories", {})
|
|
221
|
+
v2ex:node:python → ("v2ex", "node", {"node": "python"})
|
|
222
|
+
rss:bensbites → ("rss", "bensbites", {"source_key": "bensbites"})
|
|
223
|
+
github:trending:language=Python → ("github", "trending", {"language": "Python"})
|
|
224
|
+
zaker:category:category=technology → ("zaker", "category", {"category": "technology"})
|
|
225
|
+
reddit:r/technology → ("reddit", None, {"subreddit": "technology"})
|
|
226
|
+
"""
|
|
227
|
+
if not filter_str:
|
|
228
|
+
return []
|
|
229
|
+
items = []
|
|
230
|
+
for part in filter_str.split("&"):
|
|
231
|
+
part = part.strip()
|
|
232
|
+
if not part:
|
|
233
|
+
continue
|
|
234
|
+
name, rest = part.split(":", 1) if ":" in part else (part, "")
|
|
235
|
+
name = name.strip()
|
|
236
|
+
|
|
237
|
+
extra = {}
|
|
238
|
+
tokens = rest.split(":") if rest else []
|
|
239
|
+
module = None
|
|
240
|
+
for token in tokens:
|
|
241
|
+
if "=" in token:
|
|
242
|
+
k, v = token.split("=", 1)
|
|
243
|
+
extra[k.strip()] = v.strip()
|
|
244
|
+
elif not module:
|
|
245
|
+
module = token
|
|
246
|
+
|
|
247
|
+
# Reddit r/<subreddit> shorthand
|
|
248
|
+
if name == "reddit" and module and module.startswith("r/"):
|
|
249
|
+
extra["subreddit"] = module[2:]
|
|
250
|
+
module = None
|
|
251
|
+
|
|
252
|
+
# RSS source_key shorthand
|
|
253
|
+
if name == "rss" and module and "=" not in module:
|
|
254
|
+
extra["source_key"] = module
|
|
255
|
+
|
|
256
|
+
items.append((name, module, extra))
|
|
257
|
+
return items
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def fetch_all(keyword: str | None = None, limit: int | None = None) -> dict:
    """Convenience entry point: fetch with a default ``NewsAggregator``.

    Builds an aggregator with its default arguments and forwards ``keyword``
    and ``limit`` to ``NewsAggregator.fetch``, returning its result dict.
    """
    return NewsAggregator().fetch(limit=limit, keyword=keyword)
|