rolling-reader 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rolling_reader-0.3.0 → rolling_reader-0.4.0}/PKG-INFO +2 -1
- {rolling_reader-0.3.0 → rolling_reader-0.4.0}/pyproject.toml +2 -1
- {rolling_reader-0.3.0 → rolling_reader-0.4.0}/src/rolling_reader/cli.py +6 -0
- {rolling_reader-0.3.0 → rolling_reader-0.4.0}/src/rolling_reader/dispatcher.py +10 -6
- {rolling_reader-0.3.0 → rolling_reader-0.4.0}/src/rolling_reader/extractor/cdp.py +8 -1
- rolling_reader-0.4.0/src/rolling_reader/extractor/clean.py +44 -0
- {rolling_reader-0.3.0 → rolling_reader-0.4.0}/src/rolling_reader/extractor/http.py +9 -1
- {rolling_reader-0.3.0 → rolling_reader-0.4.0}/.gitignore +0 -0
- {rolling_reader-0.3.0 → rolling_reader-0.4.0}/README.md +0 -0
- {rolling_reader-0.3.0 → rolling_reader-0.4.0}/src/rolling_reader/__init__.py +0 -0
- {rolling_reader-0.3.0 → rolling_reader-0.4.0}/src/rolling_reader/cache/__init__.py +0 -0
- {rolling_reader-0.3.0 → rolling_reader-0.4.0}/src/rolling_reader/cache/profile.py +0 -0
- {rolling_reader-0.3.0 → rolling_reader-0.4.0}/src/rolling_reader/extractor/__init__.py +0 -0
- {rolling_reader-0.3.0 → rolling_reader-0.4.0}/src/rolling_reader/extractor/state.py +0 -0
- {rolling_reader-0.3.0 → rolling_reader-0.4.0}/src/rolling_reader/models.py +0 -0
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rolling-reader
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Local-first web scraper that automatically rolls through HTTP → browser → JS state extraction
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.11
|
|
7
7
|
Requires-Dist: beautifulsoup4>=4.14
|
|
8
8
|
Requires-Dist: httpx>=0.28
|
|
9
9
|
Requires-Dist: playwright>=1.44
|
|
10
|
+
Requires-Dist: trafilatura>=1.12
|
|
10
11
|
Requires-Dist: typer>=0.12
|
|
11
12
|
Description-Content-Type: text/markdown
|
|
12
13
|
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "rolling-reader"
|
|
7
|
-
version = "0.3.0"
|
|
7
|
+
version = "0.4.0"
|
|
8
8
|
description = "Local-first web scraper that automatically rolls through HTTP → browser → JS state extraction"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -14,6 +14,7 @@ dependencies = [
|
|
|
14
14
|
"beautifulsoup4>=4.14",
|
|
15
15
|
"typer>=0.12",
|
|
16
16
|
"playwright>=1.44",
|
|
17
|
+
"trafilatura>=1.12",
|
|
17
18
|
]
|
|
18
19
|
|
|
19
20
|
[project.scripts]
|
|
@@ -90,6 +90,11 @@ def scrape(
|
|
|
90
90
|
"--verbose", "-v",
|
|
91
91
|
help="Print escalation steps to stderr",
|
|
92
92
|
),
|
|
93
|
+
clean: bool = typer.Option(
|
|
94
|
+
False,
|
|
95
|
+
"--clean", "-c",
|
|
96
|
+
help="Extract article body only, filtering out navigation, ads, and footers",
|
|
97
|
+
),
|
|
93
98
|
) -> None:
|
|
94
99
|
"""Scrape a URL and output structured data."""
|
|
95
100
|
|
|
@@ -101,6 +106,7 @@ def scrape(
|
|
|
101
106
|
cdp_endpoint=cdp_endpoint,
|
|
102
107
|
verbose=verbose,
|
|
103
108
|
use_cache=not no_cache,
|
|
109
|
+
clean=clean,
|
|
104
110
|
)
|
|
105
111
|
)
|
|
106
112
|
except ExtractionError as e:
|
|
@@ -38,6 +38,7 @@ async def dispatch(
|
|
|
38
38
|
page_timeout: float = 30.0,
|
|
39
39
|
verbose: bool = False,
|
|
40
40
|
use_cache: bool = True,
|
|
41
|
+
clean: bool = False,
|
|
41
42
|
) -> ExtractResult:
|
|
42
43
|
"""
|
|
43
44
|
自动选择最优抓取策略并执行。
|
|
@@ -64,11 +65,11 @@ async def dispatch(
|
|
|
64
65
|
# ── 强制指定层级 ──────────────────────────────────────────────────────
|
|
65
66
|
if force_level == 1:
|
|
66
67
|
log("forced Level 1 (HTTP)")
|
|
67
|
-
return await http_extract(url, timeout=http_timeout)
|
|
68
|
+
return await http_extract(url, timeout=http_timeout, clean=clean)
|
|
68
69
|
|
|
69
70
|
if force_level in (2, 3):
|
|
70
71
|
log(f"forced Level 2/3 (CDP)")
|
|
71
|
-
return await _try_level2(url, cdp_endpoint, page_timeout, log)
|
|
72
|
+
return await _try_level2(url, cdp_endpoint, page_timeout, log, clean=clean)
|
|
72
73
|
|
|
73
74
|
# ── Profile Cache:命中时直接跳到已知层级 ─────────────────────────────
|
|
74
75
|
if use_cache:
|
|
@@ -78,7 +79,7 @@ async def dispatch(
|
|
|
78
79
|
log(f"cache hit → Level {preferred} for {cached.get('domain')}")
|
|
79
80
|
if preferred == 1:
|
|
80
81
|
try:
|
|
81
|
-
result = await http_extract(url, timeout=http_timeout)
|
|
82
|
+
result = await http_extract(url, timeout=http_timeout, clean=clean)
|
|
82
83
|
profile_cache.save(url, result.level)
|
|
83
84
|
return result
|
|
84
85
|
except Exception:
|
|
@@ -86,7 +87,7 @@ async def dispatch(
|
|
|
86
87
|
profile_cache.invalidate(url)
|
|
87
88
|
else:
|
|
88
89
|
try:
|
|
89
|
-
result = await _try_level2(url, cdp_endpoint, page_timeout, log)
|
|
90
|
+
result = await _try_level2(url, cdp_endpoint, page_timeout, log, clean=clean)
|
|
90
91
|
profile_cache.save(url, result.level,
|
|
91
92
|
state_var=cached.get("state_var"))
|
|
92
93
|
return result
|
|
@@ -99,7 +100,7 @@ async def dispatch(
|
|
|
99
100
|
# Level 1:HTTP 直取
|
|
100
101
|
log(f"Level 1 → {url}")
|
|
101
102
|
try:
|
|
102
|
-
result = await http_extract(url, timeout=http_timeout)
|
|
103
|
+
result = await http_extract(url, timeout=http_timeout, clean=clean)
|
|
103
104
|
log(f"Level 1 succeeded ({result.elapsed_ms:.0f}ms)")
|
|
104
105
|
if use_cache:
|
|
105
106
|
profile_cache.save(url, result.level)
|
|
@@ -112,7 +113,7 @@ async def dispatch(
|
|
|
112
113
|
log(f"Level 1 → error ({e.reason}), escalating to Level 2/3")
|
|
113
114
|
|
|
114
115
|
# Level 2/3:CDP + 已有 Chrome(内部自动尝试 Level 3 state 提取)
|
|
115
|
-
result = await _try_level2(url, cdp_endpoint, page_timeout, log)
|
|
116
|
+
result = await _try_level2(url, cdp_endpoint, page_timeout, log, clean=clean)
|
|
116
117
|
if use_cache:
|
|
117
118
|
state_var = None
|
|
118
119
|
if result.level == 3:
|
|
@@ -127,6 +128,8 @@ async def _try_level2(
|
|
|
127
128
|
cdp_endpoint: str,
|
|
128
129
|
page_timeout: float,
|
|
129
130
|
log,
|
|
131
|
+
*,
|
|
132
|
+
clean: bool = False,
|
|
130
133
|
) -> ExtractResult:
|
|
131
134
|
"""尝试 Level 2,Chrome 不可用时给出清晰错误。"""
|
|
132
135
|
from rolling_reader.extractor.cdp import ChromeNotRunningError
|
|
@@ -146,6 +149,7 @@ async def _try_level2(
|
|
|
146
149
|
url,
|
|
147
150
|
cdp_endpoint=cdp_endpoint,
|
|
148
151
|
page_timeout=page_timeout,
|
|
152
|
+
clean=clean,
|
|
149
153
|
)
|
|
150
154
|
log(f"Level 2 succeeded ({result.elapsed_ms:.0f}ms)")
|
|
151
155
|
return result
|
|
@@ -73,6 +73,7 @@ async def extract(
|
|
|
73
73
|
cdp_endpoint: str = CDP_ENDPOINT,
|
|
74
74
|
page_timeout: float = 30.0,
|
|
75
75
|
wait_networkidle: bool = True,
|
|
76
|
+
clean: bool = False,
|
|
76
77
|
) -> ExtractResult:
|
|
77
78
|
"""
|
|
78
79
|
Level 2 CDP 抓取。
|
|
@@ -175,12 +176,18 @@ async def extract(
|
|
|
175
176
|
|
|
176
177
|
# ── 9. Level 2:回退到 DOM 提取 ───────────────────────────────────────
|
|
177
178
|
soup = BeautifulSoup(html, "html.parser")
|
|
179
|
+
if clean:
|
|
180
|
+
from rolling_reader.extractor.clean import clean_extract
|
|
181
|
+
cleaned = clean_extract(html, url=final_url)
|
|
182
|
+
text = cleaned if cleaned else _extract_text(BeautifulSoup(html, "html.parser"))
|
|
183
|
+
else:
|
|
184
|
+
text = _extract_text(BeautifulSoup(html, "html.parser"))
|
|
178
185
|
return ExtractResult(
|
|
179
186
|
url=final_url,
|
|
180
187
|
level=2,
|
|
181
188
|
status_code=200,
|
|
182
189
|
title=_extract_title(soup),
|
|
183
|
-
text=
|
|
190
|
+
text=text,
|
|
184
191
|
links=_extract_links(soup, final_url),
|
|
185
192
|
elapsed_ms=round(elapsed, 1),
|
|
186
193
|
)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""
|
|
2
|
+
rolling_reader/extractor/clean.py
|
|
3
|
+
==================================
|
|
4
|
+
正文提取(Article Extraction)
|
|
5
|
+
|
|
6
|
+
使用 trafilatura 从 HTML 中识别并提取主体文章内容,
|
|
7
|
+
过滤导航栏、广告、页脚、侧边栏等噪音。
|
|
8
|
+
|
|
9
|
+
对比默认的 BeautifulSoup 文本提取:
|
|
10
|
+
默认:把 <body> 里所有文字全部返回(快,但夹杂噪音)
|
|
11
|
+
--clean:只返回主体文章文字(慢约 50ms,但干净)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def clean_extract(html: str, url: str = "") -> Optional[str]:
|
|
19
|
+
"""
|
|
20
|
+
从 HTML 中提取正文。
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
html: 完整 HTML 字符串
|
|
24
|
+
url: 原始 URL(trafilatura 用于辅助判断,可选)
|
|
25
|
+
|
|
26
|
+
Returns:
|
|
27
|
+
正文文字,或 None(trafilatura 无法识别正文时)
|
|
28
|
+
"""
|
|
29
|
+
try:
|
|
30
|
+
import trafilatura
|
|
31
|
+
except ImportError:
|
|
32
|
+
raise ImportError(
|
|
33
|
+
"trafilatura is required for --clean mode: pip install trafilatura"
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
text = trafilatura.extract(
|
|
37
|
+
html,
|
|
38
|
+
url=url or None,
|
|
39
|
+
include_comments=False,
|
|
40
|
+
include_tables=True,
|
|
41
|
+
no_fallback=False, # 允许回退到其他算法
|
|
42
|
+
favor_precision=True,
|
|
43
|
+
)
|
|
44
|
+
return text or None
|
|
@@ -182,6 +182,7 @@ async def extract(
|
|
|
182
182
|
timeout: float = 15.0,
|
|
183
183
|
headers: Optional[dict] = None,
|
|
184
184
|
client: Optional[httpx.AsyncClient] = None,
|
|
185
|
+
clean: bool = False,
|
|
185
186
|
) -> ExtractResult:
|
|
186
187
|
"""
|
|
187
188
|
Level 1 HTTP 抓取。
|
|
@@ -219,9 +220,16 @@ async def extract(
|
|
|
219
220
|
# 解析内容
|
|
220
221
|
soup = BeautifulSoup(response.text, "html.parser")
|
|
221
222
|
title = _extract_title(soup)
|
|
222
|
-
text = _extract_text(BeautifulSoup(response.text, "html.parser")) # 用新 soup 避免修改影响
|
|
223
223
|
links = _extract_links(soup, str(response.url))
|
|
224
224
|
|
|
225
|
+
# --clean 模式:用 trafilatura 替换 BeautifulSoup 文本提取
|
|
226
|
+
if clean:
|
|
227
|
+
from rolling_reader.extractor.clean import clean_extract
|
|
228
|
+
cleaned = clean_extract(response.text, url=str(response.url))
|
|
229
|
+
text = cleaned if cleaned else _extract_text(soup)
|
|
230
|
+
else:
|
|
231
|
+
text = _extract_text(BeautifulSoup(response.text, "html.parser"))
|
|
232
|
+
|
|
225
233
|
return ExtractResult(
|
|
226
234
|
url=str(response.url),
|
|
227
235
|
level=1,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|