rolling-reader 0.3.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rolling-reader
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Local-first web scraper that automatically rolls through HTTP → browser → JS state extraction
5
5
  License: MIT
6
6
  Requires-Python: >=3.11
7
7
  Requires-Dist: beautifulsoup4>=4.14
8
8
  Requires-Dist: httpx>=0.28
9
9
  Requires-Dist: playwright>=1.44
10
+ Requires-Dist: trafilatura>=1.12
10
11
  Requires-Dist: typer>=0.12
11
12
  Description-Content-Type: text/markdown
12
13
 
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "rolling-reader"
7
- version = "0.3.0"
7
+ version = "0.4.0"
8
8
  description = "Local-first web scraper that automatically rolls through HTTP → browser → JS state extraction"
9
9
  readme = "README.md"
10
10
  license = { text = "MIT" }
@@ -14,6 +14,7 @@ dependencies = [
14
14
  "beautifulsoup4>=4.14",
15
15
  "typer>=0.12",
16
16
  "playwright>=1.44",
17
+ "trafilatura>=1.12",
17
18
  ]
18
19
 
19
20
  [project.scripts]
@@ -90,6 +90,11 @@ def scrape(
90
90
  "--verbose", "-v",
91
91
  help="Print escalation steps to stderr",
92
92
  ),
93
+ clean: bool = typer.Option(
94
+ False,
95
+ "--clean", "-c",
96
+ help="Extract article body only, filtering out navigation, ads, and footers",
97
+ ),
93
98
  ) -> None:
94
99
  """Scrape a URL and output structured data."""
95
100
 
@@ -101,6 +106,7 @@ def scrape(
101
106
  cdp_endpoint=cdp_endpoint,
102
107
  verbose=verbose,
103
108
  use_cache=not no_cache,
109
+ clean=clean,
104
110
  )
105
111
  )
106
112
  except ExtractionError as e:
@@ -38,6 +38,7 @@ async def dispatch(
38
38
  page_timeout: float = 30.0,
39
39
  verbose: bool = False,
40
40
  use_cache: bool = True,
41
+ clean: bool = False,
41
42
  ) -> ExtractResult:
42
43
  """
43
44
  自动选择最优抓取策略并执行。
@@ -64,11 +65,11 @@ async def dispatch(
64
65
  # ── 强制指定层级 ──────────────────────────────────────────────────────
65
66
  if force_level == 1:
66
67
  log("forced Level 1 (HTTP)")
67
- return await http_extract(url, timeout=http_timeout)
68
+ return await http_extract(url, timeout=http_timeout, clean=clean)
68
69
 
69
70
  if force_level in (2, 3):
70
71
  log(f"forced Level 2/3 (CDP)")
71
- return await _try_level2(url, cdp_endpoint, page_timeout, log)
72
+ return await _try_level2(url, cdp_endpoint, page_timeout, log, clean=clean)
72
73
 
73
74
  # ── Profile Cache:命中时直接跳到已知层级 ─────────────────────────────
74
75
  if use_cache:
@@ -78,7 +79,7 @@ async def dispatch(
78
79
  log(f"cache hit → Level {preferred} for {cached.get('domain')}")
79
80
  if preferred == 1:
80
81
  try:
81
- result = await http_extract(url, timeout=http_timeout)
82
+ result = await http_extract(url, timeout=http_timeout, clean=clean)
82
83
  profile_cache.save(url, result.level)
83
84
  return result
84
85
  except Exception:
@@ -86,7 +87,7 @@ async def dispatch(
86
87
  profile_cache.invalidate(url)
87
88
  else:
88
89
  try:
89
- result = await _try_level2(url, cdp_endpoint, page_timeout, log)
90
+ result = await _try_level2(url, cdp_endpoint, page_timeout, log, clean=clean)
90
91
  profile_cache.save(url, result.level,
91
92
  state_var=cached.get("state_var"))
92
93
  return result
@@ -99,7 +100,7 @@ async def dispatch(
99
100
  # Level 1:HTTP 直取
100
101
  log(f"Level 1 → {url}")
101
102
  try:
102
- result = await http_extract(url, timeout=http_timeout)
103
+ result = await http_extract(url, timeout=http_timeout, clean=clean)
103
104
  log(f"Level 1 succeeded ({result.elapsed_ms:.0f}ms)")
104
105
  if use_cache:
105
106
  profile_cache.save(url, result.level)
@@ -112,7 +113,7 @@ async def dispatch(
112
113
  log(f"Level 1 → error ({e.reason}), escalating to Level 2/3")
113
114
 
114
115
  # Level 2/3:CDP + 已有 Chrome(内部自动尝试 Level 3 state 提取)
115
- result = await _try_level2(url, cdp_endpoint, page_timeout, log)
116
+ result = await _try_level2(url, cdp_endpoint, page_timeout, log, clean=clean)
116
117
  if use_cache:
117
118
  state_var = None
118
119
  if result.level == 3:
@@ -127,6 +128,8 @@ async def _try_level2(
127
128
  cdp_endpoint: str,
128
129
  page_timeout: float,
129
130
  log,
131
+ *,
132
+ clean: bool = False,
130
133
  ) -> ExtractResult:
131
134
  """尝试 Level 2,Chrome 不可用时给出清晰错误。"""
132
135
  from rolling_reader.extractor.cdp import ChromeNotRunningError
@@ -146,6 +149,7 @@ async def _try_level2(
146
149
  url,
147
150
  cdp_endpoint=cdp_endpoint,
148
151
  page_timeout=page_timeout,
152
+ clean=clean,
149
153
  )
150
154
  log(f"Level 2 succeeded ({result.elapsed_ms:.0f}ms)")
151
155
  return result
@@ -73,6 +73,7 @@ async def extract(
73
73
  cdp_endpoint: str = CDP_ENDPOINT,
74
74
  page_timeout: float = 30.0,
75
75
  wait_networkidle: bool = True,
76
+ clean: bool = False,
76
77
  ) -> ExtractResult:
77
78
  """
78
79
  Level 2 CDP 抓取。
@@ -175,12 +176,18 @@ async def extract(
175
176
 
176
177
  # ── 9. Level 2:回退到 DOM 提取 ───────────────────────────────────────
177
178
  soup = BeautifulSoup(html, "html.parser")
179
+ if clean:
180
+ from rolling_reader.extractor.clean import clean_extract
181
+ cleaned = clean_extract(html, url=final_url)
182
+ text = cleaned if cleaned else _extract_text(BeautifulSoup(html, "html.parser"))
183
+ else:
184
+ text = _extract_text(BeautifulSoup(html, "html.parser"))
178
185
  return ExtractResult(
179
186
  url=final_url,
180
187
  level=2,
181
188
  status_code=200,
182
189
  title=_extract_title(soup),
183
- text=_extract_text(BeautifulSoup(html, "html.parser")),
190
+ text=text,
184
191
  links=_extract_links(soup, final_url),
185
192
  elapsed_ms=round(elapsed, 1),
186
193
  )
@@ -0,0 +1,44 @@
1
+ """
2
+ rolling_reader/extractor/clean.py
3
+ ==================================
4
+ 正文提取(Article Extraction)
5
+
6
+ 使用 trafilatura 从 HTML 中识别并提取主体文章内容,
7
+ 过滤导航栏、广告、页脚、侧边栏等噪音。
8
+
9
+ 对比默认的 BeautifulSoup 文本提取:
10
+ 默认:把 <body> 里所有文字全部返回(快,但夹杂噪音)
11
+ --clean:只返回主体文章文字(慢约 50ms,但干净)
12
+ """
13
+
14
+ from __future__ import annotations
15
+ from typing import Optional
16
+
17
+
18
def clean_extract(html: str, url: str = "") -> Optional[str]:
    """Extract the main article text from an HTML document.

    Delegates to ``trafilatura.extract`` with comments excluded, tables
    included and precision favoured over recall.

    Args:
        html: Full HTML document as a string.
        url: Source URL of the page, forwarded to trafilatura as an
            optional hint; an empty string is passed on as ``None``.

    Returns:
        The extracted body text, or ``None`` when ``html`` is blank or
        trafilatura yields no text.

    Raises:
        ImportError: If trafilatura is not installed.
    """
    # Imported inside the function so a missing install surfaces as the
    # actionable message below; ``from err`` keeps the original failure
    # attached as the direct cause.
    try:
        import trafilatura
    except ImportError as err:
        raise ImportError(
            "trafilatura is required for --clean mode: pip install trafilatura"
        ) from err

    # Nothing to extract from blank input; skip the parser entirely.
    if not html or not html.strip():
        return None

    # ``no_fallback`` is intentionally not passed: its default (False)
    # already allows the fallback algorithms, and trafilatura documents the
    # keyword as being deprecated in favour of ``fast``.
    text = trafilatura.extract(
        html,
        url=url or None,
        include_comments=False,
        include_tables=True,
        favor_precision=True,
    )
    # Normalise an empty-string result to None so callers can test truthiness.
    return text or None
@@ -182,6 +182,7 @@ async def extract(
182
182
  timeout: float = 15.0,
183
183
  headers: Optional[dict] = None,
184
184
  client: Optional[httpx.AsyncClient] = None,
185
+ clean: bool = False,
185
186
  ) -> ExtractResult:
186
187
  """
187
188
  Level 1 HTTP 抓取。
@@ -219,9 +220,16 @@ async def extract(
219
220
  # 解析内容
220
221
  soup = BeautifulSoup(response.text, "html.parser")
221
222
  title = _extract_title(soup)
222
- text = _extract_text(BeautifulSoup(response.text, "html.parser")) # 用新 soup 避免修改影响
223
223
  links = _extract_links(soup, str(response.url))
224
224
 
225
+ # --clean 模式:用 trafilatura 替换 BeautifulSoup 文本提取
226
+ if clean:
227
+ from rolling_reader.extractor.clean import clean_extract
228
+ cleaned = clean_extract(response.text, url=str(response.url))
229
+ text = cleaned if cleaned else _extract_text(soup)
230
+ else:
231
+ text = _extract_text(BeautifulSoup(response.text, "html.parser"))
232
+
225
233
  return ExtractResult(
226
234
  url=str(response.url),
227
235
  level=1,
File without changes