rolling-reader 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
"""rolling-reader — local-first web scraper with automatic strategy selection."""

# Fixed: the docstring previously referred to "ScrapeKit", which is not
# the name this package ships under (distribution: rolling-reader).
__version__ = "0.1.0"
@@ -0,0 +1,3 @@
1
+ from rolling_reader.cache import profile
2
+
3
+ __all__ = ["profile"]
@@ -0,0 +1,172 @@
1
+ """
2
+ rolling_reader/cache/profile.py
3
+ ===========================
4
+ Profile Cache — 按 domain 缓存最优抓取策略。
5
+
6
+ 作用:
7
+ 第一次访问某站点时,rolling-reader 会探索最优策略(Level 1 / 2 / 3)。
8
+ 成功后,将"配方"写入本地 JSON 文件。
9
+ 后续请求直接跳到已知的最优层级,跳过探索开销。
10
+
11
+ 存储位置:~/.rolling-reader/profiles/<domain>.json
12
+
13
+ 配方格式:
14
+ {
15
+ "domain": "sportsbet.com.au",
16
+ "preferred_level": 3,
17
+ "state_var": "window.__PRELOADED_STATE__", // Level 3 专用
18
+ "discovered_at": "2026-04-16T00:00:00Z",
19
+ "last_success": "2026-04-16T00:08:00Z",
20
+ "success_count": 12,
21
+ "failure_count": 0
22
+ }
23
+
24
+ v0.1 范围:
25
+ - domain 级别匹配(保守,安全)
26
+ - 30 天过期
27
+ - 提取失败时立即失效,触发重新探索
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ import json
33
+ import os
34
+ from datetime import datetime, timezone, timedelta
35
+ from pathlib import Path
36
+ from typing import Optional
37
+ from urllib.parse import urlparse
38
+
39
+ # 缓存目录
40
+ CACHE_DIR = Path.home() / ".rolling-reader" / "profiles"
41
+
42
+ # 配方有效期(天)
43
+ STALE_DAYS = 30
44
+
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # 工具函数
48
+ # ---------------------------------------------------------------------------
49
+
50
+ def _domain(url: str) -> str:
51
+ """从 URL 提取 domain(不含 www.)。"""
52
+ parsed = urlparse(url)
53
+ host = parsed.netloc or parsed.path
54
+ # 去掉端口
55
+ host = host.split(":")[0]
56
+ # 去掉 www.
57
+ if host.startswith("www."):
58
+ host = host[4:]
59
+ return host.lower()
60
+
61
+
62
def _profile_path(domain: str) -> Path:
    """Return the on-disk JSON file that stores *domain*'s recipe."""
    filename = f"{domain}.json"
    return CACHE_DIR / filename
64
+
65
+
66
+ def _now_iso() -> str:
67
+ return datetime.now(timezone.utc).isoformat()
68
+
69
+
70
+ # ---------------------------------------------------------------------------
71
+ # 公开 API
72
+ # ---------------------------------------------------------------------------
73
+
74
def load(url: str) -> Optional[dict]:
    """
    Read the cached recipe for *url*'s domain.

    Returns:
        The recipe dict, or None on a cache miss, an expired entry, a
        corrupt file, or an entry cleared by invalidate().
    """
    domain = _domain(url)
    path = _profile_path(domain)

    if not path.exists():
        return None

    try:
        with open(path, encoding="utf-8") as f:
            profile = json.load(f)
    except (json.JSONDecodeError, OSError):
        return None

    # invalidate() clears discovered_at to mark a recipe stale. The
    # original code skipped the expiry check for falsy values and
    # returned the profile anyway, defeating invalidation — a missing or
    # empty discovered_at is now treated as a miss.
    discovered = profile.get("discovered_at", "")
    if not discovered:
        return None

    # Expiry check: recipes older than STALE_DAYS are discarded.
    try:
        dt = datetime.fromisoformat(discovered)
        if dt.tzinfo is None:
            # A naive timestamp would make the aware subtraction below
            # raise TypeError (previously uncaught) — assume UTC.
            dt = dt.replace(tzinfo=timezone.utc)
        if datetime.now(timezone.utc) - dt > timedelta(days=STALE_DAYS):
            path.unlink(missing_ok=True)
            return None
    except ValueError:
        # Unparseable timestamp: keep the recipe rather than crash.
        pass

    return profile
105
+
106
+
107
def save(url: str, result_level: int, state_var: Optional[str] = None) -> None:
    """
    Create or update the cached recipe for *url*'s domain.

    Args:
        url: URL that was successfully scraped.
        result_level: extraction level that actually succeeded (1 / 2 / 3).
        state_var: JS variable name used for Level 3, if any.
    """
    domain = _domain(url)
    path = _profile_path(domain)
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    # Load any existing record so the counters keep accumulating.
    existing: dict = {}
    if path.exists():
        try:
            with open(path, encoding="utf-8") as f:
                existing = json.load(f)
        except (json.JSONDecodeError, OSError):
            existing = {}

    now = _now_iso()
    profile = {
        "domain": domain,
        "preferred_level": result_level,
        "state_var": state_var,
        # `or now` re-stamps a recipe whose discovered_at was cleared by
        # invalidate(); the original kept the empty marker forever.
        "discovered_at": existing.get("discovered_at", now) or now,
        "last_success": now,
        "success_count": existing.get("success_count", 0) + 1,
        "failure_count": existing.get("failure_count", 0),
    }

    # Write-then-rename so a crash mid-write never leaves a truncated
    # JSON file behind (load() would treat that as corrupt and miss).
    tmp_path = path.with_suffix(".json.tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(profile, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, path)
142
+
143
+
144
def invalidate(url: str) -> None:
    """
    Called after an extraction failure: mark the cached recipe stale so
    the next load() misses and triggers re-exploration.

    History (success / failure counters) is preserved; only the
    freshness marker is rewound.
    """
    domain = _domain(url)
    path = _profile_path(domain)
    if not path.exists():
        return
    try:
        with open(path, encoding="utf-8") as f:
            profile = json.load(f)
        profile["failure_count"] = profile.get("failure_count", 0) + 1
        # The original set discovered_at to "" here, but load() skips
        # its expiry check for falsy values and would happily return the
        # profile again. The epoch is unambiguously older than
        # STALE_DAYS, so load() reliably discards the recipe.
        profile["discovered_at"] = "1970-01-01T00:00:00+00:00"
        with open(path, "w", encoding="utf-8") as f:
            json.dump(profile, f, ensure_ascii=False, indent=2)
    except (json.JSONDecodeError, OSError):
        # Unreadable file: dropping it entirely also forces re-exploration.
        path.unlink(missing_ok=True)
159
+
160
+
161
def list_profiles() -> list[dict]:
    """Return every cached recipe (backs the `scrape profile list` command, Phase 2)."""
    if not CACHE_DIR.exists():
        return []
    results: list[dict] = []
    for candidate in sorted(CACHE_DIR.glob("*.json")):
        try:
            text = candidate.read_text(encoding="utf-8")
            results.append(json.loads(text))
        except (json.JSONDecodeError, OSError):
            # Skip unreadable or corrupt entries; listing is best-effort.
            continue
    return results
rolling_reader/cli.py ADDED
@@ -0,0 +1,83 @@
1
+ """
2
+ rolling_reader/cli.py
3
+ =================
4
+ CLI 入口(typer)
5
+
6
+ 用法:
7
+ scrape <url>
8
+ scrape <url> --output md
9
+ scrape <url> --force-level 2
10
+ scrape <url> --verbose
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import asyncio
16
+ import sys
17
+ from enum import Enum
18
+ from typing import Optional
19
+
20
+ import typer
21
+
22
+ from rolling_reader.dispatcher import dispatch
23
+ from rolling_reader.models import ExtractionError
24
+
25
+ app = typer.Typer(
26
+ name="scrape",
27
+ help="Local-first web scraper with automatic strategy selection.",
28
+ add_completion=False,
29
+ )
30
+
31
+
32
class OutputFormat(str, Enum):
    """Output encodings accepted by the --output option."""

    json = "json"  # structured JSON document (default)
    md = "md"      # human-readable markdown
35
+
36
+
37
@app.command()
def scrape(
    url: str = typer.Argument(..., help="Target URL to scrape"),
    output: OutputFormat = typer.Option(
        OutputFormat.json,
        "--output", "-o",
        help="Output format: json (default) or md (markdown)",
    ),
    force_level: Optional[int] = typer.Option(
        None,
        "--force-level", "-l",
        help="Force a specific extraction level (1=HTTP, 2=CDP)",
        min=1,
        max=2,
    ),
    cdp_endpoint: str = typer.Option(
        "http://localhost:9222",
        "--cdp",
        help="Chrome DevTools endpoint",
    ),
    verbose: bool = typer.Option(
        False,
        "--verbose", "-v",
        help="Print escalation steps to stderr",
    ),
) -> None:
    """Scrape a URL and output structured data."""
    try:
        # Run the async dispatcher to completion from this sync entry point.
        result = asyncio.run(
            dispatch(
                url,
                force_level=force_level,
                cdp_endpoint=cdp_endpoint,
                verbose=verbose,
            )
        )
    except ExtractionError as e:
        # Surface the failure reason on stderr and exit non-zero.
        typer.echo(f"Error: {e.reason}", err=True)
        raise typer.Exit(code=1)
    except KeyboardInterrupt:
        # Conventional exit code for SIGINT.
        raise typer.Exit(code=130)

    renderer = result.to_json if output == OutputFormat.json else result.to_markdown
    typer.echo(renderer())
@@ -0,0 +1,158 @@
1
+ """
2
+ rolling_reader/dispatcher.py
3
+ ========================
4
+ 核心调度器:按策略阶梯自动升级。
5
+
6
+ Level 1 → Level 2 → Level 3(v0.1 只支持到 Level 2)
7
+
8
+ 升级条件:
9
+ - Level 1 抛出 NeedsBrowserError → 升级到 Level 2
10
+ - Level 1 抛出 ExtractionError → 升级到 Level 2(网络失败也值得用浏览器重试)
11
+ - --force-level N → 跳过前面的层级
12
+
13
+ 未来扩展:
14
+ - Level 2 → Level 3:检测到 __PRELOADED_STATE__ 等 JS state 变量后升级
15
+ - Profile Cache:命中缓存时直接跳到已知层级
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import asyncio
21
+ from typing import Optional
22
+
23
+ from rolling_reader.models import ExtractResult, NeedsBrowserError, ExtractionError
24
+ from rolling_reader.extractor import http_extract, cdp_extract, is_chrome_available
25
+ from rolling_reader.cache import profile as profile_cache
26
+
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # 主入口
30
+ # ---------------------------------------------------------------------------
31
+
32
+ async def dispatch(
33
+ url: str,
34
+ *,
35
+ force_level: Optional[int] = None,
36
+ cdp_endpoint: str = "http://localhost:9222",
37
+ http_timeout: float = 15.0,
38
+ page_timeout: float = 30.0,
39
+ verbose: bool = False,
40
+ use_cache: bool = True,
41
+ ) -> ExtractResult:
42
+ """
43
+ 自动选择最优抓取策略并执行。
44
+
45
+ Args:
46
+ url: 目标 URL
47
+ force_level: 强制使用指定层级(1/2/3),跳过自动判断
48
+ cdp_endpoint: Chrome 调试端点
49
+ http_timeout: Level 1 HTTP 超时(秒)
50
+ page_timeout: Level 2/3 页面加载超时(秒)
51
+ verbose: 打印升级过程
52
+ use_cache: 是否使用 Profile Cache
53
+
54
+ Returns:
55
+ ExtractResult
56
+
57
+ Raises:
58
+ ExtractionError: 所有层级均失败
59
+ """
60
+ def log(msg: str) -> None:
61
+ if verbose:
62
+ print(f"[rolling-reader] {msg}", flush=True)
63
+
64
+ # ── 强制指定层级 ──────────────────────────────────────────────────────
65
+ if force_level == 1:
66
+ log("forced Level 1 (HTTP)")
67
+ return await http_extract(url, timeout=http_timeout)
68
+
69
+ if force_level in (2, 3):
70
+ log(f"forced Level 2/3 (CDP)")
71
+ return await _try_level2(url, cdp_endpoint, page_timeout, log)
72
+
73
+ # ── Profile Cache:命中时直接跳到已知层级 ─────────────────────────────
74
+ if use_cache:
75
+ cached = profile_cache.load(url)
76
+ if cached:
77
+ preferred = cached.get("preferred_level", 1)
78
+ log(f"cache hit → Level {preferred} for {cached.get('domain')}")
79
+ if preferred == 1:
80
+ try:
81
+ result = await http_extract(url, timeout=http_timeout)
82
+ profile_cache.save(url, result.level)
83
+ return result
84
+ except Exception:
85
+ log("cache: Level 1 failed, invalidating and re-exploring")
86
+ profile_cache.invalidate(url)
87
+ else:
88
+ try:
89
+ result = await _try_level2(url, cdp_endpoint, page_timeout, log)
90
+ profile_cache.save(url, result.level,
91
+ state_var=cached.get("state_var"))
92
+ return result
93
+ except Exception:
94
+ log("cache: Level 2 failed, invalidating and re-exploring")
95
+ profile_cache.invalidate(url)
96
+
97
+ # ── 自动升级探索 ──────────────────────────────────────────────────────
98
+
99
+ # Level 1:HTTP 直取
100
+ log(f"Level 1 → {url}")
101
+ try:
102
+ result = await http_extract(url, timeout=http_timeout)
103
+ log(f"Level 1 succeeded ({result.elapsed_ms:.0f}ms)")
104
+ if use_cache:
105
+ profile_cache.save(url, result.level)
106
+ return result
107
+
108
+ except NeedsBrowserError as e:
109
+ log(f"Level 1 → needs browser ({e.reason}), escalating to Level 2/3")
110
+
111
+ except ExtractionError as e:
112
+ log(f"Level 1 → error ({e.reason}), escalating to Level 2/3")
113
+
114
+ # Level 2/3:CDP + 已有 Chrome(内部自动尝试 Level 3 state 提取)
115
+ result = await _try_level2(url, cdp_endpoint, page_timeout, log)
116
+ if use_cache:
117
+ state_var = None
118
+ if result.level == 3:
119
+ from rolling_reader.extractor.state import KNOWN_STATE_VARS
120
+ state_var = KNOWN_STATE_VARS[0] # v0.1 固定
121
+ profile_cache.save(url, result.level, state_var=state_var)
122
+ return result
123
+
124
+
125
async def _try_level2(
    url: str,
    cdp_endpoint: str,
    page_timeout: float,
    log,
) -> ExtractResult:
    """Run the Level 2 (CDP) path, turning an unreachable Chrome into a clear error."""
    from rolling_reader.extractor.cdp import ChromeNotRunningError

    log(f"Level 2 → {url}")

    # Probe Chrome up front so the user sees an actionable message
    # instead of a low-level connection failure.
    chrome_up = await is_chrome_available(cdp_endpoint)
    if not chrome_up:
        raise ExtractionError(
            url,
            f"Level 1 failed and Chrome is not available at {cdp_endpoint}. "
            "Start Chrome with: chrome --remote-debugging-port=9222",
        )

    try:
        extracted = await cdp_extract(
            url,
            cdp_endpoint=cdp_endpoint,
            page_timeout=page_timeout,
        )
    except (ChromeNotRunningError, ExtractionError):
        # Already meaningful — propagate untouched.
        raise
    except Exception as e:
        # Anything else gets wrapped so callers only see ExtractionError.
        raise ExtractionError(url, f"Level 2 unexpected error: {e}") from e

    log(f"Level 2 succeeded ({extracted.elapsed_ms:.0f}ms)")
    return extracted
@@ -0,0 +1,4 @@
1
+ from rolling_reader.extractor.http import extract as http_extract, needs_browser
2
+ from rolling_reader.extractor.cdp import extract as cdp_extract, is_chrome_available
3
+
4
+ __all__ = ["http_extract", "needs_browser", "cdp_extract", "is_chrome_available"]
@@ -0,0 +1,201 @@
1
+ """
2
+ rolling_reader/extractor/cdp.py
3
+ ===========================
4
+ Level 2 — CDP + 已有 Chrome Session(Playwright connect_over_cdp)
5
+
6
+ 核心优势:
7
+ 复用用户已经登录的 Chrome,无需重新认证、无需存储凭据。
8
+ Chrome 需提前以 --remote-debugging-port=9222 启动(见 README)。
9
+
10
+ 流程:
11
+ 1. 连接到 localhost:9222
12
+ 2. 取已有 context(继承登录态 / cookies)
13
+ 3. 新开一个标签页,导航到目标 URL
14
+ 4. 等待页面加载(domcontentloaded + networkidle)
15
+ 5. 提取 HTML,复用 Level 1 的 BeautifulSoup 逻辑
16
+ 6. 关闭标签页,不污染 Chrome 会话
17
+
18
+ 错误处理:
19
+ - Chrome 未启动 → ChromeNotRunningError(清晰提示)
20
+ - 页面加载超时 → ExtractionError
21
+ - 其他 → ExtractionError
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import time
27
+ from typing import Optional
28
+
29
+ from bs4 import BeautifulSoup
30
+
31
+ from rolling_reader.models import ExtractResult, ExtractionError
32
+ from rolling_reader.extractor.http import (
33
+ _extract_title,
34
+ _extract_text,
35
+ _extract_links,
36
+ )
37
+
38
+ # CDP 端口(可通过环境变量覆盖)
39
+ CDP_ENDPOINT = "http://localhost:9222"
40
+
41
+ # 等待策略
42
+ WAIT_UNTIL = "domcontentloaded" # 第一阶段:DOM 就绪
43
+ NETWORK_IDLE_TIMEOUT = 5_000 # ms,等 networkidle 的最长时间(不强制)
44
+
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # 专属异常
48
+ # ---------------------------------------------------------------------------
49
+
50
class ChromeNotRunningError(ExtractionError):
    """
    Raised when Chrome is unreachable over the DevTools protocol
    (i.e. it was not started with --remote-debugging-port=9222).
    The message tells the user exactly how to launch Chrome.
    """

    def __init__(self, endpoint: str = CDP_ENDPOINT):
        message = (
            f"Cannot connect to Chrome at {endpoint}. "
            "Start Chrome with: "
            "chrome --remote-debugging-port=9222 --user-data-dir=/tmp/chrome-debug"
        )
        super().__init__(url="", reason=message)
64
+
65
+
66
+ # ---------------------------------------------------------------------------
67
+ # 公开 API
68
+ # ---------------------------------------------------------------------------
69
+
70
async def extract(
    url: str,
    *,
    cdp_endpoint: str = CDP_ENDPOINT,
    page_timeout: float = 30.0,
    wait_networkidle: bool = True,
) -> ExtractResult:
    """
    Level 2 CDP extraction (with an opportunistic Level 3 attempt).

    Args:
        url: target URL
        cdp_endpoint: Chrome DevTools endpoint, default http://localhost:9222
        page_timeout: page-navigation timeout in seconds
        wait_networkidle: wait for networkidle so SPAs finish rendering

    Returns:
        ExtractResult — level=3 when a known JS state variable was found,
        otherwise level=2 (DOM extraction).

    Raises:
        ChromeNotRunningError: Chrome not running / remote debugging off
        ExtractionError: page load or extraction failed
    """
    try:
        from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
    except ImportError as e:
        raise ExtractionError(url, "playwright not installed: pip install playwright") from e

    t0 = time.perf_counter()

    async with async_playwright() as pw:
        # ── 1. Connect to the already-running Chrome ─────────────────────
        try:
            browser = await pw.chromium.connect_over_cdp(
                cdp_endpoint,
                timeout=5_000,  # 5s connect timeout — fail fast
            )
        except Exception as e:
            # Heuristic string match: Playwright raises a generic error
            # here, so sniff for connection-refused wording.
            err_str = str(e).lower()
            if any(k in err_str for k in ("connection refused", "connect", "econnrefused", "failed to connect")):
                raise ChromeNotRunningError(cdp_endpoint) from e
            raise ExtractionError(url, f"cdp connect error: {e}") from e

        # ── 2. Reuse an existing context (inherits login state / cookies) ─
        if browser.contexts:
            context = browser.contexts[0]
        else:
            # Rare: connected but no context (windowless mode).
            context = await browser.new_context()

        # ── 3. Open a fresh tab for this scrape ───────────────────────────
        page = await context.new_page()

        try:
            # ── 4. Navigate ───────────────────────────────────────────────
            try:
                await page.goto(
                    url,
                    wait_until=WAIT_UNTIL,
                    timeout=page_timeout * 1000,  # Playwright expects ms
                )
            except PlaywrightTimeout as e:
                raise ExtractionError(url, f"page load timeout: {e}") from e
            except Exception as e:
                raise ExtractionError(url, f"navigation error: {e}") from e

            # ── 5. Optionally wait for networkidle (let SPAs render) ──────
            # Playwright docs discourage networkidle for tests (flaky),
            # but for scraping it is the right call: let JS finish before
            # reading the DOM.
            if wait_networkidle:
                try:
                    await page.wait_for_load_state(
                        "networkidle",
                        timeout=NETWORK_IDLE_TIMEOUT,
                    )
                except PlaywrightTimeout:
                    # networkidle timing out is non-fatal — extract what
                    # has rendered so far.
                    pass

            # ── 6. Try Level 3 JS-state extraction (before the tab closes) ─
            from rolling_reader.extractor.state import try_extract_state, state_to_text
            state_var, state_data = await try_extract_state(page)

            # ── 7. Capture the rendered HTML (Level 2 DOM path) ───────────
            html = await page.content()
            final_url = page.url

        finally:
            # Always close the tab — never pollute the user's Chrome session.
            await page.close()

    elapsed = (time.perf_counter() - t0) * 1000

    # ── 8. Level 3: JS state found → return structured JSON directly ──────
    if state_data is not None:
        soup = BeautifulSoup(html, "html.parser")
        return ExtractResult(
            url=final_url,
            level=3,
            status_code=200,  # NOTE(review): CDP path never sees the HTTP status; 200 is assumed
            title=_extract_title(soup),
            text=state_to_text(state_var, state_data),
            links=_extract_links(soup, final_url),
            elapsed_ms=round(elapsed, 1),
        )

    # ── 9. Level 2: fall back to DOM extraction ───────────────────────────
    soup = BeautifulSoup(html, "html.parser")
    return ExtractResult(
        url=final_url,
        level=2,
        status_code=200,  # NOTE(review): assumed — navigation status is not captured
        title=_extract_title(soup),
        # Fresh soup for text: _extract_text decomposes tags, and `soup`
        # is still needed intact for link extraction below.
        text=_extract_text(BeautifulSoup(html, "html.parser")),
        links=_extract_links(soup, final_url),
        elapsed_ms=round(elapsed, 1),
    )
187
+
188
+
189
+ # ---------------------------------------------------------------------------
190
+ # 工具:检查 Chrome 是否可连接
191
+ # ---------------------------------------------------------------------------
192
+
193
async def is_chrome_available(cdp_endpoint: str = CDP_ENDPOINT) -> bool:
    """Quickly probe whether Chrome's DevTools endpoint answers at *cdp_endpoint*."""
    import httpx

    version_url = f"{cdp_endpoint}/json/version"
    try:
        async with httpx.AsyncClient(timeout=2.0) as client:
            response = await client.get(version_url)
    except Exception:
        # Any failure (refused, timeout, DNS) means "not available".
        return False
    return response.status_code == 200
@@ -0,0 +1,243 @@
1
+ """
2
+ rolling_reader/extractor/http.py
3
+ ============================
4
+ Level 1 — HTTP 直取(httpx + beautifulsoup4)
5
+
6
+ 核心职责:
7
+ 1. 发起 HTTP 请求
8
+ 2. 通过 needs_browser() 判断是否需要升级
9
+ 3. 提取 title、正文、链接
10
+ 4. 返回 ExtractResult,或 raise NeedsBrowserError
11
+
12
+ needs_browser() 版本:V3(经过 50+ URL 验证,准确率 96%)
13
+ 关键改进:
14
+ - 检查前剥离 <noscript>,避免 PyPI 类误报
15
+ - 小页面误判修复:tlen < 200 同时要求 ratio < 0.15
16
+ - 尺寸感知阈值:大页面(>50KB)用 < 0.018,小页面用 < 0.05
17
+ - 4xx 直接升级(Level 1 已失败)
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import time
23
+ from urllib.parse import urljoin, urlparse
24
+ from typing import Optional
25
+
26
+ import httpx
27
+ from bs4 import BeautifulSoup
28
+
29
+ from rolling_reader.models import ExtractResult, NeedsBrowserError, ExtractionError
30
+
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # 请求头(模拟真实 Chrome,减少 bot 拦截)
34
+ # ---------------------------------------------------------------------------
35
+
36
+ DEFAULT_HEADERS = {
37
+ "User-Agent": (
38
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
39
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
40
+ "Chrome/124.0.0.0 Safari/537.36"
41
+ ),
42
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
43
+ "Accept-Language": "en-US,en;q=0.5",
44
+ "Accept-Encoding": "gzip, deflate, br",
45
+ "DNT": "1",
46
+ "Connection": "keep-alive",
47
+ "Upgrade-Insecure-Requests": "1",
48
+ }
49
+
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # needs_browser() — V3
53
+ # ---------------------------------------------------------------------------
54
+
55
def needs_browser(response: httpx.Response) -> tuple[bool, str]:
    """
    Decide whether an HTTP response needs browser rendering.

    Heuristic V3 (per the module header: validated on 50+ URLs, ~96%
    accuracy). Checks run cheapest-first; the first hit wins.

    Returns:
        (needs_browser: bool, reason: str) — reason is "" when no
        browser is needed.
    """
    html = response.text

    # 1. Empty body → API endpoint, not an SPA.
    if len(html) == 0:
        return False, ""

    # 2. 4xx → Level 1 already failed; escalate (Chrome often gets past
    #    bot checks / login walls).
    if response.status_code in (400, 401, 403, 407):
        return True, f"http_{response.status_code}"

    # 3. Very short 2xx body → SPA shell.
    if len(html) < 500:
        return True, "short_response"

    # 4. Parse the HTML and strip <noscript> so functional hints inside
    #    it don't trigger false positives.
    #    Counter-example: PyPI puts "Enable javascript to filter wheels"
    #    inside <noscript>.
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.find_all("noscript"):
        tag.decompose()
    cleaned_html = str(soup).lower()

    # 5. Explicit "JS required" markers and empty SPA mount points.
    js_markers = [
        "enable javascript",
        "you need javascript",
        "javascript is required",
        "javascript is disabled",
        '<div id="app"></div>',
        "<div id='app'></div>",
        '<div id="root"></div>',
        "<div id='root'></div>",
    ]
    for marker in js_markers:
        if marker in cleaned_html:
            return True, f"js_marker:{marker[:30]}"

    # 6. Text-content analysis.
    text_content = soup.get_text(strip=True)
    text_len = len(text_content)
    html_len = len(html)  # denominator uses the raw HTML for consistency
    text_ratio = text_len / max(html_len, 1)

    # 6a. Near-zero ratio → definitely an SPA (Instagram / YouTube style).
    if text_ratio < 0.005:
        return True, f"ratio_near_zero:{text_ratio:.4f}"

    # 6b. Tiny text AND low ratio → SPA shell.
    #     Tuned so example.com (tlen=139, ratio=0.263) does NOT trigger
    #     while Facebook (tlen=111, ratio=0.072) does.
    if text_len < 200 and text_ratio < 0.15:
        return True, f"tiny_shell:tlen={text_len}"

    # 6c. Size-aware ratio threshold.
    #     Big pages naturally have a lower text ratio (lots of markup).
    #     < 0.018 catches Airtable (0.015) / Notion (0.015) / Replit (0.014)
    #     without tripping GitHub (0.019) / BBC (0.031) / PyPI (0.075).
    if html_len > 50_000:
        if text_ratio < 0.018:
            return True, f"large_page_low_ratio:{text_ratio:.4f}"
    else:
        if text_ratio < 0.05:
            return True, f"small_page_low_ratio:{text_ratio:.4f}"

    return False, ""
126
+
127
+
128
+ # ---------------------------------------------------------------------------
129
+ # 内容提取
130
+ # ---------------------------------------------------------------------------
131
+
132
def _extract_title(soup: BeautifulSoup) -> str:
    """Best-effort page title: <title> first, then the first <h1>, else ""."""
    for tag_name in ("title", "h1"):
        node = soup.find(tag_name)
        if node:
            return node.get_text(strip=True)
    return ""
140
+
141
+
142
def _extract_text(soup: BeautifulSoup) -> str:
    """
    Pull the page's main textual content.

    Strips script/style/noscript plus header/footer/nav chrome
    (mutates *soup*), then joins the remaining non-empty lines.
    """
    noise = ["script", "style", "noscript", "header", "footer", "nav"]
    for node in soup.find_all(noise):
        node.decompose()

    # Prefer semantic containers; fall back to <body>, then the whole soup.
    root = soup.find("main") or soup.find("article") or soup.find("body") or soup
    stripped = [raw.strip() for raw in root.get_text(separator="\n").splitlines()]
    return "\n".join(line for line in stripped if line)
155
+
156
+
157
def _extract_links(soup: BeautifulSoup, base_url: str) -> list[str]:
    """Collect every <a href> as a deduplicated absolute http(s) URL, in document order."""
    # A dict doubles as an insertion-ordered set here.
    collected: dict[str, None] = {}
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"].strip()
        if not href:
            continue
        # Skip fragments and non-navigational schemes.
        if href.startswith(("#", "javascript:", "mailto:", "tel:")):
            continue
        absolute = urljoin(base_url, href)
        if urlparse(absolute).scheme in ("http", "https"):
            collected.setdefault(absolute, None)
    return list(collected)
173
+
174
+
175
+ # ---------------------------------------------------------------------------
176
+ # 公开 API
177
+ # ---------------------------------------------------------------------------
178
+
179
async def extract(
    url: str,
    *,
    timeout: float = 15.0,
    headers: Optional[dict] = None,
    client: Optional[httpx.AsyncClient] = None,
) -> ExtractResult:
    """
    Level 1 HTTP extraction.

    Args:
        url: target URL
        timeout: request timeout in seconds (only applies to the
            internally-created client; a caller-supplied *client* keeps
            its own timeout configuration)
        headers: extra request headers, merged over DEFAULT_HEADERS
        client: reusable httpx.AsyncClient (one is created per call otherwise)

    Returns:
        ExtractResult (level=1)

    Raises:
        NeedsBrowserError: the page requires browser rendering
        ExtractionError: the request or parse failed
    """
    merged_headers = {**DEFAULT_HEADERS, **(headers or {})}

    async def _do_request(c: httpx.AsyncClient) -> ExtractResult:
        t0 = time.perf_counter()
        try:
            # Headers are passed per-request so they also apply when the
            # caller supplies their own client — previously they were only
            # set on the internally-created client, silently ignoring the
            # `headers` argument in the reusable-client path.
            response = await c.get(url, headers=merged_headers, follow_redirects=True)
        except httpx.TimeoutException as e:
            raise ExtractionError(url, f"timeout: {e}") from e
        except httpx.RequestError as e:
            raise ExtractionError(url, f"request error: {e}") from e
        elapsed = (time.perf_counter() - t0) * 1000

        # Escalate if the response looks like an SPA shell / block page.
        browser_needed, reason = needs_browser(response)
        if browser_needed:
            raise NeedsBrowserError(url, reason)

        soup = BeautifulSoup(response.text, "html.parser")
        title = _extract_title(soup)
        # _extract_text decomposes tags, so give it its own soup and keep
        # `soup` intact for link extraction.
        text = _extract_text(BeautifulSoup(response.text, "html.parser"))
        links = _extract_links(soup, str(response.url))

        return ExtractResult(
            url=str(response.url),
            level=1,
            status_code=response.status_code,
            title=title,
            text=text,
            links=links,
            elapsed_ms=round(elapsed, 1),
        )

    if client is not None:
        return await _do_request(client)

    async with httpx.AsyncClient(
        headers=merged_headers,
        timeout=httpx.Timeout(timeout),
        follow_redirects=True,
    ) as c:
        return await _do_request(c)
@@ -0,0 +1,66 @@
1
+ """
2
+ rolling_reader/extractor/state.py
3
+ =============================
4
+ Level 3 — JS State 提取
5
+
6
+ 核心思路:
7
+ 现代 SSR 框架(Next.js / Nuxt / 自定义)会在 HTML 里注入完整数据集,
8
+ 以 JavaScript 变量的形式存在(如 window.__PRELOADED_STATE__)。
9
+ 这些数据在 DOM 渲染之前就已存在,可以直接用 page.evaluate() 提取,
10
+ 比 DOM 解析更快、更完整、更稳定。
11
+
12
+ v0.1 范围:
13
+ 只支持 window.__PRELOADED_STATE__(已在 Sportsbet.com.au 生产验证)。
14
+ 其他变量(__NEXT_DATA__ / __NUXT__ 等)是 Phase 2 的扩展点。
15
+
16
+ 与 Level 2 的关系:
17
+ Level 3 复用 Level 2 的 CDP 页面加载流程,
18
+ 在页面加载完成后额外调用 page.evaluate() 尝试提取 state 变量。
19
+ - 成功 → 返回 level=3 结果(结构化 JSON)
20
+ - 失败 → 调用方回退到 Level 2 DOM 提取
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import json
26
+ from typing import Optional, Any
27
+
28
+ # v0.1 支持的 JS state 变量列表(按优先级排序)
29
+ # Phase 2 会扩展这个列表并加入 HAR 自动发现
30
+ KNOWN_STATE_VARS: list[str] = [
31
+ "window.__PRELOADED_STATE__",
32
+ ]
33
+
34
+
35
async def try_extract_state(
    page,  # playwright Page object
    state_vars: Optional[list[str]] = None,
) -> tuple[Optional[str], Optional[Any]]:
    """
    Try to read a known JS state variable from an already-loaded page.

    Args:
        page: Playwright Page already navigated to the target URL
        state_vars: variable names to probe, in priority order
            (defaults to KNOWN_STATE_VARS)

    Returns:
        (var_name, data) for the first variable found,
        (None, None) if none were found.
    """
    # Resolve the default at call time rather than binding the module
    # list as a default argument — avoids the shared-mutable-default
    # pitfall and picks up any runtime additions to KNOWN_STATE_VARS.
    if state_vars is None:
        state_vars = KNOWN_STATE_VARS

    for var in state_vars:
        try:
            # page.evaluate serializes the JS value into a Python object.
            # A missing window.* property evaluates to undefined, which
            # arrives here as None.
            data = await page.evaluate(f"() => {var}")
        except Exception:
            # Variable absent or JS error — try the next candidate.
            continue
        if data is not None:
            return var, data

    return None, None
62
+
63
+
64
def state_to_text(var_name: str, data: Any) -> str:
    """Render a captured JS state object as pretty-printed JSON text.

    *var_name* is currently unused — kept in the signature for a future
    format that labels the source variable.
    """
    rendered = json.dumps(data, ensure_ascii=False, indent=2)
    return rendered
@@ -0,0 +1,96 @@
1
+ """
2
+ rolling_reader/models.py
3
+ ===================
4
+ 共享数据类型:ExtractResult、异常类。
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from dataclasses import dataclass, field
11
+ from datetime import datetime, timezone
12
+ from typing import Optional
13
+
14
+
15
+ # ---------------------------------------------------------------------------
16
+ # 异常
17
+ # ---------------------------------------------------------------------------
18
+
19
class NeedsBrowserError(Exception):
    """Level 1 found the page needs browser rendering — escalate to Level 2."""

    def __init__(self, url: str, reason: str = ""):
        # Keep the raw pieces available to the dispatcher's log messages.
        self.url = url
        self.reason = reason
        message = f"Browser required for {url}: {reason}"
        super().__init__(message)
25
+
26
+
27
class ExtractionError(Exception):
    """An unrecoverable error occurred while scraping."""

    def __init__(self, url: str, reason: str = ""):
        # Expose the raw fields for callers that format their own output.
        self.url = url
        self.reason = reason
        message = f"Extraction failed for {url}: {reason}"
        super().__init__(message)
33
+
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # 抓取结果
37
+ # ---------------------------------------------------------------------------
38
+
39
@dataclass
class ExtractResult:
    """
    Structured result of a single scrape.

    level:
        1 = direct HTTP fetch (httpx)
        2 = CDP + existing Chrome session
        3 = JS state extraction
    """
    url: str
    level: int
    status_code: int
    title: str
    text: str                 # main textual content (BeautifulSoup-extracted)
    links: list[str]          # all page links as absolute URLs
    elapsed_ms: float
    extracted_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    error: Optional[str] = None

    # ── Output formats ────────────────────────────────────────────────────

    def to_dict(self) -> dict:
        """Plain-dict view; key order matches the field order."""
        keys = (
            "url", "level", "status_code", "title", "text",
            "links", "elapsed_ms", "extracted_at", "error",
        )
        return {k: getattr(self, k) for k in keys}

    def to_json(self, indent: int = 2) -> str:
        """Pretty-printed JSON document (non-ASCII kept as-is)."""
        return json.dumps(self.to_dict(), ensure_ascii=False, indent=indent)

    def to_markdown(self) -> str:
        """Human-readable markdown rendering; link list is capped at 50."""
        parts = [
            f"# {self.title or self.url}",
            "",
            f"> URL: {self.url} ",
            f"> Level: {self.level} ",
            f"> Extracted: {self.extracted_at}",
            "",
            "---",
            "",
            self.text,
        ]
        if self.links:
            parts.extend(["", "---", "", "## Links", ""])
            parts.extend(f"- {link}" for link in self.links[:50])
            hidden = len(self.links) - 50
            if hidden > 0:
                parts.append(f"- *(+{hidden} more)*")
        return "\n".join(parts)
@@ -0,0 +1,9 @@
1
+ Metadata-Version: 2.4
2
+ Name: rolling-reader
3
+ Version: 0.1.0
4
+ Summary: Local-first CLI web scraper with automatic strategy selection
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: beautifulsoup4>=4.14
7
+ Requires-Dist: httpx>=0.28
8
+ Requires-Dist: playwright>=1.44
9
+ Requires-Dist: typer>=0.12
@@ -0,0 +1,14 @@
1
+ rolling_reader/__init__.py,sha256=L_5Wy8PyyS832rmG5opWAN6xARD56EJnRcX7Mo328eI,102
2
+ rolling_reader/cli.py,sha256=gZVuciY6B9OdhIYw0lsY7_7NM7398wJB5KEf1xDorQ8,1931
3
+ rolling_reader/dispatcher.py,sha256=t8A4Fpu9r8WuYUPN2XherQBUiB6xWp8LCRgmcwp6cHw,5788
4
+ rolling_reader/models.py,sha256=cEkKkMN_86VHncuWJ2lLGAXyB84LAE2N2yzH-b78m6g,3028
5
+ rolling_reader/cache/__init__.py,sha256=pCXz8q5UOj-FPtA-V9r4twqoC2AGw8L6AZR8Rj2Jq0g,64
6
+ rolling_reader/cache/profile.py,sha256=930L7qteK7BKXXZWyryZWSYuEM5Y-umq1oXrVOyH7V4,5105
7
+ rolling_reader/extractor/__init__.py,sha256=NpwCyAIvs_xSx0dh9oFHx9O4X3kdAC6yZA6wx590d5w,249
8
+ rolling_reader/extractor/cdp.py,sha256=D0QkvGpxNNa6Q4LiBrlK_mSzQPzyY4qIVezJ1Gjg-tU,7901
9
+ rolling_reader/extractor/http.py,sha256=EoFXcfY5RiT7jDNnvqRKWu7jNrX7kvMqVb-lI66gX9g,8110
10
+ rolling_reader/extractor/state.py,sha256=QBWClPf7JpBljMpQphUdTTEhNvpSLgTDvLNZHb7L6hw,2269
11
+ rolling_reader-0.1.0.dist-info/METADATA,sha256=OKeg5OpdT4gp4I9AjLz_7ZlD7GveksrIMWRfns6I0-g,275
12
+ rolling_reader-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
13
+ rolling_reader-0.1.0.dist-info/entry_points.txt,sha256=19sgkCG_BSnvzgSzT9LUIelB24Q-ZfC-eQLfRYEIQqA,46
14
+ rolling_reader-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ rr = rolling_reader.cli:app