rolling-reader 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rolling-reader
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Local-first CLI web scraper with automatic strategy selection
5
5
  Requires-Python: >=3.11
6
6
  Requires-Dist: beautifulsoup4>=4.14
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "rolling-reader"
7
- version = "0.1.0"
7
+ version = "0.2.0"
8
8
  description = "Local-first CLI web scraper with automatic strategy selection"
9
9
  requires-python = ">=3.11"
10
10
  dependencies = [
@@ -0,0 +1,208 @@
1
+ """
2
+ rolling_reader/extractor/state.py
3
+ =============================
4
+ Level 3 — JS State 提取
5
+
6
+ 核心思路:
7
+ 现代 SSR 框架(Next.js / Nuxt / SvelteKit / 自定义)会在 HTML 里注入完整数据集,
8
+ 以 JavaScript 变量的形式存在(如 window.__NEXT_DATA__)。
9
+ 这些数据在 DOM 渲染之前就已存在,可以直接用 page.evaluate() 提取,
10
+ 比 DOM 解析更快、更完整、更稳定。
11
+
12
+ v0.2 改动:
13
+ - 扩展 KNOWN_STATE_VARS(Next.js / Nuxt / Redux 等主流框架)
14
+ - 新增 auto_scan_state():扫 <script> 标签里的未知大 JSON 对象
15
+ - 框架感知提取:__NEXT_DATA__ 自动钻入 props.pageProps
16
+
17
+ 与 Level 2 的关系:
18
+ Level 3 复用 Level 2 的 CDP 页面加载流程,
19
+ 在页面加载完成后额外调用 page.evaluate() 尝试提取 state 变量。
20
+ - 成功 → 返回 level=3 结果(结构化 JSON)
21
+ - 失败 → 调用方回退到 Level 2 DOM 提取
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import json
27
+ import re
28
+ from typing import Optional, Any
29
+
30
# ---------------------------------------------------------------------------
# Known JS state variables (ordered from most to least common)
# ---------------------------------------------------------------------------

# try_extract_state() evaluates these in list order and returns on the first
# usable hit, so the position of an entry is its priority.
KNOWN_STATE_VARS: list[str] = [
    "window.__NEXT_DATA__",                # Next.js (Vercel ecosystem, extremely common)
    "window.__NUXT__",                     # Nuxt.js
    "window.__PRELOADED_STATE__",          # Redux / custom (verified in v0.1)
    "window.__INITIAL_STATE__",            # various frameworks
    "window.__REDUX_STATE__",              # Redux explicit naming
    "window.__APP_STATE__",                # various frameworks
    "window.__STATE__",                    # generic
    "window.__STORE__",                    # MobX / custom
    "window.APP_STATE",                    # variant without underscores
    "window.initialState",                 # camelCase variant
    "window.__remixContext",               # Remix
    "window.__staticRouterHydrationData",  # React Router v6 SSR
]

# Minimum size, in bytes of serialized JSON, that auto_scan accepts as a
# result (filters out small configuration objects).
_AUTO_SCAN_MIN_BYTES = 1_000

# Regex used by auto_scan to find `window.VAR = {...}` / `window.VAR = [...]`
# style assignments (a `:` separator is accepted as well as `=`).
# Only the variable name (group 1) is taken from the match; the data itself is
# fetched through page.evaluate, which avoids the pitfalls of parsing JSON
# with a regex.
_WINDOW_VAR_RE = re.compile(
    r'window\.([A-Za-z_$][A-Za-z0-9_$]*)(?:\s*[=:]\s*)(\{|\[)',
    re.MULTILINE,
)

# ---------------------------------------------------------------------------
# Framework awareness: pick the most valuable subset out of the state
# ---------------------------------------------------------------------------

# Maps a state variable name to the key path that is drilled into after the
# variable has been read.
_FRAMEWORK_EXTRACTORS: dict[str, list[str]] = {
    # Next.js: the core data lives in props.pageProps
    "window.__NEXT_DATA__": ["props", "pageProps"],
    # Nuxt.js: the core data lives in data or state (only `data` is drilled here)
    "window.__NUXT__": ["data"],
    # Remix: the core data lives in state
    "window.__remixContext": ["state", "loaderData"],
}
71
+
72
+
73
def _deep_get(data: Any, path: list[str]) -> Any:
    """Follow ``path`` key by key through nested dicts and return the value.

    If any step cannot be taken (the current node is not a dict, or it lacks
    the key), the original ``data`` is returned unchanged instead of a
    partial result. An empty ``path`` returns ``data`` itself.
    """
    node = data
    for step in path:
        if not isinstance(node, dict) or step not in node:
            # Dead end: hand back the untouched input.
            return data
        node = node[step]
    return node
82
+
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # 公开 API
86
+ # ---------------------------------------------------------------------------
87
+
88
async def try_extract_state(
    page,
    state_vars: list[str] = KNOWN_STATE_VARS,
    auto_scan: bool = True,
) -> tuple[Optional[str], Optional[Any]]:
    """Try to extract a JS state variable from an already-loaded page.

    Strategy:
        1. Evaluate each name in ``state_vars`` in priority order.
        2. If none of them yields usable data and ``auto_scan`` is true,
           delegate to ``auto_scan_state()`` to look for unknown variables.

    A variable counts as a hit only when the final value (after the optional
    framework-specific drill-down) is neither ``None`` nor an empty
    dict/list. Otherwise a page whose ``__NEXT_DATA__`` holds
    ``pageProps: {}`` or ``pageProps: null`` would be reported as a
    successful extraction with nothing in it, and the caller would never
    fall back to another strategy.

    Args:
        page: Playwright Page object already navigated to the target URL.
        state_vars: Variable names to try, in priority order. The list is
            only iterated, never mutated.
        auto_scan: Whether to scan for unknown variables when no known
            variable produced data.

    Returns:
        ``(var_name, data)`` when a variable with usable data was found,
        ``(None, None)`` otherwise.
    """
    # Round one: known variables.
    for var in state_vars:
        try:
            data = await page.evaluate(f"() => {var}")
        except Exception:
            # Deliberate best effort: a failing evaluation for one name must
            # not prevent the remaining names from being tried.
            continue
        if data is None:
            continue

        # Framework awareness: narrow down to the most valuable subset.
        path = _FRAMEWORK_EXTRACTORS.get(var)
        if path is not None:
            data = _deep_get(data, path)

        # The drill-down can land on null or an empty container; that is not
        # a usable result, so keep looking instead of returning it.
        if data is None or (isinstance(data, (dict, list)) and not data):
            continue
        return var, data

    # Round two: automatic scan.
    if auto_scan:
        return await auto_scan_state(page)

    return None, None
127
+
128
+
129
async def auto_scan_state(page) -> tuple[Optional[str], Optional[Any]]:
    """Scan the page HTML for large JSON objects not in KNOWN_STATE_VARS.

    Steps:
        1. Fetch the HTML through ``page.content()``.
        2. Use ``_WINDOW_VAR_RE`` to find assignments of the form
           ``window.VAR = {`` or ``window.VAR = [``.
        3. Drop names already in KNOWN_STATE_VARS and names listed in
           ``_BROWSER_BUILTINS``.
        4. Read each remaining name with ``page.evaluate()`` so the JS engine
           does the deserialization (more reliable than regex-parsing JSON).
        5. Return the first candidate whose serialized JSON is at least
           ``_AUTO_SCAN_MIN_BYTES`` bytes long.

    Returns:
        ``(var_name, data)`` for the first qualifying candidate, otherwise
        ``(None, None)``. ``(None, None)`` is also returned when the HTML
        cannot be fetched.
    """
    try:
        html = await page.content()
    except Exception:
        return None, None

    # Collect candidate names; a dict keeps first-seen order while
    # de-duplicating repeated assignments.
    known = set(KNOWN_STATE_VARS)
    candidates: dict[str, None] = {}
    for found in _WINDOW_VAR_RE.finditer(html):
        ident = found.group(1)
        qualified = f"window.{ident}"
        # Skip known state variables and common non-data globals.
        if qualified in known or ident in _BROWSER_BUILTINS:
            continue
        candidates.setdefault(qualified, None)

    # Probe the candidates one at a time.
    for var in candidates:
        try:
            data = await page.evaluate(f"() => {var}")
            if data is None:
                continue
            size = len(json.dumps(data, ensure_ascii=False).encode())
        except Exception:
            # Evaluation or serialization failed; move on to the next one.
            continue
        # Only sufficiently large objects qualify (filters out small
        # GA/GTM-style configuration objects).
        if size >= _AUTO_SCAN_MIN_BYTES:
            return var, data

    return None, None
177
+
178
+
179
# Deny-list of browser built-in global names (not worth trying to extract).
# auto_scan_state() compares the bare identifier after `window.` against it.
_BROWSER_BUILTINS: frozenset[str] = frozenset({
    "addEventListener", "alert", "atob", "blur", "btoa",
    "clearInterval", "clearTimeout", "close", "closed",
    "confirm", "console", "crypto", "customElements",
    "devicePixelRatio", "dispatchEvent", "document",
    "fetch", "focus", "frameElement", "frames",
    "getComputedStyle", "getSelection", "history",
    "indexedDB", "innerHeight", "innerWidth",
    "length", "localStorage", "location",
    "matchMedia", "moveTo", "name", "navigator",
    "onload", "open", "opener", "origin", "outerHeight",
    "outerWidth", "pageXOffset", "pageYOffset",
    "parent", "performance", "postMessage", "print",
    "prompt", "removeEventListener", "requestAnimationFrame",
    "resizeTo", "screen", "screenLeft", "screenTop",
    "screenX", "screenY", "scroll", "scrollBy",
    "scrollTo", "scrollX", "scrollY", "self",
    "sessionStorage", "setInterval", "setTimeout",
    "speechSynthesis", "status", "stop", "top",
    "visualViewport", "window",
    # Common third-party globals (not data)
    "ga", "gtag", "dataLayer", "fbq", "twq",
    "Intercom", "analytics", "mixpanel", "amplitude",
})
204
+
205
+
206
def state_to_text(var_name: str, data: Any) -> str:
    """Serialize a JS state object to readable, 2-space-indented JSON text.

    Non-ASCII characters are written as-is rather than escaped.
    ``var_name`` is accepted but does not influence the output.
    """
    rendered = json.dumps(data, indent=2, ensure_ascii=False)
    return rendered
@@ -1,66 +0,0 @@
1
- """
2
- rolling_reader/extractor/state.py
3
- =============================
4
- Level 3 — JS State 提取
5
-
6
- 核心思路:
7
- 现代 SSR 框架(Next.js / Nuxt / 自定义)会在 HTML 里注入完整数据集,
8
- 以 JavaScript 变量的形式存在(如 window.__PRELOADED_STATE__)。
9
- 这些数据在 DOM 渲染之前就已存在,可以直接用 page.evaluate() 提取,
10
- 比 DOM 解析更快、更完整、更稳定。
11
-
12
- v0.1 范围:
13
- 只支持 window.__PRELOADED_STATE__(已在 Sportsbet.com.au 生产验证)。
14
- 其他变量(__NEXT_DATA__ / __NUXT__ 等)是 Phase 2 的扩展点。
15
-
16
- 与 Level 2 的关系:
17
- Level 3 复用 Level 2 的 CDP 页面加载流程,
18
- 在页面加载完成后额外调用 page.evaluate() 尝试提取 state 变量。
19
- - 成功 → 返回 level=3 结果(结构化 JSON)
20
- - 失败 → 调用方回退到 Level 2 DOM 提取
21
- """
22
-
23
- from __future__ import annotations
24
-
25
- import json
26
- from typing import Optional, Any
27
-
28
# JS state variables supported in v0.1 (ordered by priority).
# Phase 2 will extend this list and add HAR-based auto-discovery.
KNOWN_STATE_VARS: list[str] = [
    "window.__PRELOADED_STATE__",
]
33
-
34
-
35
async def try_extract_state(
    page,  # Playwright Page object
    state_vars: list[str] = KNOWN_STATE_VARS,
) -> tuple[Optional[str], Optional[Any]]:
    """Try to extract a JS state variable from an already-loaded page.

    Args:
        page: Playwright Page object already navigated to the target URL.
        state_vars: Variable names to try, in priority order.

    Returns:
        ``(var_name, data)`` for the first variable that evaluates to a
        non-``None`` value, ``(None, None)`` if none does.
    """
    for candidate in state_vars:
        try:
            # page.evaluate converts the JS return value to a Python object;
            # a missing variable is undefined in JS and comes back as None.
            value = await page.evaluate(f"() => {candidate}")
        except Exception:
            # Missing variable or JS execution error: try the next name.
            continue
        if value is None:
            continue
        return candidate, value

    return None, None
62
-
63
-
64
def state_to_text(var_name: str, data: Any) -> str:
    """Return ``data`` as readable JSON text (2-space indent, non-ASCII kept).

    ``var_name`` is part of the signature but is not used in the output.
    """
    options = {"ensure_ascii": False, "indent": 2}
    return json.dumps(data, **options)