rolling-reader 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,116 @@
1
+ Metadata-Version: 2.4
2
+ Name: rolling-reader
3
+ Version: 0.3.0
4
+ Summary: Local-first web scraper that automatically rolls through HTTP → browser → JS state extraction
5
+ License: MIT
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: beautifulsoup4>=4.14
8
+ Requires-Dist: httpx>=0.28
9
+ Requires-Dist: playwright>=1.44
10
+ Requires-Dist: typer>=0.12
11
+ Description-Content-Type: text/markdown
12
+
13
+ # rolling-reader
14
+
15
+ Local-first web scraper that automatically rolls through HTTP → browser → JS state extraction.
16
+
17
+ ## Install
18
+
19
+ ```bash
20
+ pip install rolling-reader
21
+ playwright install chromium # required for Level 2 / Level 3
22
+ ```
23
+
24
+ Python 3.11+. No Node.js required.
25
+
26
+ ## Quick start
27
+
28
+ **Static page** (Level 1, no browser needed):
29
+
30
+ ```bash
31
+ rr https://news.ycombinator.com/
32
+ ```
33
+
34
+ **SPA or login-required page** (Level 2, reuses your existing Chrome session):
35
+
36
+ ```bash
37
+ # 1. Start Chrome with remote debugging enabled (see section below)
38
+ # 2. Run the command — Level 2 is selected automatically
39
+ rr https://app.example.com/dashboard
40
+ ```
41
+
42
+ **Output as Markdown:**
43
+
44
+ ```bash
45
+ rr https://example.com --output md
46
+ ```
47
+
48
+ ## How it works
49
+
50
+ | Level | Trigger | Speed |
51
+ |-------|---------|-------|
52
+ | 1 HTTP | Standard SSR page, no JS rendering needed | ~500 ms |
53
+ | 2 CDP | SPA, JS rendering required, or auth-gated | ~3 s |
54
+ | 3 JS State | Next.js / Nuxt / Redux / Remix state variable detected | ~1 s (3–4x faster than Level 2 DOM parse) |
55
+
56
+ The dispatcher probes each level in order and stops at the first one that returns usable content. Level 3 is attempted after Level 2 attaches to the browser — if a known JS state variable is found, DOM parsing is skipped entirely.
57
+
58
+ ## Starting Chrome for Level 2 / Level 3
59
+
60
+ Chrome must be running with remote debugging before invoking Level 2 or Level 3:
61
+
62
+ ```bash
63
+ # macOS
64
+ open -a "Google Chrome" --args --remote-debugging-port=9222
65
+
66
+ # Windows
67
+ chrome --remote-debugging-port=9222
68
+
69
+ # Linux
70
+ google-chrome --remote-debugging-port=9222
71
+ ```
72
+
73
+ The existing Chrome session (including cookies and local storage) is reused — no separate login step required.
74
+
75
+ ## CLI options
76
+
77
+ | Flag | Values | Description |
78
+ |------|--------|-------------|
79
+ | `--output` | `json`, `md` | Output format (default: plain text) |
80
+ | `--force-level` | `1`, `2`, `3` | Skip auto-detection, force a specific level |
81
+ | `--json-path` | dot-notation string | Extract a nested key from JSON output, e.g. `title` or `props.pageProps` |
82
+ | `--no-cache` | — | Disable response cache |
83
+ | `--cdp` | URL | Chrome DevTools endpoint for Level 2 / Level 3 (default: `http://localhost:9222`) |
84
+ | `--verbose` | — | Print level selection reasoning and timing |
85
+
86
+ ## Why not X
87
+
88
+ | Tool | Limitation |
89
+ |------|-----------|
90
+ | **Scrapling** | Cannot reuse an existing logged-in Chrome session; no JS state extraction |
91
+ | **Firecrawl** | Cloud API — data leaves your machine, metered pricing |
92
+ | **Jina Reader** | Cloud API — data leaves your machine, metered pricing |
93
+ | **rolling-reader** | Fully local, reuses your Chrome session and cookies, free forever |
94
+
95
+ ## Supported JS state variables (v0.3)
96
+
97
+ The following `window.*` variables are probed automatically for Level 3 extraction:
98
+
99
+ - `window.__NEXT_DATA__` — Next.js (Vercel ecosystem)
100
+ - `window.__NUXT__` — Nuxt.js
101
+ - `window.__PRELOADED_STATE__` — Redux / custom
102
+ - `window.__INITIAL_STATE__` — various frameworks
103
+ - `window.__REDUX_STATE__` — Redux explicit naming
104
+ - `window.__APP_STATE__` — various frameworks
105
+ - `window.__STATE__` — generic
106
+ - `window.__STORE__` — MobX / custom
107
+ - `window.APP_STATE` — no-underscore variant
108
+ - `window.initialState` — camelCase variant
109
+ - `window.__remixContext` — Remix
110
+ - `window.__staticRouterHydrationData` — React Router v6 SSR
111
+
112
+ Unknown variables matching the pattern `window.VAR = {…}` are also detected via regex scan.
113
+
114
+ ## License
115
+
116
+ MIT
@@ -0,0 +1,104 @@
1
+ # rolling-reader
2
+
3
+ Local-first web scraper that automatically rolls through HTTP → browser → JS state extraction.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install rolling-reader
9
+ playwright install chromium # required for Level 2 / Level 3
10
+ ```
11
+
12
+ Python 3.11+. No Node.js required.
13
+
14
+ ## Quick start
15
+
16
+ **Static page** (Level 1, no browser needed):
17
+
18
+ ```bash
19
+ rr https://news.ycombinator.com/
20
+ ```
21
+
22
+ **SPA or login-required page** (Level 2, reuses your existing Chrome session):
23
+
24
+ ```bash
25
+ # 1. Start Chrome with remote debugging enabled (see section below)
26
+ # 2. Run the command — Level 2 is selected automatically
27
+ rr https://app.example.com/dashboard
28
+ ```
29
+
30
+ **Output as Markdown:**
31
+
32
+ ```bash
33
+ rr https://example.com --output md
34
+ ```
35
+
36
+ ## How it works
37
+
38
+ | Level | Trigger | Speed |
39
+ |-------|---------|-------|
40
+ | 1 HTTP | Standard SSR page, no JS rendering needed | ~500 ms |
41
+ | 2 CDP | SPA, JS rendering required, or auth-gated | ~3 s |
42
+ | 3 JS State | Next.js / Nuxt / Redux / Remix state variable detected | ~1 s (3–4x faster than Level 2 DOM parse) |
43
+
44
+ The dispatcher probes each level in order and stops at the first one that returns usable content. Level 3 is attempted after Level 2 attaches to the browser — if a known JS state variable is found, DOM parsing is skipped entirely.
45
+
46
+ ## Starting Chrome for Level 2 / Level 3
47
+
48
+ Chrome must be running with remote debugging before invoking Level 2 or Level 3:
49
+
50
+ ```bash
51
+ # macOS
52
+ open -a "Google Chrome" --args --remote-debugging-port=9222
53
+
54
+ # Windows
55
+ chrome --remote-debugging-port=9222
56
+
57
+ # Linux
58
+ google-chrome --remote-debugging-port=9222
59
+ ```
60
+
61
+ The existing Chrome session (including cookies and local storage) is reused — no separate login step required.
62
+
63
+ ## CLI options
64
+
65
+ | Flag | Values | Description |
66
+ |------|--------|-------------|
67
+ | `--output` | `json`, `md` | Output format (default: plain text) |
68
+ | `--force-level` | `1`, `2`, `3` | Skip auto-detection, force a specific level |
69
+ | `--json-path` | dot-notation string | Extract a nested key from JSON output, e.g. `title` or `props.pageProps` |
70
+ | `--no-cache` | — | Disable response cache |
71
+ | `--cdp` | URL | Chrome DevTools endpoint for Level 2 / Level 3 (default: `http://localhost:9222`) |
72
+ | `--verbose` | — | Print level selection reasoning and timing |
73
+
74
+ ## Why not X
75
+
76
+ | Tool | Limitation |
77
+ |------|-----------|
78
+ | **Scrapling** | Cannot reuse an existing logged-in Chrome session; no JS state extraction |
79
+ | **Firecrawl** | Cloud API — data leaves your machine, metered pricing |
80
+ | **Jina Reader** | Cloud API — data leaves your machine, metered pricing |
81
+ | **rolling-reader** | Fully local, reuses your Chrome session and cookies, free forever |
82
+
83
+ ## Supported JS state variables (v0.3)
84
+
85
+ The following `window.*` variables are probed automatically for Level 3 extraction:
86
+
87
+ - `window.__NEXT_DATA__` — Next.js (Vercel ecosystem)
88
+ - `window.__NUXT__` — Nuxt.js
89
+ - `window.__PRELOADED_STATE__` — Redux / custom
90
+ - `window.__INITIAL_STATE__` — various frameworks
91
+ - `window.__REDUX_STATE__` — Redux explicit naming
92
+ - `window.__APP_STATE__` — various frameworks
93
+ - `window.__STATE__` — generic
94
+ - `window.__STORE__` — MobX / custom
95
+ - `window.APP_STATE` — no-underscore variant
96
+ - `window.initialState` — camelCase variant
97
+ - `window.__remixContext` — Remix
98
+ - `window.__staticRouterHydrationData` — React Router v6 SSR
99
+
100
+ Unknown variables matching the pattern `window.VAR = {…}` are also detected via regex scan.
101
+
102
+ ## License
103
+
104
+ MIT
@@ -4,8 +4,10 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "rolling-reader"
7
- version = "0.2.0"
8
- description = "Local-first CLI web scraper with automatic strategy selection"
7
+ version = "0.3.0"
8
+ description = "Local-first web scraper that automatically rolls through HTTP → browser → JS state extraction"
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
9
11
  requires-python = ">=3.11"
10
12
  dependencies = [
11
13
  "httpx>=0.28",
@@ -0,0 +1,162 @@
1
+ """
2
+ rolling_reader/cli.py
3
+ =================
4
+ CLI 入口(typer)
5
+
6
+ 用法:
7
+ rr <url>
8
+ rr <url> --output md
9
+ rr <url> --force-level 2
10
+ rr <url> --json-path props.pageProps
11
+ rr <url> --no-cache --verbose
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import asyncio
17
+ import json
18
+ import sys
19
+ from enum import Enum
20
+ from typing import Optional
21
+
22
+ import typer
23
+
24
+ from rolling_reader.dispatcher import dispatch
25
+ from rolling_reader.models import ExtractionError
26
+
27
+ app = typer.Typer(
28
+ name="rr",
29
+ help="Local-first web scraper — automatically selects HTTP, CDP, or JS state extraction.",
30
+ add_completion=False,
31
+ )
32
+
33
+
34
+ class OutputFormat(str, Enum):
35
+ json = "json"
36
+ md = "md"
37
+
38
+
39
+ def _resolve_json_path(data: dict, path: str):
40
+ """
41
+ 按 dot-notation 路径从字典中取值。
42
+ 例:path="props.pageProps.title" → data["props"]["pageProps"]["title"]
43
+ 路径不存在时返回 None。
44
+ """
45
+ current = data
46
+ for key in path.split("."):
47
+ if isinstance(current, dict) and key in current:
48
+ current = current[key]
49
+ else:
50
+ return None
51
+ return current
52
+
53
+
54
+ @app.command()
55
+ def scrape(
56
+ url: str = typer.Argument(..., help="Target URL to scrape"),
57
+ output: OutputFormat = typer.Option(
58
+ OutputFormat.json,
59
+ "--output", "-o",
60
+ help="Output format: json (default) or md (markdown)",
61
+ ),
62
+ force_level: Optional[int] = typer.Option(
63
+ None,
64
+ "--force-level", "-l",
65
+ help="Force a specific extraction level (1=HTTP, 2=CDP, 3=JS state)",
66
+ min=1,
67
+ max=3,
68
+ ),
69
+ json_path: Optional[str] = typer.Option(
70
+ None,
71
+ "--json-path", "-p",
72
+ help=(
73
+ "Dot-notation path into the result JSON. "
74
+ "Top-level keys: url, level, title, text, links, elapsed_ms. "
75
+ "For Level 3 JS state results, dig into nested data, e.g. --json-path text"
76
+ ),
77
+ ),
78
+ no_cache: bool = typer.Option(
79
+ False,
80
+ "--no-cache",
81
+ help="Bypass profile cache and always re-explore the best strategy",
82
+ ),
83
+ cdp_endpoint: str = typer.Option(
84
+ "http://localhost:9222",
85
+ "--cdp",
86
+ help="Chrome DevTools endpoint (default: http://localhost:9222)",
87
+ ),
88
+ verbose: bool = typer.Option(
89
+ False,
90
+ "--verbose", "-v",
91
+ help="Print escalation steps to stderr",
92
+ ),
93
+ ) -> None:
94
+ """Scrape a URL and output structured data."""
95
+
96
+ try:
97
+ result = asyncio.run(
98
+ dispatch(
99
+ url,
100
+ force_level=force_level,
101
+ cdp_endpoint=cdp_endpoint,
102
+ verbose=verbose,
103
+ use_cache=not no_cache,
104
+ )
105
+ )
106
+ except ExtractionError as e:
107
+ _print_error(e)
108
+ raise typer.Exit(code=1)
109
+ except KeyboardInterrupt:
110
+ raise typer.Exit(code=130)
111
+
112
+ # ── --json-path:从结果中提取指定字段 ──────────────────────────────────
113
+ if json_path:
114
+ value = _resolve_json_path(result.to_dict(), json_path)
115
+ if value is None:
116
+ typer.echo(
117
+ f"Error: path '{json_path}' not found in result. "
118
+ f"Available top-level keys: {', '.join(result.to_dict().keys())}",
119
+ err=True,
120
+ )
121
+ raise typer.Exit(code=1)
122
+ # 字符串直接输出,其他类型序列化为 JSON
123
+ if isinstance(value, str):
124
+ typer.echo(value)
125
+ else:
126
+ typer.echo(json.dumps(value, ensure_ascii=False, indent=2))
127
+ return
128
+
129
+ # ── 正常输出 ────────────────────────────────────────────────────────────
130
+ if output == OutputFormat.json:
131
+ typer.echo(result.to_json())
132
+ else:
133
+ typer.echo(result.to_markdown())
134
+
135
+
136
+ def _print_error(e: ExtractionError) -> None:
137
+ """格式化错误输出,给出可行动的提示。"""
138
+ reason = e.reason or str(e)
139
+
140
+ # Chrome 未启动
141
+ if "Chrome is not available" in reason or "Cannot connect to Chrome" in reason:
142
+ typer.echo(
143
+ "\nError: Chrome is not running with remote debugging enabled.\n\n"
144
+ "Start Chrome first:\n"
145
+ " macOS: open -a 'Google Chrome' --args --remote-debugging-port=9222\n"
146
+ " Windows: chrome --remote-debugging-port=9222\n"
147
+ " Linux: google-chrome --remote-debugging-port=9222\n",
148
+ err=True,
149
+ )
150
+ return
151
+
152
+ # 超时
153
+ if "timeout" in reason.lower():
154
+ typer.echo(
155
+ f"\nError: Request timed out — {reason}\n\n"
156
+ "Try: rr <url> --force-level 2 (use Chrome for slow-loading pages)\n",
157
+ err=True,
158
+ )
159
+ return
160
+
161
+ # 通用
162
+ typer.echo(f"\nError: {reason}\n", err=True)
@@ -1,9 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: rolling-reader
3
- Version: 0.2.0
4
- Summary: Local-first CLI web scraper with automatic strategy selection
5
- Requires-Python: >=3.11
6
- Requires-Dist: beautifulsoup4>=4.14
7
- Requires-Dist: httpx>=0.28
8
- Requires-Dist: playwright>=1.44
9
- Requires-Dist: typer>=0.12
@@ -1,83 +0,0 @@
1
- """
2
- rolling_reader/cli.py
3
- =================
4
- CLI 入口(typer)
5
-
6
- 用法:
7
- scrape <url>
8
- scrape <url> --output md
9
- scrape <url> --force-level 2
10
- scrape <url> --verbose
11
- """
12
-
13
- from __future__ import annotations
14
-
15
- import asyncio
16
- import sys
17
- from enum import Enum
18
- from typing import Optional
19
-
20
- import typer
21
-
22
- from rolling_reader.dispatcher import dispatch
23
- from rolling_reader.models import ExtractionError
24
-
25
- app = typer.Typer(
26
- name="scrape",
27
- help="Local-first web scraper with automatic strategy selection.",
28
- add_completion=False,
29
- )
30
-
31
-
32
- class OutputFormat(str, Enum):
33
- json = "json"
34
- md = "md"
35
-
36
-
37
- @app.command()
38
- def scrape(
39
- url: str = typer.Argument(..., help="Target URL to scrape"),
40
- output: OutputFormat = typer.Option(
41
- OutputFormat.json,
42
- "--output", "-o",
43
- help="Output format: json (default) or md (markdown)",
44
- ),
45
- force_level: Optional[int] = typer.Option(
46
- None,
47
- "--force-level", "-l",
48
- help="Force a specific extraction level (1=HTTP, 2=CDP)",
49
- min=1,
50
- max=2,
51
- ),
52
- cdp_endpoint: str = typer.Option(
53
- "http://localhost:9222",
54
- "--cdp",
55
- help="Chrome DevTools endpoint",
56
- ),
57
- verbose: bool = typer.Option(
58
- False,
59
- "--verbose", "-v",
60
- help="Print escalation steps to stderr",
61
- ),
62
- ) -> None:
63
- """Scrape a URL and output structured data."""
64
-
65
- try:
66
- result = asyncio.run(
67
- dispatch(
68
- url,
69
- force_level=force_level,
70
- cdp_endpoint=cdp_endpoint,
71
- verbose=verbose,
72
- )
73
- )
74
- except ExtractionError as e:
75
- typer.echo(f"Error: {e.reason}", err=True)
76
- raise typer.Exit(code=1)
77
- except KeyboardInterrupt:
78
- raise typer.Exit(code=130)
79
-
80
- if output == OutputFormat.json:
81
- typer.echo(result.to_json())
82
- else:
83
- typer.echo(result.to_markdown())