chrome-scraper 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chrome_scraper-0.1.0/PKG-INFO +268 -0
- chrome_scraper-0.1.0/README.md +252 -0
- chrome_scraper-0.1.0/pyproject.toml +28 -0
- chrome_scraper-0.1.0/src/chrome_scraper/__init__.py +2 -0
- chrome_scraper-0.1.0/src/chrome_scraper/browser_api/__init__.py +0 -0
- chrome_scraper-0.1.0/src/chrome_scraper/browser_api/cli.py +160 -0
- chrome_scraper-0.1.0/src/chrome_scraper/browser_api/client.py +170 -0
- chrome_scraper-0.1.0/src/chrome_scraper/browser_api/server.py +446 -0
- chrome_scraper-0.1.0/src/chrome_scraper/cli_output.py +23 -0
- chrome_scraper-0.1.0/src/chrome_scraper/html_to_md/__init__.py +6 -0
- chrome_scraper-0.1.0/src/chrome_scraper/html_to_md/cli.py +146 -0
- chrome_scraper-0.1.0/src/chrome_scraper/html_to_md/extract.js +121 -0
- chrome_scraper-0.1.0/src/chrome_scraper/html_to_md/extract.py +117 -0
- chrome_scraper-0.1.0/src/chrome_scraper/html_to_md/render.py +504 -0
- chrome_scraper-0.1.0/src/chrome_scraper/py.typed +0 -0
- chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/__init__.py +0 -0
- chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/_fetch_common.py +75 -0
- chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/base.py +254 -0
- chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/google_fetch.py +247 -0
- chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/google_fetch_cli.py +126 -0
- chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/google_search.py +253 -0
- chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/scripts/__init__.py +0 -0
- chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/scripts/google_search_results.js +50 -0
- chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/scripts/xcom_search_results.js +31 -0
- chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/xcom_fetch.py +298 -0
- chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/xcom_fetch_cli.py +108 -0
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: chrome-scraper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Stealth and stateful Chrome scraper
|
|
5
|
+
Author: Mauro Sciancalepore
|
|
6
|
+
Author-email: Mauro Sciancalepore <maurosciancalepore98@gmail.com>
|
|
7
|
+
Requires-Dist: fastapi>=0.115.0
|
|
8
|
+
Requires-Dist: httpx>=0.28.0
|
|
9
|
+
Requires-Dist: jinja2>=3.1.4
|
|
10
|
+
Requires-Dist: patchright>=1.52.0
|
|
11
|
+
Requires-Dist: playwright>=1.59.0
|
|
12
|
+
Requires-Dist: python-multipart>=0.0.9
|
|
13
|
+
Requires-Dist: uvicorn[standard]>=0.30.0
|
|
14
|
+
Requires-Python: >=3.10
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
# chrome-scraper
|
|
18
|
+
|
|
19
|
+
Stealth and stateful Chrome scraper, built for AI agents.
|
|
20
|
+
Shared browser server with profile persistence, plus pre-built CLI scrapers for Google and x.com that dump content as HTML + clean markdown.
|
|
21
|
+
|
|
22
|
+
`chrome-scraper` is a Python package built on [Patchright](https://pypi.org/project/patchright/) (a maintained Playwright fork) that runs a single long-lived Chrome instance behind a FastAPI HTTP API. Clients can connect concurrently, share cookies/sessions via a persistent profile, and avoid the cold-start/teardown cost of per-task browser launch.
|
|
23
|
+
|
|
24
|
+
## Install
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# Core package
|
|
28
|
+
uv add chrome-scraper
|
|
29
|
+
|
|
30
|
+
# Or install from source
|
|
31
|
+
uv sync
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Supports Python 3.10–3.13.
|
|
35
|
+
|
|
36
|
+
## Quickstart
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
# 1. Start the shared browser server (keep running)
|
|
40
|
+
uv run browser-api [--headless]
|
|
41
|
+
|
|
42
|
+
# 2. In another terminal — render a URL to markdown
|
|
43
|
+
html-to-md https://example.com --output out/example.md
|
|
44
|
+
|
|
45
|
+
# 3. Search Google and download results
|
|
46
|
+
google-fetch --query "machine learning" --max-results 3 --num-pages 1 --out-dir data/research/ml
|
|
47
|
+
|
|
48
|
+
# 4. Search x.com and download tweets
|
|
49
|
+
xcom-fetch --query "ai safety" --max-results 10 --out-dir data/research/xcom-ai
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
`browser-api` is a persistent server — start it once, run scrapers against it from multiple terminals or scripts. Chrome keeps cookies, logins, and profile state across sessions.
|
|
53
|
+
|
|
54
|
+
## Architecture
|
|
55
|
+
|
|
56
|
+
```
|
|
57
|
+
┌──────────────────────────────────────────────────────┐
|
|
58
|
+
│ browser-api (port 9333) │
|
|
59
|
+
│ ┌──────────────────────────────────────────────────┐ │
|
|
60
|
+
│ │ FastAPI server │ │
|
|
61
|
+
│ │ /status /tabs /tabs/{id}/goto /eval /type │ │
|
|
62
|
+
│ └──────────┬───────────────────────────────────────┘ │
|
|
63
|
+
│ │ owns │
|
|
64
|
+
│ ┌──────────▼───────────────────────────────────────┐ │
|
|
65
|
+
│ │ Chrome (Patchright persistent context) │ │
|
|
66
|
+
│ │ Profile │ │
|
|
67
|
+
│ │ Tabs: separate label-keyed sandboxes │ │
|
|
68
|
+
│ └──────────────────────────────────────────────────┘ │
|
|
69
|
+
└──────────────────────────────────────────────────────┘
|
|
70
|
+
▲ HTTP ▲ HTTP
|
|
71
|
+
│ │
|
|
72
|
+
┌────────┴──────────┐ ┌──────────────┴──────────────┐
|
|
73
|
+
│ html-to-md │ │ google-fetch / xcom-fetch │
|
|
74
|
+
│ short-lived │ │ short-lived CLIs │
|
|
75
|
+
│ open tab → │ │ open tab → search → │
|
|
76
|
+
│ extract → close │ │ fetch each result → close │
|
|
77
|
+
└───────────────────┘ └─────────────────────────────┘
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
The server patches Patchright's `crBrowser.js` so new tabs open in background — Chrome never steals focus during concurrent scraping.
|
|
81
|
+
|
|
82
|
+
## CLI reference
|
|
83
|
+
|
|
84
|
+
### browser-api
|
|
85
|
+
|
|
86
|
+
Start, stop, and check status of the shared browser server.
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
uv run browser-api # start on :9333 (default)
|
|
90
|
+
uv run browser-api --port 8080 # custom port
|
|
91
|
+
uv run browser-api --headless # headless mode
|
|
92
|
+
uv run browser-api --chrome-path /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome
|
|
93
|
+
uv run browser-api --profile-dir ~/my-chrome-profile
|
|
94
|
+
uv run browser-api --hide # hide Chrome window (macOS)
|
|
95
|
+
uv run browser-api --proxy http://proxy:8080
|
|
96
|
+
uv run browser-api --browser-args="--disable-gpu --no-sandbox"
|
|
97
|
+
uv run browser-api status # check if running
|
|
98
|
+
uv run browser-api stop # shut down
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
Stateful features:
|
|
102
|
+
|
|
103
|
+
- **Persistent profile** — cookies, localStorage, and logins survive restarts. Profile lives at `~/Library/Application Support/thebase/playwright/profile/` on macOS.
|
|
104
|
+
- **Headless identity** — when `--headless` is used, the server automatically probes a throwaway headless instance, strips the `HeadlessChrome` token from the User-Agent, and passes the clean UA to the real persistent context.
|
|
105
|
+
- **Hide macOS window** — `--hide` runs `osascript` to hide Chrome from sight without quitting (non-headless only).
|
|
106
|
+
|
|
107
|
+
### html-to-md
|
|
108
|
+
|
|
109
|
+
Render any URL (or local HTML file) as layout-preserving markdown.
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
# Live URL
|
|
113
|
+
html-to-md https://example.com --output out/page.md
|
|
114
|
+
|
|
115
|
+
# Local HTML file
|
|
116
|
+
html-to-md --from-file page.html --output out/page.md
|
|
117
|
+
|
|
118
|
+
# Print to stdout
|
|
119
|
+
html-to-md https://example.com --output -
|
|
120
|
+
|
|
121
|
+
# Save raw text-node payload alongside markdown
|
|
122
|
+
html-to-md https://example.com --save-items
|
|
123
|
+
|
|
124
|
+
# Skip scroll pass (for already-scrolled SPAs)
|
|
125
|
+
html-to-md https://example.com --no-scroll
|
|
126
|
+
|
|
127
|
+
# Verbose layout diagnostics
|
|
128
|
+
html-to-md https://example.com -v
|
|
129
|
+
|
|
130
|
+
# Custom browser-api URL
|
|
131
|
+
html-to-md https://example.com --browser-api http://localhost:9333
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Rendering preserves multi-column layout, code blocks, headings, links, inline code, and list structure. Sidebar content is separated by a `---` rule. See [`docs/layout.md`](docs/layout.md) for details on the layout-to-markdown algorithm.
|
|
135
|
+
|
|
136
|
+
### google-fetch
|
|
137
|
+
|
|
138
|
+
Search Google and download each result as HTML + markdown.
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
# Basic search, 1 page
|
|
142
|
+
google-fetch --query "quantum computing"
|
|
143
|
+
|
|
144
|
+
# Multi-page with output dir
|
|
145
|
+
google-fetch --query "machine learning transformers" \
|
|
146
|
+
--num-pages 3 --out-dir data/research/transformers
|
|
147
|
+
|
|
148
|
+
# Filter by hostname
|
|
149
|
+
google-fetch --query "python typing" \
|
|
150
|
+
--allowed-hosts docs.python.org peps.python.org
|
|
151
|
+
|
|
152
|
+
# Limit total results
|
|
153
|
+
google-fetch --query "Rust async" --max-results 5
|
|
154
|
+
|
|
155
|
+
# Custom browser-api server
|
|
156
|
+
google-fetch --query "agents" --browser-api http://localhost:9333
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
Output layout:
|
|
160
|
+
|
|
161
|
+
```
|
|
162
|
+
data/research/<query-slug>/<tag>/
|
|
163
|
+
├── results.json # title + URL index
|
|
164
|
+
├── 01-introduction-to.html
|
|
165
|
+
├── 01-introduction-to.md # frontmatter + rendered markdown
|
|
166
|
+
├── 02-advanced-topics.html
|
|
167
|
+
├── 02-advanced-topics.md
|
|
168
|
+
└── ...
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### xcom-fetch
|
|
172
|
+
|
|
173
|
+
Search x.com and download tweets as HTML + markdown.
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
# Keyword search
|
|
177
|
+
xcom-fetch --query "reinforcement learning"
|
|
178
|
+
|
|
179
|
+
# Restrict to one account
|
|
180
|
+
xcom-fetch --query "safety" --from "Anthropic"
|
|
181
|
+
|
|
182
|
+
# Limit results
|
|
183
|
+
xcom-fetch --query "alignment" --max-results 5
|
|
184
|
+
|
|
185
|
+
# Custom output dir
|
|
186
|
+
xcom-fetch --query "scaling laws" --out-dir data/tweets
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
Output layout:
|
|
190
|
+
|
|
191
|
+
```
|
|
192
|
+
data/research/xcom-<query>/
|
|
193
|
+
├── results.json # permalink + author + text snippet
|
|
194
|
+
├── 01-anthropic-12345.html
|
|
195
|
+
├── 01-anthropic-12345.md # frontmatter + rendered markdown
|
|
196
|
+
└── ...
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## Python API
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
from pathlib import Path
|
|
203
|
+
from chrome_scraper.html_to_md import extract_from_url, render_page
|
|
204
|
+
|
|
205
|
+
# Extract text-node payload from a URL
|
|
206
|
+
payload = extract_from_url(
|
|
207
|
+
"https://example.com",
|
|
208
|
+
browser_api_url="http://localhost:9333",
|
|
209
|
+
timeout=30.0,
|
|
210
|
+
scroll=True,
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
# Render to layout-preserving markdown
|
|
214
|
+
items = payload.get("items", [])
|
|
215
|
+
page_width = (payload.get("viewport") or {}).get("scroll_w", 1280)
|
|
216
|
+
md = render_page(items, page_width)
|
|
217
|
+
Path("out/example.md").write_text(md, encoding="utf-8")
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
Or manage the browser lifecycle yourself:
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
from chrome_scraper.browser_api.client import BrowserAPIClient
|
|
224
|
+
from chrome_scraper.html_to_md.extract import extract_page
|
|
225
|
+
|
|
226
|
+
client = BrowserAPIClient(timeout=30.0)
|
|
227
|
+
with client.tab("my-tab"):
|
|
228
|
+
payload = extract_page(
|
|
229
|
+
"https://example.com",
|
|
230
|
+
client,
|
|
231
|
+
tab_ref="my-tab",
|
|
232
|
+
timeout=30.0,
|
|
233
|
+
scroll=True,
|
|
234
|
+
)
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
## At a glance
|
|
238
|
+
|
|
239
|
+
**browser-api** — shared Chrome behind HTTP:
|
|
240
|
+
|
|
241
|
+
- Persistent profile with cookies/logins.
|
|
242
|
+
- Label-keyed tabs for concurrent clients.
|
|
243
|
+
- Tab lifecycle isolated per client (open → use → close).
|
|
244
|
+
- Background-tab patch so Chrome stays out of the way.
|
|
245
|
+
- Headless-mode UA cleaning (strips `HeadlessChrome`).
|
|
246
|
+
- macOS hide support (`--hide`).
|
|
247
|
+
|
|
248
|
+
**html-to-md** — layout-preserving markdown via Chrome CDP:
|
|
249
|
+
|
|
250
|
+
- Extracts every rendered text node with position and styling.
|
|
251
|
+
- Detects columns via x-start histogram peaks.
|
|
252
|
+
- Splits main/sidebar content via widest vertical gutter.
|
|
253
|
+
- Preserves code blocks, headings, links, inline code, lists.
|
|
254
|
+
- Row boundaries computed from *intersecting* column gap sets — long main-column paragraphs stay intact regardless of sidebar density.
|
|
255
|
+
|
|
256
|
+
**google-fetch** — multi-page Google scraping:
|
|
257
|
+
|
|
258
|
+
- Paginates through Google result pages.
|
|
259
|
+
- Visits each result link, dumps outerHTML + rendered markdown.
|
|
260
|
+
- Navigates back to search results after each fetch.
|
|
261
|
+
- Optional hostname filtering and result count limits.
|
|
262
|
+
|
|
263
|
+
**xcom-fetch** — x.com tweet scraping:
|
|
264
|
+
|
|
265
|
+
- Drives x.com's React search UI via native keyboard (Patchright).
|
|
266
|
+
- Virtual list scrolling to populate results.
|
|
267
|
+
- Visits each tweet permalink, dumps HTML + markdown.
|
|
268
|
+
- SPA-safe navigation; falls back to direct URL navigation if anchor click fails.
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
# chrome-scraper
|
|
2
|
+
|
|
3
|
+
Stealth and stateful Chrome scraper, built for AI agents.
|
|
4
|
+
Shared browser server with profile persistence, plus pre-built CLI scrapers for Google and x.com that dump content as HTML + clean markdown.
|
|
5
|
+
|
|
6
|
+
`chrome-scraper` is a Python package built on [Patchright](https://pypi.org/project/patchright/) (a maintained Playwright fork) that runs a single long-lived Chrome instance behind a FastAPI HTTP API. Clients can connect concurrently, share cookies/sessions via a persistent profile, and avoid the cold-start/teardown cost of per-task browser launch.
|
|
7
|
+
|
|
8
|
+
## Install
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
# Core package
|
|
12
|
+
uv add chrome-scraper
|
|
13
|
+
|
|
14
|
+
# Or install from source
|
|
15
|
+
uv sync
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
Supports Python 3.10–3.13.
|
|
19
|
+
|
|
20
|
+
## Quickstart
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
# 1. Start the shared browser server (keep running)
|
|
24
|
+
uv run browser-api [--headless]
|
|
25
|
+
|
|
26
|
+
# 2. In another terminal — render a URL to markdown
|
|
27
|
+
html-to-md https://example.com --output out/example.md
|
|
28
|
+
|
|
29
|
+
# 3. Search Google and download results
|
|
30
|
+
google-fetch --query "machine learning" --max-results 3 --num-pages 1 --out-dir data/research/ml
|
|
31
|
+
|
|
32
|
+
# 4. Search x.com and download tweets
|
|
33
|
+
xcom-fetch --query "ai safety" --max-results 10 --out-dir data/research/xcom-ai
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
`browser-api` is a persistent server — start it once, run scrapers against it from multiple terminals or scripts. Chrome keeps cookies, logins, and profile state across sessions.
|
|
37
|
+
|
|
38
|
+
## Architecture
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
┌──────────────────────────────────────────────────────┐
|
|
42
|
+
│ browser-api (port 9333) │
|
|
43
|
+
│ ┌──────────────────────────────────────────────────┐ │
|
|
44
|
+
│ │ FastAPI server │ │
|
|
45
|
+
│ │ /status /tabs /tabs/{id}/goto /eval /type │ │
|
|
46
|
+
│ └──────────┬───────────────────────────────────────┘ │
|
|
47
|
+
│ │ owns │
|
|
48
|
+
│ ┌──────────▼───────────────────────────────────────┐ │
|
|
49
|
+
│ │ Chrome (Patchright persistent context) │ │
|
|
50
|
+
│ │ Profile │ │
|
|
51
|
+
│ │ Tabs: separate label-keyed sandboxes │ │
|
|
52
|
+
│ └──────────────────────────────────────────────────┘ │
|
|
53
|
+
└──────────────────────────────────────────────────────┘
|
|
54
|
+
▲ HTTP ▲ HTTP
|
|
55
|
+
│ │
|
|
56
|
+
┌────────┴──────────┐ ┌──────────────┴──────────────┐
|
|
57
|
+
│ html-to-md │ │ google-fetch / xcom-fetch │
|
|
58
|
+
│ short-lived │ │ short-lived CLIs │
|
|
59
|
+
│ open tab → │ │ open tab → search → │
|
|
60
|
+
│ extract → close │ │ fetch each result → close │
|
|
61
|
+
└───────────────────┘ └─────────────────────────────┘
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
The server patches Patchright's `crBrowser.js` so new tabs open in background — Chrome never steals focus during concurrent scraping.
|
|
65
|
+
|
|
66
|
+
## CLI reference
|
|
67
|
+
|
|
68
|
+
### browser-api
|
|
69
|
+
|
|
70
|
+
Start, stop, and check status of the shared browser server.
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
uv run browser-api # start on :9333 (default)
|
|
74
|
+
uv run browser-api --port 8080 # custom port
|
|
75
|
+
uv run browser-api --headless # headless mode
|
|
76
|
+
uv run browser-api --chrome-path /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome
|
|
77
|
+
uv run browser-api --profile-dir ~/my-chrome-profile
|
|
78
|
+
uv run browser-api --hide # hide Chrome window (macOS)
|
|
79
|
+
uv run browser-api --proxy http://proxy:8080
|
|
80
|
+
uv run browser-api --browser-args="--disable-gpu --no-sandbox"
|
|
81
|
+
uv run browser-api status # check if running
|
|
82
|
+
uv run browser-api stop # shut down
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Stateful features:
|
|
86
|
+
|
|
87
|
+
- **Persistent profile** — cookies, localStorage, and logins survive restarts. Profile lives at `~/Library/Application Support/thebase/playwright/profile/` on macOS.
|
|
88
|
+
- **Headless identity** — when `--headless` is used, the server automatically probes a throwaway headless instance, strips the `HeadlessChrome` token from the User-Agent, and passes the clean UA to the real persistent context.
|
|
89
|
+
- **Hide macOS window** — `--hide` runs `osascript` to hide Chrome from sight without quitting (non-headless only).
|
|
90
|
+
|
|
91
|
+
### html-to-md
|
|
92
|
+
|
|
93
|
+
Render any URL (or local HTML file) as layout-preserving markdown.
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
# Live URL
|
|
97
|
+
html-to-md https://example.com --output out/page.md
|
|
98
|
+
|
|
99
|
+
# Local HTML file
|
|
100
|
+
html-to-md --from-file page.html --output out/page.md
|
|
101
|
+
|
|
102
|
+
# Print to stdout
|
|
103
|
+
html-to-md https://example.com --output -
|
|
104
|
+
|
|
105
|
+
# Save raw text-node payload alongside markdown
|
|
106
|
+
html-to-md https://example.com --save-items
|
|
107
|
+
|
|
108
|
+
# Skip scroll pass (for already-scrolled SPAs)
|
|
109
|
+
html-to-md https://example.com --no-scroll
|
|
110
|
+
|
|
111
|
+
# Verbose layout diagnostics
|
|
112
|
+
html-to-md https://example.com -v
|
|
113
|
+
|
|
114
|
+
# Custom browser-api URL
|
|
115
|
+
html-to-md https://example.com --browser-api http://localhost:9333
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Rendering preserves multi-column layout, code blocks, headings, links, inline code, and list structure. Sidebar content is separated by a `---` rule. See [`docs/layout.md`](docs/layout.md) for details on the layout-to-markdown algorithm.
|
|
119
|
+
|
|
120
|
+
### google-fetch
|
|
121
|
+
|
|
122
|
+
Search Google and download each result as HTML + markdown.
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
# Basic search, 1 page
|
|
126
|
+
google-fetch --query "quantum computing"
|
|
127
|
+
|
|
128
|
+
# Multi-page with output dir
|
|
129
|
+
google-fetch --query "machine learning transformers" \
|
|
130
|
+
--num-pages 3 --out-dir data/research/transformers
|
|
131
|
+
|
|
132
|
+
# Filter by hostname
|
|
133
|
+
google-fetch --query "python typing" \
|
|
134
|
+
--allowed-hosts docs.python.org peps.python.org
|
|
135
|
+
|
|
136
|
+
# Limit total results
|
|
137
|
+
google-fetch --query "Rust async" --max-results 5
|
|
138
|
+
|
|
139
|
+
# Custom browser-api server
|
|
140
|
+
google-fetch --query "agents" --browser-api http://localhost:9333
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Output layout:
|
|
144
|
+
|
|
145
|
+
```
|
|
146
|
+
data/research/<query-slug>/<tag>/
|
|
147
|
+
├── results.json # title + URL index
|
|
148
|
+
├── 01-introduction-to.html
|
|
149
|
+
├── 01-introduction-to.md # frontmatter + rendered markdown
|
|
150
|
+
├── 02-advanced-topics.html
|
|
151
|
+
├── 02-advanced-topics.md
|
|
152
|
+
└── ...
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### xcom-fetch
|
|
156
|
+
|
|
157
|
+
Search x.com and download tweets as HTML + markdown.
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
# Keyword search
|
|
161
|
+
xcom-fetch --query "reinforcement learning"
|
|
162
|
+
|
|
163
|
+
# Restrict to one account
|
|
164
|
+
xcom-fetch --query "safety" --from "Anthropic"
|
|
165
|
+
|
|
166
|
+
# Limit results
|
|
167
|
+
xcom-fetch --query "alignment" --max-results 5
|
|
168
|
+
|
|
169
|
+
# Custom output dir
|
|
170
|
+
xcom-fetch --query "scaling laws" --out-dir data/tweets
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Output layout:
|
|
174
|
+
|
|
175
|
+
```
|
|
176
|
+
data/research/xcom-<query>/
|
|
177
|
+
├── results.json # permalink + author + text snippet
|
|
178
|
+
├── 01-anthropic-12345.html
|
|
179
|
+
├── 01-anthropic-12345.md # frontmatter + rendered markdown
|
|
180
|
+
└── ...
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Python API
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from pathlib import Path
|
|
187
|
+
from chrome_scraper.html_to_md import extract_from_url, render_page
|
|
188
|
+
|
|
189
|
+
# Extract text-node payload from a URL
|
|
190
|
+
payload = extract_from_url(
|
|
191
|
+
"https://example.com",
|
|
192
|
+
browser_api_url="http://localhost:9333",
|
|
193
|
+
timeout=30.0,
|
|
194
|
+
scroll=True,
|
|
195
|
+
)
|
|
196
|
+
|
|
197
|
+
# Render to layout-preserving markdown
|
|
198
|
+
items = payload.get("items", [])
|
|
199
|
+
page_width = (payload.get("viewport") or {}).get("scroll_w", 1280)
|
|
200
|
+
md = render_page(items, page_width)
|
|
201
|
+
Path("out/example.md").write_text(md, encoding="utf-8")
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
Or manage the browser lifecycle yourself:
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
from chrome_scraper.browser_api.client import BrowserAPIClient
|
|
208
|
+
from chrome_scraper.html_to_md.extract import extract_page
|
|
209
|
+
|
|
210
|
+
client = BrowserAPIClient(timeout=30.0)
|
|
211
|
+
with client.tab("my-tab"):
|
|
212
|
+
payload = extract_page(
|
|
213
|
+
"https://example.com",
|
|
214
|
+
client,
|
|
215
|
+
tab_ref="my-tab",
|
|
216
|
+
timeout=30.0,
|
|
217
|
+
scroll=True,
|
|
218
|
+
)
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
## At a glance
|
|
222
|
+
|
|
223
|
+
**browser-api** — shared Chrome behind HTTP:
|
|
224
|
+
|
|
225
|
+
- Persistent profile with cookies/logins.
|
|
226
|
+
- Label-keyed tabs for concurrent clients.
|
|
227
|
+
- Tab lifecycle isolated per client (open → use → close).
|
|
228
|
+
- Background-tab patch so Chrome stays out of the way.
|
|
229
|
+
- Headless-mode UA cleaning (strips `HeadlessChrome`).
|
|
230
|
+
- macOS hide support (`--hide`).
|
|
231
|
+
|
|
232
|
+
**html-to-md** — layout-preserving markdown via Chrome CDP:
|
|
233
|
+
|
|
234
|
+
- Extracts every rendered text node with position and styling.
|
|
235
|
+
- Detects columns via x-start histogram peaks.
|
|
236
|
+
- Splits main/sidebar content via widest vertical gutter.
|
|
237
|
+
- Preserves code blocks, headings, links, inline code, lists.
|
|
238
|
+
- Row boundaries computed from *intersecting* column gap sets — long main-column paragraphs stay intact regardless of sidebar density.
|
|
239
|
+
|
|
240
|
+
**google-fetch** — multi-page Google scraping:
|
|
241
|
+
|
|
242
|
+
- Paginates through Google result pages.
|
|
243
|
+
- Visits each result link, dumps outerHTML + rendered markdown.
|
|
244
|
+
- Navigates back to search results after each fetch.
|
|
245
|
+
- Optional hostname filtering and result count limits.
|
|
246
|
+
|
|
247
|
+
**xcom-fetch** — x.com tweet scraping:
|
|
248
|
+
|
|
249
|
+
- Drives x.com's React search UI via native keyboard (Patchright).
|
|
250
|
+
- Virtual list scrolling to populate results.
|
|
251
|
+
- Visits each tweet permalink, dumps HTML + markdown.
|
|
252
|
+
- SPA-safe navigation; falls back to direct URL navigation if anchor click fails.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "chrome-scraper"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Stealth and stateful Chrome scraper"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Mauro Sciancalepore", email = "maurosciancalepore98@gmail.com" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.10"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"fastapi>=0.115.0",
|
|
12
|
+
"httpx>=0.28.0",
|
|
13
|
+
"jinja2>=3.1.4",
|
|
14
|
+
"patchright>=1.52.0",
|
|
15
|
+
"playwright>=1.59.0",
|
|
16
|
+
"python-multipart>=0.0.9",
|
|
17
|
+
"uvicorn[standard]>=0.30.0",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.scripts]
|
|
21
|
+
html-to-md = "chrome_scraper.html_to_md.cli:main"
|
|
22
|
+
browser-api = "chrome_scraper.browser_api.cli:main"
|
|
23
|
+
google-fetch = "chrome_scraper.web_scrapers.google_fetch_cli:main"
|
|
24
|
+
xcom-fetch = "chrome_scraper.web_scrapers.xcom_fetch_cli:main"
|
|
25
|
+
|
|
26
|
+
[build-system]
|
|
27
|
+
requires = ["uv_build>=0.10.0,<0.11.0"]
|
|
28
|
+
build-backend = "uv_build"
|
|
File without changes
|