chrome-scraper 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. chrome_scraper-0.1.0/PKG-INFO +268 -0
  2. chrome_scraper-0.1.0/README.md +252 -0
  3. chrome_scraper-0.1.0/pyproject.toml +28 -0
  4. chrome_scraper-0.1.0/src/chrome_scraper/__init__.py +2 -0
  5. chrome_scraper-0.1.0/src/chrome_scraper/browser_api/__init__.py +0 -0
  6. chrome_scraper-0.1.0/src/chrome_scraper/browser_api/cli.py +160 -0
  7. chrome_scraper-0.1.0/src/chrome_scraper/browser_api/client.py +170 -0
  8. chrome_scraper-0.1.0/src/chrome_scraper/browser_api/server.py +446 -0
  9. chrome_scraper-0.1.0/src/chrome_scraper/cli_output.py +23 -0
  10. chrome_scraper-0.1.0/src/chrome_scraper/html_to_md/__init__.py +6 -0
  11. chrome_scraper-0.1.0/src/chrome_scraper/html_to_md/cli.py +146 -0
  12. chrome_scraper-0.1.0/src/chrome_scraper/html_to_md/extract.js +121 -0
  13. chrome_scraper-0.1.0/src/chrome_scraper/html_to_md/extract.py +117 -0
  14. chrome_scraper-0.1.0/src/chrome_scraper/html_to_md/render.py +504 -0
  15. chrome_scraper-0.1.0/src/chrome_scraper/py.typed +0 -0
  16. chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/__init__.py +0 -0
  17. chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/_fetch_common.py +75 -0
  18. chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/base.py +254 -0
  19. chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/google_fetch.py +247 -0
  20. chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/google_fetch_cli.py +126 -0
  21. chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/google_search.py +253 -0
  22. chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/scripts/__init__.py +0 -0
  23. chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/scripts/google_search_results.js +50 -0
  24. chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/scripts/xcom_search_results.js +31 -0
  25. chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/xcom_fetch.py +298 -0
  26. chrome_scraper-0.1.0/src/chrome_scraper/web_scrapers/xcom_fetch_cli.py +108 -0
@@ -0,0 +1,268 @@
1
+ Metadata-Version: 2.3
2
+ Name: chrome-scraper
3
+ Version: 0.1.0
4
+ Summary: Stealth and stateful Chrome scraper
5
+ Author: Mauro Sciancalepore
6
+ Author-email: Mauro Sciancalepore <maurosciancalepore98@gmail.com>
7
+ Requires-Dist: fastapi>=0.115.0
8
+ Requires-Dist: httpx>=0.28.0
9
+ Requires-Dist: jinja2>=3.1.4
10
+ Requires-Dist: patchright>=1.52.0
11
+ Requires-Dist: playwright>=1.59.0
12
+ Requires-Dist: python-multipart>=0.0.9
13
+ Requires-Dist: uvicorn[standard]>=0.30.0
14
+ Requires-Python: >=3.10
15
+ Description-Content-Type: text/markdown
16
+
17
+ # chrome-scraper
18
+
19
+ Stealth and stateful Chrome scraper, built for AI agents.
20
+ A shared browser server with profile persistence, plus pre-built CLI scrapers for Google and x.com that dump content as HTML + clean markdown.
21
+
22
+ `chrome-scraper` is a Python package built on [Patchright](https://pypi.org/project/patchright/) (a maintained Playwright fork) that runs a single long-lived Chrome instance behind a FastAPI HTTP API. Clients can connect concurrently, share cookies/sessions via a persistent profile, and avoid the cold-start/teardown cost of per-task browser launch.
23
+
24
+ ## Install
25
+
26
+ ```bash
27
+ # Core package
28
+ uv add chrome-scraper
29
+
30
+ # Or install from source
31
+ uv sync
32
+ ```
33
+
34
+ Supports Python 3.10–3.13.
35
+
36
+ ## Quickstart
37
+
38
+ ```bash
39
+ # 1. Start the shared browser server (keep running)
40
+ uv run browser-api [--headless]
41
+
42
+ # 2. In another terminal — render a URL to markdown
43
+ html-to-md https://example.com --output out/example.md
44
+
45
+ # 3. Search Google and download results
46
+ google-fetch --query "machine learning" --max-results 3 --num-pages 1 --out-dir data/research/ml
47
+
48
+ # 4. Search x.com and download tweets
49
+ xcom-fetch --query "ai safety" --max-results 10 --out-dir data/research/xcom-ai
50
+ ```
51
+
52
+ `browser-api` is a persistent server — start it once, run scrapers against it from multiple terminals or scripts. Chrome keeps cookies, logins, and profile state across sessions.
53
+
54
+ ## Architecture
55
+
56
+ ```
57
+ ┌──────────────────────────────────────────────────────┐
58
+ │ browser-api (port 9333) │
59
+ │ ┌──────────────────────────────────────────────────┐ │
60
+ │ │ FastAPI server │ │
61
+ │ │ /status /tabs /tabs/{id}/goto /eval /type │ │
62
+ │ └──────────┬───────────────────────────────────────┘ │
63
+ │ │ owns │
64
+ │ ┌──────────▼───────────────────────────────────────┐ │
65
+ │ │ Chrome (Patchright persistent context) │ │
66
+ │ │ Profile │ │
67
+ │ │ Tabs: separate label-keyed sandboxes │ │
68
+ │ └──────────────────────────────────────────────────┘ │
69
+ └──────────────────────────────────────────────────────┘
70
+ ▲ HTTP ▲ HTTP
71
+ │ │
72
+ ┌────────┴──────────┐ ┌──────────────┴──────────────┐
73
+ │ html-to-md │ │ google-fetch / xcom-fetch │
74
+ │ short-lived │ │ short-lived CLIs │
75
+ │ open tab → │ │ open tab → search → │
76
+ │ extract → close │ │ fetch each result → close │
77
+ └───────────────────┘ └─────────────────────────────┘
78
+ ```
79
+
80
+ The server patches Patchright's `crBrowser.js` so new tabs open in background — Chrome never steals focus during concurrent scraping.
81
+
82
+ ## CLI reference
83
+
84
+ ### browser-api
85
+
86
+ Start, stop, and check status of the shared browser server.
87
+
88
+ ```bash
89
+ uv run browser-api # start on :9333 (default)
90
+ uv run browser-api --port 8080 # custom port
91
+ uv run browser-api --headless # headless mode
92
+ uv run browser-api --chrome-path /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome
93
+ uv run browser-api --profile-dir ~/my-chrome-profile
94
+ uv run browser-api --hide # hide Chrome window (macOS)
95
+ uv run browser-api --proxy http://proxy:8080
96
+ uv run browser-api --browser-args="--disable-gpu --no-sandbox"
97
+ uv run browser-api status # check if running
98
+ uv run browser-api stop # shut down
99
+ ```
100
+
101
+ Stateful features:
102
+
103
+ - **Persistent profile** — cookies, localStorage, and logins survive restarts. Profile lives at `~/Library/Application Support/thebase/playwright/profile/` on macOS.
104
+ - **Headless identity** — when `--headless` is used, the server automatically probes a throwaway headless instance, strips the `HeadlessChrome` token from the User-Agent, and passes the clean UA to the real persistent context.
105
+ - **Hide macOS window** — `--hide` runs `osascript` to hide Chrome from sight without quitting (non-headless only).
106
+
107
+ ### html-to-md
108
+
109
+ Render any URL (or local HTML file) as layout-preserving markdown.
110
+
111
+ ```bash
112
+ # Live URL
113
+ html-to-md https://example.com --output out/page.md
114
+
115
+ # Local HTML file
116
+ html-to-md --from-file page.html --output out/page.md
117
+
118
+ # Print to stdout
119
+ html-to-md https://example.com --output -
120
+
121
+ # Save raw text-node payload alongside markdown
122
+ html-to-md https://example.com --save-items
123
+
124
+ # Skip scroll pass (for already-scrolled SPAs)
125
+ html-to-md https://example.com --no-scroll
126
+
127
+ # Verbose layout diagnostics
128
+ html-to-md https://example.com -v
129
+
130
+ # Custom browser-api URL
131
+ html-to-md https://example.com --browser-api http://localhost:9333
132
+ ```
133
+
134
+ Rendering preserves multi-column layout, code blocks, headings, links, inline code, and list structure. Sidebar content is separated by a `---` rule. See [`docs/layout.md`](docs/layout.md) for details on the layout-to-markdown algorithm.
135
+
136
+ ### google-fetch
137
+
138
+ Search Google and download each result as HTML + markdown.
139
+
140
+ ```bash
141
+ # Basic search, 1 page
142
+ google-fetch --query "quantum computing"
143
+
144
+ # Multi-page with output dir
145
+ google-fetch --query "machine learning transformers" \
146
+ --num-pages 3 --out-dir data/research/transformers
147
+
148
+ # Filter by hostname
149
+ google-fetch --query "python typing" \
150
+ --allowed-hosts docs.python.org peps.python.org
151
+
152
+ # Limit total results
153
+ google-fetch --query "Rust async" --max-results 5
154
+
155
+ # Custom browser-api server
156
+ google-fetch --query "agents" --browser-api http://localhost:9333
157
+ ```
158
+
159
+ Output layout:
160
+
161
+ ```
162
+ data/research/<query-slug>/<tag>/
163
+ ├── results.json # title + URL index
164
+ ├── 01-introduction-to.html
165
+ ├── 01-introduction-to.md # frontmatter + rendered markdown
166
+ ├── 02-advanced-topics.html
167
+ ├── 02-advanced-topics.md
168
+ └── ...
169
+ ```
170
+
171
+ ### xcom-fetch
172
+
173
+ Search x.com and download tweets as HTML + markdown.
174
+
175
+ ```bash
176
+ # Keyword search
177
+ xcom-fetch --query "reinforcement learning"
178
+
179
+ # Restrict to one account
180
+ xcom-fetch --query "safety" --from "Anthropic"
181
+
182
+ # Limit results
183
+ xcom-fetch --query "alignment" --max-results 5
184
+
185
+ # Custom output dir
186
+ xcom-fetch --query "scaling laws" --out-dir data/tweets
187
+ ```
188
+
189
+ Output layout:
190
+
191
+ ```
192
+ data/research/xcom-<query>/
193
+ ├── results.json # permalink + author + text snippet
194
+ ├── 01-anthropic-12345.html
195
+ ├── 01-anthropic-12345.md # frontmatter + rendered markdown
196
+ └── ...
197
+ ```
198
+
199
+ ## Python API
200
+
201
+ ```python
202
+ from pathlib import Path
203
+ from chrome_scraper.html_to_md import extract_from_url, render_page
204
+
205
+ # Extract text-node payload from a URL
206
+ payload = extract_from_url(
207
+ "https://example.com",
208
+ browser_api_url="http://localhost:9333",
209
+ timeout=30.0,
210
+ scroll=True,
211
+ )
212
+
213
+ # Render to layout-preserving markdown
214
+ items = payload.get("items", [])
215
+ page_width = (payload.get("viewport") or {}).get("scroll_w", 1280)
216
+ md = render_page(items, page_width)
217
+ Path("out/example.md").write_text(md, encoding="utf-8")
218
+ ```
219
+
220
+ Or manage the browser lifecycle yourself:
221
+
222
+ ```python
223
+ from chrome_scraper.browser_api.client import BrowserAPIClient
224
+ from chrome_scraper.html_to_md.extract import extract_page
225
+
226
+ client = BrowserAPIClient(timeout=30.0)
227
+ with client.tab("my-tab"):
228
+ payload = extract_page(
229
+ "https://example.com",
230
+ client,
231
+ tab_ref="my-tab",
232
+ timeout=30.0,
233
+ scroll=True,
234
+ )
235
+ ```
236
+
237
+ ## At a glance
238
+
239
+ **browser-api** — shared Chrome behind HTTP:
240
+
241
+ - Persistent profile with cookies/logins.
242
+ - Label-keyed tabs for concurrent clients.
243
+ - Tab lifecycle isolated per client (open → use → close).
244
+ - Background-tab patch so Chrome stays out of the way.
245
+ - Headless-mode UA cleaning (strips `HeadlessChrome`).
246
+ - macOS hide support (`--hide`).
247
+
248
+ **html-to-md** — layout-preserving markdown via Chrome CDP:
249
+
250
+ - Extracts every rendered text node with position and styling.
251
+ - Detects columns via x-start histogram peaks.
252
+ - Splits main/sidebar content via widest vertical gutter.
253
+ - Preserves code blocks, headings, links, inline code, lists.
254
+ - Row boundaries computed from *intersecting* column gap sets — long main-column paragraphs stay intact regardless of sidebar density.
255
+
256
+ **google-fetch** — multi-page Google scraping:
257
+
258
+ - Paginates through Google result pages.
259
+ - Visits each result link, dumps outerHTML + rendered markdown.
260
+ - Navigates back to search results after each fetch.
261
+ - Optional hostname filtering and result count limits.
262
+
263
+ **xcom-fetch** — x.com tweet scraping:
264
+
265
+ - Drives x.com's React search UI via native keyboard (Patchright).
266
+ - Virtual list scrolling to populate results.
267
+ - Visits each tweet permalink, dumps HTML + markdown.
268
+ - SPA-safe navigation; falls back to direct URL navigation if anchor click fails.
@@ -0,0 +1,252 @@
1
+ # chrome-scraper
2
+
3
+ Stealth and stateful Chrome scraper, built for AI agents.
4
+ A shared browser server with profile persistence, plus pre-built CLI scrapers for Google and x.com that dump content as HTML + clean markdown.
5
+
6
+ `chrome-scraper` is a Python package built on [Patchright](https://pypi.org/project/patchright/) (a maintained Playwright fork) that runs a single long-lived Chrome instance behind a FastAPI HTTP API. Clients can connect concurrently, share cookies/sessions via a persistent profile, and avoid the cold-start/teardown cost of per-task browser launch.
7
+
8
+ ## Install
9
+
10
+ ```bash
11
+ # Core package
12
+ uv add chrome-scraper
13
+
14
+ # Or install from source
15
+ uv sync
16
+ ```
17
+
18
+ Supports Python 3.10–3.13.
19
+
20
+ ## Quickstart
21
+
22
+ ```bash
23
+ # 1. Start the shared browser server (keep running)
24
+ uv run browser-api [--headless]
25
+
26
+ # 2. In another terminal — render a URL to markdown
27
+ html-to-md https://example.com --output out/example.md
28
+
29
+ # 3. Search Google and download results
30
+ google-fetch --query "machine learning" --max-results 3 --num-pages 1 --out-dir data/research/ml
31
+
32
+ # 4. Search x.com and download tweets
33
+ xcom-fetch --query "ai safety" --max-results 10 --out-dir data/research/xcom-ai
34
+ ```
35
+
36
+ `browser-api` is a persistent server — start it once, run scrapers against it from multiple terminals or scripts. Chrome keeps cookies, logins, and profile state across sessions.
37
+
38
+ ## Architecture
39
+
40
+ ```
41
+ ┌──────────────────────────────────────────────────────┐
42
+ │ browser-api (port 9333) │
43
+ │ ┌──────────────────────────────────────────────────┐ │
44
+ │ │ FastAPI server │ │
45
+ │ │ /status /tabs /tabs/{id}/goto /eval /type │ │
46
+ │ └──────────┬───────────────────────────────────────┘ │
47
+ │ │ owns │
48
+ │ ┌──────────▼───────────────────────────────────────┐ │
49
+ │ │ Chrome (Patchright persistent context) │ │
50
+ │ │ Profile │ │
51
+ │ │ Tabs: separate label-keyed sandboxes │ │
52
+ │ └──────────────────────────────────────────────────┘ │
53
+ └──────────────────────────────────────────────────────┘
54
+ ▲ HTTP ▲ HTTP
55
+ │ │
56
+ ┌────────┴──────────┐ ┌──────────────┴──────────────┐
57
+ │ html-to-md │ │ google-fetch / xcom-fetch │
58
+ │ short-lived │ │ short-lived CLIs │
59
+ │ open tab → │ │ open tab → search → │
60
+ │ extract → close │ │ fetch each result → close │
61
+ └───────────────────┘ └─────────────────────────────┘
62
+ ```
63
+
64
+ The server patches Patchright's `crBrowser.js` so new tabs open in background — Chrome never steals focus during concurrent scraping.
65
+
66
+ ## CLI reference
67
+
68
+ ### browser-api
69
+
70
+ Start, stop, and check status of the shared browser server.
71
+
72
+ ```bash
73
+ uv run browser-api # start on :9333 (default)
74
+ uv run browser-api --port 8080 # custom port
75
+ uv run browser-api --headless # headless mode
76
+ uv run browser-api --chrome-path /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome
77
+ uv run browser-api --profile-dir ~/my-chrome-profile
78
+ uv run browser-api --hide # hide Chrome window (macOS)
79
+ uv run browser-api --proxy http://proxy:8080
80
+ uv run browser-api --browser-args="--disable-gpu --no-sandbox"
81
+ uv run browser-api status # check if running
82
+ uv run browser-api stop # shut down
83
+ ```
84
+
85
+ Stateful features:
86
+
87
+ - **Persistent profile** — cookies, localStorage, and logins survive restarts. Profile lives at `~/Library/Application Support/thebase/playwright/profile/` on macOS.
88
+ - **Headless identity** — when `--headless` is used, the server automatically probes a throwaway headless instance, strips the `HeadlessChrome` token from the User-Agent, and passes the clean UA to the real persistent context.
89
+ - **Hide macOS window** — `--hide` runs `osascript` to hide Chrome from sight without quitting (non-headless only).
90
+
91
+ ### html-to-md
92
+
93
+ Render any URL (or local HTML file) as layout-preserving markdown.
94
+
95
+ ```bash
96
+ # Live URL
97
+ html-to-md https://example.com --output out/page.md
98
+
99
+ # Local HTML file
100
+ html-to-md --from-file page.html --output out/page.md
101
+
102
+ # Print to stdout
103
+ html-to-md https://example.com --output -
104
+
105
+ # Save raw text-node payload alongside markdown
106
+ html-to-md https://example.com --save-items
107
+
108
+ # Skip scroll pass (for already-scrolled SPAs)
109
+ html-to-md https://example.com --no-scroll
110
+
111
+ # Verbose layout diagnostics
112
+ html-to-md https://example.com -v
113
+
114
+ # Custom browser-api URL
115
+ html-to-md https://example.com --browser-api http://localhost:9333
116
+ ```
117
+
118
+ Rendering preserves multi-column layout, code blocks, headings, links, inline code, and list structure. Sidebar content is separated by a `---` rule. See [`docs/layout.md`](docs/layout.md) for details on the layout-to-markdown algorithm.
119
+
120
+ ### google-fetch
121
+
122
+ Search Google and download each result as HTML + markdown.
123
+
124
+ ```bash
125
+ # Basic search, 1 page
126
+ google-fetch --query "quantum computing"
127
+
128
+ # Multi-page with output dir
129
+ google-fetch --query "machine learning transformers" \
130
+ --num-pages 3 --out-dir data/research/transformers
131
+
132
+ # Filter by hostname
133
+ google-fetch --query "python typing" \
134
+ --allowed-hosts docs.python.org peps.python.org
135
+
136
+ # Limit total results
137
+ google-fetch --query "Rust async" --max-results 5
138
+
139
+ # Custom browser-api server
140
+ google-fetch --query "agents" --browser-api http://localhost:9333
141
+ ```
142
+
143
+ Output layout:
144
+
145
+ ```
146
+ data/research/<query-slug>/<tag>/
147
+ ├── results.json # title + URL index
148
+ ├── 01-introduction-to.html
149
+ ├── 01-introduction-to.md # frontmatter + rendered markdown
150
+ ├── 02-advanced-topics.html
151
+ ├── 02-advanced-topics.md
152
+ └── ...
153
+ ```
154
+
155
+ ### xcom-fetch
156
+
157
+ Search x.com and download tweets as HTML + markdown.
158
+
159
+ ```bash
160
+ # Keyword search
161
+ xcom-fetch --query "reinforcement learning"
162
+
163
+ # Restrict to one account
164
+ xcom-fetch --query "safety" --from "Anthropic"
165
+
166
+ # Limit results
167
+ xcom-fetch --query "alignment" --max-results 5
168
+
169
+ # Custom output dir
170
+ xcom-fetch --query "scaling laws" --out-dir data/tweets
171
+ ```
172
+
173
+ Output layout:
174
+
175
+ ```
176
+ data/research/xcom-<query>/
177
+ ├── results.json # permalink + author + text snippet
178
+ ├── 01-anthropic-12345.html
179
+ ├── 01-anthropic-12345.md # frontmatter + rendered markdown
180
+ └── ...
181
+ ```
182
+
183
+ ## Python API
184
+
185
+ ```python
186
+ from pathlib import Path
187
+ from chrome_scraper.html_to_md import extract_from_url, render_page
188
+
189
+ # Extract text-node payload from a URL
190
+ payload = extract_from_url(
191
+ "https://example.com",
192
+ browser_api_url="http://localhost:9333",
193
+ timeout=30.0,
194
+ scroll=True,
195
+ )
196
+
197
+ # Render to layout-preserving markdown
198
+ items = payload.get("items", [])
199
+ page_width = (payload.get("viewport") or {}).get("scroll_w", 1280)
200
+ md = render_page(items, page_width)
201
+ Path("out/example.md").write_text(md, encoding="utf-8")
202
+ ```
203
+
204
+ Or manage the browser lifecycle yourself:
205
+
206
+ ```python
207
+ from chrome_scraper.browser_api.client import BrowserAPIClient
208
+ from chrome_scraper.html_to_md.extract import extract_page
209
+
210
+ client = BrowserAPIClient(timeout=30.0)
211
+ with client.tab("my-tab"):
212
+ payload = extract_page(
213
+ "https://example.com",
214
+ client,
215
+ tab_ref="my-tab",
216
+ timeout=30.0,
217
+ scroll=True,
218
+ )
219
+ ```
220
+
221
+ ## At a glance
222
+
223
+ **browser-api** — shared Chrome behind HTTP:
224
+
225
+ - Persistent profile with cookies/logins.
226
+ - Label-keyed tabs for concurrent clients.
227
+ - Tab lifecycle isolated per client (open → use → close).
228
+ - Background-tab patch so Chrome stays out of the way.
229
+ - Headless-mode UA cleaning (strips `HeadlessChrome`).
230
+ - macOS hide support (`--hide`).
231
+
232
+ **html-to-md** — layout-preserving markdown via Chrome CDP:
233
+
234
+ - Extracts every rendered text node with position and styling.
235
+ - Detects columns via x-start histogram peaks.
236
+ - Splits main/sidebar content via widest vertical gutter.
237
+ - Preserves code blocks, headings, links, inline code, lists.
238
+ - Row boundaries computed from *intersecting* column gap sets — long main-column paragraphs stay intact regardless of sidebar density.
239
+
240
+ **google-fetch** — multi-page Google scraping:
241
+
242
+ - Paginates through Google result pages.
243
+ - Visits each result link, dumps outerHTML + rendered markdown.
244
+ - Navigates back to search results after each fetch.
245
+ - Optional hostname filtering and result count limits.
246
+
247
+ **xcom-fetch** — x.com tweet scraping:
248
+
249
+ - Drives x.com's React search UI via native keyboard (Patchright).
250
+ - Virtual list scrolling to populate results.
251
+ - Visits each tweet permalink, dumps HTML + markdown.
252
+ - SPA-safe navigation; falls back to direct URL navigation if anchor click fails.
@@ -0,0 +1,28 @@
1
+ [project]
2
+ name = "chrome-scraper"
3
+ version = "0.1.0"
4
+ description = "Stealth and stateful Chrome scraper"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "Mauro Sciancalepore", email = "maurosciancalepore98@gmail.com" }
8
+ ]
9
+ requires-python = ">=3.10"
10
+ dependencies = [
11
+ "fastapi>=0.115.0",
12
+ "httpx>=0.28.0",
13
+ "jinja2>=3.1.4",
14
+ "patchright>=1.52.0",
15
+ "playwright>=1.59.0",
16
+ "python-multipart>=0.0.9",
17
+ "uvicorn[standard]>=0.30.0",
18
+ ]
19
+
20
+ [project.scripts]
21
+ html-to-md = "chrome_scraper.html_to_md.cli:main"
22
+ browser-api = "chrome_scraper.browser_api.cli:main"
23
+ google-fetch = "chrome_scraper.web_scrapers.google_fetch_cli:main"
24
+ xcom-fetch = "chrome_scraper.web_scrapers.xcom_fetch_cli:main"
25
+
26
+ [build-system]
27
+ requires = ["uv_build>=0.10.0,<0.11.0"]
28
+ build-backend = "uv_build"
@@ -0,0 +1,2 @@
1
+ def hello() -> str:
2
+ return "Hello from chrome-scraper!"