contextractor 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextractor-0.4.1/.gitignore +18 -0
- contextractor-0.4.1/PKG-INFO +231 -0
- contextractor-0.4.1/README.md +203 -0
- contextractor-0.4.1/SPEC.md +140 -0
- contextractor-0.4.1/hatch_build.py +40 -0
- contextractor-0.4.1/pyproject.toml +81 -0
- contextractor-0.4.1/scripts/stage_vendor.py +100 -0
- contextractor-0.4.1/src/contextractor/__init__.py +46 -0
- contextractor-0.4.1/src/contextractor/__main__.py +25 -0
- contextractor-0.4.1/src/contextractor/_errors.py +19 -0
- contextractor-0.4.1/src/contextractor/_install.py +23 -0
- contextractor-0.4.1/src/contextractor/_manifest.py +59 -0
- contextractor-0.4.1/src/contextractor/_options.py +267 -0
- contextractor-0.4.1/src/contextractor/_redact.py +46 -0
- contextractor-0.4.1/src/contextractor/_run.py +392 -0
- contextractor-0.4.1/src/contextractor/_runtime.py +95 -0
- contextractor-0.4.1/src/contextractor/_vendor/__init__.py +3 -0
- contextractor-0.4.1/src/contextractor/py.typed +0 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Build-time staged assets (flattened JS tree + current-platform .node).
|
|
2
|
+
# Produced by scripts/stage_vendor.py; never committed.
|
|
3
|
+
/src/contextractor/_vendor/cli/
|
|
4
|
+
|
|
5
|
+
# Python build/test artifacts
|
|
6
|
+
/dist/
|
|
7
|
+
/build/
|
|
8
|
+
*.egg-info/
|
|
9
|
+
__pycache__/
|
|
10
|
+
.pytest_cache/
|
|
11
|
+
.ruff_cache/
|
|
12
|
+
.mypy_cache/
|
|
13
|
+
|
|
14
|
+
# Local virtualenvs + uv lockfile (this is a published library; the lock is a
|
|
15
|
+
# dev-only convenience, kept out of the tree per the repo's lockfiles-gitignored policy)
|
|
16
|
+
/.venv/
|
|
17
|
+
/venv/
|
|
18
|
+
uv.lock
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: contextractor
|
|
3
|
+
Version: 0.4.1
|
|
4
|
+
Summary: Drive the Contextractor Node crawler/extractor from Python — clean main-content text in txt, markdown, json, or html.
|
|
5
|
+
Project-URL: Homepage, https://apify.com/glueo/contextractor
|
|
6
|
+
Project-URL: Repository, https://github.com/contextractor/contextractor
|
|
7
|
+
Author-email: glueo <company@glueo.com>
|
|
8
|
+
License-Expression: Apache-2.0
|
|
9
|
+
Keywords: content-extraction,crawlee,crawler,markdown,playwright,trafilatura,web-scraping
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Operating System :: MacOS
|
|
14
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
15
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
20
|
+
Classifier: Typing :: Typed
|
|
21
|
+
Requires-Python: >=3.12
|
|
22
|
+
Requires-Dist: nodejs-wheel-binaries<25,>=24.16
|
|
23
|
+
Provides-Extra: test
|
|
24
|
+
Requires-Dist: pytest-asyncio>=1.4; extra == 'test'
|
|
25
|
+
Requires-Dist: pytest>=9; extra == 'test'
|
|
26
|
+
Requires-Dist: pyyaml>=6; extra == 'test'
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# contextractor
|
|
30
|
+
|
|
31
|
+
<p align="center"><img width="220" src="https://www.contextractor.com/media/cover-mini.svg" alt="Contextractor"></p>
|
|
32
|
+
|
|
33
|
+
Crawl web pages and extract clean main-content text — `txt`, `markdown`, `json`,
|
|
34
|
+
or `html` — from Python. Built on [`rs-trafilatura`](https://github.com/Murrough-Foley/rs-trafilatura)
|
|
35
|
+
(extraction) and [Crawlee](https://crawlee.dev/) + [Playwright](https://playwright.dev/)
|
|
36
|
+
(crawling).
|
|
37
|
+
|
|
38
|
+
This package is a thin, typed wrapper that **drives the bundled Node engine** — it
|
|
39
|
+
does not reimplement the crawler. A self-contained Node runtime ships with the
|
|
40
|
+
wheel (via [`nodejs-wheel-binaries`](https://pypi.org/project/nodejs-wheel-binaries/)),
|
|
41
|
+
so **no Node.js install is required**.
|
|
42
|
+
|
|
43
|
+
## Install
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install contextractor
|
|
47
|
+
python -m contextractor install # one-time: download the Chromium browser
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Platform wheels are published for macOS (arm64, x86_64), Linux (x86_64, aarch64;
|
|
51
|
+
glibc ≥ 2.28), and Windows (x64). Requires Python 3.12+.
|
|
52
|
+
|
|
53
|
+
## Quick start
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
import contextractor
|
|
57
|
+
|
|
58
|
+
summary = contextractor.extract(
|
|
59
|
+
["https://example.com"],
|
|
60
|
+
save=["markdown-kvs"],
|
|
61
|
+
output_dir="./out",
|
|
62
|
+
max_requests_per_crawl=10,
|
|
63
|
+
)
|
|
64
|
+
print(summary)
|
|
65
|
+
# ExtractSummary(total=1, succeeded=1, failed=0, skipped=0,
|
|
66
|
+
# output_dir='/abs/out', manifest_path='/abs/out/manifest.json')
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Extracted files and a `manifest.json` index are written to `output_dir`
|
|
70
|
+
(default: `./contextractor-output`). The manifest is a JSON array of records, each
|
|
71
|
+
tagged `status: "success" | "failed" | "skipped"`.
|
|
72
|
+
|
|
73
|
+
### Async
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
import asyncio
|
|
77
|
+
import contextractor
|
|
78
|
+
|
|
79
|
+
async def main():
|
|
80
|
+
summary = await contextractor.aextract(
|
|
81
|
+
["https://example.com", "https://example.org"],
|
|
82
|
+
save=["markdown-dataset", "original-kvs"],
|
|
83
|
+
output_dir="./out",
|
|
84
|
+
max_concurrency=5,
|
|
85
|
+
)
|
|
86
|
+
print(summary.succeeded, "of", summary.total)
|
|
87
|
+
|
|
88
|
+
asyncio.run(main())
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Single-page extraction
|
|
92
|
+
|
|
93
|
+
`extract_one()` crawls exactly one URL (no link-following) and returns the
|
|
94
|
+
extracted content directly — no output directory, and nothing is persisted to disk:
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
import contextractor
|
|
98
|
+
|
|
99
|
+
markdown = contextractor.extract_one("https://example.com")
|
|
100
|
+
print(markdown) # str — markdown is the default format
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Request several formats to get a `dict` keyed by format:
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
contents = contextractor.extract_one(
|
|
107
|
+
"https://example.com",
|
|
108
|
+
formats=["markdown", "json", "original"],
|
|
109
|
+
)
|
|
110
|
+
print(contents["markdown"]) # extracted markdown
|
|
111
|
+
print(contents["original"]) # raw page HTML
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Formats: `txt`, `markdown` (default), `json`, `html`, `original` (the raw page
|
|
115
|
+
HTML). With one requested format the return value is a `str`; with several it is
|
|
116
|
+
a `dict[str, str]`. `extract_one` accepts the single-page subset of the crawl
|
|
117
|
+
options (`ExtractOneOptions`) — e.g. `proxy`, `mode`, `user_agent`, `cookies`,
|
|
118
|
+
`headers`, `headless` — but not crawl-frontier options like `globs` or
|
|
119
|
+
`max_crawl_depth`. A page that cannot be fetched or extracted raises
|
|
120
|
+
`ContextractorError`. If the page yields no content for one of several requested
|
|
121
|
+
formats, that format's key is simply absent from the returned `dict`; when the
|
|
122
|
+
single requested format yields no content, `ContextractorError` is raised.
|
|
123
|
+
|
|
124
|
+
### Async single page
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
import asyncio
|
|
128
|
+
import contextractor
|
|
129
|
+
|
|
130
|
+
async def main():
|
|
131
|
+
markdown = await contextractor.aextract_one("https://example.com")
|
|
132
|
+
print(markdown)
|
|
133
|
+
|
|
134
|
+
asyncio.run(main())
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Return value
|
|
138
|
+
|
|
139
|
+
`extract()` / `aextract()` return an `ExtractSummary`:
|
|
140
|
+
|
|
141
|
+
| Field | Meaning |
|
|
142
|
+
| --- | --- |
|
|
143
|
+
| `total` | Number of records in the manifest |
|
|
144
|
+
| `succeeded` / `failed` / `skipped` | Counts by record `status` |
|
|
145
|
+
| `output_dir` | Absolute path where files + manifest were written |
|
|
146
|
+
| `manifest_path` | Absolute path to `manifest.json` |
|
|
147
|
+
|
|
148
|
+
Partial failures (some URLs failed) **do not raise** — they are reflected in
|
|
149
|
+
`summary.failed`. Validation/config errors and real crawl failures raise
|
|
150
|
+
`ContextractorError`; a missing browser raises `MissingBrowserError` pointing you
|
|
151
|
+
at `python -m contextractor install`.
|
|
152
|
+
|
|
153
|
+
## Options
|
|
154
|
+
|
|
155
|
+
All crawl options are typed keyword arguments (`ExtractOptions`). A selection:
|
|
156
|
+
|
|
157
|
+
| Option | Type | Notes |
|
|
158
|
+
| --- | --- | --- |
|
|
159
|
+
| `save` | `list[str]` | `format-destination` tokens: `{txt,markdown,json,html,original}-{dataset,kvs}` (e.g. `markdown-kvs`, `original-dataset`). Default `markdown-kvs`; list a format twice to save to both. Saving `original`/`html` to the dataset risks OOM on large pages |
|
|
160
|
+
| `mode` | `str` | `precision`, `balanced` (default), `recall` |
|
|
161
|
+
| `max_requests_per_crawl` | `int` | `0` = unlimited |
|
|
162
|
+
| `max_crawl_depth` | `int` | `0` = unlimited |
|
|
163
|
+
| `globs` / `exclude` | `list[str]` | enqueue / skip URL patterns |
|
|
164
|
+
| `headless` | `bool` | `False` runs a headed browser |
|
|
165
|
+
| `block_media` / `images` | `bool` | toggle media loading / image extraction |
|
|
166
|
+
| `links` / `comments` / `tables` | `bool` | `False` excludes that content |
|
|
167
|
+
| `proxy` | `list[str]` | `http`, `https`, `socks4`, `socks5` URLs |
|
|
168
|
+
| `cookies` | `list[dict]` | initial cookies (JSON) |
|
|
169
|
+
| `headers` | `dict[str, str]` | custom HTTP headers (JSON) |
|
|
170
|
+
| `selector` | `str` | restrict extraction to a CSS selector |
|
|
171
|
+
| `deduplication` | `str` | `minimal`, `standard` (default), `aggressive` |
|
|
172
|
+
| `output_dir` | `str` | where files + manifest are written |
|
|
173
|
+
|
|
174
|
+
Boolean options pass a flag to the CLI only when you set them explicitly; an omitted option leaves the CLI's own default in effect.
|
|
175
|
+
Editor autocomplete and type-checkers see every option via the `ExtractOptions`
|
|
176
|
+
`TypedDict`.
|
|
177
|
+
|
|
178
|
+
### Config file
|
|
179
|
+
|
|
180
|
+
Share configuration across runs with a JSON config file:
|
|
181
|
+
|
|
182
|
+
```json
|
|
183
|
+
{
|
|
184
|
+
"mode": "precision",
|
|
185
|
+
"save": ["markdown-kvs", "json-dataset"],
|
|
186
|
+
"maxRequestsPerCrawl": 25,
|
|
187
|
+
"maxCrawlDepth": 2
|
|
188
|
+
}
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
contextractor.extract(
|
|
193
|
+
["https://example.com"],
|
|
194
|
+
config_file="config.json",
|
|
195
|
+
output_dir="./out",
|
|
196
|
+
)
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
Keyword arguments override values from the config file.
|
|
200
|
+
|
|
201
|
+
### Proxies
|
|
202
|
+
|
|
203
|
+
Only `http`, `https`, `socks4`, and `socks5` proxy URLs are accepted; an
|
|
204
|
+
unsupported scheme raises `ProxySchemeError` before anything runs. Proxy
|
|
205
|
+
credentials are never echoed in errors or logs.
|
|
206
|
+
|
|
207
|
+
```python
|
|
208
|
+
contextractor.extract(
|
|
209
|
+
["https://example.com"],
|
|
210
|
+
proxy=["http://user:pass@proxy.example.com:3128"],
|
|
211
|
+
output_dir="./out",
|
|
212
|
+
)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## Browser provisioning
|
|
216
|
+
|
|
217
|
+
Browsers are not bundled in the wheel. Run `python -m contextractor install` once
|
|
218
|
+
to download Chromium for the bundled engine. The standard
|
|
219
|
+
`PLAYWRIGHT_BROWSERS_PATH` environment variable is honored.
|
|
220
|
+
|
|
221
|
+
## Advanced
|
|
222
|
+
|
|
223
|
+
- `CONTEXTRACTOR_NODE_PATH` — point at a host Node binary to use instead of the
|
|
224
|
+
bundled runtime.
|
|
225
|
+
- `storage_dir` — reuse a Crawlee storage directory across runs (defaults to a
|
|
226
|
+
private temporary directory cleaned up after each call).
|
|
227
|
+
- `timeout` — per-process wall-clock limit (seconds).
|
|
228
|
+
|
|
229
|
+
## License
|
|
230
|
+
|
|
231
|
+
Apache-2.0
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
# contextractor
|
|
2
|
+
|
|
3
|
+
<p align="center"><img width="220" src="https://www.contextractor.com/media/cover-mini.svg" alt="Contextractor"></p>
|
|
4
|
+
|
|
5
|
+
Crawl web pages and extract clean main-content text — `txt`, `markdown`, `json`,
|
|
6
|
+
or `html` — from Python. Built on [`rs-trafilatura`](https://github.com/Murrough-Foley/rs-trafilatura)
|
|
7
|
+
(extraction) and [Crawlee](https://crawlee.dev/) + [Playwright](https://playwright.dev/)
|
|
8
|
+
(crawling).
|
|
9
|
+
|
|
10
|
+
This package is a thin, typed wrapper that **drives the bundled Node engine** — it
|
|
11
|
+
does not reimplement the crawler. A self-contained Node runtime ships with the
|
|
12
|
+
wheel (via [`nodejs-wheel-binaries`](https://pypi.org/project/nodejs-wheel-binaries/)),
|
|
13
|
+
so **no Node.js install is required**.
|
|
14
|
+
|
|
15
|
+
## Install
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install contextractor
|
|
19
|
+
python -m contextractor install # one-time: download the Chromium browser
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Platform wheels are published for macOS (arm64, x86_64), Linux (x86_64, aarch64;
|
|
23
|
+
glibc ≥ 2.28), and Windows (x64). Requires Python 3.12+.
|
|
24
|
+
|
|
25
|
+
## Quick start
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
import contextractor
|
|
29
|
+
|
|
30
|
+
summary = contextractor.extract(
|
|
31
|
+
["https://example.com"],
|
|
32
|
+
save=["markdown-kvs"],
|
|
33
|
+
output_dir="./out",
|
|
34
|
+
max_requests_per_crawl=10,
|
|
35
|
+
)
|
|
36
|
+
print(summary)
|
|
37
|
+
# ExtractSummary(total=1, succeeded=1, failed=0, skipped=0,
|
|
38
|
+
# output_dir='/abs/out', manifest_path='/abs/out/manifest.json')
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Extracted files and a `manifest.json` index are written to `output_dir`
|
|
42
|
+
(default: `./contextractor-output`). The manifest is a JSON array of records, each
|
|
43
|
+
tagged `status: "success" | "failed" | "skipped"`.
|
|
44
|
+
|
|
45
|
+
### Async
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
import asyncio
|
|
49
|
+
import contextractor
|
|
50
|
+
|
|
51
|
+
async def main():
|
|
52
|
+
summary = await contextractor.aextract(
|
|
53
|
+
["https://example.com", "https://example.org"],
|
|
54
|
+
save=["markdown-dataset", "original-kvs"],
|
|
55
|
+
output_dir="./out",
|
|
56
|
+
max_concurrency=5,
|
|
57
|
+
)
|
|
58
|
+
print(summary.succeeded, "of", summary.total)
|
|
59
|
+
|
|
60
|
+
asyncio.run(main())
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Single-page extraction
|
|
64
|
+
|
|
65
|
+
`extract_one()` crawls exactly one URL (no link-following) and returns the
|
|
66
|
+
extracted content directly — no output directory, and nothing is persisted to disk:
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
import contextractor
|
|
70
|
+
|
|
71
|
+
markdown = contextractor.extract_one("https://example.com")
|
|
72
|
+
print(markdown) # str — markdown is the default format
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Request several formats to get a `dict` keyed by format:
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
contents = contextractor.extract_one(
|
|
79
|
+
"https://example.com",
|
|
80
|
+
formats=["markdown", "json", "original"],
|
|
81
|
+
)
|
|
82
|
+
print(contents["markdown"]) # extracted markdown
|
|
83
|
+
print(contents["original"]) # raw page HTML
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Formats: `txt`, `markdown` (default), `json`, `html`, `original` (the raw page
|
|
87
|
+
HTML). With one requested format the return value is a `str`; with several it is
|
|
88
|
+
a `dict[str, str]`. `extract_one` accepts the single-page subset of the crawl
|
|
89
|
+
options (`ExtractOneOptions`) — e.g. `proxy`, `mode`, `user_agent`, `cookies`,
|
|
90
|
+
`headers`, `headless` — but not crawl-frontier options like `globs` or
|
|
91
|
+
`max_crawl_depth`. A page that cannot be fetched or extracted raises
|
|
92
|
+
`ContextractorError`. If the page yields no content for one of several requested
|
|
93
|
+
formats, that format's key is simply absent from the returned `dict`; when the
|
|
94
|
+
single requested format yields no content, `ContextractorError` is raised.
|
|
95
|
+
|
|
96
|
+
### Async single page
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
import asyncio
|
|
100
|
+
import contextractor
|
|
101
|
+
|
|
102
|
+
async def main():
|
|
103
|
+
markdown = await contextractor.aextract_one("https://example.com")
|
|
104
|
+
print(markdown)
|
|
105
|
+
|
|
106
|
+
asyncio.run(main())
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Return value
|
|
110
|
+
|
|
111
|
+
`extract()` / `aextract()` return an `ExtractSummary`:
|
|
112
|
+
|
|
113
|
+
| Field | Meaning |
|
|
114
|
+
| --- | --- |
|
|
115
|
+
| `total` | Number of records in the manifest |
|
|
116
|
+
| `succeeded` / `failed` / `skipped` | Counts by record `status` |
|
|
117
|
+
| `output_dir` | Absolute path where files + manifest were written |
|
|
118
|
+
| `manifest_path` | Absolute path to `manifest.json` |
|
|
119
|
+
|
|
120
|
+
Partial failures (some URLs failed) **do not raise** — they are reflected in
|
|
121
|
+
`summary.failed`. Validation/config errors and real crawl failures raise
|
|
122
|
+
`ContextractorError`; a missing browser raises `MissingBrowserError` pointing you
|
|
123
|
+
at `python -m contextractor install`.
|
|
124
|
+
|
|
125
|
+
## Options
|
|
126
|
+
|
|
127
|
+
All crawl options are typed keyword arguments (`ExtractOptions`). A selection:
|
|
128
|
+
|
|
129
|
+
| Option | Type | Notes |
|
|
130
|
+
| --- | --- | --- |
|
|
131
|
+
| `save` | `list[str]` | `format-destination` tokens: `{txt,markdown,json,html,original}-{dataset,kvs}` (e.g. `markdown-kvs`, `original-dataset`). Default `markdown-kvs`; list a format twice to save to both. Saving `original`/`html` to the dataset risks OOM on large pages |
|
|
132
|
+
| `mode` | `str` | `precision`, `balanced` (default), `recall` |
|
|
133
|
+
| `max_requests_per_crawl` | `int` | `0` = unlimited |
|
|
134
|
+
| `max_crawl_depth` | `int` | `0` = unlimited |
|
|
135
|
+
| `globs` / `exclude` | `list[str]` | enqueue / skip URL patterns |
|
|
136
|
+
| `headless` | `bool` | `False` runs a headed browser |
|
|
137
|
+
| `block_media` / `images` | `bool` | toggle media loading / image extraction |
|
|
138
|
+
| `links` / `comments` / `tables` | `bool` | `False` excludes that content |
|
|
139
|
+
| `proxy` | `list[str]` | `http`, `https`, `socks4`, `socks5` URLs |
|
|
140
|
+
| `cookies` | `list[dict]` | initial cookies (JSON) |
|
|
141
|
+
| `headers` | `dict[str, str]` | custom HTTP headers (JSON) |
|
|
142
|
+
| `selector` | `str` | restrict extraction to a CSS selector |
|
|
143
|
+
| `deduplication` | `str` | `minimal`, `standard` (default), `aggressive` |
|
|
144
|
+
| `output_dir` | `str` | where files + manifest are written |
|
|
145
|
+
|
|
146
|
+
Boolean options pass a flag to the CLI only when you set them explicitly; an omitted option leaves the CLI's own default in effect.
|
|
147
|
+
Editor autocomplete and type-checkers see every option via the `ExtractOptions`
|
|
148
|
+
`TypedDict`.
|
|
149
|
+
|
|
150
|
+
### Config file
|
|
151
|
+
|
|
152
|
+
Share configuration across runs with a JSON config file:
|
|
153
|
+
|
|
154
|
+
```json
|
|
155
|
+
{
|
|
156
|
+
"mode": "precision",
|
|
157
|
+
"save": ["markdown-kvs", "json-dataset"],
|
|
158
|
+
"maxRequestsPerCrawl": 25,
|
|
159
|
+
"maxCrawlDepth": 2
|
|
160
|
+
}
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
contextractor.extract(
|
|
165
|
+
["https://example.com"],
|
|
166
|
+
config_file="config.json",
|
|
167
|
+
output_dir="./out",
|
|
168
|
+
)
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
Keyword arguments override values from the config file.
|
|
172
|
+
|
|
173
|
+
### Proxies
|
|
174
|
+
|
|
175
|
+
Only `http`, `https`, `socks4`, and `socks5` proxy URLs are accepted; an
|
|
176
|
+
unsupported scheme raises `ProxySchemeError` before anything runs. Proxy
|
|
177
|
+
credentials are never echoed in errors or logs.
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
contextractor.extract(
|
|
181
|
+
["https://example.com"],
|
|
182
|
+
proxy=["http://user:pass@proxy.example.com:3128"],
|
|
183
|
+
output_dir="./out",
|
|
184
|
+
)
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## Browser provisioning
|
|
188
|
+
|
|
189
|
+
Browsers are not bundled in the wheel. Run `python -m contextractor install` once
|
|
190
|
+
to download Chromium for the bundled engine. The standard
|
|
191
|
+
`PLAYWRIGHT_BROWSERS_PATH` environment variable is honored.
|
|
192
|
+
|
|
193
|
+
## Advanced
|
|
194
|
+
|
|
195
|
+
- `CONTEXTRACTOR_NODE_PATH` — point at a host Node binary to use instead of the
|
|
196
|
+
bundled runtime.
|
|
197
|
+
- `storage_dir` — reuse a Crawlee storage directory across runs (defaults to a
|
|
198
|
+
private temporary directory cleaned up after each call).
|
|
199
|
+
- `timeout` — per-process wall-clock limit (seconds).
|
|
200
|
+
|
|
201
|
+
## License
|
|
202
|
+
|
|
203
|
+
Apache-2.0
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# contextractor (Python) — Specification
|
|
2
|
+
|
|
3
|
+
A library-only PyPI package that drives the Contextractor Node CLI from Python. It
|
|
4
|
+
**reimplements nothing**: `extract`/`aextract` spawn the bundled `dist/cli.js`,
|
|
5
|
+
translate snake_case options to CLI flags, let the CLI write to disk, then read the
|
|
6
|
+
export `manifest.json` back; `extract_one`/`aextract_one` spawn a single
|
|
7
|
+
`extract-one` child and return the content as values. Python loads no JavaScript
|
|
8
|
+
and no napi `.node` — Node does, when it runs `cli.js`. Standalone hatchling
|
|
9
|
+
package; **not** a pnpm/turbo workspace member.
|
|
10
|
+
|
|
11
|
+
## Public API
|
|
12
|
+
|
|
13
|
+
`src/contextractor/__init__.py` exports:
|
|
14
|
+
|
|
15
|
+
- `extract(urls, *, output_dir=None, storage_dir=None, timeout=None, **opts) -> ExtractSummary` — sync, primary.
|
|
16
|
+
- `aextract(urls, *, output_dir=None, storage_dir=None, timeout=None, **opts) -> ExtractSummary` — async (one crawl per child process via `asyncio.create_subprocess_exec`).
|
|
17
|
+
- `extract_one(url, *, formats=None, timeout=None, **opts) -> str | dict[str, str]` — sync single-page extraction: crawls exactly one URL (no link-following) and returns the content as values. `formats` defaults to `"markdown"`; one requested format returns a `str`, several return a `dict[str, str]` keyed by format. Nothing is persisted. Raises `ContextractorError` when the page cannot be extracted.
|
|
18
|
+
- `aextract_one(url, *, formats=None, timeout=None, **opts) -> str | dict[str, str]` — async counterpart of `extract_one`.
|
|
19
|
+
- `install(browser="chromium") -> None` — provision a Playwright browser via the bundled engine. Also reachable as `python -m contextractor install [browser]`.
|
|
20
|
+
- `ExtractOptions` — `TypedDict(total=False)` of all crawl options (the typed surface for `extract`'s `**opts`).
|
|
21
|
+
- `ExtractOneOptions` — `TypedDict(total=False)` of the single-page option subset (the typed surface for `extract_one`'s `**opts`).
|
|
22
|
+
- `ExtractSummary` — frozen dataclass: `total`, `succeeded`, `failed`, `skipped`, `output_dir`, `manifest_path`.
|
|
23
|
+
- Errors: `ContextractorError` (base), `ProxySchemeError`, `NodeRuntimeError`, `MissingBrowserError`.
|
|
24
|
+
- `__version__` — read via `importlib.metadata.version("contextractor")`.
|
|
25
|
+
|
|
26
|
+
`urls` accepts a single string or a list. `output_dir` defaults to
|
|
27
|
+
`./contextractor-output` (resolved against the CWD). `storage_dir` defaults to a
|
|
28
|
+
private temp dir that is removed after the call so the manifest reflects only the
|
|
29
|
+
current run; an explicitly-passed `storage_dir` is preserved.
|
|
30
|
+
|
|
31
|
+
## Orchestration (two invocations)
|
|
32
|
+
|
|
33
|
+
The CLI `extract` subcommand writes only to Crawlee storage and exits `2` on
|
|
34
|
+
partial failure; the separate `export` subcommand writes `<output_dir>/manifest.json`.
|
|
35
|
+
So each `extract()`/`aextract()` runs **two** child processes:
|
|
36
|
+
|
|
37
|
+
1. `node cli.js extract <urls> --storage <STORAGE> <mapped flags>`
|
|
38
|
+
2. `node cli.js export --storage <STORAGE> --output-dir <OUTPUT_DIR>`
|
|
39
|
+
|
|
40
|
+
then `read_summary(<OUTPUT_DIR>/manifest.json)`. One `--storage` path fully
|
|
41
|
+
identifies a run's storage (the CLI always uses the `default` buckets), so export
|
|
42
|
+
reads exactly what extract wrote — no bucket names are threaded.
|
|
43
|
+
|
|
44
|
+
### Exit-code semantics (`_run.py`)
|
|
45
|
+
|
|
46
|
+
- extract `0` → continue; `2` → partial success, continue (do **not** raise); `1`/other → raise `ContextractorError`.
|
|
47
|
+
- extract-one `0` → success; `2` → partial (a requested format yielded no content — see the single-page section); `1`/other → raise.
|
|
48
|
+
- export `0` → read manifest; non-zero → raise.
|
|
49
|
+
- Both runners (`_run_sync` / `_run_async`) capture raw bytes and decode stdout/stderr as UTF-8 with `errors="replace"` — never the locale codec (Windows cp1252/cp932 mojibake) and never universal-newline translation (which would corrupt `original` raw HTML).
|
|
50
|
+
- Playwright "Executable doesn't exist" in stderr → `MissingBrowserError` pointing at `python -m contextractor install`.
|
|
51
|
+
- Child stderr is redacted (proxy credentials masked) before being surfaced; argv is never echoed when it carries a proxy.
|
|
52
|
+
- A `timeout` (sync or async) raises `ContextractorError("contextractor timed out")` — never the raw `subprocess.TimeoutExpired`, whose `cmd` would leak the `--proxy` argv.
|
|
53
|
+
|
|
54
|
+
## Single-page orchestration (`extract_one`)
|
|
55
|
+
|
|
56
|
+
`extract_one()`/`aextract_one()` run **one** child process and return the content
|
|
57
|
+
as values — nothing is persisted, and no save/output/file/stdout options are
|
|
58
|
+
exposed; the wrapper drives the CLI `extract-one` subcommand internally:
|
|
59
|
+
|
|
60
|
+
- One requested format → `node cli.js extract-one <url> <mapped flags> --save <fmt>-stdout`; the child's stdout is the raw content (diagnostics go to stderr) and is returned as a `str`.
|
|
61
|
+
- Several formats → one `--save <fmt>-file` per format plus `--output <tempdir>/page`; the wrapper reads the files back into a `dict[str, str]` keyed by format, then removes the temp dir.
|
|
62
|
+
- Read-back names follow the CLI's multi-format `--output` prefix: `page.txt`, `page.md`, `page.json`, `page.html`; `original` lands at `page.original.html` only when `html` is also requested (the CLI's collision tag), else at `page.html`.
|
|
63
|
+
- `formats` accepts a string or a sequence, defaults to `"markdown"`, and deduplicates preserving order; an unknown format raises before spawn.
|
|
64
|
+
- Exit `0` → success. Exit `2` → partial: a requested format yielded no content (the CLI warns on stderr and skips that output). The multi-format dict simply omits that format's key — the npm library's `Partial<Record<…>>` semantics; the single-format route raises `ContextractorError("extract-one produced no <fmt> output")`, since a `str` cannot represent absence. Exit `1`/other → raise (hard failure).
|
|
65
|
+
|
|
66
|
+
## Option mapping (`_options.py`)
|
|
67
|
+
|
|
68
|
+
A single data-driven table, `OPTION_SPECS`, applied immediately before spawn (per
|
|
69
|
+
`.claude/rules/python-option-mapping.md`). `ExtractOptions` keys must equal
|
|
70
|
+
`OPTION_SPECS` keys (enforced by `tests/test_options.py`). Categories:
|
|
71
|
+
|
|
72
|
+
- **scalar** → `--flag <value>` (e.g. `max_crawl_depth`, `mode`, `start_urls_file`, …).
|
|
73
|
+
- **bool-pair** → `--flag` / `--no-flag`: `headless`, `block_media`, `images`.
|
|
74
|
+
- **negation-only** (default include; `False` emits the `--no-` flag): `links`, `comments`, `tables`.
|
|
75
|
+
- **bare-switch** (`True` emits the flag): `purge`, `ignore_cors_and_csp`, `close_cookie_modals`, `ignore_https_errors`, `keep_url_fragment`, `use_sitemaps`, `respect_robots_txt`, `store_skipped_urls`, `verbose` (`-v`).
|
|
76
|
+
- **repeatable** (one flag per item): `proxy`, `globs`, `exclude`, `save` (`format-destination` tokens, e.g. `markdown-kvs`).
|
|
77
|
+
- **json** (`--flag <json.dumps>`): `cookies`, `headers`.
|
|
78
|
+
|
|
79
|
+
`storage_dir`, `output_dir`, `timeout` are explicit parameters, not in the table.
|
|
80
|
+
`dataset` / `key_value_store` / `request_queue` no longer exist — the CLI always
|
|
81
|
+
uses the `default` buckets under `--storage`.
|
|
82
|
+
`apify_proxy` / `groups` / `use_apify_proxy` are intentionally absent — the CLI
|
|
83
|
+
accepts only `http`/`https`/`socks4`/`socks5` proxies; unknown keys raise, and bad
|
|
84
|
+
proxy schemes raise `ProxySchemeError` before spawn.
|
|
85
|
+
|
|
86
|
+
`ExtractOneOptions` is the single-page subset of `ExtractOptions` (the proxy,
|
|
87
|
+
session, rendering, network, content, and verbosity knobs only).
|
|
88
|
+
`EXTRACT_ONE_OPTION_KEYS` (frozen from its annotations; enforced by
|
|
89
|
+
`tests/test_options.py` to stay a subset of `OPTION_SPECS`) gates
|
|
90
|
+
`build_extract_one_args`, which raises `ContextractorError` for every
|
|
91
|
+
`extract`-only key (crawl-frontier/storage/output options such as `globs`,
|
|
92
|
+
`selector`, `max_crawl_depth`, `save`, `purge`, plus `session_pool_name` —
|
|
93
|
+
cross-run session sharing needs persisted session-pool state under
|
|
94
|
+
`--storage`, which `extract-one` never touches) before delegating to
|
|
95
|
+
`build_extract_args`.
|
|
96
|
+
|
|
97
|
+
## Runtime resolution (`_runtime.py`)
|
|
98
|
+
|
|
99
|
+
- `resolve_node()` — `CONTEXTRACTOR_NODE_PATH` override, else the `nodejs-wheel-binaries` binary at `nodejs_wheel.executable.ROOT_DIR` (`bin/node` on POSIX, `node.exe` on Windows). Restores the exec bit (POSIX) if a wheel ZIP dropped it.
|
|
100
|
+
- `vendor_cli_dir()` — context manager that materializes the staged `_vendor/cli` tree as a real directory via `importlib.resources.as_file()` (a no-op yielding the on-disk path for a normal wheel; extracts to a temp dir, removed on exit, when imported from a zip/pex/shiv). Stays open across the whole subprocess run so the tree outlives the child.
|
|
101
|
+
- `cli_js(cli_dir)` / `playwright_cli_js(cli_dir)` — resolve `dist/cli.js` and `node_modules/playwright/cli.js` inside that materialized tree; raise `NodeRuntimeError` if assets were not staged.
|
|
102
|
+
|
|
103
|
+
## Asset bundling
|
|
104
|
+
|
|
105
|
+
The Node CLI ships **un-bundled** (plain `tsc` output; Crawlee/Playwright/commander
|
|
106
|
+
resolve assets via `__dirname`, so esbuild/ncc/SEA are forbidden). At wheel-build
|
|
107
|
+
time `scripts/stage_vendor.py` copies a `pnpm deploy --prod --config.node-linker=hoisted`
|
|
108
|
+
tree (npm-style real files — wheels can't carry pnpm's symlink store) into
|
|
109
|
+
`src/contextractor/_vendor/cli/`, restores `"type": "module"` (pnpm deploy strips
|
|
110
|
+
it), prunes every non-build-platform `.node` — both the bundled
|
|
111
|
+
`dist/native/contextractor-extraction-native.*.node` prebuilds and any legacy
|
|
112
|
+
`node_modules/@contextractor/extraction-native-*` packages (defensive only:
|
|
113
|
+
`@contextractor/*` are devDependencies now, so `pnpm deploy --prod` no longer
|
|
114
|
+
carries them) — and seeds an `__init__.py` in every subdir (for
|
|
115
|
+
`importlib.resources`).
|
|
116
|
+
`_vendor/cli` is gitignored and force-included via the wheel `artifacts` glob. The
|
|
117
|
+
Node runtime itself is **not** bundled (it comes from `nodejs-wheel-binaries`);
|
|
118
|
+
browsers are never bundled (`python -m contextractor install`).
|
|
119
|
+
|
|
120
|
+
## Packaging & distribution
|
|
121
|
+
|
|
122
|
+
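- Backend: hatchling + `hatch_build.py` (`pure_python=False`, explicit `tag` of `py3-none-{platform}`, with the platform pinned via `CONTEXTRACTOR_WHEEL_PLATFORM` or inferred when unset) → `py3-none-{platform}` wheels. Forbid maturin / scikit-build-core / uv_build.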
|
|
123
|
+
- `version` is static in `pyproject.toml` (the `/git:release` and `/publish:all` bump target); `__version__` is read from installed metadata.
|
|
124
|
+
- `readme = "README.md"` → the PyPI project page (per `.claude/rules/user-facing-docs.md`); included in the sdist.
|
|
125
|
+
- Wheel matrix: `macosx_*_arm64`, `macosx_*_x86_64`, `manylinux_2_28_x86_64`, `manylinux_2_28_aarch64`, `win_amd64`, plus an sdist. **musl is unsupported** — the napi loader throws a clear import error rather than ship a broken `.node`.
|
|
126
|
+
- CI: `.github/workflows/release-pypi.yml` (cibuildwheel; `CIBW_BEFORE_ALL` stages `_vendor`; auditwheel/delocate repair disabled — there is no ELF Python extension; publish via PyPI Trusted Publishing / OIDC). It is sequenced **after** the napi-refresh PR opened by `build-napi.yml` for a `v*` tag, so wheels bundle current `.node` files — the gate is encoded in `/publish:all`.
|
|
127
|
+
|
|
128
|
+
## Tests
|
|
129
|
+
|
|
130
|
+
`pytest` + `pytest-asyncio` (subprocess boundary mocked; no network): argv mapping
|
|
131
|
+
per category, manifest tally, exit-2-is-partial (for both `extract` and
|
|
132
|
+
`extract-one`, incl. the partial multi-format dict and the single-format raise),
|
|
133
|
+
exit-1/other raise, UTF-8/CRLF byte fidelity through the sync runner, async path,
|
|
134
|
+
proxy redaction (incl. the sync-timeout path), node/CLI resolution + exec-bit
|
|
135
|
+
restore, and the Windows-napi consistency check. Two non-mocked layers:
|
|
136
|
+
`tests/test_integration_real_cli.py` drives the repo-built
|
|
137
|
+
`packages/standalone/dist/cli.js` with the system `node` through a multi-format
|
|
138
|
+
`extract_one` against a local `http.server` page (guards the file-naming
|
|
139
|
+
contract the fake CLI re-implements; auto-skips when `node` or the built CLI is
|
|
140
|
+
absent), and one env-gated real e2e (`CONTEXTRACTOR_E2E`).
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Hatchling build hook forcing a platform-tagged wheel.
|
|
2
|
+
|
|
3
|
+
The wheel bundles a flattened Node CLI tree plus the build platform's native
|
|
4
|
+
`.node` addon, so it must be tagged `py3-none-{platform}` rather than
|
|
5
|
+
`py3-none-any`. The assets themselves are staged into
|
|
6
|
+
``src/contextractor/_vendor/`` before the build by ``scripts/stage_vendor.py``
|
|
7
|
+
(driven by ``CIBW_BEFORE_ALL`` in CI); this hook only sets the wheel tag.
|
|
8
|
+
|
|
9
|
+
``CONTEXTRACTOR_WHEEL_PLATFORM`` pins the platform tag explicitly (e.g.
|
|
10
|
+
``manylinux_2_28_x86_64``). CI sets it per matrix row because auditwheel/delocate
|
|
11
|
+
repair is disabled — there is no ELF Python extension to relabel a bare
|
|
12
|
+
``linux_x86_64`` wheel into a PyPI-acceptable ``manylinux`` tag. When unset (e.g. a
|
|
13
|
+
local ``python -m build`` on the native platform), the tag is inferred.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
from hatchling.builders.hooks.plugin.interface import BuildHookInterface
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class CustomBuildHook(BuildHookInterface):
    """Hatchling build hook that tags the wheel ``py3-none-{platform}``.

    The hook only adjusts ``build_data``; it does not stage or copy any
    files itself.
    """

    def initialize(self, version: str, build_data: dict[str, Any]) -> None:
        """Mark the build as platform-specific and set its wheel tag.

        Args:
            version: Supplied by the build backend; not used by this hook.
            build_data: Mutable build settings. ``pure_python`` is set to
                ``False`` and ``tag`` to ``py3-none-{platform}``.
        """
        # Platform-specific (bundles a .node) but ABI-agnostic: the package is
        # pure Python with no CPython extension, so it runs on any Python 3.x →
        # py3-none-{platform}, not cp312-cp312-{platform}.
        build_data["pure_python"] = False
        build_data["tag"] = f"py3-none-{self._platform()}"

    def _platform(self) -> str:
        """Return the platform component of the wheel tag.

        ``CONTEXTRACTOR_WHEEL_PLATFORM`` wins when it holds a non-blank value;
        otherwise the platform of the first tag yielded by
        ``packaging.tags.sys_tags()`` is used.

        Returns:
            The platform tag string, e.g. ``manylinux_2_28_x86_64``.
        """
        # CI pins the platform tag per matrix row; locally infer it (auditwheel
        # repair is disabled, so the tag must be correct at build time).
        #
        # Strip surrounding whitespace so a padded or whitespace-only value
        # (easy to produce from a CI YAML block scalar) cannot leak into the
        # wheel tag. Interior characters are left untouched: a pinned value may
        # legitimately be a dot-separated compressed tag set.
        pinned = os.environ.get("CONTEXTRACTOR_WHEEL_PLATFORM", "").strip()
        if pinned:
            return pinned

        # Imported lazily: only needed on the inference path.
        from packaging.tags import sys_tags

        return next(iter(sys_tags())).platform
|