contextractor 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
+ # Build-time staged assets (flattened JS tree + current-platform .node).
2
+ # Produced by scripts/stage_vendor.py; never committed.
3
+ /src/contextractor/_vendor/cli/
4
+
5
+ # Python build/test artifacts
6
+ /dist/
7
+ /build/
8
+ *.egg-info/
9
+ __pycache__/
10
+ .pytest_cache/
11
+ .ruff_cache/
12
+ .mypy_cache/
13
+
14
+ # Local virtualenvs + uv lockfile (this is a published library; the lock is a
15
+ # dev-only convenience, kept out of the tree per the repo's lockfiles-gitignored policy)
16
+ /.venv/
17
+ /venv/
18
+ uv.lock
@@ -0,0 +1,231 @@
1
+ Metadata-Version: 2.4
2
+ Name: contextractor
3
+ Version: 0.4.1
4
+ Summary: Drive the Contextractor Node crawler/extractor from Python — clean main-content text in txt, markdown, json, or html.
5
+ Project-URL: Homepage, https://apify.com/glueo/contextractor
6
+ Project-URL: Repository, https://github.com/contextractor/contextractor
7
+ Author-email: glueo <company@glueo.com>
8
+ License-Expression: Apache-2.0
9
+ Keywords: content-extraction,crawlee,crawler,markdown,playwright,trafilatura,web-scraping
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Operating System :: MacOS
14
+ Classifier: Operating System :: Microsoft :: Windows
15
+ Classifier: Operating System :: POSIX :: Linux
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
19
+ Classifier: Topic :: Text Processing :: Markup :: HTML
20
+ Classifier: Typing :: Typed
21
+ Requires-Python: >=3.12
22
+ Requires-Dist: nodejs-wheel-binaries<25,>=24.16
23
+ Provides-Extra: test
24
+ Requires-Dist: pytest-asyncio>=1.4; extra == 'test'
25
+ Requires-Dist: pytest>=9; extra == 'test'
26
+ Requires-Dist: pyyaml>=6; extra == 'test'
27
+ Description-Content-Type: text/markdown
28
+
29
+ # contextractor
30
+
31
+ <p align="center"><img width="220" src="https://www.contextractor.com/media/cover-mini.svg" alt="Contextractor"></p>
32
+
33
+ Crawl web pages and extract clean main-content text — `txt`, `markdown`, `json`,
34
+ or `html` — from Python. Built on [`rs-trafilatura`](https://github.com/Murrough-Foley/rs-trafilatura)
35
+ (extraction) and [Crawlee](https://crawlee.dev/) + [Playwright](https://playwright.dev/)
36
+ (crawling).
37
+
38
+ This package is a thin, typed wrapper that **drives the bundled Node engine** — it
39
+ does not reimplement the crawler. A self-contained Node runtime is installed
40
+ alongside it as a dependency (via [`nodejs-wheel-binaries`](https://pypi.org/project/nodejs-wheel-binaries/)),
41
+ so **no Node.js install is required**.
42
+
43
+ ## Install
44
+
45
+ ```bash
46
+ pip install contextractor
47
+ python -m contextractor install # one-time: download the Chromium browser
48
+ ```
49
+
50
+ Platform wheels are published for macOS (arm64, x86_64), Linux (x86_64, aarch64;
51
+ glibc ≥ 2.28), and Windows (x64). Requires Python 3.12+.
52
+
53
+ ## Quick start
54
+
55
+ ```python
56
+ import contextractor
57
+
58
+ summary = contextractor.extract(
59
+ ["https://example.com"],
60
+ save=["markdown-kvs"],
61
+ output_dir="./out",
62
+ max_requests_per_crawl=10,
63
+ )
64
+ print(summary)
65
+ # ExtractSummary(total=1, succeeded=1, failed=0, skipped=0,
66
+ # output_dir='/abs/out', manifest_path='/abs/out/manifest.json')
67
+ ```
68
+
69
+ Extracted files and a `manifest.json` index are written to `output_dir`
70
+ (default: `./contextractor-output`). The manifest is a JSON array of records, each
71
+ tagged `status: "success" | "failed" | "skipped"`.
72
+
73
+ ### Async
74
+
75
+ ```python
76
+ import asyncio
77
+ import contextractor
78
+
79
+ async def main():
80
+ summary = await contextractor.aextract(
81
+ ["https://example.com", "https://example.org"],
82
+ save=["markdown-dataset", "original-kvs"],
83
+ output_dir="./out",
84
+ max_concurrency=5,
85
+ )
86
+ print(summary.succeeded, "of", summary.total)
87
+
88
+ asyncio.run(main())
89
+ ```
90
+
91
+ ## Single-page extraction
92
+
93
+ `extract_one()` crawls exactly one URL (no link-following) and returns the
94
+ extracted content directly — no output directory, nothing written to disk:
95
+
96
+ ```python
97
+ import contextractor
98
+
99
+ markdown = contextractor.extract_one("https://example.com")
100
+ print(markdown) # str — markdown is the default format
101
+ ```
102
+
103
+ Request several formats to get a `dict` keyed by format:
104
+
105
+ ```python
106
+ contents = contextractor.extract_one(
107
+ "https://example.com",
108
+ formats=["markdown", "json", "original"],
109
+ )
110
+ print(contents["markdown"]) # extracted markdown
111
+ print(contents["original"]) # raw page HTML
112
+ ```
113
+
114
+ Formats: `txt`, `markdown` (default), `json`, `html`, `original` (the raw page
115
+ HTML). With one requested format the return value is a `str`; with several it is
116
+ a `dict[str, str]`. `extract_one` accepts the single-page subset of the crawl
117
+ options (`ExtractOneOptions`) — e.g. `proxy`, `mode`, `user_agent`, `cookies`,
118
+ `headers`, `headless` — but not crawl-frontier options like `globs` or
119
+ `max_crawl_depth`. A page that cannot be fetched or extracted raises
120
+ `ContextractorError`. If the page yields no content for one of several requested
121
+ formats, that format's key is simply absent from the returned `dict`; when the
122
+ single requested format yields no content, `ContextractorError` is raised.
123
+
124
+ ### Async single page
125
+
126
+ ```python
127
+ import asyncio
128
+ import contextractor
129
+
130
+ async def main():
131
+ markdown = await contextractor.aextract_one("https://example.com")
132
+ print(markdown)
133
+
134
+ asyncio.run(main())
135
+ ```
136
+
137
+ ## Return value
138
+
139
+ `extract()` / `aextract()` return an `ExtractSummary`:
140
+
141
+ | Field | Meaning |
142
+ | --- | --- |
143
+ | `total` | Number of records in the manifest |
144
+ | `succeeded` / `failed` / `skipped` | Counts by record `status` |
145
+ | `output_dir` | Absolute path where files + manifest were written |
146
+ | `manifest_path` | Absolute path to `manifest.json` |
147
+
148
+ Partial failures (some URLs failed) **do not raise** — they are reflected in
149
+ `summary.failed`. Validation/config errors and real crawl failures raise
150
+ `ContextractorError`; a missing browser raises `MissingBrowserError` pointing you
151
+ at `python -m contextractor install`.
152
+
153
+ ## Options
154
+
155
+ All crawl options are typed keyword arguments (`ExtractOptions`). A selection:
156
+
157
+ | Option | Type | Notes |
158
+ | --- | --- | --- |
159
+ | `save` | `list[str]` | `format-destination` tokens: `{txt,markdown,json,html,original}-{dataset,kvs}` (e.g. `markdown-kvs`, `original-dataset`). Default `markdown-kvs`; to save one format to both destinations, list it once per destination (e.g. `markdown-kvs` and `markdown-dataset`). Saving `original`/`html` to the dataset risks OOM on large pages |
160
+ | `mode` | `str` | `precision`, `balanced` (default), `recall` |
161
+ | `max_requests_per_crawl` | `int` | `0` = unlimited |
162
+ | `max_crawl_depth` | `int` | `0` = unlimited |
163
+ | `globs` / `exclude` | `list[str]` | enqueue / skip URL patterns |
164
+ | `headless` | `bool` | `False` runs a headed browser |
165
+ | `block_media` / `images` | `bool` | toggle media loading / image extraction |
166
+ | `links` / `comments` / `tables` | `bool` | `False` excludes that content |
167
+ | `proxy` | `list[str]` | `http`, `https`, `socks4`, `socks5` URLs |
168
+ | `cookies` | `list[dict]` | initial cookies (JSON) |
169
+ | `headers` | `dict[str, str]` | custom HTTP headers (JSON) |
170
+ | `selector` | `str` | restrict extraction to a CSS selector |
171
+ | `deduplication` | `str` | `minimal`, `standard` (default), `aggressive` |
172
+ | `output_dir` | `str` | where files + manifest are written |
173
+
174
+ Boolean options that have a CLI default emit a flag only when you set them.
175
+ Editor autocomplete and type-checkers see every option via the `ExtractOptions`
176
+ `TypedDict`.
177
+
178
+ ### Config file
179
+
180
+ Share configuration across runs with a JSON config file:
181
+
182
+ ```json
183
+ {
184
+ "mode": "precision",
185
+ "save": ["markdown-kvs", "json-dataset"],
186
+ "maxRequestsPerCrawl": 25,
187
+ "maxCrawlDepth": 2
188
+ }
189
+ ```
190
+
191
+ ```python
192
+ contextractor.extract(
193
+ ["https://example.com"],
194
+ config_file="config.json",
195
+ output_dir="./out",
196
+ )
197
+ ```
198
+
199
+ Keyword arguments override values from the config file.
200
+
201
+ ### Proxies
202
+
203
+ Only `http`, `https`, `socks4`, and `socks5` proxy URLs are accepted; an
204
+ unsupported scheme raises `ProxySchemeError` before anything runs. Proxy
205
+ credentials are never echoed in errors or logs.
206
+
207
+ ```python
208
+ contextractor.extract(
209
+ ["https://example.com"],
210
+ proxy=["http://user:pass@proxy.example.com:3128"],
211
+ output_dir="./out",
212
+ )
213
+ ```
214
+
215
+ ## Browser provisioning
216
+
217
+ Browsers are not bundled in the wheel. Run `python -m contextractor install` once
218
+ to download Chromium for the bundled engine. The standard
219
+ `PLAYWRIGHT_BROWSERS_PATH` environment variable is honored.
220
+
221
+ ## Advanced
222
+
223
+ - `CONTEXTRACTOR_NODE_PATH` — point at a host Node binary to use instead of the
224
+ bundled runtime.
225
+ - `storage_dir` — reuse a Crawlee storage directory across runs (defaults to a
226
+ private temporary directory cleaned up after each call).
227
+ - `timeout` — per-process wall-clock limit (seconds).
228
+
229
+ ## License
230
+
231
+ Apache-2.0
@@ -0,0 +1,203 @@
1
+ # contextractor
2
+
3
+ <p align="center"><img width="220" src="https://www.contextractor.com/media/cover-mini.svg" alt="Contextractor"></p>
4
+
5
+ Crawl web pages and extract clean main-content text — `txt`, `markdown`, `json`,
6
+ or `html` — from Python. Built on [`rs-trafilatura`](https://github.com/Murrough-Foley/rs-trafilatura)
7
+ (extraction) and [Crawlee](https://crawlee.dev/) + [Playwright](https://playwright.dev/)
8
+ (crawling).
9
+
10
+ This package is a thin, typed wrapper that **drives the bundled Node engine** — it
11
+ does not reimplement the crawler. A self-contained Node runtime is installed
12
+ alongside it as a dependency (via [`nodejs-wheel-binaries`](https://pypi.org/project/nodejs-wheel-binaries/)),
13
+ so **no Node.js install is required**.
14
+
15
+ ## Install
16
+
17
+ ```bash
18
+ pip install contextractor
19
+ python -m contextractor install # one-time: download the Chromium browser
20
+ ```
21
+
22
+ Platform wheels are published for macOS (arm64, x86_64), Linux (x86_64, aarch64;
23
+ glibc ≥ 2.28), and Windows (x64). Requires Python 3.12+.
24
+
25
+ ## Quick start
26
+
27
+ ```python
28
+ import contextractor
29
+
30
+ summary = contextractor.extract(
31
+ ["https://example.com"],
32
+ save=["markdown-kvs"],
33
+ output_dir="./out",
34
+ max_requests_per_crawl=10,
35
+ )
36
+ print(summary)
37
+ # ExtractSummary(total=1, succeeded=1, failed=0, skipped=0,
38
+ # output_dir='/abs/out', manifest_path='/abs/out/manifest.json')
39
+ ```
40
+
41
+ Extracted files and a `manifest.json` index are written to `output_dir`
42
+ (default: `./contextractor-output`). The manifest is a JSON array of records, each
43
+ tagged `status: "success" | "failed" | "skipped"`.
44
+
45
+ ### Async
46
+
47
+ ```python
48
+ import asyncio
49
+ import contextractor
50
+
51
+ async def main():
52
+ summary = await contextractor.aextract(
53
+ ["https://example.com", "https://example.org"],
54
+ save=["markdown-dataset", "original-kvs"],
55
+ output_dir="./out",
56
+ max_concurrency=5,
57
+ )
58
+ print(summary.succeeded, "of", summary.total)
59
+
60
+ asyncio.run(main())
61
+ ```
62
+
63
+ ## Single-page extraction
64
+
65
+ `extract_one()` crawls exactly one URL (no link-following) and returns the
66
+ extracted content directly — no output directory, nothing written to disk:
67
+
68
+ ```python
69
+ import contextractor
70
+
71
+ markdown = contextractor.extract_one("https://example.com")
72
+ print(markdown) # str — markdown is the default format
73
+ ```
74
+
75
+ Request several formats to get a `dict` keyed by format:
76
+
77
+ ```python
78
+ contents = contextractor.extract_one(
79
+ "https://example.com",
80
+ formats=["markdown", "json", "original"],
81
+ )
82
+ print(contents["markdown"]) # extracted markdown
83
+ print(contents["original"]) # raw page HTML
84
+ ```
85
+
86
+ Formats: `txt`, `markdown` (default), `json`, `html`, `original` (the raw page
87
+ HTML). With one requested format the return value is a `str`; with several it is
88
+ a `dict[str, str]`. `extract_one` accepts the single-page subset of the crawl
89
+ options (`ExtractOneOptions`) — e.g. `proxy`, `mode`, `user_agent`, `cookies`,
90
+ `headers`, `headless` — but not crawl-frontier options like `globs` or
91
+ `max_crawl_depth`. A page that cannot be fetched or extracted raises
92
+ `ContextractorError`. If the page yields no content for one of several requested
93
+ formats, that format's key is simply absent from the returned `dict`; when the
94
+ single requested format yields no content, `ContextractorError` is raised.
95
+
96
+ ### Async single page
97
+
98
+ ```python
99
+ import asyncio
100
+ import contextractor
101
+
102
+ async def main():
103
+ markdown = await contextractor.aextract_one("https://example.com")
104
+ print(markdown)
105
+
106
+ asyncio.run(main())
107
+ ```
108
+
109
+ ## Return value
110
+
111
+ `extract()` / `aextract()` return an `ExtractSummary`:
112
+
113
+ | Field | Meaning |
114
+ | --- | --- |
115
+ | `total` | Number of records in the manifest |
116
+ | `succeeded` / `failed` / `skipped` | Counts by record `status` |
117
+ | `output_dir` | Absolute path where files + manifest were written |
118
+ | `manifest_path` | Absolute path to `manifest.json` |
119
+
120
+ Partial failures (some URLs failed) **do not raise** — they are reflected in
121
+ `summary.failed`. Validation/config errors and real crawl failures raise
122
+ `ContextractorError`; a missing browser raises `MissingBrowserError` pointing you
123
+ at `python -m contextractor install`.
124
+
125
+ ## Options
126
+
127
+ All crawl options are typed keyword arguments (`ExtractOptions`). A selection:
128
+
129
+ | Option | Type | Notes |
130
+ | --- | --- | --- |
131
+ | `save` | `list[str]` | `format-destination` tokens: `{txt,markdown,json,html,original}-{dataset,kvs}` (e.g. `markdown-kvs`, `original-dataset`). Default `markdown-kvs`; to save one format to both destinations, list it once per destination (e.g. `markdown-kvs` and `markdown-dataset`). Saving `original`/`html` to the dataset risks OOM on large pages |
132
+ | `mode` | `str` | `precision`, `balanced` (default), `recall` |
133
+ | `max_requests_per_crawl` | `int` | `0` = unlimited |
134
+ | `max_crawl_depth` | `int` | `0` = unlimited |
135
+ | `globs` / `exclude` | `list[str]` | enqueue / skip URL patterns |
136
+ | `headless` | `bool` | `False` runs a headed browser |
137
+ | `block_media` / `images` | `bool` | toggle media loading / image extraction |
138
+ | `links` / `comments` / `tables` | `bool` | `False` excludes that content |
139
+ | `proxy` | `list[str]` | `http`, `https`, `socks4`, `socks5` URLs |
140
+ | `cookies` | `list[dict]` | initial cookies (JSON) |
141
+ | `headers` | `dict[str, str]` | custom HTTP headers (JSON) |
142
+ | `selector` | `str` | restrict extraction to a CSS selector |
143
+ | `deduplication` | `str` | `minimal`, `standard` (default), `aggressive` |
144
+ | `output_dir` | `str` | where files + manifest are written |
145
+
146
+ Boolean options that have a CLI default emit a flag only when you set them.
147
+ Editor autocomplete and type-checkers see every option via the `ExtractOptions`
148
+ `TypedDict`.
149
+
150
+ ### Config file
151
+
152
+ Share configuration across runs with a JSON config file:
153
+
154
+ ```json
155
+ {
156
+ "mode": "precision",
157
+ "save": ["markdown-kvs", "json-dataset"],
158
+ "maxRequestsPerCrawl": 25,
159
+ "maxCrawlDepth": 2
160
+ }
161
+ ```
162
+
163
+ ```python
164
+ contextractor.extract(
165
+ ["https://example.com"],
166
+ config_file="config.json",
167
+ output_dir="./out",
168
+ )
169
+ ```
170
+
171
+ Keyword arguments override values from the config file.
172
+
173
+ ### Proxies
174
+
175
+ Only `http`, `https`, `socks4`, and `socks5` proxy URLs are accepted; an
176
+ unsupported scheme raises `ProxySchemeError` before anything runs. Proxy
177
+ credentials are never echoed in errors or logs.
178
+
179
+ ```python
180
+ contextractor.extract(
181
+ ["https://example.com"],
182
+ proxy=["http://user:pass@proxy.example.com:3128"],
183
+ output_dir="./out",
184
+ )
185
+ ```
186
+
187
+ ## Browser provisioning
188
+
189
+ Browsers are not bundled in the wheel. Run `python -m contextractor install` once
190
+ to download Chromium for the bundled engine. The standard
191
+ `PLAYWRIGHT_BROWSERS_PATH` environment variable is honored.
192
+
193
+ ## Advanced
194
+
195
+ - `CONTEXTRACTOR_NODE_PATH` — point at a host Node binary to use instead of the
196
+ bundled runtime.
197
+ - `storage_dir` — reuse a Crawlee storage directory across runs (defaults to a
198
+ private temporary directory cleaned up after each call).
199
+ - `timeout` — per-process wall-clock limit (seconds).
200
+
201
+ ## License
202
+
203
+ Apache-2.0
@@ -0,0 +1,140 @@
1
+ # contextractor (Python) — Specification
2
+
3
+ A library-only PyPI package that drives the Contextractor Node CLI from Python. It
4
+ **reimplements nothing**: `extract`/`aextract` spawn the bundled `dist/cli.js`,
5
+ translate snake_case options to CLI flags, let the CLI write to disk, then read the
6
+ export `manifest.json` back; `extract_one`/`aextract_one` spawn a single
7
+ `extract-one` child and return the content as values. Python loads no JavaScript
8
+ and no napi `.node` — Node does, when it runs `cli.js`. Standalone hatchling
9
+ package; **not** a pnpm/turbo workspace member.
10
+
11
+ ## Public API
12
+
13
+ `src/contextractor/__init__.py` exports:
14
+
15
+ - `extract(urls, *, output_dir=None, storage_dir=None, timeout=None, **opts) -> ExtractSummary` — sync, primary.
16
+ - `aextract(urls, *, output_dir=None, storage_dir=None, timeout=None, **opts) -> ExtractSummary` — async (one crawl per child process via `asyncio.create_subprocess_exec`).
17
+ - `extract_one(url, *, formats=None, timeout=None, **opts) -> str | dict[str, str]` — sync single-page extraction: crawls exactly one URL (no link-following) and returns the content as values. `formats` defaults to `"markdown"`; one requested format returns a `str`, several return a `dict[str, str]` keyed by format. Nothing is persisted. Raises `ContextractorError` when the page cannot be extracted.
18
+ - `aextract_one(url, *, formats=None, timeout=None, **opts) -> str | dict[str, str]` — async counterpart of `extract_one`.
19
+ - `install(browser="chromium") -> None` — provision a Playwright browser via the bundled engine. Also reachable as `python -m contextractor install [browser]`.
20
+ - `ExtractOptions` — `TypedDict(total=False)` of all crawl options (the typed surface for `extract`'s `**opts`).
21
+ - `ExtractOneOptions` — `TypedDict(total=False)` of the single-page option subset (the typed surface for `extract_one`'s `**opts`).
22
+ - `ExtractSummary` — frozen dataclass: `total`, `succeeded`, `failed`, `skipped`, `output_dir`, `manifest_path`.
23
+ - Errors: `ContextractorError` (base), `ProxySchemeError`, `NodeRuntimeError`, `MissingBrowserError`.
24
+ - `__version__` — read via `importlib.metadata.version("contextractor")`.
25
+
26
+ `urls` accepts a single string or a list. `output_dir` defaults to
27
+ `./contextractor-output` (resolved against the CWD). `storage_dir` defaults to a
28
+ private temp dir that is removed after the call so the manifest reflects only the
29
+ current run; an explicitly-passed `storage_dir` is preserved.
30
+
31
+ ## Orchestration (two invocations)
32
+
33
+ The CLI `extract` subcommand writes only to Crawlee storage and exits `2` on
34
+ partial failure; the separate `export` subcommand writes `<output_dir>/manifest.json`.
35
+ So each `extract()`/`aextract()` runs **two** child processes:
36
+
37
+ 1. `node cli.js extract <urls> --storage <STORAGE> <mapped flags>`
38
+ 2. `node cli.js export --storage <STORAGE> --output-dir <OUTPUT_DIR>`
39
+
40
+ then `read_summary(<OUTPUT_DIR>/manifest.json)`. One `--storage` path fully
41
+ identifies a run's storage (the CLI always uses the `default` buckets), so export
42
+ reads exactly what extract wrote — no bucket names are threaded.
43
+
44
+ ### Exit-code semantics (`_run.py`)
45
+
46
+ - extract `0` → continue; `2` → partial success, continue (do **not** raise); `1`/other → raise `ContextractorError`.
47
+ - extract-one `0` → success; `2` → partial (a requested format yielded no content — see the single-page section); `1`/other → raise.
48
+ - export `0` → read manifest; non-zero → raise.
49
+ - Both runners (`_run_sync` / `_run_async`) capture raw bytes and decode stdout/stderr as UTF-8 with `errors="replace"` — never the locale codec (Windows cp1252/cp932 mojibake) and never universal-newline translation (which would corrupt `original` raw HTML).
50
+ - Playwright "Executable doesn't exist" in stderr → `MissingBrowserError` pointing at `python -m contextractor install`.
51
+ - Child stderr is redacted (proxy credentials masked) before being surfaced; argv is never echoed when it carries a proxy.
52
+ - A `timeout` (sync or async) raises `ContextractorError("contextractor timed out")` — never the raw `subprocess.TimeoutExpired`, whose `cmd` would leak the `--proxy` argv.
53
+
54
+ ## Single-page orchestration (`extract_one`)
55
+
56
+ `extract_one()`/`aextract_one()` run **one** child process and return the content
57
+ as values — nothing is persisted, and no save/output/file/stdout options are
58
+ exposed; the wrapper drives the CLI `extract-one` subcommand internally:
59
+
60
+ - One requested format → `node cli.js extract-one <url> <mapped flags> --save <fmt>-stdout`; the child's stdout is the raw content (diagnostics go to stderr) and is returned as a `str`.
61
+ - Several formats → one `--save <fmt>-file` per format plus `--output <tempdir>/page`; the wrapper reads the files back into a `dict[str, str]` keyed by format, then removes the temp dir.
62
+ - Read-back names follow the CLI's multi-format `--output` prefix: `page.txt`, `page.md`, `page.json`, `page.html`; `original` lands at `page.original.html` only when `html` is also requested (the CLI's collision tag), else at `page.html`.
63
+ - `formats` accepts a string or a sequence, defaults to `"markdown"`, and deduplicates preserving order; an unknown format raises before spawn.
64
+ - Exit `0` → success. Exit `2` → partial: a requested format yielded no content (the CLI warns on stderr and skips that output). The multi-format dict simply omits that format's key — the npm library's `Partial<Record<…>>` semantics; the single-format route raises `ContextractorError("extract-one produced no <fmt> output")`, since a `str` cannot represent absence. Exit `1`/other → raise (hard failure).
65
+
66
+ ## Option mapping (`_options.py`)
67
+
68
+ A single data-driven table, `OPTION_SPECS`, applied immediately before spawn (per
69
+ `.claude/rules/python-option-mapping.md`). `ExtractOptions` keys must equal
70
+ `OPTION_SPECS` keys (enforced by `tests/test_options.py`). Categories:
71
+
72
+ - **scalar** → `--flag <value>` (e.g. `max_crawl_depth`, `mode`, `start_urls_file`, …).
73
+ - **bool-pair** → `--flag` / `--no-flag`: `headless`, `block_media`, `images`.
74
+ - **negation-only** (default include; `False` emits the `--no-` flag): `links`, `comments`, `tables`.
75
+ - **bare-switch** (`True` emits the flag): `purge`, `ignore_cors_and_csp`, `close_cookie_modals`, `ignore_https_errors`, `keep_url_fragment`, `use_sitemaps`, `respect_robots_txt`, `store_skipped_urls`, `verbose` (`-v`).
76
+ - **repeatable** (one flag per item): `proxy`, `globs`, `exclude`, `save` (`format-destination` tokens, e.g. `markdown-kvs`).
77
+ - **json** (`--flag <json.dumps>`): `cookies`, `headers`.
78
+
79
+ `storage_dir`, `output_dir`, `timeout` are explicit parameters, not in the table.
80
+ `dataset` / `key_value_store` / `request_queue` no longer exist — the CLI always
81
+ uses the `default` buckets under `--storage`.
82
+ `apify_proxy` / `groups` / `use_apify_proxy` are intentionally absent — the CLI
83
+ accepts only `http`/`https`/`socks4`/`socks5` proxies; unknown keys raise, and bad
84
+ proxy schemes raise `ProxySchemeError` before spawn.
85
+
86
+ `ExtractOneOptions` is the single-page subset of `ExtractOptions` (the proxy,
87
+ session, rendering, network, content, and verbosity knobs only).
88
+ `EXTRACT_ONE_OPTION_KEYS` (frozen from its annotations; enforced by
89
+ `tests/test_options.py` to stay a subset of `OPTION_SPECS`) gates
90
+ `build_extract_one_args`, which raises `ContextractorError` for every
91
+ `extract`-only key (crawl-frontier/storage/output options such as `globs`,
92
+ `selector`, `max_crawl_depth`, `save`, `purge`, plus `session_pool_name` —
93
+ cross-run session sharing needs persisted session-pool state under
94
+ `--storage`, which `extract-one` never touches) before delegating to
95
+ `build_extract_args`.
96
+
97
+ ## Runtime resolution (`_runtime.py`)
98
+
99
+ - `resolve_node()` — `CONTEXTRACTOR_NODE_PATH` override, else the `nodejs-wheel-binaries` binary at `nodejs_wheel.executable.ROOT_DIR` (`bin/node` on POSIX, `node.exe` on Windows). Restores the exec bit (POSIX) if a wheel ZIP dropped it.
100
+ - `vendor_cli_dir()` — context manager that materializes the staged `_vendor/cli` tree as a real directory via `importlib.resources.as_file()` (a no-op yielding the on-disk path for a normal wheel; extracts to a temp dir, removed on exit, when imported from a zip/pex/shiv). Stays open across the whole subprocess run so the tree outlives the child.
101
+ - `cli_js(cli_dir)` / `playwright_cli_js(cli_dir)` — resolve `dist/cli.js` and `node_modules/playwright/cli.js` inside that materialized tree; raise `NodeRuntimeError` if assets were not staged.
102
+
103
+ ## Asset bundling
104
+
105
+ The Node CLI ships **un-bundled** (plain `tsc` output; Crawlee/Playwright/commander
106
+ resolve assets via `__dirname`, so esbuild/ncc/SEA are forbidden). At wheel-build
107
+ time `scripts/stage_vendor.py` copies a `pnpm deploy --prod --config.node-linker=hoisted`
108
+ tree (npm-style real files — wheels can't carry pnpm's symlink store) into
109
+ `src/contextractor/_vendor/cli/`, restores `"type": "module"` (pnpm deploy strips
110
+ it), prunes every non-build-platform `.node` — both the bundled
111
+ `dist/native/contextractor-extraction-native.*.node` prebuilds and any legacy
112
+ `node_modules/@contextractor/extraction-native-*` packages (defensive only:
113
+ `@contextractor/*` are devDependencies now, so `pnpm deploy --prod` no longer
114
+ carries them) — and seeds an `__init__.py` in every subdir (for
115
+ `importlib.resources`).
116
+ `_vendor/cli` is gitignored and force-included via the wheel `artifacts` glob. The
117
+ Node runtime itself is **not** bundled (it comes from `nodejs-wheel-binaries`);
118
+ browsers are never bundled (`python -m contextractor install`).
119
+
120
+ ## Packaging & distribution
121
+
122
+ - Backend: hatchling + `hatch_build.py` (`pure_python=False`, explicit `tag` — pinned via `CONTEXTRACTOR_WHEEL_PLATFORM`, else inferred from `packaging.tags.sys_tags()`) → `py3-none-{platform}` wheels. Forbid maturin / scikit-build-core / uv_build.
123
+ - `version` is static in `pyproject.toml` (the `/git:release` and `/publish:all` bump target); `__version__` is read from installed metadata.
124
+ - `readme = "README.md"` → the PyPI project page (per `.claude/rules/user-facing-docs.md`); included in the sdist.
125
+ - Wheel matrix: `macosx_*_arm64`, `macosx_*_x86_64`, `manylinux_2_28_x86_64`, `manylinux_2_28_aarch64`, `win_amd64`, plus an sdist. **musl is unsupported** — the napi loader throws a clear import error rather than ship a broken `.node`.
126
+ - CI: `.github/workflows/release-pypi.yml` (cibuildwheel; `CIBW_BEFORE_ALL` stages `_vendor`; auditwheel/delocate repair disabled — there is no ELF Python extension; publish via PyPI Trusted Publishing / OIDC). It is sequenced **after** the napi-refresh PR opened by `build-napi.yml` for a `v*` tag, so wheels bundle current `.node` files — the gate is encoded in `/publish:all`.
127
+
128
+ ## Tests
129
+
130
+ `pytest` + `pytest-asyncio` (subprocess boundary mocked; no network): argv mapping
131
+ per category, manifest tally, exit-2-is-partial (for both `extract` and
132
+ `extract-one`, incl. the partial multi-format dict and the single-format raise),
133
+ exit-1/other raise, UTF-8/CRLF byte fidelity through the sync runner, async path,
134
+ proxy redaction (incl. the sync-timeout path), node/CLI resolution + exec-bit
135
+ restore, and the Windows-napi consistency check. Two non-mocked layers:
136
+ `tests/test_integration_real_cli.py` drives the repo-built
137
+ `packages/standalone/dist/cli.js` with the system `node` through a multi-format
138
+ `extract_one` against a local `http.server` page (guards the file-naming
139
+ contract the fake CLI re-implements; auto-skips when `node` or the built CLI is
140
+ absent), and one env-gated real e2e (`CONTEXTRACTOR_E2E`).
@@ -0,0 +1,40 @@
1
+ """Hatchling build hook forcing a platform-tagged wheel.
2
+
3
+ The wheel bundles a flattened Node CLI tree plus the build platform's native
4
+ `.node` addon, so it must be tagged `py3-none-{platform}` rather than
5
+ `py3-none-any`. The assets themselves are staged into
6
+ ``src/contextractor/_vendor/`` before the build by ``scripts/stage_vendor.py``
7
+ (driven by ``CIBW_BEFORE_ALL`` in CI); this hook only sets the wheel tag.
8
+
9
+ ``CONTEXTRACTOR_WHEEL_PLATFORM`` pins the platform tag explicitly (e.g.
10
+ ``manylinux_2_28_x86_64``). CI sets it per matrix row because auditwheel/delocate
11
+ repair is disabled — there is no ELF Python extension to relabel a bare
12
+ ``linux_x86_64`` wheel into a PyPI-acceptable ``manylinux`` tag. When unset (e.g. a
13
+ local ``python -m build`` on the native platform), the tag is inferred.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import os
19
+ from typing import Any
20
+
21
+ from hatchling.builders.hooks.plugin.interface import BuildHookInterface
22
+
23
+
24
class CustomBuildHook(BuildHookInterface):
    """Build hook that stamps the wheel with a ``py3-none-{platform}`` tag."""

    def initialize(self, version: str, build_data: dict[str, Any]) -> None:
        """Mark the wheel as platform-specific and set its full tag.

        The wheel carries a native ``.node`` file, so it cannot be
        ``py3-none-any``; there is no CPython extension, so the interpreter
        and ABI parts stay generic (``py3-none``) instead of something like
        ``cp312-cp312``.
        """
        build_data["pure_python"] = False
        platform_tag = self._platform()
        build_data["tag"] = f"py3-none-{platform_tag}"

    def _platform(self) -> str:
        """Return the platform part of the wheel tag.

        ``CONTEXTRACTOR_WHEEL_PLATFORM`` wins when it is set to a non-empty
        value; otherwise the platform of the first tag reported by
        ``packaging.tags.sys_tags()`` is used. The tag has to be right at
        build time because no wheel-repair step rewrites it afterwards.
        """
        override = os.environ.get("CONTEXTRACTOR_WHEEL_PLATFORM")
        if not override:
            # Imported lazily: only the inference path needs ``packaging``.
            from packaging.tags import sys_tags

            first_tag = next(iter(sys_tags()))
            return first_tag.platform
        return override