contextractor 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextractor-0.3.7.dist-info/METADATA +260 -0
- contextractor-0.3.7.dist-info/RECORD +14 -0
- contextractor-0.3.7.dist-info/WHEEL +4 -0
- contextractor-0.3.7.dist-info/entry_points.txt +2 -0
- contextractor_cli/__init__.py +1 -0
- contextractor_cli/__main__.py +5 -0
- contextractor_cli/config.py +159 -0
- contextractor_cli/crawler.py +323 -0
- contextractor_cli/main.py +333 -0
- contextractor_engine/__init__.py +22 -0
- contextractor_engine/extractor.py +70 -0
- contextractor_engine/models.py +126 -0
- contextractor_engine/py.typed +0 -0
- contextractor_engine/utils.py +34 -0
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: contextractor
|
|
3
|
+
Version: 0.3.7
|
|
4
|
+
Summary: Extract clean, readable content from any website
|
|
5
|
+
Project-URL: Homepage, https://contextractor.com
|
|
6
|
+
Project-URL: Repository, https://github.com/contextractor/contextractor
|
|
7
|
+
Project-URL: Issues, https://github.com/contextractor/contextractor/issues
|
|
8
|
+
Author-email: Miroslav Sekera <miroslav@glueo.com>
|
|
9
|
+
License-Expression: Apache-2.0
|
|
10
|
+
Requires-Python: >=3.12
|
|
11
|
+
Requires-Dist: browserforge<1.2.4
|
|
12
|
+
Requires-Dist: crawlee[playwright]>=0.4.0
|
|
13
|
+
Requires-Dist: pyyaml>=6.0
|
|
14
|
+
Requires-Dist: trafilatura>=2.0.0
|
|
15
|
+
Requires-Dist: typer>=0.15.0
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# Contextractor
|
|
19
|
+
|
|
20
|
+
Extract clean, readable content from any website using [Trafilatura](https://trafilatura.readthedocs.io/).
|
|
21
|
+
|
|
22
|
+
Available as: [pip](#install) | [npm](#install) | [Docker](#docker) | [Apify actor](https://apify.com/glueo/contextractor)
|
|
23
|
+
|
|
24
|
+
Try the [Playground](https://contextractor.com) to configure extraction settings and preview commands before running.
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install contextractor
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
or
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
npm install -g contextractor
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Requires Python 3.12+ (pip) or Node.js 18+ (npm). Playwright Chromium is installed automatically.
|
|
39
|
+
|
|
40
|
+
## Usage
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
contextractor https://example.com
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Works with zero config. Pass URLs directly, or use a config file for complex setups:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
contextractor https://example.com --precision --save-json -o ./results
|
|
50
|
+
contextractor --config config.json --max-pages 10
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### CLI Options
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
contextractor [OPTIONS] [URLS...]
|
|
57
|
+
|
|
58
|
+
Crawl Settings:
|
|
59
|
+
--config, -c Path to JSON config file
|
|
60
|
+
--output-dir, -o Output directory
|
|
61
|
+
--max-pages Max pages to crawl (0 = unlimited)
|
|
62
|
+
--crawl-depth Max link depth from start URLs (0 = start only)
|
|
63
|
+
--headless/--no-headless Browser headless mode (default: headless)
|
|
64
|
+
--max-concurrency Max parallel requests (default: 50)
|
|
65
|
+
--max-retries Max request retries (default: 3)
|
|
66
|
+
--max-results Max results per crawl (0 = unlimited)
|
|
67
|
+
|
|
68
|
+
Proxy:
|
|
69
|
+
--proxy-urls Comma-separated proxy URLs (http://user:pass@host:port)
|
|
70
|
+
--proxy-rotation Rotation: recommended, per_request, until_failure
|
|
71
|
+
|
|
72
|
+
Browser:
|
|
73
|
+
--launcher Browser engine: chromium, firefox (default: chromium)
|
|
74
|
+
--wait-until Page load event: load, networkidle, domcontentloaded (default: load)
|
|
75
|
+
--page-load-timeout Timeout in seconds (default: 60)
|
|
76
|
+
--ignore-cors Disable CORS/CSP restrictions
|
|
77
|
+
--close-cookie-modals Auto-dismiss cookie banners
|
|
78
|
+
--max-scroll-height Max scroll height in pixels (default: 5000)
|
|
79
|
+
--ignore-ssl-errors Skip SSL certificate verification
|
|
80
|
+
--user-agent Custom User-Agent string
|
|
81
|
+
|
|
82
|
+
Crawl Filtering:
|
|
83
|
+
--globs Comma-separated glob patterns to include
|
|
84
|
+
--excludes Comma-separated glob patterns to exclude
|
|
85
|
+
--link-selector CSS selector for links to follow
|
|
86
|
+
--keep-url-fragments Preserve URL fragments
|
|
87
|
+
--respect-robots-txt Honor robots.txt
|
|
88
|
+
|
|
89
|
+
Cookies & Headers:
|
|
90
|
+
--cookies JSON array of cookie objects
|
|
91
|
+
--headers JSON object of custom HTTP headers
|
|
92
|
+
|
|
93
|
+
Output Toggles:
|
|
94
|
+
--save-markdown/--no-save-markdown Save extracted markdown (default: true)
|
|
95
|
+
--save-raw-html Save raw HTML to output
|
|
96
|
+
--save-text Save extracted text
|
|
97
|
+
--save-json Save extracted JSON
|
|
98
|
+
--save-jsonl Save all pages as JSONL (single file)
|
|
99
|
+
--save-xml Save extracted XML
|
|
100
|
+
--save-xml-tei Save extracted XML-TEI
|
|
101
|
+
|
|
102
|
+
Content Extraction:
|
|
103
|
+
--precision High precision mode (less noise)
|
|
104
|
+
--recall High recall mode (more content)
|
|
105
|
+
--fast Fast extraction mode (less thorough)
|
|
106
|
+
--no-links Exclude links from output
|
|
107
|
+
--no-comments Exclude comments from output
|
|
108
|
+
--include-tables/--no-tables Include tables (default: include)
|
|
109
|
+
--include-images Include image descriptions
|
|
110
|
+
--include-formatting/--no-formatting Preserve formatting (default: preserve)
|
|
111
|
+
--deduplicate Deduplicate extracted content
|
|
112
|
+
--target-language Filter by language (e.g. "en")
|
|
113
|
+
--with-metadata/--no-metadata Extract metadata (default: with)
|
|
114
|
+
--prune-xpath XPath patterns to remove from content
|
|
115
|
+
|
|
116
|
+
Diagnostics:
|
|
117
|
+
--verbose, -v Enable verbose logging
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
CLI flags override config file settings. Merge order: `defaults → config file → CLI args`
|
|
121
|
+
|
|
122
|
+
### Config File (optional)
|
|
123
|
+
|
|
124
|
+
Use a JSON config file to set options:
|
|
125
|
+
|
|
126
|
+
```json
|
|
127
|
+
{
|
|
128
|
+
"urls": ["https://example.com", "https://docs.example.com"],
|
|
129
|
+
"saveMarkdown": true,
|
|
130
|
+
"outputDir": "./output",
|
|
131
|
+
"crawlDepth": 1,
|
|
132
|
+
"proxy": {
|
|
133
|
+
"urls": ["http://user:pass@host:port"],
|
|
134
|
+
"rotation": "recommended"
|
|
135
|
+
},
|
|
136
|
+
"extraction": {
|
|
137
|
+
"favorPrecision": true,
|
|
138
|
+
"includeLinks": true,
|
|
139
|
+
"includeTables": true,
|
|
140
|
+
"deduplicate": true
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Crawl Settings
|
|
146
|
+
|
|
147
|
+
| Field | Type | Default | Description |
|
|
148
|
+
|-------|------|---------|-------------|
|
|
149
|
+
| `urls` | array | `[]` | URLs to extract content from |
|
|
150
|
+
| `maxPages` | int | 0 | Max pages to crawl (0 = unlimited) |
|
|
151
|
+
| `outputDir` | string | `"./output"` | Directory for extracted content |
|
|
152
|
+
| `crawlDepth` | int | 0 | How deep to follow links (0 = start URLs only) |
|
|
153
|
+
| `headless` | bool | true | Browser headless mode |
|
|
154
|
+
| `maxConcurrency` | int | 50 | Max parallel browser pages |
|
|
155
|
+
| `maxRetries` | int | 3 | Max retries for failed requests |
|
|
156
|
+
| `maxResults` | int | 0 | Max results per crawl (0 = unlimited) |
|
|
157
|
+
|
|
158
|
+
### Proxy Configuration
|
|
159
|
+
|
|
160
|
+
| Field | Type | Default | Description |
|
|
161
|
+
|-------|------|---------|-------------|
|
|
162
|
+
| `proxy.urls` | array | `[]` | Proxy URLs (`http://user:pass@host:port` or `socks5://host:port`) |
|
|
163
|
+
| `proxy.rotation` | string | `"recommended"` | `recommended`, `per_request`, `until_failure` |
|
|
164
|
+
| `proxy.tiered` | array | `[]` | Tiered proxy escalation (config-file only) |
|
|
165
|
+
|
|
166
|
+
### Browser Settings
|
|
167
|
+
|
|
168
|
+
| Field | Type | Default | Description |
|
|
169
|
+
|-------|------|---------|-------------|
|
|
170
|
+
| `launcher` | string | `"chromium"` | Browser engine: `chromium`, `firefox` |
|
|
171
|
+
| `waitUntil` | string | `"load"` | Page load event: `load`, `networkidle`, `domcontentloaded` |
|
|
172
|
+
| `pageLoadTimeout` | int | 60 | Page load timeout in seconds |
|
|
173
|
+
| `ignoreCors` | bool | false | Disable CORS/CSP restrictions |
|
|
174
|
+
| `closeCookieModals` | bool | true | Auto-dismiss cookie consent banners |
|
|
175
|
+
| `maxScrollHeight` | int | 5000 | Max scroll height in pixels (0 = disable) |
|
|
176
|
+
| `ignoreSslErrors` | bool | false | Skip SSL certificate verification |
|
|
177
|
+
| `userAgent` | string | `""` | Custom User-Agent string |
|
|
178
|
+
|
|
179
|
+
### Crawl Filtering
|
|
180
|
+
|
|
181
|
+
| Field | Type | Default | Description |
|
|
182
|
+
|-------|------|---------|-------------|
|
|
183
|
+
| `globs` | array | `[]` | Glob patterns for URLs to include |
|
|
184
|
+
| `excludes` | array | `[]` | Glob patterns for URLs to exclude |
|
|
185
|
+
| `linkSelector` | string | `""` | CSS selector for links to follow |
|
|
186
|
+
| `keepUrlFragments` | bool | false | Treat URLs with different fragments as different pages |
|
|
187
|
+
| `respectRobotsTxt` | bool | false | Honor robots.txt |
|
|
188
|
+
|
|
189
|
+
### Cookies & Headers
|
|
190
|
+
|
|
191
|
+
| Field | Type | Default | Description |
|
|
192
|
+
|-------|------|---------|-------------|
|
|
193
|
+
| `cookies` | array | `[]` | Initial cookies (`[{"name": "...", "value": "...", "domain": "..."}]`) |
|
|
194
|
+
| `headers` | object | `{}` | Custom HTTP headers (`{"Authorization": "Bearer token"}`) |
|
|
195
|
+
|
|
196
|
+
### Output Toggles
|
|
197
|
+
|
|
198
|
+
Each toggle saves its format independently. Multiple can be enabled at once:
|
|
199
|
+
|
|
200
|
+
| Field | Type | Default | Description |
|
|
201
|
+
|-------|------|---------|-------------|
|
|
202
|
+
| `saveMarkdown` | bool | true | Save extracted markdown |
|
|
203
|
+
| `saveRawHtml` | bool | false | Save raw HTML |
|
|
204
|
+
| `saveText` | bool | false | Save extracted plain text |
|
|
205
|
+
| `saveJson` | bool | false | Save extracted JSON |
|
|
206
|
+
| `saveJsonl` | bool | false | Save all pages as JSONL (single file) |
|
|
207
|
+
| `saveXml` | bool | false | Save extracted XML |
|
|
208
|
+
| `saveXmlTei` | bool | false | Save extracted XML-TEI |
|
|
209
|
+
|
|
210
|
+
### Content Extraction
|
|
211
|
+
|
|
212
|
+
All options go under the `extraction` key in config files, or use the equivalent CLI flags:
|
|
213
|
+
|
|
214
|
+
| Field | Type | Default | Description |
|
|
215
|
+
|-------|------|---------|-------------|
|
|
216
|
+
| `favorPrecision` | bool | false | High precision, less noise |
|
|
217
|
+
| `favorRecall` | bool | false | High recall, more content |
|
|
218
|
+
| `includeComments` | bool | true | Include comments |
|
|
219
|
+
| `includeTables` | bool | true | Include tables |
|
|
220
|
+
| `includeImages` | bool | false | Include image descriptions |
|
|
221
|
+
| `includeFormatting` | bool | true | Preserve formatting |
|
|
222
|
+
| `includeLinks` | bool | true | Include links |
|
|
223
|
+
| `deduplicate` | bool | false | Deduplicate content |
|
|
224
|
+
| `withMetadata` | bool | true | Extract metadata (title, author, date) |
|
|
225
|
+
| `targetLanguage` | string | null | Filter by language (e.g. `"en"`) |
|
|
226
|
+
| `fast` | bool | false | Fast mode (less thorough) |
|
|
227
|
+
| `pruneXpath` | array | null | XPath patterns to remove from content |
|
|
228
|
+
|
|
229
|
+
## Docker
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
docker run ghcr.io/contextractor/contextractor https://example.com
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
Save output to your local machine:
|
|
236
|
+
|
|
237
|
+
```bash
|
|
238
|
+
docker run -v ./output:/output ghcr.io/contextractor/contextractor https://example.com -o /output
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
Use a config file:
|
|
242
|
+
|
|
243
|
+
```bash
|
|
244
|
+
docker run -v ./config.json:/config.json ghcr.io/contextractor/contextractor --config /config.json
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
All CLI flags work the same inside Docker.
|
|
248
|
+
|
|
249
|
+
## Output
|
|
250
|
+
|
|
251
|
+
One file per crawled page, named from the URL slug (e.g. `example-com-page.md`). Metadata (title, author, date) is included in the output header when available.
|
|
252
|
+
|
|
253
|
+
## Platforms
|
|
254
|
+
|
|
255
|
+
- npm: macOS arm64, Linux (x64, arm64), Windows x64
|
|
256
|
+
- Docker: linux/amd64, linux/arm64
|
|
257
|
+
|
|
258
|
+
## License
|
|
259
|
+
|
|
260
|
+
Apache-2.0
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
contextractor_cli/__init__.py,sha256=jO_XdncP1L1FKAbLCKnkAgwYh_ou9LhYFwBZL9YBZuY,66
|
|
2
|
+
contextractor_cli/__main__.py,sha256=hRU0ca8FU89pZuU0ZuyIYp6DrgKD_toAC0TM-F9GGrg,81
|
|
3
|
+
contextractor_cli/config.py,sha256=ksdOFvabIHt7Me2Ov-A9S7VqVj-Bx0apLyHTYDxoXcc,6372
|
|
4
|
+
contextractor_cli/crawler.py,sha256=ofXzue7SYC8_88u5TakxLzwmPMkPVNcFjd25UkmJc6c,12456
|
|
5
|
+
contextractor_cli/main.py,sha256=z11ZV5Gpt8Z1BCECYXoAJouGXCcP57su51Ra3bUslJI,11471
|
|
6
|
+
contextractor_engine/__init__.py,sha256=el3zmnxW5l6b5bj79hGZ95gC4CFjM0eLNv7b60RNw3I,596
|
|
7
|
+
contextractor_engine/extractor.py,sha256=_gRPl5FomRJvZquFDXuyLLLznhNFbZQH2zBVUtVaivQ,2478
|
|
8
|
+
contextractor_engine/models.py,sha256=XX14iZt19nxQLfGRDcXf0me6KKbTjQMw-RX5QjOUXFo,4212
|
|
9
|
+
contextractor_engine/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
contextractor_engine/utils.py,sha256=ma8NWrKficfQejWnnSDEA6WrEWqqEYolbtaurl7t1MA,1149
|
|
11
|
+
contextractor-0.3.7.dist-info/METADATA,sha256=NTGMXnYT7a-4l8FsH9n3oiZhnxCRh3IaCOm8j4CEd98,9481
|
|
12
|
+
contextractor-0.3.7.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
13
|
+
contextractor-0.3.7.dist-info/entry_points.txt,sha256=jpnlbk9gdLVAReuc9--8SumbPbpgIe10DrGvlx3d2Co,61
|
|
14
|
+
contextractor-0.3.7.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Contextractor CLI - Standalone web content extraction tool."""
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""Configuration loading from JSON files (YAML also supported)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import dataclass, field, fields as dataclass_fields
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import yaml
|
|
11
|
+
|
|
12
|
+
from contextractor_engine import TrafilaturaConfig
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Names of TrafilaturaConfig dataclass fields; merge() uses this set to
# decide whether an override key belongs to the extraction sub-config.
_EXTRACTION_FIELDS = {field_.name for field_ in dataclass_fields(TrafilaturaConfig)}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class CrawlConfig:
    """Configuration for a crawl run.

    Field defaults mirror the documented CLI defaults; `from_dict` maps the
    camelCase JSON/YAML config keys (including Apify-style aliases) onto
    these snake_case fields.
    """

    # Core
    urls: list[str] = field(default_factory=list)
    max_pages: int = 0
    output_dir: str = "./output"
    crawl_depth: int = 0
    headless: bool = True
    extraction: TrafilaturaConfig = field(default_factory=TrafilaturaConfig.balanced)

    # Proxy
    proxy_urls: list[str] = field(default_factory=list)
    proxy_rotation: str = "recommended"
    proxy_tiered: list[list[str | None]] = field(default_factory=list)

    # Browser
    launcher: str = "chromium"
    wait_until: str = "load"
    page_load_timeout: int = 60
    ignore_cors: bool = False
    close_cookie_modals: bool = True
    max_scroll_height: int = 5000
    ignore_ssl_errors: bool = False
    user_agent: str = ""

    # Crawl filtering
    globs: list[str] = field(default_factory=list)
    excludes: list[str] = field(default_factory=list)
    link_selector: str = ""
    keep_url_fragments: bool = False
    respect_robots_txt: bool = False

    # Cookies & headers
    cookies: list[dict[str, Any]] = field(default_factory=list)
    headers: dict[str, str] = field(default_factory=dict)

    # Concurrency & retries
    max_concurrency: int = 50
    max_retries: int = 3
    max_results: int = 0

    # Output toggles
    save_markdown: bool = True
    save_raw_html: bool = False
    save_text: bool = False
    save_json: bool = False
    save_jsonl: bool = False
    save_xml: bool = False
    save_xml_tei: bool = False

    @classmethod
    def from_file(cls, path: Path) -> CrawlConfig:
        """Load config from a JSON (or YAML) file.

        The file suffix picks the parser; an unrecognized suffix is parsed
        as YAML first (JSON is a YAML subset) with JSON as the fallback.
        """
        raw = path.read_text(encoding="utf-8")
        suffix = path.suffix
        if suffix == ".json":
            parsed = json.loads(raw)
        elif suffix in (".yaml", ".yml"):
            parsed = yaml.safe_load(raw) or {}
        else:
            try:
                parsed = yaml.safe_load(raw) or {}
            except yaml.YAMLError:
                parsed = json.loads(raw)
        return cls.from_dict(parsed)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> CrawlConfig:
        """Create config from a dictionary of camelCase keys.

        Several keys accept an Apify-style alias (e.g. ``maxRequestRetries``
        for ``maxRetries``); the alias takes precedence when both appear.
        """
        # Proxy settings live under a nested "proxy" object when present.
        proxy = data.get("proxy", {})
        if not isinstance(proxy, dict):
            proxy = {}

        def pick(*keys: str, default: Any = None) -> Any:
            # First present key wins; the last key carries the default.
            for alias in keys[:-1]:
                if alias in data:
                    return data[alias]
            return data.get(keys[-1], default)

        return cls(
            urls=data.get("urls", []),
            max_pages=data.get("maxPages", 0),
            output_dir=data.get("outputDir", "./output"),
            crawl_depth=data.get("crawlDepth", 0),
            headless=data.get("headless", True),
            extraction=TrafilaturaConfig.from_json_dict(data.get("extraction")),
            # Proxy
            proxy_urls=proxy.get("urls", []),
            proxy_rotation=proxy.get("rotation", "recommended"),
            proxy_tiered=proxy.get("tiered", []),
            # Browser
            launcher=data.get("launcher", "chromium").lower(),
            wait_until=data.get("waitUntil", "load").lower(),
            page_load_timeout=pick("pageLoadTimeoutSecs", "pageLoadTimeout", default=60),
            ignore_cors=pick("ignoreCorsAndCsp", "ignoreCors", default=False),
            close_cookie_modals=data.get("closeCookieModals", True),
            max_scroll_height=pick("maxScrollHeightPixels", "maxScrollHeight", default=5000),
            ignore_ssl_errors=data.get("ignoreSslErrors", False),
            user_agent=data.get("userAgent", ""),
            # Crawl filtering
            globs=data.get("globs", []),
            excludes=data.get("excludes", []),
            link_selector=data.get("linkSelector", ""),
            keep_url_fragments=data.get("keepUrlFragments", False),
            respect_robots_txt=pick("respectRobotsTxtFile", "respectRobotsTxt", default=False),
            # Cookies & headers
            cookies=pick("initialCookies", "cookies", default=[]),
            headers=pick("customHttpHeaders", "headers", default={}),
            # Concurrency & retries
            max_concurrency=data.get("maxConcurrency", 50),
            max_retries=pick("maxRequestRetries", "maxRetries", default=3),
            max_results=pick("maxResultsPerCrawl", "maxResults", default=0),
            # Output toggles
            save_markdown=pick("saveMarkdown", "saveExtractedMarkdownToKeyValueStore", default=True),
            save_raw_html=pick("saveRawHtml", "saveRawHtmlToKeyValueStore", default=False),
            save_text=pick("saveText", "saveExtractedTextToKeyValueStore", default=False),
            save_json=pick("saveJson", "saveExtractedJsonToKeyValueStore", default=False),
            save_jsonl=data.get("saveJsonl", False),
            save_xml=pick("saveXml", "saveExtractedXmlToKeyValueStore", default=False),
            save_xml_tei=pick("saveXmlTei", "saveExtractedXmlTeiToKeyValueStore", default=False),
        )

    def merge(self, overrides: dict[str, Any]) -> None:
        """Merge non-None overrides into this config.

        Keys matching TrafilaturaConfig fields are routed to self.extraction.
        Keys matching CrawlConfig fields are set directly.
        Unknown keys are ignored.
        """
        own_fields = {f.name for f in dataclass_fields(self)}
        own_fields.discard("extraction")

        for key, value in overrides.items():
            if value is None:
                continue  # None means "not supplied on the CLI"
            if key in own_fields:
                setattr(self, key, value)
            elif key in _EXTRACTION_FIELDS:
                setattr(self.extraction, key, value)
|
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
"""Crawling and content extraction using crawlee."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
import re
|
|
10
|
+
from datetime import timedelta
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from crawlee import Request
|
|
15
|
+
from crawlee._autoscaling.autoscaled_pool import ConcurrencySettings
|
|
16
|
+
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
|
|
17
|
+
from crawlee.crawlers._playwright._types import GotoOptions
|
|
18
|
+
from crawlee.proxy_configuration import ProxyConfiguration
|
|
19
|
+
|
|
20
|
+
from contextractor_engine import ContentExtractor
|
|
21
|
+
|
|
22
|
+
from .config import CrawlConfig
|
|
23
|
+
|
|
24
|
+
# Shared logger for all crawler output.
logger = logging.getLogger("contextractor")

# Maps trafilatura output-format names to the file extension used on disk.
FORMAT_EXTENSIONS = {
    fmt: ext
    for fmt, ext in (
        ("txt", ".txt"),
        ("markdown", ".md"),
        ("json", ".json"),
        ("xml", ".xml"),
        ("xmltei", ".tei.xml"),
    )
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _url_to_filename(url: str) -> str:
|
|
36
|
+
"""Convert a URL to a safe filename slug."""
|
|
37
|
+
# Remove protocol
|
|
38
|
+
slug = re.sub(r"^https?://", "", url)
|
|
39
|
+
# Replace non-alphanumeric chars with hyphens
|
|
40
|
+
slug = re.sub(r"[^a-zA-Z0-9]+", "-", slug)
|
|
41
|
+
# Remove leading/trailing hyphens
|
|
42
|
+
slug = slug.strip("-")
|
|
43
|
+
# Truncate and add hash for uniqueness
|
|
44
|
+
if len(slug) > 100:
|
|
45
|
+
url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
|
|
46
|
+
slug = f"{slug[:100]}-{url_hash}"
|
|
47
|
+
return slug
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _build_browser_launch_options(config: CrawlConfig) -> dict[str, Any]:
|
|
51
|
+
"""Build browser launch options from config."""
|
|
52
|
+
options: dict[str, Any] = {}
|
|
53
|
+
args = []
|
|
54
|
+
|
|
55
|
+
# Anti-detection: prevent navigator.webdriver=true (Chromium only)
|
|
56
|
+
if config.launcher == "chromium":
|
|
57
|
+
args.append("--disable-blink-features=AutomationControlled")
|
|
58
|
+
|
|
59
|
+
# Disable Chromium sandbox in Docker (set CONTEXTRACTOR_NO_SANDBOX=1)
|
|
60
|
+
if os.environ.get("CONTEXTRACTOR_NO_SANDBOX"):
|
|
61
|
+
args.append("--no-sandbox")
|
|
62
|
+
|
|
63
|
+
if args:
|
|
64
|
+
options["args"] = args
|
|
65
|
+
|
|
66
|
+
if config.ignore_ssl_errors:
|
|
67
|
+
options["ignore_https_errors"] = True
|
|
68
|
+
|
|
69
|
+
return options
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _build_browser_context_options(config: CrawlConfig) -> dict[str, Any] | None:
|
|
73
|
+
"""Build browser context options from config."""
|
|
74
|
+
options: dict[str, Any] = {}
|
|
75
|
+
|
|
76
|
+
if config.ignore_cors:
|
|
77
|
+
options["bypass_csp"] = True
|
|
78
|
+
|
|
79
|
+
if config.cookies:
|
|
80
|
+
options["storage_state"] = {"cookies": config.cookies}
|
|
81
|
+
|
|
82
|
+
if config.headers:
|
|
83
|
+
options["extra_http_headers"] = config.headers
|
|
84
|
+
|
|
85
|
+
if config.user_agent:
|
|
86
|
+
options["user_agent"] = config.user_agent
|
|
87
|
+
|
|
88
|
+
return options if options else None
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
async def run_crawl(config: CrawlConfig) -> None:
|
|
92
|
+
"""Run the crawl with the given configuration."""
|
|
93
|
+
output_dir = Path(config.output_dir)
|
|
94
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
95
|
+
|
|
96
|
+
extractor = ContentExtractor(config=config.extraction)
|
|
97
|
+
pages_extracted = 0
|
|
98
|
+
max_results = config.max_results
|
|
99
|
+
|
|
100
|
+
# Configure proxy (tiered takes precedence over flat proxy_urls)
|
|
101
|
+
proxy_cfg = None
|
|
102
|
+
if config.proxy_tiered:
|
|
103
|
+
proxy_cfg = ProxyConfiguration(tiered_proxy_urls=config.proxy_tiered)
|
|
104
|
+
logger.info(f"Using tiered proxy with {len(config.proxy_tiered)} tier(s)")
|
|
105
|
+
elif config.proxy_urls:
|
|
106
|
+
proxy_cfg = ProxyConfiguration(proxy_urls=config.proxy_urls)
|
|
107
|
+
logger.info(f"Using {len(config.proxy_urls)} proxy URL(s), rotation: {config.proxy_rotation}")
|
|
108
|
+
if config.proxy_rotation == "until_failure":
|
|
109
|
+
logger.warning(
|
|
110
|
+
"proxy_rotation 'until_failure' uses round-robin rotation; "
|
|
111
|
+
"full sticky-session behavior requires Crawlee SessionPool integration"
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
# Build browser options
|
|
115
|
+
browser_launch_options = _build_browser_launch_options(config)
|
|
116
|
+
browser_context_options = _build_browser_context_options(config)
|
|
117
|
+
|
|
118
|
+
# Build crawler kwargs
|
|
119
|
+
crawler_kwargs: dict[str, Any] = {
|
|
120
|
+
"headless": config.headless,
|
|
121
|
+
"browser_type": config.launcher,
|
|
122
|
+
"browser_launch_options": browser_launch_options,
|
|
123
|
+
"max_requests_per_crawl": config.max_pages if config.max_pages > 0 else None,
|
|
124
|
+
"max_request_retries": config.max_retries,
|
|
125
|
+
"request_handler_timeout": timedelta(seconds=config.page_load_timeout),
|
|
126
|
+
"concurrency_settings": ConcurrencySettings(
|
|
127
|
+
max_concurrency=config.max_concurrency,
|
|
128
|
+
desired_concurrency=min(10, config.max_concurrency),
|
|
129
|
+
),
|
|
130
|
+
"respect_robots_txt_file": config.respect_robots_txt,
|
|
131
|
+
"max_crawl_depth": config.crawl_depth if config.crawl_depth > 0 else None,
|
|
132
|
+
"goto_options": GotoOptions(wait_until=config.wait_until),
|
|
133
|
+
}
|
|
134
|
+
if proxy_cfg:
|
|
135
|
+
crawler_kwargs["proxy_configuration"] = proxy_cfg
|
|
136
|
+
if browser_context_options:
|
|
137
|
+
crawler_kwargs["browser_new_context_options"] = browser_context_options
|
|
138
|
+
|
|
139
|
+
crawler = PlaywrightCrawler(**crawler_kwargs)
|
|
140
|
+
|
|
141
|
+
@crawler.router.default_handler
|
|
142
|
+
async def handler(context: PlaywrightCrawlingContext) -> None:
|
|
143
|
+
nonlocal pages_extracted
|
|
144
|
+
url = context.request.url
|
|
145
|
+
logger.info(f"Processing {url}")
|
|
146
|
+
|
|
147
|
+
# Check max results limit
|
|
148
|
+
if max_results > 0 and pages_extracted >= max_results:
|
|
149
|
+
logger.info(f"Reached max results limit ({max_results}), skipping {url}")
|
|
150
|
+
return
|
|
151
|
+
|
|
152
|
+
# Auto-dismiss cookie modals (CMP-aware)
|
|
153
|
+
if config.close_cookie_modals:
|
|
154
|
+
try:
|
|
155
|
+
await context.page.evaluate("""
|
|
156
|
+
() => {
|
|
157
|
+
// 1. Didomi CMP
|
|
158
|
+
if (window.Didomi) {
|
|
159
|
+
try { window.Didomi.setUserAgreeToAll(); return; } catch {}
|
|
160
|
+
}
|
|
161
|
+
// 2. OneTrust
|
|
162
|
+
const onetrust = document.querySelector('#onetrust-accept-btn-handler');
|
|
163
|
+
if (onetrust) { onetrust.click(); return; }
|
|
164
|
+
// 3. CookieBot
|
|
165
|
+
const cookiebot = document.querySelector('#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll');
|
|
166
|
+
if (cookiebot) { cookiebot.click(); return; }
|
|
167
|
+
// 4. Quantcast / TCF
|
|
168
|
+
const quantcast = document.querySelector('.qc-cmp2-summary-buttons button[mode="primary"]');
|
|
169
|
+
if (quantcast) { quantcast.click(); return; }
|
|
170
|
+
// 5. Generic fallback selectors
|
|
171
|
+
const selectors = [
|
|
172
|
+
'[class*="cookie"] button', '[id*="cookie"] button',
|
|
173
|
+
'[class*="consent"] button', '[id*="consent"] button',
|
|
174
|
+
'button[class*="accept"]', 'button[id*="accept"]',
|
|
175
|
+
];
|
|
176
|
+
for (const sel of selectors) {
|
|
177
|
+
const btn = document.querySelector(sel);
|
|
178
|
+
if (btn) { btn.click(); return; }
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
""")
|
|
182
|
+
await context.page.wait_for_timeout(1000)
|
|
183
|
+
except Exception:
|
|
184
|
+
pass # Best effort
|
|
185
|
+
|
|
186
|
+
# Scroll page to load dynamic content
|
|
187
|
+
if config.max_scroll_height > 0:
|
|
188
|
+
try:
|
|
189
|
+
await context.page.evaluate(f"""
|
|
190
|
+
async () => {{
|
|
191
|
+
let scrolled = 0;
|
|
192
|
+
const maxScroll = {config.max_scroll_height};
|
|
193
|
+
while (scrolled < maxScroll) {{
|
|
194
|
+
window.scrollBy(0, 500);
|
|
195
|
+
scrolled += 500;
|
|
196
|
+
await new Promise(r => setTimeout(r, 100));
|
|
197
|
+
}}
|
|
198
|
+
window.scrollTo(0, 0);
|
|
199
|
+
}}
|
|
200
|
+
""")
|
|
201
|
+
except Exception:
|
|
202
|
+
pass # Best effort
|
|
203
|
+
|
|
204
|
+
html = await context.page.content()
|
|
205
|
+
slug = _url_to_filename(url)
|
|
206
|
+
|
|
207
|
+
# Extract metadata for text-based format headers
|
|
208
|
+
metadata = extractor.extract_metadata(html, url=url)
|
|
209
|
+
|
|
210
|
+
def _build_text_content(raw_content: str, fmt: str) -> str:
    """Prepend a metadata header (Title/Author/Date/URL) for markdown/txt formats.

    Closes over ``metadata`` and ``url`` from the enclosing request handler.
    Formats other than "markdown"/"txt", or pages with no title/author/date,
    get the raw content back unchanged.
    """
    lines: list[str] = []
    has_header_fields = metadata.title or metadata.author or metadata.date
    if has_header_fields and fmt in ("markdown", "txt"):
        for label, value in (
            ("Title", metadata.title),
            ("Author", metadata.author),
            ("Date", metadata.date),
        ):
            if value:
                lines.append(f"{label}: {value}")
        # URL is always part of the header block when any metadata is present.
        lines.append(f"URL: {url}")
        lines.extend(["", "---", ""])
    lines.append(raw_content)
    return "\n".join(lines)
|
|
226
|
+
|
|
227
|
+
# Track whether any content was extracted
|
|
228
|
+
any_saved = False
|
|
229
|
+
|
|
230
|
+
# Save each enabled format
|
|
231
|
+
if config.save_markdown:
|
|
232
|
+
result = extractor.extract(html, url=url, output_format="markdown")
|
|
233
|
+
if result:
|
|
234
|
+
content = _build_text_content(result.content, "markdown")
|
|
235
|
+
filepath = output_dir / f"{slug}.md"
|
|
236
|
+
filepath.write_text(content, encoding="utf-8")
|
|
237
|
+
logger.info(f"Saved {filepath}")
|
|
238
|
+
any_saved = True
|
|
239
|
+
|
|
240
|
+
if config.save_text:
|
|
241
|
+
result = extractor.extract(html, url=url, output_format="txt")
|
|
242
|
+
if result:
|
|
243
|
+
content = _build_text_content(result.content, "txt")
|
|
244
|
+
filepath = output_dir / f"{slug}.txt"
|
|
245
|
+
filepath.write_text(content, encoding="utf-8")
|
|
246
|
+
logger.info(f"Saved {filepath}")
|
|
247
|
+
any_saved = True
|
|
248
|
+
|
|
249
|
+
if config.save_json:
|
|
250
|
+
result = extractor.extract(html, url=url, output_format="json")
|
|
251
|
+
if result:
|
|
252
|
+
filepath = output_dir / f"{slug}.json"
|
|
253
|
+
filepath.write_text(result.content, encoding="utf-8")
|
|
254
|
+
logger.info(f"Saved {filepath}")
|
|
255
|
+
any_saved = True
|
|
256
|
+
|
|
257
|
+
if config.save_jsonl:
|
|
258
|
+
result = extractor.extract(html, url=url, output_format="markdown")
|
|
259
|
+
if result:
|
|
260
|
+
jsonl_path = output_dir / "output.jsonl"
|
|
261
|
+
entry = {
|
|
262
|
+
"url": url,
|
|
263
|
+
"title": metadata.title or "",
|
|
264
|
+
"author": metadata.author or "",
|
|
265
|
+
"date": metadata.date or "",
|
|
266
|
+
"content": result.content,
|
|
267
|
+
}
|
|
268
|
+
with open(jsonl_path, "a", encoding="utf-8") as f:
|
|
269
|
+
f.write(json.dumps(entry, ensure_ascii=False) + "\n")
|
|
270
|
+
logger.info(f"Appended to {jsonl_path}")
|
|
271
|
+
any_saved = True
|
|
272
|
+
|
|
273
|
+
if config.save_xml:
|
|
274
|
+
result = extractor.extract(html, url=url, output_format="xml")
|
|
275
|
+
if result:
|
|
276
|
+
filepath = output_dir / f"{slug}.xml"
|
|
277
|
+
filepath.write_text(result.content, encoding="utf-8")
|
|
278
|
+
logger.info(f"Saved {filepath}")
|
|
279
|
+
any_saved = True
|
|
280
|
+
|
|
281
|
+
if config.save_xml_tei:
|
|
282
|
+
result = extractor.extract(html, url=url, output_format="xmltei")
|
|
283
|
+
if result:
|
|
284
|
+
filepath = output_dir / f"{slug}.tei.xml"
|
|
285
|
+
filepath.write_text(result.content, encoding="utf-8")
|
|
286
|
+
logger.info(f"Saved {filepath}")
|
|
287
|
+
any_saved = True
|
|
288
|
+
|
|
289
|
+
if config.save_raw_html:
|
|
290
|
+
filepath = output_dir / f"{slug}.html"
|
|
291
|
+
filepath.write_text(html, encoding="utf-8")
|
|
292
|
+
logger.info(f"Saved {filepath}")
|
|
293
|
+
any_saved = True
|
|
294
|
+
|
|
295
|
+
if not any_saved:
|
|
296
|
+
logger.warning(f"No content extracted from {url}")
|
|
297
|
+
return
|
|
298
|
+
|
|
299
|
+
pages_extracted += 1
|
|
300
|
+
|
|
301
|
+
# Enqueue links if crawl depth is configured (Crawlee handles depth limiting natively)
|
|
302
|
+
if config.crawl_depth > 0:
|
|
303
|
+
enqueue_kwargs: dict[str, Any] = {}
|
|
304
|
+
if config.link_selector:
|
|
305
|
+
enqueue_kwargs["selector"] = config.link_selector
|
|
306
|
+
if config.globs:
|
|
307
|
+
enqueue_kwargs["globs"] = config.globs
|
|
308
|
+
if config.excludes:
|
|
309
|
+
enqueue_kwargs["exclude_globs"] = config.excludes
|
|
310
|
+
|
|
311
|
+
await context.enqueue_links(**enqueue_kwargs)
|
|
312
|
+
|
|
313
|
+
# Build requests
|
|
314
|
+
requests = [
|
|
315
|
+
Request.from_url(
|
|
316
|
+
url,
|
|
317
|
+
keep_url_fragment=config.keep_url_fragments,
|
|
318
|
+
)
|
|
319
|
+
for url in config.urls
|
|
320
|
+
]
|
|
321
|
+
|
|
322
|
+
await crawler.run(requests)
|
|
323
|
+
logger.info(f"Done. Extracted {pages_extracted} pages to {output_dir}")
|
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
"""CLI entry point using Typer."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Annotated, Optional
|
|
9
|
+
|
|
10
|
+
import typer
|
|
11
|
+
|
|
12
|
+
from .config import CrawlConfig
|
|
13
|
+
from .crawler import run_crawl
|
|
14
|
+
|
|
15
|
+
# Top-level Typer application; the `extract` command below registers onto it.
app = typer.Typer(
    name="contextractor",
    help="Extract web content from URLs using configurable extraction options.",
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@app.command()
def extract(
    urls: Annotated[
        Optional[list[str]],
        typer.Argument(help="URLs to extract content from"),
    ] = None,
    # -- Config file --
    config: Annotated[
        Optional[Path],
        typer.Option("--config", "-c", help="Path to JSON config file",
                     exists=True, readable=True),
    ] = None,
    # -- CrawlConfig fields --
    max_pages: Annotated[
        Optional[int],
        typer.Option("--max-pages", help="Max pages to crawl (0 = unlimited)"),
    ] = None,
    crawl_depth: Annotated[
        Optional[int],
        typer.Option("--crawl-depth", help="Max link depth from start URLs (0 = start only)"),
    ] = None,
    headless: Annotated[
        Optional[bool],
        typer.Option("--headless/--no-headless", help="Run browser in headless mode"),
    ] = None,
    output_dir: Annotated[
        Optional[str],
        typer.Option("--output-dir", "-o", help="Output directory"),
    ] = None,
    # -- Proxy --
    proxy_urls: Annotated[
        Optional[str],
        typer.Option("--proxy-urls",
                     help="Comma-separated proxy URLs (http://user:pass@host:port)"),
    ] = None,
    proxy_rotation: Annotated[
        Optional[str],
        typer.Option("--proxy-rotation",
                     help="Proxy rotation: recommended, per_request, until_failure"),
    ] = None,
    # -- Browser settings --
    launcher: Annotated[
        Optional[str],
        typer.Option("--launcher", help="Browser engine: chromium, firefox"),
    ] = None,
    wait_until: Annotated[
        Optional[str],
        typer.Option("--wait-until",
                     help="Page load event: networkidle, load, domcontentloaded"),
    ] = None,
    page_load_timeout: Annotated[
        Optional[int],
        typer.Option("--page-load-timeout", help="Page load timeout in seconds"),
    ] = None,
    ignore_cors: Annotated[
        Optional[bool],
        typer.Option("--ignore-cors", help="Disable CORS/CSP restrictions"),
    ] = None,
    close_cookie_modals: Annotated[
        Optional[bool],
        typer.Option("--close-cookie-modals", help="Auto-dismiss cookie banners"),
    ] = None,
    max_scroll_height: Annotated[
        Optional[int],
        typer.Option("--max-scroll-height", help="Max scroll height in pixels"),
    ] = None,
    ignore_ssl_errors: Annotated[
        Optional[bool],
        typer.Option("--ignore-ssl-errors", help="Skip SSL certificate verification"),
    ] = None,
    user_agent: Annotated[
        Optional[str],
        typer.Option("--user-agent", help="Custom User-Agent string"),
    ] = None,
    # -- Crawl filtering --
    globs: Annotated[
        Optional[str],
        typer.Option("--globs", help="Comma-separated glob patterns to include"),
    ] = None,
    excludes: Annotated[
        Optional[str],
        typer.Option("--excludes", help="Comma-separated glob patterns to exclude"),
    ] = None,
    link_selector: Annotated[
        Optional[str],
        typer.Option("--link-selector", help="CSS selector for links to follow"),
    ] = None,
    keep_url_fragments: Annotated[
        Optional[bool],
        typer.Option("--keep-url-fragments", help="Preserve URL fragments"),
    ] = None,
    respect_robots_txt: Annotated[
        Optional[bool],
        typer.Option("--respect-robots-txt", help="Honor robots.txt"),
    ] = None,
    # -- Cookies & headers --
    cookies: Annotated[
        Optional[str],
        typer.Option("--cookies", help="JSON array of cookie objects"),
    ] = None,
    headers: Annotated[
        Optional[str],
        typer.Option("--headers", help="JSON object of custom HTTP headers"),
    ] = None,
    # -- Concurrency & retries --
    max_concurrency: Annotated[
        Optional[int],
        typer.Option("--max-concurrency", help="Max parallel requests"),
    ] = None,
    max_retries: Annotated[
        Optional[int],
        typer.Option("--max-retries", help="Max request retries"),
    ] = None,
    max_results: Annotated[
        Optional[int],
        typer.Option("--max-results", help="Max results per crawl (0 = unlimited)"),
    ] = None,
    # -- Output toggles --
    save_markdown: Annotated[
        Optional[bool],
        typer.Option("--save-markdown/--no-save-markdown",
                     help="Save extracted markdown (default: true)"),
    ] = None,
    save_raw_html: Annotated[
        Optional[bool],
        typer.Option("--save-raw-html", help="Save raw HTML to output"),
    ] = None,
    save_text: Annotated[
        Optional[bool],
        typer.Option("--save-text", help="Save extracted text"),
    ] = None,
    save_json: Annotated[
        Optional[bool],
        typer.Option("--save-json", help="Save extracted JSON"),
    ] = None,
    save_jsonl: Annotated[
        Optional[bool],
        typer.Option("--save-jsonl", help="Save all pages as JSONL (single file)"),
    ] = None,
    save_xml: Annotated[
        Optional[bool],
        typer.Option("--save-xml", help="Save extracted XML"),
    ] = None,
    save_xml_tei: Annotated[
        Optional[bool],
        typer.Option("--save-xml-tei", help="Save extracted XML-TEI"),
    ] = None,
    # -- TrafilaturaConfig fields --
    precision: Annotated[
        Optional[bool],
        typer.Option("--precision", help="High precision mode (less noise)"),
    ] = None,
    recall: Annotated[
        Optional[bool],
        typer.Option("--recall", help="High recall mode (more content)"),
    ] = None,
    fast: Annotated[
        Optional[bool],
        typer.Option("--fast", help="Fast extraction mode (less thorough)"),
    ] = None,
    no_links: Annotated[
        Optional[bool],
        typer.Option("--no-links", help="Exclude links from output"),
    ] = None,
    no_comments: Annotated[
        Optional[bool],
        typer.Option("--no-comments", help="Exclude comments from output"),
    ] = None,
    include_tables: Annotated[
        Optional[bool],
        typer.Option("--include-tables/--no-tables", help="Include tables in output"),
    ] = None,
    include_images: Annotated[
        Optional[bool],
        typer.Option("--include-images", help="Include image descriptions"),
    ] = None,
    include_formatting: Annotated[
        Optional[bool],
        typer.Option("--include-formatting/--no-formatting",
                     help="Preserve text formatting"),
    ] = None,
    deduplicate: Annotated[
        Optional[bool],
        typer.Option("--deduplicate", help="Deduplicate extracted content"),
    ] = None,
    target_language: Annotated[
        Optional[str],
        typer.Option("--target-language", help="Filter by language (e.g. 'en')"),
    ] = None,
    with_metadata: Annotated[
        Optional[bool],
        typer.Option("--with-metadata/--no-metadata",
                     help="Extract metadata along with content"),
    ] = None,
    prune_xpath: Annotated[
        Optional[list[str]],
        typer.Option("--prune-xpath", help="XPath patterns to remove from content"),
    ] = None,
    # -- Diagnostics --
    verbose: Annotated[
        bool,
        typer.Option("--verbose", "-v", help="Enable verbose logging"),
    ] = False,
) -> None:
    """Extract content from web pages.

    Configuration precedence, lowest to highest: built-in CrawlConfig
    defaults, then a JSON config file passed via ``--config``, then explicit
    CLI options, then positional URLs (which replace any configured URLs).
    Every CLI option defaults to ``None`` so that "not passed" can be
    distinguished from an explicit value when merging over the file config.
    """
    # Local alias to avoid shadowing any module-level name; only used for
    # parsing the --cookies/--headers JSON strings.
    import json as json_mod

    # Set up logging
    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S",
    )

    # 1. Start with defaults
    cfg = CrawlConfig()

    # 2. If config file provided, load and merge
    if config is not None:
        file_config = CrawlConfig.from_file(config)
        # Replace with file-loaded config as base
        cfg = file_config

    # 3. Merge CLI args (CLI wins over file)
    # NOTE(review): unset options stay None here; CrawlConfig.merge() is
    # presumably expected to skip None values — confirm in config.py.
    cli_overrides: dict[str, object] = {
        "max_pages": max_pages,
        "crawl_depth": crawl_depth,
        "headless": headless,
        "output_dir": output_dir,
        # Proxy
        "proxy_urls": [u.strip() for u in proxy_urls.split(",")] if proxy_urls else None,
        "proxy_rotation": proxy_rotation,
        # Browser
        "launcher": launcher.lower() if launcher else None,
        "wait_until": wait_until.lower() if wait_until else None,
        "page_load_timeout": page_load_timeout,
        "ignore_cors": ignore_cors,
        "close_cookie_modals": close_cookie_modals,
        "max_scroll_height": max_scroll_height,
        "ignore_ssl_errors": ignore_ssl_errors,
        "user_agent": user_agent,
        # Crawl filtering
        "globs": [g.strip() for g in globs.split(",")] if globs else None,
        "excludes": [e.strip() for e in excludes.split(",")] if excludes else None,
        "link_selector": link_selector,
        "keep_url_fragments": keep_url_fragments,
        "respect_robots_txt": respect_robots_txt,
        # Cookies & headers (raises json.JSONDecodeError on malformed input)
        "cookies": json_mod.loads(cookies) if cookies else None,
        "headers": json_mod.loads(headers) if headers else None,
        # Concurrency & retries
        "max_concurrency": max_concurrency,
        "max_retries": max_retries,
        "max_results": max_results,
        # Output toggles
        "save_markdown": save_markdown,
        "save_raw_html": save_raw_html,
        "save_text": save_text,
        "save_json": save_json,
        "save_jsonl": save_jsonl,
        "save_xml": save_xml,
        "save_xml_tei": save_xml_tei,
        # Extraction settings (CLI --precision/--recall map to favor_* fields)
        "fast": fast,
        "favor_precision": precision,
        "favor_recall": recall,
        "include_tables": include_tables,
        "include_images": include_images,
        "include_formatting": include_formatting,
        "deduplicate": deduplicate,
        "target_language": target_language,
        "with_metadata": with_metadata,
        "prune_xpath": prune_xpath if prune_xpath else None,
    }

    # Handle --no-links and --no-comments (invert to include_*)
    if no_links:
        cli_overrides["include_links"] = False
    if no_comments:
        cli_overrides["include_comments"] = False

    cfg.merge(cli_overrides)

    # 4. URLs from positional args extend/override config urls
    if urls:
        cfg.urls = list(urls)

    # 5. Validate
    if not cfg.urls:
        typer.echo("Error: No URLs specified. Provide URLs as arguments or via --config.", err=True)
        raise typer.Exit(1)

    # Build list of active output formats for display
    active_formats = []
    if cfg.save_markdown:
        active_formats.append("markdown")
    if cfg.save_raw_html:
        active_formats.append("html")
    if cfg.save_text:
        active_formats.append("text")
    if cfg.save_json:
        active_formats.append("json")
    if cfg.save_jsonl:
        active_formats.append("jsonl")
    if cfg.save_xml:
        active_formats.append("xml")
    if cfg.save_xml_tei:
        active_formats.append("xml-tei")
    formats_str = ", ".join(active_formats) if active_formats else "markdown"

    typer.echo(f"Extracting {len(cfg.urls)} URL(s) → {cfg.output_dir}/ ({formats_str})")
    asyncio.run(run_crawl(cfg))
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Contextractor Engine - Trafilatura extraction wrapper with configurable options."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from .extractor import ContentExtractor
|
|
6
|
+
from .models import ExtractionResult, MetadataResult, TrafilaturaConfig
|
|
7
|
+
from .utils import normalize_config_keys
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_default_config() -> dict[str, Any]:
    """Get default TrafilaturaConfig as JSON dict (camelCase keys)."""
    # Thin convenience wrapper so API consumers don't need to import the class.
    return TrafilaturaConfig.get_default_json()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Public API of the engine package; keep in sync with the imports above.
__all__ = [
    "ContentExtractor",
    "TrafilaturaConfig",
    "ExtractionResult",
    "MetadataResult",
    "normalize_config_keys",
    "get_default_config",
]
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Content extraction wrapper using trafilatura."""
|
|
2
|
+
|
|
3
|
+
import trafilatura
|
|
4
|
+
|
|
5
|
+
from .models import ExtractionResult, MetadataResult, TrafilaturaConfig
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ContentExtractor:
    """Trafilatura wrapper with configurable extraction."""

    # Formats produced by extract_all_formats() when none are requested.
    DEFAULT_FORMATS = ["txt", "markdown", "json", "xml"]

    def __init__(self, config: TrafilaturaConfig | None = None) -> None:
        # Fall back to balanced defaults when no config is supplied.
        if config is None:
            config = TrafilaturaConfig.balanced()
        self.config = config

    def extract(
        self,
        html: str,
        url: str | None = None,
        output_format: str = "txt",
    ) -> ExtractionResult | None:
        """Extract content in specified format; None when trafilatura finds nothing."""
        extracted = trafilatura.extract(
            html,
            url=url,
            output_format=output_format,
            **self.config.to_trafilatura_kwargs(),
        )
        if extracted is None:
            return None
        return ExtractionResult(content=extracted, output_format=output_format)

    def extract_metadata(self, html: str, url: str | None = None) -> MetadataResult:
        """Extract metadata from HTML.

        Note: bare_extraction returns a Document object with attributes,
        not a dict. Use getattr() to access fields safely.
        """
        document = trafilatura.bare_extraction(html, url=url, with_metadata=True)
        if not document:
            # All MetadataResult fields default to None.
            return MetadataResult()
        wanted = ("title", "author", "date", "description", "sitename", "language")
        return MetadataResult(
            **{name: getattr(document, name, None) for name in wanted}
        )

    def extract_all_formats(
        self,
        html: str,
        url: str | None = None,
        formats: list[str] | None = None,
    ) -> dict[str, ExtractionResult]:
        """Extract content in multiple formats at once.

        Default formats: ["txt", "markdown", "json", "xml"]
        Returns dict keyed by format name. Failed extractions are omitted.
        """
        wanted = formats or self.DEFAULT_FORMATS
        pairs = ((fmt, self.extract(html, url=url, output_format=fmt)) for fmt in wanted)
        return {fmt: res for fmt, res in pairs if res is not None}
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""Data models for contextractor-engine."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field, fields
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from .utils import normalize_config_keys, to_camel_case
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class TrafilaturaConfig:
    """Configuration for trafilatura extraction.

    Maps all non-deprecated trafilatura.extract() parameters.
    Excluded (deprecated): no_fallback, as_dict, max_tree_size, settingsfile, config, options.
    Excluded (per-call): url, record_id, output_format.
    """

    fast: bool = False
    favor_precision: bool = False
    favor_recall: bool = False
    include_comments: bool = True
    include_tables: bool = True
    include_images: bool = False
    include_formatting: bool = True
    include_links: bool = True
    deduplicate: bool = False
    target_language: str | None = None
    with_metadata: bool = True
    only_with_metadata: bool = False
    tei_validation: bool = False
    prune_xpath: str | list[str] | None = None
    url_blacklist: set[str] | None = None
    author_blacklist: set[str] | None = None
    date_extraction_params: dict[str, Any] | None = None

    @classmethod
    def balanced(cls) -> "TrafilaturaConfig":
        """Default balanced extraction."""
        return cls()

    @classmethod
    def precision(cls) -> "TrafilaturaConfig":
        """High precision, less noise."""
        return cls(favor_precision=True)

    @classmethod
    def recall(cls) -> "TrafilaturaConfig":
        """High recall, more content."""
        return cls(favor_recall=True)

    def to_trafilatura_kwargs(self) -> dict[str, Any]:
        """Convert to trafilatura.extract() keyword arguments.

        Excludes url, record_id, output_format — those are per-call.
        Only includes optional params if they are set (not None); booleans
        are always passed (False is a meaningful value, not "unset").
        """
        return {
            f.name: getattr(self, f.name)
            for f in fields(self)
            if getattr(self, f.name) is not None
        }

    def to_json_dict(self) -> dict[str, Any]:
        """Convert config to JSON-serializable dict with camelCase keys.

        Used for API responses and GUI defaults.
        Excludes None values. Sets are converted to lists for JSON compatibility.
        """
        result: dict[str, Any] = {}
        for f in fields(self):
            value = getattr(self, f.name)
            if value is None:
                continue
            if isinstance(value, set):
                value = list(value)
            result[to_camel_case(f.name)] = value
        return result

    @classmethod
    def from_json_dict(cls, data: dict[str, Any] | None) -> "TrafilaturaConfig":
        """Create config from a camelCase (or snake_case) dict.

        This is the single canonical way to build a TrafilaturaConfig from
        external input (JSON, YAML, API). Handles key normalization, None
        filtering, and type coercion (lists → sets for blacklist fields).
        Unknown keys are ignored. Returns balanced defaults for empty/None input.
        """
        if not data:
            return cls.balanced()
        normalized = normalize_config_keys(data)
        valid_fields = {f.name for f in fields(cls)}
        # Hoisted once instead of a next(...) scan per list-valued key:
        # names of fields whose annotation mentions ``set`` (blacklists).
        set_fields = {f.name for f in fields(cls) if "set" in str(f.type)}
        kwargs: dict[str, Any] = {}
        for key, value in normalized.items():
            if key not in valid_fields or value is None:
                continue
            if isinstance(value, list) and key in set_fields:
                value = set(value)
            kwargs[key] = value
        return cls(**kwargs)

    @classmethod
    def get_default_json(cls) -> dict[str, Any]:
        """Get default config as JSON-serializable dict with camelCase keys."""
        return cls().to_json_dict()
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@dataclass
class ExtractionResult:
    """Result from a single format extraction."""

    content: str  # extracted content, already serialized in output_format
    output_format: str  # "txt", "json", "markdown", "xml", "xmltei"
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@dataclass
class MetadataResult:
    """Extracted metadata from HTML.

    All fields default to None — absent metadata, or a page trafilatura
    could not process at all, yields an all-None result.
    """

    title: str | None = None
    author: str | None = None
    date: str | None = None  # publication date as reported by trafilatura
    description: str | None = None
    sitename: str | None = None
    language: str | None = None
|
|
File without changes
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Utility functions for contextractor-engine."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def to_snake_case(key: str) -> str:
    """Convert camelCase to snake_case; snake_case input is returned as-is."""
    # Any underscore means the key is already snake_case — pass through.
    if "_" in key:
        return key
    # Insert "_" before each interior ASCII uppercase, then lowercase all.
    with_separators = re.sub(r"(?<!^)(?=[A-Z])", "_", key)
    return with_separators.lower()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def to_camel_case(key: str) -> str:
    """Convert snake_case to camelCase; camelCase input is returned as-is."""
    head, _, tail = key.partition("_")
    if not tail:
        # No underscore: the key is already camelCase (or a single word).
        return head
    return head + "".join(segment.capitalize() for segment in tail.split("_"))
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def normalize_config_keys(config: dict[str, Any]) -> dict[str, Any]:
    """Normalize config dictionary keys to snake_case.

    Accepts both camelCase (JSON/API convention) and snake_case (Python convention).
    Auto-detects the format and converts camelCase to snake_case.
    Keys already in snake_case are left unchanged.

    Examples:
        {"favorPrecision": True} -> {"favor_precision": True}
        {"favor_precision": True} -> {"favor_precision": True}
        {"includeLinks": True, "fast": False} -> {"include_links": True, "fast": False}
    """
    # Empty dict and None both normalize to an empty mapping.
    if not config:
        return {}
    normalized: dict[str, Any] = {}
    for key, value in config.items():
        normalized[to_snake_case(key)] = value
    return normalized
|