contextractor 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -8
- package/index.js +3 -1
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -22,7 +22,7 @@ Works with zero config. Pass URLs directly, or use a config file for complex set
|
|
|
22
22
|
|
|
23
23
|
```bash
|
|
24
24
|
contextractor https://example.com --precision --format json -o ./results
|
|
25
|
-
contextractor --config config.yaml --max-pages 10
|
|
25
|
+
contextractor --config config.json --max-pages 10
|
|
26
26
|
```
|
|
27
27
|
|
|
28
28
|
### CLI Options
|
|
@@ -31,9 +31,9 @@ contextractor --config config.yaml --max-pages 10
|
|
|
31
31
|
contextractor [OPTIONS] [URLS...]
|
|
32
32
|
|
|
33
33
|
Crawl Settings:
|
|
34
|
-
--config, -c Path to
|
|
34
|
+
--config, -c Path to JSON config file
|
|
35
35
|
--output-dir, -o Output directory
|
|
36
|
-
--format, -f Output format (txt, markdown, json, xml, xmltei)
|
|
36
|
+
--format, -f Output format (txt, markdown, json, jsonl, xml, xmltei)
|
|
37
37
|
--max-pages Max pages to crawl (0 = unlimited)
|
|
38
38
|
--crawl-depth Max link depth from start URLs (0 = start only)
|
|
39
39
|
--headless/--no-headless Browser headless mode (default: headless)
|
|
@@ -47,12 +47,13 @@ Proxy:
|
|
|
47
47
|
|
|
48
48
|
Browser:
|
|
49
49
|
--launcher Browser engine: chromium, firefox (default: chromium)
|
|
50
|
-
--wait-until Page load event:
|
|
50
|
+
--wait-until Page load event: load, networkidle, domcontentloaded (default: load)
|
|
51
51
|
--page-load-timeout Timeout in seconds (default: 60)
|
|
52
52
|
--ignore-cors Disable CORS/CSP restrictions
|
|
53
53
|
--close-cookie-modals Auto-dismiss cookie banners
|
|
54
54
|
--max-scroll-height Max scroll height in pixels (default: 5000)
|
|
55
55
|
--ignore-ssl-errors Skip SSL certificate verification
|
|
56
|
+
--user-agent Custom User-Agent string
|
|
56
57
|
|
|
57
58
|
Crawl Filtering:
|
|
58
59
|
--globs Comma-separated glob patterns to include
|
|
@@ -94,7 +95,7 @@ CLI flags override config file settings. Merge order: `defaults → config file
|
|
|
94
95
|
|
|
95
96
|
### Config File (optional)
|
|
96
97
|
|
|
97
|
-
|
|
98
|
+
Use a JSON config file to set options:
|
|
98
99
|
|
|
99
100
|
```json
|
|
100
101
|
{
|
|
@@ -121,7 +122,7 @@ Supports both JSON and YAML format. JSON examples shown below:
|
|
|
121
122
|
|-------|------|---------|-------------|
|
|
122
123
|
| `urls` | array | `[]` | URLs to extract content from |
|
|
123
124
|
| `maxPages` | int | 0 | Max pages to crawl (0 = unlimited) |
|
|
124
|
-
| `outputFormat` | string | `"markdown"` | `txt`, `markdown`, `json`, `xml`, `xmltei` |
|
|
125
|
+
| `outputFormat` | string | `"markdown"` | `txt`, `markdown`, `json`, `jsonl`, `xml`, `xmltei` |
|
|
125
126
|
| `outputDir` | string | `"./output"` | Directory for extracted content |
|
|
126
127
|
| `crawlDepth` | int | 0 | How deep to follow links (0 = start URLs only) |
|
|
127
128
|
| `headless` | bool | true | Browser headless mode |
|
|
@@ -135,18 +136,20 @@ Supports both JSON and YAML format. JSON examples shown below:
|
|
|
135
136
|
|-------|------|---------|-------------|
|
|
136
137
|
| `proxy.urls` | array | `[]` | Proxy URLs (`http://user:pass@host:port` or `socks5://host:port`) |
|
|
137
138
|
| `proxy.rotation` | string | `"recommended"` | `recommended`, `per_request`, `until_failure` |
|
|
139
|
+
| `proxy.tiered` | array | `[]` | Tiered proxy escalation (config-file only) |
|
|
138
140
|
|
|
139
141
|
### Browser Settings
|
|
140
142
|
|
|
141
143
|
| Field | Type | Default | Description |
|
|
142
144
|
|-------|------|---------|-------------|
|
|
143
145
|
| `launcher` | string | `"chromium"` | Browser engine: `chromium`, `firefox` |
|
|
144
|
-
| `waitUntil` | string | `"
|
|
146
|
+
| `waitUntil` | string | `"load"` | Page load event: `load`, `networkidle`, `domcontentloaded` |
|
|
145
147
|
| `pageLoadTimeout` | int | 60 | Page load timeout in seconds |
|
|
146
148
|
| `ignoreCors` | bool | false | Disable CORS/CSP restrictions |
|
|
147
149
|
| `closeCookieModals` | bool | false | Auto-dismiss cookie consent banners |
|
|
148
150
|
| `maxScrollHeight` | int | 5000 | Max scroll height in pixels (0 = disable) |
|
|
149
151
|
| `ignoreSslErrors` | bool | false | Skip SSL certificate verification |
|
|
152
|
+
| `userAgent` | string | `""` | Custom User-Agent string |
|
|
150
153
|
|
|
151
154
|
### Crawl Filtering
|
|
152
155
|
|
|
@@ -194,6 +197,7 @@ All options go under the `extraction` key in config files, or use the equivalent
|
|
|
194
197
|
| `withMetadata` | bool | true | Extract metadata (title, author, date) |
|
|
195
198
|
| `targetLanguage` | string | null | Filter by language (e.g. `"en"`) |
|
|
196
199
|
| `fast` | bool | false | Fast mode (less thorough) |
|
|
200
|
+
| `pruneXpath` | array | null | XPath patterns to remove from content |
|
|
197
201
|
|
|
198
202
|
## Docker
|
|
199
203
|
|
|
@@ -210,7 +214,7 @@ docker run -v ./output:/output ghcr.io/contextractor/contextractor https://examp
|
|
|
210
214
|
Use a config file:
|
|
211
215
|
|
|
212
216
|
```bash
|
|
213
|
-
docker run -v ./config.
|
|
217
|
+
docker run -v ./config.json:/config.json ghcr.io/contextractor/contextractor --config /config.json
|
|
214
218
|
```
|
|
215
219
|
|
|
216
220
|
All CLI flags work the same inside Docker.
|
package/index.js
CHANGED
|
@@ -51,7 +51,7 @@ function getBinaryPath() {
|
|
|
51
51
|
* @param {boolean} [options.noLinks] - Exclude links
|
|
52
52
|
* @param {boolean} [options.noComments] - Exclude comments
|
|
53
53
|
* @param {string} [options.outputDir] - Output directory
|
|
54
|
-
* @param {string} [options.format] - Output format (txt, markdown, json, xml, xmltei)
|
|
54
|
+
* @param {string} [options.format] - Output format (txt, markdown, json, jsonl, xml, xmltei)
|
|
55
55
|
* @param {number} [options.maxPages] - Max pages to crawl
|
|
56
56
|
* @param {number} [options.crawlDepth] - Max crawl depth
|
|
57
57
|
* @param {boolean} [options.headless] - Run headless (default true)
|
|
@@ -71,6 +71,7 @@ function getBinaryPath() {
|
|
|
71
71
|
* @param {boolean} [options.closeCookieModals] - Auto-dismiss cookie banners
|
|
72
72
|
* @param {number} [options.maxScrollHeight] - Max scroll height in pixels
|
|
73
73
|
* @param {boolean} [options.ignoreSslErrors] - Skip SSL verification
|
|
74
|
+
* @param {string} [options.userAgent] - Custom User-Agent string
|
|
74
75
|
* @param {string|string[]} [options.globs] - Glob patterns to include
|
|
75
76
|
* @param {string|string[]} [options.excludes] - Glob patterns to exclude
|
|
76
77
|
* @param {string} [options.linkSelector] - CSS selector for links
|
|
@@ -133,6 +134,7 @@ function extract(urls, options = {}) {
|
|
|
133
134
|
if (options.closeCookieModals) args.push("--close-cookie-modals");
|
|
134
135
|
if (options.maxScrollHeight != null) args.push("--max-scroll-height", String(options.maxScrollHeight));
|
|
135
136
|
if (options.ignoreSslErrors) args.push("--ignore-ssl-errors");
|
|
137
|
+
if (options.userAgent) args.push("--user-agent", options.userAgent);
|
|
136
138
|
|
|
137
139
|
// Crawl filtering
|
|
138
140
|
if (options.globs) {
|
package/package.json
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "contextractor",
|
|
3
|
-
"version": "0.3.0",
|
|
3
|
+
"version": "0.3.2",
|
|
4
4
|
"description": "Extract web content from URLs with configurable extraction options",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"repository": {
|
|
7
7
|
"type": "git",
|
|
8
8
|
"url": "https://github.com/contextractor/contextractor.git"
|
|
9
9
|
},
|
|
10
|
-
"homepage": "https://
|
|
10
|
+
"homepage": "https://www.contextractor.com/",
|
|
11
11
|
"bugs": {
|
|
12
12
|
"url": "https://github.com/contextractor/contextractor/issues"
|
|
13
13
|
},
|