contextractor 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +112 -16
- package/index.js +69 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -30,13 +30,49 @@ contextractor --config config.yaml --max-pages 10
|
|
|
30
30
|
```
|
|
31
31
|
contextractor [OPTIONS] [URLS...]
|
|
32
32
|
|
|
33
|
-
|
|
33
|
+
Crawl Settings:
|
|
34
34
|
--config, -c Path to YAML or JSON config file
|
|
35
35
|
--output-dir, -o Output directory
|
|
36
36
|
--format, -f Output format (txt, markdown, json, xml, xmltei)
|
|
37
37
|
--max-pages Max pages to crawl (0 = unlimited)
|
|
38
38
|
--crawl-depth Max link depth from start URLs (0 = start only)
|
|
39
39
|
--headless/--no-headless Browser headless mode (default: headless)
|
|
40
|
+
--max-concurrency Max parallel requests (default: 50)
|
|
41
|
+
--max-retries Max request retries (default: 3)
|
|
42
|
+
--max-results Max results per crawl (0 = unlimited)
|
|
43
|
+
|
|
44
|
+
Proxy:
|
|
45
|
+
--proxy-urls Comma-separated proxy URLs (http://user:pass@host:port)
|
|
46
|
+
--proxy-rotation Rotation: recommended, per_request, until_failure
|
|
47
|
+
|
|
48
|
+
Browser:
|
|
49
|
+
--launcher Browser engine: chromium, firefox (default: chromium)
|
|
50
|
+
--wait-until Page load event: networkidle, load, domcontentloaded
|
|
51
|
+
--page-load-timeout Timeout in seconds (default: 60)
|
|
52
|
+
--ignore-cors Disable CORS/CSP restrictions
|
|
53
|
+
--close-cookie-modals Auto-dismiss cookie banners
|
|
54
|
+
--max-scroll-height Max scroll height in pixels (default: 5000)
|
|
55
|
+
--ignore-ssl-errors Skip SSL certificate verification
|
|
56
|
+
|
|
57
|
+
Crawl Filtering:
|
|
58
|
+
--globs Comma-separated glob patterns to include
|
|
59
|
+
--excludes Comma-separated glob patterns to exclude
|
|
60
|
+
--link-selector CSS selector for links to follow
|
|
61
|
+
--keep-url-fragments Preserve URL fragments
|
|
62
|
+
--respect-robots-txt Honor robots.txt
|
|
63
|
+
|
|
64
|
+
Cookies & Headers:
|
|
65
|
+
--cookies JSON array of cookie objects
|
|
66
|
+
--headers JSON object of custom HTTP headers
|
|
67
|
+
|
|
68
|
+
Output Toggles:
|
|
69
|
+
--save-raw-html Save raw HTML to output
|
|
70
|
+
--save-text Save extracted text
|
|
71
|
+
--save-json Save extracted JSON
|
|
72
|
+
--save-xml Save extracted XML
|
|
73
|
+
--save-xml-tei Save extracted XML-TEI
|
|
74
|
+
|
|
75
|
+
Content Extraction:
|
|
40
76
|
--precision High precision mode (less noise)
|
|
41
77
|
--recall High recall mode (more content)
|
|
42
78
|
--fast Fast extraction mode (less thorough)
|
|
@@ -49,6 +85,8 @@ Options:
|
|
|
49
85
|
--target-language Filter by language (e.g. "en")
|
|
50
86
|
--with-metadata/--no-metadata Extract metadata (default: with)
|
|
51
87
|
--prune-xpath XPath patterns to remove from content
|
|
88
|
+
|
|
89
|
+
Diagnostics:
|
|
52
90
|
--verbose, -v Enable verbose logging
|
|
53
91
|
```
|
|
54
92
|
|
|
@@ -56,21 +94,29 @@ CLI flags override config file settings. Merge order: `defaults → config file
|
|
|
56
94
|
|
|
57
95
|
### Config File (optional)
|
|
58
96
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
97
|
+
Both JSON and YAML formats are supported. A JSON example is shown below:
|
|
98
|
+
|
|
99
|
+
```json
|
|
100
|
+
{
|
|
101
|
+
"urls": ["https://example.com", "https://docs.example.com"],
|
|
102
|
+
"outputFormat": "markdown",
|
|
103
|
+
"outputDir": "./output",
|
|
104
|
+
"crawlDepth": 1,
|
|
105
|
+
"proxy": {
|
|
106
|
+
"urls": ["http://user:pass@host:port"],
|
|
107
|
+
"rotation": "recommended"
|
|
108
|
+
},
|
|
109
|
+
"extraction": {
|
|
110
|
+
"favorPrecision": true,
|
|
111
|
+
"includeLinks": true,
|
|
112
|
+
"includeTables": true,
|
|
113
|
+
"deduplicate": true
|
|
114
|
+
}
|
|
115
|
+
}
|
|
72
116
|
```
|
|
73
117
|
|
|
118
|
+
### Crawl Settings
|
|
119
|
+
|
|
74
120
|
| Field | Type | Default | Description |
|
|
75
121
|
|-------|------|---------|-------------|
|
|
76
122
|
| `urls` | array | `[]` | URLs to extract content from |
|
|
@@ -79,9 +125,59 @@ extraction:
|
|
|
79
125
|
| `outputDir` | string | `"./output"` | Directory for extracted content |
|
|
80
126
|
| `crawlDepth` | int | 0 | How deep to follow links (0 = start URLs only) |
|
|
81
127
|
| `headless` | bool | true | Browser headless mode |
|
|
82
|
-
| `
|
|
128
|
+
| `maxConcurrency` | int | 50 | Max parallel browser pages |
|
|
129
|
+
| `maxRetries` | int | 3 | Max retries for failed requests |
|
|
130
|
+
| `maxResults` | int | 0 | Max results per crawl (0 = unlimited) |
|
|
131
|
+
|
|
132
|
+
### Proxy Configuration
|
|
133
|
+
|
|
134
|
+
| Field | Type | Default | Description |
|
|
135
|
+
|-------|------|---------|-------------|
|
|
136
|
+
| `proxy.urls` | array | `[]` | Proxy URLs (`http://user:pass@host:port` or `socks5://host:port`) |
|
|
137
|
+
| `proxy.rotation` | string | `"recommended"` | `recommended`, `per_request`, `until_failure` |
|
|
138
|
+
|
|
139
|
+
### Browser Settings
|
|
140
|
+
|
|
141
|
+
| Field | Type | Default | Description |
|
|
142
|
+
|-------|------|---------|-------------|
|
|
143
|
+
| `launcher` | string | `"chromium"` | Browser engine: `chromium`, `firefox` |
|
|
144
|
+
| `waitUntil` | string | `"networkidle"` | Page load event: `networkidle`, `load`, `domcontentloaded` |
|
|
145
|
+
| `pageLoadTimeout` | int | 60 | Page load timeout in seconds |
|
|
146
|
+
| `ignoreCors` | bool | false | Disable CORS/CSP restrictions |
|
|
147
|
+
| `closeCookieModals` | bool | false | Auto-dismiss cookie consent banners |
|
|
148
|
+
| `maxScrollHeight` | int | 5000 | Max scroll height in pixels (0 = disable) |
|
|
149
|
+
| `ignoreSslErrors` | bool | false | Skip SSL certificate verification |
|
|
150
|
+
|
|
151
|
+
### Crawl Filtering
|
|
152
|
+
|
|
153
|
+
| Field | Type | Default | Description |
|
|
154
|
+
|-------|------|---------|-------------|
|
|
155
|
+
| `globs` | array | `[]` | Glob patterns for URLs to include |
|
|
156
|
+
| `excludes` | array | `[]` | Glob patterns for URLs to exclude |
|
|
157
|
+
| `linkSelector` | string | `""` | CSS selector for links to follow |
|
|
158
|
+
| `keepUrlFragments` | bool | false | Treat URLs with different fragments as different pages |
|
|
159
|
+
| `respectRobotsTxt` | bool | false | Honor robots.txt |
|
|
160
|
+
|
|
161
|
+
### Cookies & Headers
|
|
162
|
+
|
|
163
|
+
| Field | Type | Default | Description |
|
|
164
|
+
|-------|------|---------|-------------|
|
|
165
|
+
| `cookies` | array | `[]` | Initial cookies (`[{"name": "...", "value": "...", "domain": "..."}]`) |
|
|
166
|
+
| `headers` | object | `{}` | Custom HTTP headers (`{"Authorization": "Bearer token"}`) |
|
|
167
|
+
|
|
168
|
+
### Output Toggles
|
|
169
|
+
|
|
170
|
+
Save additional formats alongside the primary output:
|
|
171
|
+
|
|
172
|
+
| Field | Type | Default | Description |
|
|
173
|
+
|-------|------|---------|-------------|
|
|
174
|
+
| `saveRawHtml` | bool | false | Save raw HTML |
|
|
175
|
+
| `saveText` | bool | false | Save extracted plain text |
|
|
176
|
+
| `saveJson` | bool | false | Save extracted JSON |
|
|
177
|
+
| `saveXml` | bool | false | Save extracted XML |
|
|
178
|
+
| `saveXmlTei` | bool | false | Save extracted XML-TEI |
|
|
83
179
|
|
|
84
|
-
### Extraction
|
|
180
|
+
### Content Extraction
|
|
85
181
|
|
|
86
182
|
In config files, all of these options go under the `extraction` key; on the command line, use the equivalent CLI flags:
|
|
87
183
|
|
package/index.js
CHANGED
|
@@ -62,6 +62,30 @@ function getBinaryPath() {
|
|
|
62
62
|
* @param {string} [options.targetLanguage] - Filter by language
|
|
63
63
|
* @param {boolean} [options.withMetadata] - Extract metadata
|
|
64
64
|
* @param {string|string[]} [options.pruneXpath] - XPath patterns to prune
|
|
65
|
+
* @param {string|string[]} [options.proxyUrls] - Proxy URLs
|
|
66
|
+
* @param {string} [options.proxyRotation] - Proxy rotation strategy
|
|
67
|
+
* @param {string} [options.launcher] - Browser engine (chromium, firefox)
|
|
68
|
+
* @param {string} [options.waitUntil] - Page load event (networkidle, load, domcontentloaded)
|
|
69
|
+
* @param {number} [options.pageLoadTimeout] - Page load timeout in seconds
|
|
70
|
+
* @param {boolean} [options.ignoreCors] - Disable CORS/CSP
|
|
71
|
+
* @param {boolean} [options.closeCookieModals] - Auto-dismiss cookie banners
|
|
72
|
+
* @param {number} [options.maxScrollHeight] - Max scroll height in pixels
|
|
73
|
+
* @param {boolean} [options.ignoreSslErrors] - Skip SSL verification
|
|
74
|
+
* @param {string|string[]} [options.globs] - Glob patterns to include
|
|
75
|
+
* @param {string|string[]} [options.excludes] - Glob patterns to exclude
|
|
76
|
+
* @param {string} [options.linkSelector] - CSS selector for links
|
|
77
|
+
* @param {boolean} [options.keepUrlFragments] - Preserve URL fragments
|
|
78
|
+
* @param {boolean} [options.respectRobotsTxt] - Honor robots.txt
|
|
79
|
+
* @param {object[]} [options.cookies] - Initial cookies array
|
|
80
|
+
* @param {object} [options.headers] - Custom HTTP headers
|
|
81
|
+
* @param {number} [options.maxConcurrency] - Max parallel requests
|
|
82
|
+
* @param {number} [options.maxRetries] - Max request retries
|
|
83
|
+
* @param {number} [options.maxResults] - Max results (0 = unlimited)
|
|
84
|
+
* @param {boolean} [options.saveRawHtml] - Save raw HTML
|
|
85
|
+
* @param {boolean} [options.saveText] - Save extracted text
|
|
86
|
+
* @param {boolean} [options.saveJson] - Save extracted JSON
|
|
87
|
+
* @param {boolean} [options.saveXml] - Save extracted XML
|
|
88
|
+
* @param {boolean} [options.saveXmlTei] - Save extracted XML-TEI
|
|
65
89
|
* @param {boolean} [options.verbose] - Verbose logging
|
|
66
90
|
* @param {string} [options.stdio] - stdio option for child process
|
|
67
91
|
* @returns {Promise<void>}
|
|
@@ -94,6 +118,51 @@ function extract(urls, options = {}) {
|
|
|
94
118
|
if (options.outputDir) args.push("--output-dir", options.outputDir);
|
|
95
119
|
if (options.format) args.push("--format", options.format);
|
|
96
120
|
|
|
121
|
+
// Proxy
|
|
122
|
+
if (options.proxyUrls) {
|
|
123
|
+
const proxyList = Array.isArray(options.proxyUrls) ? options.proxyUrls : [options.proxyUrls];
|
|
124
|
+
args.push("--proxy-urls", proxyList.join(","));
|
|
125
|
+
}
|
|
126
|
+
if (options.proxyRotation) args.push("--proxy-rotation", options.proxyRotation);
|
|
127
|
+
|
|
128
|
+
// Browser settings
|
|
129
|
+
if (options.launcher) args.push("--launcher", options.launcher);
|
|
130
|
+
if (options.waitUntil) args.push("--wait-until", options.waitUntil);
|
|
131
|
+
if (options.pageLoadTimeout != null) args.push("--page-load-timeout", String(options.pageLoadTimeout));
|
|
132
|
+
if (options.ignoreCors) args.push("--ignore-cors");
|
|
133
|
+
if (options.closeCookieModals) args.push("--close-cookie-modals");
|
|
134
|
+
if (options.maxScrollHeight != null) args.push("--max-scroll-height", String(options.maxScrollHeight));
|
|
135
|
+
if (options.ignoreSslErrors) args.push("--ignore-ssl-errors");
|
|
136
|
+
|
|
137
|
+
// Crawl filtering
|
|
138
|
+
if (options.globs) {
|
|
139
|
+
const globList = Array.isArray(options.globs) ? options.globs : [options.globs];
|
|
140
|
+
args.push("--globs", globList.join(","));
|
|
141
|
+
}
|
|
142
|
+
if (options.excludes) {
|
|
143
|
+
const excludeList = Array.isArray(options.excludes) ? options.excludes : [options.excludes];
|
|
144
|
+
args.push("--excludes", excludeList.join(","));
|
|
145
|
+
}
|
|
146
|
+
if (options.linkSelector) args.push("--link-selector", options.linkSelector);
|
|
147
|
+
if (options.keepUrlFragments) args.push("--keep-url-fragments");
|
|
148
|
+
if (options.respectRobotsTxt) args.push("--respect-robots-txt");
|
|
149
|
+
|
|
150
|
+
// Cookies & headers
|
|
151
|
+
if (options.cookies) args.push("--cookies", JSON.stringify(options.cookies));
|
|
152
|
+
if (options.headers) args.push("--headers", JSON.stringify(options.headers));
|
|
153
|
+
|
|
154
|
+
// Concurrency & retries
|
|
155
|
+
if (options.maxConcurrency != null) args.push("--max-concurrency", String(options.maxConcurrency));
|
|
156
|
+
if (options.maxRetries != null) args.push("--max-retries", String(options.maxRetries));
|
|
157
|
+
if (options.maxResults != null) args.push("--max-results", String(options.maxResults));
|
|
158
|
+
|
|
159
|
+
// Output toggles
|
|
160
|
+
if (options.saveRawHtml) args.push("--save-raw-html");
|
|
161
|
+
if (options.saveText) args.push("--save-text");
|
|
162
|
+
if (options.saveJson) args.push("--save-json");
|
|
163
|
+
if (options.saveXml) args.push("--save-xml");
|
|
164
|
+
if (options.saveXmlTei) args.push("--save-xml-tei");
|
|
165
|
+
|
|
97
166
|
// Extraction options
|
|
98
167
|
if (options.precision) args.push("--precision");
|
|
99
168
|
if (options.recall) args.push("--recall");
|