contextractor 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +112 -16
  2. package/index.js +69 -0
  3. package/package.json +1 -1
package/README.md CHANGED
@@ -30,13 +30,49 @@ contextractor --config config.yaml --max-pages 10
30
30
  ```
31
31
  contextractor [OPTIONS] [URLS...]
32
32
 
33
- Options:
33
+ Crawl Settings:
34
34
  --config, -c Path to YAML or JSON config file
35
35
  --output-dir, -o Output directory
36
36
  --format, -f Output format (txt, markdown, json, xml, xmltei)
37
37
  --max-pages Max pages to crawl (0 = unlimited)
38
38
  --crawl-depth Max link depth from start URLs (0 = start only)
39
39
  --headless/--no-headless Browser headless mode (default: headless)
40
+ --max-concurrency Max parallel requests (default: 50)
41
+ --max-retries Max request retries (default: 3)
42
+ --max-results Max results per crawl (0 = unlimited)
43
+
44
+ Proxy:
45
+ --proxy-urls Comma-separated proxy URLs (http://user:pass@host:port)
46
+ --proxy-rotation Rotation: recommended, per_request, until_failure
47
+
48
+ Browser:
49
+ --launcher Browser engine: chromium, firefox (default: chromium)
50
+ --wait-until Page load event: networkidle, load, domcontentloaded
51
+ --page-load-timeout Timeout in seconds (default: 60)
52
+ --ignore-cors Disable CORS/CSP restrictions
53
+ --close-cookie-modals Auto-dismiss cookie banners
54
+ --max-scroll-height Max scroll height in pixels (default: 5000)
55
+ --ignore-ssl-errors Skip SSL certificate verification
56
+
57
+ Crawl Filtering:
58
+ --globs Comma-separated glob patterns to include
59
+ --excludes Comma-separated glob patterns to exclude
60
+ --link-selector CSS selector for links to follow
61
+ --keep-url-fragments Preserve URL fragments
62
+ --respect-robots-txt Honor robots.txt
63
+
64
+ Cookies & Headers:
65
+ --cookies JSON array of cookie objects
66
+ --headers JSON object of custom HTTP headers
67
+
68
+ Output Toggles:
69
+ --save-raw-html Save raw HTML to output
70
+ --save-text Save extracted text
71
+ --save-json Save extracted JSON
72
+ --save-xml Save extracted XML
73
+ --save-xml-tei Save extracted XML-TEI
74
+
75
+ Content Extraction:
40
76
  --precision High precision mode (less noise)
41
77
  --recall High recall mode (more content)
42
78
  --fast Fast extraction mode (less thorough)
@@ -49,6 +85,8 @@ Options:
49
85
  --target-language Filter by language (e.g. "en")
50
86
  --with-metadata/--no-metadata Extract metadata (default: with)
51
87
  --prune-xpath XPath patterns to remove from content
88
+
89
+ Diagnostics:
52
90
  --verbose, -v Enable verbose logging
53
91
  ```
54
92
 
@@ -56,21 +94,29 @@ CLI flags override config file settings. Merge order: `defaults → config file
56
94
 
57
95
  ### Config File (optional)
58
96
 
59
- ```yaml
60
- urls:
61
- - https://example.com
62
- - https://docs.example.com
63
- outputFormat: markdown
64
- outputDir: ./output
65
- crawlDepth: 1
66
-
67
- extraction:
68
- favorPrecision: true
69
- includeLinks: true
70
- includeTables: true
71
- deduplicate: true
97
+ Supports both JSON and YAML formats. A JSON example is shown below:
98
+
99
+ ```json
100
+ {
101
+ "urls": ["https://example.com", "https://docs.example.com"],
102
+ "outputFormat": "markdown",
103
+ "outputDir": "./output",
104
+ "crawlDepth": 1,
105
+ "proxy": {
106
+ "urls": ["http://user:pass@host:port"],
107
+ "rotation": "recommended"
108
+ },
109
+ "extraction": {
110
+ "favorPrecision": true,
111
+ "includeLinks": true,
112
+ "includeTables": true,
113
+ "deduplicate": true
114
+ }
115
+ }
72
116
  ```
73
117
 
118
+ ### Crawl Settings
119
+
74
120
  | Field | Type | Default | Description |
75
121
  |-------|------|---------|-------------|
76
122
  | `urls` | array | `[]` | URLs to extract content from |
@@ -79,9 +125,59 @@ extraction:
79
125
  | `outputDir` | string | `"./output"` | Directory for extracted content |
80
126
  | `crawlDepth` | int | 0 | How deep to follow links (0 = start URLs only) |
81
127
  | `headless` | bool | true | Browser headless mode |
82
- | `extraction` | object | `{}` | Trafilatura extraction options (see below) |
128
+ | `maxConcurrency` | int | 50 | Max parallel browser pages |
129
+ | `maxRetries` | int | 3 | Max retries for failed requests |
130
+ | `maxResults` | int | 0 | Max results per crawl (0 = unlimited) |
131
+
132
+ ### Proxy Configuration
133
+
134
+ | Field | Type | Default | Description |
135
+ |-------|------|---------|-------------|
136
+ | `proxy.urls` | array | `[]` | Proxy URLs (`http://user:pass@host:port` or `socks5://host:port`) |
137
+ | `proxy.rotation` | string | `"recommended"` | `recommended`, `per_request`, `until_failure` |
138
+
139
+ ### Browser Settings
140
+
141
+ | Field | Type | Default | Description |
142
+ |-------|------|---------|-------------|
143
+ | `launcher` | string | `"chromium"` | Browser engine: `chromium`, `firefox` |
144
+ | `waitUntil` | string | `"networkidle"` | Page load event: `networkidle`, `load`, `domcontentloaded` |
145
+ | `pageLoadTimeout` | int | 60 | Page load timeout in seconds |
146
+ | `ignoreCors` | bool | false | Disable CORS/CSP restrictions |
147
+ | `closeCookieModals` | bool | false | Auto-dismiss cookie consent banners |
148
+ | `maxScrollHeight` | int | 5000 | Max scroll height in pixels (0 = disable) |
149
+ | `ignoreSslErrors` | bool | false | Skip SSL certificate verification |
150
+
151
+ ### Crawl Filtering
152
+
153
+ | Field | Type | Default | Description |
154
+ |-------|------|---------|-------------|
155
+ | `globs` | array | `[]` | Glob patterns for URLs to include |
156
+ | `excludes` | array | `[]` | Glob patterns for URLs to exclude |
157
+ | `linkSelector` | string | `""` | CSS selector for links to follow |
158
+ | `keepUrlFragments` | bool | false | Treat URLs with different fragments as different pages |
159
+ | `respectRobotsTxt` | bool | false | Honor robots.txt |
160
+
161
+ ### Cookies & Headers
162
+
163
+ | Field | Type | Default | Description |
164
+ |-------|------|---------|-------------|
165
+ | `cookies` | array | `[]` | Initial cookies (`[{"name": "...", "value": "...", "domain": "..."}]`) |
166
+ | `headers` | object | `{}` | Custom HTTP headers (`{"Authorization": "Bearer token"}`) |
167
+
168
+ ### Output Toggles
169
+
170
+ Save additional formats alongside the primary output:
171
+
172
+ | Field | Type | Default | Description |
173
+ |-------|------|---------|-------------|
174
+ | `saveRawHtml` | bool | false | Save raw HTML |
175
+ | `saveText` | bool | false | Save extracted plain text |
176
+ | `saveJson` | bool | false | Save extracted JSON |
177
+ | `saveXml` | bool | false | Save extracted XML |
178
+ | `saveXmlTei` | bool | false | Save extracted XML-TEI |
83
179
 
84
- ### Extraction Options
180
+ ### Content Extraction
85
181
 
86
182
  All options go under the `extraction` key in config files, or use the equivalent CLI flags:
87
183
 
package/index.js CHANGED
@@ -62,6 +62,30 @@ function getBinaryPath() {
62
62
  * @param {string} [options.targetLanguage] - Filter by language
63
63
  * @param {boolean} [options.withMetadata] - Extract metadata
64
64
  * @param {string|string[]} [options.pruneXpath] - XPath patterns to prune
65
+ * @param {string|string[]} [options.proxyUrls] - Proxy URLs
66
+ * @param {string} [options.proxyRotation] - Proxy rotation strategy
67
+ * @param {string} [options.launcher] - Browser engine (chromium, firefox)
68
+ * @param {string} [options.waitUntil] - Page load event (networkidle, load, domcontentloaded)
69
+ * @param {number} [options.pageLoadTimeout] - Page load timeout in seconds
70
+ * @param {boolean} [options.ignoreCors] - Disable CORS/CSP
71
+ * @param {boolean} [options.closeCookieModals] - Auto-dismiss cookie banners
72
+ * @param {number} [options.maxScrollHeight] - Max scroll height in pixels
73
+ * @param {boolean} [options.ignoreSslErrors] - Skip SSL verification
74
+ * @param {string|string[]} [options.globs] - Glob patterns to include
75
+ * @param {string|string[]} [options.excludes] - Glob patterns to exclude
76
+ * @param {string} [options.linkSelector] - CSS selector for links
77
+ * @param {boolean} [options.keepUrlFragments] - Preserve URL fragments
78
+ * @param {boolean} [options.respectRobotsTxt] - Honor robots.txt
79
+ * @param {object[]} [options.cookies] - Initial cookies array
80
+ * @param {object} [options.headers] - Custom HTTP headers
81
+ * @param {number} [options.maxConcurrency] - Max parallel requests
82
+ * @param {number} [options.maxRetries] - Max request retries
83
+ * @param {number} [options.maxResults] - Max results (0 = unlimited)
84
+ * @param {boolean} [options.saveRawHtml] - Save raw HTML
85
+ * @param {boolean} [options.saveText] - Save extracted text
86
+ * @param {boolean} [options.saveJson] - Save extracted JSON
87
+ * @param {boolean} [options.saveXml] - Save extracted XML
88
+ * @param {boolean} [options.saveXmlTei] - Save extracted XML-TEI
65
89
  * @param {boolean} [options.verbose] - Verbose logging
66
90
  * @param {string} [options.stdio] - stdio option for child process
67
91
  * @returns {Promise<void>}
@@ -94,6 +118,51 @@ function extract(urls, options = {}) {
94
118
  if (options.outputDir) args.push("--output-dir", options.outputDir);
95
119
  if (options.format) args.push("--format", options.format);
96
120
 
121
+ // Proxy
122
+ if (options.proxyUrls) {
123
+ const proxyList = Array.isArray(options.proxyUrls) ? options.proxyUrls : [options.proxyUrls];
124
+ args.push("--proxy-urls", proxyList.join(","));
125
+ }
126
+ if (options.proxyRotation) args.push("--proxy-rotation", options.proxyRotation);
127
+
128
+ // Browser settings
129
+ if (options.launcher) args.push("--launcher", options.launcher);
130
+ if (options.waitUntil) args.push("--wait-until", options.waitUntil);
131
+ if (options.pageLoadTimeout != null) args.push("--page-load-timeout", String(options.pageLoadTimeout));
132
+ if (options.ignoreCors) args.push("--ignore-cors");
133
+ if (options.closeCookieModals) args.push("--close-cookie-modals");
134
+ if (options.maxScrollHeight != null) args.push("--max-scroll-height", String(options.maxScrollHeight));
135
+ if (options.ignoreSslErrors) args.push("--ignore-ssl-errors");
136
+
137
+ // Crawl filtering
138
+ if (options.globs) {
139
+ const globList = Array.isArray(options.globs) ? options.globs : [options.globs];
140
+ args.push("--globs", globList.join(","));
141
+ }
142
+ if (options.excludes) {
143
+ const excludeList = Array.isArray(options.excludes) ? options.excludes : [options.excludes];
144
+ args.push("--excludes", excludeList.join(","));
145
+ }
146
+ if (options.linkSelector) args.push("--link-selector", options.linkSelector);
147
+ if (options.keepUrlFragments) args.push("--keep-url-fragments");
148
+ if (options.respectRobotsTxt) args.push("--respect-robots-txt");
149
+
150
+ // Cookies & headers
151
+ if (options.cookies) args.push("--cookies", JSON.stringify(options.cookies));
152
+ if (options.headers) args.push("--headers", JSON.stringify(options.headers));
153
+
154
+ // Concurrency & retries
155
+ if (options.maxConcurrency != null) args.push("--max-concurrency", String(options.maxConcurrency));
156
+ if (options.maxRetries != null) args.push("--max-retries", String(options.maxRetries));
157
+ if (options.maxResults != null) args.push("--max-results", String(options.maxResults));
158
+
159
+ // Output toggles
160
+ if (options.saveRawHtml) args.push("--save-raw-html");
161
+ if (options.saveText) args.push("--save-text");
162
+ if (options.saveJson) args.push("--save-json");
163
+ if (options.saveXml) args.push("--save-xml");
164
+ if (options.saveXmlTei) args.push("--save-xml-tei");
165
+
97
166
  // Extraction options
98
167
  if (options.precision) args.push("--precision");
99
168
  if (options.recall) args.push("--recall");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "contextractor",
3
- "version": "0.2.1",
3
+ "version": "0.3.0",
4
4
  "description": "Extract web content from URLs with configurable extraction options",
5
5
  "license": "MIT",
6
6
  "repository": {