contextractor 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +118 -18
  2. package/index.js +72 -1
  3. package/package.json +2 -2
package/README.md CHANGED
@@ -30,13 +30,50 @@ contextractor --config config.yaml --max-pages 10
30
30
  ```
31
31
  contextractor [OPTIONS] [URLS...]
32
32
 
33
- Options:
33
+ Crawl Settings:
34
34
  --config, -c Path to YAML or JSON config file
35
35
  --output-dir, -o Output directory
36
- --format, -f Output format (txt, markdown, json, xml, xmltei)
36
+ --format, -f Output format (txt, markdown, json, jsonl, xml, xmltei)
37
37
  --max-pages Max pages to crawl (0 = unlimited)
38
38
  --crawl-depth Max link depth from start URLs (0 = start only)
39
39
  --headless/--no-headless Browser headless mode (default: headless)
40
+ --max-concurrency Max parallel requests (default: 50)
41
+ --max-retries Max request retries (default: 3)
42
+ --max-results Max results per crawl (0 = unlimited)
43
+
44
+ Proxy:
45
+ --proxy-urls Comma-separated proxy URLs (http://user:pass@host:port)
46
+ --proxy-rotation Rotation: recommended, per_request, until_failure
47
+
48
+ Browser:
49
+ --launcher Browser engine: chromium, firefox (default: chromium)
50
+ --wait-until Page load event: load, networkidle, domcontentloaded (default: load)
51
+ --page-load-timeout Timeout in seconds (default: 60)
52
+ --ignore-cors Disable CORS/CSP restrictions
53
+ --close-cookie-modals Auto-dismiss cookie banners
54
+ --max-scroll-height Max scroll height in pixels (default: 5000)
55
+ --ignore-ssl-errors Skip SSL certificate verification
56
+ --user-agent Custom User-Agent string
57
+
58
+ Crawl Filtering:
59
+ --globs Comma-separated glob patterns to include
60
+ --excludes Comma-separated glob patterns to exclude
61
+ --link-selector CSS selector for links to follow
62
+ --keep-url-fragments Preserve URL fragments
63
+ --respect-robots-txt Honor robots.txt
64
+
65
+ Cookies & Headers:
66
+ --cookies JSON array of cookie objects
67
+ --headers JSON object of custom HTTP headers
68
+
69
+ Output Toggles:
70
+ --save-raw-html Save raw HTML to output
71
+ --save-text Save extracted text
72
+ --save-json Save extracted JSON
73
+ --save-xml Save extracted XML
74
+ --save-xml-tei Save extracted XML-TEI
75
+
76
+ Content Extraction:
40
77
  --precision High precision mode (less noise)
41
78
  --recall High recall mode (more content)
42
79
  --fast Fast extraction mode (less thorough)
@@ -49,6 +86,8 @@ Options:
49
86
  --target-language Filter by language (e.g. "en")
50
87
  --with-metadata/--no-metadata Extract metadata (default: with)
51
88
  --prune-xpath XPath patterns to remove from content
89
+
90
+ Diagnostics:
52
91
  --verbose, -v Enable verbose logging
53
92
  ```
54
93
 
@@ -56,32 +95,92 @@ CLI flags override config file settings. Merge order: `defaults → config file
56
95
 
57
96
  ### Config File (optional)
58
97
 
59
- ```yaml
60
- urls:
61
- - https://example.com
62
- - https://docs.example.com
63
- outputFormat: markdown
64
- outputDir: ./output
65
- crawlDepth: 1
66
-
67
- extraction:
68
- favorPrecision: true
69
- includeLinks: true
70
- includeTables: true
71
- deduplicate: true
98
+ Both JSON and YAML formats are supported. The example below uses JSON:
99
+
100
+ ```json
101
+ {
102
+ "urls": ["https://example.com", "https://docs.example.com"],
103
+ "outputFormat": "markdown",
104
+ "outputDir": "./output",
105
+ "crawlDepth": 1,
106
+ "proxy": {
107
+ "urls": ["http://user:pass@host:port"],
108
+ "rotation": "recommended"
109
+ },
110
+ "extraction": {
111
+ "favorPrecision": true,
112
+ "includeLinks": true,
113
+ "includeTables": true,
114
+ "deduplicate": true
115
+ }
116
+ }
72
117
  ```
73
118
 
119
+ ### Crawl Settings
120
+
74
121
  | Field | Type | Default | Description |
75
122
  |-------|------|---------|-------------|
76
123
  | `urls` | array | `[]` | URLs to extract content from |
77
124
  | `maxPages` | int | 0 | Max pages to crawl (0 = unlimited) |
78
- | `outputFormat` | string | `"markdown"` | `txt`, `markdown`, `json`, `xml`, `xmltei` |
125
+ | `outputFormat` | string | `"markdown"` | `txt`, `markdown`, `json`, `jsonl`, `xml`, `xmltei` |
79
126
  | `outputDir` | string | `"./output"` | Directory for extracted content |
80
127
  | `crawlDepth` | int | 0 | How deep to follow links (0 = start URLs only) |
81
128
  | `headless` | bool | true | Browser headless mode |
82
- | `extraction` | object | `{}` | Trafilatura extraction options (see below) |
129
+ | `maxConcurrency` | int | 50 | Max parallel browser pages |
130
+ | `maxRetries` | int | 3 | Max retries for failed requests |
131
+ | `maxResults` | int | 0 | Max results per crawl (0 = unlimited) |
132
+
133
+ ### Proxy Configuration
134
+
135
+ | Field | Type | Default | Description |
136
+ |-------|------|---------|-------------|
137
+ | `proxy.urls` | array | `[]` | Proxy URLs (`http://user:pass@host:port` or `socks5://host:port`) |
138
+ | `proxy.rotation` | string | `"recommended"` | `recommended`, `per_request`, `until_failure` |
139
+ | `proxy.tiered` | array | `[]` | Tiered proxy escalation (config-file only) |
140
+
141
+ ### Browser Settings
142
+
143
+ | Field | Type | Default | Description |
144
+ |-------|------|---------|-------------|
145
+ | `launcher` | string | `"chromium"` | Browser engine: `chromium`, `firefox` |
146
+ | `waitUntil` | string | `"load"` | Page load event: `load`, `networkidle`, `domcontentloaded` |
147
+ | `pageLoadTimeout` | int | 60 | Page load timeout in seconds |
148
+ | `ignoreCors` | bool | false | Disable CORS/CSP restrictions |
149
+ | `closeCookieModals` | bool | false | Auto-dismiss cookie consent banners |
150
+ | `maxScrollHeight` | int | 5000 | Max scroll height in pixels (0 = disable) |
151
+ | `ignoreSslErrors` | bool | false | Skip SSL certificate verification |
152
+ | `userAgent` | string | `""` | Custom User-Agent string |
153
+
154
+ ### Crawl Filtering
155
+
156
+ | Field | Type | Default | Description |
157
+ |-------|------|---------|-------------|
158
+ | `globs` | array | `[]` | Glob patterns for URLs to include |
159
+ | `excludes` | array | `[]` | Glob patterns for URLs to exclude |
160
+ | `linkSelector` | string | `""` | CSS selector for links to follow |
161
+ | `keepUrlFragments` | bool | false | Treat URLs with different fragments as different pages |
162
+ | `respectRobotsTxt` | bool | false | Honor robots.txt |
163
+
164
+ ### Cookies & Headers
165
+
166
+ | Field | Type | Default | Description |
167
+ |-------|------|---------|-------------|
168
+ | `cookies` | array | `[]` | Initial cookies (`[{"name": "...", "value": "...", "domain": "..."}]`) |
169
+ | `headers` | object | `{}` | Custom HTTP headers (`{"Authorization": "Bearer token"}`) |
170
+
171
+ ### Output Toggles
172
+
173
+ Save additional formats alongside the primary output:
174
+
175
+ | Field | Type | Default | Description |
176
+ |-------|------|---------|-------------|
177
+ | `saveRawHtml` | bool | false | Save raw HTML |
178
+ | `saveText` | bool | false | Save extracted plain text |
179
+ | `saveJson` | bool | false | Save extracted JSON |
180
+ | `saveXml` | bool | false | Save extracted XML |
181
+ | `saveXmlTei` | bool | false | Save extracted XML-TEI |
83
182
 
84
- ### Extraction Options
183
+ ### Content Extraction
85
184
 
86
185
  All options go under the `extraction` key in config files, or use the equivalent CLI flags:
87
186
 
@@ -98,6 +197,7 @@ All options go under the `extraction` key in config files, or use the equivalent
98
197
  | `withMetadata` | bool | true | Extract metadata (title, author, date) |
99
198
  | `targetLanguage` | string | null | Filter by language (e.g. `"en"`) |
100
199
  | `fast` | bool | false | Fast mode (less thorough) |
200
+ | `pruneXpath` | array | null | XPath patterns to remove from content |
101
201
 
102
202
  ## Docker
103
203
 
package/index.js CHANGED
@@ -51,7 +51,7 @@ function getBinaryPath() {
51
51
  * @param {boolean} [options.noLinks] - Exclude links
52
52
  * @param {boolean} [options.noComments] - Exclude comments
53
53
  * @param {string} [options.outputDir] - Output directory
54
- * @param {string} [options.format] - Output format (txt, markdown, json, xml, xmltei)
54
+ * @param {string} [options.format] - Output format (txt, markdown, json, jsonl, xml, xmltei)
55
55
  * @param {number} [options.maxPages] - Max pages to crawl
56
56
  * @param {number} [options.crawlDepth] - Max crawl depth
57
57
  * @param {boolean} [options.headless] - Run headless (default true)
@@ -62,6 +62,31 @@ function getBinaryPath() {
62
62
  * @param {string} [options.targetLanguage] - Filter by language
63
63
  * @param {boolean} [options.withMetadata] - Extract metadata
64
64
  * @param {string|string[]} [options.pruneXpath] - XPath patterns to prune
65
+ * @param {string|string[]} [options.proxyUrls] - Proxy URLs
66
+ * @param {string} [options.proxyRotation] - Proxy rotation strategy
67
+ * @param {string} [options.launcher] - Browser engine (chromium, firefox)
68
+ * @param {string} [options.waitUntil] - Page load event (networkidle, load, domcontentloaded)
69
+ * @param {number} [options.pageLoadTimeout] - Page load timeout in seconds
70
+ * @param {boolean} [options.ignoreCors] - Disable CORS/CSP
71
+ * @param {boolean} [options.closeCookieModals] - Auto-dismiss cookie banners
72
+ * @param {number} [options.maxScrollHeight] - Max scroll height in pixels
73
+ * @param {boolean} [options.ignoreSslErrors] - Skip SSL verification
74
+ * @param {string} [options.userAgent] - Custom User-Agent string
75
+ * @param {string|string[]} [options.globs] - Glob patterns to include
76
+ * @param {string|string[]} [options.excludes] - Glob patterns to exclude
77
+ * @param {string} [options.linkSelector] - CSS selector for links
78
+ * @param {boolean} [options.keepUrlFragments] - Preserve URL fragments
79
+ * @param {boolean} [options.respectRobotsTxt] - Honor robots.txt
80
+ * @param {object[]} [options.cookies] - Initial cookies array
81
+ * @param {object} [options.headers] - Custom HTTP headers
82
+ * @param {number} [options.maxConcurrency] - Max parallel requests
83
+ * @param {number} [options.maxRetries] - Max request retries
84
+ * @param {number} [options.maxResults] - Max results (0 = unlimited)
85
+ * @param {boolean} [options.saveRawHtml] - Save raw HTML
86
+ * @param {boolean} [options.saveText] - Save extracted text
87
+ * @param {boolean} [options.saveJson] - Save extracted JSON
88
+ * @param {boolean} [options.saveXml] - Save extracted XML
89
+ * @param {boolean} [options.saveXmlTei] - Save extracted XML-TEI
65
90
  * @param {boolean} [options.verbose] - Verbose logging
66
91
  * @param {string} [options.stdio] - stdio option for child process
67
92
  * @returns {Promise<void>}
@@ -94,6 +119,52 @@ function extract(urls, options = {}) {
94
119
  if (options.outputDir) args.push("--output-dir", options.outputDir);
95
120
  if (options.format) args.push("--format", options.format);
96
121
 
122
+ // Proxy
123
+ if (options.proxyUrls) {
124
+ const proxyList = Array.isArray(options.proxyUrls) ? options.proxyUrls : [options.proxyUrls];
125
+ args.push("--proxy-urls", proxyList.join(","));
126
+ }
127
+ if (options.proxyRotation) args.push("--proxy-rotation", options.proxyRotation);
128
+
129
+ // Browser settings
130
+ if (options.launcher) args.push("--launcher", options.launcher);
131
+ if (options.waitUntil) args.push("--wait-until", options.waitUntil);
132
+ if (options.pageLoadTimeout != null) args.push("--page-load-timeout", String(options.pageLoadTimeout));
133
+ if (options.ignoreCors) args.push("--ignore-cors");
134
+ if (options.closeCookieModals) args.push("--close-cookie-modals");
135
+ if (options.maxScrollHeight != null) args.push("--max-scroll-height", String(options.maxScrollHeight));
136
+ if (options.ignoreSslErrors) args.push("--ignore-ssl-errors");
137
+ if (options.userAgent) args.push("--user-agent", options.userAgent);
138
+
139
+ // Crawl filtering
140
+ if (options.globs) {
141
+ const globList = Array.isArray(options.globs) ? options.globs : [options.globs];
142
+ args.push("--globs", globList.join(","));
143
+ }
144
+ if (options.excludes) {
145
+ const excludeList = Array.isArray(options.excludes) ? options.excludes : [options.excludes];
146
+ args.push("--excludes", excludeList.join(","));
147
+ }
148
+ if (options.linkSelector) args.push("--link-selector", options.linkSelector);
149
+ if (options.keepUrlFragments) args.push("--keep-url-fragments");
150
+ if (options.respectRobotsTxt) args.push("--respect-robots-txt");
151
+
152
+ // Cookies & headers
153
+ if (options.cookies) args.push("--cookies", JSON.stringify(options.cookies));
154
+ if (options.headers) args.push("--headers", JSON.stringify(options.headers));
155
+
156
+ // Concurrency & retries
157
+ if (options.maxConcurrency != null) args.push("--max-concurrency", String(options.maxConcurrency));
158
+ if (options.maxRetries != null) args.push("--max-retries", String(options.maxRetries));
159
+ if (options.maxResults != null) args.push("--max-results", String(options.maxResults));
160
+
161
+ // Output toggles
162
+ if (options.saveRawHtml) args.push("--save-raw-html");
163
+ if (options.saveText) args.push("--save-text");
164
+ if (options.saveJson) args.push("--save-json");
165
+ if (options.saveXml) args.push("--save-xml");
166
+ if (options.saveXmlTei) args.push("--save-xml-tei");
167
+
97
168
  // Extraction options
98
169
  if (options.precision) args.push("--precision");
99
170
  if (options.recall) args.push("--recall");
package/package.json CHANGED
@@ -1,13 +1,13 @@
1
1
  {
2
2
  "name": "contextractor",
3
- "version": "0.2.1",
3
+ "version": "0.3.1",
4
4
  "description": "Extract web content from URLs with configurable extraction options",
5
5
  "license": "MIT",
6
6
  "repository": {
7
7
  "type": "git",
8
8
  "url": "https://github.com/contextractor/contextractor.git"
9
9
  },
10
- "homepage": "https://github.com/contextractor/contextractor",
10
+ "homepage": "https://www.contextractor.com/",
11
11
  "bugs": {
12
12
  "url": "https://github.com/contextractor/contextractor/issues"
13
13
  },