contextractor 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +229 -0
  2. package/index.js +69 -0
  3. package/package.json +3 -2
package/README.md ADDED
@@ -0,0 +1,229 @@
1
+ # Contextractor
2
+
3
+ Extract clean, readable content from any website using [Trafilatura](https://trafilatura.readthedocs.io/).
4
+
5
+ Available as: [npm CLI](#install) | [Docker](#docker) | [Apify actor](https://apify.com/shortc/contextractor) | [Web app](https://contextractor.com)
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ npm install -g contextractor
11
+ ```
12
+
13
+ Requires Node.js 18+. Playwright Chromium is installed automatically.
14
+
15
+ ## Usage
16
+
17
+ ```bash
18
+ contextractor https://example.com
19
+ ```
20
+
21
+ Works with zero config. Pass URLs directly, or use a config file for complex setups:
22
+
23
+ ```bash
24
+ contextractor https://example.com --precision --format json -o ./results
25
+ contextractor --config config.yaml --max-pages 10
26
+ ```
27
+
28
+ ### CLI Options
29
+
30
+ ```
31
+ contextractor [OPTIONS] [URLS...]
32
+
33
+ Crawl Settings:
34
+ --config, -c Path to YAML or JSON config file
35
+ --output-dir, -o Output directory
36
+ --format, -f Output format (txt, markdown, json, xml, xmltei)
37
+ --max-pages Max pages to crawl (0 = unlimited)
38
+ --crawl-depth Max link depth from start URLs (0 = start only)
39
+ --headless/--no-headless Browser headless mode (default: headless)
40
+ --max-concurrency Max parallel requests (default: 50)
41
+ --max-retries Max request retries (default: 3)
42
+ --max-results Max results per crawl (0 = unlimited)
43
+
44
+ Proxy:
45
+ --proxy-urls Comma-separated proxy URLs (http://user:pass@host:port)
46
+ --proxy-rotation Rotation: recommended, per_request, until_failure
47
+
48
+ Browser:
49
+ --launcher Browser engine: chromium, firefox (default: chromium)
50
+ --wait-until Page load event: networkidle, load, domcontentloaded
51
+ --page-load-timeout Timeout in seconds (default: 60)
52
+ --ignore-cors Disable CORS/CSP restrictions
53
+ --close-cookie-modals Auto-dismiss cookie banners
54
+ --max-scroll-height Max scroll height in pixels (default: 5000)
55
+ --ignore-ssl-errors Skip SSL certificate verification
56
+
57
+ Crawl Filtering:
58
+ --globs Comma-separated glob patterns to include
59
+ --excludes Comma-separated glob patterns to exclude
60
+ --link-selector CSS selector for links to follow
61
+ --keep-url-fragments Preserve URL fragments
62
+ --respect-robots-txt Honor robots.txt
63
+
64
+ Cookies & Headers:
65
+ --cookies JSON array of cookie objects
66
+ --headers JSON object of custom HTTP headers
67
+
68
+ Output Toggles:
69
+ --save-raw-html Save raw HTML to output
70
+ --save-text Save extracted text
71
+ --save-json Save extracted JSON
72
+ --save-xml Save extracted XML
73
+ --save-xml-tei Save extracted XML-TEI
74
+
75
+ Content Extraction:
76
+ --precision High precision mode (less noise)
77
+ --recall High recall mode (more content)
78
+ --fast Fast extraction mode (less thorough)
79
+ --no-links Exclude links from output
80
+ --no-comments Exclude comments from output
81
+ --include-tables/--no-tables Include tables (default: include)
82
+ --include-images Include image descriptions
83
+ --include-formatting/--no-formatting Preserve formatting (default: preserve)
84
+ --deduplicate Deduplicate extracted content
85
+ --target-language Filter by language (e.g. "en")
86
+ --with-metadata/--no-metadata Extract metadata (default: with)
87
+ --prune-xpath XPath patterns to remove from content
88
+
89
+ Diagnostics:
90
+ --verbose, -v Enable verbose logging
91
+ ```
92
+
93
+ CLI flags override config file settings. Merge order: `defaults → config file → CLI args`
94
+
95
+ ### Config File (optional)
96
+
97
+ Supports both JSON and YAML formats. JSON examples shown below:
98
+
99
+ ```json
100
+ {
101
+ "urls": ["https://example.com", "https://docs.example.com"],
102
+ "outputFormat": "markdown",
103
+ "outputDir": "./output",
104
+ "crawlDepth": 1,
105
+ "proxy": {
106
+ "urls": ["http://user:pass@host:port"],
107
+ "rotation": "recommended"
108
+ },
109
+ "extraction": {
110
+ "favorPrecision": true,
111
+ "includeLinks": true,
112
+ "includeTables": true,
113
+ "deduplicate": true
114
+ }
115
+ }
116
+ ```
117
+
118
+ ### Crawl Settings
119
+
120
+ | Field | Type | Default | Description |
121
+ |-------|------|---------|-------------|
122
+ | `urls` | array | `[]` | URLs to extract content from |
123
+ | `maxPages` | int | 0 | Max pages to crawl (0 = unlimited) |
124
+ | `outputFormat` | string | `"markdown"` | `txt`, `markdown`, `json`, `xml`, `xmltei` |
125
+ | `outputDir` | string | `"./output"` | Directory for extracted content |
126
+ | `crawlDepth` | int | 0 | How deep to follow links (0 = start URLs only) |
127
+ | `headless` | bool | true | Browser headless mode |
128
+ | `maxConcurrency` | int | 50 | Max parallel browser pages |
129
+ | `maxRetries` | int | 3 | Max retries for failed requests |
130
+ | `maxResults` | int | 0 | Max results per crawl (0 = unlimited) |
131
+
132
+ ### Proxy Configuration
133
+
134
+ | Field | Type | Default | Description |
135
+ |-------|------|---------|-------------|
136
+ | `proxy.urls` | array | `[]` | Proxy URLs (`http://user:pass@host:port` or `socks5://host:port`) |
137
+ | `proxy.rotation` | string | `"recommended"` | `recommended`, `per_request`, `until_failure` |
138
+
139
+ ### Browser Settings
140
+
141
+ | Field | Type | Default | Description |
142
+ |-------|------|---------|-------------|
143
+ | `launcher` | string | `"chromium"` | Browser engine: `chromium`, `firefox` |
144
+ | `waitUntil` | string | `"networkidle"` | Page load event: `networkidle`, `load`, `domcontentloaded` |
145
+ | `pageLoadTimeout` | int | 60 | Page load timeout in seconds |
146
+ | `ignoreCors` | bool | false | Disable CORS/CSP restrictions |
147
+ | `closeCookieModals` | bool | false | Auto-dismiss cookie consent banners |
148
+ | `maxScrollHeight` | int | 5000 | Max scroll height in pixels (0 = disable) |
149
+ | `ignoreSslErrors` | bool | false | Skip SSL certificate verification |
150
+
151
+ ### Crawl Filtering
152
+
153
+ | Field | Type | Default | Description |
154
+ |-------|------|---------|-------------|
155
+ | `globs` | array | `[]` | Glob patterns for URLs to include |
156
+ | `excludes` | array | `[]` | Glob patterns for URLs to exclude |
157
+ | `linkSelector` | string | `""` | CSS selector for links to follow |
158
+ | `keepUrlFragments` | bool | false | Treat URLs with different fragments as different pages |
159
+ | `respectRobotsTxt` | bool | false | Honor robots.txt |
160
+
161
+ ### Cookies & Headers
162
+
163
+ | Field | Type | Default | Description |
164
+ |-------|------|---------|-------------|
165
+ | `cookies` | array | `[]` | Initial cookies (`[{"name": "...", "value": "...", "domain": "..."}]`) |
166
+ | `headers` | object | `{}` | Custom HTTP headers (`{"Authorization": "Bearer token"}`) |
167
+
168
+ ### Output Toggles
169
+
170
+ Save additional formats alongside the primary output:
171
+
172
+ | Field | Type | Default | Description |
173
+ |-------|------|---------|-------------|
174
+ | `saveRawHtml` | bool | false | Save raw HTML |
175
+ | `saveText` | bool | false | Save extracted plain text |
176
+ | `saveJson` | bool | false | Save extracted JSON |
177
+ | `saveXml` | bool | false | Save extracted XML |
178
+ | `saveXmlTei` | bool | false | Save extracted XML-TEI |
179
+
180
+ ### Content Extraction
181
+
182
+ All options go under the `extraction` key in config files, or use the equivalent CLI flags:
183
+
184
+ | Field | Type | Default | Description |
185
+ |-------|------|---------|-------------|
186
+ | `favorPrecision` | bool | false | High precision, less noise |
187
+ | `favorRecall` | bool | false | High recall, more content |
188
+ | `includeComments` | bool | true | Include comments |
189
+ | `includeTables` | bool | true | Include tables |
190
+ | `includeImages` | bool | false | Include image descriptions |
191
+ | `includeFormatting` | bool | true | Preserve formatting |
192
+ | `includeLinks` | bool | true | Include links |
193
+ | `deduplicate` | bool | false | Deduplicate content |
194
+ | `withMetadata` | bool | true | Extract metadata (title, author, date) |
195
+ | `targetLanguage` | string | null | Filter by language (e.g. `"en"`) |
196
+ | `fast` | bool | false | Fast mode (less thorough) |
197
+
198
+ ## Docker
199
+
200
+ ```bash
201
+ docker run ghcr.io/contextractor/contextractor https://example.com
202
+ ```
203
+
204
+ Save output to your local machine:
205
+
206
+ ```bash
207
+ docker run -v ./output:/output ghcr.io/contextractor/contextractor https://example.com -o /output
208
+ ```
209
+
210
+ Use a config file:
211
+
212
+ ```bash
213
+ docker run -v ./config.yaml:/config.yaml ghcr.io/contextractor/contextractor --config /config.yaml
214
+ ```
215
+
216
+ All CLI flags work the same inside Docker.
217
+
218
+ ## Output
219
+
220
+ One file per crawled page, named from the URL slug (e.g. `example-com-page.md`). Metadata (title, author, date) is included in the output header when available.
221
+
222
+ ## Platforms
223
+
224
+ - npm: macOS arm64, Linux (x64, arm64), Windows x64
225
+ - Docker: linux/amd64, linux/arm64
226
+
227
+ ## License
228
+
229
+ MIT
package/index.js CHANGED
@@ -62,6 +62,30 @@ function getBinaryPath() {
62
62
  * @param {string} [options.targetLanguage] - Filter by language
63
63
  * @param {boolean} [options.withMetadata] - Extract metadata
64
64
  * @param {string|string[]} [options.pruneXpath] - XPath patterns to prune
65
+ * @param {string|string[]} [options.proxyUrls] - Proxy URLs
66
+ * @param {string} [options.proxyRotation] - Proxy rotation strategy
67
+ * @param {string} [options.launcher] - Browser engine (chromium, firefox)
68
+ * @param {string} [options.waitUntil] - Page load event (networkidle, load, domcontentloaded)
69
+ * @param {number} [options.pageLoadTimeout] - Page load timeout in seconds
70
+ * @param {boolean} [options.ignoreCors] - Disable CORS/CSP
71
+ * @param {boolean} [options.closeCookieModals] - Auto-dismiss cookie banners
72
+ * @param {number} [options.maxScrollHeight] - Max scroll height in pixels
73
+ * @param {boolean} [options.ignoreSslErrors] - Skip SSL verification
74
+ * @param {string|string[]} [options.globs] - Glob patterns to include
75
+ * @param {string|string[]} [options.excludes] - Glob patterns to exclude
76
+ * @param {string} [options.linkSelector] - CSS selector for links
77
+ * @param {boolean} [options.keepUrlFragments] - Preserve URL fragments
78
+ * @param {boolean} [options.respectRobotsTxt] - Honor robots.txt
79
+ * @param {object[]} [options.cookies] - Initial cookies array
80
+ * @param {object} [options.headers] - Custom HTTP headers
81
+ * @param {number} [options.maxConcurrency] - Max parallel requests
82
+ * @param {number} [options.maxRetries] - Max request retries
83
+ * @param {number} [options.maxResults] - Max results (0 = unlimited)
84
+ * @param {boolean} [options.saveRawHtml] - Save raw HTML
85
+ * @param {boolean} [options.saveText] - Save extracted text
86
+ * @param {boolean} [options.saveJson] - Save extracted JSON
87
+ * @param {boolean} [options.saveXml] - Save extracted XML
88
+ * @param {boolean} [options.saveXmlTei] - Save extracted XML-TEI
65
89
  * @param {boolean} [options.verbose] - Verbose logging
66
90
  * @param {string} [options.stdio] - stdio option for child process
67
91
  * @returns {Promise<void>}
@@ -94,6 +118,51 @@ function extract(urls, options = {}) {
94
118
  if (options.outputDir) args.push("--output-dir", options.outputDir);
95
119
  if (options.format) args.push("--format", options.format);
96
120
 
121
+ // Proxy
122
+ if (options.proxyUrls) {
123
+ const proxyList = Array.isArray(options.proxyUrls) ? options.proxyUrls : [options.proxyUrls];
124
+ args.push("--proxy-urls", proxyList.join(","));
125
+ }
126
+ if (options.proxyRotation) args.push("--proxy-rotation", options.proxyRotation);
127
+
128
+ // Browser settings
129
+ if (options.launcher) args.push("--launcher", options.launcher);
130
+ if (options.waitUntil) args.push("--wait-until", options.waitUntil);
131
+ if (options.pageLoadTimeout != null) args.push("--page-load-timeout", String(options.pageLoadTimeout));
132
+ if (options.ignoreCors) args.push("--ignore-cors");
133
+ if (options.closeCookieModals) args.push("--close-cookie-modals");
134
+ if (options.maxScrollHeight != null) args.push("--max-scroll-height", String(options.maxScrollHeight));
135
+ if (options.ignoreSslErrors) args.push("--ignore-ssl-errors");
136
+
137
+ // Crawl filtering
138
+ if (options.globs) {
139
+ const globList = Array.isArray(options.globs) ? options.globs : [options.globs];
140
+ args.push("--globs", globList.join(","));
141
+ }
142
+ if (options.excludes) {
143
+ const excludeList = Array.isArray(options.excludes) ? options.excludes : [options.excludes];
144
+ args.push("--excludes", excludeList.join(","));
145
+ }
146
+ if (options.linkSelector) args.push("--link-selector", options.linkSelector);
147
+ if (options.keepUrlFragments) args.push("--keep-url-fragments");
148
+ if (options.respectRobotsTxt) args.push("--respect-robots-txt");
149
+
150
+ // Cookies & headers
151
+ if (options.cookies) args.push("--cookies", JSON.stringify(options.cookies));
152
+ if (options.headers) args.push("--headers", JSON.stringify(options.headers));
153
+
154
+ // Concurrency & retries
155
+ if (options.maxConcurrency != null) args.push("--max-concurrency", String(options.maxConcurrency));
156
+ if (options.maxRetries != null) args.push("--max-retries", String(options.maxRetries));
157
+ if (options.maxResults != null) args.push("--max-results", String(options.maxResults));
158
+
159
+ // Output toggles
160
+ if (options.saveRawHtml) args.push("--save-raw-html");
161
+ if (options.saveText) args.push("--save-text");
162
+ if (options.saveJson) args.push("--save-json");
163
+ if (options.saveXml) args.push("--save-xml");
164
+ if (options.saveXmlTei) args.push("--save-xml-tei");
165
+
97
166
  // Extraction options
98
167
  if (options.precision) args.push("--precision");
99
168
  if (options.recall) args.push("--recall");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "contextractor",
3
- "version": "0.2.0",
3
+ "version": "0.3.0",
4
4
  "description": "Extract web content from URLs with configurable extraction options",
5
5
  "license": "MIT",
6
6
  "repository": {
@@ -29,7 +29,8 @@
29
29
  "files": [
30
30
  "cli.js",
31
31
  "index.js",
32
- "postinstall.js"
32
+ "postinstall.js",
33
+ "README.md"
33
34
  ],
34
35
  "scripts": {
35
36
  "postinstall": "node postinstall.js"