contextractor 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +12 -8
  2. package/index.js +3 -1
  3. package/package.json +2 -2
package/README.md CHANGED
@@ -22,7 +22,7 @@ Works with zero config. Pass URLs directly, or use a config file for complex set
22
22
 
23
23
  ```bash
24
24
  contextractor https://example.com --precision --format json -o ./results
25
- contextractor --config config.yaml --max-pages 10
25
+ contextractor --config config.json --max-pages 10
26
26
  ```
27
27
 
28
28
  ### CLI Options
@@ -31,9 +31,9 @@ contextractor --config config.yaml --max-pages 10
31
31
  contextractor [OPTIONS] [URLS...]
32
32
 
33
33
  Crawl Settings:
34
- --config, -c Path to YAML or JSON config file
34
+ --config, -c Path to JSON config file
35
35
  --output-dir, -o Output directory
36
- --format, -f Output format (txt, markdown, json, xml, xmltei)
36
+ --format, -f Output format (txt, markdown, json, jsonl, xml, xmltei)
37
37
  --max-pages Max pages to crawl (0 = unlimited)
38
38
  --crawl-depth Max link depth from start URLs (0 = start only)
39
39
  --headless/--no-headless Browser headless mode (default: headless)
@@ -47,12 +47,13 @@ Proxy:
47
47
 
48
48
  Browser:
49
49
  --launcher Browser engine: chromium, firefox (default: chromium)
50
- --wait-until Page load event: networkidle, load, domcontentloaded
50
+ --wait-until Page load event: load, networkidle, domcontentloaded (default: load)
51
51
  --page-load-timeout Timeout in seconds (default: 60)
52
52
  --ignore-cors Disable CORS/CSP restrictions
53
53
  --close-cookie-modals Auto-dismiss cookie banners
54
54
  --max-scroll-height Max scroll height in pixels (default: 5000)
55
55
  --ignore-ssl-errors Skip SSL certificate verification
56
+ --user-agent Custom User-Agent string
56
57
 
57
58
  Crawl Filtering:
58
59
  --globs Comma-separated glob patterns to include
@@ -94,7 +95,7 @@ CLI flags override config file settings. Merge order: `defaults → config file
94
95
 
95
96
  ### Config File (optional)
96
97
 
97
- Supports both JSON and YAML format. JSON examples shown below:
98
+ Use a JSON config file to set options:
98
99
 
99
100
  ```json
100
101
  {
@@ -121,7 +122,7 @@ Supports both JSON and YAML format. JSON examples shown below:
121
122
  |-------|------|---------|-------------|
122
123
  | `urls` | array | `[]` | URLs to extract content from |
123
124
  | `maxPages` | int | 0 | Max pages to crawl (0 = unlimited) |
124
- | `outputFormat` | string | `"markdown"` | `txt`, `markdown`, `json`, `xml`, `xmltei` |
125
+ | `outputFormat` | string | `"markdown"` | `txt`, `markdown`, `json`, `jsonl`, `xml`, `xmltei` |
125
126
  | `outputDir` | string | `"./output"` | Directory for extracted content |
126
127
  | `crawlDepth` | int | 0 | How deep to follow links (0 = start URLs only) |
127
128
  | `headless` | bool | true | Browser headless mode |
@@ -135,18 +136,20 @@ Supports both JSON and YAML format. JSON examples shown below:
135
136
  |-------|------|---------|-------------|
136
137
  | `proxy.urls` | array | `[]` | Proxy URLs (`http://user:pass@host:port` or `socks5://host:port`) |
137
138
  | `proxy.rotation` | string | `"recommended"` | `recommended`, `per_request`, `until_failure` |
139
+ | `proxy.tiered` | array | `[]` | Tiered proxy escalation (config-file only) |
138
140
 
139
141
  ### Browser Settings
140
142
 
141
143
  | Field | Type | Default | Description |
142
144
  |-------|------|---------|-------------|
143
145
  | `launcher` | string | `"chromium"` | Browser engine: `chromium`, `firefox` |
144
- | `waitUntil` | string | `"networkidle"` | Page load event: `networkidle`, `load`, `domcontentloaded` |
146
+ | `waitUntil` | string | `"load"` | Page load event: `load`, `networkidle`, `domcontentloaded` |
145
147
  | `pageLoadTimeout` | int | 60 | Page load timeout in seconds |
146
148
  | `ignoreCors` | bool | false | Disable CORS/CSP restrictions |
147
149
  | `closeCookieModals` | bool | false | Auto-dismiss cookie consent banners |
148
150
  | `maxScrollHeight` | int | 5000 | Max scroll height in pixels (0 = disable) |
149
151
  | `ignoreSslErrors` | bool | false | Skip SSL certificate verification |
152
+ | `userAgent` | string | `""` | Custom User-Agent string |
150
153
 
151
154
  ### Crawl Filtering
152
155
 
@@ -194,6 +197,7 @@ All options go under the `extraction` key in config files, or use the equivalent
194
197
  | `withMetadata` | bool | true | Extract metadata (title, author, date) |
195
198
  | `targetLanguage` | string | null | Filter by language (e.g. `"en"`) |
196
199
  | `fast` | bool | false | Fast mode (less thorough) |
200
+ | `pruneXpath` | array | null | XPath patterns to remove from content |
197
201
 
198
202
  ## Docker
199
203
 
@@ -210,7 +214,7 @@ docker run -v ./output:/output ghcr.io/contextractor/contextractor https://examp
210
214
  Use a config file:
211
215
 
212
216
  ```bash
213
- docker run -v ./config.yaml:/config.yaml ghcr.io/contextractor/contextractor --config /config.yaml
217
+ docker run -v ./config.json:/config.json ghcr.io/contextractor/contextractor --config /config.json
214
218
  ```
215
219
 
216
220
  All CLI flags work the same inside Docker.
package/index.js CHANGED
@@ -51,7 +51,7 @@ function getBinaryPath() {
51
51
  * @param {boolean} [options.noLinks] - Exclude links
52
52
  * @param {boolean} [options.noComments] - Exclude comments
53
53
  * @param {string} [options.outputDir] - Output directory
54
- * @param {string} [options.format] - Output format (txt, markdown, json, xml, xmltei)
54
+ * @param {string} [options.format] - Output format (txt, markdown, json, jsonl, xml, xmltei)
55
55
  * @param {number} [options.maxPages] - Max pages to crawl
56
56
  * @param {number} [options.crawlDepth] - Max crawl depth
57
57
  * @param {boolean} [options.headless] - Run headless (default true)
@@ -71,6 +71,7 @@ function getBinaryPath() {
71
71
  * @param {boolean} [options.closeCookieModals] - Auto-dismiss cookie banners
72
72
  * @param {number} [options.maxScrollHeight] - Max scroll height in pixels
73
73
  * @param {boolean} [options.ignoreSslErrors] - Skip SSL verification
74
+ * @param {string} [options.userAgent] - Custom User-Agent string
74
75
  * @param {string|string[]} [options.globs] - Glob patterns to include
75
76
  * @param {string|string[]} [options.excludes] - Glob patterns to exclude
76
77
  * @param {string} [options.linkSelector] - CSS selector for links
@@ -133,6 +134,7 @@ function extract(urls, options = {}) {
133
134
  if (options.closeCookieModals) args.push("--close-cookie-modals");
134
135
  if (options.maxScrollHeight != null) args.push("--max-scroll-height", String(options.maxScrollHeight));
135
136
  if (options.ignoreSslErrors) args.push("--ignore-ssl-errors");
137
+ if (options.userAgent) args.push("--user-agent", options.userAgent);
136
138
 
137
139
  // Crawl filtering
138
140
  if (options.globs) {
package/package.json CHANGED
@@ -1,13 +1,13 @@
1
1
  {
2
2
  "name": "contextractor",
3
- "version": "0.3.0",
3
+ "version": "0.3.2",
4
4
  "description": "Extract web content from URLs with configurable extraction options",
5
5
  "license": "MIT",
6
6
  "repository": {
7
7
  "type": "git",
8
8
  "url": "https://github.com/contextractor/contextractor.git"
9
9
  },
10
- "homepage": "https://github.com/contextractor/contextractor",
10
+ "homepage": "https://www.contextractor.com/",
11
11
  "bugs": {
12
12
  "url": "https://github.com/contextractor/contextractor/issues"
13
13
  },