contextractor 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +8 -4
  2. package/index.js +3 -1
  3. package/package.json +2 -2
package/README.md CHANGED
@@ -33,7 +33,7 @@ contextractor [OPTIONS] [URLS...]
33
33
  Crawl Settings:
34
34
  --config, -c Path to YAML or JSON config file
35
35
  --output-dir, -o Output directory
36
- --format, -f Output format (txt, markdown, json, xml, xmltei)
36
+ --format, -f Output format (txt, markdown, json, jsonl, xml, xmltei)
37
37
  --max-pages Max pages to crawl (0 = unlimited)
38
38
  --crawl-depth Max link depth from start URLs (0 = start only)
39
39
  --headless/--no-headless Browser headless mode (default: headless)
@@ -47,12 +47,13 @@ Proxy:
47
47
 
48
48
  Browser:
49
49
  --launcher Browser engine: chromium, firefox (default: chromium)
50
- --wait-until Page load event: networkidle, load, domcontentloaded
50
+ --wait-until Page load event: load, networkidle, domcontentloaded (default: load)
51
51
  --page-load-timeout Timeout in seconds (default: 60)
52
52
  --ignore-cors Disable CORS/CSP restrictions
53
53
  --close-cookie-modals Auto-dismiss cookie banners
54
54
  --max-scroll-height Max scroll height in pixels (default: 5000)
55
55
  --ignore-ssl-errors Skip SSL certificate verification
56
+ --user-agent Custom User-Agent string
56
57
 
57
58
  Crawl Filtering:
58
59
  --globs Comma-separated glob patterns to include
@@ -121,7 +122,7 @@ Supports both JSON and YAML format. JSON examples shown below:
121
122
  |-------|------|---------|-------------|
122
123
  | `urls` | array | `[]` | URLs to extract content from |
123
124
  | `maxPages` | int | 0 | Max pages to crawl (0 = unlimited) |
124
- | `outputFormat` | string | `"markdown"` | `txt`, `markdown`, `json`, `xml`, `xmltei` |
125
+ | `outputFormat` | string | `"markdown"` | `txt`, `markdown`, `json`, `jsonl`, `xml`, `xmltei` |
125
126
  | `outputDir` | string | `"./output"` | Directory for extracted content |
126
127
  | `crawlDepth` | int | 0 | How deep to follow links (0 = start URLs only) |
127
128
  | `headless` | bool | true | Browser headless mode |
@@ -135,18 +136,20 @@ Supports both JSON and YAML format. JSON examples shown below:
135
136
  |-------|------|---------|-------------|
136
137
  | `proxy.urls` | array | `[]` | Proxy URLs (`http://user:pass@host:port` or `socks5://host:port`) |
137
138
  | `proxy.rotation` | string | `"recommended"` | `recommended`, `per_request`, `until_failure` |
139
+ | `proxy.tiered` | array | `[]` | Tiered proxy escalation (config-file only) |
138
140
 
139
141
  ### Browser Settings
140
142
 
141
143
  | Field | Type | Default | Description |
142
144
  |-------|------|---------|-------------|
143
145
  | `launcher` | string | `"chromium"` | Browser engine: `chromium`, `firefox` |
144
- | `waitUntil` | string | `"networkidle"` | Page load event: `networkidle`, `load`, `domcontentloaded` |
146
+ | `waitUntil` | string | `"load"` | Page load event: `load`, `networkidle`, `domcontentloaded` |
145
147
  | `pageLoadTimeout` | int | 60 | Page load timeout in seconds |
146
148
  | `ignoreCors` | bool | false | Disable CORS/CSP restrictions |
147
149
  | `closeCookieModals` | bool | false | Auto-dismiss cookie consent banners |
148
150
  | `maxScrollHeight` | int | 5000 | Max scroll height in pixels (0 = disable) |
149
151
  | `ignoreSslErrors` | bool | false | Skip SSL certificate verification |
152
+ | `userAgent` | string | `""` | Custom User-Agent string |
150
153
 
151
154
  ### Crawl Filtering
152
155
 
@@ -194,6 +197,7 @@ All options go under the `extraction` key in config files, or use the equivalent
194
197
  | `withMetadata` | bool | true | Extract metadata (title, author, date) |
195
198
  | `targetLanguage` | string | null | Filter by language (e.g. `"en"`) |
196
199
  | `fast` | bool | false | Fast mode (less thorough) |
200
+ | `pruneXpath` | array | null | XPath patterns to remove from content |
197
201
 
198
202
  ## Docker
199
203
 
package/index.js CHANGED
@@ -51,7 +51,7 @@ function getBinaryPath() {
51
51
  * @param {boolean} [options.noLinks] - Exclude links
52
52
  * @param {boolean} [options.noComments] - Exclude comments
53
53
  * @param {string} [options.outputDir] - Output directory
54
- * @param {string} [options.format] - Output format (txt, markdown, json, xml, xmltei)
54
+ * @param {string} [options.format] - Output format (txt, markdown, json, jsonl, xml, xmltei)
55
55
  * @param {number} [options.maxPages] - Max pages to crawl
56
56
  * @param {number} [options.crawlDepth] - Max crawl depth
57
57
  * @param {boolean} [options.headless] - Run headless (default true)
@@ -71,6 +71,7 @@ function getBinaryPath() {
71
71
  * @param {boolean} [options.closeCookieModals] - Auto-dismiss cookie banners
72
72
  * @param {number} [options.maxScrollHeight] - Max scroll height in pixels
73
73
  * @param {boolean} [options.ignoreSslErrors] - Skip SSL verification
74
+ * @param {string} [options.userAgent] - Custom User-Agent string
74
75
  * @param {string|string[]} [options.globs] - Glob patterns to include
75
76
  * @param {string|string[]} [options.excludes] - Glob patterns to exclude
76
77
  * @param {string} [options.linkSelector] - CSS selector for links
@@ -133,6 +134,7 @@ function extract(urls, options = {}) {
133
134
  if (options.closeCookieModals) args.push("--close-cookie-modals");
134
135
  if (options.maxScrollHeight != null) args.push("--max-scroll-height", String(options.maxScrollHeight));
135
136
  if (options.ignoreSslErrors) args.push("--ignore-ssl-errors");
137
+ if (options.userAgent) args.push("--user-agent", options.userAgent);
136
138
 
137
139
  // Crawl filtering
138
140
  if (options.globs) {
package/package.json CHANGED
@@ -1,13 +1,13 @@
1
1
  {
2
2
  "name": "contextractor",
3
- "version": "0.3.0",
3
+ "version": "0.3.1",
4
4
  "description": "Extract web content from URLs with configurable extraction options",
5
5
  "license": "MIT",
6
6
  "repository": {
7
7
  "type": "git",
8
8
  "url": "https://github.com/contextractor/contextractor.git"
9
9
  },
10
- "homepage": "https://github.com/contextractor/contextractor",
10
+ "homepage": "https://www.contextractor.com/",
11
11
  "bugs": {
12
12
  "url": "https://github.com/contextractor/contextractor/issues"
13
13
  },