contextractor 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -4
- package/index.js +3 -1
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -33,7 +33,7 @@ contextractor [OPTIONS] [URLS...]
|
|
|
33
33
|
Crawl Settings:
|
|
34
34
|
--config, -c Path to YAML or JSON config file
|
|
35
35
|
--output-dir, -o Output directory
|
|
36
|
-
--format, -f Output format (txt, markdown, json, xml, xmltei)
|
|
36
|
+
--format, -f Output format (txt, markdown, json, jsonl, xml, xmltei)
|
|
37
37
|
--max-pages Max pages to crawl (0 = unlimited)
|
|
38
38
|
--crawl-depth Max link depth from start URLs (0 = start only)
|
|
39
39
|
--headless/--no-headless Browser headless mode (default: headless)
|
|
@@ -47,12 +47,13 @@ Proxy:
|
|
|
47
47
|
|
|
48
48
|
Browser:
|
|
49
49
|
--launcher Browser engine: chromium, firefox (default: chromium)
|
|
50
|
-
--wait-until Page load event:
|
|
50
|
+
--wait-until Page load event: load, networkidle, domcontentloaded (default: load)
|
|
51
51
|
--page-load-timeout Timeout in seconds (default: 60)
|
|
52
52
|
--ignore-cors Disable CORS/CSP restrictions
|
|
53
53
|
--close-cookie-modals Auto-dismiss cookie banners
|
|
54
54
|
--max-scroll-height Max scroll height in pixels (default: 5000)
|
|
55
55
|
--ignore-ssl-errors Skip SSL certificate verification
|
|
56
|
+
--user-agent Custom User-Agent string
|
|
56
57
|
|
|
57
58
|
Crawl Filtering:
|
|
58
59
|
--globs Comma-separated glob patterns to include
|
|
@@ -121,7 +122,7 @@ Supports both JSON and YAML format. JSON examples shown below:
|
|
|
121
122
|
|-------|------|---------|-------------|
|
|
122
123
|
| `urls` | array | `[]` | URLs to extract content from |
|
|
123
124
|
| `maxPages` | int | 0 | Max pages to crawl (0 = unlimited) |
|
|
124
|
-
| `outputFormat` | string | `"markdown"` | `txt`, `markdown`, `json`, `xml`, `xmltei` |
|
|
125
|
+
| `outputFormat` | string | `"markdown"` | `txt`, `markdown`, `json`, `jsonl`, `xml`, `xmltei` |
|
|
125
126
|
| `outputDir` | string | `"./output"` | Directory for extracted content |
|
|
126
127
|
| `crawlDepth` | int | 0 | How deep to follow links (0 = start URLs only) |
|
|
127
128
|
| `headless` | bool | true | Browser headless mode |
|
|
@@ -135,18 +136,20 @@ Supports both JSON and YAML format. JSON examples shown below:
|
|
|
135
136
|
|-------|------|---------|-------------|
|
|
136
137
|
| `proxy.urls` | array | `[]` | Proxy URLs (`http://user:pass@host:port` or `socks5://host:port`) |
|
|
137
138
|
| `proxy.rotation` | string | `"recommended"` | `recommended`, `per_request`, `until_failure` |
|
|
139
|
+
| `proxy.tiered` | array | `[]` | Tiered proxy escalation (config-file only) |
|
|
138
140
|
|
|
139
141
|
### Browser Settings
|
|
140
142
|
|
|
141
143
|
| Field | Type | Default | Description |
|
|
142
144
|
|-------|------|---------|-------------|
|
|
143
145
|
| `launcher` | string | `"chromium"` | Browser engine: `chromium`, `firefox` |
|
|
144
|
-
| `waitUntil` | string | `"
|
|
146
|
+
| `waitUntil` | string | `"load"` | Page load event: `load`, `networkidle`, `domcontentloaded` |
|
|
145
147
|
| `pageLoadTimeout` | int | 60 | Page load timeout in seconds |
|
|
146
148
|
| `ignoreCors` | bool | false | Disable CORS/CSP restrictions |
|
|
147
149
|
| `closeCookieModals` | bool | false | Auto-dismiss cookie consent banners |
|
|
148
150
|
| `maxScrollHeight` | int | 5000 | Max scroll height in pixels (0 = disable) |
|
|
149
151
|
| `ignoreSslErrors` | bool | false | Skip SSL certificate verification |
|
|
152
|
+
| `userAgent` | string | `""` | Custom User-Agent string |
|
|
150
153
|
|
|
151
154
|
### Crawl Filtering
|
|
152
155
|
|
|
@@ -194,6 +197,7 @@ All options go under the `extraction` key in config files, or use the equivalent
|
|
|
194
197
|
| `withMetadata` | bool | true | Extract metadata (title, author, date) |
|
|
195
198
|
| `targetLanguage` | string | null | Filter by language (e.g. `"en"`) |
|
|
196
199
|
| `fast` | bool | false | Fast mode (less thorough) |
|
|
200
|
+
| `pruneXpath` | array | null | XPath patterns to remove from content |
|
|
197
201
|
|
|
198
202
|
## Docker
|
|
199
203
|
|
package/index.js
CHANGED
|
@@ -51,7 +51,7 @@ function getBinaryPath() {
|
|
|
51
51
|
* @param {boolean} [options.noLinks] - Exclude links
|
|
52
52
|
* @param {boolean} [options.noComments] - Exclude comments
|
|
53
53
|
* @param {string} [options.outputDir] - Output directory
|
|
54
|
-
* @param {string} [options.format] - Output format (txt, markdown, json, xml, xmltei)
|
|
54
|
+
* @param {string} [options.format] - Output format (txt, markdown, json, jsonl, xml, xmltei)
|
|
55
55
|
* @param {number} [options.maxPages] - Max pages to crawl
|
|
56
56
|
* @param {number} [options.crawlDepth] - Max crawl depth
|
|
57
57
|
* @param {boolean} [options.headless] - Run headless (default true)
|
|
@@ -71,6 +71,7 @@ function getBinaryPath() {
|
|
|
71
71
|
* @param {boolean} [options.closeCookieModals] - Auto-dismiss cookie banners
|
|
72
72
|
* @param {number} [options.maxScrollHeight] - Max scroll height in pixels
|
|
73
73
|
* @param {boolean} [options.ignoreSslErrors] - Skip SSL verification
|
|
74
|
+
* @param {string} [options.userAgent] - Custom User-Agent string
|
|
74
75
|
* @param {string|string[]} [options.globs] - Glob patterns to include
|
|
75
76
|
* @param {string|string[]} [options.excludes] - Glob patterns to exclude
|
|
76
77
|
* @param {string} [options.linkSelector] - CSS selector for links
|
|
@@ -133,6 +134,7 @@ function extract(urls, options = {}) {
|
|
|
133
134
|
if (options.closeCookieModals) args.push("--close-cookie-modals");
|
|
134
135
|
if (options.maxScrollHeight != null) args.push("--max-scroll-height", String(options.maxScrollHeight));
|
|
135
136
|
if (options.ignoreSslErrors) args.push("--ignore-ssl-errors");
|
|
137
|
+
if (options.userAgent) args.push("--user-agent", options.userAgent);
|
|
136
138
|
|
|
137
139
|
// Crawl filtering
|
|
138
140
|
if (options.globs) {
|
package/package.json
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "contextractor",
|
|
3
|
-
"version": "0.3.0",
|
|
3
|
+
"version": "0.3.1",
|
|
4
4
|
"description": "Extract web content from URLs with configurable extraction options",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"repository": {
|
|
7
7
|
"type": "git",
|
|
8
8
|
"url": "https://github.com/contextractor/contextractor.git"
|
|
9
9
|
},
|
|
10
|
-
"homepage": "https://
|
|
10
|
+
"homepage": "https://www.contextractor.com/",
|
|
11
11
|
"bugs": {
|
|
12
12
|
"url": "https://github.com/contextractor/contextractor/issues"
|
|
13
13
|
},
|