npm - contextractor - Versions diffs - 0.3.0 → 0.3.1 - Mend

contextractor 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/README.md (changed)

@@ -33,7 +33,7 @@ contextractor [OPTIONS] [URLS...]
 Crawl Settings:
   --config, -c          Path to YAML or JSON config file
   --output-dir, -o      Output directory
-  --format, -f          Output format (txt, markdown, json, xml, xmltei)
+  --format, -f          Output format (txt, markdown, json, jsonl, xml, xmltei)
   --max-pages           Max pages to crawl (0 = unlimited)
   --crawl-depth         Max link depth from start URLs (0 = start only)
   --headless/--no-headless  Browser headless mode (default: headless)
@@ -47,12 +47,13 @@ Proxy:
 Browser:
   --launcher            Browser engine: chromium, firefox (default: chromium)
-  --wait-until          Page load event: networkidle, load, domcontentloaded
+  --wait-until          Page load event: load, networkidle, domcontentloaded (default: load)
   --page-load-timeout   Timeout in seconds (default: 60)
   --ignore-cors         Disable CORS/CSP restrictions
   --close-cookie-modals Auto-dismiss cookie banners
   --max-scroll-height   Max scroll height in pixels (default: 5000)
   --ignore-ssl-errors   Skip SSL certificate verification
+  --user-agent          Custom User-Agent string
 Crawl Filtering:
   --globs               Comma-separated glob patterns to include
@@ -121,7 +122,7 @@ Supports both JSON and YAML format. JSON examples shown below:
 |-------|------|---------|-------------|
 | `urls` | array | `[]` | URLs to extract content from |
 | `maxPages` | int | 0 | Max pages to crawl (0 = unlimited) |
-| `outputFormat` | string | `"markdown"` | `txt`, `markdown`, `json`, `xml`, `xmltei` |
+| `outputFormat` | string | `"markdown"` | `txt`, `markdown`, `json`, `jsonl`, `xml`, `xmltei` |
 | `outputDir` | string | `"./output"` | Directory for extracted content |
 | `crawlDepth` | int | 0 | How deep to follow links (0 = start URLs only) |
 | `headless` | bool | true | Browser headless mode |
@@ -135,18 +136,20 @@ Supports both JSON and YAML format. JSON examples shown below:
 |-------|------|---------|-------------|
 | `proxy.urls` | array | `[]` | Proxy URLs (`http://user:pass@host:port` or `socks5://host:port`) |
 | `proxy.rotation` | string | `"recommended"` | `recommended`, `per_request`, `until_failure` |
+| `proxy.tiered` | array | `[]` | Tiered proxy escalation (config-file only) |
 ### Browser Settings
 | Field | Type | Default | Description |
 |-------|------|---------|-------------|
 | `launcher` | string | `"chromium"` | Browser engine: `chromium`, `firefox` |
-| `waitUntil` | string | `"networkidle"` | Page load event: `networkidle`, `load`, `domcontentloaded` |
+| `waitUntil` | string | `"load"` | Page load event: `load`, `networkidle`, `domcontentloaded` |
 | `pageLoadTimeout` | int | 60 | Page load timeout in seconds |
 | `ignoreCors` | bool | false | Disable CORS/CSP restrictions |
 | `closeCookieModals` | bool | false | Auto-dismiss cookie consent banners |
 | `maxScrollHeight` | int | 5000 | Max scroll height in pixels (0 = disable) |
 | `ignoreSslErrors` | bool | false | Skip SSL certificate verification |
+| `userAgent` | string | `""` | Custom User-Agent string |
 ### Crawl Filtering
@@ -194,6 +197,7 @@ All options go under the `extraction` key in config files, or use the equivalent
 | `withMetadata` | bool | true | Extract metadata (title, author, date) |
 | `targetLanguage` | string | null | Filter by language (e.g. `"en"`) |
 | `fast` | bool | false | Fast mode (less thorough) |
+| `pruneXpath` | array | null | XPath patterns to remove from content |
 ## Docker

package/index.js (changed)

@@ -51,7 +51,7 @@ function getBinaryPath() {
  * @param {boolean} [options.noLinks] - Exclude links
  * @param {boolean} [options.noComments] - Exclude comments
  * @param {string} [options.outputDir] - Output directory
- * @param {string} [options.format] - Output format (txt, markdown, json, xml, xmltei)
+ * @param {string} [options.format] - Output format (txt, markdown, json, jsonl, xml, xmltei)
  * @param {number} [options.maxPages] - Max pages to crawl
  * @param {number} [options.crawlDepth] - Max crawl depth
  * @param {boolean} [options.headless] - Run headless (default true)
@@ -71,6 +71,7 @@ function getBinaryPath() {
  * @param {boolean} [options.closeCookieModals] - Auto-dismiss cookie banners
  * @param {number} [options.maxScrollHeight] - Max scroll height in pixels
  * @param {boolean} [options.ignoreSslErrors] - Skip SSL verification
+ * @param {string} [options.userAgent] - Custom User-Agent string
  * @param {string|string[]} [options.globs] - Glob patterns to include
  * @param {string|string[]} [options.excludes] - Glob patterns to exclude
  * @param {string} [options.linkSelector] - CSS selector for links
@@ -133,6 +134,7 @@ function extract(urls, options = {}) {
     if (options.closeCookieModals) args.push("--close-cookie-modals");
     if (options.maxScrollHeight != null) args.push("--max-scroll-height", String(options.maxScrollHeight));
     if (options.ignoreSslErrors) args.push("--ignore-ssl-errors");
+    if (options.userAgent) args.push("--user-agent", options.userAgent);
     // Crawl filtering
     if (options.globs) {

package/package.json (changed)

@@ -1,13 +1,13 @@
 {
   "name": "contextractor",
-  "version": "0.3.0",
+  "version": "0.3.1",
   "description": "Extract web content from URLs with configurable extraction options",
   "license": "MIT",
   "repository": {
     "type": "git",
     "url": "https://github.com/contextractor/contextractor.git"
   },
-  "homepage": "https://github.com/contextractor/contextractor",
+  "homepage": "https://www.contextractor.com/",
   "bugs": {
     "url": "https://github.com/contextractor/contextractor/issues"
   },