contextractor 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +229 -0
- package/index.js +69 -0
- package/package.json +3 -2
package/README.md
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
# Contextractor
|
|
2
|
+
|
|
3
|
+
Extract clean, readable content from any website using [Trafilatura](https://trafilatura.readthedocs.io/).
|
|
4
|
+
|
|
5
|
+
Available as: [npm CLI](#install) | [Docker](#docker) | [Apify actor](https://apify.com/shortc/contextractor) | [Web app](https://contextractor.com)
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
npm install -g contextractor
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Requires Node.js 18+. Playwright Chromium is installed automatically.
|
|
14
|
+
|
|
15
|
+
## Usage
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
contextractor https://example.com
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Works with zero config. Pass URLs directly, or use a config file for complex setups:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
contextractor https://example.com --precision --format json -o ./results
|
|
25
|
+
contextractor --config config.yaml --max-pages 10
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### CLI Options
|
|
29
|
+
|
|
30
|
+
```
|
|
31
|
+
contextractor [OPTIONS] [URLS...]
|
|
32
|
+
|
|
33
|
+
Crawl Settings:
|
|
34
|
+
--config, -c Path to YAML or JSON config file
|
|
35
|
+
--output-dir, -o Output directory
|
|
36
|
+
--format, -f Output format (txt, markdown, json, xml, xmltei)
|
|
37
|
+
--max-pages Max pages to crawl (0 = unlimited)
|
|
38
|
+
--crawl-depth Max link depth from start URLs (0 = start only)
|
|
39
|
+
--headless/--no-headless Browser headless mode (default: headless)
|
|
40
|
+
--max-concurrency Max parallel requests (default: 50)
|
|
41
|
+
--max-retries Max request retries (default: 3)
|
|
42
|
+
--max-results Max results per crawl (0 = unlimited)
|
|
43
|
+
|
|
44
|
+
Proxy:
|
|
45
|
+
--proxy-urls Comma-separated proxy URLs (http://user:pass@host:port)
|
|
46
|
+
--proxy-rotation Rotation: recommended, per_request, until_failure
|
|
47
|
+
|
|
48
|
+
Browser:
|
|
49
|
+
--launcher Browser engine: chromium, firefox (default: chromium)
|
|
50
|
+
--wait-until Page load event: networkidle, load, domcontentloaded
|
|
51
|
+
--page-load-timeout Timeout in seconds (default: 60)
|
|
52
|
+
--ignore-cors Disable CORS/CSP restrictions
|
|
53
|
+
--close-cookie-modals Auto-dismiss cookie banners
|
|
54
|
+
--max-scroll-height Max scroll height in pixels (default: 5000)
|
|
55
|
+
--ignore-ssl-errors Skip SSL certificate verification
|
|
56
|
+
|
|
57
|
+
Crawl Filtering:
|
|
58
|
+
--globs Comma-separated glob patterns to include
|
|
59
|
+
--excludes Comma-separated glob patterns to exclude
|
|
60
|
+
--link-selector CSS selector for links to follow
|
|
61
|
+
--keep-url-fragments Preserve URL fragments
|
|
62
|
+
--respect-robots-txt Honor robots.txt
|
|
63
|
+
|
|
64
|
+
Cookies & Headers:
|
|
65
|
+
--cookies JSON array of cookie objects
|
|
66
|
+
--headers JSON object of custom HTTP headers
|
|
67
|
+
|
|
68
|
+
Output Toggles:
|
|
69
|
+
--save-raw-html Save raw HTML to output
|
|
70
|
+
--save-text Save extracted text
|
|
71
|
+
--save-json Save extracted JSON
|
|
72
|
+
--save-xml Save extracted XML
|
|
73
|
+
--save-xml-tei Save extracted XML-TEI
|
|
74
|
+
|
|
75
|
+
Content Extraction:
|
|
76
|
+
--precision High precision mode (less noise)
|
|
77
|
+
--recall High recall mode (more content)
|
|
78
|
+
--fast Fast extraction mode (less thorough)
|
|
79
|
+
--no-links Exclude links from output
|
|
80
|
+
--no-comments Exclude comments from output
|
|
81
|
+
--include-tables/--no-tables Include tables (default: include)
|
|
82
|
+
--include-images Include image descriptions
|
|
83
|
+
--include-formatting/--no-formatting Preserve formatting (default: preserve)
|
|
84
|
+
--deduplicate Deduplicate extracted content
|
|
85
|
+
--target-language Filter by language (e.g. "en")
|
|
86
|
+
--with-metadata/--no-metadata Extract metadata (default: with)
|
|
87
|
+
--prune-xpath XPath patterns to remove from content
|
|
88
|
+
|
|
89
|
+
Diagnostics:
|
|
90
|
+
--verbose, -v Enable verbose logging
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
CLI flags override config file settings. Merge order: `defaults → config file → CLI args`
|
|
94
|
+
|
|
95
|
+
### Config File (optional)
|
|
96
|
+
|
|
97
|
+
Supports both JSON and YAML formats. JSON examples are shown below:
|
|
98
|
+
|
|
99
|
+
```json
|
|
100
|
+
{
|
|
101
|
+
"urls": ["https://example.com", "https://docs.example.com"],
|
|
102
|
+
"outputFormat": "markdown",
|
|
103
|
+
"outputDir": "./output",
|
|
104
|
+
"crawlDepth": 1,
|
|
105
|
+
"proxy": {
|
|
106
|
+
"urls": ["http://user:pass@host:port"],
|
|
107
|
+
"rotation": "recommended"
|
|
108
|
+
},
|
|
109
|
+
"extraction": {
|
|
110
|
+
"favorPrecision": true,
|
|
111
|
+
"includeLinks": true,
|
|
112
|
+
"includeTables": true,
|
|
113
|
+
"deduplicate": true
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Crawl Settings
|
|
119
|
+
|
|
120
|
+
| Field | Type | Default | Description |
|
|
121
|
+
|-------|------|---------|-------------|
|
|
122
|
+
| `urls` | array | `[]` | URLs to extract content from |
|
|
123
|
+
| `maxPages` | int | 0 | Max pages to crawl (0 = unlimited) |
|
|
124
|
+
| `outputFormat` | string | `"markdown"` | `txt`, `markdown`, `json`, `xml`, `xmltei` |
|
|
125
|
+
| `outputDir` | string | `"./output"` | Directory for extracted content |
|
|
126
|
+
| `crawlDepth` | int | 0 | How deep to follow links (0 = start URLs only) |
|
|
127
|
+
| `headless` | bool | true | Browser headless mode |
|
|
128
|
+
| `maxConcurrency` | int | 50 | Max parallel browser pages |
|
|
129
|
+
| `maxRetries` | int | 3 | Max retries for failed requests |
|
|
130
|
+
| `maxResults` | int | 0 | Max results per crawl (0 = unlimited) |
|
|
131
|
+
|
|
132
|
+
### Proxy Configuration
|
|
133
|
+
|
|
134
|
+
| Field | Type | Default | Description |
|
|
135
|
+
|-------|------|---------|-------------|
|
|
136
|
+
| `proxy.urls` | array | `[]` | Proxy URLs (`http://user:pass@host:port` or `socks5://host:port`) |
|
|
137
|
+
| `proxy.rotation` | string | `"recommended"` | `recommended`, `per_request`, `until_failure` |
|
|
138
|
+
|
|
139
|
+
### Browser Settings
|
|
140
|
+
|
|
141
|
+
| Field | Type | Default | Description |
|
|
142
|
+
|-------|------|---------|-------------|
|
|
143
|
+
| `launcher` | string | `"chromium"` | Browser engine: `chromium`, `firefox` |
|
|
144
|
+
| `waitUntil` | string | `"networkidle"` | Page load event: `networkidle`, `load`, `domcontentloaded` |
|
|
145
|
+
| `pageLoadTimeout` | int | 60 | Page load timeout in seconds |
|
|
146
|
+
| `ignoreCors` | bool | false | Disable CORS/CSP restrictions |
|
|
147
|
+
| `closeCookieModals` | bool | false | Auto-dismiss cookie consent banners |
|
|
148
|
+
| `maxScrollHeight` | int | 5000 | Max scroll height in pixels (0 = disable) |
|
|
149
|
+
| `ignoreSslErrors` | bool | false | Skip SSL certificate verification |
|
|
150
|
+
|
|
151
|
+
### Crawl Filtering
|
|
152
|
+
|
|
153
|
+
| Field | Type | Default | Description |
|
|
154
|
+
|-------|------|---------|-------------|
|
|
155
|
+
| `globs` | array | `[]` | Glob patterns for URLs to include |
|
|
156
|
+
| `excludes` | array | `[]` | Glob patterns for URLs to exclude |
|
|
157
|
+
| `linkSelector` | string | `""` | CSS selector for links to follow |
|
|
158
|
+
| `keepUrlFragments` | bool | false | Treat URLs with different fragments as different pages |
|
|
159
|
+
| `respectRobotsTxt` | bool | false | Honor robots.txt |
|
|
160
|
+
|
|
161
|
+
### Cookies & Headers
|
|
162
|
+
|
|
163
|
+
| Field | Type | Default | Description |
|
|
164
|
+
|-------|------|---------|-------------|
|
|
165
|
+
| `cookies` | array | `[]` | Initial cookies (`[{"name": "...", "value": "...", "domain": "..."}]`) |
|
|
166
|
+
| `headers` | object | `{}` | Custom HTTP headers (`{"Authorization": "Bearer token"}`) |
|
|
167
|
+
|
|
168
|
+
### Output Toggles
|
|
169
|
+
|
|
170
|
+
Save additional formats alongside the primary output:
|
|
171
|
+
|
|
172
|
+
| Field | Type | Default | Description |
|
|
173
|
+
|-------|------|---------|-------------|
|
|
174
|
+
| `saveRawHtml` | bool | false | Save raw HTML |
|
|
175
|
+
| `saveText` | bool | false | Save extracted plain text |
|
|
176
|
+
| `saveJson` | bool | false | Save extracted JSON |
|
|
177
|
+
| `saveXml` | bool | false | Save extracted XML |
|
|
178
|
+
| `saveXmlTei` | bool | false | Save extracted XML-TEI |
|
|
179
|
+
|
|
180
|
+
### Content Extraction
|
|
181
|
+
|
|
182
|
+
All options go under the `extraction` key in config files, or use the equivalent CLI flags:
|
|
183
|
+
|
|
184
|
+
| Field | Type | Default | Description |
|
|
185
|
+
|-------|------|---------|-------------|
|
|
186
|
+
| `favorPrecision` | bool | false | High precision, less noise |
|
|
187
|
+
| `favorRecall` | bool | false | High recall, more content |
|
|
188
|
+
| `includeComments` | bool | true | Include comments |
|
|
189
|
+
| `includeTables` | bool | true | Include tables |
|
|
190
|
+
| `includeImages` | bool | false | Include image descriptions |
|
|
191
|
+
| `includeFormatting` | bool | true | Preserve formatting |
|
|
192
|
+
| `includeLinks` | bool | true | Include links |
|
|
193
|
+
| `deduplicate` | bool | false | Deduplicate content |
|
|
194
|
+
| `withMetadata` | bool | true | Extract metadata (title, author, date) |
|
|
195
|
+
| `targetLanguage` | string | null | Filter by language (e.g. `"en"`) |
|
|
196
|
+
| `fast` | bool | false | Fast mode (less thorough) |
|
|
197
|
+
|
|
198
|
+
## Docker
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
docker run ghcr.io/contextractor/contextractor https://example.com
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
Save output to your local machine:
|
|
205
|
+
|
|
206
|
+
```bash
|
|
207
|
+
docker run -v ./output:/output ghcr.io/contextractor/contextractor https://example.com -o /output
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
Use a config file:
|
|
211
|
+
|
|
212
|
+
```bash
|
|
213
|
+
docker run -v ./config.yaml:/config.yaml ghcr.io/contextractor/contextractor --config /config.yaml
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
All CLI flags work the same inside Docker.
|
|
217
|
+
|
|
218
|
+
## Output
|
|
219
|
+
|
|
220
|
+
One file per crawled page, named from the URL slug (e.g. `example-com-page.md`). Metadata (title, author, date) is included in the output header when available.
|
|
221
|
+
|
|
222
|
+
## Platforms
|
|
223
|
+
|
|
224
|
+
- npm: macOS arm64, Linux (x64, arm64), Windows x64
|
|
225
|
+
- Docker: linux/amd64, linux/arm64
|
|
226
|
+
|
|
227
|
+
## License
|
|
228
|
+
|
|
229
|
+
MIT
|
package/index.js
CHANGED
|
@@ -62,6 +62,30 @@ function getBinaryPath() {
|
|
|
62
62
|
* @param {string} [options.targetLanguage] - Filter by language
|
|
63
63
|
* @param {boolean} [options.withMetadata] - Extract metadata
|
|
64
64
|
* @param {string|string[]} [options.pruneXpath] - XPath patterns to prune
|
|
65
|
+
* @param {string|string[]} [options.proxyUrls] - Proxy URLs
|
|
66
|
+
* @param {string} [options.proxyRotation] - Proxy rotation strategy
|
|
67
|
+
* @param {string} [options.launcher] - Browser engine (chromium, firefox)
|
|
68
|
+
* @param {string} [options.waitUntil] - Page load event (networkidle, load, domcontentloaded)
|
|
69
|
+
* @param {number} [options.pageLoadTimeout] - Page load timeout in seconds
|
|
70
|
+
* @param {boolean} [options.ignoreCors] - Disable CORS/CSP
|
|
71
|
+
* @param {boolean} [options.closeCookieModals] - Auto-dismiss cookie banners
|
|
72
|
+
* @param {number} [options.maxScrollHeight] - Max scroll height in pixels
|
|
73
|
+
* @param {boolean} [options.ignoreSslErrors] - Skip SSL verification
|
|
74
|
+
* @param {string|string[]} [options.globs] - Glob patterns to include
|
|
75
|
+
* @param {string|string[]} [options.excludes] - Glob patterns to exclude
|
|
76
|
+
* @param {string} [options.linkSelector] - CSS selector for links
|
|
77
|
+
* @param {boolean} [options.keepUrlFragments] - Preserve URL fragments
|
|
78
|
+
* @param {boolean} [options.respectRobotsTxt] - Honor robots.txt
|
|
79
|
+
* @param {object[]} [options.cookies] - Initial cookies array
|
|
80
|
+
* @param {object} [options.headers] - Custom HTTP headers
|
|
81
|
+
* @param {number} [options.maxConcurrency] - Max parallel requests
|
|
82
|
+
* @param {number} [options.maxRetries] - Max request retries
|
|
83
|
+
* @param {number} [options.maxResults] - Max results (0 = unlimited)
|
|
84
|
+
* @param {boolean} [options.saveRawHtml] - Save raw HTML
|
|
85
|
+
* @param {boolean} [options.saveText] - Save extracted text
|
|
86
|
+
* @param {boolean} [options.saveJson] - Save extracted JSON
|
|
87
|
+
* @param {boolean} [options.saveXml] - Save extracted XML
|
|
88
|
+
* @param {boolean} [options.saveXmlTei] - Save extracted XML-TEI
|
|
65
89
|
* @param {boolean} [options.verbose] - Verbose logging
|
|
66
90
|
* @param {string} [options.stdio] - stdio option for child process
|
|
67
91
|
* @returns {Promise<void>}
|
|
@@ -94,6 +118,51 @@ function extract(urls, options = {}) {
|
|
|
94
118
|
if (options.outputDir) args.push("--output-dir", options.outputDir);
|
|
95
119
|
if (options.format) args.push("--format", options.format);
|
|
96
120
|
|
|
121
|
+
// Proxy
|
|
122
|
+
if (options.proxyUrls) {
|
|
123
|
+
const proxyList = Array.isArray(options.proxyUrls) ? options.proxyUrls : [options.proxyUrls];
|
|
124
|
+
args.push("--proxy-urls", proxyList.join(","));
|
|
125
|
+
}
|
|
126
|
+
if (options.proxyRotation) args.push("--proxy-rotation", options.proxyRotation);
|
|
127
|
+
|
|
128
|
+
// Browser settings
|
|
129
|
+
if (options.launcher) args.push("--launcher", options.launcher);
|
|
130
|
+
if (options.waitUntil) args.push("--wait-until", options.waitUntil);
|
|
131
|
+
if (options.pageLoadTimeout != null) args.push("--page-load-timeout", String(options.pageLoadTimeout));
|
|
132
|
+
if (options.ignoreCors) args.push("--ignore-cors");
|
|
133
|
+
if (options.closeCookieModals) args.push("--close-cookie-modals");
|
|
134
|
+
if (options.maxScrollHeight != null) args.push("--max-scroll-height", String(options.maxScrollHeight));
|
|
135
|
+
if (options.ignoreSslErrors) args.push("--ignore-ssl-errors");
|
|
136
|
+
|
|
137
|
+
// Crawl filtering
|
|
138
|
+
if (options.globs) {
|
|
139
|
+
const globList = Array.isArray(options.globs) ? options.globs : [options.globs];
|
|
140
|
+
args.push("--globs", globList.join(","));
|
|
141
|
+
}
|
|
142
|
+
if (options.excludes) {
|
|
143
|
+
const excludeList = Array.isArray(options.excludes) ? options.excludes : [options.excludes];
|
|
144
|
+
args.push("--excludes", excludeList.join(","));
|
|
145
|
+
}
|
|
146
|
+
if (options.linkSelector) args.push("--link-selector", options.linkSelector);
|
|
147
|
+
if (options.keepUrlFragments) args.push("--keep-url-fragments");
|
|
148
|
+
if (options.respectRobotsTxt) args.push("--respect-robots-txt");
|
|
149
|
+
|
|
150
|
+
// Cookies & headers
|
|
151
|
+
if (options.cookies) args.push("--cookies", JSON.stringify(options.cookies));
|
|
152
|
+
if (options.headers) args.push("--headers", JSON.stringify(options.headers));
|
|
153
|
+
|
|
154
|
+
// Concurrency & retries
|
|
155
|
+
if (options.maxConcurrency != null) args.push("--max-concurrency", String(options.maxConcurrency));
|
|
156
|
+
if (options.maxRetries != null) args.push("--max-retries", String(options.maxRetries));
|
|
157
|
+
if (options.maxResults != null) args.push("--max-results", String(options.maxResults));
|
|
158
|
+
|
|
159
|
+
// Output toggles
|
|
160
|
+
if (options.saveRawHtml) args.push("--save-raw-html");
|
|
161
|
+
if (options.saveText) args.push("--save-text");
|
|
162
|
+
if (options.saveJson) args.push("--save-json");
|
|
163
|
+
if (options.saveXml) args.push("--save-xml");
|
|
164
|
+
if (options.saveXmlTei) args.push("--save-xml-tei");
|
|
165
|
+
|
|
97
166
|
// Extraction options
|
|
98
167
|
if (options.precision) args.push("--precision");
|
|
99
168
|
if (options.recall) args.push("--recall");
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "contextractor",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "Extract web content from URLs with configurable extraction options",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"repository": {
|
|
@@ -29,7 +29,8 @@
|
|
|
29
29
|
"files": [
|
|
30
30
|
"cli.js",
|
|
31
31
|
"index.js",
|
|
32
|
-
"postinstall.js"
|
|
32
|
+
"postinstall.js",
|
|
33
|
+
"README.md"
|
|
33
34
|
],
|
|
34
35
|
"scripts": {
|
|
35
36
|
"postinstall": "node postinstall.js"
|