contextractor 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +133 -0
  2. package/index.js +76 -4
  3. package/package.json +3 -2
package/README.md ADDED
@@ -0,0 +1,133 @@
1
+ # Contextractor
2
+
3
+ Extract clean, readable content from any website using [Trafilatura](https://trafilatura.readthedocs.io/).
4
+
5
+ Available as: [npm CLI](#install) | [Docker](#docker) | [Apify actor](https://apify.com/shortc/contextractor) | [Web app](https://contextractor.com)
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ npm install -g contextractor
11
+ ```
12
+
13
+ Requires Node.js 18+. Playwright Chromium is installed automatically.
14
+
15
+ ## Usage
16
+
17
+ ```bash
18
+ contextractor https://example.com
19
+ ```
20
+
21
+ Works with zero config. Pass URLs directly, or use a config file for complex setups:
22
+
23
+ ```bash
24
+ contextractor https://example.com --precision --format json -o ./results
25
+ contextractor --config config.yaml --max-pages 10
26
+ ```
27
+
28
+ ### CLI Options
29
+
30
+ ```
31
+ contextractor [OPTIONS] [URLS...]
32
+
33
+ Options:
34
+ --config, -c Path to YAML or JSON config file
35
+ --output-dir, -o Output directory
36
+ --format, -f Output format (txt, markdown, json, xml, xmltei)
37
+ --max-pages Max pages to crawl (0 = unlimited)
38
+ --crawl-depth Max link depth from start URLs (0 = start only)
39
+ --headless/--no-headless Browser headless mode (default: headless)
40
+ --precision High precision mode (less noise)
41
+ --recall High recall mode (more content)
42
+ --fast Fast extraction mode (less thorough)
43
+ --no-links Exclude links from output
44
+ --no-comments Exclude comments from output
45
+ --include-tables/--no-tables Include tables (default: include)
46
+ --include-images Include image descriptions
47
+ --include-formatting/--no-formatting Preserve formatting (default: preserve)
48
+ --deduplicate Deduplicate extracted content
49
+ --target-language Filter by language (e.g. "en")
50
+ --with-metadata/--no-metadata Extract metadata (default: with)
51
+ --prune-xpath XPath patterns to remove from content
52
+ --verbose, -v Enable verbose logging
53
+ ```
54
+
55
+ CLI flags override config file settings. Merge order: `defaults → config file → CLI args`
56
+
57
+ ### Config File (optional)
58
+
59
+ ```yaml
60
+ urls:
61
+ - https://example.com
62
+ - https://docs.example.com
63
+ outputFormat: markdown
64
+ outputDir: ./output
65
+ crawlDepth: 1
66
+
67
+ extraction:
68
+ favorPrecision: true
69
+ includeLinks: true
70
+ includeTables: true
71
+ deduplicate: true
72
+ ```
73
+
74
+ | Field | Type | Default | Description |
75
+ |-------|------|---------|-------------|
76
+ | `urls` | array | `[]` | URLs to extract content from |
77
+ | `maxPages` | int | 0 | Max pages to crawl (0 = unlimited) |
78
+ | `outputFormat` | string | `"markdown"` | `txt`, `markdown`, `json`, `xml`, `xmltei` |
79
+ | `outputDir` | string | `"./output"` | Directory for extracted content |
80
+ | `crawlDepth` | int | 0 | How deep to follow links (0 = start URLs only) |
81
+ | `headless` | bool | true | Browser headless mode |
82
+ | `extraction` | object | `{}` | Trafilatura extraction options (see below) |
83
+
84
+ ### Extraction Options
85
+
86
+ All options go under the `extraction` key in config files, or use the equivalent CLI flags:
87
+
88
+ | Field | Type | Default | Description |
89
+ |-------|------|---------|-------------|
90
+ | `favorPrecision` | bool | false | High precision, less noise |
91
+ | `favorRecall` | bool | false | High recall, more content |
92
+ | `includeComments` | bool | true | Include comments |
93
+ | `includeTables` | bool | true | Include tables |
94
+ | `includeImages` | bool | false | Include images |
95
+ | `includeFormatting` | bool | true | Preserve formatting |
96
+ | `includeLinks` | bool | true | Include links |
97
+ | `deduplicate` | bool | false | Deduplicate content |
98
+ | `withMetadata` | bool | true | Extract metadata (title, author, date) |
99
+ | `targetLanguage` | string | null | Filter by language (e.g. `"en"`) |
100
+ | `fast` | bool | false | Fast mode (less thorough) |
101
+
102
+ ## Docker
103
+
104
+ ```bash
105
+ docker run ghcr.io/contextractor/contextractor https://example.com
106
+ ```
107
+
108
+ Save output to your local machine:
109
+
110
+ ```bash
111
+ docker run -v ./output:/output ghcr.io/contextractor/contextractor https://example.com -o /output
112
+ ```
113
+
114
+ Use a config file:
115
+
116
+ ```bash
117
+ docker run -v ./config.yaml:/config.yaml ghcr.io/contextractor/contextractor --config /config.yaml
118
+ ```
119
+
120
+ All CLI flags work the same inside Docker.
121
+
122
+ ## Output
123
+
124
+ One file per crawled page, named from the URL slug (e.g. `example-com-page.md`). Metadata (title, author, date) is included in the output header when available.
125
+
126
+ ## Platforms
127
+
128
+ - npm: macOS arm64, Linux (x64, arm64), Windows x64
129
+ - Docker: linux/amd64, linux/arm64
130
+
131
+ ## License
132
+
133
+ MIT
package/index.js CHANGED
@@ -39,17 +39,89 @@ function getBinaryPath() {
39
39
  return path.join(__dirname, "bin", getBinaryName());
40
40
  }
41
41
 
42
- function extract(configPath, options = {}) {
42
+ /**
43
+ * Extract content from web pages.
44
+ *
45
+ * @param {string|string[]} urls - URL(s) to extract, or a config file path for backward compat
46
+ * @param {object} [options={}] - Extraction options
47
+ * @param {string} [options.config] - Path to YAML/JSON config file
48
+ * @param {boolean} [options.precision] - High precision mode
49
+ * @param {boolean} [options.recall] - High recall mode
50
+ * @param {boolean} [options.fast] - Fast extraction mode
51
+ * @param {boolean} [options.noLinks] - Exclude links
52
+ * @param {boolean} [options.noComments] - Exclude comments
53
+ * @param {string} [options.outputDir] - Output directory
54
+ * @param {string} [options.format] - Output format (txt, markdown, json, xml, xmltei)
55
+ * @param {number} [options.maxPages] - Max pages to crawl
56
+ * @param {number} [options.crawlDepth] - Max crawl depth
57
+ * @param {boolean} [options.headless] - Run headless (default true)
58
+ * @param {boolean} [options.includeTables] - Include tables
59
+ * @param {boolean} [options.includeImages] - Include images
60
+ * @param {boolean} [options.includeFormatting] - Preserve formatting
61
+ * @param {boolean} [options.deduplicate] - Deduplicate content
62
+ * @param {string} [options.targetLanguage] - Filter by language
63
+ * @param {boolean} [options.withMetadata] - Extract metadata
64
+ * @param {string|string[]} [options.pruneXpath] - XPath patterns to prune
65
+ * @param {boolean} [options.verbose] - Verbose logging
66
+ * @param {string} [options.stdio] - stdio option for child process
67
+ * @returns {Promise<void>}
68
+ */
69
+ function extract(urls, options = {}) {
43
70
  return new Promise((resolve, reject) => {
44
- const args = [configPath];
71
+ const args = [];
72
+
73
+ // Determine if first arg is a URL or config file path (backward compat)
74
+ let urlList = [];
75
+ if (typeof urls === "string") {
76
+ if (urls.startsWith("http://") || urls.startsWith("https://")) {
77
+ urlList = [urls];
78
+ } else {
79
+ // Backward compat: treat as config file path
80
+ options = { ...options, config: urls };
81
+ }
82
+ } else if (Array.isArray(urls)) {
83
+ urlList = urls;
84
+ }
85
+
86
+ // Config file
87
+ if (options.config) args.push("--config", options.config);
88
+
89
+ // CrawlConfig options
90
+ if (options.maxPages != null) args.push("--max-pages", String(options.maxPages));
91
+ if (options.crawlDepth != null) args.push("--crawl-depth", String(options.crawlDepth));
92
+ if (options.headless === true) args.push("--headless");
93
+ if (options.headless === false) args.push("--no-headless");
94
+ if (options.outputDir) args.push("--output-dir", options.outputDir);
95
+ if (options.format) args.push("--format", options.format);
96
+
97
+ // Extraction options
45
98
  if (options.precision) args.push("--precision");
46
99
  if (options.recall) args.push("--recall");
100
+ if (options.fast) args.push("--fast");
47
101
  if (options.noLinks) args.push("--no-links");
48
102
  if (options.noComments) args.push("--no-comments");
49
- if (options.outputDir) args.push("--output-dir", options.outputDir);
50
- if (options.format) args.push("--format", options.format);
103
+ if (options.includeTables === true) args.push("--include-tables");
104
+ if (options.includeTables === false) args.push("--no-tables");
105
+ if (options.includeImages) args.push("--include-images");
106
+ if (options.includeFormatting === true) args.push("--include-formatting");
107
+ if (options.includeFormatting === false) args.push("--no-formatting");
108
+ if (options.deduplicate) args.push("--deduplicate");
109
+ if (options.targetLanguage) args.push("--target-language", options.targetLanguage);
110
+ if (options.withMetadata === true) args.push("--with-metadata");
111
+ if (options.withMetadata === false) args.push("--no-metadata");
112
+ if (options.pruneXpath) {
113
+ const xpaths = Array.isArray(options.pruneXpath) ? options.pruneXpath : [options.pruneXpath];
114
+ for (const xp of xpaths) {
115
+ args.push("--prune-xpath", xp);
116
+ }
117
+ }
118
+
119
+ // Diagnostics
51
120
  if (options.verbose) args.push("--verbose");
52
121
 
122
+ // URLs as positional args (at the end)
123
+ args.push(...urlList);
124
+
53
125
  const child = spawn(getBinaryPath(), args, {
54
126
  stdio: options.stdio || "inherit",
55
127
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "contextractor",
3
- "version": "0.1.1",
3
+ "version": "0.2.1",
4
4
  "description": "Extract web content from URLs with configurable extraction options",
5
5
  "license": "MIT",
6
6
  "repository": {
@@ -29,7 +29,8 @@
29
29
  "files": [
30
30
  "cli.js",
31
31
  "index.js",
32
- "postinstall.js"
32
+ "postinstall.js",
33
+ "README.md"
33
34
  ],
34
35
  "scripts": {
35
36
  "postinstall": "node postinstall.js"