contextractor 0.1.0 → 0.2.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/index.js +76 -4
  2. package/package.json +1 -1
package/index.js CHANGED
@@ -39,17 +39,89 @@ function getBinaryPath() {
39
39
  return path.join(__dirname, "bin", getBinaryName());
40
40
  }
41
41
 
42
- function extract(configPath, options = {}) {
42
+ /**
43
+ * Extract content from web pages.
44
+ *
45
+ * @param {string|string[]} urls - URL(s) to extract, or a config file path for backward compat
46
+ * @param {object} [options={}] - Extraction options
47
+ * @param {string} [options.config] - Path to YAML/JSON config file
48
+ * @param {boolean} [options.precision] - High precision mode
49
+ * @param {boolean} [options.recall] - High recall mode
50
+ * @param {boolean} [options.fast] - Fast extraction mode
51
+ * @param {boolean} [options.noLinks] - Exclude links
52
+ * @param {boolean} [options.noComments] - Exclude comments
53
+ * @param {string} [options.outputDir] - Output directory
54
+ * @param {string} [options.format] - Output format (txt, markdown, json, xml, xmltei)
55
+ * @param {number} [options.maxPages] - Max pages to crawl
56
+ * @param {number} [options.crawlDepth] - Max crawl depth
57
+ * @param {boolean} [options.headless] - Run headless (default true)
58
+ * @param {boolean} [options.includeTables] - Include tables
59
+ * @param {boolean} [options.includeImages] - Include images
60
+ * @param {boolean} [options.includeFormatting] - Preserve formatting
61
+ * @param {boolean} [options.deduplicate] - Deduplicate content
62
+ * @param {string} [options.targetLanguage] - Filter by language
63
+ * @param {boolean} [options.withMetadata] - Extract metadata
64
+ * @param {string|string[]} [options.pruneXpath] - XPath patterns to prune
65
+ * @param {boolean} [options.verbose] - Verbose logging
66
+ * @param {string} [options.stdio] - stdio option for child process
67
+ * @returns {Promise<void>}
68
+ */
69
+ function extract(urls, options = {}) {
43
70
  return new Promise((resolve, reject) => {
44
- const args = [configPath];
71
+ const args = [];
72
+
73
+ // Determine if first arg is a URL or config file path (backward compat)
74
+ let urlList = [];
75
+ if (typeof urls === "string") {
76
+ if (urls.startsWith("http://") || urls.startsWith("https://")) {
77
+ urlList = [urls];
78
+ } else {
79
+ // Backward compat: treat as config file path
80
+ options = { ...options, config: urls };
81
+ }
82
+ } else if (Array.isArray(urls)) {
83
+ urlList = urls;
84
+ }
85
+
86
+ // Config file
87
+ if (options.config) args.push("--config", options.config);
88
+
89
+ // CrawlConfig options
90
+ if (options.maxPages != null) args.push("--max-pages", String(options.maxPages));
91
+ if (options.crawlDepth != null) args.push("--crawl-depth", String(options.crawlDepth));
92
+ if (options.headless === true) args.push("--headless");
93
+ if (options.headless === false) args.push("--no-headless");
94
+ if (options.outputDir) args.push("--output-dir", options.outputDir);
95
+ if (options.format) args.push("--format", options.format);
96
+
97
+ // Extraction options
45
98
  if (options.precision) args.push("--precision");
46
99
  if (options.recall) args.push("--recall");
100
+ if (options.fast) args.push("--fast");
47
101
  if (options.noLinks) args.push("--no-links");
48
102
  if (options.noComments) args.push("--no-comments");
49
- if (options.outputDir) args.push("--output-dir", options.outputDir);
50
- if (options.format) args.push("--format", options.format);
103
+ if (options.includeTables === true) args.push("--include-tables");
104
+ if (options.includeTables === false) args.push("--no-tables");
105
+ if (options.includeImages) args.push("--include-images");
106
+ if (options.includeFormatting === true) args.push("--include-formatting");
107
+ if (options.includeFormatting === false) args.push("--no-formatting");
108
+ if (options.deduplicate) args.push("--deduplicate");
109
+ if (options.targetLanguage) args.push("--target-language", options.targetLanguage);
110
+ if (options.withMetadata === true) args.push("--with-metadata");
111
+ if (options.withMetadata === false) args.push("--no-metadata");
112
+ if (options.pruneXpath) {
113
+ const xpaths = Array.isArray(options.pruneXpath) ? options.pruneXpath : [options.pruneXpath];
114
+ for (const xp of xpaths) {
115
+ args.push("--prune-xpath", xp);
116
+ }
117
+ }
118
+
119
+ // Diagnostics
51
120
  if (options.verbose) args.push("--verbose");
52
121
 
122
+ // URLs as positional args (at the end)
123
+ args.push(...urlList);
124
+
53
125
  const child = spawn(getBinaryPath(), args, {
54
126
  stdio: options.stdio || "inherit",
55
127
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "contextractor",
3
- "version": "0.1.0",
3
+ "version": "0.2.0",
4
4
  "description": "Extract web content from URLs with configurable extraction options",
5
5
  "license": "MIT",
6
6
  "repository": {