contextractor 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +76 -4
- package/package.json +1 -1
package/index.js
CHANGED
|
@@ -39,17 +39,89 @@ function getBinaryPath() {
|
|
|
39
39
|
return path.join(__dirname, "bin", getBinaryName());
|
|
40
40
|
}
|
|
41
41
|
|
|
42
|
-
|
|
42
|
+
/**
|
|
43
|
+
* Extract content from web pages.
|
|
44
|
+
*
|
|
45
|
+
* @param {string|string[]} urls - URL(s) to extract. For backward compatibility, a single string that does not start with "http://" or "https://" is treated as a config file path.
|
|
46
|
+
* @param {object} [options={}] - Extraction options
|
|
47
|
+
* @param {string} [options.config] - Path to YAML/JSON config file
|
|
48
|
+
* @param {boolean} [options.precision] - High precision mode
|
|
49
|
+
* @param {boolean} [options.recall] - High recall mode
|
|
50
|
+
* @param {boolean} [options.fast] - Fast extraction mode
|
|
51
|
+
* @param {boolean} [options.noLinks] - Exclude links
|
|
52
|
+
* @param {boolean} [options.noComments] - Exclude comments
|
|
53
|
+
* @param {string} [options.outputDir] - Output directory
|
|
54
|
+
* @param {string} [options.format] - Output format (txt, markdown, json, xml, xmltei)
|
|
55
|
+
* @param {number} [options.maxPages] - Max pages to crawl
|
|
56
|
+
* @param {number} [options.crawlDepth] - Max crawl depth
|
|
57
|
+
* @param {boolean} [options.headless] - Run headless (default true)
|
|
58
|
+
* @param {boolean} [options.includeTables] - Include tables
|
|
59
|
+
* @param {boolean} [options.includeImages] - Include images
|
|
60
|
+
* @param {boolean} [options.includeFormatting] - Preserve formatting
|
|
61
|
+
* @param {boolean} [options.deduplicate] - Deduplicate content
|
|
62
|
+
* @param {string} [options.targetLanguage] - Filter by language
|
|
63
|
+
* @param {boolean} [options.withMetadata] - Extract metadata
|
|
64
|
+
* @param {string|string[]} [options.pruneXpath] - XPath patterns to prune
|
|
65
|
+
* @param {boolean} [options.verbose] - Verbose logging
|
|
66
|
+
* @param {string} [options.stdio] - stdio option for the spawned child process (falls back to "inherit" when not set)
|
|
67
|
+
* @returns {Promise<void>}
|
|
68
|
+
*/
|
|
69
|
+
function extract(urls, options = {}) {
|
|
43
70
|
return new Promise((resolve, reject) => {
|
|
44
|
-
const args = [
|
|
71
|
+
const args = [];
|
|
72
|
+
|
|
73
|
+
// Determine if first arg is a URL or config file path (backward compat)
|
|
74
|
+
let urlList = [];
|
|
75
|
+
if (typeof urls === "string") {
|
|
76
|
+
if (urls.startsWith("http://") || urls.startsWith("https://")) {
|
|
77
|
+
urlList = [urls];
|
|
78
|
+
} else {
|
|
79
|
+
// Backward compat: treat as config file path
|
|
80
|
+
options = { ...options, config: urls };
|
|
81
|
+
}
|
|
82
|
+
} else if (Array.isArray(urls)) {
|
|
83
|
+
urlList = urls;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
// Config file
|
|
87
|
+
if (options.config) args.push("--config", options.config);
|
|
88
|
+
|
|
89
|
+
// CrawlConfig options
|
|
90
|
+
if (options.maxPages != null) args.push("--max-pages", String(options.maxPages));
|
|
91
|
+
if (options.crawlDepth != null) args.push("--crawl-depth", String(options.crawlDepth));
|
|
92
|
+
if (options.headless === true) args.push("--headless");
|
|
93
|
+
if (options.headless === false) args.push("--no-headless");
|
|
94
|
+
if (options.outputDir) args.push("--output-dir", options.outputDir);
|
|
95
|
+
if (options.format) args.push("--format", options.format);
|
|
96
|
+
|
|
97
|
+
// Extraction options
|
|
45
98
|
if (options.precision) args.push("--precision");
|
|
46
99
|
if (options.recall) args.push("--recall");
|
|
100
|
+
if (options.fast) args.push("--fast");
|
|
47
101
|
if (options.noLinks) args.push("--no-links");
|
|
48
102
|
if (options.noComments) args.push("--no-comments");
|
|
49
|
-
if (options.
|
|
50
|
-
if (options.
|
|
103
|
+
if (options.includeTables === true) args.push("--include-tables");
|
|
104
|
+
if (options.includeTables === false) args.push("--no-tables");
|
|
105
|
+
if (options.includeImages) args.push("--include-images");
|
|
106
|
+
if (options.includeFormatting === true) args.push("--include-formatting");
|
|
107
|
+
if (options.includeFormatting === false) args.push("--no-formatting");
|
|
108
|
+
if (options.deduplicate) args.push("--deduplicate");
|
|
109
|
+
if (options.targetLanguage) args.push("--target-language", options.targetLanguage);
|
|
110
|
+
if (options.withMetadata === true) args.push("--with-metadata");
|
|
111
|
+
if (options.withMetadata === false) args.push("--no-metadata");
|
|
112
|
+
if (options.pruneXpath) {
|
|
113
|
+
const xpaths = Array.isArray(options.pruneXpath) ? options.pruneXpath : [options.pruneXpath];
|
|
114
|
+
for (const xp of xpaths) {
|
|
115
|
+
args.push("--prune-xpath", xp);
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
// Diagnostics
|
|
51
120
|
if (options.verbose) args.push("--verbose");
|
|
52
121
|
|
|
122
|
+
// URLs as positional args (at the end)
|
|
123
|
+
args.push(...urlList);
|
|
124
|
+
|
|
53
125
|
const child = spawn(getBinaryPath(), args, {
|
|
54
126
|
stdio: options.stdio || "inherit",
|
|
55
127
|
});
|