contextractor 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +26 -0
- package/index.js +65 -0
- package/package.json +49 -0
- package/postinstall.js +97 -0
package/cli.js
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
"use strict";
|
|
4
|
+
|
|
5
|
+
// Thin launcher: runs the platform-specific contextractor binary with the
// caller's command-line arguments and mirrors how that process ended.

const { spawn } = require("child_process");
const { getBinaryPath } = require("./index");

/**
 * Print a message to stderr and terminate with a failure status.
 *
 * @param {string} message - Text shown to the user.
 */
function fail(message) {
  console.error(message);
  process.exit(1);
}

let binaryPath;
try {
  binaryPath = getBinaryPath();
} catch (err) {
  // getBinaryPath() throws when no binary name can be derived for this
  // machine; show the message instead of an uncaught-exception stack trace.
  fail(err.message);
}

const child = spawn(binaryPath, process.argv.slice(2), {
  stdio: "inherit",
});

child.on("error", (err) => {
  if (err.code === "ENOENT") {
    fail(
      `contextractor binary not found at ${binaryPath}\n` +
        "Try reinstalling: npm install contextractor"
    );
  }
  fail(`Failed to start contextractor: ${err.message}`);
});

child.on("exit", (code, signal) => {
  if (signal) {
    // The child was killed by a signal, so `code` is null. Re-raise the same
    // signal on this process so the caller sees the real termination cause.
    // exitCode is the fallback status if the signal does not end this process.
    process.exitCode = 1;
    process.kill(process.pid, signal);
    return;
  }
  process.exit(code ?? 1);
});
|
package/index.js
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
const path = require("path");
|
|
4
|
+
const os = require("os");
|
|
5
|
+
const { execFileSync, spawn } = require("child_process");
|
|
6
|
+
|
|
7
|
+
// os.platform() value -> platform segment used in release binary names.
const PLATFORM_MAP = {
  darwin: "darwin",
  linux: "linux",
  win32: "win",
};

// os.arch() value -> architecture segment used in release binary names.
const ARCH_MAP = {
  x64: "x64",
  arm64: "arm64",
};

// Platform/architecture pairs listed as supported in this package's messages.
const SUPPORTED_TARGETS = new Set([
  "darwin-arm64",
  "linux-x64",
  "linux-arm64",
  "win-x64",
]);

/**
 * Build the file name of the release binary for the current machine.
 *
 * @returns {string} A name such as "contextractor-linux-x64" or
 *   "contextractor-win-x64.exe".
 * @throws {Error} If the current platform/architecture pair is not one of the
 *   supported targets.
 */
function getBinaryName() {
  const nodePlatform = os.platform();
  const nodeArch = os.arch();
  const platform = PLATFORM_MAP[nodePlatform];
  let arch = ARCH_MAP[nodeArch];

  // Only an arm64 macOS binary is listed as supported, so an x64 Node on
  // macOS is pointed at it as well.
  // NOTE(review): Rosetta 2 translates x64 code on Apple Silicon, not arm64
  // code on Intel hardware, so this mapping only helps an x64 Node build that
  // is running on an Apple Silicon Mac. Confirm how Intel Macs should be
  // handled.
  if (platform === "darwin" && arch === "x64") {
    arch = "arm64";
  }

  // Validate the final pair, not just each half, so combinations such as
  // win-arm64 are rejected here instead of failing later as a missing file.
  const target = `${platform}-${arch}`;
  if (!SUPPORTED_TARGETS.has(target)) {
    throw new Error(
      `Unsupported platform: ${nodePlatform}-${nodeArch}\n` +
        "Supported: darwin-arm64, linux-x64, linux-arm64, win-x64"
    );
  }

  const ext = platform === "win" ? ".exe" : "";
  return `contextractor-${target}${ext}`;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Resolve where the platform binary lives inside this package.
 *
 * @returns {string} Path of the form `<this directory>/bin/<binary name>`.
 *   Any error thrown by getBinaryName() propagates to the caller.
 */
function getBinaryPath() {
  const binDir = path.join(__dirname, "bin");
  return path.join(binDir, getBinaryName());
}
|
|
41
|
+
|
|
42
|
+
/**
 * Run the contextractor binary against a config file.
 *
 * @param {string} configPath - Passed to the binary as its first argument.
 * @param {object} [options] - Optional settings; null is treated as empty.
 * @param {boolean} [options.precision] - Adds `--precision`.
 * @param {boolean} [options.recall] - Adds `--recall`.
 * @param {boolean} [options.noLinks] - Adds `--no-links`.
 * @param {boolean} [options.noComments] - Adds `--no-comments`.
 * @param {string} [options.outputDir] - Adds `--output-dir <value>`.
 * @param {string} [options.format] - Adds `--format <value>`.
 * @param {boolean} [options.verbose] - Adds `--verbose`.
 * @param {string|Array} [options.stdio] - stdio setting forwarded to spawn();
 *   defaults to "inherit".
 * @returns {Promise<void>} Resolves when the binary exits with code 0.
 *   Rejects with the spawn error, or with an Error carrying `exitCode` and
 *   `signal` properties when the binary exits non-zero or is killed.
 */
function extract(configPath, options = {}) {
  return new Promise((resolve, reject) => {
    // The default parameter only covers `undefined`; tolerate `null` as well.
    const opts = options ?? {};

    const args = [configPath];
    if (opts.precision) args.push("--precision");
    if (opts.recall) args.push("--recall");
    if (opts.noLinks) args.push("--no-links");
    if (opts.noComments) args.push("--no-comments");
    if (opts.outputDir) args.push("--output-dir", opts.outputDir);
    if (opts.format) args.push("--format", opts.format);
    if (opts.verbose) args.push("--verbose");

    const child = spawn(getBinaryPath(), args, {
      stdio: opts.stdio || "inherit",
    });

    child.on("error", reject);
    child.on("exit", (code, signal) => {
      if (code === 0) {
        resolve();
        return;
      }
      // When the child is killed by a signal, `code` is null; report the
      // signal instead of the misleading "exited with code null".
      const reason = signal
        ? `was terminated by signal ${signal}`
        : `exited with code ${code}`;
      const failure = new Error(`contextractor ${reason}`);
      failure.exitCode = code;
      failure.signal = signal;
      reject(failure);
    });
  });
}
|
|
64
|
+
|
|
65
|
+
// Public API: binary name/path helpers plus the programmatic extract() wrapper.
module.exports = { getBinaryPath, getBinaryName, extract };
|
package/package.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "contextractor",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Extract web content from URLs with configurable extraction options",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "https://github.com/contextractor/contextractor.git"
|
|
9
|
+
},
|
|
10
|
+
"homepage": "https://github.com/contextractor/contextractor",
|
|
11
|
+
"bugs": {
|
|
12
|
+
"url": "https://github.com/contextractor/contextractor/issues"
|
|
13
|
+
},
|
|
14
|
+
"keywords": [
|
|
15
|
+
"web-scraping",
|
|
16
|
+
"content-extraction",
|
|
17
|
+
"crawling",
|
|
18
|
+
"trafilatura",
|
|
19
|
+
"cli"
|
|
20
|
+
],
|
|
21
|
+
"bin": {
|
|
22
|
+
"contextractor": "./cli.js"
|
|
23
|
+
},
|
|
24
|
+
"main": "./index.js",
|
|
25
|
+
"exports": {
|
|
26
|
+
".": "./index.js",
|
|
27
|
+
"./cli": "./cli.js"
|
|
28
|
+
},
|
|
29
|
+
"files": [
|
|
30
|
+
"cli.js",
|
|
31
|
+
"index.js",
|
|
32
|
+
"postinstall.js"
|
|
33
|
+
],
|
|
34
|
+
"scripts": {
|
|
35
|
+
"postinstall": "node postinstall.js"
|
|
36
|
+
},
|
|
37
|
+
"engines": {
|
|
38
|
+
"node": ">=18"
|
|
39
|
+
},
|
|
40
|
+
"os": [
|
|
41
|
+
"darwin",
|
|
42
|
+
"linux",
|
|
43
|
+
"win32"
|
|
44
|
+
],
|
|
45
|
+
"cpu": [
|
|
46
|
+
"x64",
|
|
47
|
+
"arm64"
|
|
48
|
+
]
|
|
49
|
+
}
|
package/postinstall.js
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
|
|
3
|
+
const https = require("https");
|
|
4
|
+
const http = require("http");
|
|
5
|
+
const fs = require("fs");
|
|
6
|
+
const path = require("path");
|
|
7
|
+
const { execSync } = require("child_process");
|
|
8
|
+
const { getBinaryName } = require("./index");
|
|
9
|
+
|
|
10
|
+
// GitHub "owner/name" slug used to build the release download and help URLs.
const REPO = "contextractor/contextractor";
|
|
11
|
+
// Directory next to this script that receives the downloaded binary.
const BIN_DIR = path.join(__dirname, "bin");
|
|
12
|
+
|
|
13
|
+
/**
 * Read the version string from the package.json that sits next to this script.
 *
 * @returns {string} The manifest's "version" field.
 */
function getPackageVersion() {
  const manifestPath = path.join(__dirname, "package.json");
  const manifestText = fs.readFileSync(manifestPath, "utf8");
  const { version } = JSON.parse(manifestText);
  return version;
}
|
|
19
|
+
|
|
20
|
+
/**
 * GET a URL, following HTTP redirects, and resolve with the final response.
 *
 * @param {string} url - Absolute http(s) URL to fetch.
 * @param {number} [maxRedirects=10] - Redirect hops allowed before giving up.
 * @returns {Promise<import("http").IncomingMessage>} The 200 response, whose
 *   body has not been consumed yet.
 *   Rejects on a network error, a non-200 final status, too many redirects,
 *   an unparsable redirect target, or a redirect from https to a non-https
 *   URL.
 */
function follow(url, maxRedirects = 10) {
  return new Promise((resolve, reject) => {
    const target = new URL(url);
    const mod = target.protocol === "https:" ? https : http;

    const request = mod.get(
      target,
      { headers: { "User-Agent": "contextractor-npm" } },
      (res) => {
        const { statusCode, headers } = res;

        if (statusCode >= 300 && statusCode < 400 && headers.location) {
          // Discard the redirect body so the socket is released.
          res.resume();

          if (maxRedirects <= 0) {
            reject(new Error(`Too many redirects for ${url}`));
            return;
          }

          let next;
          try {
            // Location may be relative; resolve it against the current URL.
            next = new URL(headers.location, target);
          } catch (err) {
            reject(err);
            return;
          }

          // The payload is an executable: never let a secure download be
          // silently downgraded to plaintext.
          if (target.protocol === "https:" && next.protocol !== "https:") {
            reject(
              new Error(`Refusing insecure redirect from ${url} to ${next.href}`)
            );
            return;
          }

          follow(next.href, maxRedirects - 1).then(resolve, reject);
          return;
        }

        if (statusCode !== 200) {
          res.resume();
          reject(new Error(`HTTP ${statusCode} for ${url}`));
          return;
        }

        resolve(res);
      }
    );

    request.on("error", reject);
  });
}
|
|
34
|
+
|
|
35
|
+
/**
 * Download `url` into the file `dest`.
 *
 * The body is first written to `<dest>.download` and renamed to `dest` only
 * after the transfer completes, so a failed or interrupted download never
 * leaves a truncated file at `dest`.
 *
 * @param {string} url - Absolute URL to fetch (redirects are followed).
 * @param {string} dest - Final path of the downloaded file.
 * @returns {Promise<void>} Resolves once `dest` holds the complete body.
 *   Rejects on any request, stream or file-system error.
 */
async function download(url, dest) {
  // Function-scope require: pipeline() is only needed here. It settles on
  // errors from either stream, which a bare pipe() would not report.
  const { pipeline } = require("stream/promises");

  const res = await follow(url);
  const partial = `${dest}.download`;

  try {
    await pipeline(res, fs.createWriteStream(partial));
    fs.renameSync(partial, dest);
  } catch (err) {
    try {
      fs.rmSync(partial, { force: true });
    } catch {
      // Best-effort cleanup; the download error below is the one that matters.
    }
    throw err;
  }
}
|
|
44
|
+
|
|
45
|
+
/**
 * Postinstall entry point.
 *
 * Downloads the release binary matching this package version and the current
 * platform into BIN_DIR (skipped when the file already exists), marks it
 * executable on non-Windows platforms, then tries to install Playwright
 * Chromium. A failed binary download ends the process with status 1; a failed
 * Chromium install only prints a warning.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const binaryName = getBinaryName();
  const version = getPackageVersion();
  const tag = `v${version}`;
  const url = `https://github.com/${REPO}/releases/download/${tag}/${binaryName}`;

  fs.mkdirSync(BIN_DIR, { recursive: true });
  const dest = path.join(BIN_DIR, binaryName);

  // Skip if binary already exists
  if (fs.existsSync(dest)) {
    console.log(`Binary already exists: ${dest}`);
    return;
  }

  console.log(`Downloading contextractor ${tag} for ${process.platform}-${process.arch}...`);
  console.log(`  ${url}`);

  try {
    await download(url, dest);
  } catch (err) {
    // Remove any partial file so a later install does not mistake it for a
    // complete binary in the "already exists" check above.
    try {
      fs.rmSync(dest, { force: true });
    } catch {
      // Best-effort cleanup; the download failure is reported below.
    }
    console.error(
      `Failed to download binary: ${err.message}\n\n` +
        `Your platform (${process.platform}-${process.arch}) may not be supported.\n` +
        `Supported: darwin-arm64, linux-x64, linux-arm64, win-x64\n` +
        `See https://github.com/${REPO}/releases for available binaries.`
    );
    process.exit(1);
  }

  // Make executable on Unix
  if (process.platform !== "win32") {
    fs.chmodSync(dest, 0o755);
  }

  console.log("Binary installed successfully.");

  // Install Playwright Chromium
  console.log("Installing Playwright Chromium...");
  try {
    execSync("npx playwright install chromium", {
      stdio: "inherit",
      timeout: 300000,
    });
  } catch {
    console.warn(
      "Warning: Failed to install Playwright Chromium automatically.\n" +
        "Run 'npx playwright install chromium' manually."
    );
  }
}

// Errors that escape main() (for example from getBinaryName() or chmodSync)
// are reported as a plain message instead of an unhandled promise rejection.
main().catch((err) => {
  console.error(`contextractor postinstall failed: ${err.message}`);
  process.exit(1);
});
|