paper-manager 0.10.2 → 0.10.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/dep.js +10 -0
- package/dist/extractor/markdown.js +46 -12
- package/package.json +1 -1
package/dist/commands/dep.js
CHANGED
|
@@ -31,6 +31,7 @@ async function checkOpendataLoader() {
|
|
|
31
31
|
const status = await checkOpendataLoaderStatus();
|
|
32
32
|
const pkgIcon = status.packageInstalled ? chalk.green("✔") : chalk.red("✖");
|
|
33
33
|
const javaIcon = status.javaAvailable ? chalk.green("✔") : chalk.red("✖");
|
|
34
|
+
const hybridIcon = status.hybridBackendAvailable ? chalk.green("✔") : chalk.dim("○");
|
|
34
35
|
log.plain(` ${pkgIcon} @opendataloader/pdf package`);
|
|
35
36
|
if (status.javaAvailable) {
|
|
36
37
|
log.plain(` ${javaIcon} Java runtime (${status.javaVersion})`);
|
|
@@ -38,9 +39,18 @@ async function checkOpendataLoader() {
|
|
|
38
39
|
else {
|
|
39
40
|
log.plain(` ${javaIcon} Java runtime (not found)`);
|
|
40
41
|
}
|
|
42
|
+
if (status.hybridBackendAvailable) {
|
|
43
|
+
log.plain(` ${hybridIcon} Hybrid backend (localhost:5002)`);
|
|
44
|
+
}
|
|
45
|
+
else {
|
|
46
|
+
log.plain(` ${hybridIcon} Hybrid backend (not running, optional)`);
|
|
47
|
+
}
|
|
41
48
|
log.newline();
|
|
42
49
|
if (status.packageInstalled && status.javaAvailable) {
|
|
43
50
|
log.success("opendataloader-pdf is ready.");
|
|
51
|
+
if (status.hybridBackendAvailable) {
|
|
52
|
+
log.step("Hybrid mode enabled — using docling backend for improved extraction.");
|
|
53
|
+
}
|
|
44
54
|
}
|
|
45
55
|
else {
|
|
46
56
|
log.error("opendataloader-pdf is not available.");
|
|
package/dist/extractor/markdown.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { execFile } from "node:child_process";
|
|
2
2
|
import { existsSync, mkdirSync, readdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
|
|
3
|
+
import { request } from "node:http";
|
|
3
4
|
import { tmpdir } from "node:os";
|
|
4
5
|
import * as path from "node:path";
|
|
5
6
|
/**
|
|
@@ -21,23 +22,20 @@ export async function convertPdfToMarkdown(pdfPath) {
|
|
|
21
22
|
mkdirSync(outDir, { recursive: true });
|
|
22
23
|
try {
|
|
23
24
|
const { convert } = await import("@opendataloader/pdf");
|
|
25
|
+
const hybridAvailable = await isHybridBackendAvailable();
|
|
24
26
|
await convert([pdfPath], {
|
|
25
27
|
outputDir: outDir,
|
|
26
28
|
format: "markdown",
|
|
27
29
|
quiet: true,
|
|
30
|
+
...(hybridAvailable ? { hybrid: "docling-fast", hybridFallback: true } : {}),
|
|
28
31
|
});
|
|
29
|
-
const files = readdirSync(outDir);
|
|
30
|
-
const mdFile = files.find((f) => f.endsWith(".md"));
|
|
32
|
+
const mdFile = readdirSync(outDir).find((f) => f.endsWith(".md"));
|
|
31
33
|
if (!mdFile)
|
|
32
34
|
return null;
|
|
33
35
|
const markdown = readFileSync(path.join(outDir, mdFile), "utf-8");
|
|
34
36
|
const imageExtensions = new Set([".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".bmp"]);
|
|
35
37
|
const images = new Map();
|
|
36
|
-
for (const file of files) {
|
37
|
-
if (imageExtensions.has(path.extname(file).toLowerCase())) {
|
|
38
|
-
images.set(file, readFileSync(path.join(outDir, file)));
|
|
39
|
-
}
|
|
40
|
-
}
|
|
38
|
+
collectImages(outDir, outDir, imageExtensions, images);
|
|
41
39
|
return { markdown, images };
|
|
42
40
|
}
|
|
43
41
|
catch {
|
|
@@ -57,10 +55,11 @@ export function saveConvertResult(filesDir, id, result) {
|
|
|
57
55
|
if (result.images.size > 0) {
|
|
58
56
|
const imageSubDir = path.join(filesDir, id);
|
|
59
57
|
mkdirSync(imageSubDir, { recursive: true });
|
|
60
|
-
for (const [
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
58
|
+
for (const [relPath, data] of result.images) {
|
|
59
|
+
const basename = path.basename(relPath);
|
|
60
|
+
writeFileSync(path.join(imageSubDir, basename), data);
|
|
61
|
+
// Rewrite image/link references: ](old/path.png) → ](<id>/basename.png)
|
|
62
|
+
markdown = markdown.replaceAll(`](${relPath})`, `](${id}/${basename})`);
|
|
64
63
|
}
|
|
65
64
|
}
|
|
66
65
|
writeFileSync(path.join(filesDir, `${id}.md`), markdown, "utf-8");
|
|
@@ -75,6 +74,18 @@ export function removeImageDir(filesDir, id) {
|
|
|
75
74
|
}
|
|
76
75
|
}
|
|
77
76
|
// ─── Internal ────────────────────────────────────────────
|
|
77
|
+
/** Recursively collect image files under `dir`, keyed by path relative to `root`. */
|
|
78
|
+
function collectImages(dir, root, extensions, out) {
|
|
79
|
+
for (const entry of readdirSync(dir, { withFileTypes: true })) {
|
|
80
|
+
const full = path.join(dir, entry.name);
|
|
81
|
+
if (entry.isDirectory()) {
|
|
82
|
+
collectImages(full, root, extensions, out);
|
|
83
|
+
}
|
|
84
|
+
else if (extensions.has(path.extname(entry.name).toLowerCase())) {
|
|
85
|
+
out.set(path.relative(root, full), readFileSync(full));
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
|
78
89
|
let cachedAvailability;
|
|
79
90
|
async function detectAvailability() {
|
|
80
91
|
const [hasPackage, hasJava] = await Promise.all([checkPackage(), checkJava()]);
|
|
@@ -101,13 +112,36 @@ async function checkJava() {
|
|
|
101
112
|
* Detailed availability check for the `dep check` command.
|
|
102
113
|
*/
|
|
103
114
|
export async function checkOpendataLoaderStatus() {
|
|
104
|
-
const [packageInstalled, javaResult] = await Promise.all([
|
|
115
|
+
const [packageInstalled, javaResult, hybridBackendAvailable] = await Promise.all([
|
|
116
|
+
checkPackage(),
|
|
117
|
+
getJavaVersion(),
|
|
118
|
+
isHybridBackendAvailable(),
|
|
119
|
+
]);
|
|
105
120
|
return {
|
|
106
121
|
packageInstalled,
|
|
107
122
|
javaAvailable: javaResult !== null,
|
|
108
123
|
javaVersion: javaResult,
|
|
124
|
+
hybridBackendAvailable,
|
|
109
125
|
};
|
|
110
126
|
}
|
|
127
|
+
const HYBRID_BACKEND_URL = "http://localhost:5002";
|
|
128
|
+
const HYBRID_PROBE_TIMEOUT_MS = 1500;
|
|
129
|
+
/** Check if the opendataloader hybrid backend is reachable at localhost:5002. */
|
|
130
|
+
function isHybridBackendAvailable() {
|
|
131
|
+
return new Promise((resolve) => {
|
|
132
|
+
const req = request(HYBRID_BACKEND_URL, { method: "GET", timeout: HYBRID_PROBE_TIMEOUT_MS }, (res) => {
|
|
133
|
+
// Any response means the server is running
|
|
134
|
+
res.resume();
|
|
135
|
+
resolve(true);
|
|
136
|
+
});
|
|
137
|
+
req.on("error", () => resolve(false));
|
|
138
|
+
req.on("timeout", () => {
|
|
139
|
+
req.destroy();
|
|
140
|
+
resolve(false);
|
|
141
|
+
});
|
|
142
|
+
req.end();
|
|
143
|
+
});
|
|
144
|
+
}
|
|
111
145
|
// execFile is safe — arguments are passed as an array, no shell interpolation.
|
|
112
146
|
function getJavaVersion() {
|
|
113
147
|
return new Promise((resolve) => {
|