paper-manager 0.10.2 → 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,6 +31,7 @@ async function checkOpendataLoader() {
31
31
  const status = await checkOpendataLoaderStatus();
32
32
  const pkgIcon = status.packageInstalled ? chalk.green("✔") : chalk.red("✖");
33
33
  const javaIcon = status.javaAvailable ? chalk.green("✔") : chalk.red("✖");
34
+ const hybridIcon = status.hybridBackendAvailable ? chalk.green("✔") : chalk.dim("○");
34
35
  log.plain(` ${pkgIcon} @opendataloader/pdf package`);
35
36
  if (status.javaAvailable) {
36
37
  log.plain(` ${javaIcon} Java runtime (${status.javaVersion})`);
@@ -38,9 +39,18 @@ async function checkOpendataLoader() {
38
39
  else {
39
40
  log.plain(` ${javaIcon} Java runtime (not found)`);
40
41
  }
42
+ if (status.hybridBackendAvailable) {
43
+ log.plain(` ${hybridIcon} Hybrid backend (localhost:5002)`);
44
+ }
45
+ else {
46
+ log.plain(` ${hybridIcon} Hybrid backend (not running, optional)`);
47
+ }
41
48
  log.newline();
42
49
  if (status.packageInstalled && status.javaAvailable) {
43
50
  log.success("opendataloader-pdf is ready.");
51
+ if (status.hybridBackendAvailable) {
52
+ log.step("Hybrid mode enabled — using docling backend for improved extraction.");
53
+ }
44
54
  }
45
55
  else {
46
56
  log.error("opendataloader-pdf is not available.");
@@ -1,5 +1,6 @@
1
1
  import { execFile } from "node:child_process";
2
2
  import { existsSync, mkdirSync, readdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
3
+ import { request } from "node:http";
3
4
  import { tmpdir } from "node:os";
4
5
  import * as path from "node:path";
5
6
  /**
@@ -21,23 +22,20 @@ export async function convertPdfToMarkdown(pdfPath) {
21
22
  mkdirSync(outDir, { recursive: true });
22
23
  try {
23
24
  const { convert } = await import("@opendataloader/pdf");
25
+ const hybridAvailable = await isHybridBackendAvailable();
24
26
  await convert([pdfPath], {
25
27
  outputDir: outDir,
26
28
  format: "markdown",
27
29
  quiet: true,
30
+ ...(hybridAvailable ? { hybrid: "docling-fast", hybridFallback: true } : {}),
28
31
  });
29
- const files = readdirSync(outDir);
30
- const mdFile = files.find((f) => f.endsWith(".md"));
32
+ const mdFile = readdirSync(outDir).find((f) => f.endsWith(".md"));
31
33
  if (!mdFile)
32
34
  return null;
33
35
  const markdown = readFileSync(path.join(outDir, mdFile), "utf-8");
34
36
  const imageExtensions = new Set([".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".bmp"]);
35
37
  const images = new Map();
36
- for (const file of files) {
37
- if (imageExtensions.has(path.extname(file).toLowerCase())) {
38
- images.set(file, readFileSync(path.join(outDir, file)));
39
- }
40
- }
38
+ collectImages(outDir, outDir, imageExtensions, images);
41
39
  return { markdown, images };
42
40
  }
43
41
  catch {
@@ -57,10 +55,11 @@ export function saveConvertResult(filesDir, id, result) {
57
55
  if (result.images.size > 0) {
58
56
  const imageSubDir = path.join(filesDir, id);
59
57
  mkdirSync(imageSubDir, { recursive: true });
60
- for (const [filename, data] of result.images) {
61
- writeFileSync(path.join(imageSubDir, filename), data);
62
- // Rewrite image/link references: ](filename) → ](<id>/filename)
63
- markdown = markdown.replaceAll(`](${filename})`, `](${id}/${filename})`);
58
+ for (const [relPath, data] of result.images) {
59
+ const basename = path.basename(relPath);
60
+ writeFileSync(path.join(imageSubDir, basename), data);
61
+ // Rewrite image/link references: ](old/path.png) → ](<id>/basename.png)
62
+ markdown = markdown.replaceAll(`](${relPath})`, `](${id}/${basename})`);
64
63
  }
65
64
  }
66
65
  writeFileSync(path.join(filesDir, `${id}.md`), markdown, "utf-8");
@@ -75,6 +74,18 @@ export function removeImageDir(filesDir, id) {
75
74
  }
76
75
  }
77
76
  // ─── Internal ────────────────────────────────────────────
77
/**
 * Recursively collect image files under `dir`, keyed by path relative to `root`.
 *
 * Keys always use "/" as the separator, regardless of platform. Callers
 * substitute these keys into Markdown link targets of the form `](key)`,
 * and `path.relative()` would otherwise yield "\"-separated keys on Windows.
 * NOTE(review): assumes the converter writes "/"-separated link targets in
 * the generated Markdown — confirm against real output on Windows.
 *
 * @param {string} dir - Directory to scan on this call (the top-level call passes `root`).
 * @param {string} root - Base directory that every key is made relative to.
 * @param {Set<string>} extensions - Lower-case file extensions, including the leading dot.
 * @param {Map<string, Buffer>} out - Receives `relativePath -> file contents`; mutated in place.
 * @returns {void}
 */
function collectImages(dir, root, extensions, out) {
    for (const entry of readdirSync(dir, { withFileTypes: true })) {
        const full = path.join(dir, entry.name);
        if (entry.isDirectory()) {
            collectImages(full, root, extensions, out);
            continue;
        }
        // Extension matching is case-insensitive: ".PNG" and ".png" are both accepted.
        if (!extensions.has(path.extname(entry.name).toLowerCase())) {
            continue;
        }
        // Normalise the platform separator so keys look the same on every OS.
        const key = path.relative(root, full).split(path.sep).join("/");
        out.set(key, readFileSync(full));
    }
}
78
89
  let cachedAvailability;
79
90
  async function detectAvailability() {
80
91
  const [hasPackage, hasJava] = await Promise.all([checkPackage(), checkJava()]);
@@ -101,13 +112,36 @@ async function checkJava() {
101
112
  * Detailed availability check for the `dep check` command.
102
113
  */
103
114
  export async function checkOpendataLoaderStatus() {
104
- const [packageInstalled, javaResult] = await Promise.all([checkPackage(), getJavaVersion()]);
115
+ const [packageInstalled, javaResult, hybridBackendAvailable] = await Promise.all([
116
+ checkPackage(),
117
+ getJavaVersion(),
118
+ isHybridBackendAvailable(),
119
+ ]);
105
120
  return {
106
121
  packageInstalled,
107
122
  javaAvailable: javaResult !== null,
108
123
  javaVersion: javaResult,
124
+ hybridBackendAvailable,
109
125
  };
110
126
  }
127
const HYBRID_BACKEND_URL = "http://localhost:5002";
const HYBRID_PROBE_TIMEOUT_MS = 1500;
/**
 * Check if the opendataloader hybrid backend is reachable.
 *
 * Sends a single GET request and resolves `true` as soon as any HTTP
 * response arrives; the status code is not inspected. Resolves `false` on a
 * request error, on timeout, or when the request cannot be created at all
 * (e.g. a malformed URL). The returned promise never rejects.
 *
 * @param {string} [url] - Base URL to probe; defaults to `HYBRID_BACKEND_URL`.
 * @param {number} [timeoutMs] - Socket timeout in milliseconds; defaults to `HYBRID_PROBE_TIMEOUT_MS`.
 * @returns {Promise<boolean>} Whether something answered the request.
 */
function isHybridBackendAvailable(url = HYBRID_BACKEND_URL, timeoutMs = HYBRID_PROBE_TIMEOUT_MS) {
    return new Promise((resolve) => {
        let req;
        try {
            req = request(url, { method: "GET", timeout: timeoutMs }, (res) => {
                // Any response means the server is running
                res.resume();
                resolve(true);
            });
        }
        catch {
            // request() throws synchronously on an invalid URL; for a probe
            // that simply means "not available" rather than a rejection.
            resolve(false);
            return;
        }
        req.on("error", () => resolve(false));
        req.on("timeout", () => {
            // The timeout event only signals inactivity; the request must be
            // torn down explicitly.
            req.destroy();
            resolve(false);
        });
        req.end();
    });
}
111
145
  // execFile is safe — arguments are passed as an array, no shell interpolation.
112
146
  function getJavaVersion() {
113
147
  return new Promise((resolve) => {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "paper-manager",
3
- "version": "0.10.2",
3
+ "version": "0.10.4",
4
4
  "description": "A paper management system.",
5
5
  "keywords": [],
6
6
  "homepage": "https://github.com/EurFelux/paper-manager",