@heripo/pdf-parser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/NOTICE +37 -0
- package/README.ko.md +426 -0
- package/README.md +426 -0
- package/dist/chunk-JVYF2SQS.js +495 -0
- package/dist/chunk-JVYF2SQS.js.map +1 -0
- package/dist/chunk-VUNV25KB.js +16 -0
- package/dist/chunk-VUNV25KB.js.map +1 -0
- package/dist/index.cjs +1323 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +129 -0
- package/dist/index.d.ts +129 -0
- package/dist/index.js +1295 -0
- package/dist/index.js.map +1 -0
- package/dist/token-HEEJ7XHP.js +63 -0
- package/dist/token-HEEJ7XHP.js.map +1 -0
- package/dist/token-util-MKHXD2JQ.js +6 -0
- package/dist/token-util-MKHXD2JQ.js.map +1 -0
- package/package.json +96 -0
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,1323 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __create = Object.create;
|
|
3
|
+
var __defProp = Object.defineProperty;
|
|
4
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
5
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
7
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
|
+
var __export = (target, all) => {
|
|
9
|
+
for (var name in all)
|
|
10
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
11
|
+
};
|
|
12
|
+
var __copyProps = (to, from, except, desc) => {
|
|
13
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
14
|
+
for (let key of __getOwnPropNames(from))
|
|
15
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
16
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
17
|
+
}
|
|
18
|
+
return to;
|
|
19
|
+
};
|
|
20
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
21
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
22
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
23
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
24
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
25
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
26
|
+
mod
|
|
27
|
+
));
|
|
28
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
|
+
|
|
30
|
+
// src/index.ts
|
|
31
|
+
var src_exports = {};
|
|
32
|
+
__export(src_exports, {
|
|
33
|
+
ImagePdfFallbackError: () => ImagePdfFallbackError,
|
|
34
|
+
PDFParser: () => PDFParser
|
|
35
|
+
});
|
|
36
|
+
module.exports = __toCommonJS(src_exports);
|
|
37
|
+
|
|
38
|
+
// src/core/pdf-parser.ts
|
|
39
|
+
var import_docling_sdk = require("docling-sdk");
|
|
40
|
+
var import_node_child_process3 = require("child_process");
|
|
41
|
+
var import_node_os2 = require("os");
|
|
42
|
+
var import_node_path6 = require("path");
|
|
43
|
+
|
|
44
|
+
// src/config/constants.ts
/** Tuning knobs for the docling-serve-backed PDF parser. */
var PDF_PARSER = {
  /** How long (ms) a single API call may run before timing out. */
  DEFAULT_TIMEOUT_MS: 100000,
  /** Give up on the server health check after this many attempts. */
  MAX_HEALTH_CHECK_ATTEMPTS: 60,
  /** Pause (ms) between consecutive health-check attempts. */
  HEALTH_CHECK_INTERVAL_MS: 2000,
  /** Pause (ms) between progress log lines while health-checking. */
  HEALTH_CHECK_LOG_INTERVAL_MS: 5000,
  /** How many times to retry recovering the server after ECONNREFUSED. */
  MAX_SERVER_RECOVERY_ATTEMPTS: 1
};
/** Settings for the conversion progress loop. */
var PDF_CONVERTER = {
  /** Pause (ms) between progress polls. */
  POLL_INTERVAL_MS: 1000
};
/** Settings for managing the local docling-serve process. */
var DOCLING_ENVIRONMENT = {
  /** Grace period (ms) after spawning docling-serve before it is assumed started. */
  STARTUP_DELAY_MS: 2000
};
/** ImageMagick options used when rasterizing a PDF. */
var IMAGE_PDF_CONVERTER = {
  /** `-density` (DPI) passed to ImageMagick for PDF-to-image conversion. */
  DENSITY: 300,
  /** `-quality` (1-100) passed to ImageMagick. */
  QUALITY: 100
};
|
|
89
|
+
|
|
90
|
+
// ../shared/dist/index.mjs
|
|
91
|
+
var import_child_process = require("child_process");
|
|
92
|
+
/**
 * Spawn a child process and resolve when it exits.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments for the executable.
 * @param {object} [options] - child_process spawn options, plus:
 * @param {boolean} [options.captureStdout=true] - Accumulate stdout into the result.
 * @param {boolean} [options.captureStderr=true] - Accumulate stderr into the result.
 * @returns {Promise<{stdout: string, stderr: string, code: number}>}
 *   Captured output and exit code. A child terminated by a signal reports
 *   exit code `null` from Node; that is surfaced here as code 1 so callers
 *   that check `code !== 0` treat signal death as failure (previously it
 *   was coerced to 0 and silently looked like success).
 * @throws Rejects only if the process could not be spawned at all.
 */
function spawnAsync(command, args, options = {}) {
  const {
    captureStdout = true,
    captureStderr = true,
    ...spawnOptions
  } = options;
  return new Promise((resolve, reject) => {
    const proc = (0, import_child_process.spawn)(command, args, spawnOptions);
    let stdout = "";
    let stderr = "";
    if (captureStdout && proc.stdout) {
      proc.stdout.on("data", (data) => {
        stdout += data.toString();
      });
    }
    if (captureStderr && proc.stderr) {
      proc.stderr.on("data", (data) => {
        stderr += data.toString();
      });
    }
    proc.on("close", (code) => {
      // `code` is null when the child was killed by a signal; report that
      // as a non-zero exit so it is not mistaken for success.
      resolve({ stdout, stderr, code: code ?? 1 });
    });
    proc.on("error", reject);
  });
}
|
|
118
|
+
|
|
119
|
+
// src/environment/docling-environment.ts
|
|
120
|
+
var import_node_child_process = require("child_process");
|
|
121
|
+
var import_node_path = require("path");
|
|
122
|
+
|
|
123
|
+
// src/utils/python-version.ts
// Matches the major/minor portion of `python --version` output, e.g. "Python 3.12.4".
var PYTHON_VERSION_REGEX = /Python (\d+)\.(\d+)/;
// Minimum Python version this package supports.
var MIN_PYTHON_VERSION = { major: 3, minor: 9 };
/** Raised when the detected Python interpreter is outside the supported range. */
var PythonVersionError = class extends Error {
  constructor(message) {
    super(message);
    this.name = "PythonVersionError";
  }
};
/**
 * Parse the major/minor version out of `python --version` output.
 *
 * @param {string} output - Combined stdout/stderr of the version command.
 * @returns {{major: number, minor: number, versionString: string} | null}
 *   Parsed version, or null when no "Python X.Y" string is present.
 */
function parsePythonVersion(output) {
  const match = output.match(PYTHON_VERSION_REGEX);
  if (!match) return null;
  // Explicit radix: never rely on parseInt's input-dependent default.
  const major = Number.parseInt(match[1], 10);
  const minor = Number.parseInt(match[2], 10);
  return {
    major,
    minor,
    versionString: `${major}.${minor}`
  };
}
/**
 * Validate that a parsed Python version is usable by docling-serve.
 *
 * @param {{major: number, minor: number}} version - Parsed version to check.
 * @param {"system"|"venv"} [context="system"] - Where the interpreter came
 *   from; only affects the error-message prefix.
 * @throws {PythonVersionError} When the version is 3.13+ (too new for
 *   docling-serve) or below MIN_PYTHON_VERSION.
 */
function validatePythonVersion(version, context = "system") {
  const { major, minor } = version;
  const prefix = context === "venv" ? "Venv Python" : "Python";
  if (major === 3 && minor >= 13) {
    throw new PythonVersionError(
      `${prefix} ${major}.${minor} is too new. docling-serve requires Python 3.11 or 3.12.`
    );
  }
  if (major !== 3 || minor < MIN_PYTHON_VERSION.minor) {
    // Derive the message from MIN_PYTHON_VERSION so it cannot drift from the
    // constant (previously "3.9" was hardcoded here; the text is unchanged).
    throw new PythonVersionError(
      `Python ${MIN_PYTHON_VERSION.major}.${MIN_PYTHON_VERSION.minor} or higher is required`
    );
  }
}
|
|
155
|
+
|
|
156
|
+
// src/environment/docling-environment.ts
|
|
157
|
+
/**
 * Manages a local docling-serve installation: verifies the system Python,
 * builds a virtualenv, installs dependencies with pip, and starts (or
 * reuses) the docling-serve server on a configured port.
 */
var DoclingEnvironment = class _DoclingEnvironment {
  logger;              // logger with info/error methods
  venvPath;            // path of the Python virtual environment
  port;                // TCP port docling-serve listens on
  killExistingProcess; // when true, kill whatever already holds the port
  constructor(options) {
    this.logger = options.logger;
    this.venvPath = options.venvPath;
    this.port = options.port;
    this.killExistingProcess = options.killExistingProcess;
  }
  /**
   * Full bootstrap. Steps run strictly in sequence because each depends on
   * the previous one (venv before pip, pip before installs, installs before
   * the server). If the port is already occupied and killExistingProcess is
   * false, the existing server is reused instead of starting a new one.
   */
  async setup() {
    this.logger.info("[DoclingEnvironment] Setting up Python environment...");
    await this.checkPythonVersion();
    await this.setupPythonEnvironment();
    await this.upgradePip();
    await this.installSetuptools();
    await this.installPyArrow();
    await this.installDoclingServe();
    const portInUse = await this.isPortInUse(this.port);
    if (portInUse && !this.killExistingProcess) {
      this.logger.info(
        "[DoclingEnvironment] Reusing existing server on port",
        this.port
      );
    } else {
      await this.startDoclingServe();
    }
    this.logger.info("[DoclingEnvironment] Setup completed");
  }
  /**
   * Run `python3 --version`, parse and validate the result.
   * stdout and stderr are concatenated before parsing because older Pythons
   * print the version on stderr.
   * @returns Parsed {major, minor, versionString}.
   * @throws When the command fails, output is unparseable, or the version is unsupported.
   */
  async checkPythonVersion() {
    const result = await spawnAsync("python3", ["--version"]);
    if (result.code !== 0) {
      throw new Error("Failed to check Python version");
    }
    const output = result.stdout + result.stderr;
    const version = parsePythonVersion(output);
    if (!version) {
      throw new Error("Could not parse Python version");
    }
    this.logger.info(
      "[DoclingEnvironment] Python version:",
      version.versionString
    );
    try {
      validatePythonVersion(version, "system");
    } catch (error) {
      // Give an actionable hint for the common "Python too new" case before rethrowing.
      if (error instanceof PythonVersionError && version.minor >= 13) {
        this.logger.error(
          "[DoclingEnvironment] Python 3.13+ is not compatible. Install 3.11 or 3.12 with: pyenv install 3.12.0 && pyenv global 3.12.0"
        );
      }
      throw error;
    }
    return version;
  }
  /** Create the virtualenv at venvPath and verify its interpreter version. */
  async setupPythonEnvironment() {
    const result = await spawnAsync("python3", ["-m", "venv", this.venvPath]);
    if (result.code !== 0) {
      throw new Error("Failed to create Python virtual environment");
    }
    await this.verifyVenvPythonVersion();
  }
  /**
   * Confirm the venv's own interpreter (venv/bin/python) is a supported
   * version — it can differ from the system python3 (e.g. pyenv shims).
   */
  async verifyVenvPythonVersion() {
    const pythonPath = (0, import_node_path.join)(this.venvPath, "bin", "python");
    const result = await spawnAsync(pythonPath, ["--version"]);
    if (result.code !== 0) {
      throw new Error("Failed to verify venv Python version");
    }
    const output = result.stdout + result.stderr;
    const version = parsePythonVersion(output);
    if (!version) {
      throw new Error("Could not parse venv Python version");
    }
    validatePythonVersion(version, "venv");
  }
  /** Upgrade pip inside the venv; logged stderr plus a thrown error on failure. */
  async upgradePip() {
    const pipPath = (0, import_node_path.join)(this.venvPath, "bin", "pip");
    const result = await spawnAsync(pipPath, ["install", "--upgrade", "pip"]);
    if (result.code !== 0) {
      this.logger.error(
        "[DoclingEnvironment] Failed to upgrade pip:",
        result.stderr
      );
      throw new Error(`Failed to upgrade pip. Exit code: ${result.code}`);
    }
  }
  /** Install/upgrade setuptools and wheel (build prerequisites). */
  async installSetuptools() {
    const pipPath = (0, import_node_path.join)(this.venvPath, "bin", "pip");
    const result = await spawnAsync(pipPath, [
      "install",
      "--upgrade",
      "setuptools",
      "wheel"
    ]);
    if (result.code !== 0) {
      this.logger.error(
        "[DoclingEnvironment] Failed to install setuptools:",
        result.stderr
      );
      throw new Error(
        `Failed to install setuptools. Exit code: ${result.code}`
      );
    }
  }
  /**
   * Install pyarrow from a prebuilt wheel only (--only-binary :all:) —
   * building it from source is slow and frequently fails.
   */
  async installPyArrow() {
    const pipPath = (0, import_node_path.join)(this.venvPath, "bin", "pip");
    const result = await spawnAsync(pipPath, [
      "install",
      "--only-binary",
      ":all:",
      "pyarrow"
    ]);
    if (result.code !== 0) {
      this.logger.error(
        "[DoclingEnvironment] Failed to install pyarrow:",
        result.stderr
      );
      throw new Error(`Failed to install pyarrow. Exit code: ${result.code}`);
    }
  }
  /** Install the docling-serve package itself into the venv. */
  async installDoclingServe() {
    const pipPath = (0, import_node_path.join)(this.venvPath, "bin", "pip");
    const result = await spawnAsync(pipPath, ["install", "docling-serve"]);
    if (result.code !== 0) {
      this.logger.error(
        "[DoclingEnvironment] Failed to install docling-serve:",
        result.stderr
      );
      throw new Error(
        `Failed to install docling-serve. Exit code: ${result.code}`
      );
    }
  }
  /**
   * Check whether something is listening on `port` via `lsof -ti`.
   * Returns false when lsof is unavailable or finds nothing.
   */
  async isPortInUse(port) {
    try {
      const result = await spawnAsync("lsof", ["-ti", `:${port}`]);
      return result.code === 0 && !!result.stdout.trim();
    } catch {
      return false;
    }
  }
  /**
   * Start the docling-serve server without running full setup.
   * Useful for restarting the server after it has crashed.
   */
  async startServer() {
    await this.startDoclingServe();
  }
  // Process-killing logic is provided as a static method to allow reuse without instantiation
  /**
   * SIGKILL every process currently listening on `port` (found via lsof).
   * Resolves once all kill attempts finish; never rejects — failures
   * (missing lsof, already-dead pids) are logged and ignored.
   */
  static async killProcessOnPort(logger, port) {
    return new Promise((resolve) => {
      const lsof = (0, import_node_child_process.spawn)("lsof", ["-ti", `:${port}`]);
      const pids = [];
      lsof.stdout?.on("data", (data) => {
        const txt = data.toString();
        pids.push(
          ...txt.split(/\s+/).map((s) => s.trim()).filter(Boolean)
        );
      });
      lsof.on("close", () => {
        if (pids.length === 0) return resolve();
        let remaining = pids.length;
        const done = () => {
          if (--remaining <= 0) resolve();
        };
        logger.info(
          "[DoclingEnvironment] Killing process",
          pids.join(", "),
          "on port",
          port
        );
        for (const pid of pids) {
          const killProc = (0, import_node_child_process.spawn)("kill", ["-9", pid]);
          killProc.on("close", (killCode) => {
            if (killCode !== 0) {
              logger.info("[DoclingEnvironment] Failed to kill process", pid);
            }
            done();
          });
          killProc.on("error", (err) => {
            logger.info("[DoclingEnvironment] Failed to kill process", err);
            done();
          });
        }
      });
      // lsof missing entirely: treat as "nothing to kill".
      lsof.on("error", () => resolve());
    });
  }
  /**
   * Spawn docling-serve as a detached background process. Resolves after
   * STARTUP_DELAY_MS, or rejects if the process fails to spawn.
   *
   * Fix: the port-kill step is awaited *before* entering the Promise
   * executor. The original used `new Promise(async (resolve, reject) => …)`;
   * with an async executor a rejection from killProcessOnPort would become
   * an unhandled rejection and the returned promise would stay pending
   * forever.
   */
  async startDoclingServe() {
    if (this.killExistingProcess) {
      await _DoclingEnvironment.killProcessOnPort(this.logger, this.port);
    }
    const doclingServePath = (0, import_node_path.join)(this.venvPath, "bin", "docling-serve");
    const args = ["run", "--port", this.port.toString()];
    this.logger.info(
      "[DoclingEnvironment] Starting docling-serve on port",
      this.port
    );
    return new Promise((resolve, reject) => {
      const doclingProcess = (0, import_node_child_process.spawn)(doclingServePath, args, {
        detached: true,
        // Detached from parent process
        stdio: "ignore"
        // No stdio pipes so the parent's event loop is not kept alive
      });
      doclingProcess.unref();
      doclingProcess.on("error", (error) => {
        this.logger.error("[DoclingEnvironment] docling-serve error:", error);
        reject(error);
      });
      // Assume the server is up after the startup grace period; detailed
      // readiness is handled by the caller's health checks.
      setTimeout(() => {
        resolve();
      }, DOCLING_ENVIRONMENT.STARTUP_DELAY_MS);
    });
  }
};
|
|
375
|
+
|
|
376
|
+
// src/core/pdf-converter.ts
|
|
377
|
+
var import_es_toolkit = require("es-toolkit");
|
|
378
|
+
var import_node_fs4 = require("fs");
|
|
379
|
+
var import_node_path5 = require("path");
|
|
380
|
+
var import_promises = require("stream/promises");
|
|
381
|
+
|
|
382
|
+
// src/errors/image-pdf-fallback-error.ts
|
|
383
|
+
/**
 * Error thrown when both the original PDF conversion and the image-PDF
 * fallback conversion fail. Carries both underlying errors so callers can
 * inspect each failure individually.
 */
var ImagePdfFallbackError = class extends Error {
  name = "ImagePdfFallbackError";
  constructor(originalError, fallbackError) {
    const summary = `PDF conversion failed with fallback. Original: ${originalError.message}. Fallback: ${fallbackError.message}`;
    super(summary);
    this.originalError = originalError;
    this.fallbackError = fallbackError;
  }
};
|
|
393
|
+
|
|
394
|
+
// src/processors/image-extractor.ts
|
|
395
|
+
var import_node_fs = require("fs");
|
|
396
|
+
var import_node_path2 = require("path");
|
|
397
|
+
var yauzl = __toESM(require("yauzl"), 1);
|
|
398
|
+
|
|
399
|
+
// src/utils/jq.ts
|
|
400
|
+
var import_node_child_process2 = require("child_process");
|
|
401
|
+
/**
 * Resolve the jq executable to invoke: the JQ_PATH environment variable
 * (trimmed) when it is set to a non-empty value, otherwise plain "jq"
 * resolved via PATH.
 */
function getJqPath() {
  const configured = process.env.JQ_PATH?.trim();
  if (configured && configured.length > 0) {
    return configured;
  }
  return "jq";
}
|
|
405
|
+
/**
 * Run a jq program against a file on disk and parse jq's stdout as JSON.
 *
 * jq reads the file itself, so large JSON documents never pass through
 * Node's memory as strings (beyond jq's compact output). Rejects when jq
 * cannot be spawned, exits non-zero, or emits output that is not valid JSON.
 *
 * @param {string} program - jq filter program.
 * @param {string} filePath - Path of the JSON file jq should read.
 * @returns {Promise<any>} Parsed JSON value produced by jq.
 */
function runJqFileJson(program, filePath) {
  return new Promise((resolve, reject) => {
    const args = [
      "-c",
      // compact output (single line when possible)
      program,
      filePath
    ];
    const child = (0, import_node_child_process2.spawn)(getJqPath(), args, {
      stdio: ["ignore", "pipe", "pipe"],
      env: process.env
    });
    child.stdout.setEncoding("utf-8");
    child.stderr.setEncoding("utf-8");
    let stdout = "";
    let stderr = "";
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", reject);
    child.on("close", (code) => {
      if (code !== 0) {
        reject(
          new Error(
            `jq exited with code ${code}. ${stderr ? "Stderr: " + stderr : ""}`
          )
        );
        return;
      }
      try {
        resolve(JSON.parse(stdout.trim()));
      } catch (e) {
        reject(
          new Error(
            `Failed to parse jq output as JSON. Output length=${stdout.length}. Error: ${e.message}`
          )
        );
      }
    });
  });
}
|
|
452
|
+
/**
 * Collect every string anywhere in the JSON file that starts with
 * "data:image/png;base64" (i.e. inline base64-encoded PNG images) into a
 * flat array, using jq's recursive-descent operator.
 */
function jqExtractBase64PngStrings(filePath) {
  const program = `
[
.. |
select(type == "string" and startswith("data:image/png;base64"))
]
`;
  return runJqFileJson(program, filePath);
}
|
|
461
|
+
/**
 * Walk the JSON file with jq, replacing every base64 PNG data-URI string
 * with a sequential relative path "<dirName>/<prefix>_<index>.png".
 * A jq `reduce` over the matching paths threads a counter through the
 * traversal. Resolves to { data, count }: the rewritten document and the
 * number of replacements made.
 */
function jqReplaceBase64WithPaths(filePath, dirName, prefix) {
  const program = `
reduce paths(type == "string" and startswith("data:image/png;base64")) as $p (
{data: ., counter: 0};
.counter as $idx |
.data |= setpath($p; "${dirName}/${prefix}_\\($idx).png") |
.counter += 1
) | {data: .data, count: .counter}
`;
  return runJqFileJson(program, filePath);
}
|
|
472
|
+
|
|
473
|
+
// src/processors/image-extractor.ts
|
|
474
|
+
/**
 * Static helpers for unpacking docling conversion results: extracting the
 * result ZIP, pulling base64-embedded PNGs out of the JSON/HTML documents,
 * writing them as standalone files, and rewriting the documents to point at
 * the extracted files instead of inline data URIs.
 */
var ImageExtractor = class _ImageExtractor {
  /**
   * Extract a ZIP file to a target directory
   */
  static async extractZip(zipPath, targetDir) {
    return new Promise((resolve, reject) => {
      yauzl.open(zipPath, { lazyEntries: true }, (err, zipfile) => {
        if (err || !zipfile) {
          reject(err || new Error("Failed to open zip file"));
          return;
        }
        zipfile.readEntry();
        zipfile.on("entry", (entry) => {
          const entryPath = (0, import_node_path2.join)(targetDir, entry.fileName);
          // Directory entries end with "/": create them and continue.
          if (/\/$/.test(entry.fileName)) {
            (0, import_node_fs.mkdirSync)(entryPath, { recursive: true });
            zipfile.readEntry();
          } else {
            zipfile.openReadStream(entry, (err2, readStream) => {
              if (err2 || !readStream) {
                reject(err2 || new Error("Failed to open read stream"));
                return;
              }
              // Ensure the parent directory exists before streaming the file out.
              (0, import_node_fs.mkdirSync)((0, import_node_path2.join)(entryPath, ".."), { recursive: true });
              const writeStream = (0, import_node_fs.createWriteStream)(entryPath);
              readStream.pipe(writeStream);
              writeStream.on("finish", () => {
                zipfile.readEntry();
              });
              writeStream.on("error", reject);
            });
          }
        });
        zipfile.on("end", () => {
          resolve();
        });
        zipfile.on("error", reject);
      });
    });
  }
  /**
   * Extract base64 images from JSON file using jq (for large files)
   * Returns array of base64 data strings
   */
  static async extractBase64ImagesFromJsonWithJq(jsonPath) {
    return jqExtractBase64PngStrings(jsonPath);
  }
  /**
   * Replace base64 images with file paths in JSON using jq (for large files)
   * Uses reduce to maintain counter state while walking the JSON
   * @returns Number of base64 strings replaced.
   */
  static async replaceBase64ImagesInJsonWithJq(jsonPath, outputPath, dirName, prefix) {
    const { data, count } = await jqReplaceBase64WithPaths(
      jsonPath,
      dirName,
      prefix
    );
    (0, import_node_fs.writeFileSync)(outputPath, JSON.stringify(data, null, 2), "utf-8");
    return count;
  }
  /**
   * Extract a base64-encoded image to a file and return the relative path
   * "<dirName>/<prefix>_<index>.png". Accepts either a bare base64 string or
   * a full "data:image/png;base64,..." data URI.
   */
  static extractBase64ImageToFile(base64Data, imagesDir, index, prefix, dirName) {
    const PREFIX = "data:image/png;base64,";
    const base64Content = base64Data.startsWith(PREFIX) ? base64Data.slice(PREFIX.length) : base64Data;
    const filename = `${prefix}_${index}.png`;
    const filepath = (0, import_node_path2.join)(imagesDir, filename);
    const buffer = Buffer.from(base64Content, "base64");
    (0, import_node_fs.writeFileSync)(filepath, buffer);
    // Fix: interpolate the computed filename (the published bundle had a
    // garbled interpolation here). This must match the naming scheme used by
    // replaceBase64ImagesInJsonWithJq ("<prefix>_<idx>.png").
    return `${dirName}/${filename}`;
  }
  /**
   * Save JSON and HTML documents with base64 images extracted to separate files
   * Uses jq for JSON processing to handle large files
   *
   * This method:
   * 1. Extracts base64-encoded images from JSON and HTML content
   * 2. Saves images as separate PNG files
   * 3. Replaces base64 data with relative file paths
   * 4. Saves the transformed documents to the output directory
   */
  static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlContent) {
    // Start from a clean output directory; a failed cleanup is non-fatal.
    try {
      if ((0, import_node_fs.existsSync)(outputDir)) {
        (0, import_node_fs.rmSync)(outputDir, { recursive: true, force: true });
      }
    } catch (e) {
      logger.warn("[PDFConverter] Failed to clear output directory:", e);
    }
    (0, import_node_fs.mkdirSync)(outputDir, { recursive: true });
    const baseName = filename.replace((0, import_node_path2.extname)(filename), "");
    const jsonPath = (0, import_node_path2.join)(outputDir, `${baseName}.json`);
    try {
      const pagesDir = (0, import_node_path2.join)(outputDir, "pages");
      if (!(0, import_node_fs.existsSync)(pagesDir)) {
        (0, import_node_fs.mkdirSync)(pagesDir, { recursive: true });
      }
      const base64Images = await _ImageExtractor.extractBase64ImagesFromJsonWithJq(jsonSourcePath);
      base64Images.forEach((base64Data, index) => {
        _ImageExtractor.extractBase64ImageToFile(
          base64Data,
          pagesDir,
          index,
          "page",
          "pages"
        );
      });
      logger.info(
        `[PDFConverter] Extracted ${base64Images.length} images from JSON to ${pagesDir}`
      );
      const replacedCount = await _ImageExtractor.replaceBase64ImagesInJsonWithJq(
        jsonSourcePath,
        jsonPath,
        "pages",
        "page"
      );
      logger.info(
        `[PDFConverter] Replaced ${replacedCount} base64 images with file paths`
      );
    } catch (e) {
      // JSON extraction failing (e.g. jq missing) is fatal for this step.
      logger.warn(
        "[PDFConverter] Failed to extract images from JSON using jq. Error:",
        e
      );
      throw e;
    }
    logger.info("[PDFConverter] Saved JSON:", jsonPath);
    const htmlPath = (0, import_node_path2.join)(outputDir, `${baseName}.html`);
    try {
      const imagesDir = (0, import_node_path2.join)(outputDir, "images");
      if (!(0, import_node_fs.existsSync)(imagesDir)) {
        (0, import_node_fs.mkdirSync)(imagesDir, { recursive: true });
      }
      let imageIndex = 0;
      // Rewrite each inline <img src="data:..."> to a relative file path,
      // writing the decoded PNG alongside as images/image_<n>.png.
      const transformedHtml = htmlContent.replace(
        /src="data:image\/png;base64,([^"]+)"/g,
        (_, base64Content) => {
          const filename2 = `image_${imageIndex}.png`;
          const filepath = (0, import_node_path2.join)(imagesDir, filename2);
          const buffer = Buffer.from(base64Content, "base64");
          (0, import_node_fs.writeFileSync)(filepath, buffer);
          const relativePath = `images/${filename2}`;
          imageIndex += 1;
          return `src="${relativePath}"`;
        }
      );
      logger.info(
        `[PDFConverter] Extracted ${imageIndex} images from HTML to ${imagesDir}`
      );
      (0, import_node_fs.writeFileSync)(htmlPath, transformedHtml, "utf-8");
    } catch (e) {
      // Best effort for HTML: fall back to writing the untransformed content.
      logger.warn(
        "[PDFConverter] Failed to extract images from HTML, writing original. Error:",
        e
      );
      (0, import_node_fs.writeFileSync)(htmlPath, htmlContent, "utf-8");
    }
    logger.info("[PDFConverter] Saved HTML:", htmlPath);
  }
  /**
   * Extract documents from ZIP and save with extracted images
   * Uses jq for JSON processing to handle large files without loading into Node.js memory
   *
   * Complete workflow:
   * 1. Extract ZIP file to temporary directory
   * 2. Find JSON and HTML files from extracted files
   * 3. Use jq to extract base64 images from JSON and save as separate files
   * 4. Use jq to replace base64 with file paths in JSON
   * 5. Process HTML with regex to extract and replace images
   * 6. Save transformed documents to output directory (as result.json and result.html)
   */
  static async extractAndSaveDocumentsFromZip(logger, zipPath, extractDir, outputDir) {
    logger.info("[PDFConverter] Extracting ZIP file...");
    await _ImageExtractor.extractZip(zipPath, extractDir);
    const files = (0, import_node_fs.readdirSync)(extractDir);
    const jsonFile = files.find((f) => (0, import_node_path2.extname)(f).toLowerCase() === ".json");
    const htmlFile = files.find((f) => (0, import_node_path2.extname)(f).toLowerCase() === ".html");
    if (!jsonFile || !htmlFile) {
      throw new Error(
        `Expected one JSON and one HTML file in extracted directory. Found: ${files.join(", ")}`
      );
    }
    const jsonPath = (0, import_node_path2.join)(extractDir, jsonFile);
    const htmlPath = (0, import_node_path2.join)(extractDir, htmlFile);
    const htmlContent = (0, import_node_fs.readFileSync)(htmlPath, "utf-8");
    logger.info("[PDFConverter] Saving converted files to output...");
    await _ImageExtractor.saveDocumentsWithExtractedImages(
      logger,
      outputDir,
      "result",
      jsonPath,
      htmlContent
    );
    logger.info("[PDFConverter] Files saved to:", outputDir);
  }
};
|
|
671
|
+
|
|
672
|
+
// src/utils/local-file-server.ts
|
|
673
|
+
var import_node_fs2 = require("fs");
|
|
674
|
+
var import_node_http = require("http");
|
|
675
|
+
var import_node_path3 = require("path");
|
|
676
|
+
/**
 * Minimal single-file HTTP server: serves one local file (as
 * application/pdf) on an ephemeral loopback port, so tools that only accept
 * URLs can read a local PDF.
 */
var LocalFileServer = class {
  server = null; // active http.Server, or null when stopped
  port = 0;      // bound port (0 while stopped)
  /**
   * Start serving a file and return the URL
   *
   * @param filePath Absolute path to the file to serve
   * @returns URL to access the file
   */
  async start(filePath) {
    const filename = (0, import_node_path3.basename)(filePath);
    const stat = (0, import_node_fs2.statSync)(filePath);
    return new Promise((resolve, reject) => {
      this.server = (0, import_node_http.createServer)((req, res) => {
        // Only the exact path of the served file is answered; anything else 404s.
        // Fix: interpolate the filename (the published bundle had a garbled
        // interpolation here and in the resolved URL below).
        // NOTE(review): req.url arrives percent-encoded; filenames containing
        // spaces or non-ASCII characters would not match — confirm callers
        // only pass safe names.
        if (req.url === `/${filename}`) {
          res.writeHead(200, {
            "Content-Type": "application/pdf",
            "Content-Length": stat.size
          });
          (0, import_node_fs2.createReadStream)(filePath).pipe(res);
        } else {
          res.writeHead(404);
          res.end("Not Found");
        }
      });
      this.server.on("error", reject);
      // Port 0 lets the OS pick a free ephemeral port; bind loopback only.
      this.server.listen(0, "127.0.0.1", () => {
        const address = this.server.address();
        if (typeof address === "object" && address !== null) {
          this.port = address.port;
          resolve(`http://127.0.0.1:${this.port}/${filename}`);
        } else {
          reject(new Error("Failed to get server address"));
        }
      });
    });
  }
  /**
   * Stop the server
   */
  stop() {
    return new Promise((resolve) => {
      if (this.server) {
        this.server.close(() => {
          this.server = null;
          this.port = 0;
          resolve();
        });
      } else {
        resolve();
      }
    });
  }
};
|
|
730
|
+
|
|
731
|
+
// src/core/image-pdf-converter.ts
|
|
732
|
+
var import_node_fs3 = require("fs");
|
|
733
|
+
var import_node_os = require("os");
|
|
734
|
+
var import_node_path4 = require("path");
|
|
735
|
+
var ImagePdfConverter = class {
|
|
736
|
+
constructor(logger) {
|
|
737
|
+
this.logger = logger;
|
|
738
|
+
}
|
|
739
|
+
/**
|
|
740
|
+
* Convert a PDF file to an image-based PDF.
|
|
741
|
+
* Downloads the PDF from URL, converts it using ImageMagick, and returns the path.
|
|
742
|
+
*
|
|
743
|
+
* @param pdfUrl - URL of the source PDF
|
|
744
|
+
* @param reportId - Report identifier for temp file naming
|
|
745
|
+
* @returns Path to the converted image PDF in temp directory
|
|
746
|
+
*/
|
|
747
|
+
async convert(pdfUrl, reportId) {
|
|
748
|
+
const timestamp = Date.now();
|
|
749
|
+
const tempDir = (0, import_node_os.tmpdir)();
|
|
750
|
+
const inputPath = (0, import_node_path4.join)(tempDir, `${reportId}-${timestamp}-input.pdf`);
|
|
751
|
+
const outputPath = (0, import_node_path4.join)(tempDir, `${reportId}-${timestamp}-image.pdf`);
|
|
752
|
+
try {
|
|
753
|
+
this.logger.info("[ImagePdfConverter] Downloading PDF from URL...");
|
|
754
|
+
await this.downloadPdf(pdfUrl, inputPath);
|
|
755
|
+
this.logger.info("[ImagePdfConverter] Converting to image PDF...");
|
|
756
|
+
await this.convertToImagePdf(inputPath, outputPath);
|
|
757
|
+
this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
|
|
758
|
+
return outputPath;
|
|
759
|
+
} finally {
|
|
760
|
+
if ((0, import_node_fs3.existsSync)(inputPath)) {
|
|
761
|
+
(0, import_node_fs3.rmSync)(inputPath, { force: true });
|
|
762
|
+
}
|
|
763
|
+
}
|
|
764
|
+
}
|
|
765
|
+
/**
|
|
766
|
+
* Download PDF from URL to local path using curl
|
|
767
|
+
*/
|
|
768
|
+
async downloadPdf(url, outputPath) {
|
|
769
|
+
const result = await spawnAsync("curl", [
|
|
770
|
+
"-L",
|
|
771
|
+
// Follow redirects
|
|
772
|
+
"-o",
|
|
773
|
+
outputPath,
|
|
774
|
+
"-s",
|
|
775
|
+
// Silent mode
|
|
776
|
+
"--fail",
|
|
777
|
+
// Fail on HTTP errors
|
|
778
|
+
url
|
|
779
|
+
]);
|
|
780
|
+
if (result.code !== 0) {
|
|
781
|
+
throw new Error(
|
|
782
|
+
`Failed to download PDF: ${result.stderr || "Unknown error"}`
|
|
783
|
+
);
|
|
784
|
+
}
|
|
785
|
+
}
|
|
786
|
+
/**
|
|
787
|
+
* Convert PDF to image-based PDF using ImageMagick
|
|
788
|
+
*/
|
|
789
|
+
async convertToImagePdf(inputPath, outputPath) {
|
|
790
|
+
const result = await spawnAsync("magick", [
|
|
791
|
+
"-density",
|
|
792
|
+
IMAGE_PDF_CONVERTER.DENSITY.toString(),
|
|
793
|
+
inputPath,
|
|
794
|
+
"-quality",
|
|
795
|
+
IMAGE_PDF_CONVERTER.QUALITY.toString(),
|
|
796
|
+
outputPath
|
|
797
|
+
]);
|
|
798
|
+
if (result.code !== 0) {
|
|
799
|
+
throw new Error(
|
|
800
|
+
`Failed to convert PDF to image PDF: ${result.stderr || "Unknown error"}`
|
|
801
|
+
);
|
|
802
|
+
}
|
|
803
|
+
}
|
|
804
|
+
/**
|
|
805
|
+
* Cleanup the temporary image PDF file
|
|
806
|
+
*/
|
|
807
|
+
cleanup(imagePdfPath) {
|
|
808
|
+
if ((0, import_node_fs3.existsSync)(imagePdfPath)) {
|
|
809
|
+
this.logger.info(
|
|
810
|
+
"[ImagePdfConverter] Cleaning up temp file:",
|
|
811
|
+
imagePdfPath
|
|
812
|
+
);
|
|
813
|
+
(0, import_node_fs3.rmSync)(imagePdfPath, { force: true });
|
|
814
|
+
}
|
|
815
|
+
}
|
|
816
|
+
};
|
|
817
|
+
|
|
818
|
+
// src/core/pdf-converter.ts
|
|
819
|
+
var PDFConverter = class {
|
|
820
|
+
constructor(logger, client, enableImagePdfFallback = false) {
|
|
821
|
+
this.logger = logger;
|
|
822
|
+
this.client = client;
|
|
823
|
+
this.enableImagePdfFallback = enableImagePdfFallback;
|
|
824
|
+
}
|
|
825
|
+
async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
|
|
826
|
+
this.logger.info("[PDFConverter] Converting:", url);
|
|
827
|
+
let originalError = null;
|
|
828
|
+
try {
|
|
829
|
+
await this.performConversion(
|
|
830
|
+
url,
|
|
831
|
+
reportId,
|
|
832
|
+
onComplete,
|
|
833
|
+
cleanupAfterCallback,
|
|
834
|
+
options,
|
|
835
|
+
abortSignal
|
|
836
|
+
);
|
|
837
|
+
return;
|
|
838
|
+
} catch (error) {
|
|
839
|
+
if (abortSignal?.aborted) {
|
|
840
|
+
throw error;
|
|
841
|
+
}
|
|
842
|
+
originalError = error;
|
|
843
|
+
this.logger.error("[PDFConverter] Conversion failed:", error);
|
|
844
|
+
if (!this.enableImagePdfFallback) {
|
|
845
|
+
throw error;
|
|
846
|
+
}
|
|
847
|
+
}
|
|
848
|
+
this.logger.info("[PDFConverter] Attempting image PDF fallback...");
|
|
849
|
+
const imagePdfConverter = new ImagePdfConverter(this.logger);
|
|
850
|
+
let imagePdfPath = null;
|
|
851
|
+
try {
|
|
852
|
+
imagePdfPath = await imagePdfConverter.convert(url, reportId);
|
|
853
|
+
const localUrl = `file://${imagePdfPath}`;
|
|
854
|
+
this.logger.info("[PDFConverter] Retrying with image PDF:", localUrl);
|
|
855
|
+
await this.performConversion(
|
|
856
|
+
localUrl,
|
|
857
|
+
reportId,
|
|
858
|
+
onComplete,
|
|
859
|
+
cleanupAfterCallback,
|
|
860
|
+
options,
|
|
861
|
+
abortSignal
|
|
862
|
+
);
|
|
863
|
+
this.logger.info("[PDFConverter] Fallback conversion succeeded");
|
|
864
|
+
} catch (fallbackError) {
|
|
865
|
+
this.logger.error(
|
|
866
|
+
"[PDFConverter] Fallback conversion also failed:",
|
|
867
|
+
fallbackError
|
|
868
|
+
);
|
|
869
|
+
throw new ImagePdfFallbackError(originalError, fallbackError);
|
|
870
|
+
} finally {
|
|
871
|
+
if (imagePdfPath) {
|
|
872
|
+
imagePdfConverter.cleanup(imagePdfPath);
|
|
873
|
+
}
|
|
874
|
+
}
|
|
875
|
+
}
|
|
876
|
+
async performConversion(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
|
|
877
|
+
const startTime = Date.now();
|
|
878
|
+
const conversionOptions = this.buildConversionOptions(options);
|
|
879
|
+
this.logger.info(
|
|
880
|
+
"[PDFConverter] Converting document with Async Source API..."
|
|
881
|
+
);
|
|
882
|
+
this.logger.info("[PDFConverter] Server will download from URL directly");
|
|
883
|
+
this.logger.info(
|
|
884
|
+
"[PDFConverter] Results will be returned as ZIP to avoid memory limits"
|
|
885
|
+
);
|
|
886
|
+
const { httpUrl, server } = await this.resolveUrl(url);
|
|
887
|
+
try {
|
|
888
|
+
const task = await this.startConversionTask(httpUrl, conversionOptions);
|
|
889
|
+
await this.trackTaskProgress(task);
|
|
890
|
+
if (abortSignal?.aborted) {
|
|
891
|
+
this.logger.info(
|
|
892
|
+
"[PDFConverter] Conversion aborted after docling completion"
|
|
893
|
+
);
|
|
894
|
+
const error = new Error("PDF conversion was aborted");
|
|
895
|
+
error.name = "AbortError";
|
|
896
|
+
throw error;
|
|
897
|
+
}
|
|
898
|
+
await this.downloadResult(task.taskId);
|
|
899
|
+
} finally {
|
|
900
|
+
if (server) {
|
|
901
|
+
this.logger.info("[PDFConverter] Stopping local file server...");
|
|
902
|
+
await server.stop();
|
|
903
|
+
}
|
|
904
|
+
}
|
|
905
|
+
const cwd = process.cwd();
|
|
906
|
+
const zipPath = (0, import_node_path5.join)(cwd, "result.zip");
|
|
907
|
+
const extractDir = (0, import_node_path5.join)(cwd, "result_extracted");
|
|
908
|
+
const outputDir = (0, import_node_path5.join)(cwd, "output", reportId);
|
|
909
|
+
try {
|
|
910
|
+
await this.processConvertedFiles(zipPath, extractDir, outputDir);
|
|
911
|
+
if (abortSignal?.aborted) {
|
|
912
|
+
this.logger.info("[PDFConverter] Conversion aborted before callback");
|
|
913
|
+
const error = new Error("PDF conversion was aborted");
|
|
914
|
+
error.name = "AbortError";
|
|
915
|
+
throw error;
|
|
916
|
+
}
|
|
917
|
+
this.logger.info("[PDFConverter] Executing completion callback...");
|
|
918
|
+
await onComplete(outputDir);
|
|
919
|
+
const duration = Date.now() - startTime;
|
|
920
|
+
this.logger.info("[PDFConverter] Conversion completed successfully!");
|
|
921
|
+
this.logger.info("[PDFConverter] Total time:", duration, "ms");
|
|
922
|
+
} finally {
|
|
923
|
+
this.logger.info("[PDFConverter] Cleaning up temporary files...");
|
|
924
|
+
if ((0, import_node_fs4.existsSync)(zipPath)) {
|
|
925
|
+
(0, import_node_fs4.rmSync)(zipPath, { force: true });
|
|
926
|
+
}
|
|
927
|
+
if ((0, import_node_fs4.existsSync)(extractDir)) {
|
|
928
|
+
(0, import_node_fs4.rmSync)(extractDir, { recursive: true, force: true });
|
|
929
|
+
}
|
|
930
|
+
if (cleanupAfterCallback) {
|
|
931
|
+
this.logger.info(
|
|
932
|
+
"[PDFConverter] Cleaning up output directory:",
|
|
933
|
+
outputDir
|
|
934
|
+
);
|
|
935
|
+
if ((0, import_node_fs4.existsSync)(outputDir)) {
|
|
936
|
+
(0, import_node_fs4.rmSync)(outputDir, { recursive: true, force: true });
|
|
937
|
+
}
|
|
938
|
+
} else {
|
|
939
|
+
this.logger.info("[PDFConverter] Output preserved at:", outputDir);
|
|
940
|
+
}
|
|
941
|
+
}
|
|
942
|
+
}
|
|
943
|
+
buildConversionOptions(options) {
|
|
944
|
+
return {
|
|
945
|
+
...(0, import_es_toolkit.omit)(options, ["num_threads"]),
|
|
946
|
+
to_formats: ["json", "html"],
|
|
947
|
+
image_export_mode: "embedded",
|
|
948
|
+
ocr_engine: "ocrmac",
|
|
949
|
+
generate_picture_images: true,
|
|
950
|
+
images_scale: 2,
|
|
951
|
+
/**
|
|
952
|
+
* While disabling this option yields the most accurate text extraction for readable PDFs,
|
|
953
|
+
* text layers overlaid on images or drawings can introduce noise when not merged properly.
|
|
954
|
+
* In practice, archaeological report PDFs almost always contain such overlapping cases.
|
|
955
|
+
* Enabling force_ocr mitigates this risk. Although OCR may introduce minor errors compared
|
|
956
|
+
* to direct text extraction, the accuracy remains high since the source is digital, not scanned paper.
|
|
957
|
+
*/
|
|
958
|
+
force_ocr: true,
|
|
959
|
+
accelerator_options: {
|
|
960
|
+
device: "mps",
|
|
961
|
+
num_threads: options.num_threads
|
|
962
|
+
}
|
|
963
|
+
};
|
|
964
|
+
}
|
|
965
|
+
async startConversionTask(url, conversionOptions) {
|
|
966
|
+
const task = await this.client.convertSourceAsync({
|
|
967
|
+
sources: [
|
|
968
|
+
{
|
|
969
|
+
kind: "http",
|
|
970
|
+
url
|
|
971
|
+
}
|
|
972
|
+
],
|
|
973
|
+
options: conversionOptions,
|
|
974
|
+
target: {
|
|
975
|
+
kind: "zip"
|
|
976
|
+
}
|
|
977
|
+
});
|
|
978
|
+
this.logger.info(`[PDFConverter] Task created: ${task.taskId}`);
|
|
979
|
+
this.logger.info("[PDFConverter] Polling for progress...");
|
|
980
|
+
return task;
|
|
981
|
+
}
|
|
982
|
+
/**
|
|
983
|
+
* Start a local file server for file:// URLs
|
|
984
|
+
*
|
|
985
|
+
* @param url URL to check (file:// or http://)
|
|
986
|
+
* @returns Object with httpUrl and optional server to stop later
|
|
987
|
+
*/
|
|
988
|
+
async resolveUrl(url) {
|
|
989
|
+
if (url.startsWith("file://")) {
|
|
990
|
+
const filePath = url.slice(7);
|
|
991
|
+
const server = new LocalFileServer();
|
|
992
|
+
const httpUrl = await server.start(filePath);
|
|
993
|
+
this.logger.info("[PDFConverter] Started local file server:", httpUrl);
|
|
994
|
+
return { httpUrl, server };
|
|
995
|
+
}
|
|
996
|
+
return { httpUrl: url };
|
|
997
|
+
}
|
|
998
|
+
async trackTaskProgress(task) {
|
|
999
|
+
const conversionStartTime = Date.now();
|
|
1000
|
+
let lastStatus = "";
|
|
1001
|
+
let isCompleted = false;
|
|
1002
|
+
const pollInterval = setInterval(() => {
|
|
1003
|
+
if (isCompleted) return;
|
|
1004
|
+
const elapsed = Math.floor((Date.now() - conversionStartTime) / 1e3);
|
|
1005
|
+
process.stdout.write(
|
|
1006
|
+
`\r[PDFConverter] Status: ${lastStatus || "processing"} (${elapsed}s elapsed)`
|
|
1007
|
+
);
|
|
1008
|
+
}, PDF_CONVERTER.POLL_INTERVAL_MS);
|
|
1009
|
+
task.on("progress", (status) => {
|
|
1010
|
+
lastStatus = status.task_status;
|
|
1011
|
+
if (status.task_position !== void 0) {
|
|
1012
|
+
process.stdout.write(
|
|
1013
|
+
`\r[PDFConverter] Status: ${status.task_status} (position: ${status.task_position})`
|
|
1014
|
+
);
|
|
1015
|
+
}
|
|
1016
|
+
});
|
|
1017
|
+
task.on("complete", () => {
|
|
1018
|
+
isCompleted = true;
|
|
1019
|
+
clearInterval(pollInterval);
|
|
1020
|
+
this.logger.info("\n[PDFConverter] Conversion completed!");
|
|
1021
|
+
});
|
|
1022
|
+
task.on("error", (error) => {
|
|
1023
|
+
isCompleted = true;
|
|
1024
|
+
clearInterval(pollInterval);
|
|
1025
|
+
this.logger.error("\n[PDFConverter] Conversion error:", error.message);
|
|
1026
|
+
});
|
|
1027
|
+
try {
|
|
1028
|
+
await task.waitForCompletion();
|
|
1029
|
+
} finally {
|
|
1030
|
+
isCompleted = true;
|
|
1031
|
+
clearInterval(pollInterval);
|
|
1032
|
+
}
|
|
1033
|
+
}
|
|
1034
|
+
async downloadResult(taskId) {
|
|
1035
|
+
this.logger.info(
|
|
1036
|
+
"\n[PDFConverter] Task completed, downloading ZIP file..."
|
|
1037
|
+
);
|
|
1038
|
+
const zipResult = await this.client.getTaskResultFile(taskId);
|
|
1039
|
+
if (!zipResult.success || !zipResult.fileStream) {
|
|
1040
|
+
throw new Error("Failed to get ZIP file result");
|
|
1041
|
+
}
|
|
1042
|
+
const zipPath = (0, import_node_path5.join)(process.cwd(), "result.zip");
|
|
1043
|
+
this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
|
|
1044
|
+
const writeStream = (0, import_node_fs4.createWriteStream)(zipPath);
|
|
1045
|
+
await (0, import_promises.pipeline)(zipResult.fileStream, writeStream);
|
|
1046
|
+
}
|
|
1047
|
+
async processConvertedFiles(zipPath, extractDir, outputDir) {
|
|
1048
|
+
await ImageExtractor.extractAndSaveDocumentsFromZip(
|
|
1049
|
+
this.logger,
|
|
1050
|
+
zipPath,
|
|
1051
|
+
extractDir,
|
|
1052
|
+
outputDir
|
|
1053
|
+
);
|
|
1054
|
+
}
|
|
1055
|
+
};
|
|
1056
|
+
|
|
1057
|
+
// src/core/pdf-parser.ts
|
|
1058
|
+
var PDFParser = class {
|
|
1059
|
+
logger;
|
|
1060
|
+
port;
|
|
1061
|
+
baseUrl;
|
|
1062
|
+
timeout;
|
|
1063
|
+
venvPath;
|
|
1064
|
+
killExistingProcess;
|
|
1065
|
+
enableImagePdfFallback;
|
|
1066
|
+
client = null;
|
|
1067
|
+
constructor(options) {
|
|
1068
|
+
const {
|
|
1069
|
+
logger,
|
|
1070
|
+
timeout = PDF_PARSER.DEFAULT_TIMEOUT_MS,
|
|
1071
|
+
venvPath,
|
|
1072
|
+
killExistingProcess = false,
|
|
1073
|
+
enableImagePdfFallback = false
|
|
1074
|
+
} = options;
|
|
1075
|
+
this.logger = logger;
|
|
1076
|
+
if ("baseUrl" in options) {
|
|
1077
|
+
this.baseUrl = options.baseUrl;
|
|
1078
|
+
this.port = void 0;
|
|
1079
|
+
} else {
|
|
1080
|
+
this.port = options.port;
|
|
1081
|
+
this.baseUrl = void 0;
|
|
1082
|
+
}
|
|
1083
|
+
this.timeout = timeout;
|
|
1084
|
+
this.venvPath = venvPath || (0, import_node_path6.join)(process.cwd(), ".venv");
|
|
1085
|
+
this.killExistingProcess = killExistingProcess;
|
|
1086
|
+
this.enableImagePdfFallback = enableImagePdfFallback;
|
|
1087
|
+
}
|
|
1088
|
+
async init() {
|
|
1089
|
+
this.logger.info("[PDFParser] Initializing...");
|
|
1090
|
+
this.checkOperatingSystem();
|
|
1091
|
+
this.checkJqInstalled();
|
|
1092
|
+
this.checkMacOSVersion();
|
|
1093
|
+
if (this.enableImagePdfFallback && !this.baseUrl) {
|
|
1094
|
+
this.checkImageMagickInstalled();
|
|
1095
|
+
this.checkGhostscriptInstalled();
|
|
1096
|
+
} else if (this.enableImagePdfFallback && this.baseUrl) {
|
|
1097
|
+
this.logger.warn(
|
|
1098
|
+
"[PDFParser] enableImagePdfFallback is ignored when using external server (baseUrl)"
|
|
1099
|
+
);
|
|
1100
|
+
}
|
|
1101
|
+
if (this.baseUrl) {
|
|
1102
|
+
this.logger.info("[PDFParser] Using external server:", this.baseUrl);
|
|
1103
|
+
this.client = new import_docling_sdk.Docling({
|
|
1104
|
+
api: { baseUrl: this.baseUrl, timeout: this.timeout }
|
|
1105
|
+
});
|
|
1106
|
+
await this.waitForServerReady();
|
|
1107
|
+
return;
|
|
1108
|
+
}
|
|
1109
|
+
this.logger.info("[PDFParser] Setting up local server...");
|
|
1110
|
+
try {
|
|
1111
|
+
const environment = new DoclingEnvironment({
|
|
1112
|
+
logger: this.logger,
|
|
1113
|
+
venvPath: this.venvPath,
|
|
1114
|
+
port: this.port,
|
|
1115
|
+
killExistingProcess: this.killExistingProcess
|
|
1116
|
+
});
|
|
1117
|
+
await environment.setup();
|
|
1118
|
+
const clientUrl = `http://localhost:${this.port}`;
|
|
1119
|
+
this.client = new import_docling_sdk.Docling({
|
|
1120
|
+
api: {
|
|
1121
|
+
baseUrl: clientUrl,
|
|
1122
|
+
timeout: this.timeout
|
|
1123
|
+
}
|
|
1124
|
+
});
|
|
1125
|
+
await this.waitForServerReady();
|
|
1126
|
+
this.logger.info("[PDFParser] Ready");
|
|
1127
|
+
} catch (error) {
|
|
1128
|
+
this.logger.error("[PDFParser] Initialization failed:", error);
|
|
1129
|
+
throw new Error(`Failed to initialize PDFParser: ${error}`);
|
|
1130
|
+
}
|
|
1131
|
+
}
|
|
1132
|
+
checkOperatingSystem() {
|
|
1133
|
+
if ((0, import_node_os2.platform)() !== "darwin") {
|
|
1134
|
+
throw new Error(
|
|
1135
|
+
"PDFParser is only supported on macOS. Current platform: " + (0, import_node_os2.platform)()
|
|
1136
|
+
);
|
|
1137
|
+
}
|
|
1138
|
+
}
|
|
1139
|
+
checkJqInstalled() {
|
|
1140
|
+
try {
|
|
1141
|
+
(0, import_node_child_process3.execSync)("which jq", { stdio: "ignore" });
|
|
1142
|
+
} catch {
|
|
1143
|
+
throw new Error(
|
|
1144
|
+
"jq is not installed. Please install jq using: brew install jq"
|
|
1145
|
+
);
|
|
1146
|
+
}
|
|
1147
|
+
}
|
|
1148
|
+
checkMacOSVersion() {
|
|
1149
|
+
try {
|
|
1150
|
+
const versionOutput = (0, import_node_child_process3.execSync)("sw_vers -productVersion", {
|
|
1151
|
+
encoding: "utf-8"
|
|
1152
|
+
}).trim();
|
|
1153
|
+
const versionMatch = versionOutput.match(/^(\d+)\.(\d+)/);
|
|
1154
|
+
if (versionMatch) {
|
|
1155
|
+
const major = parseInt(versionMatch[1]);
|
|
1156
|
+
const minor = parseInt(versionMatch[2]);
|
|
1157
|
+
if (major < 10 || major === 10 && minor < 15) {
|
|
1158
|
+
throw new Error(
|
|
1159
|
+
`macOS 10.15 or later is required. Current version: ${versionOutput}`
|
|
1160
|
+
);
|
|
1161
|
+
}
|
|
1162
|
+
}
|
|
1163
|
+
} catch (error) {
|
|
1164
|
+
if (error instanceof Error && error.message.includes("macOS 10.15")) {
|
|
1165
|
+
throw error;
|
|
1166
|
+
}
|
|
1167
|
+
throw new Error("Failed to check macOS version");
|
|
1168
|
+
}
|
|
1169
|
+
}
|
|
1170
|
+
checkImageMagickInstalled() {
|
|
1171
|
+
try {
|
|
1172
|
+
(0, import_node_child_process3.execSync)("which magick", { stdio: "ignore" });
|
|
1173
|
+
} catch {
|
|
1174
|
+
throw new Error(
|
|
1175
|
+
"ImageMagick is not installed but enableImagePdfFallback is enabled. Please install ImageMagick using: brew install imagemagick"
|
|
1176
|
+
);
|
|
1177
|
+
}
|
|
1178
|
+
}
|
|
1179
|
+
checkGhostscriptInstalled() {
|
|
1180
|
+
try {
|
|
1181
|
+
(0, import_node_child_process3.execSync)("which gs", { stdio: "ignore" });
|
|
1182
|
+
} catch {
|
|
1183
|
+
throw new Error(
|
|
1184
|
+
"Ghostscript is not installed but enableImagePdfFallback is enabled. Please install Ghostscript using: brew install ghostscript"
|
|
1185
|
+
);
|
|
1186
|
+
}
|
|
1187
|
+
}
|
|
1188
|
+
/**
|
|
1189
|
+
* Check if an error is a connection refused error (ECONNREFUSED).
|
|
1190
|
+
* This typically indicates the Docling server has crashed.
|
|
1191
|
+
*/
|
|
1192
|
+
isConnectionRefusedError(error) {
|
|
1193
|
+
if (error instanceof Error) {
|
|
1194
|
+
const errorStr = JSON.stringify(error);
|
|
1195
|
+
return errorStr.includes("ECONNREFUSED");
|
|
1196
|
+
}
|
|
1197
|
+
return false;
|
|
1198
|
+
}
|
|
1199
|
+
/**
|
|
1200
|
+
* Restart the Docling server after it has crashed.
|
|
1201
|
+
* This kills any existing process on the port, starts a new server,
|
|
1202
|
+
* and waits for it to become ready.
|
|
1203
|
+
*
|
|
1204
|
+
* Note: This method is only called when canRecover is true,
|
|
1205
|
+
* which guarantees this.port is defined.
|
|
1206
|
+
*/
|
|
1207
|
+
async restartServer() {
|
|
1208
|
+
this.logger.info("[PDFParser] Restarting server...");
|
|
1209
|
+
await DoclingEnvironment.killProcessOnPort(this.logger, this.port);
|
|
1210
|
+
const environment = new DoclingEnvironment({
|
|
1211
|
+
logger: this.logger,
|
|
1212
|
+
venvPath: this.venvPath,
|
|
1213
|
+
port: this.port,
|
|
1214
|
+
killExistingProcess: false
|
|
1215
|
+
// Already killed above
|
|
1216
|
+
});
|
|
1217
|
+
await environment.startServer();
|
|
1218
|
+
this.client?.destroy();
|
|
1219
|
+
this.client = new import_docling_sdk.Docling({
|
|
1220
|
+
api: {
|
|
1221
|
+
baseUrl: `http://localhost:${this.port}`,
|
|
1222
|
+
timeout: this.timeout
|
|
1223
|
+
}
|
|
1224
|
+
});
|
|
1225
|
+
await this.waitForServerReady();
|
|
1226
|
+
this.logger.info("[PDFParser] Server restarted successfully");
|
|
1227
|
+
}
|
|
1228
|
+
async waitForServerReady() {
|
|
1229
|
+
const maxAttempts = PDF_PARSER.MAX_HEALTH_CHECK_ATTEMPTS;
|
|
1230
|
+
const checkInterval = PDF_PARSER.HEALTH_CHECK_INTERVAL_MS;
|
|
1231
|
+
const logInterval = PDF_PARSER.HEALTH_CHECK_LOG_INTERVAL_MS;
|
|
1232
|
+
let lastLogTime = 0;
|
|
1233
|
+
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
|
|
1234
|
+
try {
|
|
1235
|
+
await this.client.health();
|
|
1236
|
+
this.logger.info("[PDFParser] Server is ready");
|
|
1237
|
+
return;
|
|
1238
|
+
} catch {
|
|
1239
|
+
const now = Date.now();
|
|
1240
|
+
if (now - lastLogTime >= logInterval) {
|
|
1241
|
+
this.logger.info(
|
|
1242
|
+
"[PDFParser] Waiting for server... (attempt",
|
|
1243
|
+
attempt,
|
|
1244
|
+
"/",
|
|
1245
|
+
maxAttempts,
|
|
1246
|
+
")"
|
|
1247
|
+
);
|
|
1248
|
+
lastLogTime = now;
|
|
1249
|
+
}
|
|
1250
|
+
if (attempt < maxAttempts) {
|
|
1251
|
+
await new Promise((resolve) => setTimeout(resolve, checkInterval));
|
|
1252
|
+
}
|
|
1253
|
+
}
|
|
1254
|
+
}
|
|
1255
|
+
throw new Error("Server failed to become ready after maximum attempts");
|
|
1256
|
+
}
|
|
1257
|
+
async parse(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
|
|
1258
|
+
if (!this.client) {
|
|
1259
|
+
throw new Error(
|
|
1260
|
+
"PDFParser is not initialized. Call init() before using parse()"
|
|
1261
|
+
);
|
|
1262
|
+
}
|
|
1263
|
+
const canRecover = !this.baseUrl && this.port !== void 0;
|
|
1264
|
+
const maxAttempts = PDF_PARSER.MAX_SERVER_RECOVERY_ATTEMPTS;
|
|
1265
|
+
let attempt = 0;
|
|
1266
|
+
while (attempt <= maxAttempts) {
|
|
1267
|
+
try {
|
|
1268
|
+
const effectiveFallbackEnabled = this.enableImagePdfFallback && !this.baseUrl;
|
|
1269
|
+
const converter = new PDFConverter(
|
|
1270
|
+
this.logger,
|
|
1271
|
+
this.client,
|
|
1272
|
+
effectiveFallbackEnabled
|
|
1273
|
+
);
|
|
1274
|
+
return await converter.convert(
|
|
1275
|
+
url,
|
|
1276
|
+
reportId,
|
|
1277
|
+
onComplete,
|
|
1278
|
+
cleanupAfterCallback,
|
|
1279
|
+
options,
|
|
1280
|
+
abortSignal
|
|
1281
|
+
);
|
|
1282
|
+
} catch (error) {
|
|
1283
|
+
if (abortSignal?.aborted) {
|
|
1284
|
+
throw error;
|
|
1285
|
+
}
|
|
1286
|
+
if (canRecover && this.isConnectionRefusedError(error) && attempt < maxAttempts) {
|
|
1287
|
+
this.logger.warn(
|
|
1288
|
+
"[PDFParser] Connection refused, attempting server recovery..."
|
|
1289
|
+
);
|
|
1290
|
+
await this.restartServer();
|
|
1291
|
+
attempt++;
|
|
1292
|
+
continue;
|
|
1293
|
+
}
|
|
1294
|
+
throw error;
|
|
1295
|
+
}
|
|
1296
|
+
}
|
|
1297
|
+
}
|
|
1298
|
+
/**
|
|
1299
|
+
* Dispose the parser instance.
|
|
1300
|
+
* - Sets the internal client to null
|
|
1301
|
+
* - If a local docling server was started (no baseUrl), kills the process on the configured port
|
|
1302
|
+
*/
|
|
1303
|
+
async dispose() {
|
|
1304
|
+
this.logger.info("[PDFParser] Disposing...");
|
|
1305
|
+
try {
|
|
1306
|
+
if (!this.baseUrl && this.port) {
|
|
1307
|
+
await DoclingEnvironment.killProcessOnPort(this.logger, this.port);
|
|
1308
|
+
}
|
|
1309
|
+
} catch (error) {
|
|
1310
|
+
this.logger.error("[PDFParser] Error while disposing:", error);
|
|
1311
|
+
} finally {
|
|
1312
|
+
this.client?.destroy();
|
|
1313
|
+
this.client = null;
|
|
1314
|
+
this.logger.info("[PDFParser] Disposed");
|
|
1315
|
+
}
|
|
1316
|
+
}
|
|
1317
|
+
};
|
|
1318
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
1319
|
+
0 && (module.exports = {
|
|
1320
|
+
ImagePdfFallbackError,
|
|
1321
|
+
PDFParser
|
|
1322
|
+
});
|
|
1323
|
+
//# sourceMappingURL=index.cjs.map
|