@heripo/pdf-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,1323 @@
1
"use strict";
// --- esbuild CommonJS bundle preamble -------------------------------------
// Standard interop helpers emitted by esbuild when bundling ESM sources to
// CommonJS. Do not edit by hand; regenerated on every build.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Define lazy, enumerable getters on `target` for every key in `all`
// (used to wire up the module's named exports).
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
// Copy own properties of `from` onto `to` as getters, skipping `except` and
// any key already present on `to`; preserves the source's enumerability.
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
// Wrap a required CommonJS module so it can be consumed like an ES module
// namespace object (adds a synthetic `default` when `__esModule` is not set).
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
  mod
));
// Mark an export object as an ES module and copy the bundle's exports onto it.
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
+
30
// src/index.ts
// Public package surface: only ImagePdfFallbackError and PDFParser are
// exported; everything else in this bundle is internal.
var src_exports = {};
__export(src_exports, {
  ImagePdfFallbackError: () => ImagePdfFallbackError,
  PDFParser: () => PDFParser
});
module.exports = __toCommonJS(src_exports);
37
+
38
+ // src/core/pdf-parser.ts
39
+ var import_docling_sdk = require("docling-sdk");
40
+ var import_node_child_process3 = require("child_process");
41
+ var import_node_os2 = require("os");
42
+ var import_node_path6 = require("path");
43
+
44
+ // src/config/constants.ts
45
+ var PDF_PARSER = {
46
+ /**
47
+ * Default timeout for API calls in milliseconds
48
+ */
49
+ DEFAULT_TIMEOUT_MS: 1e5,
50
+ /**
51
+ * Maximum number of health check attempts before giving up
52
+ */
53
+ MAX_HEALTH_CHECK_ATTEMPTS: 60,
54
+ /**
55
+ * Interval between health check attempts in milliseconds
56
+ */
57
+ HEALTH_CHECK_INTERVAL_MS: 2e3,
58
+ /**
59
+ * Interval between log messages during health check in milliseconds
60
+ */
61
+ HEALTH_CHECK_LOG_INTERVAL_MS: 5e3,
62
+ /**
63
+ * Maximum retry attempts for server recovery on ECONNREFUSED
64
+ */
65
+ MAX_SERVER_RECOVERY_ATTEMPTS: 1
66
+ };
67
+ var PDF_CONVERTER = {
68
+ /**
69
+ * Interval for progress polling in milliseconds
70
+ */
71
+ POLL_INTERVAL_MS: 1e3
72
+ };
73
+ var DOCLING_ENVIRONMENT = {
74
+ /**
75
+ * Delay after starting docling-serve to allow startup
76
+ */
77
+ STARTUP_DELAY_MS: 2e3
78
+ };
79
+ var IMAGE_PDF_CONVERTER = {
80
+ /**
81
+ * ImageMagick density option (DPI) for PDF to image conversion
82
+ */
83
+ DENSITY: 300,
84
+ /**
85
+ * ImageMagick quality option (1-100)
86
+ */
87
+ QUALITY: 100
88
+ };
89
+
90
+ // ../shared/dist/index.mjs
91
+ var import_child_process = require("child_process");
92
/**
 * Promise-based wrapper around child_process.spawn.
 *
 * Resolves with { stdout, stderr, code } once the child process closes;
 * rejects only when the process fails to spawn at all. Output capture can be
 * disabled per stream via `captureStdout` / `captureStderr`; every other
 * option is forwarded to spawn() unchanged.
 */
function spawnAsync(command, args, options = {}) {
  const { captureStdout = true, captureStderr = true, ...spawnOptions } = options;
  return new Promise((resolve, reject) => {
    const child = (0, import_child_process.spawn)(command, args, spawnOptions);
    const output = { stdout: "", stderr: "" };
    // Attach a collector to one of the child's output streams when enabled.
    const collect = (stream, key, enabled) => {
      if (enabled && stream) {
        stream.on("data", (data) => {
          output[key] += data.toString();
        });
      }
    };
    collect(child.stdout, "stdout", captureStdout);
    collect(child.stderr, "stderr", captureStderr);
    child.on("error", reject);
    child.on("close", (code) => {
      // `code` is null when the child was terminated by a signal; report 0.
      resolve({ stdout: output.stdout, stderr: output.stderr, code: code ?? 0 });
    });
  });
}
118
+
119
+ // src/environment/docling-environment.ts
120
+ var import_node_child_process = require("child_process");
121
+ var import_node_path = require("path");
122
+
123
// src/utils/python-version.ts
// Matches "Python <major>.<minor>" anywhere in `python --version` output
// (the interpreter may print to stdout or stderr depending on version).
var PYTHON_VERSION_REGEX = /Python (\d+)\.(\d+)/;
// Minimum supported interpreter version; error messages derive from this.
var MIN_PYTHON_VERSION = { major: 3, minor: 9 };
// Dedicated error type so callers can distinguish version problems from
// other environment failures.
var PythonVersionError = class extends Error {
  constructor(message) {
    super(message);
    this.name = "PythonVersionError";
  }
};
/**
 * Parse a `python --version` output string.
 *
 * @param output Combined stdout/stderr text from the interpreter.
 * @returns {{major, minor, versionString}} or null when no version is found.
 */
function parsePythonVersion(output) {
  const match = output.match(PYTHON_VERSION_REGEX);
  if (!match) return null;
  // Explicit radix: never rely on parseInt's legacy prefix heuristics.
  const major = parseInt(match[1], 10);
  const minor = parseInt(match[2], 10);
  return {
    major,
    minor,
    versionString: `${major}.${minor}`
  };
}
/**
 * Validate that a parsed Python version can run docling-serve.
 *
 * @param version Parsed { major, minor } version.
 * @param context "system" or "venv" — only changes the error-message prefix.
 * @throws {PythonVersionError} when the version is too new (3.13+) or below
 *   MIN_PYTHON_VERSION.
 */
function validatePythonVersion(version, context = "system") {
  const { major, minor } = version;
  const prefix = context === "venv" ? "Venv Python" : "Python";
  if (major === 3 && minor >= 13) {
    throw new PythonVersionError(
      `${prefix} ${major}.${minor} is too new. docling-serve requires Python 3.11 or 3.12.`
    );
  }
  if (major !== 3 || minor < MIN_PYTHON_VERSION.minor) {
    // Message interpolates MIN_PYTHON_VERSION so it cannot drift from the
    // constant it enforces.
    throw new PythonVersionError(
      `Python ${MIN_PYTHON_VERSION.major}.${MIN_PYTHON_VERSION.minor} or higher is required`
    );
  }
}
155
+
156
// src/environment/docling-environment.ts
/**
 * Provisions and manages a local Python virtual environment that runs the
 * docling-serve HTTP server.
 *
 * setup() runs the full pipeline sequentially (each step depends on the
 * previous one): version check → venv creation → pip/setuptools/pyarrow/
 * docling-serve installs → start (or reuse) the server on `port`.
 */
var DoclingEnvironment = class _DoclingEnvironment {
  logger;
  venvPath;
  port;
  killExistingProcess;
  constructor(options) {
    this.logger = options.logger;
    this.venvPath = options.venvPath;
    this.port = options.port;
    this.killExistingProcess = options.killExistingProcess;
  }
  /**
   * Full environment setup. When the port is already in use and
   * killExistingProcess is false, the running server is reused instead of
   * spawning a new one.
   */
  async setup() {
    this.logger.info("[DoclingEnvironment] Setting up Python environment...");
    await this.checkPythonVersion();
    await this.setupPythonEnvironment();
    await this.upgradePip();
    await this.installSetuptools();
    await this.installPyArrow();
    await this.installDoclingServe();
    const portInUse = await this.isPortInUse(this.port);
    if (portInUse && !this.killExistingProcess) {
      this.logger.info(
        "[DoclingEnvironment] Reusing existing server on port",
        this.port
      );
    } else {
      await this.startDoclingServe();
    }
    this.logger.info("[DoclingEnvironment] Setup completed");
  }
  /**
   * Check the system `python3` version and validate it for docling-serve.
   * @returns the parsed version.
   * @throws when python3 is missing, unparsable, or unsupported.
   */
  async checkPythonVersion() {
    const result = await spawnAsync("python3", ["--version"]);
    if (result.code !== 0) {
      throw new Error("Failed to check Python version");
    }
    // Older Pythons print the version to stderr, newer ones to stdout.
    const output = result.stdout + result.stderr;
    const version = parsePythonVersion(output);
    if (!version) {
      throw new Error("Could not parse Python version");
    }
    this.logger.info(
      "[DoclingEnvironment] Python version:",
      version.versionString
    );
    try {
      validatePythonVersion(version, "system");
    } catch (error) {
      // Give an actionable hint for the common "Python too new" failure.
      if (error instanceof PythonVersionError && version.minor >= 13) {
        this.logger.error(
          "[DoclingEnvironment] Python 3.13+ is not compatible. Install 3.11 or 3.12 with: pyenv install 3.12.0 && pyenv global 3.12.0"
        );
      }
      throw error;
    }
    return version;
  }
  // Create the venv (idempotent: `python3 -m venv` tolerates an existing dir)
  // and re-validate the interpreter actually inside it.
  async setupPythonEnvironment() {
    const result = await spawnAsync("python3", ["-m", "venv", this.venvPath]);
    if (result.code !== 0) {
      throw new Error("Failed to create Python virtual environment");
    }
    await this.verifyVenvPythonVersion();
  }
  // The venv may have been created earlier with a different interpreter than
  // the current system python3, so check the venv's binary directly.
  async verifyVenvPythonVersion() {
    const pythonPath = (0, import_node_path.join)(this.venvPath, "bin", "python");
    const result = await spawnAsync(pythonPath, ["--version"]);
    if (result.code !== 0) {
      throw new Error("Failed to verify venv Python version");
    }
    const output = result.stdout + result.stderr;
    const version = parsePythonVersion(output);
    if (!version) {
      throw new Error("Could not parse venv Python version");
    }
    validatePythonVersion(version, "venv");
  }
  // Upgrade pip inside the venv; stale pip frequently breaks wheel resolution.
  async upgradePip() {
    const pipPath = (0, import_node_path.join)(this.venvPath, "bin", "pip");
    const result = await spawnAsync(pipPath, ["install", "--upgrade", "pip"]);
    if (result.code !== 0) {
      this.logger.error(
        "[DoclingEnvironment] Failed to upgrade pip:",
        result.stderr
      );
      throw new Error(`Failed to upgrade pip. Exit code: ${result.code}`);
    }
  }
  // setuptools + wheel are needed by some transitive source builds.
  async installSetuptools() {
    const pipPath = (0, import_node_path.join)(this.venvPath, "bin", "pip");
    const result = await spawnAsync(pipPath, [
      "install",
      "--upgrade",
      "setuptools",
      "wheel"
    ]);
    if (result.code !== 0) {
      this.logger.error(
        "[DoclingEnvironment] Failed to install setuptools:",
        result.stderr
      );
      throw new Error(
        `Failed to install setuptools. Exit code: ${result.code}`
      );
    }
  }
  // Force a binary wheel for pyarrow; building it from source is slow and
  // fails on many machines.
  async installPyArrow() {
    const pipPath = (0, import_node_path.join)(this.venvPath, "bin", "pip");
    const result = await spawnAsync(pipPath, [
      "install",
      "--only-binary",
      ":all:",
      "pyarrow"
    ]);
    if (result.code !== 0) {
      this.logger.error(
        "[DoclingEnvironment] Failed to install pyarrow:",
        result.stderr
      );
      throw new Error(`Failed to install pyarrow. Exit code: ${result.code}`);
    }
  }
  async installDoclingServe() {
    const pipPath = (0, import_node_path.join)(this.venvPath, "bin", "pip");
    const result = await spawnAsync(pipPath, ["install", "docling-serve"]);
    if (result.code !== 0) {
      this.logger.error(
        "[DoclingEnvironment] Failed to install docling-serve:",
        result.stderr
      );
      throw new Error(
        `Failed to install docling-serve. Exit code: ${result.code}`
      );
    }
  }
  // Best-effort port probe via lsof; treats a missing lsof binary as "free".
  async isPortInUse(port) {
    try {
      const result = await spawnAsync("lsof", ["-ti", `:${port}`]);
      return result.code === 0 && !!result.stdout.trim();
    } catch {
      return false;
    }
  }
  /**
   * Start the docling-serve server without running full setup.
   * Useful for restarting the server after it has crashed.
   */
  async startServer() {
    await this.startDoclingServe();
  }
  // Process-killing logic is provided as a static method to allow reuse without instantiation
  // Always resolves (never rejects): failures to kill are logged, not thrown.
  static async killProcessOnPort(logger, port) {
    return new Promise((resolve) => {
      const lsof = (0, import_node_child_process.spawn)("lsof", ["-ti", `:${port}`]);
      const pids = [];
      lsof.stdout?.on("data", (data) => {
        const txt = data.toString();
        pids.push(
          ...txt.split(/\s+/).map((s) => s.trim()).filter(Boolean)
        );
      });
      lsof.on("close", () => {
        if (pids.length === 0) return resolve();
        let remaining = pids.length;
        // Resolve only after every kill attempt has settled.
        const done = () => {
          if (--remaining <= 0) resolve();
        };
        logger.info(
          "[DoclingEnvironment] Killing process",
          pids.join(", "),
          "on port",
          port
        );
        for (const pid of pids) {
          const killProc = (0, import_node_child_process.spawn)("kill", ["-9", pid]);
          killProc.on("close", (killCode) => {
            if (killCode !== 0) {
              logger.info("[DoclingEnvironment] Failed to kill process", pid);
            }
            done();
          });
          killProc.on("error", (err) => {
            logger.info("[DoclingEnvironment] Failed to kill process", err);
            done();
          });
        }
      });
      lsof.on("error", () => resolve());
    });
  }
  // Spawn docling-serve detached and wait a fixed grace period for startup.
  // Rewritten to avoid the async-promise-executor anti-pattern: the awaits
  // happen in the method body, and only the spawn/timeout adaptation lives
  // inside a plain Promise executor, so no rejection can be silently dropped.
  async startDoclingServe() {
    if (this.killExistingProcess) {
      await _DoclingEnvironment.killProcessOnPort(this.logger, this.port);
    }
    const venvPath = this.venvPath;
    const doclingServePath = (0, import_node_path.join)(venvPath, "bin", "docling-serve");
    const args = ["run", "--port", this.port.toString()];
    this.logger.info(
      "[DoclingEnvironment] Starting docling-serve on port",
      this.port
    );
    await new Promise((resolve, reject) => {
      const doclingProcess = (0, import_node_child_process.spawn)(doclingServePath, args, {
        detached: true,
        // Detached from parent process
        stdio: "ignore"
        // Remove stdio pipes to prevent event loop from hanging
      });
      doclingProcess.unref();
      doclingProcess.on("error", (error) => {
        this.logger.error("[DoclingEnvironment] docling-serve error:", error);
        reject(error);
      });
      // No readiness signal is available here; resolve after a fixed delay.
      // A late "error" after this resolves is a no-op, as in the original.
      setTimeout(() => {
        resolve();
      }, DOCLING_ENVIRONMENT.STARTUP_DELAY_MS);
    });
  }
};
375
+
376
+ // src/core/pdf-converter.ts
377
+ var import_es_toolkit = require("es-toolkit");
378
+ var import_node_fs4 = require("fs");
379
+ var import_node_path5 = require("path");
380
+ var import_promises = require("stream/promises");
381
+
382
// src/errors/image-pdf-fallback-error.ts
/**
 * Raised when both the primary PDF conversion and the image-PDF fallback
 * conversion fail. Both underlying errors are kept so callers can report
 * or branch on either failure.
 */
var ImagePdfFallbackError = class extends Error {
  name = "ImagePdfFallbackError";
  constructor(originalError, fallbackError) {
    const message = `PDF conversion failed with fallback. Original: ${originalError.message}. Fallback: ${fallbackError.message}`;
    super(message);
    this.originalError = originalError;
    this.fallbackError = fallbackError;
  }
};
393
+
394
+ // src/processors/image-extractor.ts
395
+ var import_node_fs = require("fs");
396
+ var import_node_path2 = require("path");
397
+ var yauzl = __toESM(require("yauzl"), 1);
398
+
399
// src/utils/jq.ts
var import_node_child_process2 = require("child_process");
// Resolve the jq binary: a non-empty JQ_PATH env var wins, otherwise rely on
// PATH lookup for plain "jq".
function getJqPath() {
  const p = process.env.JQ_PATH?.trim();
  return p && p.length > 0 ? p : "jq";
}
/**
 * Run a jq program against a JSON file and parse jq's stdout as JSON.
 *
 * jq is used instead of JSON.parse so very large documents never have to be
 * loaded into Node.js memory in raw form.
 *
 * Rejects when jq cannot be spawned, exits non-zero, or produces
 * non-JSON output.
 */
function runJqFileJson(program, filePath) {
  return new Promise((resolve, reject) => {
    const jqPath = getJqPath();
    const args = [
      "-c",
      // compact output (single line when possible)
      program,
      filePath
    ];
    const child = (0, import_node_child_process2.spawn)(jqPath, args, {
      stdio: ["ignore", "pipe", "pipe"],
      env: process.env
    });
    let stdout = "";
    let stderr = "";
    child.stdout.setEncoding("utf-8");
    child.stderr.setEncoding("utf-8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", (err) => {
      reject(err);
    });
    child.on("close", (code) => {
      if (code !== 0) {
        const error = new Error(
          `jq exited with code ${code}. ${stderr ? "Stderr: " + stderr : ""}`
        );
        return reject(error);
      }
      try {
        const text = stdout.trim();
        const parsed = JSON.parse(text);
        resolve(parsed);
      } catch (e) {
        reject(
          new Error(
            `Failed to parse jq output as JSON. Output length=${stdout.length}. Error: ${e.message}`
          )
        );
      }
    });
  });
}
// Collect every string anywhere in the document (recursive descent `..`)
// that is a base64 PNG data URI; resolves to an array of those strings.
function jqExtractBase64PngStrings(filePath) {
  const program = `
  [
    .. |
    select(type == "string" and startswith("data:image/png;base64"))
  ]
  `;
  return runJqFileJson(program, filePath);
}
// Rewrite every base64 PNG data URI in the document to the relative path
// "<dirName>/<prefix>_<idx>.png", numbering values in document order via a
// reduce over paths(); resolves to { data: transformedDoc, count: n }.
// Note: `\\($idx)` escapes to jq string interpolation \($idx).
function jqReplaceBase64WithPaths(filePath, dirName, prefix) {
  const program = `
  reduce paths(type == "string" and startswith("data:image/png;base64")) as $p (
    {data: ., counter: 0};
    .counter as $idx |
    .data |= setpath($p; "${dirName}/${prefix}_\\($idx).png") |
    .counter += 1
  ) | {data: .data, count: .counter}
  `;
  return runJqFileJson(program, filePath);
}
472
+
473
// src/processors/image-extractor.ts
/**
 * Static helpers for unpacking docling-serve result ZIPs and moving inline
 * base64 images out of the JSON/HTML documents into standalone PNG files.
 */
var ImageExtractor = class _ImageExtractor {
  /**
   * Extract a ZIP file to a target directory
   */
  static async extractZip(zipPath, targetDir) {
    return new Promise((resolve, reject) => {
      yauzl.open(zipPath, { lazyEntries: true }, (err, zipfile) => {
        if (err || !zipfile) {
          reject(err || new Error("Failed to open zip file"));
          return;
        }
        zipfile.readEntry();
        zipfile.on("entry", (entry) => {
          // NOTE(review): entry.fileName is joined without sanitization; a
          // crafted archive could escape targetDir (zip-slip). Confirm ZIPs
          // only ever come from the local docling-serve instance.
          const entryPath = (0, import_node_path2.join)(targetDir, entry.fileName);
          if (/\/$/.test(entry.fileName)) {
            // Trailing slash marks a directory entry.
            (0, import_node_fs.mkdirSync)(entryPath, { recursive: true });
            zipfile.readEntry();
          } else {
            zipfile.openReadStream(entry, (err2, readStream) => {
              if (err2 || !readStream) {
                reject(err2 || new Error("Failed to open read stream"));
                return;
              }
              // Ensure the parent directory exists before writing the file.
              (0, import_node_fs.mkdirSync)((0, import_node_path2.join)(entryPath, ".."), { recursive: true });
              const writeStream = (0, import_node_fs.createWriteStream)(entryPath);
              readStream.pipe(writeStream);
              writeStream.on("finish", () => {
                zipfile.readEntry();
              });
              writeStream.on("error", reject);
            });
          }
        });
        zipfile.on("end", () => {
          resolve();
        });
        zipfile.on("error", reject);
      });
    });
  }
  /**
   * Extract base64 images from JSON file using jq (for large files)
   * Returns array of base64 data strings
   */
  static async extractBase64ImagesFromJsonWithJq(jsonPath) {
    return jqExtractBase64PngStrings(jsonPath);
  }
  /**
   * Replace base64 images with file paths in JSON using jq (for large files)
   * Uses reduce to maintain counter state while walking the JSON
   *
   * @returns number of replacements performed
   */
  static async replaceBase64ImagesInJsonWithJq(jsonPath, outputPath, dirName, prefix) {
    const { data, count } = await jqReplaceBase64WithPaths(
      jsonPath,
      dirName,
      prefix
    );
    (0, import_node_fs.writeFileSync)(outputPath, JSON.stringify(data, null, 2), "utf-8");
    return count;
  }
  /**
   * Extract a base64-encoded image to a file and return the relative path
   * ("<dirName>/<prefix>_<index>.png"). Tolerates input with or without the
   * data-URI prefix.
   */
  static extractBase64ImageToFile(base64Data, imagesDir, index, prefix, dirName) {
    const PREFIX = "data:image/png;base64,";
    const base64Content = base64Data.startsWith(PREFIX) ? base64Data.slice(PREFIX.length) : base64Data;
    const filename = `${prefix}_${index}.png`;
    const filepath = (0, import_node_path2.join)(imagesDir, filename);
    const buffer = Buffer.from(base64Content, "base64");
    (0, import_node_fs.writeFileSync)(filepath, buffer);
    // Fix: return the relative path of the file just written. The published
    // artifact had this interpolation garbled ("$(unknown)"); it must match
    // the filename used by jqReplaceBase64WithPaths ("<dirName>/<prefix>_<idx>.png").
    return `${dirName}/${filename}`;
  }
  /**
   * Save JSON and HTML documents with base64 images extracted to separate files
   * Uses jq for JSON processing to handle large files
   *
   * This method:
   * 1. Extracts base64-encoded images from JSON and HTML content
   * 2. Saves images as separate PNG files
   * 3. Replaces base64 data with relative file paths
   * 4. Saves the transformed documents to the output directory
   */
  static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlContent) {
    // Best-effort wipe of any previous output for this document.
    try {
      if ((0, import_node_fs.existsSync)(outputDir)) {
        (0, import_node_fs.rmSync)(outputDir, { recursive: true, force: true });
      }
    } catch (e) {
      logger.warn("[PDFConverter] Failed to clear output directory:", e);
    }
    (0, import_node_fs.mkdirSync)(outputDir, { recursive: true });
    const baseName = filename.replace((0, import_node_path2.extname)(filename), "");
    const jsonPath = (0, import_node_path2.join)(outputDir, `${baseName}.json`);
    // --- JSON: page images go to <outputDir>/pages, referenced as "pages/page_<i>.png"
    try {
      const pagesDir = (0, import_node_path2.join)(outputDir, "pages");
      if (!(0, import_node_fs.existsSync)(pagesDir)) {
        (0, import_node_fs.mkdirSync)(pagesDir, { recursive: true });
      }
      const base64Images = await _ImageExtractor.extractBase64ImagesFromJsonWithJq(jsonSourcePath);
      base64Images.forEach((base64Data, index) => {
        _ImageExtractor.extractBase64ImageToFile(
          base64Data,
          pagesDir,
          index,
          "page",
          "pages"
        );
      });
      logger.info(
        `[PDFConverter] Extracted ${base64Images.length} images from JSON to ${pagesDir}`
      );
      const replacedCount = await _ImageExtractor.replaceBase64ImagesInJsonWithJq(
        jsonSourcePath,
        jsonPath,
        "pages",
        "page"
      );
      logger.info(
        `[PDFConverter] Replaced ${replacedCount} base64 images with file paths`
      );
    } catch (e) {
      // JSON extraction failures are fatal: the JSON document is the primary output.
      logger.warn(
        "[PDFConverter] Failed to extract images from JSON using jq. Error:",
        e
      );
      throw e;
    }
    logger.info("[PDFConverter] Saved JSON:", jsonPath);
    // --- HTML: inline images go to <outputDir>/images, rewritten via regex.
    const htmlPath = (0, import_node_path2.join)(outputDir, `${baseName}.html`);
    try {
      const imagesDir = (0, import_node_path2.join)(outputDir, "images");
      if (!(0, import_node_fs.existsSync)(imagesDir)) {
        (0, import_node_fs.mkdirSync)(imagesDir, { recursive: true });
      }
      let imageIndex = 0;
      const transformedHtml = htmlContent.replace(
        /src="data:image\/png;base64,([^"]+)"/g,
        (_, base64Content) => {
          const filename2 = `image_${imageIndex}.png`;
          const filepath = (0, import_node_path2.join)(imagesDir, filename2);
          const buffer = Buffer.from(base64Content, "base64");
          (0, import_node_fs.writeFileSync)(filepath, buffer);
          const relativePath = `images/${filename2}`;
          imageIndex += 1;
          return `src="${relativePath}"`;
        }
      );
      logger.info(
        `[PDFConverter] Extracted ${imageIndex} images from HTML to ${imagesDir}`
      );
      (0, import_node_fs.writeFileSync)(htmlPath, transformedHtml, "utf-8");
    } catch (e) {
      // HTML is secondary output: fall back to writing it untransformed.
      logger.warn(
        "[PDFConverter] Failed to extract images from HTML, writing original. Error:",
        e
      );
      (0, import_node_fs.writeFileSync)(htmlPath, htmlContent, "utf-8");
    }
    logger.info("[PDFConverter] Saved HTML:", htmlPath);
  }
  /**
   * Extract documents from ZIP and save with extracted images
   * Uses jq for JSON processing to handle large files without loading into Node.js memory
   *
   * Complete workflow:
   * 1. Extract ZIP file to temporary directory
   * 2. Find JSON and HTML files from extracted files
   * 3. Use jq to extract base64 images from JSON and save as separate files
   * 4. Use jq to replace base64 with file paths in JSON
   * 5. Process HTML with regex to extract and replace images
   * 6. Save transformed documents to output directory (as result.json and result.html)
   */
  static async extractAndSaveDocumentsFromZip(logger, zipPath, extractDir, outputDir) {
    logger.info("[PDFConverter] Extracting ZIP file...");
    await _ImageExtractor.extractZip(zipPath, extractDir);
    const files = (0, import_node_fs.readdirSync)(extractDir);
    const jsonFile = files.find((f) => (0, import_node_path2.extname)(f).toLowerCase() === ".json");
    const htmlFile = files.find((f) => (0, import_node_path2.extname)(f).toLowerCase() === ".html");
    if (!jsonFile || !htmlFile) {
      throw new Error(
        `Expected one JSON and one HTML file in extracted directory. Found: ${files.join(", ")}`
      );
    }
    const jsonPath = (0, import_node_path2.join)(extractDir, jsonFile);
    const htmlPath = (0, import_node_path2.join)(extractDir, htmlFile);
    const htmlContent = (0, import_node_fs.readFileSync)(htmlPath, "utf-8");
    logger.info("[PDFConverter] Saving converted files to output...");
    await _ImageExtractor.saveDocumentsWithExtractedImages(
      logger,
      outputDir,
      "result",
      jsonPath,
      htmlContent
    );
    logger.info("[PDFConverter] Files saved to:", outputDir);
  }
};
671
+
672
+ // src/utils/local-file-server.ts
673
+ var import_node_fs2 = require("fs");
674
+ var import_node_http = require("http");
675
+ var import_node_path3 = require("path");
676
/**
 * Minimal one-file HTTP server bound to 127.0.0.1 on an ephemeral port.
 * Used to hand a local PDF to docling-serve (which only fetches URLs).
 */
var LocalFileServer = class {
  server = null;
  port = 0;
  /**
   * Start serving a file and return the URL
   *
   * @param filePath Absolute path to the file to serve
   * @returns URL to access the file
   */
  async start(filePath) {
    const filename = (0, import_node_path3.basename)(filePath);
    const stat = (0, import_node_fs2.statSync)(filePath);
    return new Promise((resolve, reject) => {
      this.server = (0, import_node_http.createServer)((req, res) => {
        // Fix: the route interpolation was garbled to "$(unknown)" in the
        // published artifact; the server serves the file at "/<basename>".
        // NOTE(review): a basename containing URL-reserved characters would
        // not match the percent-encoded req.url — confirm inputs are plain
        // temp-file names (they are for ImagePdfConverter's output).
        if (req.url === `/${filename}`) {
          res.writeHead(200, {
            "Content-Type": "application/pdf",
            "Content-Length": stat.size
          });
          (0, import_node_fs2.createReadStream)(filePath).pipe(res);
        } else {
          res.writeHead(404);
          res.end("Not Found");
        }
      });
      this.server.on("error", reject);
      // Port 0 lets the OS assign a free ephemeral port.
      this.server.listen(0, "127.0.0.1", () => {
        const address = this.server.address();
        if (typeof address === "object" && address !== null) {
          this.port = address.port;
          resolve(`http://127.0.0.1:${this.port}/${filename}`);
        } else {
          reject(new Error("Failed to get server address"));
        }
      });
    });
  }
  /**
   * Stop the server
   */
  stop() {
    return new Promise((resolve) => {
      if (this.server) {
        this.server.close(() => {
          this.server = null;
          this.port = 0;
          resolve();
        });
      } else {
        resolve();
      }
    });
  }
};
730
+
731
+ // src/core/image-pdf-converter.ts
732
+ var import_node_fs3 = require("fs");
733
+ var import_node_os = require("os");
734
+ var import_node_path4 = require("path");
735
/**
 * Fallback converter: downloads a PDF and rasterizes it into an image-only
 * PDF with ImageMagick, for documents docling-serve cannot parse directly.
 * Requires `curl` and the ImageMagick 7 `magick` CLI on PATH.
 */
var ImagePdfConverter = class {
  constructor(logger) {
    this.logger = logger;
  }
  /**
   * Convert a PDF file to an image-based PDF.
   * Downloads the PDF from URL, converts it using ImageMagick, and returns the path.
   *
   * @param pdfUrl - URL of the source PDF
   * @param reportId - Report identifier for temp file naming
   * @returns Path to the converted image PDF in temp directory
   */
  async convert(pdfUrl, reportId) {
    // Timestamp keeps concurrent/repeated runs for the same report distinct.
    const timestamp = Date.now();
    const tempDir = (0, import_node_os.tmpdir)();
    const inputPath = (0, import_node_path4.join)(tempDir, `${reportId}-${timestamp}-input.pdf`);
    const outputPath = (0, import_node_path4.join)(tempDir, `${reportId}-${timestamp}-image.pdf`);
    try {
      this.logger.info("[ImagePdfConverter] Downloading PDF from URL...");
      await this.downloadPdf(pdfUrl, inputPath);
      this.logger.info("[ImagePdfConverter] Converting to image PDF...");
      await this.convertToImagePdf(inputPath, outputPath);
      this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
      return outputPath;
    } finally {
      // Only the downloaded input is removed here; the caller owns
      // outputPath and must release it via cleanup().
      if ((0, import_node_fs3.existsSync)(inputPath)) {
        (0, import_node_fs3.rmSync)(inputPath, { force: true });
      }
    }
  }
  /**
   * Download PDF from URL to local path using curl
   * @throws when curl exits non-zero (network or HTTP error)
   */
  async downloadPdf(url, outputPath) {
    const result = await spawnAsync("curl", [
      "-L",
      // Follow redirects
      "-o",
      outputPath,
      "-s",
      // Silent mode
      "--fail",
      // Fail on HTTP errors
      url
    ]);
    if (result.code !== 0) {
      throw new Error(
        `Failed to download PDF: ${result.stderr || "Unknown error"}`
      );
    }
  }
  /**
   * Convert PDF to image-based PDF using ImageMagick
   * (density/quality come from IMAGE_PDF_CONVERTER constants)
   * @throws when the `magick` command exits non-zero
   */
  async convertToImagePdf(inputPath, outputPath) {
    const result = await spawnAsync("magick", [
      "-density",
      IMAGE_PDF_CONVERTER.DENSITY.toString(),
      inputPath,
      "-quality",
      IMAGE_PDF_CONVERTER.QUALITY.toString(),
      outputPath
    ]);
    if (result.code !== 0) {
      throw new Error(
        `Failed to convert PDF to image PDF: ${result.stderr || "Unknown error"}`
      );
    }
  }
  /**
   * Cleanup the temporary image PDF file
   */
  cleanup(imagePdfPath) {
    if ((0, import_node_fs3.existsSync)(imagePdfPath)) {
      this.logger.info(
        "[ImagePdfConverter] Cleaning up temp file:",
        imagePdfPath
      );
      (0, import_node_fs3.rmSync)(imagePdfPath, { force: true });
    }
  }
};
817
+
818
+ // src/core/pdf-converter.ts
819
+ var PDFConverter = class {
820
+ constructor(logger, client, enableImagePdfFallback = false) {
821
+ this.logger = logger;
822
+ this.client = client;
823
+ this.enableImagePdfFallback = enableImagePdfFallback;
824
+ }
825
+ async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
826
+ this.logger.info("[PDFConverter] Converting:", url);
827
+ let originalError = null;
828
+ try {
829
+ await this.performConversion(
830
+ url,
831
+ reportId,
832
+ onComplete,
833
+ cleanupAfterCallback,
834
+ options,
835
+ abortSignal
836
+ );
837
+ return;
838
+ } catch (error) {
839
+ if (abortSignal?.aborted) {
840
+ throw error;
841
+ }
842
+ originalError = error;
843
+ this.logger.error("[PDFConverter] Conversion failed:", error);
844
+ if (!this.enableImagePdfFallback) {
845
+ throw error;
846
+ }
847
+ }
848
+ this.logger.info("[PDFConverter] Attempting image PDF fallback...");
849
+ const imagePdfConverter = new ImagePdfConverter(this.logger);
850
+ let imagePdfPath = null;
851
+ try {
852
+ imagePdfPath = await imagePdfConverter.convert(url, reportId);
853
+ const localUrl = `file://${imagePdfPath}`;
854
+ this.logger.info("[PDFConverter] Retrying with image PDF:", localUrl);
855
+ await this.performConversion(
856
+ localUrl,
857
+ reportId,
858
+ onComplete,
859
+ cleanupAfterCallback,
860
+ options,
861
+ abortSignal
862
+ );
863
+ this.logger.info("[PDFConverter] Fallback conversion succeeded");
864
+ } catch (fallbackError) {
865
+ this.logger.error(
866
+ "[PDFConverter] Fallback conversion also failed:",
867
+ fallbackError
868
+ );
869
+ throw new ImagePdfFallbackError(originalError, fallbackError);
870
+ } finally {
871
+ if (imagePdfPath) {
872
+ imagePdfConverter.cleanup(imagePdfPath);
873
+ }
874
+ }
875
+ }
876
+ async performConversion(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
877
+ const startTime = Date.now();
878
+ const conversionOptions = this.buildConversionOptions(options);
879
+ this.logger.info(
880
+ "[PDFConverter] Converting document with Async Source API..."
881
+ );
882
+ this.logger.info("[PDFConverter] Server will download from URL directly");
883
+ this.logger.info(
884
+ "[PDFConverter] Results will be returned as ZIP to avoid memory limits"
885
+ );
886
+ const { httpUrl, server } = await this.resolveUrl(url);
887
+ try {
888
+ const task = await this.startConversionTask(httpUrl, conversionOptions);
889
+ await this.trackTaskProgress(task);
890
+ if (abortSignal?.aborted) {
891
+ this.logger.info(
892
+ "[PDFConverter] Conversion aborted after docling completion"
893
+ );
894
+ const error = new Error("PDF conversion was aborted");
895
+ error.name = "AbortError";
896
+ throw error;
897
+ }
898
+ await this.downloadResult(task.taskId);
899
+ } finally {
900
+ if (server) {
901
+ this.logger.info("[PDFConverter] Stopping local file server...");
902
+ await server.stop();
903
+ }
904
+ }
905
+ const cwd = process.cwd();
906
+ const zipPath = (0, import_node_path5.join)(cwd, "result.zip");
907
+ const extractDir = (0, import_node_path5.join)(cwd, "result_extracted");
908
+ const outputDir = (0, import_node_path5.join)(cwd, "output", reportId);
909
+ try {
910
+ await this.processConvertedFiles(zipPath, extractDir, outputDir);
911
+ if (abortSignal?.aborted) {
912
+ this.logger.info("[PDFConverter] Conversion aborted before callback");
913
+ const error = new Error("PDF conversion was aborted");
914
+ error.name = "AbortError";
915
+ throw error;
916
+ }
917
+ this.logger.info("[PDFConverter] Executing completion callback...");
918
+ await onComplete(outputDir);
919
+ const duration = Date.now() - startTime;
920
+ this.logger.info("[PDFConverter] Conversion completed successfully!");
921
+ this.logger.info("[PDFConverter] Total time:", duration, "ms");
922
+ } finally {
923
+ this.logger.info("[PDFConverter] Cleaning up temporary files...");
924
+ if ((0, import_node_fs4.existsSync)(zipPath)) {
925
+ (0, import_node_fs4.rmSync)(zipPath, { force: true });
926
+ }
927
+ if ((0, import_node_fs4.existsSync)(extractDir)) {
928
+ (0, import_node_fs4.rmSync)(extractDir, { recursive: true, force: true });
929
+ }
930
+ if (cleanupAfterCallback) {
931
+ this.logger.info(
932
+ "[PDFConverter] Cleaning up output directory:",
933
+ outputDir
934
+ );
935
+ if ((0, import_node_fs4.existsSync)(outputDir)) {
936
+ (0, import_node_fs4.rmSync)(outputDir, { recursive: true, force: true });
937
+ }
938
+ } else {
939
+ this.logger.info("[PDFConverter] Output preserved at:", outputDir);
940
+ }
941
+ }
942
+ }
943
+ buildConversionOptions(options) {
944
+ return {
945
+ ...(0, import_es_toolkit.omit)(options, ["num_threads"]),
946
+ to_formats: ["json", "html"],
947
+ image_export_mode: "embedded",
948
+ ocr_engine: "ocrmac",
949
+ generate_picture_images: true,
950
+ images_scale: 2,
951
+ /**
952
+ * While disabling this option yields the most accurate text extraction for readable PDFs,
953
+ * text layers overlaid on images or drawings can introduce noise when not merged properly.
954
+ * In practice, archaeological report PDFs almost always contain such overlapping cases.
955
+ * Enabling force_ocr mitigates this risk. Although OCR may introduce minor errors compared
956
+ * to direct text extraction, the accuracy remains high since the source is digital, not scanned paper.
957
+ */
958
+ force_ocr: true,
959
+ accelerator_options: {
960
+ device: "mps",
961
+ num_threads: options.num_threads
962
+ }
963
+ };
964
+ }
965
+ async startConversionTask(url, conversionOptions) {
966
+ const task = await this.client.convertSourceAsync({
967
+ sources: [
968
+ {
969
+ kind: "http",
970
+ url
971
+ }
972
+ ],
973
+ options: conversionOptions,
974
+ target: {
975
+ kind: "zip"
976
+ }
977
+ });
978
+ this.logger.info(`[PDFConverter] Task created: ${task.taskId}`);
979
+ this.logger.info("[PDFConverter] Polling for progress...");
980
+ return task;
981
+ }
982
+ /**
983
+ * Start a local file server for file:// URLs
984
+ *
985
+ * @param url URL to check (file:// or http://)
986
+ * @returns Object with httpUrl and optional server to stop later
987
+ */
988
+ async resolveUrl(url) {
989
+ if (url.startsWith("file://")) {
990
+ const filePath = url.slice(7);
991
+ const server = new LocalFileServer();
992
+ const httpUrl = await server.start(filePath);
993
+ this.logger.info("[PDFConverter] Started local file server:", httpUrl);
994
+ return { httpUrl, server };
995
+ }
996
+ return { httpUrl: url };
997
+ }
998
+ async trackTaskProgress(task) {
999
+ const conversionStartTime = Date.now();
1000
+ let lastStatus = "";
1001
+ let isCompleted = false;
1002
+ const pollInterval = setInterval(() => {
1003
+ if (isCompleted) return;
1004
+ const elapsed = Math.floor((Date.now() - conversionStartTime) / 1e3);
1005
+ process.stdout.write(
1006
+ `\r[PDFConverter] Status: ${lastStatus || "processing"} (${elapsed}s elapsed)`
1007
+ );
1008
+ }, PDF_CONVERTER.POLL_INTERVAL_MS);
1009
+ task.on("progress", (status) => {
1010
+ lastStatus = status.task_status;
1011
+ if (status.task_position !== void 0) {
1012
+ process.stdout.write(
1013
+ `\r[PDFConverter] Status: ${status.task_status} (position: ${status.task_position})`
1014
+ );
1015
+ }
1016
+ });
1017
+ task.on("complete", () => {
1018
+ isCompleted = true;
1019
+ clearInterval(pollInterval);
1020
+ this.logger.info("\n[PDFConverter] Conversion completed!");
1021
+ });
1022
+ task.on("error", (error) => {
1023
+ isCompleted = true;
1024
+ clearInterval(pollInterval);
1025
+ this.logger.error("\n[PDFConverter] Conversion error:", error.message);
1026
+ });
1027
+ try {
1028
+ await task.waitForCompletion();
1029
+ } finally {
1030
+ isCompleted = true;
1031
+ clearInterval(pollInterval);
1032
+ }
1033
+ }
1034
+ async downloadResult(taskId) {
1035
+ this.logger.info(
1036
+ "\n[PDFConverter] Task completed, downloading ZIP file..."
1037
+ );
1038
+ const zipResult = await this.client.getTaskResultFile(taskId);
1039
+ if (!zipResult.success || !zipResult.fileStream) {
1040
+ throw new Error("Failed to get ZIP file result");
1041
+ }
1042
+ const zipPath = (0, import_node_path5.join)(process.cwd(), "result.zip");
1043
+ this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
1044
+ const writeStream = (0, import_node_fs4.createWriteStream)(zipPath);
1045
+ await (0, import_promises.pipeline)(zipResult.fileStream, writeStream);
1046
+ }
1047
+ async processConvertedFiles(zipPath, extractDir, outputDir) {
1048
+ await ImageExtractor.extractAndSaveDocumentsFromZip(
1049
+ this.logger,
1050
+ zipPath,
1051
+ extractDir,
1052
+ outputDir
1053
+ );
1054
+ }
1055
+ };
1056
+
1057
+ // src/core/pdf-parser.ts
1058
+ var PDFParser = class {
1059
+ logger;
1060
+ port;
1061
+ baseUrl;
1062
+ timeout;
1063
+ venvPath;
1064
+ killExistingProcess;
1065
+ enableImagePdfFallback;
1066
+ client = null;
1067
+ constructor(options) {
1068
+ const {
1069
+ logger,
1070
+ timeout = PDF_PARSER.DEFAULT_TIMEOUT_MS,
1071
+ venvPath,
1072
+ killExistingProcess = false,
1073
+ enableImagePdfFallback = false
1074
+ } = options;
1075
+ this.logger = logger;
1076
+ if ("baseUrl" in options) {
1077
+ this.baseUrl = options.baseUrl;
1078
+ this.port = void 0;
1079
+ } else {
1080
+ this.port = options.port;
1081
+ this.baseUrl = void 0;
1082
+ }
1083
+ this.timeout = timeout;
1084
+ this.venvPath = venvPath || (0, import_node_path6.join)(process.cwd(), ".venv");
1085
+ this.killExistingProcess = killExistingProcess;
1086
+ this.enableImagePdfFallback = enableImagePdfFallback;
1087
+ }
1088
+ async init() {
1089
+ this.logger.info("[PDFParser] Initializing...");
1090
+ this.checkOperatingSystem();
1091
+ this.checkJqInstalled();
1092
+ this.checkMacOSVersion();
1093
+ if (this.enableImagePdfFallback && !this.baseUrl) {
1094
+ this.checkImageMagickInstalled();
1095
+ this.checkGhostscriptInstalled();
1096
+ } else if (this.enableImagePdfFallback && this.baseUrl) {
1097
+ this.logger.warn(
1098
+ "[PDFParser] enableImagePdfFallback is ignored when using external server (baseUrl)"
1099
+ );
1100
+ }
1101
+ if (this.baseUrl) {
1102
+ this.logger.info("[PDFParser] Using external server:", this.baseUrl);
1103
+ this.client = new import_docling_sdk.Docling({
1104
+ api: { baseUrl: this.baseUrl, timeout: this.timeout }
1105
+ });
1106
+ await this.waitForServerReady();
1107
+ return;
1108
+ }
1109
+ this.logger.info("[PDFParser] Setting up local server...");
1110
+ try {
1111
+ const environment = new DoclingEnvironment({
1112
+ logger: this.logger,
1113
+ venvPath: this.venvPath,
1114
+ port: this.port,
1115
+ killExistingProcess: this.killExistingProcess
1116
+ });
1117
+ await environment.setup();
1118
+ const clientUrl = `http://localhost:${this.port}`;
1119
+ this.client = new import_docling_sdk.Docling({
1120
+ api: {
1121
+ baseUrl: clientUrl,
1122
+ timeout: this.timeout
1123
+ }
1124
+ });
1125
+ await this.waitForServerReady();
1126
+ this.logger.info("[PDFParser] Ready");
1127
+ } catch (error) {
1128
+ this.logger.error("[PDFParser] Initialization failed:", error);
1129
+ throw new Error(`Failed to initialize PDFParser: ${error}`);
1130
+ }
1131
+ }
1132
+ checkOperatingSystem() {
1133
+ if ((0, import_node_os2.platform)() !== "darwin") {
1134
+ throw new Error(
1135
+ "PDFParser is only supported on macOS. Current platform: " + (0, import_node_os2.platform)()
1136
+ );
1137
+ }
1138
+ }
1139
+ checkJqInstalled() {
1140
+ try {
1141
+ (0, import_node_child_process3.execSync)("which jq", { stdio: "ignore" });
1142
+ } catch {
1143
+ throw new Error(
1144
+ "jq is not installed. Please install jq using: brew install jq"
1145
+ );
1146
+ }
1147
+ }
1148
+ checkMacOSVersion() {
1149
+ try {
1150
+ const versionOutput = (0, import_node_child_process3.execSync)("sw_vers -productVersion", {
1151
+ encoding: "utf-8"
1152
+ }).trim();
1153
+ const versionMatch = versionOutput.match(/^(\d+)\.(\d+)/);
1154
+ if (versionMatch) {
1155
+ const major = parseInt(versionMatch[1]);
1156
+ const minor = parseInt(versionMatch[2]);
1157
+ if (major < 10 || major === 10 && minor < 15) {
1158
+ throw new Error(
1159
+ `macOS 10.15 or later is required. Current version: ${versionOutput}`
1160
+ );
1161
+ }
1162
+ }
1163
+ } catch (error) {
1164
+ if (error instanceof Error && error.message.includes("macOS 10.15")) {
1165
+ throw error;
1166
+ }
1167
+ throw new Error("Failed to check macOS version");
1168
+ }
1169
+ }
1170
+ checkImageMagickInstalled() {
1171
+ try {
1172
+ (0, import_node_child_process3.execSync)("which magick", { stdio: "ignore" });
1173
+ } catch {
1174
+ throw new Error(
1175
+ "ImageMagick is not installed but enableImagePdfFallback is enabled. Please install ImageMagick using: brew install imagemagick"
1176
+ );
1177
+ }
1178
+ }
1179
+ checkGhostscriptInstalled() {
1180
+ try {
1181
+ (0, import_node_child_process3.execSync)("which gs", { stdio: "ignore" });
1182
+ } catch {
1183
+ throw new Error(
1184
+ "Ghostscript is not installed but enableImagePdfFallback is enabled. Please install Ghostscript using: brew install ghostscript"
1185
+ );
1186
+ }
1187
+ }
1188
+ /**
1189
+ * Check if an error is a connection refused error (ECONNREFUSED).
1190
+ * This typically indicates the Docling server has crashed.
1191
+ */
1192
+ isConnectionRefusedError(error) {
1193
+ if (error instanceof Error) {
1194
+ const errorStr = JSON.stringify(error);
1195
+ return errorStr.includes("ECONNREFUSED");
1196
+ }
1197
+ return false;
1198
+ }
1199
+ /**
1200
+ * Restart the Docling server after it has crashed.
1201
+ * This kills any existing process on the port, starts a new server,
1202
+ * and waits for it to become ready.
1203
+ *
1204
+ * Note: This method is only called when canRecover is true,
1205
+ * which guarantees this.port is defined.
1206
+ */
1207
+ async restartServer() {
1208
+ this.logger.info("[PDFParser] Restarting server...");
1209
+ await DoclingEnvironment.killProcessOnPort(this.logger, this.port);
1210
+ const environment = new DoclingEnvironment({
1211
+ logger: this.logger,
1212
+ venvPath: this.venvPath,
1213
+ port: this.port,
1214
+ killExistingProcess: false
1215
+ // Already killed above
1216
+ });
1217
+ await environment.startServer();
1218
+ this.client?.destroy();
1219
+ this.client = new import_docling_sdk.Docling({
1220
+ api: {
1221
+ baseUrl: `http://localhost:${this.port}`,
1222
+ timeout: this.timeout
1223
+ }
1224
+ });
1225
+ await this.waitForServerReady();
1226
+ this.logger.info("[PDFParser] Server restarted successfully");
1227
+ }
1228
+ async waitForServerReady() {
1229
+ const maxAttempts = PDF_PARSER.MAX_HEALTH_CHECK_ATTEMPTS;
1230
+ const checkInterval = PDF_PARSER.HEALTH_CHECK_INTERVAL_MS;
1231
+ const logInterval = PDF_PARSER.HEALTH_CHECK_LOG_INTERVAL_MS;
1232
+ let lastLogTime = 0;
1233
+ for (let attempt = 1; attempt <= maxAttempts; attempt++) {
1234
+ try {
1235
+ await this.client.health();
1236
+ this.logger.info("[PDFParser] Server is ready");
1237
+ return;
1238
+ } catch {
1239
+ const now = Date.now();
1240
+ if (now - lastLogTime >= logInterval) {
1241
+ this.logger.info(
1242
+ "[PDFParser] Waiting for server... (attempt",
1243
+ attempt,
1244
+ "/",
1245
+ maxAttempts,
1246
+ ")"
1247
+ );
1248
+ lastLogTime = now;
1249
+ }
1250
+ if (attempt < maxAttempts) {
1251
+ await new Promise((resolve) => setTimeout(resolve, checkInterval));
1252
+ }
1253
+ }
1254
+ }
1255
+ throw new Error("Server failed to become ready after maximum attempts");
1256
+ }
1257
+ async parse(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
1258
+ if (!this.client) {
1259
+ throw new Error(
1260
+ "PDFParser is not initialized. Call init() before using parse()"
1261
+ );
1262
+ }
1263
+ const canRecover = !this.baseUrl && this.port !== void 0;
1264
+ const maxAttempts = PDF_PARSER.MAX_SERVER_RECOVERY_ATTEMPTS;
1265
+ let attempt = 0;
1266
+ while (attempt <= maxAttempts) {
1267
+ try {
1268
+ const effectiveFallbackEnabled = this.enableImagePdfFallback && !this.baseUrl;
1269
+ const converter = new PDFConverter(
1270
+ this.logger,
1271
+ this.client,
1272
+ effectiveFallbackEnabled
1273
+ );
1274
+ return await converter.convert(
1275
+ url,
1276
+ reportId,
1277
+ onComplete,
1278
+ cleanupAfterCallback,
1279
+ options,
1280
+ abortSignal
1281
+ );
1282
+ } catch (error) {
1283
+ if (abortSignal?.aborted) {
1284
+ throw error;
1285
+ }
1286
+ if (canRecover && this.isConnectionRefusedError(error) && attempt < maxAttempts) {
1287
+ this.logger.warn(
1288
+ "[PDFParser] Connection refused, attempting server recovery..."
1289
+ );
1290
+ await this.restartServer();
1291
+ attempt++;
1292
+ continue;
1293
+ }
1294
+ throw error;
1295
+ }
1296
+ }
1297
+ }
1298
+ /**
1299
+ * Dispose the parser instance.
1300
+ * - Sets the internal client to null
1301
+ * - If a local docling server was started (no baseUrl), kills the process on the configured port
1302
+ */
1303
+ async dispose() {
1304
+ this.logger.info("[PDFParser] Disposing...");
1305
+ try {
1306
+ if (!this.baseUrl && this.port) {
1307
+ await DoclingEnvironment.killProcessOnPort(this.logger, this.port);
1308
+ }
1309
+ } catch (error) {
1310
+ this.logger.error("[PDFParser] Error while disposing:", error);
1311
+ } finally {
1312
+ this.client?.destroy();
1313
+ this.client = null;
1314
+ this.logger.info("[PDFParser] Disposed");
1315
+ }
1316
+ }
1317
+ };
1318
+ // Annotate the CommonJS export names for ESM import in node:
1319
+ 0 && (module.exports = {
1320
+ ImagePdfFallbackError,
1321
+ PDFParser
1322
+ });
1323
+ //# sourceMappingURL=index.cjs.map