@heripo/pdf-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1295 @@
1
+ import "./chunk-VUNV25KB.js";
2
+
3
+ // src/core/pdf-parser.ts
4
+ import { Docling } from "docling-sdk";
5
+ import { execSync } from "child_process";
6
+ import { platform } from "os";
7
+ import { join as join5 } from "path";
8
+
9
+ // src/config/constants.ts
10
/** Timing and retry settings used by the PDF parser. */
var PDF_PARSER = {
  /** Default timeout for API calls, in milliseconds. */
  DEFAULT_TIMEOUT_MS: 100_000,
  /** Maximum number of health check attempts before giving up. */
  MAX_HEALTH_CHECK_ATTEMPTS: 60,
  /** Interval between health check attempts, in milliseconds. */
  HEALTH_CHECK_INTERVAL_MS: 2_000,
  /** Interval between log messages during health check, in milliseconds. */
  HEALTH_CHECK_LOG_INTERVAL_MS: 5_000,
  /** Maximum retry attempts for server recovery on ECONNREFUSED. */
  MAX_SERVER_RECOVERY_ATTEMPTS: 1
};

/** Settings for the PDF converter. */
var PDF_CONVERTER = {
  /** Interval for progress polling, in milliseconds. */
  POLL_INTERVAL_MS: 1_000
};

/** Settings for the docling-serve environment. */
var DOCLING_ENVIRONMENT = {
  /** Delay after starting docling-serve to allow startup, in milliseconds. */
  STARTUP_DELAY_MS: 2_000
};

/** ImageMagick settings for the image-PDF conversion. */
var IMAGE_PDF_CONVERTER = {
  /** ImageMagick density option (DPI) for PDF to image conversion. */
  DENSITY: 300,
  /** ImageMagick quality option (1-100). */
  QUALITY: 100
};
54
+
55
+ // ../shared/dist/index.mjs
56
+ import { spawn } from "child_process";
57
/**
 * Spawn a child process and resolve once it has closed.
 *
 * @param command - Executable to run.
 * @param args - Argument list passed to the executable.
 * @param options - `child_process.spawn` options, plus `captureStdout` and
 *   `captureStderr` (both default to `true`) which control whether the
 *   corresponding stream is collected.
 * @returns Promise resolving to `{ stdout, stderr, code, signal }`. `code` is
 *   the exit code; when the child was terminated by a signal (Node reports a
 *   `null` exit code in that case) `code` is `1` so callers that test
 *   `code !== 0` treat the run as failed, and `signal` names the signal.
 *   The promise rejects when the process emits `error` (e.g. it cannot be
 *   spawned).
 */
function spawnAsync(command, args, options = {}) {
  const {
    captureStdout = true,
    captureStderr = true,
    ...spawnOptions
  } = options;
  return new Promise((resolve, reject) => {
    const proc = spawn(command, args, spawnOptions);
    // Collect raw chunks and decode once at the end: decoding chunk by chunk
    // corrupts multi-byte characters that straddle a chunk boundary.
    const stdoutChunks = [];
    const stderrChunks = [];
    const collectInto = (chunks) => (data) => {
      chunks.push(Buffer.isBuffer(data) ? data : Buffer.from(String(data)));
    };
    if (captureStdout && proc.stdout) {
      proc.stdout.on("data", collectInto(stdoutChunks));
    }
    if (captureStderr && proc.stderr) {
      proc.stderr.on("data", collectInto(stderrChunks));
    }
    proc.on("close", (code, signal) => {
      resolve({
        stdout: Buffer.concat(stdoutChunks).toString(),
        stderr: Buffer.concat(stderrChunks).toString(),
        // A null exit code means the child was killed by a signal; that must
        // not be reported as a successful exit.
        code: code ?? 1,
        signal: signal ?? null
      });
    });
    proc.on("error", reject);
  });
}
83
+
84
+ // src/environment/docling-environment.ts
85
+ import { spawn as spawn2 } from "child_process";
86
+ import { join } from "path";
87
+
88
+ // src/utils/python-version.ts
89
/** Matches the "Python <major>.<minor>" part of `python --version` output. */
var PYTHON_VERSION_REGEX = /Python (\d+)\.(\d+)/;

/** Lowest accepted Python version. */
var MIN_PYTHON_VERSION = { major: 3, minor: 9 };

/** Error thrown when a Python version is outside the accepted range. */
var PythonVersionError = class extends Error {
  constructor(message) {
    super(message);
    this.name = "PythonVersionError";
  }
};

/**
 * Extract the major and minor version from text such as "Python 3.12.1".
 *
 * @param output - Text to search.
 * @returns `{ major, minor, versionString }`, or `null` when the text contains
 *   no "Python X.Y" pattern.
 */
function parsePythonVersion(output) {
  const found = output.match(PYTHON_VERSION_REGEX);
  if (!found) {
    return null;
  }
  const [, majorText, minorText] = found;
  const major = Number.parseInt(majorText, 10);
  const minor = Number.parseInt(minorText, 10);
  return { major, minor, versionString: `${major}.${minor}` };
}

/**
 * Throw a `PythonVersionError` unless the version is Python 3 with a minor
 * version from `MIN_PYTHON_VERSION.minor` up to and including 12.
 *
 * @param version - Object with numeric `major` and `minor`.
 * @param context - `"venv"` prefixes the "too new" message with "Venv Python";
 *   any other value uses "Python".
 */
function validatePythonVersion(version, context = "system") {
  const { major, minor } = version;
  const isPython3 = major === 3;
  if (isPython3 && minor >= 13) {
    const label = context === "venv" ? "Venv Python" : "Python";
    throw new PythonVersionError(
      `${label} ${major}.${minor} is too new. docling-serve requires Python 3.11 or 3.12.`
    );
  }
  const tooOld = minor < MIN_PYTHON_VERSION.minor;
  if (!isPython3 || tooOld) {
    throw new PythonVersionError("Python 3.9 or higher is required");
  }
}
120
+
121
+ // src/environment/docling-environment.ts
122
/**
 * Prepares a Python virtual environment with docling-serve installed and
 * starts the server as a detached process on the configured port.
 */
var DoclingEnvironment = class _DoclingEnvironment {
  logger;
  venvPath;
  port;
  killExistingProcess;
  /**
   * @param options - Object whose `logger`, `venvPath`, `port` and
   *   `killExistingProcess` properties are copied onto the instance.
   */
  constructor(options) {
    this.logger = options.logger;
    this.venvPath = options.venvPath;
    this.port = options.port;
    this.killExistingProcess = options.killExistingProcess;
  }
  /**
   * Run the whole setup sequence: check the system Python, create the venv,
   * install the Python packages, then either reuse a server already listening
   * on the port (when `killExistingProcess` is falsy) or start a new one.
   */
  async setup() {
    this.logger.info("[DoclingEnvironment] Setting up Python environment...");
    await this.checkPythonVersion();
    await this.setupPythonEnvironment();
    await this.upgradePip();
    await this.installSetuptools();
    await this.installPyArrow();
    await this.installDoclingServe();
    const portInUse = await this.isPortInUse(this.port);
    if (portInUse && !this.killExistingProcess) {
      this.logger.info(
        "[DoclingEnvironment] Reusing existing server on port",
        this.port
      );
    } else {
      await this.startDoclingServe();
    }
    this.logger.info("[DoclingEnvironment] Setup completed");
  }
  /**
   * Run `python3 --version`, parse and validate the result.
   *
   * @returns The parsed version.
   * @throws Error when the command fails or its output cannot be parsed;
   *   rethrows whatever `validatePythonVersion` throws.
   */
  async checkPythonVersion() {
    const result = await spawnAsync("python3", ["--version"]);
    if (result.code !== 0) {
      throw new Error("Failed to check Python version");
    }
    // Both streams are searched for the version string.
    const output = result.stdout + result.stderr;
    const version = parsePythonVersion(output);
    if (!version) {
      throw new Error("Could not parse Python version");
    }
    this.logger.info(
      "[DoclingEnvironment] Python version:",
      version.versionString
    );
    try {
      validatePythonVersion(version, "system");
    } catch (error) {
      if (error instanceof PythonVersionError && version.minor >= 13) {
        this.logger.error(
          "[DoclingEnvironment] Python 3.13+ is not compatible. Install 3.11 or 3.12 with: pyenv install 3.12.0 && pyenv global 3.12.0"
        );
      }
      throw error;
    }
    return version;
  }
  /** Create the virtual environment at `venvPath` and validate its Python. */
  async setupPythonEnvironment() {
    const result = await spawnAsync("python3", ["-m", "venv", this.venvPath]);
    if (result.code !== 0) {
      throw new Error("Failed to create Python virtual environment");
    }
    await this.verifyVenvPythonVersion();
  }
  /** Run the venv's `bin/python --version` and validate the parsed version. */
  async verifyVenvPythonVersion() {
    const pythonPath = join(this.venvPath, "bin", "python");
    const result = await spawnAsync(pythonPath, ["--version"]);
    if (result.code !== 0) {
      throw new Error("Failed to verify venv Python version");
    }
    const output = result.stdout + result.stderr;
    const version = parsePythonVersion(output);
    if (!version) {
      throw new Error("Could not parse venv Python version");
    }
    validatePythonVersion(version, "venv");
  }
  /**
   * Run the venv's `bin/pip` with the given arguments.
   *
   * @param args - Arguments passed to pip.
   * @param action - Phrase inserted into the log and error messages,
   *   e.g. "install pyarrow".
   * @throws Error when pip exits with a non-zero code; stderr is logged first.
   */
  async runPip(args, action) {
    const pipPath = join(this.venvPath, "bin", "pip");
    const result = await spawnAsync(pipPath, args);
    if (result.code !== 0) {
      this.logger.error(
        `[DoclingEnvironment] Failed to ${action}:`,
        result.stderr
      );
      throw new Error(`Failed to ${action}. Exit code: ${result.code}`);
    }
  }
  /** `pip install --upgrade pip` inside the venv. */
  async upgradePip() {
    await this.runPip(["install", "--upgrade", "pip"], "upgrade pip");
  }
  /** `pip install --upgrade setuptools wheel` inside the venv. */
  async installSetuptools() {
    await this.runPip(
      ["install", "--upgrade", "setuptools", "wheel"],
      "install setuptools"
    );
  }
  /** `pip install --only-binary :all: pyarrow` inside the venv. */
  async installPyArrow() {
    await this.runPip(
      ["install", "--only-binary", ":all:", "pyarrow"],
      "install pyarrow"
    );
  }
  /** `pip install docling-serve` inside the venv. */
  async installDoclingServe() {
    await this.runPip(["install", "docling-serve"], "install docling-serve");
  }
  /**
   * Report whether `lsof -ti :<port>` exits with 0 and prints anything.
   * Resolves to `false` when `lsof` cannot be run.
   */
  async isPortInUse(port) {
    try {
      const result = await spawnAsync("lsof", ["-ti", `:${port}`]);
      return result.code === 0 && !!result.stdout.trim();
    } catch {
      return false;
    }
  }
  /**
   * Start the docling-serve server without running full setup.
   * Useful for restarting the server after it has crashed.
   */
  async startServer() {
    await this.startDoclingServe();
  }
  /**
   * Send `kill -9` to every PID that `lsof -ti :<port>` reports.
   * Provided as a static method to allow reuse without instantiation.
   * Never rejects: failures are logged and the promise still resolves.
   */
  static async killProcessOnPort(logger, port) {
    return new Promise((resolve) => {
      const lsof = spawn2("lsof", ["-ti", `:${port}`]);
      // Accumulate the whole listing before splitting so a PID that arrives
      // split across two data chunks is not read as two separate PIDs.
      let listing = "";
      lsof.stdout?.on("data", (data) => {
        listing += data.toString();
      });
      lsof.on("close", () => {
        const pids = listing.split(/\s+/).filter(Boolean);
        if (pids.length === 0) return resolve();
        let remaining = pids.length;
        const done = () => {
          if (--remaining <= 0) resolve();
        };
        logger.info(
          "[DoclingEnvironment] Killing process",
          pids.join(", "),
          "on port",
          port
        );
        for (const pid of pids) {
          const killProc = spawn2("kill", ["-9", pid]);
          killProc.on("close", (killCode) => {
            if (killCode !== 0) {
              logger.info("[DoclingEnvironment] Failed to kill process", pid);
            }
            done();
          });
          killProc.on("error", (error) => {
            logger.info("[DoclingEnvironment] Failed to kill process", error);
            done();
          });
        }
      });
      lsof.on("error", () => resolve());
    });
  }
  /**
   * Optionally kill whatever listens on the port, then spawn the venv's
   * `bin/docling-serve run --port <port>` detached with stdio ignored.
   * Resolves after `DOCLING_ENVIRONMENT.STARTUP_DELAY_MS`; rejects if the
   * child emits `error` or cannot be spawned.
   */
  async startDoclingServe() {
    // Awaited outside any Promise executor so a failure here rejects the
    // returned promise instead of leaving it pending forever.
    if (this.killExistingProcess) {
      await _DoclingEnvironment.killProcessOnPort(this.logger, this.port);
    }
    const doclingServePath = join(this.venvPath, "bin", "docling-serve");
    const args = ["run", "--port", this.port.toString()];
    this.logger.info(
      "[DoclingEnvironment] Starting docling-serve on port",
      this.port
    );
    await new Promise((resolve, reject) => {
      const doclingProcess = spawn2(doclingServePath, args, {
        // Detached from parent process
        detached: true,
        // Remove stdio pipes to prevent event loop from hanging
        stdio: "ignore"
      });
      doclingProcess.unref();
      const startupTimer = setTimeout(
        resolve,
        DOCLING_ENVIRONMENT.STARTUP_DELAY_MS
      );
      doclingProcess.on("error", (error) => {
        // No point waiting out the startup delay once spawning has failed.
        clearTimeout(startupTimer);
        this.logger.error("[DoclingEnvironment] docling-serve error:", error);
        reject(error);
      });
    });
  }
};
340
+
341
+ // src/core/pdf-converter.ts
342
+ import { omit } from "es-toolkit";
343
+ import { createWriteStream as createWriteStream2, existsSync as existsSync3, rmSync as rmSync3 } from "fs";
344
+ import { join as join4 } from "path";
345
+ import { pipeline } from "stream/promises";
346
+
347
+ // src/errors/image-pdf-fallback-error.ts
348
/**
 * Error raised when both the original conversion and the image-PDF fallback
 * failed. Both underlying errors are kept on the instance as `originalError`
 * and `fallbackError`, and their messages are combined into this error's
 * message.
 */
var ImagePdfFallbackError = class extends Error {
  constructor(originalError, fallbackError) {
    const summary = `PDF conversion failed with fallback. Original: ${originalError.message}. Fallback: ${fallbackError.message}`;
    super(summary);
    this.name = "ImagePdfFallbackError";
    this.originalError = originalError;
    this.fallbackError = fallbackError;
  }
};
358
+
359
+ // src/processors/image-extractor.ts
360
+ import {
361
+ createWriteStream,
362
+ existsSync,
363
+ mkdirSync,
364
+ readFileSync,
365
+ readdirSync,
366
+ rmSync,
367
+ writeFileSync
368
+ } from "fs";
369
+ import { extname, join as join2 } from "path";
370
+ import * as yauzl from "yauzl";
371
+
372
+ // src/utils/jq.ts
373
+ import { spawn as spawn3 } from "child_process";
374
/**
 * Resolve the jq executable to run.
 *
 * @returns The trimmed `JQ_PATH` environment variable when it is set to a
 *   non-blank value, otherwise `"jq"`.
 */
function getJqPath() {
  const configured = process.env.JQ_PATH?.trim();
  if (!configured) {
    return "jq";
  }
  return configured;
}
378
/**
 * Run `jq -c <program> <filePath>` and parse what it prints as JSON.
 *
 * @param program - jq filter source.
 * @param filePath - Path of the JSON file handed to jq.
 * @returns Promise resolving to the parsed output. Rejects when jq cannot be
 *   spawned, exits with a non-zero code, or prints something that is not
 *   valid JSON.
 */
function runJqFileJson(program, filePath) {
  return new Promise((resolve, reject) => {
    // "-c" asks jq for compact output (single line when possible).
    const child = spawn3(getJqPath(), ["-c", program, filePath], {
      stdio: ["ignore", "pipe", "pipe"],
      env: process.env
    });
    const captured = { stdout: "", stderr: "" };
    for (const streamName of ["stdout", "stderr"]) {
      child[streamName].setEncoding("utf-8");
      child[streamName].on("data", (chunk) => {
        captured[streamName] += chunk;
      });
    }
    child.on("error", (err) => {
      reject(err);
    });
    child.on("close", (code) => {
      if (code !== 0) {
        const detail = captured.stderr ? "Stderr: " + captured.stderr : "";
        reject(new Error(`jq exited with code ${code}. ${detail}`));
        return;
      }
      let parsed;
      try {
        parsed = JSON.parse(captured.stdout.trim());
      } catch (e) {
        reject(
          new Error(
            `Failed to parse jq output as JSON. Output length=${captured.stdout.length}. Error: ${e.message}`
          )
        );
        return;
      }
      resolve(parsed);
    });
  });
}
425
/**
 * Collect every string value in a JSON file that starts with
 * `data:image/png;base64`, searching the document recursively with jq.
 *
 * @param filePath - Path of the JSON file to scan.
 * @returns Promise resolving to whatever `runJqFileJson` parses from jq's
 *   output (the filter builds an array of the matching strings).
 */
function jqExtractBase64PngStrings(filePath) {
  const collectPngDataUris = `
    [
      .. |
      select(type == "string" and startswith("data:image/png;base64"))
    ]
  `;
  return runJqFileJson(collectPngDataUris, filePath);
}
434
/**
 * Replace every string in a JSON file that starts with
 * `data:image/png;base64` by a path of the form
 * `<dirName>/<prefix>_<index>.png`, where `<index>` counts the replacements
 * from 0 in the order jq's `paths` visits them.
 *
 * @param filePath - Path of the JSON file to transform.
 * @param dirName - Directory part of the generated paths.
 * @param prefix - File-name prefix of the generated paths.
 * @returns Promise resolving to jq's output, `{ data, count }`: the
 *   transformed document and the number of replacements.
 */
function jqReplaceBase64WithPaths(filePath, dirName, prefix) {
  // Emit the caller-supplied part as an escaped string literal and append the
  // index with jq operators. Splicing the raw values into a jq string would
  // let a quote, backslash or "\(" in dirName/prefix break or alter the filter.
  const pathPrefixLiteral = JSON.stringify(`${dirName}/${prefix}_`);
  const program = `
    reduce paths(type == "string" and startswith("data:image/png;base64")) as $p (
      {data: ., counter: 0};
      .counter as $idx |
      .data |= setpath($p; ${pathPrefixLiteral} + ($idx | tostring) + ".png") |
      .counter += 1
    ) | {data: .data, count: .counter}
  `;
  return runJqFileJson(program, filePath);
}
445
+
446
+ // src/processors/image-extractor.ts
447
/**
 * Static helpers that unpack a conversion result ZIP and move embedded
 * base64 PNG images out of the JSON and HTML documents into separate files.
 */
var ImageExtractor = class _ImageExtractor {
  /**
   * Extract a ZIP file to a target directory.
   *
   * Entries are read one at a time; the promise resolves when the archive has
   * been read to the end and rejects on the first failure (opening the
   * archive, opening an entry, creating a directory, reading or writing).
   *
   * @param zipPath - Path of the ZIP archive.
   * @param targetDir - Directory the entries are written under.
   */
  static async extractZip(zipPath, targetDir) {
    return new Promise((resolve, reject) => {
      yauzl.open(zipPath, { lazyEntries: true }, (err, zipfile) => {
        if (err || !zipfile) {
          reject(err || new Error("Failed to open zip file"));
          return;
        }
        // Close the archive and reject; used for every failure after opening
        // so the handle is not left open once entries stop being read.
        const fail = (error) => {
          zipfile.close();
          reject(error);
        };
        zipfile.on("entry", (entry) => {
          // Synchronous fs errors inside event callbacks would otherwise
          // surface as uncaught exceptions instead of rejecting the promise.
          try {
            const entryPath = join2(targetDir, entry.fileName);
            // A trailing slash marks a directory entry.
            if (/\/$/.test(entry.fileName)) {
              mkdirSync(entryPath, { recursive: true });
              zipfile.readEntry();
              return;
            }
            zipfile.openReadStream(entry, (err2, readStream) => {
              if (err2 || !readStream) {
                fail(err2 || new Error("Failed to open read stream"));
                return;
              }
              let writeStream;
              try {
                mkdirSync(join2(entryPath, ".."), { recursive: true });
                writeStream = createWriteStream(entryPath);
              } catch (e) {
                fail(e);
                return;
              }
              readStream.on("error", (readError) => {
                // pipe() does not forward source errors to the destination.
                writeStream.destroy();
                fail(readError);
              });
              writeStream.on("error", fail);
              writeStream.on("finish", () => {
                zipfile.readEntry();
              });
              readStream.pipe(writeStream);
            });
          } catch (e) {
            fail(e);
          }
        });
        zipfile.on("end", () => {
          resolve();
        });
        zipfile.on("error", reject);
        zipfile.readEntry();
      });
    });
  }
  /**
   * Extract base64 images from JSON file using jq (for large files).
   * Returns array of base64 data strings.
   */
  static async extractBase64ImagesFromJsonWithJq(jsonPath) {
    return jqExtractBase64PngStrings(jsonPath);
  }
  /**
   * Replace base64 images with file paths in JSON using jq (for large files)
   * and write the pretty-printed result to `outputPath`.
   *
   * @returns The number of replaced images reported by jq.
   */
  static async replaceBase64ImagesInJsonWithJq(jsonPath, outputPath, dirName, prefix) {
    const { data, count } = await jqReplaceBase64WithPaths(
      jsonPath,
      dirName,
      prefix
    );
    writeFileSync(outputPath, JSON.stringify(data, null, 2), "utf-8");
    return count;
  }
  /**
   * Extract a base64-encoded image to a file and return the relative path.
   *
   * @param base64Data - Base64 text, with or without the
   *   `data:image/png;base64,` prefix.
   * @param imagesDir - Directory the PNG is written to.
   * @param index - Number used in the file name.
   * @param prefix - File-name prefix.
   * @param dirName - Directory name used in the returned relative path.
   * @returns `<dirName>/<prefix>_<index>.png`
   */
  static extractBase64ImageToFile(base64Data, imagesDir, index, prefix, dirName) {
    const PREFIX = "data:image/png;base64,";
    const base64Content = base64Data.startsWith(PREFIX) ? base64Data.slice(PREFIX.length) : base64Data;
    const filename = `${prefix}_${index}.png`;
    const filepath = join2(imagesDir, filename);
    const buffer = Buffer.from(base64Content, "base64");
    writeFileSync(filepath, buffer);
    return `${dirName}/${filename}`;
  }
  /**
   * Save JSON and HTML documents with base64 images extracted to separate files.
   * Uses jq for JSON processing to handle large files.
   *
   * This method:
   * 1. Clears and recreates the output directory
   * 2. Extracts base64-encoded images from the JSON into `pages/` and from
   *    the HTML into `images/`
   * 3. Replaces base64 data with relative file paths
   * 4. Saves the transformed documents to the output directory
   *
   * A failure while processing the JSON is logged and rethrown; a failure
   * while processing the HTML is logged and the original HTML is written.
   */
  static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlContent) {
    try {
      if (existsSync(outputDir)) {
        rmSync(outputDir, { recursive: true, force: true });
      }
    } catch (e) {
      logger.warn("[PDFConverter] Failed to clear output directory:", e);
    }
    mkdirSync(outputDir, { recursive: true });
    // Strip only the trailing extension; String.replace would remove the
    // first occurrence of the extension text wherever it appears.
    const extension = extname(filename);
    const baseName = extension ? filename.slice(0, -extension.length) : filename;
    const jsonPath = join2(outputDir, `${baseName}.json`);
    try {
      const pagesDir = join2(outputDir, "pages");
      if (!existsSync(pagesDir)) {
        mkdirSync(pagesDir, { recursive: true });
      }
      const base64Images = await _ImageExtractor.extractBase64ImagesFromJsonWithJq(jsonSourcePath);
      base64Images.forEach((base64Data, index) => {
        _ImageExtractor.extractBase64ImageToFile(
          base64Data,
          pagesDir,
          index,
          "page",
          "pages"
        );
      });
      logger.info(
        `[PDFConverter] Extracted ${base64Images.length} images from JSON to ${pagesDir}`
      );
      const replacedCount = await _ImageExtractor.replaceBase64ImagesInJsonWithJq(
        jsonSourcePath,
        jsonPath,
        "pages",
        "page"
      );
      logger.info(
        `[PDFConverter] Replaced ${replacedCount} base64 images with file paths`
      );
    } catch (e) {
      logger.warn(
        "[PDFConverter] Failed to extract images from JSON using jq. Error:",
        e
      );
      throw e;
    }
    logger.info("[PDFConverter] Saved JSON:", jsonPath);
    const htmlPath = join2(outputDir, `${baseName}.html`);
    try {
      const imagesDir = join2(outputDir, "images");
      if (!existsSync(imagesDir)) {
        mkdirSync(imagesDir, { recursive: true });
      }
      let imageIndex = 0;
      const transformedHtml = htmlContent.replace(
        /src="data:image\/png;base64,([^"]+)"/g,
        (_, base64Content) => {
          const filename2 = `image_${imageIndex}.png`;
          const filepath = join2(imagesDir, filename2);
          const buffer = Buffer.from(base64Content, "base64");
          writeFileSync(filepath, buffer);
          const relativePath = `images/${filename2}`;
          imageIndex += 1;
          return `src="${relativePath}"`;
        }
      );
      logger.info(
        `[PDFConverter] Extracted ${imageIndex} images from HTML to ${imagesDir}`
      );
      writeFileSync(htmlPath, transformedHtml, "utf-8");
    } catch (e) {
      logger.warn(
        "[PDFConverter] Failed to extract images from HTML, writing original. Error:",
        e
      );
      writeFileSync(htmlPath, htmlContent, "utf-8");
    }
    logger.info("[PDFConverter] Saved HTML:", htmlPath);
  }
  /**
   * Extract documents from ZIP and save with extracted images.
   * Uses jq for JSON processing to handle large files without loading into Node.js memory.
   *
   * Complete workflow:
   * 1. Extract ZIP file to temporary directory
   * 2. Find JSON and HTML files from extracted files
   * 3. Use jq to extract base64 images from JSON and save as separate files
   * 4. Use jq to replace base64 with file paths in JSON
   * 5. Process HTML with regex to extract and replace images
   * 6. Save transformed documents to output directory (as result.json and result.html)
   *
   * @throws Error when the extracted directory lacks a `.json` or `.html` file.
   */
  static async extractAndSaveDocumentsFromZip(logger, zipPath, extractDir, outputDir) {
    logger.info("[PDFConverter] Extracting ZIP file...");
    await _ImageExtractor.extractZip(zipPath, extractDir);
    const files = readdirSync(extractDir);
    const jsonFile = files.find((f) => extname(f).toLowerCase() === ".json");
    const htmlFile = files.find((f) => extname(f).toLowerCase() === ".html");
    if (!jsonFile || !htmlFile) {
      throw new Error(
        `Expected one JSON and one HTML file in extracted directory. Found: ${files.join(", ")}`
      );
    }
    const jsonPath = join2(extractDir, jsonFile);
    const htmlPath = join2(extractDir, htmlFile);
    const htmlContent = readFileSync(htmlPath, "utf-8");
    logger.info("[PDFConverter] Saving converted files to output...");
    await _ImageExtractor.saveDocumentsWithExtractedImages(
      logger,
      outputDir,
      "result",
      jsonPath,
      htmlContent
    );
    logger.info("[PDFConverter] Files saved to:", outputDir);
  }
};
644
+
645
+ // src/utils/local-file-server.ts
646
+ import { createReadStream, statSync } from "fs";
647
+ import { createServer } from "http";
648
+ import { basename } from "path";
649
/**
 * Minimal HTTP server bound to 127.0.0.1 that serves exactly one file.
 */
var LocalFileServer = class {
  server = null;
  port = 0;
  /**
   * Start serving a file and return the URL.
   *
   * The server listens on an OS-assigned port. Only the path matching the
   * file's base name answers 200; every other path answers 404. The file name
   * is percent-encoded in the returned URL and the request path is decoded
   * before comparison, so names with spaces or non-ASCII characters work.
   *
   * @param filePath Absolute path to the file to serve
   * @returns URL to access the file
   */
  async start(filePath) {
    const filename = basename(filePath);
    const stat = statSync(filePath);
    const expectedPath = `/${filename}`;
    return new Promise((resolve, reject) => {
      const server = createServer((req, res) => {
        let requestedPath = null;
        try {
          // Drop any query string and undo percent-encoding applied by the client.
          const { pathname } = new URL(req.url ?? "/", "http://127.0.0.1");
          requestedPath = decodeURIComponent(pathname);
        } catch {
          requestedPath = null;
        }
        if (requestedPath !== expectedPath) {
          res.writeHead(404);
          res.end("Not Found");
          return;
        }
        res.writeHead(200, {
          "Content-Type": "application/pdf",
          "Content-Length": stat.size
        });
        const fileStream = createReadStream(filePath);
        // Without a listener a read failure would be an uncaught exception;
        // headers are already sent, so the only option is to drop the response.
        fileStream.on("error", () => {
          res.destroy();
        });
        fileStream.pipe(res);
      });
      this.server = server;
      server.on("error", reject);
      server.listen(0, "127.0.0.1", () => {
        const address = server.address();
        if (typeof address === "object" && address !== null) {
          this.port = address.port;
          resolve(
            `http://127.0.0.1:${this.port}/${encodeURIComponent(filename)}`
          );
        } else {
          reject(new Error("Failed to get server address"));
        }
      });
    });
  }
  /**
   * Stop the server. Resolves once the server has closed, or immediately when
   * it was never started.
   */
  stop() {
    return new Promise((resolve) => {
      if (this.server) {
        this.server.close(() => {
          this.server = null;
          this.port = 0;
          resolve();
        });
      } else {
        resolve();
      }
    });
  }
};
703
+
704
+ // src/core/image-pdf-converter.ts
705
+ import { existsSync as existsSync2, rmSync as rmSync2 } from "fs";
706
+ import { tmpdir } from "os";
707
+ import { join as join3 } from "path";
708
/**
 * Converts a PDF into an image-based PDF by downloading it with curl and
 * running it through ImageMagick.
 */
var ImagePdfConverter = class {
  /**
   * @param logger - Logger used for progress messages (`info`).
   */
  constructor(logger) {
    this.logger = logger;
  }
  /**
   * Convert a PDF file to an image-based PDF.
   * Downloads the PDF from URL, converts it using ImageMagick, and returns the path.
   *
   * The downloaded input file is always removed. If the download or the
   * conversion fails, any partially written output file is removed as well;
   * on success the caller owns the output file (see `cleanup`).
   *
   * @param pdfUrl - URL of the source PDF
   * @param reportId - Report identifier for temp file naming
   * @returns Path to the converted image PDF in temp directory
   */
  async convert(pdfUrl, reportId) {
    const timestamp = Date.now();
    const tempDir = tmpdir();
    const inputPath = join3(tempDir, `${reportId}-${timestamp}-input.pdf`);
    const outputPath = join3(tempDir, `${reportId}-${timestamp}-image.pdf`);
    let succeeded = false;
    try {
      this.logger.info("[ImagePdfConverter] Downloading PDF from URL...");
      await this.downloadPdf(pdfUrl, inputPath);
      this.logger.info("[ImagePdfConverter] Converting to image PDF...");
      await this.convertToImagePdf(inputPath, outputPath);
      this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
      succeeded = true;
      return outputPath;
    } finally {
      if (existsSync2(inputPath)) {
        rmSync2(inputPath, { force: true });
      }
      // The caller never learns the output path on failure, so it could not
      // clean up a half-written file itself.
      if (!succeeded && existsSync2(outputPath)) {
        rmSync2(outputPath, { force: true });
      }
    }
  }
  /**
   * Download PDF from URL to local path using curl.
   *
   * @throws Error when curl exits with a non-zero code.
   */
  async downloadPdf(url, outputPath) {
    const result = await spawnAsync("curl", [
      // Follow redirects
      "-L",
      "-o",
      outputPath,
      // Silent mode
      "-s",
      // Fail on HTTP errors
      "--fail",
      url
    ]);
    if (result.code !== 0) {
      throw new Error(
        `Failed to download PDF: ${result.stderr || "Unknown error"}`
      );
    }
  }
  /**
   * Convert PDF to image-based PDF using ImageMagick (`magick`), with the
   * density and quality taken from `IMAGE_PDF_CONVERTER`.
   *
   * @throws Error when `magick` exits with a non-zero code.
   */
  async convertToImagePdf(inputPath, outputPath) {
    const result = await spawnAsync("magick", [
      "-density",
      IMAGE_PDF_CONVERTER.DENSITY.toString(),
      inputPath,
      "-quality",
      IMAGE_PDF_CONVERTER.QUALITY.toString(),
      outputPath
    ]);
    if (result.code !== 0) {
      throw new Error(
        `Failed to convert PDF to image PDF: ${result.stderr || "Unknown error"}`
      );
    }
  }
  /**
   * Cleanup the temporary image PDF file, if it still exists.
   */
  cleanup(imagePdfPath) {
    if (existsSync2(imagePdfPath)) {
      this.logger.info(
        "[ImagePdfConverter] Cleaning up temp file:",
        imagePdfPath
      );
      rmSync2(imagePdfPath, { force: true });
    }
  }
};
790
+
791
+ // src/core/pdf-converter.ts
792
+ var PDFConverter = class {
793
+ constructor(logger, client, enableImagePdfFallback = false) {
794
+ this.logger = logger;
795
+ this.client = client;
796
+ this.enableImagePdfFallback = enableImagePdfFallback;
797
+ }
798
+ async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
799
+ this.logger.info("[PDFConverter] Converting:", url);
800
+ let originalError = null;
801
+ try {
802
+ await this.performConversion(
803
+ url,
804
+ reportId,
805
+ onComplete,
806
+ cleanupAfterCallback,
807
+ options,
808
+ abortSignal
809
+ );
810
+ return;
811
+ } catch (error) {
812
+ if (abortSignal?.aborted) {
813
+ throw error;
814
+ }
815
+ originalError = error;
816
+ this.logger.error("[PDFConverter] Conversion failed:", error);
817
+ if (!this.enableImagePdfFallback) {
818
+ throw error;
819
+ }
820
+ }
821
+ this.logger.info("[PDFConverter] Attempting image PDF fallback...");
822
+ const imagePdfConverter = new ImagePdfConverter(this.logger);
823
+ let imagePdfPath = null;
824
+ try {
825
+ imagePdfPath = await imagePdfConverter.convert(url, reportId);
826
+ const localUrl = `file://${imagePdfPath}`;
827
+ this.logger.info("[PDFConverter] Retrying with image PDF:", localUrl);
828
+ await this.performConversion(
829
+ localUrl,
830
+ reportId,
831
+ onComplete,
832
+ cleanupAfterCallback,
833
+ options,
834
+ abortSignal
835
+ );
836
+ this.logger.info("[PDFConverter] Fallback conversion succeeded");
837
+ } catch (fallbackError) {
838
+ this.logger.error(
839
+ "[PDFConverter] Fallback conversion also failed:",
840
+ fallbackError
841
+ );
842
+ throw new ImagePdfFallbackError(originalError, fallbackError);
843
+ } finally {
844
+ if (imagePdfPath) {
845
+ imagePdfConverter.cleanup(imagePdfPath);
846
+ }
847
+ }
848
+ }
849
+ async performConversion(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
850
+ const startTime = Date.now();
851
+ const conversionOptions = this.buildConversionOptions(options);
852
+ this.logger.info(
853
+ "[PDFConverter] Converting document with Async Source API..."
854
+ );
855
+ this.logger.info("[PDFConverter] Server will download from URL directly");
856
+ this.logger.info(
857
+ "[PDFConverter] Results will be returned as ZIP to avoid memory limits"
858
+ );
859
+ const { httpUrl, server } = await this.resolveUrl(url);
860
+ try {
861
+ const task = await this.startConversionTask(httpUrl, conversionOptions);
862
+ await this.trackTaskProgress(task);
863
+ if (abortSignal?.aborted) {
864
+ this.logger.info(
865
+ "[PDFConverter] Conversion aborted after docling completion"
866
+ );
867
+ const error = new Error("PDF conversion was aborted");
868
+ error.name = "AbortError";
869
+ throw error;
870
+ }
871
+ await this.downloadResult(task.taskId);
872
+ } finally {
873
+ if (server) {
874
+ this.logger.info("[PDFConverter] Stopping local file server...");
875
+ await server.stop();
876
+ }
877
+ }
878
+ const cwd = process.cwd();
879
+ const zipPath = join4(cwd, "result.zip");
880
+ const extractDir = join4(cwd, "result_extracted");
881
+ const outputDir = join4(cwd, "output", reportId);
882
+ try {
883
+ await this.processConvertedFiles(zipPath, extractDir, outputDir);
884
+ if (abortSignal?.aborted) {
885
+ this.logger.info("[PDFConverter] Conversion aborted before callback");
886
+ const error = new Error("PDF conversion was aborted");
887
+ error.name = "AbortError";
888
+ throw error;
889
+ }
890
+ this.logger.info("[PDFConverter] Executing completion callback...");
891
+ await onComplete(outputDir);
892
+ const duration = Date.now() - startTime;
893
+ this.logger.info("[PDFConverter] Conversion completed successfully!");
894
+ this.logger.info("[PDFConverter] Total time:", duration, "ms");
895
+ } finally {
896
+ this.logger.info("[PDFConverter] Cleaning up temporary files...");
897
+ if (existsSync3(zipPath)) {
898
+ rmSync3(zipPath, { force: true });
899
+ }
900
+ if (existsSync3(extractDir)) {
901
+ rmSync3(extractDir, { recursive: true, force: true });
902
+ }
903
+ if (cleanupAfterCallback) {
904
+ this.logger.info(
905
+ "[PDFConverter] Cleaning up output directory:",
906
+ outputDir
907
+ );
908
+ if (existsSync3(outputDir)) {
909
+ rmSync3(outputDir, { recursive: true, force: true });
910
+ }
911
+ } else {
912
+ this.logger.info("[PDFConverter] Output preserved at:", outputDir);
913
+ }
914
+ }
915
+ }
916
+ buildConversionOptions(options) {
917
+ return {
918
+ ...omit(options, ["num_threads"]),
919
+ to_formats: ["json", "html"],
920
+ image_export_mode: "embedded",
921
+ ocr_engine: "ocrmac",
922
+ generate_picture_images: true,
923
+ images_scale: 2,
924
+ /**
925
+ * While disabling this option yields the most accurate text extraction for readable PDFs,
926
+ * text layers overlaid on images or drawings can introduce noise when not merged properly.
927
+ * In practice, archaeological report PDFs almost always contain such overlapping cases.
928
+ * Enabling force_ocr mitigates this risk. Although OCR may introduce minor errors compared
929
+ * to direct text extraction, the accuracy remains high since the source is digital, not scanned paper.
930
+ */
931
+ force_ocr: true,
932
+ accelerator_options: {
933
+ device: "mps",
934
+ num_threads: options.num_threads
935
+ }
936
+ };
937
+ }
938
+ async startConversionTask(url, conversionOptions) {
939
+ const task = await this.client.convertSourceAsync({
940
+ sources: [
941
+ {
942
+ kind: "http",
943
+ url
944
+ }
945
+ ],
946
+ options: conversionOptions,
947
+ target: {
948
+ kind: "zip"
949
+ }
950
+ });
951
+ this.logger.info(`[PDFConverter] Task created: ${task.taskId}`);
952
+ this.logger.info("[PDFConverter] Polling for progress...");
953
+ return task;
954
+ }
955
+ /**
956
+ * Start a local file server for file:// URLs
957
+ *
958
+ * @param url URL to check (file:// or http://)
959
+ * @returns Object with httpUrl and optional server to stop later
960
+ */
961
+ async resolveUrl(url) {
962
+ if (url.startsWith("file://")) {
963
+ const filePath = url.slice(7);
964
+ const server = new LocalFileServer();
965
+ const httpUrl = await server.start(filePath);
966
+ this.logger.info("[PDFConverter] Started local file server:", httpUrl);
967
+ return { httpUrl, server };
968
+ }
969
+ return { httpUrl: url };
970
+ }
971
+ async trackTaskProgress(task) {
972
+ const conversionStartTime = Date.now();
973
+ let lastStatus = "";
974
+ let isCompleted = false;
975
+ const pollInterval = setInterval(() => {
976
+ if (isCompleted) return;
977
+ const elapsed = Math.floor((Date.now() - conversionStartTime) / 1e3);
978
+ process.stdout.write(
979
+ `\r[PDFConverter] Status: ${lastStatus || "processing"} (${elapsed}s elapsed)`
980
+ );
981
+ }, PDF_CONVERTER.POLL_INTERVAL_MS);
982
+ task.on("progress", (status) => {
983
+ lastStatus = status.task_status;
984
+ if (status.task_position !== void 0) {
985
+ process.stdout.write(
986
+ `\r[PDFConverter] Status: ${status.task_status} (position: ${status.task_position})`
987
+ );
988
+ }
989
+ });
990
+ task.on("complete", () => {
991
+ isCompleted = true;
992
+ clearInterval(pollInterval);
993
+ this.logger.info("\n[PDFConverter] Conversion completed!");
994
+ });
995
+ task.on("error", (error) => {
996
+ isCompleted = true;
997
+ clearInterval(pollInterval);
998
+ this.logger.error("\n[PDFConverter] Conversion error:", error.message);
999
+ });
1000
+ try {
1001
+ await task.waitForCompletion();
1002
+ } finally {
1003
+ isCompleted = true;
1004
+ clearInterval(pollInterval);
1005
+ }
1006
+ }
1007
+ async downloadResult(taskId) {
1008
+ this.logger.info(
1009
+ "\n[PDFConverter] Task completed, downloading ZIP file..."
1010
+ );
1011
+ const zipResult = await this.client.getTaskResultFile(taskId);
1012
+ if (!zipResult.success || !zipResult.fileStream) {
1013
+ throw new Error("Failed to get ZIP file result");
1014
+ }
1015
+ const zipPath = join4(process.cwd(), "result.zip");
1016
+ this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
1017
+ const writeStream = createWriteStream2(zipPath);
1018
+ await pipeline(zipResult.fileStream, writeStream);
1019
+ }
1020
+ async processConvertedFiles(zipPath, extractDir, outputDir) {
1021
+ await ImageExtractor.extractAndSaveDocumentsFromZip(
1022
+ this.logger,
1023
+ zipPath,
1024
+ extractDir,
1025
+ outputDir
1026
+ );
1027
+ }
1028
+ };
1029
+
1030
+ // src/core/pdf-parser.ts
1031
+ var PDFParser = class {
1032
+ logger;
1033
+ port;
1034
+ baseUrl;
1035
+ timeout;
1036
+ venvPath;
1037
+ killExistingProcess;
1038
+ enableImagePdfFallback;
1039
+ client = null;
1040
+ constructor(options) {
1041
+ const {
1042
+ logger,
1043
+ timeout = PDF_PARSER.DEFAULT_TIMEOUT_MS,
1044
+ venvPath,
1045
+ killExistingProcess = false,
1046
+ enableImagePdfFallback = false
1047
+ } = options;
1048
+ this.logger = logger;
1049
+ if ("baseUrl" in options) {
1050
+ this.baseUrl = options.baseUrl;
1051
+ this.port = void 0;
1052
+ } else {
1053
+ this.port = options.port;
1054
+ this.baseUrl = void 0;
1055
+ }
1056
+ this.timeout = timeout;
1057
+ this.venvPath = venvPath || join5(process.cwd(), ".venv");
1058
+ this.killExistingProcess = killExistingProcess;
1059
+ this.enableImagePdfFallback = enableImagePdfFallback;
1060
+ }
1061
+ async init() {
1062
+ this.logger.info("[PDFParser] Initializing...");
1063
+ this.checkOperatingSystem();
1064
+ this.checkJqInstalled();
1065
+ this.checkMacOSVersion();
1066
+ if (this.enableImagePdfFallback && !this.baseUrl) {
1067
+ this.checkImageMagickInstalled();
1068
+ this.checkGhostscriptInstalled();
1069
+ } else if (this.enableImagePdfFallback && this.baseUrl) {
1070
+ this.logger.warn(
1071
+ "[PDFParser] enableImagePdfFallback is ignored when using external server (baseUrl)"
1072
+ );
1073
+ }
1074
+ if (this.baseUrl) {
1075
+ this.logger.info("[PDFParser] Using external server:", this.baseUrl);
1076
+ this.client = new Docling({
1077
+ api: { baseUrl: this.baseUrl, timeout: this.timeout }
1078
+ });
1079
+ await this.waitForServerReady();
1080
+ return;
1081
+ }
1082
+ this.logger.info("[PDFParser] Setting up local server...");
1083
+ try {
1084
+ const environment = new DoclingEnvironment({
1085
+ logger: this.logger,
1086
+ venvPath: this.venvPath,
1087
+ port: this.port,
1088
+ killExistingProcess: this.killExistingProcess
1089
+ });
1090
+ await environment.setup();
1091
+ const clientUrl = `http://localhost:${this.port}`;
1092
+ this.client = new Docling({
1093
+ api: {
1094
+ baseUrl: clientUrl,
1095
+ timeout: this.timeout
1096
+ }
1097
+ });
1098
+ await this.waitForServerReady();
1099
+ this.logger.info("[PDFParser] Ready");
1100
+ } catch (error) {
1101
+ this.logger.error("[PDFParser] Initialization failed:", error);
1102
+ throw new Error(`Failed to initialize PDFParser: ${error}`);
1103
+ }
1104
+ }
1105
+ checkOperatingSystem() {
1106
+ if (platform() !== "darwin") {
1107
+ throw new Error(
1108
+ "PDFParser is only supported on macOS. Current platform: " + platform()
1109
+ );
1110
+ }
1111
+ }
1112
+ checkJqInstalled() {
1113
+ try {
1114
+ execSync("which jq", { stdio: "ignore" });
1115
+ } catch {
1116
+ throw new Error(
1117
+ "jq is not installed. Please install jq using: brew install jq"
1118
+ );
1119
+ }
1120
+ }
1121
+ checkMacOSVersion() {
1122
+ try {
1123
+ const versionOutput = execSync("sw_vers -productVersion", {
1124
+ encoding: "utf-8"
1125
+ }).trim();
1126
+ const versionMatch = versionOutput.match(/^(\d+)\.(\d+)/);
1127
+ if (versionMatch) {
1128
+ const major = parseInt(versionMatch[1]);
1129
+ const minor = parseInt(versionMatch[2]);
1130
+ if (major < 10 || major === 10 && minor < 15) {
1131
+ throw new Error(
1132
+ `macOS 10.15 or later is required. Current version: ${versionOutput}`
1133
+ );
1134
+ }
1135
+ }
1136
+ } catch (error) {
1137
+ if (error instanceof Error && error.message.includes("macOS 10.15")) {
1138
+ throw error;
1139
+ }
1140
+ throw new Error("Failed to check macOS version");
1141
+ }
1142
+ }
1143
+ checkImageMagickInstalled() {
1144
+ try {
1145
+ execSync("which magick", { stdio: "ignore" });
1146
+ } catch {
1147
+ throw new Error(
1148
+ "ImageMagick is not installed but enableImagePdfFallback is enabled. Please install ImageMagick using: brew install imagemagick"
1149
+ );
1150
+ }
1151
+ }
1152
+ checkGhostscriptInstalled() {
1153
+ try {
1154
+ execSync("which gs", { stdio: "ignore" });
1155
+ } catch {
1156
+ throw new Error(
1157
+ "Ghostscript is not installed but enableImagePdfFallback is enabled. Please install Ghostscript using: brew install ghostscript"
1158
+ );
1159
+ }
1160
+ }
1161
+ /**
1162
+ * Check if an error is a connection refused error (ECONNREFUSED).
1163
+ * This typically indicates the Docling server has crashed.
1164
+ */
1165
+ isConnectionRefusedError(error) {
1166
+ if (error instanceof Error) {
1167
+ const errorStr = JSON.stringify(error);
1168
+ return errorStr.includes("ECONNREFUSED");
1169
+ }
1170
+ return false;
1171
+ }
1172
+ /**
1173
+ * Restart the Docling server after it has crashed.
1174
+ * This kills any existing process on the port, starts a new server,
1175
+ * and waits for it to become ready.
1176
+ *
1177
+ * Note: This method is only called when canRecover is true,
1178
+ * which guarantees this.port is defined.
1179
+ */
1180
+ async restartServer() {
1181
+ this.logger.info("[PDFParser] Restarting server...");
1182
+ await DoclingEnvironment.killProcessOnPort(this.logger, this.port);
1183
+ const environment = new DoclingEnvironment({
1184
+ logger: this.logger,
1185
+ venvPath: this.venvPath,
1186
+ port: this.port,
1187
+ killExistingProcess: false
1188
+ // Already killed above
1189
+ });
1190
+ await environment.startServer();
1191
+ this.client?.destroy();
1192
+ this.client = new Docling({
1193
+ api: {
1194
+ baseUrl: `http://localhost:${this.port}`,
1195
+ timeout: this.timeout
1196
+ }
1197
+ });
1198
+ await this.waitForServerReady();
1199
+ this.logger.info("[PDFParser] Server restarted successfully");
1200
+ }
1201
+ async waitForServerReady() {
1202
+ const maxAttempts = PDF_PARSER.MAX_HEALTH_CHECK_ATTEMPTS;
1203
+ const checkInterval = PDF_PARSER.HEALTH_CHECK_INTERVAL_MS;
1204
+ const logInterval = PDF_PARSER.HEALTH_CHECK_LOG_INTERVAL_MS;
1205
+ let lastLogTime = 0;
1206
+ for (let attempt = 1; attempt <= maxAttempts; attempt++) {
1207
+ try {
1208
+ await this.client.health();
1209
+ this.logger.info("[PDFParser] Server is ready");
1210
+ return;
1211
+ } catch {
1212
+ const now = Date.now();
1213
+ if (now - lastLogTime >= logInterval) {
1214
+ this.logger.info(
1215
+ "[PDFParser] Waiting for server... (attempt",
1216
+ attempt,
1217
+ "/",
1218
+ maxAttempts,
1219
+ ")"
1220
+ );
1221
+ lastLogTime = now;
1222
+ }
1223
+ if (attempt < maxAttempts) {
1224
+ await new Promise((resolve) => setTimeout(resolve, checkInterval));
1225
+ }
1226
+ }
1227
+ }
1228
+ throw new Error("Server failed to become ready after maximum attempts");
1229
+ }
1230
+ async parse(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
1231
+ if (!this.client) {
1232
+ throw new Error(
1233
+ "PDFParser is not initialized. Call init() before using parse()"
1234
+ );
1235
+ }
1236
+ const canRecover = !this.baseUrl && this.port !== void 0;
1237
+ const maxAttempts = PDF_PARSER.MAX_SERVER_RECOVERY_ATTEMPTS;
1238
+ let attempt = 0;
1239
+ while (attempt <= maxAttempts) {
1240
+ try {
1241
+ const effectiveFallbackEnabled = this.enableImagePdfFallback && !this.baseUrl;
1242
+ const converter = new PDFConverter(
1243
+ this.logger,
1244
+ this.client,
1245
+ effectiveFallbackEnabled
1246
+ );
1247
+ return await converter.convert(
1248
+ url,
1249
+ reportId,
1250
+ onComplete,
1251
+ cleanupAfterCallback,
1252
+ options,
1253
+ abortSignal
1254
+ );
1255
+ } catch (error) {
1256
+ if (abortSignal?.aborted) {
1257
+ throw error;
1258
+ }
1259
+ if (canRecover && this.isConnectionRefusedError(error) && attempt < maxAttempts) {
1260
+ this.logger.warn(
1261
+ "[PDFParser] Connection refused, attempting server recovery..."
1262
+ );
1263
+ await this.restartServer();
1264
+ attempt++;
1265
+ continue;
1266
+ }
1267
+ throw error;
1268
+ }
1269
+ }
1270
+ }
1271
+ /**
1272
+ * Dispose the parser instance.
1273
+ * - Sets the internal client to null
1274
+ * - If a local docling server was started (no baseUrl), kills the process on the configured port
1275
+ */
1276
+ async dispose() {
1277
+ this.logger.info("[PDFParser] Disposing...");
1278
+ try {
1279
+ if (!this.baseUrl && this.port) {
1280
+ await DoclingEnvironment.killProcessOnPort(this.logger, this.port);
1281
+ }
1282
+ } catch (error) {
1283
+ this.logger.error("[PDFParser] Error while disposing:", error);
1284
+ } finally {
1285
+ this.client?.destroy();
1286
+ this.client = null;
1287
+ this.logger.info("[PDFParser] Disposed");
1288
+ }
1289
+ }
1290
+ };
1291
+ export {
1292
+ ImagePdfFallbackError,
1293
+ PDFParser
1294
+ };
1295
+ //# sourceMappingURL=index.js.map