@heripo/pdf-parser 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-WWNI354M.js +121 -0
- package/dist/chunk-WWNI354M.js.map +1 -0
- package/dist/index.cjs +315 -48
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +19 -5
- package/dist/index.d.ts +19 -5
- package/dist/index.js +195 -41
- package/dist/index.js.map +1 -1
- package/dist/vlm-models.cjs +147 -0
- package/dist/vlm-models.cjs.map +1 -0
- package/dist/vlm-models.d.cts +34 -0
- package/dist/vlm-models.d.ts +34 -0
- package/dist/vlm-models.js +12 -0
- package/dist/vlm-models.js.map +1 -0
- package/package.json +15 -9
package/dist/index.d.cts
CHANGED
|
@@ -1,11 +1,26 @@
|
|
|
1
1
|
import { LoggerMethods } from '@heripo/logger';
|
|
2
|
-
import { ConversionOptions } from 'docling-sdk';
|
|
2
|
+
import { ConversionOptions, VlmModelLocal } from 'docling-sdk';
|
|
3
|
+
export { DEFAULT_VLM_MODEL, VLM_MODELS, VlmModelPreset, resolveVlmModel } from './vlm-models.cjs';
|
|
3
4
|
|
|
4
5
|
/**
|
|
5
6
|
* Callback function invoked after PDF conversion completes
|
|
6
7
|
* @param outputPath Absolute path to the output directory containing result files
|
|
7
8
|
*/
|
|
8
9
|
type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
|
|
10
|
+
/**
|
|
11
|
+
* Pipeline type for PDF conversion
|
|
12
|
+
* - 'standard': Use OCR-based pipeline (default, uses ocrmac)
|
|
13
|
+
* - 'vlm': Use Vision Language Model pipeline for better CJK/complex layout handling
|
|
14
|
+
*/
|
|
15
|
+
type PipelineType = 'standard' | 'vlm';
|
|
16
|
+
/**
|
|
17
|
+
* Extended options for PDF conversion including pipeline selection
|
|
18
|
+
*/
|
|
19
|
+
type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local'> & {
|
|
20
|
+
num_threads?: number;
|
|
21
|
+
pipeline?: PipelineType;
|
|
22
|
+
vlm_model?: string | VlmModelLocal;
|
|
23
|
+
};
|
|
9
24
|
|
|
10
25
|
type Options = {
|
|
11
26
|
logger: LoggerMethods;
|
|
@@ -82,6 +97,7 @@ declare class PDFParser {
|
|
|
82
97
|
private readonly killExistingProcess;
|
|
83
98
|
private readonly enableImagePdfFallback;
|
|
84
99
|
private client;
|
|
100
|
+
private environment?;
|
|
85
101
|
constructor(options: Options);
|
|
86
102
|
init(): Promise<void>;
|
|
87
103
|
private checkOperatingSystem;
|
|
@@ -104,9 +120,7 @@ declare class PDFParser {
|
|
|
104
120
|
*/
|
|
105
121
|
private restartServer;
|
|
106
122
|
private waitForServerReady;
|
|
107
|
-
parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options:
|
|
108
|
-
num_threads?: number;
|
|
109
|
-
}, abortSignal?: AbortSignal): Promise<void>;
|
|
123
|
+
parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options: PDFConvertOptions, abortSignal?: AbortSignal): Promise<void>;
|
|
110
124
|
/**
|
|
111
125
|
* Dispose the parser instance.
|
|
112
126
|
* - Sets the internal client to null
|
|
@@ -126,4 +140,4 @@ declare class ImagePdfFallbackError extends Error {
|
|
|
126
140
|
constructor(originalError: Error, fallbackError: Error);
|
|
127
141
|
}
|
|
128
142
|
|
|
129
|
-
export { type ConversionCompleteCallback, ImagePdfFallbackError, PDFParser };
|
|
143
|
+
export { type ConversionCompleteCallback, ImagePdfFallbackError, type PDFConvertOptions, PDFParser, type PipelineType };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,11 +1,26 @@
|
|
|
1
1
|
import { LoggerMethods } from '@heripo/logger';
|
|
2
|
-
import { ConversionOptions } from 'docling-sdk';
|
|
2
|
+
import { ConversionOptions, VlmModelLocal } from 'docling-sdk';
|
|
3
|
+
export { DEFAULT_VLM_MODEL, VLM_MODELS, VlmModelPreset, resolveVlmModel } from './vlm-models.js';
|
|
3
4
|
|
|
4
5
|
/**
|
|
5
6
|
* Callback function invoked after PDF conversion completes
|
|
6
7
|
* @param outputPath Absolute path to the output directory containing result files
|
|
7
8
|
*/
|
|
8
9
|
type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
|
|
10
|
+
/**
|
|
11
|
+
* Pipeline type for PDF conversion
|
|
12
|
+
* - 'standard': Use OCR-based pipeline (default, uses ocrmac)
|
|
13
|
+
* - 'vlm': Use Vision Language Model pipeline for better CJK/complex layout handling
|
|
14
|
+
*/
|
|
15
|
+
type PipelineType = 'standard' | 'vlm';
|
|
16
|
+
/**
|
|
17
|
+
* Extended options for PDF conversion including pipeline selection
|
|
18
|
+
*/
|
|
19
|
+
type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local'> & {
|
|
20
|
+
num_threads?: number;
|
|
21
|
+
pipeline?: PipelineType;
|
|
22
|
+
vlm_model?: string | VlmModelLocal;
|
|
23
|
+
};
|
|
9
24
|
|
|
10
25
|
type Options = {
|
|
11
26
|
logger: LoggerMethods;
|
|
@@ -82,6 +97,7 @@ declare class PDFParser {
|
|
|
82
97
|
private readonly killExistingProcess;
|
|
83
98
|
private readonly enableImagePdfFallback;
|
|
84
99
|
private client;
|
|
100
|
+
private environment?;
|
|
85
101
|
constructor(options: Options);
|
|
86
102
|
init(): Promise<void>;
|
|
87
103
|
private checkOperatingSystem;
|
|
@@ -104,9 +120,7 @@ declare class PDFParser {
|
|
|
104
120
|
*/
|
|
105
121
|
private restartServer;
|
|
106
122
|
private waitForServerReady;
|
|
107
|
-
parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options:
|
|
108
|
-
num_threads?: number;
|
|
109
|
-
}, abortSignal?: AbortSignal): Promise<void>;
|
|
123
|
+
parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options: PDFConvertOptions, abortSignal?: AbortSignal): Promise<void>;
|
|
110
124
|
/**
|
|
111
125
|
* Dispose the parser instance.
|
|
112
126
|
* - Sets the internal client to null
|
|
@@ -126,4 +140,4 @@ declare class ImagePdfFallbackError extends Error {
|
|
|
126
140
|
constructor(originalError: Error, fallbackError: Error);
|
|
127
141
|
}
|
|
128
142
|
|
|
129
|
-
export { type ConversionCompleteCallback, ImagePdfFallbackError, PDFParser };
|
|
143
|
+
export { type ConversionCompleteCallback, ImagePdfFallbackError, type PDFConvertOptions, PDFParser, type PipelineType };
|
package/dist/index.js
CHANGED
|
@@ -1,9 +1,14 @@
|
|
|
1
|
+
import {
|
|
2
|
+
DEFAULT_VLM_MODEL,
|
|
3
|
+
VLM_MODELS,
|
|
4
|
+
resolveVlmModel
|
|
5
|
+
} from "./chunk-WWNI354M.js";
|
|
1
6
|
import "./chunk-VUNV25KB.js";
|
|
2
7
|
|
|
3
8
|
// src/core/pdf-parser.ts
|
|
4
9
|
import { Docling } from "docling-sdk";
|
|
5
10
|
import { execSync } from "child_process";
|
|
6
|
-
import { platform } from "os";
|
|
11
|
+
import { platform as platform2 } from "os";
|
|
7
12
|
import { join as join5 } from "path";
|
|
8
13
|
|
|
9
14
|
// src/config/constants.ts
|
|
@@ -33,7 +38,11 @@ var PDF_CONVERTER = {
|
|
|
33
38
|
/**
|
|
34
39
|
* Interval for progress polling in milliseconds
|
|
35
40
|
*/
|
|
36
|
-
POLL_INTERVAL_MS: 1e3
|
|
41
|
+
POLL_INTERVAL_MS: 1e3,
|
|
42
|
+
/**
|
|
43
|
+
* Default timeout for task completion in milliseconds (30 minutes)
|
|
44
|
+
*/
|
|
45
|
+
DEFAULT_TIMEOUT_MS: 18e5
|
|
37
46
|
};
|
|
38
47
|
var DOCLING_ENVIRONMENT = {
|
|
39
48
|
/**
|
|
@@ -51,6 +60,19 @@ var IMAGE_PDF_CONVERTER = {
|
|
|
51
60
|
*/
|
|
52
61
|
QUALITY: 100
|
|
53
62
|
};
|
|
63
|
+
var VLM_ENVIRONMENT = {
|
|
64
|
+
/**
|
|
65
|
+
* Timeout for VLM dependency installation (pip install) in milliseconds (3 hours).
|
|
66
|
+
* VLM packages can be very large and may require extended download times
|
|
67
|
+
* depending on network conditions.
|
|
68
|
+
*/
|
|
69
|
+
SETUP_TIMEOUT_MS: 108e5,
|
|
70
|
+
/**
|
|
71
|
+
* Timeout for VLM model download in milliseconds (3 hours).
|
|
72
|
+
* Large VLM models (e.g., multi-GB weights) need sufficient time to download.
|
|
73
|
+
*/
|
|
74
|
+
MODEL_DOWNLOAD_TIMEOUT_MS: 108e5
|
|
75
|
+
};
|
|
54
76
|
|
|
55
77
|
// ../shared/dist/index.mjs
|
|
56
78
|
import { spawn } from "child_process";
|
|
@@ -83,6 +105,7 @@ function spawnAsync(command, args, options = {}) {
|
|
|
83
105
|
|
|
84
106
|
// src/environment/docling-environment.ts
|
|
85
107
|
import { spawn as spawn2 } from "child_process";
|
|
108
|
+
import { arch, platform } from "os";
|
|
86
109
|
import { join } from "path";
|
|
87
110
|
|
|
88
111
|
// src/utils/python-version.ts
|
|
@@ -124,6 +147,7 @@ var DoclingEnvironment = class _DoclingEnvironment {
|
|
|
124
147
|
venvPath;
|
|
125
148
|
port;
|
|
126
149
|
killExistingProcess;
|
|
150
|
+
vlmDependenciesInstalled = false;
|
|
127
151
|
constructor(options) {
|
|
128
152
|
this.logger = options.logger;
|
|
129
153
|
this.venvPath = options.venvPath;
|
|
@@ -253,6 +277,81 @@ var DoclingEnvironment = class _DoclingEnvironment {
|
|
|
253
277
|
);
|
|
254
278
|
}
|
|
255
279
|
}
|
|
280
|
+
/**
|
|
281
|
+
* Install VLM-specific dependencies for the Docling VLM pipeline.
|
|
282
|
+
*
|
|
283
|
+
* Installs:
|
|
284
|
+
* 1. docling-serve[vlm] - VLM model support for docling-serve
|
|
285
|
+
* 2. mlx + mlx-lm (macOS ARM64 only) - Apple Silicon optimized inference
|
|
286
|
+
*
|
|
287
|
+
* This is idempotent - subsequent calls skip if already installed.
|
|
288
|
+
*/
|
|
289
|
+
async setupVlmDependencies() {
|
|
290
|
+
if (this.vlmDependenciesInstalled) {
|
|
291
|
+
this.logger.info(
|
|
292
|
+
"[DoclingEnvironment] VLM dependencies already installed, skipping"
|
|
293
|
+
);
|
|
294
|
+
return;
|
|
295
|
+
}
|
|
296
|
+
if (await this.isVlmReady()) {
|
|
297
|
+
this.vlmDependenciesInstalled = true;
|
|
298
|
+
this.logger.info(
|
|
299
|
+
"[DoclingEnvironment] VLM dependencies already installed, skipping"
|
|
300
|
+
);
|
|
301
|
+
return;
|
|
302
|
+
}
|
|
303
|
+
this.logger.info("[DoclingEnvironment] Installing VLM dependencies...");
|
|
304
|
+
const pipPath = join(this.venvPath, "bin", "pip");
|
|
305
|
+
this.logger.info("[DoclingEnvironment] Installing docling[vlm]...");
|
|
306
|
+
const vlmResult = await spawnAsync(
|
|
307
|
+
pipPath,
|
|
308
|
+
["install", "docling-serve[vlm]"],
|
|
309
|
+
{ timeout: VLM_ENVIRONMENT.SETUP_TIMEOUT_MS }
|
|
310
|
+
);
|
|
311
|
+
if (vlmResult.code !== 0) {
|
|
312
|
+
this.logger.error(
|
|
313
|
+
"[DoclingEnvironment] Failed to install docling-serve[vlm]:",
|
|
314
|
+
vlmResult.stderr
|
|
315
|
+
);
|
|
316
|
+
throw new Error(
|
|
317
|
+
`Failed to install docling-serve[vlm]. Exit code: ${vlmResult.code}`
|
|
318
|
+
);
|
|
319
|
+
}
|
|
320
|
+
if (platform() === "darwin" && arch() === "arm64") {
|
|
321
|
+
this.logger.info(
|
|
322
|
+
"[DoclingEnvironment] Installing mlx + mlx-lm for Apple Silicon..."
|
|
323
|
+
);
|
|
324
|
+
const mlxResult = await spawnAsync(
|
|
325
|
+
pipPath,
|
|
326
|
+
["install", "mlx", "mlx-lm"],
|
|
327
|
+
{ timeout: VLM_ENVIRONMENT.SETUP_TIMEOUT_MS }
|
|
328
|
+
);
|
|
329
|
+
if (mlxResult.code !== 0) {
|
|
330
|
+
this.logger.error(
|
|
331
|
+
"[DoclingEnvironment] Failed to install mlx/mlx-lm:",
|
|
332
|
+
mlxResult.stderr
|
|
333
|
+
);
|
|
334
|
+
throw new Error(
|
|
335
|
+
`Failed to install mlx/mlx-lm. Exit code: ${mlxResult.code}`
|
|
336
|
+
);
|
|
337
|
+
}
|
|
338
|
+
}
|
|
339
|
+
this.vlmDependenciesInstalled = true;
|
|
340
|
+
this.logger.info(
|
|
341
|
+
"[DoclingEnvironment] VLM dependencies installed successfully"
|
|
342
|
+
);
|
|
343
|
+
}
|
|
344
|
+
/**
|
|
345
|
+
* Check if VLM dependencies are ready by verifying Python module imports
|
|
346
|
+
*/
|
|
347
|
+
async isVlmReady() {
|
|
348
|
+
const pythonPath = join(this.venvPath, "bin", "python");
|
|
349
|
+
const result = await spawnAsync(pythonPath, [
|
|
350
|
+
"-c",
|
|
351
|
+
"import docling_core; import docling"
|
|
352
|
+
]);
|
|
353
|
+
return result.code === 0;
|
|
354
|
+
}
|
|
256
355
|
async isPortInUse(port) {
|
|
257
356
|
try {
|
|
258
357
|
const result = await spawnAsync("lsof", ["-ti", `:${port}`]);
|
|
@@ -339,6 +438,7 @@ var DoclingEnvironment = class _DoclingEnvironment {
|
|
|
339
438
|
};
|
|
340
439
|
|
|
341
440
|
// src/core/pdf-converter.ts
|
|
441
|
+
import { ValidationUtils } from "docling-sdk";
|
|
342
442
|
import { omit } from "es-toolkit";
|
|
343
443
|
import { createWriteStream as createWriteStream2, existsSync as existsSync3, rmSync as rmSync3 } from "fs";
|
|
344
444
|
import { join as join4 } from "path";
|
|
@@ -789,11 +889,17 @@ var ImagePdfConverter = class {
|
|
|
789
889
|
};
|
|
790
890
|
|
|
791
891
|
// src/core/pdf-converter.ts
|
|
892
|
+
var _origAssertValidConversionOptions = ValidationUtils.assertValidConversionOptions.bind(ValidationUtils);
|
|
893
|
+
ValidationUtils.assertValidConversionOptions = (options) => {
|
|
894
|
+
const { pipeline: _pipeline, ...rest } = options;
|
|
895
|
+
_origAssertValidConversionOptions(rest);
|
|
896
|
+
};
|
|
792
897
|
var PDFConverter = class {
|
|
793
|
-
constructor(logger, client, enableImagePdfFallback = false) {
|
|
898
|
+
constructor(logger, client, enableImagePdfFallback = false, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
|
|
794
899
|
this.logger = logger;
|
|
795
900
|
this.client = client;
|
|
796
901
|
this.enableImagePdfFallback = enableImagePdfFallback;
|
|
902
|
+
this.timeout = timeout;
|
|
797
903
|
}
|
|
798
904
|
async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
|
|
799
905
|
this.logger.info("[PDFConverter] Converting:", url);
|
|
@@ -848,7 +954,15 @@ var PDFConverter = class {
|
|
|
848
954
|
}
|
|
849
955
|
async performConversion(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
|
|
850
956
|
const startTime = Date.now();
|
|
851
|
-
const
|
|
957
|
+
const pipelineType = options.pipeline ?? "standard";
|
|
958
|
+
const conversionOptions = pipelineType === "vlm" ? this.buildVlmConversionOptions(options) : this.buildConversionOptions(options);
|
|
959
|
+
if (pipelineType === "vlm") {
|
|
960
|
+
this.logger.info("[PDFConverter] Using VLM pipeline");
|
|
961
|
+
} else {
|
|
962
|
+
this.logger.info(
|
|
963
|
+
`[PDFConverter] OCR languages: ${JSON.stringify(conversionOptions.ocr_options?.lang)}`
|
|
964
|
+
);
|
|
965
|
+
}
|
|
852
966
|
this.logger.info(
|
|
853
967
|
"[PDFConverter] Converting document with Async Source API..."
|
|
854
968
|
);
|
|
@@ -915,7 +1029,7 @@ var PDFConverter = class {
|
|
|
915
1029
|
}
|
|
916
1030
|
buildConversionOptions(options) {
|
|
917
1031
|
return {
|
|
918
|
-
...omit(options, ["num_threads"]),
|
|
1032
|
+
...omit(options, ["num_threads", "pipeline", "vlm_model"]),
|
|
919
1033
|
to_formats: ["json", "html"],
|
|
920
1034
|
image_export_mode: "embedded",
|
|
921
1035
|
ocr_engine: "ocrmac",
|
|
@@ -941,6 +1055,31 @@ var PDFConverter = class {
|
|
|
941
1055
|
}
|
|
942
1056
|
};
|
|
943
1057
|
}
|
|
1058
|
+
/**
|
|
1059
|
+
* Build conversion options for VLM pipeline.
|
|
1060
|
+
*
|
|
1061
|
+
* VLM pipeline uses a Vision Language Model instead of traditional OCR,
|
|
1062
|
+
* providing better accuracy for CJK characters and complex layouts.
|
|
1063
|
+
*/
|
|
1064
|
+
buildVlmConversionOptions(options) {
|
|
1065
|
+
const vlmModel = resolveVlmModel(options.vlm_model ?? DEFAULT_VLM_MODEL);
|
|
1066
|
+
this.logger.info(
|
|
1067
|
+
`[PDFConverter] VLM model: ${vlmModel.repo_id} (framework: ${vlmModel.inference_framework}, format: ${vlmModel.response_format})`
|
|
1068
|
+
);
|
|
1069
|
+
return {
|
|
1070
|
+
...omit(options, ["num_threads", "pipeline", "vlm_model", "ocr_lang"]),
|
|
1071
|
+
to_formats: ["json", "html"],
|
|
1072
|
+
image_export_mode: "embedded",
|
|
1073
|
+
pipeline: "vlm",
|
|
1074
|
+
vlm_pipeline_model_local: vlmModel,
|
|
1075
|
+
generate_picture_images: true,
|
|
1076
|
+
images_scale: 2,
|
|
1077
|
+
accelerator_options: {
|
|
1078
|
+
device: "mps",
|
|
1079
|
+
num_threads: options.num_threads
|
|
1080
|
+
}
|
|
1081
|
+
};
|
|
1082
|
+
}
|
|
944
1083
|
async startConversionTask(url, conversionOptions) {
|
|
945
1084
|
const task = await this.client.convertSourceAsync({
|
|
946
1085
|
sources: [
|
|
@@ -976,38 +1115,42 @@ var PDFConverter = class {
|
|
|
976
1115
|
}
|
|
977
1116
|
async trackTaskProgress(task) {
|
|
978
1117
|
const conversionStartTime = Date.now();
|
|
979
|
-
let
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
if (isCompleted) return;
|
|
983
|
-
const elapsed = Math.floor((Date.now() - conversionStartTime) / 1e3);
|
|
984
|
-
process.stdout.write(
|
|
985
|
-
`\r[PDFConverter] Status: ${lastStatus || "processing"} (${elapsed}s elapsed)`
|
|
986
|
-
);
|
|
987
|
-
}, PDF_CONVERTER.POLL_INTERVAL_MS);
|
|
988
|
-
task.on("progress", (status) => {
|
|
989
|
-
lastStatus = status.task_status;
|
|
1118
|
+
let lastProgressLine = "";
|
|
1119
|
+
const logProgress = (status) => {
|
|
1120
|
+
const parts = [`Status: ${status.task_status}`];
|
|
990
1121
|
if (status.task_position !== void 0) {
|
|
991
|
-
|
|
992
|
-
`\r[PDFConverter] Status: ${status.task_status} (position: ${status.task_position})`
|
|
993
|
-
);
|
|
1122
|
+
parts.push(`position: ${status.task_position}`);
|
|
994
1123
|
}
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
}
|
|
1009
|
-
|
|
1010
|
-
|
|
1124
|
+
const meta = status.task_meta;
|
|
1125
|
+
if (meta) {
|
|
1126
|
+
if (meta.processed_documents !== void 0 && meta.total_documents !== void 0) {
|
|
1127
|
+
parts.push(
|
|
1128
|
+
`progress: ${meta.processed_documents}/${meta.total_documents}`
|
|
1129
|
+
);
|
|
1130
|
+
}
|
|
1131
|
+
}
|
|
1132
|
+
const progressLine = `\r[PDFConverter] ${parts.join(" | ")}`;
|
|
1133
|
+
if (progressLine !== lastProgressLine) {
|
|
1134
|
+
lastProgressLine = progressLine;
|
|
1135
|
+
process.stdout.write(progressLine);
|
|
1136
|
+
}
|
|
1137
|
+
};
|
|
1138
|
+
while (true) {
|
|
1139
|
+
if (Date.now() - conversionStartTime > this.timeout) {
|
|
1140
|
+
throw new Error("Task timeout");
|
|
1141
|
+
}
|
|
1142
|
+
const status = await task.poll();
|
|
1143
|
+
logProgress(status);
|
|
1144
|
+
if (status.task_status === "success") {
|
|
1145
|
+
this.logger.info("\n[PDFConverter] Conversion completed!");
|
|
1146
|
+
return;
|
|
1147
|
+
}
|
|
1148
|
+
if (status.task_status === "failure") {
|
|
1149
|
+
throw new Error("Task failed with status: failure");
|
|
1150
|
+
}
|
|
1151
|
+
await new Promise(
|
|
1152
|
+
(resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
|
|
1153
|
+
);
|
|
1011
1154
|
}
|
|
1012
1155
|
}
|
|
1013
1156
|
async downloadResult(taskId) {
|
|
@@ -1043,6 +1186,7 @@ var PDFParser = class {
|
|
|
1043
1186
|
killExistingProcess;
|
|
1044
1187
|
enableImagePdfFallback;
|
|
1045
1188
|
client = null;
|
|
1189
|
+
environment;
|
|
1046
1190
|
constructor(options) {
|
|
1047
1191
|
const {
|
|
1048
1192
|
logger,
|
|
@@ -1087,13 +1231,13 @@ var PDFParser = class {
|
|
|
1087
1231
|
}
|
|
1088
1232
|
this.logger.info("[PDFParser] Setting up local server...");
|
|
1089
1233
|
try {
|
|
1090
|
-
|
|
1234
|
+
this.environment = new DoclingEnvironment({
|
|
1091
1235
|
logger: this.logger,
|
|
1092
1236
|
venvPath: this.venvPath,
|
|
1093
1237
|
port: this.port,
|
|
1094
1238
|
killExistingProcess: this.killExistingProcess
|
|
1095
1239
|
});
|
|
1096
|
-
await environment.setup();
|
|
1240
|
+
await this.environment.setup();
|
|
1097
1241
|
const clientUrl = `http://localhost:${this.port}`;
|
|
1098
1242
|
this.client = new Docling({
|
|
1099
1243
|
api: {
|
|
@@ -1109,9 +1253,9 @@ var PDFParser = class {
|
|
|
1109
1253
|
}
|
|
1110
1254
|
}
|
|
1111
1255
|
checkOperatingSystem() {
|
|
1112
|
-
if (
|
|
1256
|
+
if (platform2() !== "darwin") {
|
|
1113
1257
|
throw new Error(
|
|
1114
|
-
"PDFParser is only supported on macOS. Current platform: " +
|
|
1258
|
+
"PDFParser is only supported on macOS. Current platform: " + platform2()
|
|
1115
1259
|
);
|
|
1116
1260
|
}
|
|
1117
1261
|
}
|
|
@@ -1239,6 +1383,12 @@ var PDFParser = class {
|
|
|
1239
1383
|
"PDFParser is not initialized. Call init() before using parse()"
|
|
1240
1384
|
);
|
|
1241
1385
|
}
|
|
1386
|
+
if (options.pipeline === "vlm" && this.environment && !this.baseUrl) {
|
|
1387
|
+
this.logger.info(
|
|
1388
|
+
"[PDFParser] VLM pipeline requested, ensuring VLM dependencies..."
|
|
1389
|
+
);
|
|
1390
|
+
await this.environment.setupVlmDependencies();
|
|
1391
|
+
}
|
|
1242
1392
|
const canRecover = !this.baseUrl && this.port !== void 0;
|
|
1243
1393
|
const maxAttempts = PDF_PARSER.MAX_SERVER_RECOVERY_ATTEMPTS;
|
|
1244
1394
|
let attempt = 0;
|
|
@@ -1248,7 +1398,8 @@ var PDFParser = class {
|
|
|
1248
1398
|
const converter = new PDFConverter(
|
|
1249
1399
|
this.logger,
|
|
1250
1400
|
this.client,
|
|
1251
|
-
effectiveFallbackEnabled
|
|
1401
|
+
effectiveFallbackEnabled,
|
|
1402
|
+
this.timeout
|
|
1252
1403
|
);
|
|
1253
1404
|
return await converter.convert(
|
|
1254
1405
|
url,
|
|
@@ -1295,7 +1446,10 @@ var PDFParser = class {
|
|
|
1295
1446
|
}
|
|
1296
1447
|
};
|
|
1297
1448
|
export {
|
|
1449
|
+
DEFAULT_VLM_MODEL,
|
|
1298
1450
|
ImagePdfFallbackError,
|
|
1299
|
-
PDFParser
|
|
1451
|
+
PDFParser,
|
|
1452
|
+
VLM_MODELS,
|
|
1453
|
+
resolveVlmModel
|
|
1300
1454
|
};
|
|
1301
1455
|
//# sourceMappingURL=index.js.map
|