@heripo/pdf-parser 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-WWNI354M.js +121 -0
- package/dist/chunk-WWNI354M.js.map +1 -0
- package/dist/index.cjs +315 -51
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +19 -5
- package/dist/index.d.ts +19 -5
- package/dist/index.js +195 -44
- package/dist/index.js.map +1 -1
- package/dist/vlm-models.cjs +147 -0
- package/dist/vlm-models.cjs.map +1 -0
- package/dist/vlm-models.d.cts +34 -0
- package/dist/vlm-models.d.ts +34 -0
- package/dist/vlm-models.js +12 -0
- package/dist/vlm-models.js.map +1 -0
- package/package.json +11 -5
package/dist/index.d.cts
CHANGED
|
@@ -1,11 +1,26 @@
|
|
|
1
1
|
import { LoggerMethods } from '@heripo/logger';
|
|
2
|
-
import { ConversionOptions } from 'docling-sdk';
|
|
2
|
+
import { ConversionOptions, VlmModelLocal } from 'docling-sdk';
|
|
3
|
+
export { DEFAULT_VLM_MODEL, VLM_MODELS, VlmModelPreset, resolveVlmModel } from './vlm-models.cjs';
|
|
3
4
|
|
|
4
5
|
/**
|
|
5
6
|
* Callback function invoked after PDF conversion completes
|
|
6
7
|
* @param outputPath Absolute path to the output directory containing result files
|
|
7
8
|
*/
|
|
8
9
|
type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
|
|
10
|
+
/**
|
|
11
|
+
* Pipeline type for PDF conversion
|
|
12
|
+
* - 'standard': Use OCR-based pipeline (default, uses ocrmac)
|
|
13
|
+
* - 'vlm': Use Vision Language Model pipeline for better CJK/complex layout handling
|
|
14
|
+
*/
|
|
15
|
+
type PipelineType = 'standard' | 'vlm';
|
|
16
|
+
/**
|
|
17
|
+
* Extended options for PDF conversion including pipeline selection
|
|
18
|
+
*/
|
|
19
|
+
type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local'> & {
|
|
20
|
+
num_threads?: number;
|
|
21
|
+
pipeline?: PipelineType;
|
|
22
|
+
vlm_model?: string | VlmModelLocal;
|
|
23
|
+
};
|
|
9
24
|
|
|
10
25
|
type Options = {
|
|
11
26
|
logger: LoggerMethods;
|
|
@@ -82,6 +97,7 @@ declare class PDFParser {
|
|
|
82
97
|
private readonly killExistingProcess;
|
|
83
98
|
private readonly enableImagePdfFallback;
|
|
84
99
|
private client;
|
|
100
|
+
private environment?;
|
|
85
101
|
constructor(options: Options);
|
|
86
102
|
init(): Promise<void>;
|
|
87
103
|
private checkOperatingSystem;
|
|
@@ -104,9 +120,7 @@ declare class PDFParser {
|
|
|
104
120
|
*/
|
|
105
121
|
private restartServer;
|
|
106
122
|
private waitForServerReady;
|
|
107
|
-
parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options:
|
|
108
|
-
num_threads?: number;
|
|
109
|
-
}, abortSignal?: AbortSignal): Promise<void>;
|
|
123
|
+
parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options: PDFConvertOptions, abortSignal?: AbortSignal): Promise<void>;
|
|
110
124
|
/**
|
|
111
125
|
* Dispose the parser instance.
|
|
112
126
|
* - Sets the internal client to null
|
|
@@ -126,4 +140,4 @@ declare class ImagePdfFallbackError extends Error {
|
|
|
126
140
|
constructor(originalError: Error, fallbackError: Error);
|
|
127
141
|
}
|
|
128
142
|
|
|
129
|
-
export { type ConversionCompleteCallback, ImagePdfFallbackError, PDFParser };
|
|
143
|
+
export { type ConversionCompleteCallback, ImagePdfFallbackError, type PDFConvertOptions, PDFParser, type PipelineType };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,11 +1,26 @@
|
|
|
1
1
|
import { LoggerMethods } from '@heripo/logger';
|
|
2
|
-
import { ConversionOptions } from 'docling-sdk';
|
|
2
|
+
import { ConversionOptions, VlmModelLocal } from 'docling-sdk';
|
|
3
|
+
export { DEFAULT_VLM_MODEL, VLM_MODELS, VlmModelPreset, resolveVlmModel } from './vlm-models.js';
|
|
3
4
|
|
|
4
5
|
/**
|
|
5
6
|
* Callback function invoked after PDF conversion completes
|
|
6
7
|
* @param outputPath Absolute path to the output directory containing result files
|
|
7
8
|
*/
|
|
8
9
|
type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
|
|
10
|
+
/**
|
|
11
|
+
* Pipeline type for PDF conversion
|
|
12
|
+
* - 'standard': Use OCR-based pipeline (default, uses ocrmac)
|
|
13
|
+
* - 'vlm': Use Vision Language Model pipeline for better CJK/complex layout handling
|
|
14
|
+
*/
|
|
15
|
+
type PipelineType = 'standard' | 'vlm';
|
|
16
|
+
/**
|
|
17
|
+
* Extended options for PDF conversion including pipeline selection
|
|
18
|
+
*/
|
|
19
|
+
type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local'> & {
|
|
20
|
+
num_threads?: number;
|
|
21
|
+
pipeline?: PipelineType;
|
|
22
|
+
vlm_model?: string | VlmModelLocal;
|
|
23
|
+
};
|
|
9
24
|
|
|
10
25
|
type Options = {
|
|
11
26
|
logger: LoggerMethods;
|
|
@@ -82,6 +97,7 @@ declare class PDFParser {
|
|
|
82
97
|
private readonly killExistingProcess;
|
|
83
98
|
private readonly enableImagePdfFallback;
|
|
84
99
|
private client;
|
|
100
|
+
private environment?;
|
|
85
101
|
constructor(options: Options);
|
|
86
102
|
init(): Promise<void>;
|
|
87
103
|
private checkOperatingSystem;
|
|
@@ -104,9 +120,7 @@ declare class PDFParser {
|
|
|
104
120
|
*/
|
|
105
121
|
private restartServer;
|
|
106
122
|
private waitForServerReady;
|
|
107
|
-
parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options:
|
|
108
|
-
num_threads?: number;
|
|
109
|
-
}, abortSignal?: AbortSignal): Promise<void>;
|
|
123
|
+
parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options: PDFConvertOptions, abortSignal?: AbortSignal): Promise<void>;
|
|
110
124
|
/**
|
|
111
125
|
* Dispose the parser instance.
|
|
112
126
|
* - Sets the internal client to null
|
|
@@ -126,4 +140,4 @@ declare class ImagePdfFallbackError extends Error {
|
|
|
126
140
|
constructor(originalError: Error, fallbackError: Error);
|
|
127
141
|
}
|
|
128
142
|
|
|
129
|
-
export { type ConversionCompleteCallback, ImagePdfFallbackError, PDFParser };
|
|
143
|
+
export { type ConversionCompleteCallback, ImagePdfFallbackError, type PDFConvertOptions, PDFParser, type PipelineType };
|
package/dist/index.js
CHANGED
|
@@ -1,9 +1,14 @@
|
|
|
1
|
+
import {
|
|
2
|
+
DEFAULT_VLM_MODEL,
|
|
3
|
+
VLM_MODELS,
|
|
4
|
+
resolveVlmModel
|
|
5
|
+
} from "./chunk-WWNI354M.js";
|
|
1
6
|
import "./chunk-VUNV25KB.js";
|
|
2
7
|
|
|
3
8
|
// src/core/pdf-parser.ts
|
|
4
9
|
import { Docling } from "docling-sdk";
|
|
5
10
|
import { execSync } from "child_process";
|
|
6
|
-
import { platform } from "os";
|
|
11
|
+
import { platform as platform2 } from "os";
|
|
7
12
|
import { join as join5 } from "path";
|
|
8
13
|
|
|
9
14
|
// src/config/constants.ts
|
|
@@ -33,7 +38,11 @@ var PDF_CONVERTER = {
|
|
|
33
38
|
/**
|
|
34
39
|
* Interval for progress polling in milliseconds
|
|
35
40
|
*/
|
|
36
|
-
POLL_INTERVAL_MS: 1e3
|
|
41
|
+
POLL_INTERVAL_MS: 1e3,
|
|
42
|
+
/**
|
|
43
|
+
* Default timeout for task completion in milliseconds (30 minutes)
|
|
44
|
+
*/
|
|
45
|
+
DEFAULT_TIMEOUT_MS: 18e5
|
|
37
46
|
};
|
|
38
47
|
var DOCLING_ENVIRONMENT = {
|
|
39
48
|
/**
|
|
@@ -51,6 +60,19 @@ var IMAGE_PDF_CONVERTER = {
|
|
|
51
60
|
*/
|
|
52
61
|
QUALITY: 100
|
|
53
62
|
};
|
|
63
|
+
var VLM_ENVIRONMENT = {
|
|
64
|
+
/**
|
|
65
|
+
* Timeout for VLM dependency installation (pip install) in milliseconds (3 hours).
|
|
66
|
+
* VLM packages can be very large and may require extended download times
|
|
67
|
+
* depending on network conditions.
|
|
68
|
+
*/
|
|
69
|
+
SETUP_TIMEOUT_MS: 108e5,
|
|
70
|
+
/**
|
|
71
|
+
* Timeout for VLM model download in milliseconds (3 hours).
|
|
72
|
+
* Large VLM models (e.g., multi-GB weights) need sufficient time to download.
|
|
73
|
+
*/
|
|
74
|
+
MODEL_DOWNLOAD_TIMEOUT_MS: 108e5
|
|
75
|
+
};
|
|
54
76
|
|
|
55
77
|
// ../shared/dist/index.mjs
|
|
56
78
|
import { spawn } from "child_process";
|
|
@@ -83,6 +105,7 @@ function spawnAsync(command, args, options = {}) {
|
|
|
83
105
|
|
|
84
106
|
// src/environment/docling-environment.ts
|
|
85
107
|
import { spawn as spawn2 } from "child_process";
|
|
108
|
+
import { arch, platform } from "os";
|
|
86
109
|
import { join } from "path";
|
|
87
110
|
|
|
88
111
|
// src/utils/python-version.ts
|
|
@@ -124,6 +147,7 @@ var DoclingEnvironment = class _DoclingEnvironment {
|
|
|
124
147
|
venvPath;
|
|
125
148
|
port;
|
|
126
149
|
killExistingProcess;
|
|
150
|
+
vlmDependenciesInstalled = false;
|
|
127
151
|
constructor(options) {
|
|
128
152
|
this.logger = options.logger;
|
|
129
153
|
this.venvPath = options.venvPath;
|
|
@@ -253,6 +277,81 @@ var DoclingEnvironment = class _DoclingEnvironment {
|
|
|
253
277
|
);
|
|
254
278
|
}
|
|
255
279
|
}
|
|
280
|
+
/**
|
|
281
|
+
* Install VLM-specific dependencies for the Docling VLM pipeline.
|
|
282
|
+
*
|
|
283
|
+
* Installs:
|
|
284
|
+
* 1. docling-serve[vlm] - VLM model support for docling-serve
|
|
285
|
+
* 2. mlx + mlx-lm (macOS ARM64 only) - Apple Silicon optimized inference
|
|
286
|
+
*
|
|
287
|
+
* This is idempotent - subsequent calls skip if already installed.
|
|
288
|
+
*/
|
|
289
|
+
async setupVlmDependencies() {
|
|
290
|
+
if (this.vlmDependenciesInstalled) {
|
|
291
|
+
this.logger.info(
|
|
292
|
+
"[DoclingEnvironment] VLM dependencies already installed, skipping"
|
|
293
|
+
);
|
|
294
|
+
return;
|
|
295
|
+
}
|
|
296
|
+
if (await this.isVlmReady()) {
|
|
297
|
+
this.vlmDependenciesInstalled = true;
|
|
298
|
+
this.logger.info(
|
|
299
|
+
"[DoclingEnvironment] VLM dependencies already installed, skipping"
|
|
300
|
+
);
|
|
301
|
+
return;
|
|
302
|
+
}
|
|
303
|
+
this.logger.info("[DoclingEnvironment] Installing VLM dependencies...");
|
|
304
|
+
const pipPath = join(this.venvPath, "bin", "pip");
|
|
305
|
+
this.logger.info("[DoclingEnvironment] Installing docling[vlm]...");
|
|
306
|
+
const vlmResult = await spawnAsync(
|
|
307
|
+
pipPath,
|
|
308
|
+
["install", "docling-serve[vlm]"],
|
|
309
|
+
{ timeout: VLM_ENVIRONMENT.SETUP_TIMEOUT_MS }
|
|
310
|
+
);
|
|
311
|
+
if (vlmResult.code !== 0) {
|
|
312
|
+
this.logger.error(
|
|
313
|
+
"[DoclingEnvironment] Failed to install docling-serve[vlm]:",
|
|
314
|
+
vlmResult.stderr
|
|
315
|
+
);
|
|
316
|
+
throw new Error(
|
|
317
|
+
`Failed to install docling-serve[vlm]. Exit code: ${vlmResult.code}`
|
|
318
|
+
);
|
|
319
|
+
}
|
|
320
|
+
if (platform() === "darwin" && arch() === "arm64") {
|
|
321
|
+
this.logger.info(
|
|
322
|
+
"[DoclingEnvironment] Installing mlx + mlx-lm for Apple Silicon..."
|
|
323
|
+
);
|
|
324
|
+
const mlxResult = await spawnAsync(
|
|
325
|
+
pipPath,
|
|
326
|
+
["install", "mlx", "mlx-lm"],
|
|
327
|
+
{ timeout: VLM_ENVIRONMENT.SETUP_TIMEOUT_MS }
|
|
328
|
+
);
|
|
329
|
+
if (mlxResult.code !== 0) {
|
|
330
|
+
this.logger.error(
|
|
331
|
+
"[DoclingEnvironment] Failed to install mlx/mlx-lm:",
|
|
332
|
+
mlxResult.stderr
|
|
333
|
+
);
|
|
334
|
+
throw new Error(
|
|
335
|
+
`Failed to install mlx/mlx-lm. Exit code: ${mlxResult.code}`
|
|
336
|
+
);
|
|
337
|
+
}
|
|
338
|
+
}
|
|
339
|
+
this.vlmDependenciesInstalled = true;
|
|
340
|
+
this.logger.info(
|
|
341
|
+
"[DoclingEnvironment] VLM dependencies installed successfully"
|
|
342
|
+
);
|
|
343
|
+
}
|
|
344
|
+
/**
|
|
345
|
+
* Check if VLM dependencies are ready by verifying Python module imports
|
|
346
|
+
*/
|
|
347
|
+
async isVlmReady() {
|
|
348
|
+
const pythonPath = join(this.venvPath, "bin", "python");
|
|
349
|
+
const result = await spawnAsync(pythonPath, [
|
|
350
|
+
"-c",
|
|
351
|
+
"import docling_core; import docling"
|
|
352
|
+
]);
|
|
353
|
+
return result.code === 0;
|
|
354
|
+
}
|
|
256
355
|
async isPortInUse(port) {
|
|
257
356
|
try {
|
|
258
357
|
const result = await spawnAsync("lsof", ["-ti", `:${port}`]);
|
|
@@ -339,6 +438,7 @@ var DoclingEnvironment = class _DoclingEnvironment {
|
|
|
339
438
|
};
|
|
340
439
|
|
|
341
440
|
// src/core/pdf-converter.ts
|
|
441
|
+
import { ValidationUtils } from "docling-sdk";
|
|
342
442
|
import { omit } from "es-toolkit";
|
|
343
443
|
import { createWriteStream as createWriteStream2, existsSync as existsSync3, rmSync as rmSync3 } from "fs";
|
|
344
444
|
import { join as join4 } from "path";
|
|
@@ -789,11 +889,17 @@ var ImagePdfConverter = class {
|
|
|
789
889
|
};
|
|
790
890
|
|
|
791
891
|
// src/core/pdf-converter.ts
|
|
892
|
+
var _origAssertValidConversionOptions = ValidationUtils.assertValidConversionOptions.bind(ValidationUtils);
|
|
893
|
+
ValidationUtils.assertValidConversionOptions = (options) => {
|
|
894
|
+
const { pipeline: _pipeline, ...rest } = options;
|
|
895
|
+
_origAssertValidConversionOptions(rest);
|
|
896
|
+
};
|
|
792
897
|
var PDFConverter = class {
|
|
793
|
-
constructor(logger, client, enableImagePdfFallback = false) {
|
|
898
|
+
constructor(logger, client, enableImagePdfFallback = false, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
|
|
794
899
|
this.logger = logger;
|
|
795
900
|
this.client = client;
|
|
796
901
|
this.enableImagePdfFallback = enableImagePdfFallback;
|
|
902
|
+
this.timeout = timeout;
|
|
797
903
|
}
|
|
798
904
|
async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
|
|
799
905
|
this.logger.info("[PDFConverter] Converting:", url);
|
|
@@ -848,10 +954,15 @@ var PDFConverter = class {
|
|
|
848
954
|
}
|
|
849
955
|
async performConversion(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
|
|
850
956
|
const startTime = Date.now();
|
|
851
|
-
const
|
|
852
|
-
this.
|
|
853
|
-
|
|
854
|
-
|
|
957
|
+
const pipelineType = options.pipeline ?? "standard";
|
|
958
|
+
const conversionOptions = pipelineType === "vlm" ? this.buildVlmConversionOptions(options) : this.buildConversionOptions(options);
|
|
959
|
+
if (pipelineType === "vlm") {
|
|
960
|
+
this.logger.info("[PDFConverter] Using VLM pipeline");
|
|
961
|
+
} else {
|
|
962
|
+
this.logger.info(
|
|
963
|
+
`[PDFConverter] OCR languages: ${JSON.stringify(conversionOptions.ocr_options?.lang)}`
|
|
964
|
+
);
|
|
965
|
+
}
|
|
855
966
|
this.logger.info(
|
|
856
967
|
"[PDFConverter] Converting document with Async Source API..."
|
|
857
968
|
);
|
|
@@ -918,7 +1029,7 @@ var PDFConverter = class {
|
|
|
918
1029
|
}
|
|
919
1030
|
buildConversionOptions(options) {
|
|
920
1031
|
return {
|
|
921
|
-
...omit(options, ["num_threads"]),
|
|
1032
|
+
...omit(options, ["num_threads", "pipeline", "vlm_model"]),
|
|
922
1033
|
to_formats: ["json", "html"],
|
|
923
1034
|
image_export_mode: "embedded",
|
|
924
1035
|
ocr_engine: "ocrmac",
|
|
@@ -944,6 +1055,31 @@ var PDFConverter = class {
|
|
|
944
1055
|
}
|
|
945
1056
|
};
|
|
946
1057
|
}
|
|
1058
|
+
/**
|
|
1059
|
+
* Build conversion options for VLM pipeline.
|
|
1060
|
+
*
|
|
1061
|
+
* VLM pipeline uses a Vision Language Model instead of traditional OCR,
|
|
1062
|
+
* providing better accuracy for CJK characters and complex layouts.
|
|
1063
|
+
*/
|
|
1064
|
+
buildVlmConversionOptions(options) {
|
|
1065
|
+
const vlmModel = resolveVlmModel(options.vlm_model ?? DEFAULT_VLM_MODEL);
|
|
1066
|
+
this.logger.info(
|
|
1067
|
+
`[PDFConverter] VLM model: ${vlmModel.repo_id} (framework: ${vlmModel.inference_framework}, format: ${vlmModel.response_format})`
|
|
1068
|
+
);
|
|
1069
|
+
return {
|
|
1070
|
+
...omit(options, ["num_threads", "pipeline", "vlm_model", "ocr_lang"]),
|
|
1071
|
+
to_formats: ["json", "html"],
|
|
1072
|
+
image_export_mode: "embedded",
|
|
1073
|
+
pipeline: "vlm",
|
|
1074
|
+
vlm_pipeline_model_local: vlmModel,
|
|
1075
|
+
generate_picture_images: true,
|
|
1076
|
+
images_scale: 2,
|
|
1077
|
+
accelerator_options: {
|
|
1078
|
+
device: "mps",
|
|
1079
|
+
num_threads: options.num_threads
|
|
1080
|
+
}
|
|
1081
|
+
};
|
|
1082
|
+
}
|
|
947
1083
|
async startConversionTask(url, conversionOptions) {
|
|
948
1084
|
const task = await this.client.convertSourceAsync({
|
|
949
1085
|
sources: [
|
|
@@ -979,38 +1115,42 @@ var PDFConverter = class {
|
|
|
979
1115
|
}
|
|
980
1116
|
async trackTaskProgress(task) {
|
|
981
1117
|
const conversionStartTime = Date.now();
|
|
982
|
-
let
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
if (isCompleted) return;
|
|
986
|
-
const elapsed = Math.floor((Date.now() - conversionStartTime) / 1e3);
|
|
987
|
-
process.stdout.write(
|
|
988
|
-
`\r[PDFConverter] Status: ${lastStatus || "processing"} (${elapsed}s elapsed)`
|
|
989
|
-
);
|
|
990
|
-
}, PDF_CONVERTER.POLL_INTERVAL_MS);
|
|
991
|
-
task.on("progress", (status) => {
|
|
992
|
-
lastStatus = status.task_status;
|
|
1118
|
+
let lastProgressLine = "";
|
|
1119
|
+
const logProgress = (status) => {
|
|
1120
|
+
const parts = [`Status: ${status.task_status}`];
|
|
993
1121
|
if (status.task_position !== void 0) {
|
|
994
|
-
|
|
995
|
-
`\r[PDFConverter] Status: ${status.task_status} (position: ${status.task_position})`
|
|
996
|
-
);
|
|
1122
|
+
parts.push(`position: ${status.task_position}`);
|
|
997
1123
|
}
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
}
|
|
1012
|
-
|
|
1013
|
-
|
|
1124
|
+
const meta = status.task_meta;
|
|
1125
|
+
if (meta) {
|
|
1126
|
+
if (meta.processed_documents !== void 0 && meta.total_documents !== void 0) {
|
|
1127
|
+
parts.push(
|
|
1128
|
+
`progress: ${meta.processed_documents}/${meta.total_documents}`
|
|
1129
|
+
);
|
|
1130
|
+
}
|
|
1131
|
+
}
|
|
1132
|
+
const progressLine = `\r[PDFConverter] ${parts.join(" | ")}`;
|
|
1133
|
+
if (progressLine !== lastProgressLine) {
|
|
1134
|
+
lastProgressLine = progressLine;
|
|
1135
|
+
process.stdout.write(progressLine);
|
|
1136
|
+
}
|
|
1137
|
+
};
|
|
1138
|
+
while (true) {
|
|
1139
|
+
if (Date.now() - conversionStartTime > this.timeout) {
|
|
1140
|
+
throw new Error("Task timeout");
|
|
1141
|
+
}
|
|
1142
|
+
const status = await task.poll();
|
|
1143
|
+
logProgress(status);
|
|
1144
|
+
if (status.task_status === "success") {
|
|
1145
|
+
this.logger.info("\n[PDFConverter] Conversion completed!");
|
|
1146
|
+
return;
|
|
1147
|
+
}
|
|
1148
|
+
if (status.task_status === "failure") {
|
|
1149
|
+
throw new Error("Task failed with status: failure");
|
|
1150
|
+
}
|
|
1151
|
+
await new Promise(
|
|
1152
|
+
(resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
|
|
1153
|
+
);
|
|
1014
1154
|
}
|
|
1015
1155
|
}
|
|
1016
1156
|
async downloadResult(taskId) {
|
|
@@ -1046,6 +1186,7 @@ var PDFParser = class {
|
|
|
1046
1186
|
killExistingProcess;
|
|
1047
1187
|
enableImagePdfFallback;
|
|
1048
1188
|
client = null;
|
|
1189
|
+
environment;
|
|
1049
1190
|
constructor(options) {
|
|
1050
1191
|
const {
|
|
1051
1192
|
logger,
|
|
@@ -1090,13 +1231,13 @@ var PDFParser = class {
|
|
|
1090
1231
|
}
|
|
1091
1232
|
this.logger.info("[PDFParser] Setting up local server...");
|
|
1092
1233
|
try {
|
|
1093
|
-
|
|
1234
|
+
this.environment = new DoclingEnvironment({
|
|
1094
1235
|
logger: this.logger,
|
|
1095
1236
|
venvPath: this.venvPath,
|
|
1096
1237
|
port: this.port,
|
|
1097
1238
|
killExistingProcess: this.killExistingProcess
|
|
1098
1239
|
});
|
|
1099
|
-
await environment.setup();
|
|
1240
|
+
await this.environment.setup();
|
|
1100
1241
|
const clientUrl = `http://localhost:${this.port}`;
|
|
1101
1242
|
this.client = new Docling({
|
|
1102
1243
|
api: {
|
|
@@ -1112,9 +1253,9 @@ var PDFParser = class {
|
|
|
1112
1253
|
}
|
|
1113
1254
|
}
|
|
1114
1255
|
checkOperatingSystem() {
|
|
1115
|
-
if (
|
|
1256
|
+
if (platform2() !== "darwin") {
|
|
1116
1257
|
throw new Error(
|
|
1117
|
-
"PDFParser is only supported on macOS. Current platform: " +
|
|
1258
|
+
"PDFParser is only supported on macOS. Current platform: " + platform2()
|
|
1118
1259
|
);
|
|
1119
1260
|
}
|
|
1120
1261
|
}
|
|
@@ -1242,6 +1383,12 @@ var PDFParser = class {
|
|
|
1242
1383
|
"PDFParser is not initialized. Call init() before using parse()"
|
|
1243
1384
|
);
|
|
1244
1385
|
}
|
|
1386
|
+
if (options.pipeline === "vlm" && this.environment && !this.baseUrl) {
|
|
1387
|
+
this.logger.info(
|
|
1388
|
+
"[PDFParser] VLM pipeline requested, ensuring VLM dependencies..."
|
|
1389
|
+
);
|
|
1390
|
+
await this.environment.setupVlmDependencies();
|
|
1391
|
+
}
|
|
1245
1392
|
const canRecover = !this.baseUrl && this.port !== void 0;
|
|
1246
1393
|
const maxAttempts = PDF_PARSER.MAX_SERVER_RECOVERY_ATTEMPTS;
|
|
1247
1394
|
let attempt = 0;
|
|
@@ -1251,7 +1398,8 @@ var PDFParser = class {
|
|
|
1251
1398
|
const converter = new PDFConverter(
|
|
1252
1399
|
this.logger,
|
|
1253
1400
|
this.client,
|
|
1254
|
-
effectiveFallbackEnabled
|
|
1401
|
+
effectiveFallbackEnabled,
|
|
1402
|
+
this.timeout
|
|
1255
1403
|
);
|
|
1256
1404
|
return await converter.convert(
|
|
1257
1405
|
url,
|
|
@@ -1298,7 +1446,10 @@ var PDFParser = class {
|
|
|
1298
1446
|
}
|
|
1299
1447
|
};
|
|
1300
1448
|
export {
|
|
1449
|
+
DEFAULT_VLM_MODEL,
|
|
1301
1450
|
ImagePdfFallbackError,
|
|
1302
|
-
PDFParser
|
|
1451
|
+
PDFParser,
|
|
1452
|
+
VLM_MODELS,
|
|
1453
|
+
resolveVlmModel
|
|
1303
1454
|
};
|
|
1304
1455
|
//# sourceMappingURL=index.js.map
|