@heripo/pdf-parser 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1,11 +1,26 @@
1
1
  import { LoggerMethods } from '@heripo/logger';
2
- import { ConversionOptions } from 'docling-sdk';
2
+ import { ConversionOptions, VlmModelLocal } from 'docling-sdk';
3
+ export { DEFAULT_VLM_MODEL, VLM_MODELS, VlmModelPreset, resolveVlmModel } from './vlm-models.cjs';
3
4
 
4
5
  /**
5
6
  * Callback function invoked after PDF conversion completes
6
7
  * @param outputPath Absolute path to the output directory containing result files
7
8
  */
8
9
  type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
10
+ /**
11
+ * Pipeline type for PDF conversion
12
+ * - 'standard': Use OCR-based pipeline (default, uses ocrmac)
13
+ * - 'vlm': Use Vision Language Model pipeline for better CJK/complex layout handling
14
+ */
15
+ type PipelineType = 'standard' | 'vlm';
16
+ /**
17
+ * Extended options for PDF conversion including pipeline selection
18
+ */
19
+ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local'> & {
20
+ num_threads?: number;
21
+ pipeline?: PipelineType;
22
+ vlm_model?: string | VlmModelLocal;
23
+ };
9
24
 
10
25
  type Options = {
11
26
  logger: LoggerMethods;
@@ -82,6 +97,7 @@ declare class PDFParser {
82
97
  private readonly killExistingProcess;
83
98
  private readonly enableImagePdfFallback;
84
99
  private client;
100
+ private environment?;
85
101
  constructor(options: Options);
86
102
  init(): Promise<void>;
87
103
  private checkOperatingSystem;
@@ -104,9 +120,7 @@ declare class PDFParser {
104
120
  */
105
121
  private restartServer;
106
122
  private waitForServerReady;
107
- parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options: Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr'> & {
108
- num_threads?: number;
109
- }, abortSignal?: AbortSignal): Promise<void>;
123
+ parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options: PDFConvertOptions, abortSignal?: AbortSignal): Promise<void>;
110
124
  /**
111
125
  * Dispose the parser instance.
112
126
  * - Sets the internal client to null
@@ -126,4 +140,4 @@ declare class ImagePdfFallbackError extends Error {
126
140
  constructor(originalError: Error, fallbackError: Error);
127
141
  }
128
142
 
129
- export { type ConversionCompleteCallback, ImagePdfFallbackError, PDFParser };
143
+ export { type ConversionCompleteCallback, ImagePdfFallbackError, type PDFConvertOptions, PDFParser, type PipelineType };
package/dist/index.d.ts CHANGED
@@ -1,11 +1,26 @@
1
1
  import { LoggerMethods } from '@heripo/logger';
2
- import { ConversionOptions } from 'docling-sdk';
2
+ import { ConversionOptions, VlmModelLocal } from 'docling-sdk';
3
+ export { DEFAULT_VLM_MODEL, VLM_MODELS, VlmModelPreset, resolveVlmModel } from './vlm-models.js';
3
4
 
4
5
  /**
5
6
  * Callback function invoked after PDF conversion completes
6
7
  * @param outputPath Absolute path to the output directory containing result files
7
8
  */
8
9
  type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
10
+ /**
11
+ * Pipeline type for PDF conversion
12
+ * - 'standard': Use OCR-based pipeline (default, uses ocrmac)
13
+ * - 'vlm': Use Vision Language Model pipeline for better CJK/complex layout handling
14
+ */
15
+ type PipelineType = 'standard' | 'vlm';
16
+ /**
17
+ * Extended options for PDF conversion including pipeline selection
18
+ */
19
+ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local'> & {
20
+ num_threads?: number;
21
+ pipeline?: PipelineType;
22
+ vlm_model?: string | VlmModelLocal;
23
+ };
9
24
 
10
25
  type Options = {
11
26
  logger: LoggerMethods;
@@ -82,6 +97,7 @@ declare class PDFParser {
82
97
  private readonly killExistingProcess;
83
98
  private readonly enableImagePdfFallback;
84
99
  private client;
100
+ private environment?;
85
101
  constructor(options: Options);
86
102
  init(): Promise<void>;
87
103
  private checkOperatingSystem;
@@ -104,9 +120,7 @@ declare class PDFParser {
104
120
  */
105
121
  private restartServer;
106
122
  private waitForServerReady;
107
- parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options: Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr'> & {
108
- num_threads?: number;
109
- }, abortSignal?: AbortSignal): Promise<void>;
123
+ parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options: PDFConvertOptions, abortSignal?: AbortSignal): Promise<void>;
110
124
  /**
111
125
  * Dispose the parser instance.
112
126
  * - Sets the internal client to null
@@ -126,4 +140,4 @@ declare class ImagePdfFallbackError extends Error {
126
140
  constructor(originalError: Error, fallbackError: Error);
127
141
  }
128
142
 
129
- export { type ConversionCompleteCallback, ImagePdfFallbackError, PDFParser };
143
+ export { type ConversionCompleteCallback, ImagePdfFallbackError, type PDFConvertOptions, PDFParser, type PipelineType };
package/dist/index.js CHANGED
@@ -1,9 +1,14 @@
1
+ import {
2
+ DEFAULT_VLM_MODEL,
3
+ VLM_MODELS,
4
+ resolveVlmModel
5
+ } from "./chunk-WWNI354M.js";
1
6
  import "./chunk-VUNV25KB.js";
2
7
 
3
8
  // src/core/pdf-parser.ts
4
9
  import { Docling } from "docling-sdk";
5
10
  import { execSync } from "child_process";
6
- import { platform } from "os";
11
+ import { platform as platform2 } from "os";
7
12
  import { join as join5 } from "path";
8
13
 
9
14
  // src/config/constants.ts
@@ -33,7 +38,11 @@ var PDF_CONVERTER = {
33
38
  /**
34
39
  * Interval for progress polling in milliseconds
35
40
  */
36
- POLL_INTERVAL_MS: 1e3
41
+ POLL_INTERVAL_MS: 1e3,
42
+ /**
43
+ * Default timeout for task completion in milliseconds (30 minutes)
44
+ */
45
+ DEFAULT_TIMEOUT_MS: 18e5
37
46
  };
38
47
  var DOCLING_ENVIRONMENT = {
39
48
  /**
@@ -51,6 +60,19 @@ var IMAGE_PDF_CONVERTER = {
51
60
  */
52
61
  QUALITY: 100
53
62
  };
63
+ var VLM_ENVIRONMENT = {
64
+ /**
65
+ * Timeout for VLM dependency installation (pip install) in milliseconds (3 hours).
66
+ * VLM packages can be very large and may require extended download times
67
+ * depending on network conditions.
68
+ */
69
+ SETUP_TIMEOUT_MS: 108e5,
70
+ /**
71
+ * Timeout for VLM model download in milliseconds (3 hours).
72
+ * Large VLM models (e.g., multi-GB weights) need sufficient time to download.
73
+ */
74
+ MODEL_DOWNLOAD_TIMEOUT_MS: 108e5
75
+ };
54
76
 
55
77
  // ../shared/dist/index.mjs
56
78
  import { spawn } from "child_process";
@@ -83,6 +105,7 @@ function spawnAsync(command, args, options = {}) {
83
105
 
84
106
  // src/environment/docling-environment.ts
85
107
  import { spawn as spawn2 } from "child_process";
108
+ import { arch, platform } from "os";
86
109
  import { join } from "path";
87
110
 
88
111
  // src/utils/python-version.ts
@@ -124,6 +147,7 @@ var DoclingEnvironment = class _DoclingEnvironment {
124
147
  venvPath;
125
148
  port;
126
149
  killExistingProcess;
150
+ vlmDependenciesInstalled = false;
127
151
  constructor(options) {
128
152
  this.logger = options.logger;
129
153
  this.venvPath = options.venvPath;
@@ -253,6 +277,81 @@ var DoclingEnvironment = class _DoclingEnvironment {
253
277
  );
254
278
  }
255
279
  }
280
+ /**
281
+ * Install VLM-specific dependencies for the Docling VLM pipeline.
282
+ *
283
+ * Installs:
284
+ * 1. docling-serve[vlm] - VLM model support for docling-serve
285
+ * 2. mlx + mlx-lm (macOS ARM64 only) - Apple Silicon optimized inference
286
+ *
287
+ * This is idempotent - subsequent calls skip if already installed.
288
+ */
289
+ async setupVlmDependencies() {
290
+ if (this.vlmDependenciesInstalled) {
291
+ this.logger.info(
292
+ "[DoclingEnvironment] VLM dependencies already installed, skipping"
293
+ );
294
+ return;
295
+ }
296
+ if (await this.isVlmReady()) {
297
+ this.vlmDependenciesInstalled = true;
298
+ this.logger.info(
299
+ "[DoclingEnvironment] VLM dependencies already installed, skipping"
300
+ );
301
+ return;
302
+ }
303
+ this.logger.info("[DoclingEnvironment] Installing VLM dependencies...");
304
+ const pipPath = join(this.venvPath, "bin", "pip");
305
+ this.logger.info("[DoclingEnvironment] Installing docling[vlm]...");
306
+ const vlmResult = await spawnAsync(
307
+ pipPath,
308
+ ["install", "docling-serve[vlm]"],
309
+ { timeout: VLM_ENVIRONMENT.SETUP_TIMEOUT_MS }
310
+ );
311
+ if (vlmResult.code !== 0) {
312
+ this.logger.error(
313
+ "[DoclingEnvironment] Failed to install docling-serve[vlm]:",
314
+ vlmResult.stderr
315
+ );
316
+ throw new Error(
317
+ `Failed to install docling-serve[vlm]. Exit code: ${vlmResult.code}`
318
+ );
319
+ }
320
+ if (platform() === "darwin" && arch() === "arm64") {
321
+ this.logger.info(
322
+ "[DoclingEnvironment] Installing mlx + mlx-lm for Apple Silicon..."
323
+ );
324
+ const mlxResult = await spawnAsync(
325
+ pipPath,
326
+ ["install", "mlx", "mlx-lm"],
327
+ { timeout: VLM_ENVIRONMENT.SETUP_TIMEOUT_MS }
328
+ );
329
+ if (mlxResult.code !== 0) {
330
+ this.logger.error(
331
+ "[DoclingEnvironment] Failed to install mlx/mlx-lm:",
332
+ mlxResult.stderr
333
+ );
334
+ throw new Error(
335
+ `Failed to install mlx/mlx-lm. Exit code: ${mlxResult.code}`
336
+ );
337
+ }
338
+ }
339
+ this.vlmDependenciesInstalled = true;
340
+ this.logger.info(
341
+ "[DoclingEnvironment] VLM dependencies installed successfully"
342
+ );
343
+ }
344
+ /**
345
+ * Check if VLM dependencies are ready by verifying Python module imports
346
+ */
347
+ async isVlmReady() {
348
+ const pythonPath = join(this.venvPath, "bin", "python");
349
+ const result = await spawnAsync(pythonPath, [
350
+ "-c",
351
+ "import docling_core; import docling"
352
+ ]);
353
+ return result.code === 0;
354
+ }
256
355
  async isPortInUse(port) {
257
356
  try {
258
357
  const result = await spawnAsync("lsof", ["-ti", `:${port}`]);
@@ -339,6 +438,7 @@ var DoclingEnvironment = class _DoclingEnvironment {
339
438
  };
340
439
 
341
440
  // src/core/pdf-converter.ts
441
+ import { ValidationUtils } from "docling-sdk";
342
442
  import { omit } from "es-toolkit";
343
443
  import { createWriteStream as createWriteStream2, existsSync as existsSync3, rmSync as rmSync3 } from "fs";
344
444
  import { join as join4 } from "path";
@@ -789,11 +889,17 @@ var ImagePdfConverter = class {
789
889
  };
790
890
 
791
891
  // src/core/pdf-converter.ts
892
+ var _origAssertValidConversionOptions = ValidationUtils.assertValidConversionOptions.bind(ValidationUtils);
893
+ ValidationUtils.assertValidConversionOptions = (options) => {
894
+ const { pipeline: _pipeline, ...rest } = options;
895
+ _origAssertValidConversionOptions(rest);
896
+ };
792
897
  var PDFConverter = class {
793
- constructor(logger, client, enableImagePdfFallback = false) {
898
+ constructor(logger, client, enableImagePdfFallback = false, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
794
899
  this.logger = logger;
795
900
  this.client = client;
796
901
  this.enableImagePdfFallback = enableImagePdfFallback;
902
+ this.timeout = timeout;
797
903
  }
798
904
  async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
799
905
  this.logger.info("[PDFConverter] Converting:", url);
@@ -848,7 +954,15 @@ var PDFConverter = class {
848
954
  }
849
955
  async performConversion(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
850
956
  const startTime = Date.now();
851
- const conversionOptions = this.buildConversionOptions(options);
957
+ const pipelineType = options.pipeline ?? "standard";
958
+ const conversionOptions = pipelineType === "vlm" ? this.buildVlmConversionOptions(options) : this.buildConversionOptions(options);
959
+ if (pipelineType === "vlm") {
960
+ this.logger.info("[PDFConverter] Using VLM pipeline");
961
+ } else {
962
+ this.logger.info(
963
+ `[PDFConverter] OCR languages: ${JSON.stringify(conversionOptions.ocr_options?.lang)}`
964
+ );
965
+ }
852
966
  this.logger.info(
853
967
  "[PDFConverter] Converting document with Async Source API..."
854
968
  );
@@ -915,7 +1029,7 @@ var PDFConverter = class {
915
1029
  }
916
1030
  buildConversionOptions(options) {
917
1031
  return {
918
- ...omit(options, ["num_threads"]),
1032
+ ...omit(options, ["num_threads", "pipeline", "vlm_model"]),
919
1033
  to_formats: ["json", "html"],
920
1034
  image_export_mode: "embedded",
921
1035
  ocr_engine: "ocrmac",
@@ -941,6 +1055,31 @@ var PDFConverter = class {
941
1055
  }
942
1056
  };
943
1057
  }
1058
+ /**
1059
+ * Build conversion options for VLM pipeline.
1060
+ *
1061
+ * VLM pipeline uses a Vision Language Model instead of traditional OCR,
1062
+ * providing better accuracy for CJK characters and complex layouts.
1063
+ */
1064
+ buildVlmConversionOptions(options) {
1065
+ const vlmModel = resolveVlmModel(options.vlm_model ?? DEFAULT_VLM_MODEL);
1066
+ this.logger.info(
1067
+ `[PDFConverter] VLM model: ${vlmModel.repo_id} (framework: ${vlmModel.inference_framework}, format: ${vlmModel.response_format})`
1068
+ );
1069
+ return {
1070
+ ...omit(options, ["num_threads", "pipeline", "vlm_model", "ocr_lang"]),
1071
+ to_formats: ["json", "html"],
1072
+ image_export_mode: "embedded",
1073
+ pipeline: "vlm",
1074
+ vlm_pipeline_model_local: vlmModel,
1075
+ generate_picture_images: true,
1076
+ images_scale: 2,
1077
+ accelerator_options: {
1078
+ device: "mps",
1079
+ num_threads: options.num_threads
1080
+ }
1081
+ };
1082
+ }
944
1083
  async startConversionTask(url, conversionOptions) {
945
1084
  const task = await this.client.convertSourceAsync({
946
1085
  sources: [
@@ -976,38 +1115,42 @@ var PDFConverter = class {
976
1115
  }
977
1116
  async trackTaskProgress(task) {
978
1117
  const conversionStartTime = Date.now();
979
- let lastStatus = "";
980
- let isCompleted = false;
981
- const pollInterval = setInterval(() => {
982
- if (isCompleted) return;
983
- const elapsed = Math.floor((Date.now() - conversionStartTime) / 1e3);
984
- process.stdout.write(
985
- `\r[PDFConverter] Status: ${lastStatus || "processing"} (${elapsed}s elapsed)`
986
- );
987
- }, PDF_CONVERTER.POLL_INTERVAL_MS);
988
- task.on("progress", (status) => {
989
- lastStatus = status.task_status;
1118
+ let lastProgressLine = "";
1119
+ const logProgress = (status) => {
1120
+ const parts = [`Status: ${status.task_status}`];
990
1121
  if (status.task_position !== void 0) {
991
- process.stdout.write(
992
- `\r[PDFConverter] Status: ${status.task_status} (position: ${status.task_position})`
993
- );
1122
+ parts.push(`position: ${status.task_position}`);
994
1123
  }
995
- });
996
- task.on("complete", () => {
997
- isCompleted = true;
998
- clearInterval(pollInterval);
999
- this.logger.info("\n[PDFConverter] Conversion completed!");
1000
- });
1001
- task.on("error", (error) => {
1002
- isCompleted = true;
1003
- clearInterval(pollInterval);
1004
- this.logger.error("\n[PDFConverter] Conversion error:", error.message);
1005
- });
1006
- try {
1007
- await task.waitForCompletion();
1008
- } finally {
1009
- isCompleted = true;
1010
- clearInterval(pollInterval);
1124
+ const meta = status.task_meta;
1125
+ if (meta) {
1126
+ if (meta.processed_documents !== void 0 && meta.total_documents !== void 0) {
1127
+ parts.push(
1128
+ `progress: ${meta.processed_documents}/${meta.total_documents}`
1129
+ );
1130
+ }
1131
+ }
1132
+ const progressLine = `\r[PDFConverter] ${parts.join(" | ")}`;
1133
+ if (progressLine !== lastProgressLine) {
1134
+ lastProgressLine = progressLine;
1135
+ process.stdout.write(progressLine);
1136
+ }
1137
+ };
1138
+ while (true) {
1139
+ if (Date.now() - conversionStartTime > this.timeout) {
1140
+ throw new Error("Task timeout");
1141
+ }
1142
+ const status = await task.poll();
1143
+ logProgress(status);
1144
+ if (status.task_status === "success") {
1145
+ this.logger.info("\n[PDFConverter] Conversion completed!");
1146
+ return;
1147
+ }
1148
+ if (status.task_status === "failure") {
1149
+ throw new Error("Task failed with status: failure");
1150
+ }
1151
+ await new Promise(
1152
+ (resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
1153
+ );
1011
1154
  }
1012
1155
  }
1013
1156
  async downloadResult(taskId) {
@@ -1043,6 +1186,7 @@ var PDFParser = class {
1043
1186
  killExistingProcess;
1044
1187
  enableImagePdfFallback;
1045
1188
  client = null;
1189
+ environment;
1046
1190
  constructor(options) {
1047
1191
  const {
1048
1192
  logger,
@@ -1087,13 +1231,13 @@ var PDFParser = class {
1087
1231
  }
1088
1232
  this.logger.info("[PDFParser] Setting up local server...");
1089
1233
  try {
1090
- const environment = new DoclingEnvironment({
1234
+ this.environment = new DoclingEnvironment({
1091
1235
  logger: this.logger,
1092
1236
  venvPath: this.venvPath,
1093
1237
  port: this.port,
1094
1238
  killExistingProcess: this.killExistingProcess
1095
1239
  });
1096
- await environment.setup();
1240
+ await this.environment.setup();
1097
1241
  const clientUrl = `http://localhost:${this.port}`;
1098
1242
  this.client = new Docling({
1099
1243
  api: {
@@ -1109,9 +1253,9 @@ var PDFParser = class {
1109
1253
  }
1110
1254
  }
1111
1255
  checkOperatingSystem() {
1112
- if (platform() !== "darwin") {
1256
+ if (platform2() !== "darwin") {
1113
1257
  throw new Error(
1114
- "PDFParser is only supported on macOS. Current platform: " + platform()
1258
+ "PDFParser is only supported on macOS. Current platform: " + platform2()
1115
1259
  );
1116
1260
  }
1117
1261
  }
@@ -1239,6 +1383,12 @@ var PDFParser = class {
1239
1383
  "PDFParser is not initialized. Call init() before using parse()"
1240
1384
  );
1241
1385
  }
1386
+ if (options.pipeline === "vlm" && this.environment && !this.baseUrl) {
1387
+ this.logger.info(
1388
+ "[PDFParser] VLM pipeline requested, ensuring VLM dependencies..."
1389
+ );
1390
+ await this.environment.setupVlmDependencies();
1391
+ }
1242
1392
  const canRecover = !this.baseUrl && this.port !== void 0;
1243
1393
  const maxAttempts = PDF_PARSER.MAX_SERVER_RECOVERY_ATTEMPTS;
1244
1394
  let attempt = 0;
@@ -1248,7 +1398,8 @@ var PDFParser = class {
1248
1398
  const converter = new PDFConverter(
1249
1399
  this.logger,
1250
1400
  this.client,
1251
- effectiveFallbackEnabled
1401
+ effectiveFallbackEnabled,
1402
+ this.timeout
1252
1403
  );
1253
1404
  return await converter.convert(
1254
1405
  url,
@@ -1295,7 +1446,10 @@ var PDFParser = class {
1295
1446
  }
1296
1447
  };
1297
1448
  export {
1449
+ DEFAULT_VLM_MODEL,
1298
1450
  ImagePdfFallbackError,
1299
- PDFParser
1451
+ PDFParser,
1452
+ VLM_MODELS,
1453
+ resolveVlmModel
1300
1454
  };
1301
1455
  //# sourceMappingURL=index.js.map