@heripo/pdf-parser 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1,11 +1,26 @@
1
1
  import { LoggerMethods } from '@heripo/logger';
2
- import { ConversionOptions } from 'docling-sdk';
2
+ import { ConversionOptions, VlmModelLocal } from 'docling-sdk';
3
+ export { DEFAULT_VLM_MODEL, VLM_MODELS, VlmModelPreset, resolveVlmModel } from './vlm-models.cjs';
3
4
 
4
5
  /**
5
6
  * Callback function invoked after PDF conversion completes
6
7
  * @param outputPath Absolute path to the output directory containing result files
7
8
  */
8
9
  type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
10
+ /**
11
+ * Pipeline type for PDF conversion
12
+ * - 'standard': Use OCR-based pipeline (default, uses ocrmac)
13
+ * - 'vlm': Use Vision Language Model pipeline for better CJK/complex layout handling
14
+ */
15
+ type PipelineType = 'standard' | 'vlm';
16
+ /**
17
+ * Extended options for PDF conversion including pipeline selection
18
+ */
19
+ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local'> & {
20
+ num_threads?: number;
21
+ pipeline?: PipelineType;
22
+ vlm_model?: string | VlmModelLocal;
23
+ };
9
24
 
10
25
  type Options = {
11
26
  logger: LoggerMethods;
@@ -82,6 +97,7 @@ declare class PDFParser {
82
97
  private readonly killExistingProcess;
83
98
  private readonly enableImagePdfFallback;
84
99
  private client;
100
+ private environment?;
85
101
  constructor(options: Options);
86
102
  init(): Promise<void>;
87
103
  private checkOperatingSystem;
@@ -104,9 +120,7 @@ declare class PDFParser {
104
120
  */
105
121
  private restartServer;
106
122
  private waitForServerReady;
107
- parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options: Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr'> & {
108
- num_threads?: number;
109
- }, abortSignal?: AbortSignal): Promise<void>;
123
+ parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options: PDFConvertOptions, abortSignal?: AbortSignal): Promise<void>;
110
124
  /**
111
125
  * Dispose the parser instance.
112
126
  * - Sets the internal client to null
@@ -126,4 +140,4 @@ declare class ImagePdfFallbackError extends Error {
126
140
  constructor(originalError: Error, fallbackError: Error);
127
141
  }
128
142
 
129
- export { type ConversionCompleteCallback, ImagePdfFallbackError, PDFParser };
143
+ export { type ConversionCompleteCallback, ImagePdfFallbackError, type PDFConvertOptions, PDFParser, type PipelineType };
package/dist/index.d.ts CHANGED
@@ -1,11 +1,26 @@
1
1
  import { LoggerMethods } from '@heripo/logger';
2
- import { ConversionOptions } from 'docling-sdk';
2
+ import { ConversionOptions, VlmModelLocal } from 'docling-sdk';
3
+ export { DEFAULT_VLM_MODEL, VLM_MODELS, VlmModelPreset, resolveVlmModel } from './vlm-models.js';
3
4
 
4
5
  /**
5
6
  * Callback function invoked after PDF conversion completes
6
7
  * @param outputPath Absolute path to the output directory containing result files
7
8
  */
8
9
  type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
10
+ /**
11
+ * Pipeline type for PDF conversion
12
+ * - 'standard': Use OCR-based pipeline (default, uses ocrmac)
13
+ * - 'vlm': Use Vision Language Model pipeline for better CJK/complex layout handling
14
+ */
15
+ type PipelineType = 'standard' | 'vlm';
16
+ /**
17
+ * Extended options for PDF conversion including pipeline selection
18
+ */
19
+ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local'> & {
20
+ num_threads?: number;
21
+ pipeline?: PipelineType;
22
+ vlm_model?: string | VlmModelLocal;
23
+ };
9
24
 
10
25
  type Options = {
11
26
  logger: LoggerMethods;
@@ -82,6 +97,7 @@ declare class PDFParser {
82
97
  private readonly killExistingProcess;
83
98
  private readonly enableImagePdfFallback;
84
99
  private client;
100
+ private environment?;
85
101
  constructor(options: Options);
86
102
  init(): Promise<void>;
87
103
  private checkOperatingSystem;
@@ -104,9 +120,7 @@ declare class PDFParser {
104
120
  */
105
121
  private restartServer;
106
122
  private waitForServerReady;
107
- parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options: Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr'> & {
108
- num_threads?: number;
109
- }, abortSignal?: AbortSignal): Promise<void>;
123
+ parse(url: string, reportId: string, onComplete: ConversionCompleteCallback, cleanupAfterCallback: boolean, options: PDFConvertOptions, abortSignal?: AbortSignal): Promise<void>;
110
124
  /**
111
125
  * Dispose the parser instance.
112
126
  * - Sets the internal client to null
@@ -126,4 +140,4 @@ declare class ImagePdfFallbackError extends Error {
126
140
  constructor(originalError: Error, fallbackError: Error);
127
141
  }
128
142
 
129
- export { type ConversionCompleteCallback, ImagePdfFallbackError, PDFParser };
143
+ export { type ConversionCompleteCallback, ImagePdfFallbackError, type PDFConvertOptions, PDFParser, type PipelineType };
package/dist/index.js CHANGED
@@ -1,9 +1,14 @@
1
+ import {
2
+ DEFAULT_VLM_MODEL,
3
+ VLM_MODELS,
4
+ resolveVlmModel
5
+ } from "./chunk-WWNI354M.js";
1
6
  import "./chunk-VUNV25KB.js";
2
7
 
3
8
  // src/core/pdf-parser.ts
4
9
  import { Docling } from "docling-sdk";
5
10
  import { execSync } from "child_process";
6
- import { platform } from "os";
11
+ import { platform as platform2 } from "os";
7
12
  import { join as join5 } from "path";
8
13
 
9
14
  // src/config/constants.ts
@@ -33,7 +38,11 @@ var PDF_CONVERTER = {
33
38
  /**
34
39
  * Interval for progress polling in milliseconds
35
40
  */
36
- POLL_INTERVAL_MS: 1e3
41
+ POLL_INTERVAL_MS: 1e3,
42
+ /**
43
+ * Default timeout for task completion in milliseconds (30 minutes)
44
+ */
45
+ DEFAULT_TIMEOUT_MS: 18e5
37
46
  };
38
47
  var DOCLING_ENVIRONMENT = {
39
48
  /**
@@ -51,6 +60,19 @@ var IMAGE_PDF_CONVERTER = {
51
60
  */
52
61
  QUALITY: 100
53
62
  };
63
+ var VLM_ENVIRONMENT = {
64
+ /**
65
+ * Timeout for VLM dependency installation (pip install) in milliseconds (3 hours).
66
+ * VLM packages can be very large and may require extended download times
67
+ * depending on network conditions.
68
+ */
69
+ SETUP_TIMEOUT_MS: 108e5,
70
+ /**
71
+ * Timeout for VLM model download in milliseconds (3 hours).
72
+ * Large VLM models (e.g., multi-GB weights) need sufficient time to download.
73
+ */
74
+ MODEL_DOWNLOAD_TIMEOUT_MS: 108e5
75
+ };
54
76
 
55
77
  // ../shared/dist/index.mjs
56
78
  import { spawn } from "child_process";
@@ -83,6 +105,7 @@ function spawnAsync(command, args, options = {}) {
83
105
 
84
106
  // src/environment/docling-environment.ts
85
107
  import { spawn as spawn2 } from "child_process";
108
+ import { arch, platform } from "os";
86
109
  import { join } from "path";
87
110
 
88
111
  // src/utils/python-version.ts
@@ -124,6 +147,7 @@ var DoclingEnvironment = class _DoclingEnvironment {
124
147
  venvPath;
125
148
  port;
126
149
  killExistingProcess;
150
+ vlmDependenciesInstalled = false;
127
151
  constructor(options) {
128
152
  this.logger = options.logger;
129
153
  this.venvPath = options.venvPath;
@@ -253,6 +277,81 @@ var DoclingEnvironment = class _DoclingEnvironment {
253
277
  );
254
278
  }
255
279
  }
280
+ /**
281
+ * Install VLM-specific dependencies for the Docling VLM pipeline.
282
+ *
283
+ * Installs:
284
+ * 1. docling-serve[vlm] - VLM model support for docling-serve
285
+ * 2. mlx + mlx-lm (macOS ARM64 only) - Apple Silicon optimized inference
286
+ *
287
+ * This is idempotent - subsequent calls skip if already installed.
288
+ */
289
+ async setupVlmDependencies() {
290
+ if (this.vlmDependenciesInstalled) {
291
+ this.logger.info(
292
+ "[DoclingEnvironment] VLM dependencies already installed, skipping"
293
+ );
294
+ return;
295
+ }
296
+ if (await this.isVlmReady()) {
297
+ this.vlmDependenciesInstalled = true;
298
+ this.logger.info(
299
+ "[DoclingEnvironment] VLM dependencies already installed, skipping"
300
+ );
301
+ return;
302
+ }
303
+ this.logger.info("[DoclingEnvironment] Installing VLM dependencies...");
304
+ const pipPath = join(this.venvPath, "bin", "pip");
305
+ this.logger.info("[DoclingEnvironment] Installing docling[vlm]...");
306
+ const vlmResult = await spawnAsync(
307
+ pipPath,
308
+ ["install", "docling-serve[vlm]"],
309
+ { timeout: VLM_ENVIRONMENT.SETUP_TIMEOUT_MS }
310
+ );
311
+ if (vlmResult.code !== 0) {
312
+ this.logger.error(
313
+ "[DoclingEnvironment] Failed to install docling-serve[vlm]:",
314
+ vlmResult.stderr
315
+ );
316
+ throw new Error(
317
+ `Failed to install docling-serve[vlm]. Exit code: ${vlmResult.code}`
318
+ );
319
+ }
320
+ if (platform() === "darwin" && arch() === "arm64") {
321
+ this.logger.info(
322
+ "[DoclingEnvironment] Installing mlx + mlx-lm for Apple Silicon..."
323
+ );
324
+ const mlxResult = await spawnAsync(
325
+ pipPath,
326
+ ["install", "mlx", "mlx-lm"],
327
+ { timeout: VLM_ENVIRONMENT.SETUP_TIMEOUT_MS }
328
+ );
329
+ if (mlxResult.code !== 0) {
330
+ this.logger.error(
331
+ "[DoclingEnvironment] Failed to install mlx/mlx-lm:",
332
+ mlxResult.stderr
333
+ );
334
+ throw new Error(
335
+ `Failed to install mlx/mlx-lm. Exit code: ${mlxResult.code}`
336
+ );
337
+ }
338
+ }
339
+ this.vlmDependenciesInstalled = true;
340
+ this.logger.info(
341
+ "[DoclingEnvironment] VLM dependencies installed successfully"
342
+ );
343
+ }
344
+ /**
345
+ * Check if VLM dependencies are ready by verifying Python module imports
346
+ */
347
+ async isVlmReady() {
348
+ const pythonPath = join(this.venvPath, "bin", "python");
349
+ const result = await spawnAsync(pythonPath, [
350
+ "-c",
351
+ "import docling_core; import docling"
352
+ ]);
353
+ return result.code === 0;
354
+ }
256
355
  async isPortInUse(port) {
257
356
  try {
258
357
  const result = await spawnAsync("lsof", ["-ti", `:${port}`]);
@@ -339,6 +438,7 @@ var DoclingEnvironment = class _DoclingEnvironment {
339
438
  };
340
439
 
341
440
  // src/core/pdf-converter.ts
441
+ import { ValidationUtils } from "docling-sdk";
342
442
  import { omit } from "es-toolkit";
343
443
  import { createWriteStream as createWriteStream2, existsSync as existsSync3, rmSync as rmSync3 } from "fs";
344
444
  import { join as join4 } from "path";
@@ -789,11 +889,17 @@ var ImagePdfConverter = class {
789
889
  };
790
890
 
791
891
  // src/core/pdf-converter.ts
892
+ var _origAssertValidConversionOptions = ValidationUtils.assertValidConversionOptions.bind(ValidationUtils);
893
+ ValidationUtils.assertValidConversionOptions = (options) => {
894
+ const { pipeline: _pipeline, ...rest } = options;
895
+ _origAssertValidConversionOptions(rest);
896
+ };
792
897
  var PDFConverter = class {
793
- constructor(logger, client, enableImagePdfFallback = false) {
898
+ constructor(logger, client, enableImagePdfFallback = false, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
794
899
  this.logger = logger;
795
900
  this.client = client;
796
901
  this.enableImagePdfFallback = enableImagePdfFallback;
902
+ this.timeout = timeout;
797
903
  }
798
904
  async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
799
905
  this.logger.info("[PDFConverter] Converting:", url);
@@ -848,10 +954,15 @@ var PDFConverter = class {
848
954
  }
849
955
  async performConversion(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
850
956
  const startTime = Date.now();
851
- const conversionOptions = this.buildConversionOptions(options);
852
- this.logger.info(
853
- `[PDFConverter] OCR languages: ${JSON.stringify(conversionOptions.ocr_options?.lang)}`
854
- );
957
+ const pipelineType = options.pipeline ?? "standard";
958
+ const conversionOptions = pipelineType === "vlm" ? this.buildVlmConversionOptions(options) : this.buildConversionOptions(options);
959
+ if (pipelineType === "vlm") {
960
+ this.logger.info("[PDFConverter] Using VLM pipeline");
961
+ } else {
962
+ this.logger.info(
963
+ `[PDFConverter] OCR languages: ${JSON.stringify(conversionOptions.ocr_options?.lang)}`
964
+ );
965
+ }
855
966
  this.logger.info(
856
967
  "[PDFConverter] Converting document with Async Source API..."
857
968
  );
@@ -918,7 +1029,7 @@ var PDFConverter = class {
918
1029
  }
919
1030
  buildConversionOptions(options) {
920
1031
  return {
921
- ...omit(options, ["num_threads"]),
1032
+ ...omit(options, ["num_threads", "pipeline", "vlm_model"]),
922
1033
  to_formats: ["json", "html"],
923
1034
  image_export_mode: "embedded",
924
1035
  ocr_engine: "ocrmac",
@@ -944,6 +1055,31 @@ var PDFConverter = class {
944
1055
  }
945
1056
  };
946
1057
  }
1058
+ /**
1059
+ * Build conversion options for VLM pipeline.
1060
+ *
1061
+ * VLM pipeline uses a Vision Language Model instead of traditional OCR,
1062
+ * providing better accuracy for CJK characters and complex layouts.
1063
+ */
1064
+ buildVlmConversionOptions(options) {
1065
+ const vlmModel = resolveVlmModel(options.vlm_model ?? DEFAULT_VLM_MODEL);
1066
+ this.logger.info(
1067
+ `[PDFConverter] VLM model: ${vlmModel.repo_id} (framework: ${vlmModel.inference_framework}, format: ${vlmModel.response_format})`
1068
+ );
1069
+ return {
1070
+ ...omit(options, ["num_threads", "pipeline", "vlm_model", "ocr_lang"]),
1071
+ to_formats: ["json", "html"],
1072
+ image_export_mode: "embedded",
1073
+ pipeline: "vlm",
1074
+ vlm_pipeline_model_local: vlmModel,
1075
+ generate_picture_images: true,
1076
+ images_scale: 2,
1077
+ accelerator_options: {
1078
+ device: "mps",
1079
+ num_threads: options.num_threads
1080
+ }
1081
+ };
1082
+ }
947
1083
  async startConversionTask(url, conversionOptions) {
948
1084
  const task = await this.client.convertSourceAsync({
949
1085
  sources: [
@@ -979,38 +1115,42 @@ var PDFConverter = class {
979
1115
  }
980
1116
  async trackTaskProgress(task) {
981
1117
  const conversionStartTime = Date.now();
982
- let lastStatus = "";
983
- let isCompleted = false;
984
- const pollInterval = setInterval(() => {
985
- if (isCompleted) return;
986
- const elapsed = Math.floor((Date.now() - conversionStartTime) / 1e3);
987
- process.stdout.write(
988
- `\r[PDFConverter] Status: ${lastStatus || "processing"} (${elapsed}s elapsed)`
989
- );
990
- }, PDF_CONVERTER.POLL_INTERVAL_MS);
991
- task.on("progress", (status) => {
992
- lastStatus = status.task_status;
1118
+ let lastProgressLine = "";
1119
+ const logProgress = (status) => {
1120
+ const parts = [`Status: ${status.task_status}`];
993
1121
  if (status.task_position !== void 0) {
994
- process.stdout.write(
995
- `\r[PDFConverter] Status: ${status.task_status} (position: ${status.task_position})`
996
- );
1122
+ parts.push(`position: ${status.task_position}`);
997
1123
  }
998
- });
999
- task.on("complete", () => {
1000
- isCompleted = true;
1001
- clearInterval(pollInterval);
1002
- this.logger.info("\n[PDFConverter] Conversion completed!");
1003
- });
1004
- task.on("error", (error) => {
1005
- isCompleted = true;
1006
- clearInterval(pollInterval);
1007
- this.logger.error("\n[PDFConverter] Conversion error:", error.message);
1008
- });
1009
- try {
1010
- await task.waitForCompletion();
1011
- } finally {
1012
- isCompleted = true;
1013
- clearInterval(pollInterval);
1124
+ const meta = status.task_meta;
1125
+ if (meta) {
1126
+ if (meta.processed_documents !== void 0 && meta.total_documents !== void 0) {
1127
+ parts.push(
1128
+ `progress: ${meta.processed_documents}/${meta.total_documents}`
1129
+ );
1130
+ }
1131
+ }
1132
+ const progressLine = `\r[PDFConverter] ${parts.join(" | ")}`;
1133
+ if (progressLine !== lastProgressLine) {
1134
+ lastProgressLine = progressLine;
1135
+ process.stdout.write(progressLine);
1136
+ }
1137
+ };
1138
+ while (true) {
1139
+ if (Date.now() - conversionStartTime > this.timeout) {
1140
+ throw new Error("Task timeout");
1141
+ }
1142
+ const status = await task.poll();
1143
+ logProgress(status);
1144
+ if (status.task_status === "success") {
1145
+ this.logger.info("\n[PDFConverter] Conversion completed!");
1146
+ return;
1147
+ }
1148
+ if (status.task_status === "failure") {
1149
+ throw new Error("Task failed with status: failure");
1150
+ }
1151
+ await new Promise(
1152
+ (resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
1153
+ );
1014
1154
  }
1015
1155
  }
1016
1156
  async downloadResult(taskId) {
@@ -1046,6 +1186,7 @@ var PDFParser = class {
1046
1186
  killExistingProcess;
1047
1187
  enableImagePdfFallback;
1048
1188
  client = null;
1189
+ environment;
1049
1190
  constructor(options) {
1050
1191
  const {
1051
1192
  logger,
@@ -1090,13 +1231,13 @@ var PDFParser = class {
1090
1231
  }
1091
1232
  this.logger.info("[PDFParser] Setting up local server...");
1092
1233
  try {
1093
- const environment = new DoclingEnvironment({
1234
+ this.environment = new DoclingEnvironment({
1094
1235
  logger: this.logger,
1095
1236
  venvPath: this.venvPath,
1096
1237
  port: this.port,
1097
1238
  killExistingProcess: this.killExistingProcess
1098
1239
  });
1099
- await environment.setup();
1240
+ await this.environment.setup();
1100
1241
  const clientUrl = `http://localhost:${this.port}`;
1101
1242
  this.client = new Docling({
1102
1243
  api: {
@@ -1112,9 +1253,9 @@ var PDFParser = class {
1112
1253
  }
1113
1254
  }
1114
1255
  checkOperatingSystem() {
1115
- if (platform() !== "darwin") {
1256
+ if (platform2() !== "darwin") {
1116
1257
  throw new Error(
1117
- "PDFParser is only supported on macOS. Current platform: " + platform()
1258
+ "PDFParser is only supported on macOS. Current platform: " + platform2()
1118
1259
  );
1119
1260
  }
1120
1261
  }
@@ -1242,6 +1383,12 @@ var PDFParser = class {
1242
1383
  "PDFParser is not initialized. Call init() before using parse()"
1243
1384
  );
1244
1385
  }
1386
+ if (options.pipeline === "vlm" && this.environment && !this.baseUrl) {
1387
+ this.logger.info(
1388
+ "[PDFParser] VLM pipeline requested, ensuring VLM dependencies..."
1389
+ );
1390
+ await this.environment.setupVlmDependencies();
1391
+ }
1245
1392
  const canRecover = !this.baseUrl && this.port !== void 0;
1246
1393
  const maxAttempts = PDF_PARSER.MAX_SERVER_RECOVERY_ATTEMPTS;
1247
1394
  let attempt = 0;
@@ -1251,7 +1398,8 @@ var PDFParser = class {
1251
1398
  const converter = new PDFConverter(
1252
1399
  this.logger,
1253
1400
  this.client,
1254
- effectiveFallbackEnabled
1401
+ effectiveFallbackEnabled,
1402
+ this.timeout
1255
1403
  );
1256
1404
  return await converter.convert(
1257
1405
  url,
@@ -1298,7 +1446,10 @@ var PDFParser = class {
1298
1446
  }
1299
1447
  };
1300
1448
  export {
1449
+ DEFAULT_VLM_MODEL,
1301
1450
  ImagePdfFallbackError,
1302
- PDFParser
1451
+ PDFParser,
1452
+ VLM_MODELS,
1453
+ resolveVlmModel
1303
1454
  };
1304
1455
  //# sourceMappingURL=index.js.map