@heripo/pdf-parser 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,121 @@
1
+ // src/config/vlm-models.ts
2
/**
 * Built-in VLM model presets, keyed by preset name.
 *
 * Every entry provides the four fields that resolveVlmModel() copies into its
 * result (repo_id, inference_framework, response_format,
 * transformers_model_type) plus a human-readable description.
 */
var VLM_MODELS = {
  // ── DocTags models (specialized document structure output) ──────────
  "granite-docling-258M-mlx": {
    repo_id: "ibm-granite/granite-docling-258M-mlx",
    inference_framework: "mlx",
    response_format: "doctags",
    transformers_model_type: "automodel-vision2seq",
    description: "Granite Docling 258M (MLX, Apple Silicon optimized, ~6s/page)"
  },
  "granite-docling-258M": {
    repo_id: "ibm-granite/granite-docling-258M",
    inference_framework: "transformers",
    response_format: "doctags",
    transformers_model_type: "automodel-vision2seq",
    description: "Granite Docling 258M (Transformers, cross-platform)"
  },
  "smoldocling-256M-mlx": {
    repo_id: "docling-project/SmolDocling-256M-preview-mlx-bf16",
    inference_framework: "mlx",
    response_format: "doctags",
    transformers_model_type: "automodel-vision2seq",
    description: "SmolDocling 256M (MLX, fastest option)"
  },
  "smoldocling-256M": {
    repo_id: "docling-project/SmolDocling-256M-preview",
    inference_framework: "transformers",
    response_format: "doctags",
    transformers_model_type: "automodel-vision2seq",
    description: "SmolDocling 256M (Transformers)"
  },
  // ── Markdown models (general-purpose vision LLMs) ──────────────────
  "granite-vision-2B": {
    repo_id: "ibm-granite/granite-vision-3.2-2b",
    inference_framework: "transformers",
    response_format: "markdown",
    transformers_model_type: "automodel-vision2seq",
    description: "Granite Vision 3.2 2B (IBM, higher accuracy)"
  },
  "qwen25-vl-3B-mlx": {
    repo_id: "mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
    inference_framework: "mlx",
    response_format: "markdown",
    transformers_model_type: "automodel-vision2seq",
    description: "Qwen 2.5 VL 3B (MLX, multilingual, good KCJ support)"
  },
  phi4: {
    repo_id: "microsoft/Phi-4-multimodal-instruct",
    inference_framework: "transformers",
    response_format: "markdown",
    transformers_model_type: "automodel",
    description: "Phi-4 Multimodal (Microsoft, CausalLM)"
  },
  "pixtral-12B-mlx": {
    repo_id: "mlx-community/pixtral-12b-bf16",
    inference_framework: "mlx",
    response_format: "markdown",
    transformers_model_type: "automodel-vision2seq",
    description: "Pixtral 12B (MLX, Mistral, high accuracy)"
  },
  "pixtral-12B": {
    repo_id: "mistral-community/pixtral-12b",
    inference_framework: "transformers",
    response_format: "markdown",
    transformers_model_type: "automodel-vision2seq",
    description: "Pixtral 12B (Transformers, Mistral)"
  },
  got2: {
    repo_id: "stepfun-ai/GOT-OCR-2.0-hf",
    inference_framework: "transformers",
    response_format: "markdown",
    transformers_model_type: "automodel-vision2seq",
    description: "GOT-OCR 2.0 (StepFun, OCR-specialized)"
  },
  "gemma3-12B-mlx": {
    repo_id: "mlx-community/gemma-3-12b-it-bf16",
    inference_framework: "mlx",
    response_format: "markdown",
    transformers_model_type: "automodel-vision2seq",
    description: "Gemma 3 12B (MLX, Google)"
  },
  "gemma3-27B-mlx": {
    repo_id: "mlx-community/gemma-3-27b-it-bf16",
    inference_framework: "mlx",
    response_format: "markdown",
    transformers_model_type: "automodel-vision2seq",
    description: "Gemma 3 27B (MLX, Google, highest accuracy)"
  },
  dolphin: {
    repo_id: "ByteDance/Dolphin",
    inference_framework: "transformers",
    response_format: "markdown",
    transformers_model_type: "automodel-vision2seq",
    description: "Dolphin (ByteDance, document-oriented)"
  }
};
/**
 * Preset key used when the caller does not pick a VLM model.
 */
var DEFAULT_VLM_MODEL = "granite-docling-258M-mlx";
/**
 * Resolve a VLM model from a preset key or a caller-supplied model object.
 *
 * @param model - Either a key of VLM_MODELS or a custom model object.
 * @returns For a preset key, a new object holding only repo_id,
 *   inference_framework, response_format and transformers_model_type
 *   (the description is not copied). A non-string argument is returned as-is.
 * @throws {Error} When a string is given that is not a key of VLM_MODELS.
 */
function resolveVlmModel(model) {
  if (typeof model !== "string") {
    return model;
  }
  // Own-property check: a plain `VLM_MODELS[model]` lookup also finds names
  // inherited from Object.prototype ("constructor", "toString", "__proto__", ...),
  // which would slip past the unknown-preset error and yield undefined fields.
  if (!Object.prototype.hasOwnProperty.call(VLM_MODELS, model)) {
    throw new Error(
      `Unknown VLM model preset: "${model}". Available presets: ${Object.keys(VLM_MODELS).join(", ")}`
    );
  }
  const {
    repo_id,
    inference_framework,
    response_format,
    transformers_model_type
  } = VLM_MODELS[model];
  return {
    repo_id,
    inference_framework,
    response_format,
    transformers_model_type
  };
}
115
+
116
+ export {
117
+ VLM_MODELS,
118
+ DEFAULT_VLM_MODEL,
119
+ resolveVlmModel
120
+ };
121
+ //# sourceMappingURL=chunk-WWNI354M.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/config/vlm-models.ts"],"sourcesContent":["import type { VlmModelLocal } from 'docling-sdk';\n\n/**\n * VLM model preset with description\n */\nexport interface VlmModelPreset {\n repo_id: string;\n inference_framework: 'mlx' | 'transformers';\n response_format: 'doctags' | 'markdown';\n transformers_model_type: 'automodel-vision2seq' | 'automodel';\n description: string;\n}\n\n/**\n * Available VLM model presets\n *\n * Based on Docling's official VLM model specs:\n * https://docling-project.github.io/docling/usage/vision_models/#available-local-models\n *\n * Users can select a preset key or provide a custom VlmModelLocal object.\n */\nexport const VLM_MODELS: Record<string, VlmModelPreset> = {\n // ── DocTags models (specialized document structure output) ──────────\n\n 'granite-docling-258M-mlx': {\n repo_id: 'ibm-granite/granite-docling-258M-mlx',\n inference_framework: 'mlx',\n response_format: 'doctags',\n transformers_model_type: 'automodel-vision2seq',\n description:\n 'Granite Docling 258M (MLX, Apple Silicon optimized, ~6s/page)',\n },\n 'granite-docling-258M': {\n repo_id: 'ibm-granite/granite-docling-258M',\n inference_framework: 'transformers',\n response_format: 'doctags',\n transformers_model_type: 'automodel-vision2seq',\n description: 'Granite Docling 258M (Transformers, cross-platform)',\n },\n 'smoldocling-256M-mlx': {\n repo_id: 'docling-project/SmolDocling-256M-preview-mlx-bf16',\n inference_framework: 'mlx',\n response_format: 'doctags',\n transformers_model_type: 'automodel-vision2seq',\n description: 'SmolDocling 256M (MLX, fastest option)',\n },\n 'smoldocling-256M': {\n repo_id: 'docling-project/SmolDocling-256M-preview',\n inference_framework: 'transformers',\n response_format: 'doctags',\n transformers_model_type: 'automodel-vision2seq',\n description: 'SmolDocling 256M (Transformers)',\n },\n\n // ── Markdown models (general-purpose vision LLMs) ──────────────────\n\n 'granite-vision-2B': {\n repo_id: 
'ibm-granite/granite-vision-3.2-2b',\n inference_framework: 'transformers',\n response_format: 'markdown',\n transformers_model_type: 'automodel-vision2seq',\n description: 'Granite Vision 3.2 2B (IBM, higher accuracy)',\n },\n 'qwen25-vl-3B-mlx': {\n repo_id: 'mlx-community/Qwen2.5-VL-3B-Instruct-bf16',\n inference_framework: 'mlx',\n response_format: 'markdown',\n transformers_model_type: 'automodel-vision2seq',\n description: 'Qwen 2.5 VL 3B (MLX, multilingual, good KCJ support)',\n },\n phi4: {\n repo_id: 'microsoft/Phi-4-multimodal-instruct',\n inference_framework: 'transformers',\n response_format: 'markdown',\n transformers_model_type: 'automodel',\n description: 'Phi-4 Multimodal (Microsoft, CausalLM)',\n },\n 'pixtral-12B-mlx': {\n repo_id: 'mlx-community/pixtral-12b-bf16',\n inference_framework: 'mlx',\n response_format: 'markdown',\n transformers_model_type: 'automodel-vision2seq',\n description: 'Pixtral 12B (MLX, Mistral, high accuracy)',\n },\n 'pixtral-12B': {\n repo_id: 'mistral-community/pixtral-12b',\n inference_framework: 'transformers',\n response_format: 'markdown',\n transformers_model_type: 'automodel-vision2seq',\n description: 'Pixtral 12B (Transformers, Mistral)',\n },\n got2: {\n repo_id: 'stepfun-ai/GOT-OCR-2.0-hf',\n inference_framework: 'transformers',\n response_format: 'markdown',\n transformers_model_type: 'automodel-vision2seq',\n description: 'GOT-OCR 2.0 (StepFun, OCR-specialized)',\n },\n 'gemma3-12B-mlx': {\n repo_id: 'mlx-community/gemma-3-12b-it-bf16',\n inference_framework: 'mlx',\n response_format: 'markdown',\n transformers_model_type: 'automodel-vision2seq',\n description: 'Gemma 3 12B (MLX, Google)',\n },\n 'gemma3-27B-mlx': {\n repo_id: 'mlx-community/gemma-3-27b-it-bf16',\n inference_framework: 'mlx',\n response_format: 'markdown',\n transformers_model_type: 'automodel-vision2seq',\n description: 'Gemma 3 27B (MLX, Google, highest accuracy)',\n },\n dolphin: {\n repo_id: 'ByteDance/Dolphin',\n inference_framework: 
'transformers',\n response_format: 'markdown',\n transformers_model_type: 'automodel-vision2seq',\n description: 'Dolphin (ByteDance, document-oriented)',\n },\n} as const;\n\n/**\n * Default VLM model preset key\n */\nexport const DEFAULT_VLM_MODEL = 'granite-docling-258M-mlx';\n\n/**\n * Resolve a VLM model from a preset key or custom VlmModelLocal object.\n *\n * When using a preset key, only required fields are populated.\n * Optional fields (prompt, scale, extra_generation_config) use Docling defaults.\n */\nexport function resolveVlmModel(model: string | VlmModelLocal): VlmModelLocal {\n if (typeof model === 'string') {\n const preset = VLM_MODELS[model];\n if (!preset) {\n throw new Error(\n `Unknown VLM model preset: \"${model}\". Available presets: ${Object.keys(VLM_MODELS).join(', ')}`,\n );\n }\n return {\n repo_id: preset.repo_id,\n inference_framework: preset.inference_framework,\n response_format: preset.response_format,\n transformers_model_type: preset.transformers_model_type,\n } as VlmModelLocal;\n }\n return 
model;\n}\n"],"mappings":";AAqBO,IAAM,aAA6C;AAAA;AAAA,EAGxD,4BAA4B;AAAA,IAC1B,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aACE;AAAA,EACJ;AAAA,EACA,wBAAwB;AAAA,IACtB,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,wBAAwB;AAAA,IACtB,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,oBAAoB;AAAA,IAClB,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA;AAAA,EAIA,qBAAqB;AAAA,IACnB,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,oBAAoB;AAAA,IAClB,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,MAAM;AAAA,IACJ,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,mBAAmB;AAAA,IACjB,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,eAAe;AAAA,IACb,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,MAAM;AAAA,IACJ,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,kBAAkB;AAAA,IAChB,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,kBAAkB;AAAA,IAChB,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,SAAS;AAAA,IACP,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AACF;AAKO,IAAM,oBAAoB;AAQ1B,SAAS,gBAAgB,OAA8C;AAC5E,MAAI,OAAO,UAAU,UAAU;AAC7B,UAAM,SAAS,WAAW,KAAK;AAC/B,QAAI,CAAC,QAAQ;AACX,YAAM,IAAI;AAAA,QACR,8BAA8B,KAAK,yBAAyB,OAAO,KAAK,UAAU,EAAE,KAAK,IAAI,CAAC;AAAA,MAChG;AAAA,IACF;AACA,WAAO;AAAA,MACL,SAAS,OAAO;AAAA,MAChB,qBAAqB,OAAO;AAAA,MAC5B,iBAAiB,OAAO;AAAA,MACxB,yBAAyB,OAAO;AAAA,IAClC;AAAA,EACF;AACA,SAAO;AACT;","names":[]}
package/dist/index.cjs CHANGED
@@ -30,15 +30,18 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
30
30
  // src/index.ts
31
31
  var src_exports = {};
32
32
  __export(src_exports, {
33
+ DEFAULT_VLM_MODEL: () => DEFAULT_VLM_MODEL,
33
34
  ImagePdfFallbackError: () => ImagePdfFallbackError,
34
- PDFParser: () => PDFParser
35
+ PDFParser: () => PDFParser,
36
+ VLM_MODELS: () => VLM_MODELS,
37
+ resolveVlmModel: () => resolveVlmModel
35
38
  });
36
39
  module.exports = __toCommonJS(src_exports);
37
40
 
38
41
  // src/core/pdf-parser.ts
39
- var import_docling_sdk = require("docling-sdk");
42
+ var import_docling_sdk2 = require("docling-sdk");
40
43
  var import_node_child_process3 = require("child_process");
41
- var import_node_os2 = require("os");
44
+ var import_node_os3 = require("os");
42
45
  var import_node_path6 = require("path");
43
46
 
44
47
  // src/config/constants.ts
@@ -68,7 +71,11 @@ var PDF_CONVERTER = {
68
71
  /**
69
72
  * Interval for progress polling in milliseconds
70
73
  */
71
- POLL_INTERVAL_MS: 1e3
74
+ POLL_INTERVAL_MS: 1e3,
75
+ /**
76
+ * Default timeout for task completion in milliseconds (30 minutes)
77
+ */
78
+ DEFAULT_TIMEOUT_MS: 18e5
72
79
  };
73
80
  var DOCLING_ENVIRONMENT = {
74
81
  /**
@@ -86,6 +93,19 @@ var IMAGE_PDF_CONVERTER = {
86
93
  */
87
94
  QUALITY: 100
88
95
  };
96
/**
 * Time budgets (in milliseconds) for preparing the VLM pipeline.
 */
var VLM_ENVIRONMENT = {
  /**
   * Upper bound for installing VLM dependencies via pip: 3 hours.
   * The packages can be very large, so downloads may take a long time
   * depending on network conditions.
   */
  SETUP_TIMEOUT_MS: 3 * 60 * 60 * 1e3,
  /**
   * Upper bound for downloading a VLM model: 3 hours.
   * Large models (e.g., multi-GB weights) need sufficient time to download.
   */
  MODEL_DOWNLOAD_TIMEOUT_MS: 3 * 60 * 60 * 1e3
};
89
109
 
90
110
  // ../shared/dist/index.mjs
91
111
  var import_child_process = require("child_process");
@@ -118,6 +138,7 @@ function spawnAsync(command, args, options = {}) {
118
138
 
119
139
  // src/environment/docling-environment.ts
120
140
  var import_node_child_process = require("child_process");
141
+ var import_node_os = require("os");
121
142
  var import_node_path = require("path");
122
143
 
123
144
  // src/utils/python-version.ts
@@ -159,6 +180,7 @@ var DoclingEnvironment = class _DoclingEnvironment {
159
180
  venvPath;
160
181
  port;
161
182
  killExistingProcess;
183
+ vlmDependenciesInstalled = false;
162
184
  constructor(options) {
163
185
  this.logger = options.logger;
164
186
  this.venvPath = options.venvPath;
@@ -288,6 +310,81 @@ var DoclingEnvironment = class _DoclingEnvironment {
288
310
  );
289
311
  }
290
312
  }
313
+ /**
314
+ * Install VLM-specific dependencies for the Docling VLM pipeline.
315
+ *
316
+ * Installs:
317
+ * 1. docling-serve[vlm] - VLM model support for docling-serve
318
+ * 2. mlx + mlx-lm (macOS ARM64 only) - Apple Silicon optimized inference
319
+ *
320
+ * This is idempotent - subsequent calls skip if already installed.
321
+ */
322
+ async setupVlmDependencies() {
323
+ if (this.vlmDependenciesInstalled) {
324
+ this.logger.info(
325
+ "[DoclingEnvironment] VLM dependencies already installed, skipping"
326
+ );
327
+ return;
328
+ }
329
+ if (await this.isVlmReady()) {
330
+ this.vlmDependenciesInstalled = true;
331
+ this.logger.info(
332
+ "[DoclingEnvironment] VLM dependencies already installed, skipping"
333
+ );
334
+ return;
335
+ }
336
+ this.logger.info("[DoclingEnvironment] Installing VLM dependencies...");
337
+ const pipPath = (0, import_node_path.join)(this.venvPath, "bin", "pip");
338
+ this.logger.info("[DoclingEnvironment] Installing docling[vlm]...");
339
+ const vlmResult = await spawnAsync(
340
+ pipPath,
341
+ ["install", "docling-serve[vlm]"],
342
+ { timeout: VLM_ENVIRONMENT.SETUP_TIMEOUT_MS }
343
+ );
344
+ if (vlmResult.code !== 0) {
345
+ this.logger.error(
346
+ "[DoclingEnvironment] Failed to install docling-serve[vlm]:",
347
+ vlmResult.stderr
348
+ );
349
+ throw new Error(
350
+ `Failed to install docling-serve[vlm]. Exit code: ${vlmResult.code}`
351
+ );
352
+ }
353
+ if ((0, import_node_os.platform)() === "darwin" && (0, import_node_os.arch)() === "arm64") {
354
+ this.logger.info(
355
+ "[DoclingEnvironment] Installing mlx + mlx-lm for Apple Silicon..."
356
+ );
357
+ const mlxResult = await spawnAsync(
358
+ pipPath,
359
+ ["install", "mlx", "mlx-lm"],
360
+ { timeout: VLM_ENVIRONMENT.SETUP_TIMEOUT_MS }
361
+ );
362
+ if (mlxResult.code !== 0) {
363
+ this.logger.error(
364
+ "[DoclingEnvironment] Failed to install mlx/mlx-lm:",
365
+ mlxResult.stderr
366
+ );
367
+ throw new Error(
368
+ `Failed to install mlx/mlx-lm. Exit code: ${mlxResult.code}`
369
+ );
370
+ }
371
+ }
372
+ this.vlmDependenciesInstalled = true;
373
+ this.logger.info(
374
+ "[DoclingEnvironment] VLM dependencies installed successfully"
375
+ );
376
+ }
377
+ /**
378
+ * Check if VLM dependencies are ready by verifying Python module imports
379
+ */
380
+ async isVlmReady() {
381
+ const pythonPath = (0, import_node_path.join)(this.venvPath, "bin", "python");
382
+ const result = await spawnAsync(pythonPath, [
383
+ "-c",
384
+ "import docling_core; import docling"
385
+ ]);
386
+ return result.code === 0;
387
+ }
291
388
  async isPortInUse(port) {
292
389
  try {
293
390
  const result = await spawnAsync("lsof", ["-ti", `:${port}`]);
@@ -374,11 +471,127 @@ var DoclingEnvironment = class _DoclingEnvironment {
374
471
  };
375
472
 
376
473
  // src/core/pdf-converter.ts
474
+ var import_docling_sdk = require("docling-sdk");
377
475
  var import_es_toolkit = require("es-toolkit");
378
476
  var import_node_fs4 = require("fs");
379
477
  var import_node_path5 = require("path");
380
478
  var import_promises = require("stream/promises");
381
479
 
480
+ // src/config/vlm-models.ts
481
/**
 * Built-in VLM model presets, keyed by preset name.
 *
 * Every entry provides the four fields that resolveVlmModel() copies into its
 * result (repo_id, inference_framework, response_format,
 * transformers_model_type) plus a human-readable description.
 */
var VLM_MODELS = {
  // ── DocTags models (specialized document structure output) ──────────
  "granite-docling-258M-mlx": {
    repo_id: "ibm-granite/granite-docling-258M-mlx",
    inference_framework: "mlx",
    response_format: "doctags",
    transformers_model_type: "automodel-vision2seq",
    description: "Granite Docling 258M (MLX, Apple Silicon optimized, ~6s/page)"
  },
  "granite-docling-258M": {
    repo_id: "ibm-granite/granite-docling-258M",
    inference_framework: "transformers",
    response_format: "doctags",
    transformers_model_type: "automodel-vision2seq",
    description: "Granite Docling 258M (Transformers, cross-platform)"
  },
  "smoldocling-256M-mlx": {
    repo_id: "docling-project/SmolDocling-256M-preview-mlx-bf16",
    inference_framework: "mlx",
    response_format: "doctags",
    transformers_model_type: "automodel-vision2seq",
    description: "SmolDocling 256M (MLX, fastest option)"
  },
  "smoldocling-256M": {
    repo_id: "docling-project/SmolDocling-256M-preview",
    inference_framework: "transformers",
    response_format: "doctags",
    transformers_model_type: "automodel-vision2seq",
    description: "SmolDocling 256M (Transformers)"
  },
  // ── Markdown models (general-purpose vision LLMs) ──────────────────
  "granite-vision-2B": {
    repo_id: "ibm-granite/granite-vision-3.2-2b",
    inference_framework: "transformers",
    response_format: "markdown",
    transformers_model_type: "automodel-vision2seq",
    description: "Granite Vision 3.2 2B (IBM, higher accuracy)"
  },
  "qwen25-vl-3B-mlx": {
    repo_id: "mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
    inference_framework: "mlx",
    response_format: "markdown",
    transformers_model_type: "automodel-vision2seq",
    description: "Qwen 2.5 VL 3B (MLX, multilingual, good KCJ support)"
  },
  phi4: {
    repo_id: "microsoft/Phi-4-multimodal-instruct",
    inference_framework: "transformers",
    response_format: "markdown",
    transformers_model_type: "automodel",
    description: "Phi-4 Multimodal (Microsoft, CausalLM)"
  },
  "pixtral-12B-mlx": {
    repo_id: "mlx-community/pixtral-12b-bf16",
    inference_framework: "mlx",
    response_format: "markdown",
    transformers_model_type: "automodel-vision2seq",
    description: "Pixtral 12B (MLX, Mistral, high accuracy)"
  },
  "pixtral-12B": {
    repo_id: "mistral-community/pixtral-12b",
    inference_framework: "transformers",
    response_format: "markdown",
    transformers_model_type: "automodel-vision2seq",
    description: "Pixtral 12B (Transformers, Mistral)"
  },
  got2: {
    repo_id: "stepfun-ai/GOT-OCR-2.0-hf",
    inference_framework: "transformers",
    response_format: "markdown",
    transformers_model_type: "automodel-vision2seq",
    description: "GOT-OCR 2.0 (StepFun, OCR-specialized)"
  },
  "gemma3-12B-mlx": {
    repo_id: "mlx-community/gemma-3-12b-it-bf16",
    inference_framework: "mlx",
    response_format: "markdown",
    transformers_model_type: "automodel-vision2seq",
    description: "Gemma 3 12B (MLX, Google)"
  },
  "gemma3-27B-mlx": {
    repo_id: "mlx-community/gemma-3-27b-it-bf16",
    inference_framework: "mlx",
    response_format: "markdown",
    transformers_model_type: "automodel-vision2seq",
    description: "Gemma 3 27B (MLX, Google, highest accuracy)"
  },
  dolphin: {
    repo_id: "ByteDance/Dolphin",
    inference_framework: "transformers",
    response_format: "markdown",
    transformers_model_type: "automodel-vision2seq",
    description: "Dolphin (ByteDance, document-oriented)"
  }
};
/**
 * Preset key used when the caller does not pick a VLM model.
 */
var DEFAULT_VLM_MODEL = "granite-docling-258M-mlx";
/**
 * Resolve a VLM model from a preset key or a caller-supplied model object.
 *
 * @param model - Either a key of VLM_MODELS or a custom model object.
 * @returns For a preset key, a new object holding only repo_id,
 *   inference_framework, response_format and transformers_model_type
 *   (the description is not copied). A non-string argument is returned as-is.
 * @throws {Error} When a string is given that is not a key of VLM_MODELS.
 */
function resolveVlmModel(model) {
  if (typeof model !== "string") {
    return model;
  }
  // Own-property check: a plain `VLM_MODELS[model]` lookup also finds names
  // inherited from Object.prototype ("constructor", "toString", "__proto__", ...),
  // which would slip past the unknown-preset error and yield undefined fields.
  if (!Object.prototype.hasOwnProperty.call(VLM_MODELS, model)) {
    throw new Error(
      `Unknown VLM model preset: "${model}". Available presets: ${Object.keys(VLM_MODELS).join(", ")}`
    );
  }
  const {
    repo_id,
    inference_framework,
    response_format,
    transformers_model_type
  } = VLM_MODELS[model];
  return {
    repo_id,
    inference_framework,
    response_format,
    transformers_model_type
  };
}
594
+
382
595
  // src/errors/image-pdf-fallback-error.ts
383
596
  var ImagePdfFallbackError = class extends Error {
384
597
  constructor(originalError, fallbackError) {
@@ -730,7 +943,7 @@ var LocalFileServer = class {
730
943
 
731
944
  // src/core/image-pdf-converter.ts
732
945
  var import_node_fs3 = require("fs");
733
- var import_node_os = require("os");
946
+ var import_node_os2 = require("os");
734
947
  var import_node_path4 = require("path");
735
948
  var ImagePdfConverter = class {
736
949
  constructor(logger) {
@@ -746,7 +959,7 @@ var ImagePdfConverter = class {
746
959
  */
747
960
  async convert(pdfUrl, reportId) {
748
961
  const timestamp = Date.now();
749
- const tempDir = (0, import_node_os.tmpdir)();
962
+ const tempDir = (0, import_node_os2.tmpdir)();
750
963
  const inputPath = (0, import_node_path4.join)(tempDir, `${reportId}-${timestamp}-input.pdf`);
751
964
  const outputPath = (0, import_node_path4.join)(tempDir, `${reportId}-${timestamp}-image.pdf`);
752
965
  try {
@@ -816,11 +1029,17 @@ var ImagePdfConverter = class {
816
1029
  };
817
1030
 
818
1031
  // src/core/pdf-converter.ts
1032
// Bound reference to the SDK's own validator, captured before it is replaced below.
var _origAssertValidConversionOptions = import_docling_sdk.ValidationUtils.assertValidConversionOptions.bind(import_docling_sdk.ValidationUtils);
// Replace the validator on the ValidationUtils object exported by docling-sdk so
// that the `pipeline` key is removed from the options before they are validated.
// NOTE(review): presumably the SDK validator rejects `pipeline` — confirm against
// docling-sdk; the replacement is installed on the SDK's exported object, not a copy.
import_docling_sdk.ValidationUtils.assertValidConversionOptions = (options) => {
  // Destructuring null/undefined would throw a TypeError before the SDK validator
  // runs, so hand such values to the original validator untouched.
  if (options == null) {
    return _origAssertValidConversionOptions(options);
  }
  const { pipeline: _pipeline, ...rest } = options;
  // Forward whatever the original validator returns.
  return _origAssertValidConversionOptions(rest);
};
819
1037
  var PDFConverter = class {
820
- constructor(logger, client, enableImagePdfFallback = false) {
1038
+ constructor(logger, client, enableImagePdfFallback = false, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
821
1039
  this.logger = logger;
822
1040
  this.client = client;
823
1041
  this.enableImagePdfFallback = enableImagePdfFallback;
1042
+ this.timeout = timeout;
824
1043
  }
825
1044
  async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
826
1045
  this.logger.info("[PDFConverter] Converting:", url);
@@ -875,10 +1094,15 @@ var PDFConverter = class {
875
1094
  }
876
1095
  async performConversion(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
877
1096
  const startTime = Date.now();
878
- const conversionOptions = this.buildConversionOptions(options);
879
- this.logger.info(
880
- `[PDFConverter] OCR languages: ${JSON.stringify(conversionOptions.ocr_options?.lang)}`
881
- );
1097
+ const pipelineType = options.pipeline ?? "standard";
1098
+ const conversionOptions = pipelineType === "vlm" ? this.buildVlmConversionOptions(options) : this.buildConversionOptions(options);
1099
+ if (pipelineType === "vlm") {
1100
+ this.logger.info("[PDFConverter] Using VLM pipeline");
1101
+ } else {
1102
+ this.logger.info(
1103
+ `[PDFConverter] OCR languages: ${JSON.stringify(conversionOptions.ocr_options?.lang)}`
1104
+ );
1105
+ }
882
1106
  this.logger.info(
883
1107
  "[PDFConverter] Converting document with Async Source API..."
884
1108
  );
@@ -945,7 +1169,7 @@ var PDFConverter = class {
945
1169
  }
946
1170
  buildConversionOptions(options) {
947
1171
  return {
948
- ...(0, import_es_toolkit.omit)(options, ["num_threads"]),
1172
+ ...(0, import_es_toolkit.omit)(options, ["num_threads", "pipeline", "vlm_model"]),
949
1173
  to_formats: ["json", "html"],
950
1174
  image_export_mode: "embedded",
951
1175
  ocr_engine: "ocrmac",
@@ -971,6 +1195,31 @@ var PDFConverter = class {
971
1195
  }
972
1196
  };
973
1197
  }
1198
+ /**
1199
+ * Build conversion options for VLM pipeline.
1200
+ *
1201
+ * VLM pipeline uses a Vision Language Model instead of traditional OCR,
1202
+ * providing better accuracy for KCJ characters and complex layouts.
1203
+ */
1204
+ buildVlmConversionOptions(options) {
1205
+ const vlmModel = resolveVlmModel(options.vlm_model ?? DEFAULT_VLM_MODEL);
1206
+ this.logger.info(
1207
+ `[PDFConverter] VLM model: ${vlmModel.repo_id} (framework: ${vlmModel.inference_framework}, format: ${vlmModel.response_format})`
1208
+ );
1209
+ return {
1210
+ ...(0, import_es_toolkit.omit)(options, ["num_threads", "pipeline", "vlm_model", "ocr_lang"]),
1211
+ to_formats: ["json", "html"],
1212
+ image_export_mode: "embedded",
1213
+ pipeline: "vlm",
1214
+ vlm_pipeline_model_local: vlmModel,
1215
+ generate_picture_images: true,
1216
+ images_scale: 2,
1217
+ accelerator_options: {
1218
+ device: "mps",
1219
+ num_threads: options.num_threads
1220
+ }
1221
+ };
1222
+ }
974
1223
  async startConversionTask(url, conversionOptions) {
975
1224
  const task = await this.client.convertSourceAsync({
976
1225
  sources: [
@@ -1006,38 +1255,42 @@ var PDFConverter = class {
1006
1255
  }
1007
1256
  async trackTaskProgress(task) {
1008
1257
  const conversionStartTime = Date.now();
1009
- let lastStatus = "";
1010
- let isCompleted = false;
1011
- const pollInterval = setInterval(() => {
1012
- if (isCompleted) return;
1013
- const elapsed = Math.floor((Date.now() - conversionStartTime) / 1e3);
1014
- process.stdout.write(
1015
- `\r[PDFConverter] Status: ${lastStatus || "processing"} (${elapsed}s elapsed)`
1016
- );
1017
- }, PDF_CONVERTER.POLL_INTERVAL_MS);
1018
- task.on("progress", (status) => {
1019
- lastStatus = status.task_status;
1258
+ let lastProgressLine = "";
1259
+ const logProgress = (status) => {
1260
+ const parts = [`Status: ${status.task_status}`];
1020
1261
  if (status.task_position !== void 0) {
1021
- process.stdout.write(
1022
- `\r[PDFConverter] Status: ${status.task_status} (position: ${status.task_position})`
1023
- );
1262
+ parts.push(`position: ${status.task_position}`);
1024
1263
  }
1025
- });
1026
- task.on("complete", () => {
1027
- isCompleted = true;
1028
- clearInterval(pollInterval);
1029
- this.logger.info("\n[PDFConverter] Conversion completed!");
1030
- });
1031
- task.on("error", (error) => {
1032
- isCompleted = true;
1033
- clearInterval(pollInterval);
1034
- this.logger.error("\n[PDFConverter] Conversion error:", error.message);
1035
- });
1036
- try {
1037
- await task.waitForCompletion();
1038
- } finally {
1039
- isCompleted = true;
1040
- clearInterval(pollInterval);
1264
+ const meta = status.task_meta;
1265
+ if (meta) {
1266
+ if (meta.processed_documents !== void 0 && meta.total_documents !== void 0) {
1267
+ parts.push(
1268
+ `progress: ${meta.processed_documents}/${meta.total_documents}`
1269
+ );
1270
+ }
1271
+ }
1272
+ const progressLine = `\r[PDFConverter] ${parts.join(" | ")}`;
1273
+ if (progressLine !== lastProgressLine) {
1274
+ lastProgressLine = progressLine;
1275
+ process.stdout.write(progressLine);
1276
+ }
1277
+ };
1278
+ while (true) {
1279
+ if (Date.now() - conversionStartTime > this.timeout) {
1280
+ throw new Error("Task timeout");
1281
+ }
1282
+ const status = await task.poll();
1283
+ logProgress(status);
1284
+ if (status.task_status === "success") {
1285
+ this.logger.info("\n[PDFConverter] Conversion completed!");
1286
+ return;
1287
+ }
1288
+ if (status.task_status === "failure") {
1289
+ throw new Error("Task failed with status: failure");
1290
+ }
1291
+ await new Promise(
1292
+ (resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
1293
+ );
1041
1294
  }
1042
1295
  }
1043
1296
  async downloadResult(taskId) {
@@ -1073,6 +1326,7 @@ var PDFParser = class {
1073
1326
  killExistingProcess;
1074
1327
  enableImagePdfFallback;
1075
1328
  client = null;
1329
+ environment;
1076
1330
  constructor(options) {
1077
1331
  const {
1078
1332
  logger,
@@ -1109,7 +1363,7 @@ var PDFParser = class {
1109
1363
  }
1110
1364
  if (this.baseUrl) {
1111
1365
  this.logger.info("[PDFParser] Using external server:", this.baseUrl);
1112
- this.client = new import_docling_sdk.Docling({
1366
+ this.client = new import_docling_sdk2.Docling({
1113
1367
  api: { baseUrl: this.baseUrl, timeout: this.timeout }
1114
1368
  });
1115
1369
  await this.waitForServerReady();
@@ -1117,15 +1371,15 @@ var PDFParser = class {
1117
1371
  }
1118
1372
  this.logger.info("[PDFParser] Setting up local server...");
1119
1373
  try {
1120
- const environment = new DoclingEnvironment({
1374
+ this.environment = new DoclingEnvironment({
1121
1375
  logger: this.logger,
1122
1376
  venvPath: this.venvPath,
1123
1377
  port: this.port,
1124
1378
  killExistingProcess: this.killExistingProcess
1125
1379
  });
1126
- await environment.setup();
1380
+ await this.environment.setup();
1127
1381
  const clientUrl = `http://localhost:${this.port}`;
1128
- this.client = new import_docling_sdk.Docling({
1382
+ this.client = new import_docling_sdk2.Docling({
1129
1383
  api: {
1130
1384
  baseUrl: clientUrl,
1131
1385
  timeout: this.timeout
@@ -1139,9 +1393,9 @@ var PDFParser = class {
1139
1393
  }
1140
1394
  }
1141
1395
  checkOperatingSystem() {
1142
- if ((0, import_node_os2.platform)() !== "darwin") {
1396
+ if ((0, import_node_os3.platform)() !== "darwin") {
1143
1397
  throw new Error(
1144
- "PDFParser is only supported on macOS. Current platform: " + (0, import_node_os2.platform)()
1398
+ "PDFParser is only supported on macOS. Current platform: " + (0, import_node_os3.platform)()
1145
1399
  );
1146
1400
  }
1147
1401
  }
@@ -1225,7 +1479,7 @@ var PDFParser = class {
1225
1479
  });
1226
1480
  await environment.startServer();
1227
1481
  this.client?.destroy();
1228
- this.client = new import_docling_sdk.Docling({
1482
+ this.client = new import_docling_sdk2.Docling({
1229
1483
  api: {
1230
1484
  baseUrl: `http://localhost:${this.port}`,
1231
1485
  timeout: this.timeout
@@ -1269,6 +1523,12 @@ var PDFParser = class {
1269
1523
  "PDFParser is not initialized. Call init() before using parse()"
1270
1524
  );
1271
1525
  }
1526
+ if (options.pipeline === "vlm" && this.environment && !this.baseUrl) {
1527
+ this.logger.info(
1528
+ "[PDFParser] VLM pipeline requested, ensuring VLM dependencies..."
1529
+ );
1530
+ await this.environment.setupVlmDependencies();
1531
+ }
1272
1532
  const canRecover = !this.baseUrl && this.port !== void 0;
1273
1533
  const maxAttempts = PDF_PARSER.MAX_SERVER_RECOVERY_ATTEMPTS;
1274
1534
  let attempt = 0;
@@ -1278,7 +1538,8 @@ var PDFParser = class {
1278
1538
  const converter = new PDFConverter(
1279
1539
  this.logger,
1280
1540
  this.client,
1281
- effectiveFallbackEnabled
1541
+ effectiveFallbackEnabled,
1542
+ this.timeout
1282
1543
  );
1283
1544
  return await converter.convert(
1284
1545
  url,
@@ -1326,7 +1587,10 @@ var PDFParser = class {
1326
1587
  };
1327
1588
  // Annotate the CommonJS export names for ESM import in node:
1328
1589
  0 && (module.exports = {
1590
+ DEFAULT_VLM_MODEL,
1329
1591
  ImagePdfFallbackError,
1330
- PDFParser
1592
+ PDFParser,
1593
+ VLM_MODELS,
1594
+ resolveVlmModel
1331
1595
  });
1332
1596
  //# sourceMappingURL=index.cjs.map