@heripo/pdf-parser 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,121 @@
1
+ // src/config/vlm-models.ts
2
+ var VLM_MODELS = {
3
+ // ── DocTags models (specialized document structure output) ──────────
4
+ "granite-docling-258M-mlx": {
5
+ repo_id: "ibm-granite/granite-docling-258M-mlx",
6
+ inference_framework: "mlx",
7
+ response_format: "doctags",
8
+ transformers_model_type: "automodel-vision2seq",
9
+ description: "Granite Docling 258M (MLX, Apple Silicon optimized, ~6s/page)"
10
+ },
11
+ "granite-docling-258M": {
12
+ repo_id: "ibm-granite/granite-docling-258M",
13
+ inference_framework: "transformers",
14
+ response_format: "doctags",
15
+ transformers_model_type: "automodel-vision2seq",
16
+ description: "Granite Docling 258M (Transformers, cross-platform)"
17
+ },
18
+ "smoldocling-256M-mlx": {
19
+ repo_id: "docling-project/SmolDocling-256M-preview-mlx-bf16",
20
+ inference_framework: "mlx",
21
+ response_format: "doctags",
22
+ transformers_model_type: "automodel-vision2seq",
23
+ description: "SmolDocling 256M (MLX, fastest option)"
24
+ },
25
+ "smoldocling-256M": {
26
+ repo_id: "docling-project/SmolDocling-256M-preview",
27
+ inference_framework: "transformers",
28
+ response_format: "doctags",
29
+ transformers_model_type: "automodel-vision2seq",
30
+ description: "SmolDocling 256M (Transformers)"
31
+ },
32
+ // ── Markdown models (general-purpose vision LLMs) ──────────────────
33
+ "granite-vision-2B": {
34
+ repo_id: "ibm-granite/granite-vision-3.2-2b",
35
+ inference_framework: "transformers",
36
+ response_format: "markdown",
37
+ transformers_model_type: "automodel-vision2seq",
38
+ description: "Granite Vision 3.2 2B (IBM, higher accuracy)"
39
+ },
40
+ "qwen25-vl-3B-mlx": {
41
+ repo_id: "mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
42
+ inference_framework: "mlx",
43
+ response_format: "markdown",
44
+ transformers_model_type: "automodel-vision2seq",
45
+ description: "Qwen 2.5 VL 3B (MLX, multilingual, good KCJ support)"
46
+ },
47
+ phi4: {
48
+ repo_id: "microsoft/Phi-4-multimodal-instruct",
49
+ inference_framework: "transformers",
50
+ response_format: "markdown",
51
+ transformers_model_type: "automodel",
52
+ description: "Phi-4 Multimodal (Microsoft, CausalLM)"
53
+ },
54
+ "pixtral-12B-mlx": {
55
+ repo_id: "mlx-community/pixtral-12b-bf16",
56
+ inference_framework: "mlx",
57
+ response_format: "markdown",
58
+ transformers_model_type: "automodel-vision2seq",
59
+ description: "Pixtral 12B (MLX, Mistral, high accuracy)"
60
+ },
61
+ "pixtral-12B": {
62
+ repo_id: "mistral-community/pixtral-12b",
63
+ inference_framework: "transformers",
64
+ response_format: "markdown",
65
+ transformers_model_type: "automodel-vision2seq",
66
+ description: "Pixtral 12B (Transformers, Mistral)"
67
+ },
68
+ got2: {
69
+ repo_id: "stepfun-ai/GOT-OCR-2.0-hf",
70
+ inference_framework: "transformers",
71
+ response_format: "markdown",
72
+ transformers_model_type: "automodel-vision2seq",
73
+ description: "GOT-OCR 2.0 (StepFun, OCR-specialized)"
74
+ },
75
+ "gemma3-12B-mlx": {
76
+ repo_id: "mlx-community/gemma-3-12b-it-bf16",
77
+ inference_framework: "mlx",
78
+ response_format: "markdown",
79
+ transformers_model_type: "automodel-vision2seq",
80
+ description: "Gemma 3 12B (MLX, Google)"
81
+ },
82
+ "gemma3-27B-mlx": {
83
+ repo_id: "mlx-community/gemma-3-27b-it-bf16",
84
+ inference_framework: "mlx",
85
+ response_format: "markdown",
86
+ transformers_model_type: "automodel-vision2seq",
87
+ description: "Gemma 3 27B (MLX, Google, highest accuracy)"
88
+ },
89
+ dolphin: {
90
+ repo_id: "ByteDance/Dolphin",
91
+ inference_framework: "transformers",
92
+ response_format: "markdown",
93
+ transformers_model_type: "automodel-vision2seq",
94
+ description: "Dolphin (ByteDance, document-oriented)"
95
+ }
96
+ };
97
+ var DEFAULT_VLM_MODEL = "granite-docling-258M-mlx";
98
+ function resolveVlmModel(model) {
99
+ if (typeof model === "string") {
100
+ const preset = VLM_MODELS[model];
101
+ if (!preset) {
102
+ throw new Error(
103
+ `Unknown VLM model preset: "${model}". Available presets: ${Object.keys(VLM_MODELS).join(", ")}`
104
+ );
105
+ }
106
+ return {
107
+ repo_id: preset.repo_id,
108
+ inference_framework: preset.inference_framework,
109
+ response_format: preset.response_format,
110
+ transformers_model_type: preset.transformers_model_type
111
+ };
112
+ }
113
+ return model;
114
+ }
115
+
116
+ export {
117
+ VLM_MODELS,
118
+ DEFAULT_VLM_MODEL,
119
+ resolveVlmModel
120
+ };
121
+ //# sourceMappingURL=chunk-WWNI354M.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/config/vlm-models.ts"],"sourcesContent":["import type { VlmModelLocal } from 'docling-sdk';\n\n/**\n * VLM model preset with description\n */\nexport interface VlmModelPreset {\n repo_id: string;\n inference_framework: 'mlx' | 'transformers';\n response_format: 'doctags' | 'markdown';\n transformers_model_type: 'automodel-vision2seq' | 'automodel';\n description: string;\n}\n\n/**\n * Available VLM model presets\n *\n * Based on Docling's official VLM model specs:\n * https://docling-project.github.io/docling/usage/vision_models/#available-local-models\n *\n * Users can select a preset key or provide a custom VlmModelLocal object.\n */\nexport const VLM_MODELS: Record<string, VlmModelPreset> = {\n // ── DocTags models (specialized document structure output) ──────────\n\n 'granite-docling-258M-mlx': {\n repo_id: 'ibm-granite/granite-docling-258M-mlx',\n inference_framework: 'mlx',\n response_format: 'doctags',\n transformers_model_type: 'automodel-vision2seq',\n description:\n 'Granite Docling 258M (MLX, Apple Silicon optimized, ~6s/page)',\n },\n 'granite-docling-258M': {\n repo_id: 'ibm-granite/granite-docling-258M',\n inference_framework: 'transformers',\n response_format: 'doctags',\n transformers_model_type: 'automodel-vision2seq',\n description: 'Granite Docling 258M (Transformers, cross-platform)',\n },\n 'smoldocling-256M-mlx': {\n repo_id: 'docling-project/SmolDocling-256M-preview-mlx-bf16',\n inference_framework: 'mlx',\n response_format: 'doctags',\n transformers_model_type: 'automodel-vision2seq',\n description: 'SmolDocling 256M (MLX, fastest option)',\n },\n 'smoldocling-256M': {\n repo_id: 'docling-project/SmolDocling-256M-preview',\n inference_framework: 'transformers',\n response_format: 'doctags',\n transformers_model_type: 'automodel-vision2seq',\n description: 'SmolDocling 256M (Transformers)',\n },\n\n // ── Markdown models (general-purpose vision LLMs) ──────────────────\n\n 'granite-vision-2B': {\n repo_id: 
'ibm-granite/granite-vision-3.2-2b',\n inference_framework: 'transformers',\n response_format: 'markdown',\n transformers_model_type: 'automodel-vision2seq',\n description: 'Granite Vision 3.2 2B (IBM, higher accuracy)',\n },\n 'qwen25-vl-3B-mlx': {\n repo_id: 'mlx-community/Qwen2.5-VL-3B-Instruct-bf16',\n inference_framework: 'mlx',\n response_format: 'markdown',\n transformers_model_type: 'automodel-vision2seq',\n description: 'Qwen 2.5 VL 3B (MLX, multilingual, good KCJ support)',\n },\n phi4: {\n repo_id: 'microsoft/Phi-4-multimodal-instruct',\n inference_framework: 'transformers',\n response_format: 'markdown',\n transformers_model_type: 'automodel',\n description: 'Phi-4 Multimodal (Microsoft, CausalLM)',\n },\n 'pixtral-12B-mlx': {\n repo_id: 'mlx-community/pixtral-12b-bf16',\n inference_framework: 'mlx',\n response_format: 'markdown',\n transformers_model_type: 'automodel-vision2seq',\n description: 'Pixtral 12B (MLX, Mistral, high accuracy)',\n },\n 'pixtral-12B': {\n repo_id: 'mistral-community/pixtral-12b',\n inference_framework: 'transformers',\n response_format: 'markdown',\n transformers_model_type: 'automodel-vision2seq',\n description: 'Pixtral 12B (Transformers, Mistral)',\n },\n got2: {\n repo_id: 'stepfun-ai/GOT-OCR-2.0-hf',\n inference_framework: 'transformers',\n response_format: 'markdown',\n transformers_model_type: 'automodel-vision2seq',\n description: 'GOT-OCR 2.0 (StepFun, OCR-specialized)',\n },\n 'gemma3-12B-mlx': {\n repo_id: 'mlx-community/gemma-3-12b-it-bf16',\n inference_framework: 'mlx',\n response_format: 'markdown',\n transformers_model_type: 'automodel-vision2seq',\n description: 'Gemma 3 12B (MLX, Google)',\n },\n 'gemma3-27B-mlx': {\n repo_id: 'mlx-community/gemma-3-27b-it-bf16',\n inference_framework: 'mlx',\n response_format: 'markdown',\n transformers_model_type: 'automodel-vision2seq',\n description: 'Gemma 3 27B (MLX, Google, highest accuracy)',\n },\n dolphin: {\n repo_id: 'ByteDance/Dolphin',\n inference_framework: 
'transformers',\n response_format: 'markdown',\n transformers_model_type: 'automodel-vision2seq',\n description: 'Dolphin (ByteDance, document-oriented)',\n },\n} as const;\n\n/**\n * Default VLM model preset key\n */\nexport const DEFAULT_VLM_MODEL = 'granite-docling-258M-mlx';\n\n/**\n * Resolve a VLM model from a preset key or custom VlmModelLocal object.\n *\n * When using a preset key, only required fields are populated.\n * Optional fields (prompt, scale, extra_generation_config) use Docling defaults.\n */\nexport function resolveVlmModel(model: string | VlmModelLocal): VlmModelLocal {\n if (typeof model === 'string') {\n const preset = VLM_MODELS[model];\n if (!preset) {\n throw new Error(\n `Unknown VLM model preset: \"${model}\". Available presets: ${Object.keys(VLM_MODELS).join(', ')}`,\n );\n }\n return {\n repo_id: preset.repo_id,\n inference_framework: preset.inference_framework,\n response_format: preset.response_format,\n transformers_model_type: preset.transformers_model_type,\n } as VlmModelLocal;\n }\n return 
model;\n}\n"],"mappings":";AAqBO,IAAM,aAA6C;AAAA;AAAA,EAGxD,4BAA4B;AAAA,IAC1B,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aACE;AAAA,EACJ;AAAA,EACA,wBAAwB;AAAA,IACtB,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,wBAAwB;AAAA,IACtB,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,oBAAoB;AAAA,IAClB,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA;AAAA,EAIA,qBAAqB;AAAA,IACnB,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,oBAAoB;AAAA,IAClB,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,MAAM;AAAA,IACJ,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,mBAAmB;AAAA,IACjB,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,eAAe;AAAA,IACb,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,MAAM;AAAA,IACJ,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,kBAAkB;AAAA,IAChB,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,kBAAkB;AAAA,IAChB,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AAAA,EACA,SAAS;AAAA,IACP,SAAS;AAAA,IACT,qBAAqB;AAAA,IACrB,iBAAiB;AAAA,IACjB,yBAAyB;AAAA,IACzB,aAAa;AAAA,EACf;AACF;AAKO,IAAM,oBAAoB;AAQ1B,SAAS,gBAAgB,OAA8C;AAC5E,MAAI,OAAO,UAAU,UAAU;AAC7B,UAAM,SAAS,WAAW,KAAK;AAC/B,QAAI,CAAC,QAAQ;AACX,YAAM,IAAI;AAAA,QACR,8BAA8B,KAAK,yBAAyB,OAAO,KAAK,UAAU,EAAE,KAAK,IAAI,CAAC;AAAA,MAChG;AAAA,IACF;AACA,WAAO;AAAA,MACL,SAAS,OAAO;AAAA,MAChB,qBAAqB,OAAO;AAAA,MAC5B,iBAAiB,OAAO;AAAA,MACxB,yBAAyB,OAAO;AAAA,IAClC;AAAA,EACF;AACA,SAAO;AACT;","names":[]}
package/dist/index.cjs CHANGED
@@ -30,15 +30,18 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
30
30
  // src/index.ts
31
31
  var src_exports = {};
32
32
  __export(src_exports, {
33
+ DEFAULT_VLM_MODEL: () => DEFAULT_VLM_MODEL,
33
34
  ImagePdfFallbackError: () => ImagePdfFallbackError,
34
- PDFParser: () => PDFParser
35
+ PDFParser: () => PDFParser,
36
+ VLM_MODELS: () => VLM_MODELS,
37
+ resolveVlmModel: () => resolveVlmModel
35
38
  });
36
39
  module.exports = __toCommonJS(src_exports);
37
40
 
38
41
  // src/core/pdf-parser.ts
39
- var import_docling_sdk = require("docling-sdk");
42
+ var import_docling_sdk2 = require("docling-sdk");
40
43
  var import_node_child_process3 = require("child_process");
41
- var import_node_os2 = require("os");
44
+ var import_node_os3 = require("os");
42
45
  var import_node_path6 = require("path");
43
46
 
44
47
  // src/config/constants.ts
@@ -68,7 +71,11 @@ var PDF_CONVERTER = {
68
71
  /**
69
72
  * Interval for progress polling in milliseconds
70
73
  */
71
- POLL_INTERVAL_MS: 1e3
74
+ POLL_INTERVAL_MS: 1e3,
75
+ /**
76
+ * Default timeout for task completion in milliseconds (30 minutes)
77
+ */
78
+ DEFAULT_TIMEOUT_MS: 18e5
72
79
  };
73
80
  var DOCLING_ENVIRONMENT = {
74
81
  /**
@@ -86,6 +93,19 @@ var IMAGE_PDF_CONVERTER = {
86
93
  */
87
94
  QUALITY: 100
88
95
  };
96
+ var VLM_ENVIRONMENT = {
97
+ /**
98
+ * Timeout for VLM dependency installation (pip install) in milliseconds (3 hours).
99
+ * VLM packages can be very large and may require extended download times
100
+ * depending on network conditions.
101
+ */
102
+ SETUP_TIMEOUT_MS: 108e5,
103
+ /**
104
+ * Timeout for VLM model download in milliseconds (3 hours).
105
+ * Large VLM models (e.g., multi-GB weights) need sufficient time to download.
106
+ */
107
+ MODEL_DOWNLOAD_TIMEOUT_MS: 108e5
108
+ };
89
109
 
90
110
  // ../shared/dist/index.mjs
91
111
  var import_child_process = require("child_process");
@@ -118,6 +138,7 @@ function spawnAsync(command, args, options = {}) {
118
138
 
119
139
  // src/environment/docling-environment.ts
120
140
  var import_node_child_process = require("child_process");
141
+ var import_node_os = require("os");
121
142
  var import_node_path = require("path");
122
143
 
123
144
  // src/utils/python-version.ts
@@ -159,6 +180,7 @@ var DoclingEnvironment = class _DoclingEnvironment {
159
180
  venvPath;
160
181
  port;
161
182
  killExistingProcess;
183
+ vlmDependenciesInstalled = false;
162
184
  constructor(options) {
163
185
  this.logger = options.logger;
164
186
  this.venvPath = options.venvPath;
@@ -288,6 +310,81 @@ var DoclingEnvironment = class _DoclingEnvironment {
288
310
  );
289
311
  }
290
312
  }
313
+ /**
314
+ * Install VLM-specific dependencies for the Docling VLM pipeline.
315
+ *
316
+ * Installs:
317
+ * 1. docling-serve[vlm] - VLM model support for docling-serve
318
+ * 2. mlx + mlx-lm (macOS ARM64 only) - Apple Silicon optimized inference
319
+ *
320
+ * This is idempotent - subsequent calls skip if already installed.
321
+ */
322
+ async setupVlmDependencies() {
323
+ if (this.vlmDependenciesInstalled) {
324
+ this.logger.info(
325
+ "[DoclingEnvironment] VLM dependencies already installed, skipping"
326
+ );
327
+ return;
328
+ }
329
+ if (await this.isVlmReady()) {
330
+ this.vlmDependenciesInstalled = true;
331
+ this.logger.info(
332
+ "[DoclingEnvironment] VLM dependencies already installed, skipping"
333
+ );
334
+ return;
335
+ }
336
+ this.logger.info("[DoclingEnvironment] Installing VLM dependencies...");
337
+ const pipPath = (0, import_node_path.join)(this.venvPath, "bin", "pip");
338
+ this.logger.info("[DoclingEnvironment] Installing docling[vlm]...");
339
+ const vlmResult = await spawnAsync(
340
+ pipPath,
341
+ ["install", "docling-serve[vlm]"],
342
+ { timeout: VLM_ENVIRONMENT.SETUP_TIMEOUT_MS }
343
+ );
344
+ if (vlmResult.code !== 0) {
345
+ this.logger.error(
346
+ "[DoclingEnvironment] Failed to install docling-serve[vlm]:",
347
+ vlmResult.stderr
348
+ );
349
+ throw new Error(
350
+ `Failed to install docling-serve[vlm]. Exit code: ${vlmResult.code}`
351
+ );
352
+ }
353
+ if ((0, import_node_os.platform)() === "darwin" && (0, import_node_os.arch)() === "arm64") {
354
+ this.logger.info(
355
+ "[DoclingEnvironment] Installing mlx + mlx-lm for Apple Silicon..."
356
+ );
357
+ const mlxResult = await spawnAsync(
358
+ pipPath,
359
+ ["install", "mlx", "mlx-lm"],
360
+ { timeout: VLM_ENVIRONMENT.SETUP_TIMEOUT_MS }
361
+ );
362
+ if (mlxResult.code !== 0) {
363
+ this.logger.error(
364
+ "[DoclingEnvironment] Failed to install mlx/mlx-lm:",
365
+ mlxResult.stderr
366
+ );
367
+ throw new Error(
368
+ `Failed to install mlx/mlx-lm. Exit code: ${mlxResult.code}`
369
+ );
370
+ }
371
+ }
372
+ this.vlmDependenciesInstalled = true;
373
+ this.logger.info(
374
+ "[DoclingEnvironment] VLM dependencies installed successfully"
375
+ );
376
+ }
377
+ /**
378
+ * Check if VLM dependencies are ready by verifying Python module imports
379
+ */
380
+ async isVlmReady() {
381
+ const pythonPath = (0, import_node_path.join)(this.venvPath, "bin", "python");
382
+ const result = await spawnAsync(pythonPath, [
383
+ "-c",
384
+ "import docling_core; import docling"
385
+ ]);
386
+ return result.code === 0;
387
+ }
291
388
  async isPortInUse(port) {
292
389
  try {
293
390
  const result = await spawnAsync("lsof", ["-ti", `:${port}`]);
@@ -374,11 +471,127 @@ var DoclingEnvironment = class _DoclingEnvironment {
374
471
  };
375
472
 
376
473
  // src/core/pdf-converter.ts
474
+ var import_docling_sdk = require("docling-sdk");
377
475
  var import_es_toolkit = require("es-toolkit");
378
476
  var import_node_fs4 = require("fs");
379
477
  var import_node_path5 = require("path");
380
478
  var import_promises = require("stream/promises");
381
479
 
480
+ // src/config/vlm-models.ts
481
+ var VLM_MODELS = {
482
+ // ── DocTags models (specialized document structure output) ──────────
483
+ "granite-docling-258M-mlx": {
484
+ repo_id: "ibm-granite/granite-docling-258M-mlx",
485
+ inference_framework: "mlx",
486
+ response_format: "doctags",
487
+ transformers_model_type: "automodel-vision2seq",
488
+ description: "Granite Docling 258M (MLX, Apple Silicon optimized, ~6s/page)"
489
+ },
490
+ "granite-docling-258M": {
491
+ repo_id: "ibm-granite/granite-docling-258M",
492
+ inference_framework: "transformers",
493
+ response_format: "doctags",
494
+ transformers_model_type: "automodel-vision2seq",
495
+ description: "Granite Docling 258M (Transformers, cross-platform)"
496
+ },
497
+ "smoldocling-256M-mlx": {
498
+ repo_id: "docling-project/SmolDocling-256M-preview-mlx-bf16",
499
+ inference_framework: "mlx",
500
+ response_format: "doctags",
501
+ transformers_model_type: "automodel-vision2seq",
502
+ description: "SmolDocling 256M (MLX, fastest option)"
503
+ },
504
+ "smoldocling-256M": {
505
+ repo_id: "docling-project/SmolDocling-256M-preview",
506
+ inference_framework: "transformers",
507
+ response_format: "doctags",
508
+ transformers_model_type: "automodel-vision2seq",
509
+ description: "SmolDocling 256M (Transformers)"
510
+ },
511
+ // ── Markdown models (general-purpose vision LLMs) ──────────────────
512
+ "granite-vision-2B": {
513
+ repo_id: "ibm-granite/granite-vision-3.2-2b",
514
+ inference_framework: "transformers",
515
+ response_format: "markdown",
516
+ transformers_model_type: "automodel-vision2seq",
517
+ description: "Granite Vision 3.2 2B (IBM, higher accuracy)"
518
+ },
519
+ "qwen25-vl-3B-mlx": {
520
+ repo_id: "mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
521
+ inference_framework: "mlx",
522
+ response_format: "markdown",
523
+ transformers_model_type: "automodel-vision2seq",
524
+ description: "Qwen 2.5 VL 3B (MLX, multilingual, good KCJ support)"
525
+ },
526
+ phi4: {
527
+ repo_id: "microsoft/Phi-4-multimodal-instruct",
528
+ inference_framework: "transformers",
529
+ response_format: "markdown",
530
+ transformers_model_type: "automodel",
531
+ description: "Phi-4 Multimodal (Microsoft, CausalLM)"
532
+ },
533
+ "pixtral-12B-mlx": {
534
+ repo_id: "mlx-community/pixtral-12b-bf16",
535
+ inference_framework: "mlx",
536
+ response_format: "markdown",
537
+ transformers_model_type: "automodel-vision2seq",
538
+ description: "Pixtral 12B (MLX, Mistral, high accuracy)"
539
+ },
540
+ "pixtral-12B": {
541
+ repo_id: "mistral-community/pixtral-12b",
542
+ inference_framework: "transformers",
543
+ response_format: "markdown",
544
+ transformers_model_type: "automodel-vision2seq",
545
+ description: "Pixtral 12B (Transformers, Mistral)"
546
+ },
547
+ got2: {
548
+ repo_id: "stepfun-ai/GOT-OCR-2.0-hf",
549
+ inference_framework: "transformers",
550
+ response_format: "markdown",
551
+ transformers_model_type: "automodel-vision2seq",
552
+ description: "GOT-OCR 2.0 (StepFun, OCR-specialized)"
553
+ },
554
+ "gemma3-12B-mlx": {
555
+ repo_id: "mlx-community/gemma-3-12b-it-bf16",
556
+ inference_framework: "mlx",
557
+ response_format: "markdown",
558
+ transformers_model_type: "automodel-vision2seq",
559
+ description: "Gemma 3 12B (MLX, Google)"
560
+ },
561
+ "gemma3-27B-mlx": {
562
+ repo_id: "mlx-community/gemma-3-27b-it-bf16",
563
+ inference_framework: "mlx",
564
+ response_format: "markdown",
565
+ transformers_model_type: "automodel-vision2seq",
566
+ description: "Gemma 3 27B (MLX, Google, highest accuracy)"
567
+ },
568
+ dolphin: {
569
+ repo_id: "ByteDance/Dolphin",
570
+ inference_framework: "transformers",
571
+ response_format: "markdown",
572
+ transformers_model_type: "automodel-vision2seq",
573
+ description: "Dolphin (ByteDance, document-oriented)"
574
+ }
575
+ };
576
+ var DEFAULT_VLM_MODEL = "granite-docling-258M-mlx";
577
+ function resolveVlmModel(model) {
578
+ if (typeof model === "string") {
579
+ const preset = VLM_MODELS[model];
580
+ if (!preset) {
581
+ throw new Error(
582
+ `Unknown VLM model preset: "${model}". Available presets: ${Object.keys(VLM_MODELS).join(", ")}`
583
+ );
584
+ }
585
+ return {
586
+ repo_id: preset.repo_id,
587
+ inference_framework: preset.inference_framework,
588
+ response_format: preset.response_format,
589
+ transformers_model_type: preset.transformers_model_type
590
+ };
591
+ }
592
+ return model;
593
+ }
594
+
382
595
  // src/errors/image-pdf-fallback-error.ts
383
596
  var ImagePdfFallbackError = class extends Error {
384
597
  constructor(originalError, fallbackError) {
@@ -730,7 +943,7 @@ var LocalFileServer = class {
730
943
 
731
944
  // src/core/image-pdf-converter.ts
732
945
  var import_node_fs3 = require("fs");
733
- var import_node_os = require("os");
946
+ var import_node_os2 = require("os");
734
947
  var import_node_path4 = require("path");
735
948
  var ImagePdfConverter = class {
736
949
  constructor(logger) {
@@ -746,7 +959,7 @@ var ImagePdfConverter = class {
746
959
  */
747
960
  async convert(pdfUrl, reportId) {
748
961
  const timestamp = Date.now();
749
- const tempDir = (0, import_node_os.tmpdir)();
962
+ const tempDir = (0, import_node_os2.tmpdir)();
750
963
  const inputPath = (0, import_node_path4.join)(tempDir, `${reportId}-${timestamp}-input.pdf`);
751
964
  const outputPath = (0, import_node_path4.join)(tempDir, `${reportId}-${timestamp}-image.pdf`);
752
965
  try {
@@ -816,11 +1029,17 @@ var ImagePdfConverter = class {
816
1029
  };
817
1030
 
818
1031
  // src/core/pdf-converter.ts
1032
// Keep a bound reference to the SDK's own validator, then replace it with a
// wrapper that removes the `pipeline` key from the options before delegating.
// NOTE(review): presumably the SDK validator does not accept `pipeline`;
// confirm against the docling-sdk version in use. The replacement is applied
// to the shared ValidationUtils object, so it affects every user of the SDK
// in this process.
var _origAssertValidConversionOptions = import_docling_sdk.ValidationUtils.assertValidConversionOptions.bind(
  import_docling_sdk.ValidationUtils
);
import_docling_sdk.ValidationUtils.assertValidConversionOptions = (options) => {
  const { pipeline: _ignoredPipeline, ...optionsWithoutPipeline } = options;
  _origAssertValidConversionOptions(optionsWithoutPipeline);
};
819
1037
  var PDFConverter = class {
820
- constructor(logger, client, enableImagePdfFallback = false) {
1038
+ constructor(logger, client, enableImagePdfFallback = false, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
821
1039
  this.logger = logger;
822
1040
  this.client = client;
823
1041
  this.enableImagePdfFallback = enableImagePdfFallback;
1042
+ this.timeout = timeout;
824
1043
  }
825
1044
  async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
826
1045
  this.logger.info("[PDFConverter] Converting:", url);
@@ -875,7 +1094,15 @@ var PDFConverter = class {
875
1094
  }
876
1095
  async performConversion(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
877
1096
  const startTime = Date.now();
878
- const conversionOptions = this.buildConversionOptions(options);
1097
+ const pipelineType = options.pipeline ?? "standard";
1098
+ const conversionOptions = pipelineType === "vlm" ? this.buildVlmConversionOptions(options) : this.buildConversionOptions(options);
1099
+ if (pipelineType === "vlm") {
1100
+ this.logger.info("[PDFConverter] Using VLM pipeline");
1101
+ } else {
1102
+ this.logger.info(
1103
+ `[PDFConverter] OCR languages: ${JSON.stringify(conversionOptions.ocr_options?.lang)}`
1104
+ );
1105
+ }
879
1106
  this.logger.info(
880
1107
  "[PDFConverter] Converting document with Async Source API..."
881
1108
  );
@@ -942,7 +1169,7 @@ var PDFConverter = class {
942
1169
  }
943
1170
  buildConversionOptions(options) {
944
1171
  return {
945
- ...(0, import_es_toolkit.omit)(options, ["num_threads"]),
1172
+ ...(0, import_es_toolkit.omit)(options, ["num_threads", "pipeline", "vlm_model"]),
946
1173
  to_formats: ["json", "html"],
947
1174
  image_export_mode: "embedded",
948
1175
  ocr_engine: "ocrmac",
@@ -968,6 +1195,31 @@ var PDFConverter = class {
968
1195
  }
969
1196
  };
970
1197
  }
1198
+ /**
1199
+ * Build conversion options for VLM pipeline.
1200
+ *
1201
+ * VLM pipeline uses a Vision Language Model instead of traditional OCR,
1202
+ * providing better accuracy for KCJ characters and complex layouts.
1203
+ */
1204
+ buildVlmConversionOptions(options) {
1205
+ const vlmModel = resolveVlmModel(options.vlm_model ?? DEFAULT_VLM_MODEL);
1206
+ this.logger.info(
1207
+ `[PDFConverter] VLM model: ${vlmModel.repo_id} (framework: ${vlmModel.inference_framework}, format: ${vlmModel.response_format})`
1208
+ );
1209
+ return {
1210
+ ...(0, import_es_toolkit.omit)(options, ["num_threads", "pipeline", "vlm_model", "ocr_lang"]),
1211
+ to_formats: ["json", "html"],
1212
+ image_export_mode: "embedded",
1213
+ pipeline: "vlm",
1214
+ vlm_pipeline_model_local: vlmModel,
1215
+ generate_picture_images: true,
1216
+ images_scale: 2,
1217
+ accelerator_options: {
1218
+ device: "mps",
1219
+ num_threads: options.num_threads
1220
+ }
1221
+ };
1222
+ }
971
1223
  async startConversionTask(url, conversionOptions) {
972
1224
  const task = await this.client.convertSourceAsync({
973
1225
  sources: [
@@ -1003,38 +1255,42 @@ var PDFConverter = class {
1003
1255
  }
1004
1256
  async trackTaskProgress(task) {
1005
1257
  const conversionStartTime = Date.now();
1006
- let lastStatus = "";
1007
- let isCompleted = false;
1008
- const pollInterval = setInterval(() => {
1009
- if (isCompleted) return;
1010
- const elapsed = Math.floor((Date.now() - conversionStartTime) / 1e3);
1011
- process.stdout.write(
1012
- `\r[PDFConverter] Status: ${lastStatus || "processing"} (${elapsed}s elapsed)`
1013
- );
1014
- }, PDF_CONVERTER.POLL_INTERVAL_MS);
1015
- task.on("progress", (status) => {
1016
- lastStatus = status.task_status;
1258
+ let lastProgressLine = "";
1259
+ const logProgress = (status) => {
1260
+ const parts = [`Status: ${status.task_status}`];
1017
1261
  if (status.task_position !== void 0) {
1018
- process.stdout.write(
1019
- `\r[PDFConverter] Status: ${status.task_status} (position: ${status.task_position})`
1020
- );
1262
+ parts.push(`position: ${status.task_position}`);
1021
1263
  }
1022
- });
1023
- task.on("complete", () => {
1024
- isCompleted = true;
1025
- clearInterval(pollInterval);
1026
- this.logger.info("\n[PDFConverter] Conversion completed!");
1027
- });
1028
- task.on("error", (error) => {
1029
- isCompleted = true;
1030
- clearInterval(pollInterval);
1031
- this.logger.error("\n[PDFConverter] Conversion error:", error.message);
1032
- });
1033
- try {
1034
- await task.waitForCompletion();
1035
- } finally {
1036
- isCompleted = true;
1037
- clearInterval(pollInterval);
1264
+ const meta = status.task_meta;
1265
+ if (meta) {
1266
+ if (meta.processed_documents !== void 0 && meta.total_documents !== void 0) {
1267
+ parts.push(
1268
+ `progress: ${meta.processed_documents}/${meta.total_documents}`
1269
+ );
1270
+ }
1271
+ }
1272
+ const progressLine = `\r[PDFConverter] ${parts.join(" | ")}`;
1273
+ if (progressLine !== lastProgressLine) {
1274
+ lastProgressLine = progressLine;
1275
+ process.stdout.write(progressLine);
1276
+ }
1277
+ };
1278
+ while (true) {
1279
+ if (Date.now() - conversionStartTime > this.timeout) {
1280
+ throw new Error("Task timeout");
1281
+ }
1282
+ const status = await task.poll();
1283
+ logProgress(status);
1284
+ if (status.task_status === "success") {
1285
+ this.logger.info("\n[PDFConverter] Conversion completed!");
1286
+ return;
1287
+ }
1288
+ if (status.task_status === "failure") {
1289
+ throw new Error("Task failed with status: failure");
1290
+ }
1291
+ await new Promise(
1292
+ (resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
1293
+ );
1038
1294
  }
1039
1295
  }
1040
1296
  async downloadResult(taskId) {
@@ -1070,6 +1326,7 @@ var PDFParser = class {
1070
1326
  killExistingProcess;
1071
1327
  enableImagePdfFallback;
1072
1328
  client = null;
1329
+ environment;
1073
1330
  constructor(options) {
1074
1331
  const {
1075
1332
  logger,
@@ -1106,7 +1363,7 @@ var PDFParser = class {
1106
1363
  }
1107
1364
  if (this.baseUrl) {
1108
1365
  this.logger.info("[PDFParser] Using external server:", this.baseUrl);
1109
- this.client = new import_docling_sdk.Docling({
1366
+ this.client = new import_docling_sdk2.Docling({
1110
1367
  api: { baseUrl: this.baseUrl, timeout: this.timeout }
1111
1368
  });
1112
1369
  await this.waitForServerReady();
@@ -1114,15 +1371,15 @@ var PDFParser = class {
1114
1371
  }
1115
1372
  this.logger.info("[PDFParser] Setting up local server...");
1116
1373
  try {
1117
- const environment = new DoclingEnvironment({
1374
+ this.environment = new DoclingEnvironment({
1118
1375
  logger: this.logger,
1119
1376
  venvPath: this.venvPath,
1120
1377
  port: this.port,
1121
1378
  killExistingProcess: this.killExistingProcess
1122
1379
  });
1123
- await environment.setup();
1380
+ await this.environment.setup();
1124
1381
  const clientUrl = `http://localhost:${this.port}`;
1125
- this.client = new import_docling_sdk.Docling({
1382
+ this.client = new import_docling_sdk2.Docling({
1126
1383
  api: {
1127
1384
  baseUrl: clientUrl,
1128
1385
  timeout: this.timeout
@@ -1136,9 +1393,9 @@ var PDFParser = class {
1136
1393
  }
1137
1394
  }
1138
1395
  checkOperatingSystem() {
1139
- if ((0, import_node_os2.platform)() !== "darwin") {
1396
+ if ((0, import_node_os3.platform)() !== "darwin") {
1140
1397
  throw new Error(
1141
- "PDFParser is only supported on macOS. Current platform: " + (0, import_node_os2.platform)()
1398
+ "PDFParser is only supported on macOS. Current platform: " + (0, import_node_os3.platform)()
1142
1399
  );
1143
1400
  }
1144
1401
  }
@@ -1222,7 +1479,7 @@ var PDFParser = class {
1222
1479
  });
1223
1480
  await environment.startServer();
1224
1481
  this.client?.destroy();
1225
- this.client = new import_docling_sdk.Docling({
1482
+ this.client = new import_docling_sdk2.Docling({
1226
1483
  api: {
1227
1484
  baseUrl: `http://localhost:${this.port}`,
1228
1485
  timeout: this.timeout
@@ -1266,6 +1523,12 @@ var PDFParser = class {
1266
1523
  "PDFParser is not initialized. Call init() before using parse()"
1267
1524
  );
1268
1525
  }
1526
+ if (options.pipeline === "vlm" && this.environment && !this.baseUrl) {
1527
+ this.logger.info(
1528
+ "[PDFParser] VLM pipeline requested, ensuring VLM dependencies..."
1529
+ );
1530
+ await this.environment.setupVlmDependencies();
1531
+ }
1269
1532
  const canRecover = !this.baseUrl && this.port !== void 0;
1270
1533
  const maxAttempts = PDF_PARSER.MAX_SERVER_RECOVERY_ATTEMPTS;
1271
1534
  let attempt = 0;
@@ -1275,7 +1538,8 @@ var PDFParser = class {
1275
1538
  const converter = new PDFConverter(
1276
1539
  this.logger,
1277
1540
  this.client,
1278
- effectiveFallbackEnabled
1541
+ effectiveFallbackEnabled,
1542
+ this.timeout
1279
1543
  );
1280
1544
  return await converter.convert(
1281
1545
  url,
@@ -1323,7 +1587,10 @@ var PDFParser = class {
1323
1587
  };
1324
1588
  // Annotate the CommonJS export names for ESM import in node:
1325
1589
  0 && (module.exports = {
1590
+ DEFAULT_VLM_MODEL,
1326
1591
  ImagePdfFallbackError,
1327
- PDFParser
1592
+ PDFParser,
1593
+ VLM_MODELS,
1594
+ resolveVlmModel
1328
1595
  });
1329
1596
  //# sourceMappingURL=index.cjs.map