@heripo/pdf-parser 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -30,19 +30,17 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
30
30
  // src/index.ts
31
31
  var src_exports = {};
32
32
  __export(src_exports, {
33
- DEFAULT_VLM_MODEL: () => DEFAULT_VLM_MODEL,
34
33
  ImagePdfFallbackError: () => ImagePdfFallbackError,
35
34
  PDFParser: () => PDFParser,
36
- VLM_MODELS: () => VLM_MODELS,
37
- resolveVlmModel: () => resolveVlmModel
35
+ VlmResponseValidator: () => VlmResponseValidator
38
36
  });
39
37
  module.exports = __toCommonJS(src_exports);
40
38
 
41
39
  // src/core/pdf-parser.ts
42
- var import_docling_sdk2 = require("docling-sdk");
40
+ var import_docling_sdk = require("docling-sdk");
43
41
  var import_node_child_process3 = require("child_process");
44
- var import_node_os3 = require("os");
45
- var import_node_path6 = require("path");
42
+ var import_node_os2 = require("os");
43
+ var import_node_path8 = require("path");
46
44
 
47
45
  // src/config/constants.ts
48
46
  var PDF_PARSER = {
@@ -93,22 +91,42 @@ var IMAGE_PDF_CONVERTER = {
93
91
  */
94
92
  QUALITY: 100
95
93
  };
96
- var VLM_ENVIRONMENT = {
97
- /**
98
- * Timeout for VLM dependency installation (pip install) in milliseconds (3 hours).
99
- * VLM packages can be very large and may require extended download times
100
- * depending on network conditions.
101
- */
102
- SETUP_TIMEOUT_MS: 108e5,
103
- /**
104
- * Timeout for VLM model download in milliseconds (3 hours).
105
- * Large VLM models (e.g., multi-GB weights) need sufficient time to download.
106
- */
107
- MODEL_DOWNLOAD_TIMEOUT_MS: 108e5
108
- };
109
94
 
110
95
  // ../shared/dist/index.mjs
111
96
  var import_child_process = require("child_process");
97
+ var import_ai = require("ai");
98
var ConcurrentPool = class {
  /**
   * Process items concurrently using a worker pool pattern.
   *
   * Spawns up to `concurrency` workers that pull items from a shared queue.
   * Each worker processes one item at a time; when it finishes, it immediately
   * takes the next available item. Results maintain the original item order.
   *
   * `concurrency` is clamped to at least one worker, so `0`, negative or
   * non-numeric values still process every item (one at a time) instead of
   * returning an array of empty slots. Once any item fails, the remaining
   * workers stop pulling new items from the queue; items already in flight
   * run to completion, and the returned promise rejects with the first error.
   *
   * @param items - Array of items to process
   * @param concurrency - Maximum number of concurrent workers (minimum 1)
   * @param processFn - Async function to process each item; receives (item, index)
   * @param onItemComplete - Optional callback fired after each item completes; receives (result, index)
   * @returns Array of results in the same order as the input items
   * @throws The first error raised by `processFn` or `onItemComplete`
   */
  static async run(items, concurrency, processFn, onItemComplete) {
    const results = new Array(items.length);

    // Math.min(concurrency, items.length) alone yields 0 or NaN for bad input,
    // which would start no workers at all. Clamp to a usable worker count.
    const requested = Number(concurrency);
    const limit = Number.isNaN(requested) ? 1 : Math.max(1, Math.floor(requested));
    const workerCount = Math.min(limit, items.length);

    let nextIndex = 0;
    let failed = false;

    const worker = async () => {
      // `failed` is checked before every pull so one rejection stops the pool
      // from starting further work that nobody will ever observe.
      while (!failed && nextIndex < items.length) {
        const index = nextIndex++;
        try {
          results[index] = await processFn(items[index], index);
          onItemComplete?.(results[index], index);
        } catch (error) {
          failed = true;
          throw error;
        }
      }
    };

    await Promise.all(Array.from({ length: workerCount }, () => worker()));
    return results;
  }
};
112
130
  function spawnAsync(command, args, options = {}) {
113
131
  const {
114
132
  captureStdout = true,
@@ -135,10 +153,456 @@ function spawnAsync(command, args, options = {}) {
135
153
  proc.on("error", reject);
136
154
  });
137
155
  }
156
/**
 * Classify a language model by the substring found in its `provider` field.
 *
 * Needles are tested in the listed order and the first match wins. A missing,
 * empty or non-string `provider`, or one that matches no needle, yields
 * "unknown".
 *
 * @param model - Object exposing a `provider` property
 * @returns "openai" | "google" | "anthropic" | "togetherai" | "unknown"
 */
function detectProvider(model) {
  const providerId = model.provider;
  if (typeof providerId !== "string" || providerId === "") {
    return "unknown";
  }
  // [needle searched for in providerId, provider type reported]
  const providerTable = [
    ["openai", "openai"],
    ["google", "google"],
    ["anthropic", "anthropic"],
    ["together", "togetherai"]
  ];
  const hit = providerTable.find(([needle]) => providerId.includes(needle));
  return hit ? hit[1] : "unknown";
}
165
var LLMCaller = class {
  /**
   * Extract a model name from a LanguageModel object.
   *
   * Returns the first of `modelId`, `id`, `model`, `name` (checked in that
   * order) that is a string; otherwise falls back to `String(model)`.
   *
   * @param model - Language model object (or identifier)
   * @returns Model name used for usage reporting
   */
  static extractModelName(model) {
    for (const key of ["modelId", "id", "model", "name"]) {
      const candidate = model[key];
      if (typeof candidate === "string") return candidate;
    }
    return String(model);
  }
  /**
   * Build usage information from a response.
   *
   * Missing token counts default to 0. When the response carries no
   * `totalTokens`, the total is derived from input + output so that a call
   * which reported its input/output counts is not recorded as a 0-token call.
   *
   * @param config - Call configuration; `component` and `phase` are copied through
   * @param modelName - Name of the model that produced the response
   * @param response - Response object with an optional `usage` record
   * @param usedFallback - Whether the fallback model produced the response
   * @returns Usage record tagged "primary" or "fallback"
   */
  static buildUsage(config, modelName, response, usedFallback) {
    const inputTokens = response.usage?.inputTokens ?? 0;
    const outputTokens = response.usage?.outputTokens ?? 0;
    return {
      component: config.component,
      phase: config.phase,
      model: usedFallback ? "fallback" : "primary",
      modelName,
      inputTokens,
      outputTokens,
      totalTokens: response.usage?.totalTokens ?? inputTokens + outputTokens
    };
  }
  /**
   * Maximum number of retries when structured output generation fails.
   * Total attempts = MAX_STRUCTURED_OUTPUT_RETRIES + 1.
   *
   * Applied to both:
   * - `Output.object()` path: retries on NoObjectGeneratedError (schema mismatch)
   * - Tool call path: retries when model does not produce a tool call
   */
  static MAX_STRUCTURED_OUTPUT_RETRIES = 10;
  /**
   * Generate structured output via forced tool call.
   *
   * Used for providers (Together AI, unknown) that do not reliably support
   * `Output.object()`. Forces the model to call a tool whose inputSchema
   * is the target Zod schema, then extracts the parsed input.
   *
   * Retries up to MAX_STRUCTURED_OUTPUT_RETRIES times when the model does not
   * produce a tool call, for a total of MAX_STRUCTURED_OUTPUT_RETRIES + 1 attempts.
   *
   * NOTE(review): only the usage of the attempt that produced the tool call is
   * returned; tokens spent on earlier attempts without a tool call are not
   * included in the result.
   *
   * @param model - Language model to call
   * @param schema - Schema used as the tool's input schema
   * @param promptParams - Extra parameters spread into `generateText`
   * @returns `{ output, usage }` where `output` is the tool call input
   * @throws NoObjectGeneratedError when all attempts fail to produce a tool call
   */
  static async generateViaToolCall(model, schema, promptParams) {
    const submitTool = (0, import_ai.tool)({
      description: "Submit the structured result",
      inputSchema: schema
    });
    let lastResult;
    for (let attempt = 0; attempt <= this.MAX_STRUCTURED_OUTPUT_RETRIES; attempt++) {
      lastResult = await (0, import_ai.generateText)({
        ...promptParams,
        model,
        tools: { submitResult: submitTool },
        toolChoice: { type: "tool", toolName: "submitResult" },
        stopWhen: (0, import_ai.hasToolCall)("submitResult")
      });
      const toolCall = lastResult.toolCalls?.[0];
      if (toolCall) {
        return {
          output: toolCall.input,
          usage: lastResult.usage
        };
      }
    }
    throw new import_ai.NoObjectGeneratedError({
      message: "Model did not produce a tool call for structured output",
      text: lastResult.text ?? "",
      response: lastResult.response,
      usage: lastResult.usage,
      finishReason: lastResult.finishReason
    });
  }
  /**
   * Generate structured output with provider-aware strategy.
   *
   * Strategy per provider:
   * - OpenAI / Anthropic / Google Gemini: `Output.object()` with schema retry
   * - Together AI / unknown: forced tool call pattern
   *
   * Retries up to MAX_STRUCTURED_OUTPUT_RETRIES times on NoObjectGeneratedError
   * (schema mismatch), re-throwing the last error if all attempts fail. Any
   * other error is re-thrown immediately without retrying.
   *
   * @param model - Language model to call
   * @param schema - Target output schema
   * @param promptParams - Extra parameters spread into `generateText`
   * @returns The `generateText` result (or `{ output, usage }` on the tool call path)
   */
  static async generateStructuredOutput(model, schema, promptParams) {
    const providerType = detectProvider(model);
    if (providerType === "togetherai" || providerType === "unknown") {
      return this.generateViaToolCall(model, schema, promptParams);
    }
    let lastError;
    for (let attempt = 0; attempt <= this.MAX_STRUCTURED_OUTPUT_RETRIES; attempt++) {
      try {
        return await (0, import_ai.generateText)({
          model,
          output: import_ai.Output.object({ schema }),
          ...promptParams
        });
      } catch (error) {
        if (import_ai.NoObjectGeneratedError.isInstance(error)) {
          lastError = error;
          continue;
        }
        throw error;
      }
    }
    throw lastError;
  }
  /**
   * Execute an LLM call with fallback support.
   *
   * Common execution logic for both text and vision calls. Runs `generateFn`
   * with the primary model; if that throws, the call was not aborted, and a
   * fallback model is configured, runs `generateFn` once more with the
   * fallback model. The primary error is re-thrown when the abort signal is
   * already aborted or no fallback model exists. An error from the fallback
   * run propagates as-is.
   *
   * @param config - Call configuration (`primaryModel`, optional `fallbackModel` and `abortSignal`)
   * @param generateFn - Function that performs the generation for a given model
   * @returns `{ output, usage, usedFallback }`
   */
  static async executeWithFallback(config, generateFn) {
    const toResult = (response, modelName, usedFallback) => ({
      output: response.output,
      usage: this.buildUsage(config, modelName, response, usedFallback),
      usedFallback
    });
    // Resolved outside the try block so a failure here never triggers the fallback.
    const primaryModelName = this.extractModelName(config.primaryModel);
    try {
      const response = await generateFn(config.primaryModel);
      return toResult(response, primaryModelName, false);
    } catch (primaryError) {
      if (config.abortSignal?.aborted || !config.fallbackModel) {
        throw primaryError;
      }
      const fallbackModelName = this.extractModelName(config.fallbackModel);
      const response = await generateFn(config.fallbackModel);
      return toResult(response, fallbackModelName, true);
    }
  }
  /**
   * Call LLM with retry and fallback support.
   *
   * Strategy:
   * 1. Generate structured output with the primary model. `config.maxRetries`
   *    is forwarded to `generateText`; structured-output failures are retried
   *    separately up to MAX_STRUCTURED_OUTPUT_RETRIES times.
   * 2. If that fails and `fallbackModel` is provided, repeat with the fallback model.
   * 3. Throw if the fallback fails too (or none is configured).
   *
   * Provider-aware strategy is automatically applied based on the model's provider field.
   *
   * @template TOutput - Output type from schema validation
   * @param config - LLM call configuration
   * @returns Result with parsed object and usage information
   * @throws Error if all attempts fail
   */
  static async call(config) {
    return this.executeWithFallback(
      config,
      (model) => this.generateStructuredOutput(model, config.schema, {
        system: config.systemPrompt,
        prompt: config.userPrompt,
        temperature: config.temperature,
        maxRetries: config.maxRetries,
        abortSignal: config.abortSignal
      })
    );
  }
  /**
   * Call LLM for vision tasks with message format support.
   *
   * Same retry and fallback logic as call(), but using message format instead of system/user prompts.
   * Provider-aware strategy is automatically applied based on the model's provider field.
   *
   * @template TOutput - Output type from schema validation
   * @param config - LLM vision call configuration
   * @returns Result with parsed object and usage information
   * @throws Error if all attempts fail
   */
  static async callVision(config) {
    return this.executeWithFallback(
      config,
      (model) => this.generateStructuredOutput(model, config.schema, {
        messages: config.messages,
        temperature: config.temperature,
        maxRetries: config.maxRetries,
        abortSignal: config.abortSignal
      })
    );
  }
};
358
/**
 * Render a token usage record as "<input> input, <output> output, <total> total".
 *
 * @param usage - Object with `inputTokens`, `outputTokens` and `totalTokens`
 * @returns Single-line summary string
 */
function formatTokens(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  const parts = [
    `${inputTokens} input`,
    `${outputTokens} output`,
    `${totalTokens} total`
  ];
  return parts.join(", ");
}
361
var LLMTokenUsageAggregator = class {
  /**
   * Aggregated usage keyed by component name.
   *
   * Created without a prototype (as is each component's `phases` map) so that
   * component or phase names such as "constructor", "toString" or "__proto__"
   * behave like ordinary keys instead of resolving to inherited members.
   */
  usage = Object.create(null);
  /**
   * Track token usage from an LLM call
   *
   * Adds the counts to the matching model bucket ("primary" or "fallback") of
   * the phase, to the phase total and to the component total. A usage record
   * whose `model` is neither "primary" nor "fallback" only affects the totals.
   * The `modelName` stored for a bucket is the one seen on its first record.
   *
   * @param usage - Extended token usage with component/phase/model information
   */
  track(usage) {
    const emptyTotals = () => ({ inputTokens: 0, outputTokens: 0, totalTokens: 0 });
    const addTo = (target) => {
      target.inputTokens += usage.inputTokens;
      target.outputTokens += usage.outputTokens;
      target.totalTokens += usage.totalTokens;
    };
    const component = this.usage[usage.component] ??= {
      component: usage.component,
      phases: Object.create(null),
      total: emptyTotals()
    };
    const phase = component.phases[usage.phase] ??= { total: emptyTotals() };
    if (usage.model === "primary" || usage.model === "fallback") {
      const bucket = phase[usage.model] ??= {
        modelName: usage.modelName,
        ...emptyTotals()
      };
      addTo(bucket);
    }
    addTo(phase.total);
    addTo(component.total);
  }
  /**
   * Get aggregated usage grouped by component
   *
   * @returns Array of component aggregates with phase breakdown
   */
  getByComponent() {
    return Object.values(this.usage);
  }
  /**
   * Get token usage report in structured JSON format
   *
   * Converts internal usage data to external TokenUsageReport format suitable
   * for serialization and reporting. The report includes component breakdown,
   * phase-level details, and both primary and fallback model usage. All
   * objects in the report are fresh copies of the internal state.
   *
   * @returns Structured token usage report with components and total
   */
  getReport() {
    const copyTotals = ({ inputTokens, outputTokens, totalTokens }) => ({
      inputTokens,
      outputTokens,
      totalTokens
    });
    const copyModelUsage = (entry) => ({
      modelName: entry.modelName,
      ...copyTotals(entry)
    });
    const components = Object.values(this.usage).map((component) => ({
      component: component.component,
      phases: Object.entries(component.phases).map(([phaseName, phaseData]) => {
        const phaseReport = {
          phase: phaseName,
          total: copyTotals(phaseData.total)
        };
        // primary/fallback keys are only present when that model was used.
        if (phaseData.primary) {
          phaseReport.primary = copyModelUsage(phaseData.primary);
        }
        if (phaseData.fallback) {
          phaseReport.fallback = copyModelUsage(phaseData.fallback);
        }
        return phaseReport;
      }),
      total: copyTotals(component.total)
    }));
    return {
      components,
      total: copyTotals(this.getTotalUsage())
    };
  }
  /**
   * Get total usage across all components and phases
   *
   * @returns Aggregated token usage totals
   */
  getTotalUsage() {
    const totals = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
    for (const component of Object.values(this.usage)) {
      totals.inputTokens += component.total.inputTokens;
      totals.outputTokens += component.total.outputTokens;
      totals.totalTokens += component.total.totalTokens;
    }
    return totals;
  }
  /**
   * Log comprehensive token usage summary
   *
   * Outputs usage grouped by component, with phase and model breakdown.
   * Shows primary and fallback token usage separately for each phase.
   * The "Primary total" / "Fallback total" lines are only emitted when the
   * respective total token count is greater than zero.
   * Call this once at the end of document processing.
   *
   * @param logger - Logger instance for output (only `info` is used)
   */
  logSummary(logger) {
    const components = this.getByComponent();
    if (components.length === 0) {
      logger.info("[DocumentProcessor] No token usage to report");
      return;
    }
    const emptyTotals = () => ({ inputTokens: 0, outputTokens: 0, totalTokens: 0 });
    const accumulate = (target, source) => {
      target.inputTokens += source.inputTokens;
      target.outputTokens += source.outputTokens;
      target.totalTokens += source.totalTokens;
    };
    const grand = emptyTotals();
    const grandPrimary = emptyTotals();
    const grandFallback = emptyTotals();
    logger.info("[DocumentProcessor] Token usage summary:");
    logger.info("");
    for (const component of components) {
      logger.info(`${component.component}:`);
      for (const [phase, phaseData] of Object.entries(component.phases)) {
        logger.info(` - ${phase}:`);
        if (phaseData.primary) {
          logger.info(
            ` primary (${phaseData.primary.modelName}): ${formatTokens(phaseData.primary)}`
          );
          accumulate(grandPrimary, phaseData.primary);
        }
        if (phaseData.fallback) {
          logger.info(
            ` fallback (${phaseData.fallback.modelName}): ${formatTokens(phaseData.fallback)}`
          );
          accumulate(grandFallback, phaseData.fallback);
        }
        logger.info(` subtotal: ${formatTokens(phaseData.total)}`);
      }
      logger.info(
        ` ${component.component} total: ${formatTokens(component.total)}`
      );
      logger.info("");
      accumulate(grand, component.total);
    }
    logger.info("--- Summary ---");
    if (grandPrimary.totalTokens > 0) {
      logger.info(`Primary total: ${formatTokens(grandPrimary)}`);
    }
    if (grandFallback.totalTokens > 0) {
      logger.info(`Fallback total: ${formatTokens(grandFallback)}`);
    }
    logger.info(`Grand total: ${formatTokens(grand)}`);
  }
  /**
   * Reset all tracked usage
   *
   * Call this at the start of a new document processing run.
   */
  reset() {
    this.usage = Object.create(null);
  }
};
138
603
 
139
604
  // src/environment/docling-environment.ts
140
605
  var import_node_child_process = require("child_process");
141
- var import_node_os = require("os");
142
606
  var import_node_path = require("path");
143
607
 
144
608
  // src/utils/python-version.ts
@@ -180,7 +644,6 @@ var DoclingEnvironment = class _DoclingEnvironment {
180
644
  venvPath;
181
645
  port;
182
646
  killExistingProcess;
183
- vlmDependenciesInstalled = false;
184
647
  constructor(options) {
185
648
  this.logger = options.logger;
186
649
  this.venvPath = options.venvPath;
@@ -299,7 +762,11 @@ var DoclingEnvironment = class _DoclingEnvironment {
299
762
  }
300
763
  async installDoclingServe() {
301
764
  const pipPath = (0, import_node_path.join)(this.venvPath, "bin", "pip");
302
- const result = await spawnAsync(pipPath, ["install", "docling-serve"]);
765
+ const result = await spawnAsync(pipPath, [
766
+ "install",
767
+ "--upgrade",
768
+ "docling-serve"
769
+ ]);
303
770
  if (result.code !== 0) {
304
771
  this.logger.error(
305
772
  "[DoclingEnvironment] Failed to install docling-serve:",
@@ -310,81 +777,6 @@ var DoclingEnvironment = class _DoclingEnvironment {
310
777
  );
311
778
  }
312
779
  }
313
- /**
314
- * Install VLM-specific dependencies for the Docling VLM pipeline.
315
- *
316
- * Installs:
317
- * 1. docling-serve[vlm] - VLM model support for docling-serve
318
- * 2. mlx + mlx-lm (macOS ARM64 only) - Apple Silicon optimized inference
319
- *
320
- * This is idempotent - subsequent calls skip if already installed.
321
- */
322
- async setupVlmDependencies() {
323
- if (this.vlmDependenciesInstalled) {
324
- this.logger.info(
325
- "[DoclingEnvironment] VLM dependencies already installed, skipping"
326
- );
327
- return;
328
- }
329
- if (await this.isVlmReady()) {
330
- this.vlmDependenciesInstalled = true;
331
- this.logger.info(
332
- "[DoclingEnvironment] VLM dependencies already installed, skipping"
333
- );
334
- return;
335
- }
336
- this.logger.info("[DoclingEnvironment] Installing VLM dependencies...");
337
- const pipPath = (0, import_node_path.join)(this.venvPath, "bin", "pip");
338
- this.logger.info("[DoclingEnvironment] Installing docling[vlm]...");
339
- const vlmResult = await spawnAsync(
340
- pipPath,
341
- ["install", "docling-serve[vlm]"],
342
- { timeout: VLM_ENVIRONMENT.SETUP_TIMEOUT_MS }
343
- );
344
- if (vlmResult.code !== 0) {
345
- this.logger.error(
346
- "[DoclingEnvironment] Failed to install docling-serve[vlm]:",
347
- vlmResult.stderr
348
- );
349
- throw new Error(
350
- `Failed to install docling-serve[vlm]. Exit code: ${vlmResult.code}`
351
- );
352
- }
353
- if ((0, import_node_os.platform)() === "darwin" && (0, import_node_os.arch)() === "arm64") {
354
- this.logger.info(
355
- "[DoclingEnvironment] Installing mlx + mlx-lm for Apple Silicon..."
356
- );
357
- const mlxResult = await spawnAsync(
358
- pipPath,
359
- ["install", "mlx", "mlx-lm"],
360
- { timeout: VLM_ENVIRONMENT.SETUP_TIMEOUT_MS }
361
- );
362
- if (mlxResult.code !== 0) {
363
- this.logger.error(
364
- "[DoclingEnvironment] Failed to install mlx/mlx-lm:",
365
- mlxResult.stderr
366
- );
367
- throw new Error(
368
- `Failed to install mlx/mlx-lm. Exit code: ${mlxResult.code}`
369
- );
370
- }
371
- }
372
- this.vlmDependenciesInstalled = true;
373
- this.logger.info(
374
- "[DoclingEnvironment] VLM dependencies installed successfully"
375
- );
376
- }
377
- /**
378
- * Check if VLM dependencies are ready by verifying Python module imports
379
- */
380
- async isVlmReady() {
381
- const pythonPath = (0, import_node_path.join)(this.venvPath, "bin", "python");
382
- const result = await spawnAsync(pythonPath, [
383
- "-c",
384
- "import docling_core; import docling"
385
- ]);
386
- return result.code === 0;
387
- }
388
780
  async isPortInUse(port) {
389
781
  try {
390
782
  const result = await spawnAsync("lsof", ["-ti", `:${port}`]);
@@ -455,8 +847,13 @@ var DoclingEnvironment = class _DoclingEnvironment {
455
847
  const doclingProcess = (0, import_node_child_process.spawn)(doclingServePath, args, {
456
848
  detached: true,
457
849
  // Detached from parent process
458
- stdio: "ignore"
850
+ stdio: "ignore",
459
851
  // Remove stdio pipes to prevent event loop from hanging
852
+ env: {
853
+ ...process.env,
854
+ // Enable remote API calls for API VLM models
855
+ DOCLING_SERVE_ENABLE_REMOTE_SERVICES: "true"
856
+ }
460
857
  });
461
858
  doclingProcess.unref();
462
859
  doclingProcess.on("error", (error) => {
@@ -471,126 +868,11 @@ var DoclingEnvironment = class _DoclingEnvironment {
471
868
  };
472
869
 
473
870
  // src/core/pdf-converter.ts
474
- var import_docling_sdk = require("docling-sdk");
475
871
  var import_es_toolkit = require("es-toolkit");
476
- var import_node_fs4 = require("fs");
477
- var import_node_path5 = require("path");
478
- var import_promises = require("stream/promises");
479
-
480
- // src/config/vlm-models.ts
481
- var VLM_MODELS = {
482
- // ── DocTags models (specialized document structure output) ──────────
483
- "granite-docling-258M-mlx": {
484
- repo_id: "ibm-granite/granite-docling-258M-mlx",
485
- inference_framework: "mlx",
486
- response_format: "doctags",
487
- transformers_model_type: "automodel-vision2seq",
488
- description: "Granite Docling 258M (MLX, Apple Silicon optimized, ~6s/page)"
489
- },
490
- "granite-docling-258M": {
491
- repo_id: "ibm-granite/granite-docling-258M",
492
- inference_framework: "transformers",
493
- response_format: "doctags",
494
- transformers_model_type: "automodel-vision2seq",
495
- description: "Granite Docling 258M (Transformers, cross-platform)"
496
- },
497
- "smoldocling-256M-mlx": {
498
- repo_id: "docling-project/SmolDocling-256M-preview-mlx-bf16",
499
- inference_framework: "mlx",
500
- response_format: "doctags",
501
- transformers_model_type: "automodel-vision2seq",
502
- description: "SmolDocling 256M (MLX, fastest option)"
503
- },
504
- "smoldocling-256M": {
505
- repo_id: "docling-project/SmolDocling-256M-preview",
506
- inference_framework: "transformers",
507
- response_format: "doctags",
508
- transformers_model_type: "automodel-vision2seq",
509
- description: "SmolDocling 256M (Transformers)"
510
- },
511
- // ── Markdown models (general-purpose vision LLMs) ──────────────────
512
- "granite-vision-2B": {
513
- repo_id: "ibm-granite/granite-vision-3.2-2b",
514
- inference_framework: "transformers",
515
- response_format: "markdown",
516
- transformers_model_type: "automodel-vision2seq",
517
- description: "Granite Vision 3.2 2B (IBM, higher accuracy)"
518
- },
519
- "qwen25-vl-3B-mlx": {
520
- repo_id: "mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
521
- inference_framework: "mlx",
522
- response_format: "markdown",
523
- transformers_model_type: "automodel-vision2seq",
524
- description: "Qwen 2.5 VL 3B (MLX, multilingual, good KCJ support)"
525
- },
526
- phi4: {
527
- repo_id: "microsoft/Phi-4-multimodal-instruct",
528
- inference_framework: "transformers",
529
- response_format: "markdown",
530
- transformers_model_type: "automodel",
531
- description: "Phi-4 Multimodal (Microsoft, CausalLM)"
532
- },
533
- "pixtral-12B-mlx": {
534
- repo_id: "mlx-community/pixtral-12b-bf16",
535
- inference_framework: "mlx",
536
- response_format: "markdown",
537
- transformers_model_type: "automodel-vision2seq",
538
- description: "Pixtral 12B (MLX, Mistral, high accuracy)"
539
- },
540
- "pixtral-12B": {
541
- repo_id: "mistral-community/pixtral-12b",
542
- inference_framework: "transformers",
543
- response_format: "markdown",
544
- transformers_model_type: "automodel-vision2seq",
545
- description: "Pixtral 12B (Transformers, Mistral)"
546
- },
547
- got2: {
548
- repo_id: "stepfun-ai/GOT-OCR-2.0-hf",
549
- inference_framework: "transformers",
550
- response_format: "markdown",
551
- transformers_model_type: "automodel-vision2seq",
552
- description: "GOT-OCR 2.0 (StepFun, OCR-specialized)"
553
- },
554
- "gemma3-12B-mlx": {
555
- repo_id: "mlx-community/gemma-3-12b-it-bf16",
556
- inference_framework: "mlx",
557
- response_format: "markdown",
558
- transformers_model_type: "automodel-vision2seq",
559
- description: "Gemma 3 12B (MLX, Google)"
560
- },
561
- "gemma3-27B-mlx": {
562
- repo_id: "mlx-community/gemma-3-27b-it-bf16",
563
- inference_framework: "mlx",
564
- response_format: "markdown",
565
- transformers_model_type: "automodel-vision2seq",
566
- description: "Gemma 3 27B (MLX, Google, highest accuracy)"
567
- },
568
- dolphin: {
569
- repo_id: "ByteDance/Dolphin",
570
- inference_framework: "transformers",
571
- response_format: "markdown",
572
- transformers_model_type: "automodel-vision2seq",
573
- description: "Dolphin (ByteDance, document-oriented)"
574
- }
575
- };
576
- var DEFAULT_VLM_MODEL = "granite-docling-258M-mlx";
577
- function resolveVlmModel(model) {
578
- if (typeof model === "string") {
579
- const preset = VLM_MODELS[model];
580
- if (!preset) {
581
- throw new Error(
582
- `Unknown VLM model preset: "${model}". Available presets: ${Object.keys(VLM_MODELS).join(", ")}`
583
- );
584
- }
585
- return {
586
- repo_id: preset.repo_id,
587
- inference_framework: preset.inference_framework,
588
- response_format: preset.response_format,
589
- transformers_model_type: preset.transformers_model_type
590
- };
591
- }
592
- return model;
593
- }
872
+ var import_node_fs7 = require("fs");
873
+ var import_promises = require("fs/promises");
874
+ var import_node_path7 = require("path");
875
+ var import_promises2 = require("stream/promises");
594
876
 
595
877
  // src/errors/image-pdf-fallback-error.ts
596
878
  var ImagePdfFallbackError = class extends Error {
@@ -778,28 +1060,28 @@ var ImageExtractor = class _ImageExtractor {
778
1060
  const baseName = filename.replace((0, import_node_path2.extname)(filename), "");
779
1061
  const jsonPath = (0, import_node_path2.join)(outputDir, `${baseName}.json`);
780
1062
  try {
781
- const pagesDir = (0, import_node_path2.join)(outputDir, "pages");
782
- if (!(0, import_node_fs.existsSync)(pagesDir)) {
783
- (0, import_node_fs.mkdirSync)(pagesDir, { recursive: true });
1063
+ const imagesDir = (0, import_node_path2.join)(outputDir, "images");
1064
+ if (!(0, import_node_fs.existsSync)(imagesDir)) {
1065
+ (0, import_node_fs.mkdirSync)(imagesDir, { recursive: true });
784
1066
  }
785
1067
  const base64Images = await _ImageExtractor.extractBase64ImagesFromJsonWithJq(jsonSourcePath);
786
1068
  base64Images.forEach((base64Data, index) => {
787
1069
  _ImageExtractor.extractBase64ImageToFile(
788
1070
  base64Data,
789
- pagesDir,
1071
+ imagesDir,
790
1072
  index,
791
- "page",
792
- "pages"
1073
+ "pic",
1074
+ "images"
793
1075
  );
794
1076
  });
795
1077
  logger.info(
796
- `[PDFConverter] Extracted ${base64Images.length} images from JSON to ${pagesDir}`
1078
+ `[PDFConverter] Extracted ${base64Images.length} picture images from JSON to ${imagesDir}`
797
1079
  );
798
1080
  const replacedCount = await _ImageExtractor.replaceBase64ImagesInJsonWithJq(
799
1081
  jsonSourcePath,
800
1082
  jsonPath,
801
- "pages",
802
- "page"
1083
+ "images",
1084
+ "pic"
803
1085
  );
804
1086
  logger.info(
805
1087
  `[PDFConverter] Replaced ${replacedCount} base64 images with file paths`
@@ -882,46 +1164,962 @@ var ImageExtractor = class _ImageExtractor {
882
1164
  }
883
1165
  };
884
1166
 
885
- // src/utils/local-file-server.ts
1167
+ // src/processors/page-renderer.ts
886
1168
  var import_node_fs2 = require("fs");
887
- var import_node_http = require("http");
888
1169
  var import_node_path3 = require("path");
889
- var LocalFileServer = class {
890
- server = null;
891
- port = 0;
1170
/** Rendering resolution (DPI) used when the caller does not pass `options.dpi`. */
var DEFAULT_DPI = 300;

/**
 * Renders the pages of a PDF to individual PNG files by invoking the
 * `magick` command-line tool through `spawnAsync`.
 */
var PageRenderer = class {
  /**
   * @param logger - Logger used for progress messages (`info` is called)
   */
  constructor(logger) {
    this.logger = logger;
  }

  /**
   * Render all pages of a PDF to individual PNG files.
   *
   * @param pdfPath - Absolute path to the source PDF file
   * @param outputDir - Directory where the pages/ subdirectory will be created
   * @param options - Rendering options; `options.dpi` overrides DEFAULT_DPI
   * @returns Render result: `pageCount`, `pagesDir` and `pageFiles`
   *          (absolute paths sorted by numeric page index)
   * @throws Error when the `magick` process exits with a non-zero code
   */
  async renderPages(pdfPath, outputDir, options) {
    const dpi = options?.dpi ?? DEFAULT_DPI;
    const pagesDir = (0, import_node_path3.join)(outputDir, "pages");
    // With `recursive: true` this is a no-op when the directory already
    // exists, so no separate existence check is needed.
    (0, import_node_fs2.mkdirSync)(pagesDir, { recursive: true });

    this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
    const outputPattern = (0, import_node_path3.join)(pagesDir, "page_%d.png");
    const result = await spawnAsync("magick", [
      "-density",
      dpi.toString(),
      pdfPath,
      "-background",
      "white",
      "-alpha",
      "remove",
      "-alpha",
      "off",
      outputPattern
    ]);
    if (result.code !== 0) {
      throw new Error(
        `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
      );
    }

    // Accept only names of the exact form page_<digits>.png. A looser
    // prefix/suffix check would let a stray file such as "page_notes.png"
    // through, which is then counted as a page and makes the numeric
    // comparator return NaN (unspecified sort order).
    const pageNamePattern = /^page_(\d+)\.png$/;
    const pageFiles = (0, import_node_fs2.readdirSync)(pagesDir)
      .map((name) => ({ name, match: pageNamePattern.exec(name) }))
      .filter((entry) => entry.match !== null)
      .map((entry) => ({ name: entry.name, pageIndex: parseInt(entry.match[1], 10) }))
      .sort((a, b) => a.pageIndex - b.pageIndex)
      .map((entry) => (0, import_node_path3.join)(pagesDir, entry.name));

    this.logger.info(
      `[PageRenderer] Rendered ${pageFiles.length} pages to ${pagesDir}`
    );
    return {
      pageCount: pageFiles.length,
      pagesDir,
      pageFiles
    };
  }
};
1223
+
1224
+ // src/processors/pdf-text-extractor.ts
1225
/**
 * Reads the text layer of a PDF by running the `pdftotext` and `pdfinfo`
 * command-line tools through `spawnAsync`.
 */
var PdfTextExtractor = class {
  /**
   * @param logger - Logger used for progress (`info`) and failure (`warn`) messages
   */
  constructor(logger) {
    this.logger = logger;
  }

  /**
   * Extract text page by page, one pdftotext invocation per page, in order.
   *
   * @param pdfPath - Absolute path to the source PDF file
   * @param totalPages - Total number of pages in the PDF
   * @returns Map of 1-based page numbers to extracted text strings
   *          (empty string for a page whose extraction failed)
   */
  async extractText(pdfPath, totalPages) {
    this.logger.info(
      `[PdfTextExtractor] Extracting text from ${totalPages} pages...`
    );
    const pageTexts = /* @__PURE__ */ new Map();
    let nonEmptyCount = 0;
    for (let pageNo = 1; pageNo <= totalPages; pageNo += 1) {
      const pageText = await this.extractPageText(pdfPath, pageNo);
      pageTexts.set(pageNo, pageText);
      if (pageText.trim().length > 0) {
        nonEmptyCount += 1;
      }
    }
    this.logger.info(
      `[PdfTextExtractor] Extracted text from ${nonEmptyCount}/${totalPages} pages`
    );
    return pageTexts;
  }

  /**
   * Get the total page count of a PDF from the "Pages:" line of pdfinfo output.
   *
   * @param pdfPath - Absolute path to the source PDF file
   * @returns The page count, or 0 when pdfinfo fails or prints no "Pages:" line
   */
  async getPageCount(pdfPath) {
    const { code, stdout, stderr } = await spawnAsync("pdfinfo", [pdfPath]);
    if (code !== 0) {
      this.logger.warn(
        `[PdfTextExtractor] pdfinfo failed: ${stderr || "Unknown error"}`
      );
      return 0;
    }
    const pagesLine = stdout.match(/^Pages:\s+(\d+)/m);
    if (!pagesLine) {
      return 0;
    }
    return parseInt(pagesLine[1], 10);
  }

  /**
   * Extract text from the entire PDF in a single pdftotext invocation.
   *
   * @param pdfPath - Absolute path to the source PDF file
   * @returns The pdftotext stdout, or an empty string on failure (logged as warning)
   */
  async extractFullText(pdfPath) {
    return this.runPdftotext(
      ["-layout", pdfPath, "-"],
      "pdftotext (full) failed"
    );
  }

  /**
   * Extract text from a single PDF page using pdftotext (-f/-l set to the same page).
   *
   * @param pdfPath - Absolute path to the source PDF file
   * @param page - Page number passed to pdftotext as both first and last page
   * @returns The pdftotext stdout, or an empty string on failure (logged as warning)
   */
  async extractPageText(pdfPath, page) {
    const pageArg = page.toString();
    return this.runPdftotext(
      ["-f", pageArg, "-l", pageArg, "-layout", pdfPath, "-"],
      `pdftotext failed for page ${page}`
    );
  }

  /**
   * Run pdftotext with the given arguments and return its stdout.
   * A non-zero exit code is logged as a warning and yields an empty string.
   *
   * @param args - Command-line arguments for pdftotext
   * @param failureLabel - Text placed before the stderr detail in the warning
   */
  async runPdftotext(args, failureLabel) {
    const { code, stdout, stderr } = await spawnAsync("pdftotext", args);
    if (code === 0) {
      return stdout;
    }
    this.logger.warn(
      `[PdfTextExtractor] ${failureLabel}: ${stderr || "Unknown error"}`
    );
    return "";
  }
};
1305
+
1306
+ // src/processors/vlm-text-corrector.ts
1307
+ var import_node_fs3 = require("fs");
1308
+ var import_node_path4 = require("path");
1309
+
1310
+ // src/types/vlm-text-correction-schema.ts
1311
+ var import_v4 = require("zod/v4");
1312
/**
 * Schema of the structured VLM response for text correction.
 *
 * Shape: { tc: [{ i, s: [{ f, r }] }], cc: [{ ti, r, c, t }] }
 */
var vlmTextCorrectionSchema = (() => {
  const { z } = import_v4;
  // Every index in the response is a non-negative integer.
  const indexField = () => z.number().int().nonnegative();

  // One find/replace pair: `f` is the exact garbled substring to find,
  // `r` is the corrected replacement text.
  const substitution = z.object({
    f: z.string(),
    r: z.string()
  });

  // Corrections for one text element: `i` is the text element index
  // (from the prompt), `s` the substitutions applied left-to-right.
  const textCorrection = z.object({
    i: indexField(),
    s: z.array(substitution)
  });

  // Correction for one table cell: `ti` is the table index within the page,
  // `r`/`c` are the row/column indices, `t` is the corrected cell text.
  const cellCorrection = z.object({
    ti: indexField(),
    r: indexField(),
    c: indexField(),
    t: z.string()
  });

  return z.object({
    tc: z.array(textCorrection),
    cc: z.array(cellCorrection)
  });
})();
1343
+
1344
+ // src/processors/vlm-text-corrector.ts
1345
/** Default number of pages corrected concurrently when `options.concurrency` is not given. */
var DEFAULT_CONCURRENCY = 1;
/** Default retry count handed to the vision call when `options.maxRetries` is not given. */
var DEFAULT_MAX_RETRIES = 3;
/** Default sampling temperature handed to the vision call when `options.temperature` is not given. */
var DEFAULT_TEMPERATURE = 0;
/** Minimum character-overlap score for a reference block to be accepted as a match. */
var REFERENCE_MATCH_THRESHOLD = 0.4;

/** Display names, keyed by base language code, used in the language-context prompt prefix. */
var LANGUAGE_DISPLAY_NAMES = Object.fromEntries([
  ["ko", "Korean (\uD55C\uAD6D\uC5B4)"],
  ["ja", "Japanese (\u65E5\u672C\u8A9E)"],
  ["zh", "Chinese (\u4E2D\u6587)"],
  ["en", "English"],
  ["fr", "French (Fran\xE7ais)"],
  ["de", "German (Deutsch)"],
  ["es", "Spanish (Espa\xF1ol)"],
  ["pt", "Portuguese (Portugu\xEAs)"],
  ["ru", "Russian (\u0420\u0443\u0441\u0441\u043A\u0438\u0439)"],
  ["uk", "Ukrainian (\u0423\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u0430)"],
  ["it", "Italian (Italiano)"]
]);

/** Two-letter type codes written into the prompt for each supported text label. */
var LABEL_TO_TYPE_CODE = Object.fromEntries([
  ["section_header", "sh"],
  ["text", "tx"],
  ["caption", "ca"],
  ["footnote", "fn"],
  ["list_item", "li"],
  ["page_header", "ph"],
  ["page_footer", "pf"]
]);

/** Labels of text items that are eligible for correction (the keys of LABEL_TO_TYPE_CODE). */
var TEXT_LABELS = new Set(Object.keys(LABEL_TO_TYPE_CODE));
1372
+ var TEXT_CORRECTION_SYSTEM_PROMPT = `You are a text correction engine for OCR output from Korean archaeological (\u8003\u53E4\u5B78) report PDFs. Compare OCR text against the page image and reference text to fix errors.
1373
+
1374
+ The OCR engine cannot read Chinese characters (\u6F22\u5B57/Hanja) correctly. These errors appear as:
1375
+ - Random ASCII letters/symbols: \u718A\u6D25 \u2192 "M", \u5C0F\u4EAC\u5236 \u2192 "5\u2606", \u6545\u5740 \u2192 "Bbt"
1376
+ - Meaningless Korean syllables: \u6771\u660E \u2192 "\uD587\uBC30", \u91D1\u61B2\u660C \u2192 "\uC232", \u7E3D\u7BA1 \u2192 "3\uC501"
1377
+ - Number/symbol noise: \u718A\u5DDD\u5DDE \u2192 "IEJIM", \u6E6F\u4E95\u90E1 \u2192 "3#"
1378
+ - Hanja dropped entirely: (\u682A)\u97D3\u570B\u7E96\u7DAD \u2192 (\uC8FC), (\u8CA1)\u5FE0\u6DF8\u6587\u5316\u8CA1\u784F\u7A76\u9662 \u2192 (\uC7AC)\uCDA9\uB0A8\uBB38\uD654\uC7AC\uC5F0\uAD6C\uC6D0
1379
+ - Phonetic reading substitution (\u97F3\u8B80): \u6F22\u5B57 replaced by Korean pronunciation, e.g. \u5FE0\u6DF8\u6587\u5316\u8CA1\u784F\u7A76\u9662 \u2192 \uCDA9\uB0A8\uBB38\uD654\uC7AC\uC5F0\uAD6C\uC6D0, \u5BE6\u7389\u6D1E\u907A\u8E5F \u2192 \uC2E4\uC625\uB3D9\uC720\uC801
1380
+
1381
+ FIX: garbled/wrong Chinese characters, mojibake, encoding artifacts, random ASCII/Korean replacing Hanja, dropped Hanja, phonetic reading substitutions
1382
+ KEEP: correct text, structure, punctuation, whitespace
1383
+
1384
+ Input format:
1385
+ T: (text elements) index|type|text
1386
+ Optional: index|ref|reference_text (PDF text layer for the above element)
1387
+ C: (table cells) tableIndex|row,col|text
1388
+ Optional: C_REF: (unused pdftotext blocks as table reference)
1389
+
1390
+ FOOTNOTE (fn) SPECIAL INSTRUCTIONS:
1391
+ - Footnotes in archaeological reports contain institution names with Hanja that are severely garbled
1392
+ - Common pattern: (\u8CA1)\u6A5F\u95DC\u540D\u784F\u7A76\u9662 \u2192 (W)#X1CR003T or (W): 103 or similar ASCII noise
1393
+ - When OCR shows patterns like (W), (M), or random ASCII where an institution name should be, READ THE IMAGE directly
1394
+ - Institution names follow patterns like: (\u8CA1)OO\u6587\u5316\u8CA1\u784F\u7A76\u9662, (\u682A)OO, (\u793E)OO\u5B78\u6703
1395
+
1396
+ TABLE CELL (C:) SPECIAL INSTRUCTIONS:
1397
+ - Table headers often contain Hanja that OCR cannot read: \u767C\u520A\u65E5, \u6642\u4EE3, \u8ABF\u67FB\u7DE3\u7531, \u8ABF\u67FB\u6A5F\u95DC, \u907A\u8E5F\u540D, \u985E\u578B \uBC0F \u57FA\u6578
1398
+ - When OCR shows garbled characters like "\u20A9 A", "#\uCA6F\uBC0F\uD45C\uBF70" in table cells, READ THE IMAGE directly
1399
+ - If C_REF is present, use it as additional context for correcting table cells
1400
+
1401
+ When a |ref| line is present:
1402
+ - It shows text extracted directly from the PDF text layer for that element
1403
+ - If OCR text contains garbled characters but ref text looks correct, USE the ref text
1404
+ - For long paragraphs, align OCR and ref text segment by segment to identify and fix each garbled portion
1405
+ - IMPORTANT: If BOTH OCR and ref text are garbled (e.g. CJK font encoding issues), IGNORE the ref text and READ THE IMAGE directly
1406
+
1407
+ When NO |ref| line is present:
1408
+ - The PDF text layer could not be matched to this element
1409
+ - READ THE IMAGE directly to determine the correct text
1410
+
1411
+ Output JSON with corrections:
1412
+ tc=[{i:index, s:[{f:"garbled_substring",r:"corrected_text"}, ...]}] for text
1413
+ cc=[{ti:tableIndex, r:row, c:col, t:corrected}] for table cells
1414
+
1415
+ Substitution rules for tc:
1416
+ - 'f': exact garbled/wrong substring from the input text (must match exactly)
1417
+ - 'r': the corrected replacement
1418
+ - Include ALL garbled portions for each element as separate s entries
1419
+ - Order substitutions left-to-right as they appear in the text
1420
+ - Do NOT include unchanged text \u2014 only the specific substrings that need fixing
1421
+
1422
+ If all correct: {"tc":[],"cc":[]}`;
1423
/**
 * Corrects OCR text of a DoclingDocument page by page: each page's text
 * elements and table cells are sent, together with the rendered page image,
 * to a vision language model, and the returned substitutions are applied to
 * the document.
 */
var VlmTextCorrector = class {
  /**
   * @param logger - Logger used for info/debug/warn messages
   */
  constructor(logger) {
    this.logger = logger;
  }

  /**
   * Read the DoclingDocument from the output directory, correct its text via
   * the VLM, and write the corrected document back to the same file.
   *
   * @param outputDir - Directory containing result.json and pages/
   * @param model - Vision language model for text correction
   * @param options - Processing options (koreanHanjaMixPages, concurrency,
   *                  pageTexts, documentLanguages, maxRetries, temperature,
   *                  abortSignal, aggregator, onTokenUsage)
   * @returns Correction statistics. The correction counts are the numbers of
   *          corrections returned by the model; `pagesProcessed` includes
   *          pages whose VLM call failed.
   */
  async correctAndSave(outputDir, model, options) {
    this.logger.info("[VlmTextCorrector] Starting text correction...");
    const resultPath = (0, import_node_path4.join)(outputDir, "result.json");
    const doc = JSON.parse((0, import_node_fs3.readFileSync)(resultPath, "utf-8"));
    let pageNumbers = this.getPageNumbers(doc);
    if (pageNumbers.length === 0) {
      this.logger.info("[VlmTextCorrector] No pages to process");
      return {
        textCorrections: 0,
        cellCorrections: 0,
        pagesProcessed: 0,
        pagesFailed: 0
      };
    }

    // Optionally restrict processing to the pages flagged as Korean-Hanja mix.
    const hanjaPages = options?.koreanHanjaMixPages;
    if (hanjaPages && hanjaPages.length > 0) {
      const totalPageCount = pageNumbers.length;
      const hanjaSet = new Set(hanjaPages);
      pageNumbers = pageNumbers.filter((p) => hanjaSet.has(p));
      this.logger.info(
        `[VlmTextCorrector] Filtering to ${pageNumbers.length} Korean-Hanja mix pages out of ${totalPageCount} total`
      );
    }

    const concurrency = options?.concurrency ?? DEFAULT_CONCURRENCY;
    this.logger.info(
      `[VlmTextCorrector] Processing ${pageNumbers.length} pages (concurrency: ${concurrency})...`
    );
    const results = await ConcurrentPool.run(
      pageNumbers,
      concurrency,
      (pageNo) => this.correctPage(outputDir, doc, pageNo, model, options),
      () => {
        if (options?.onTokenUsage && options?.aggregator) {
          options.onTokenUsage(
            options.aggregator.getReport()
          );
        }
      }
    );

    // results[i] belongs to pageNumbers[i]; null marks a failed page whose
    // OCR text is left untouched.
    let totalTextCorrections = 0;
    let totalCellCorrections = 0;
    let pagesFailed = 0;
    for (let i = 0; i < pageNumbers.length; i++) {
      const corrections = results[i];
      if (corrections === null) {
        pagesFailed++;
        continue;
      }
      totalTextCorrections += corrections.tc.length;
      totalCellCorrections += corrections.cc.length;
      this.applyCorrections(doc, pageNumbers[i], corrections);
    }

    (0, import_node_fs3.writeFileSync)(resultPath, JSON.stringify(doc, null, 2));
    this.logger.info(
      `[VlmTextCorrector] Correction complete: ${totalTextCorrections} text, ${totalCellCorrections} cell corrections across ${pageNumbers.length} pages (${pagesFailed} failed)`
    );
    return {
      textCorrections: totalTextCorrections,
      cellCorrections: totalCellCorrections,
      pagesProcessed: pageNumbers.length,
      pagesFailed
    };
  }

  /**
   * Get sorted page numbers from the document.
   * A document without a `pages` object yields an empty list.
   */
  getPageNumbers(doc) {
    return Object.values(doc.pages ?? {})
      .map((p) => p.page_no)
      .sort((a, b) => a - b);
  }

  /**
   * Correct text on a single page via VLM.
   *
   * @returns The model output ({ tc, cc }), an empty result when the page has
   *          no text or tables, or null if the VLM call fails (graceful
   *          degradation). Errors are rethrown when the abort signal fired.
   */
  async correctPage(outputDir, doc, pageNo, model, options) {
    try {
      const pageTexts = this.getPageTexts(doc, pageNo);
      const pageTables = this.getPageTables(doc, pageNo);
      if (pageTexts.length === 0 && pageTables.length === 0) {
        this.logger.debug(
          `[VlmTextCorrector] Page ${pageNo}: no text content, skipping`
        );
        return { tc: [], cc: [] };
      }

      const imageBase64 = this.readPageImage(outputDir, pageNo);

      // When a PDF text layer is available for this page, pair its blocks
      // with the OCR elements; leftover blocks become table context.
      const pageText = options?.pageTexts?.get(pageNo);
      let references;
      let tableContext;
      if (pageText) {
        const { references: refs, unusedBlocks } = this.matchTextToReferenceWithUnused(pageTexts, pageText);
        references = refs;
        if (pageTables.length > 0 && unusedBlocks.length > 0) {
          tableContext = unusedBlocks.join("\n");
        }
      }

      const userPrompt = this.buildUserPrompt(
        pageTexts,
        pageTables,
        references,
        tableContext
      );
      const systemPrompt = this.buildLanguageAwareSystemPrompt(
        options?.documentLanguages
      );
      const fullPrompt = systemPrompt + "\n\n" + userPrompt;

      const result = await LLMCaller.callVision({
        schema: vlmTextCorrectionSchema,
        messages: [
          {
            role: "user",
            content: [
              {
                type: "text",
                text: fullPrompt
              },
              {
                type: "image",
                image: `data:image/png;base64,${imageBase64}`
              }
            ]
          }
        ],
        primaryModel: model,
        maxRetries: options?.maxRetries ?? DEFAULT_MAX_RETRIES,
        temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
        abortSignal: options?.abortSignal,
        component: "VlmTextCorrector",
        phase: "text-correction"
      });
      if (options?.aggregator) {
        options.aggregator.track(result.usage);
      }

      const output = result.output;
      if (output.tc.length > 0 || output.cc.length > 0) {
        this.logger.debug(
          `[VlmTextCorrector] Page ${pageNo}: ${output.tc.length} text, ${output.cc.length} cell corrections`
        );
      }
      return output;
    } catch (error) {
      // An abort must stop the whole run instead of being treated as a
      // per-page failure.
      if (options?.abortSignal?.aborted) {
        throw error;
      }
      this.logger.warn(
        `[VlmTextCorrector] Page ${pageNo}: VLM correction failed, keeping OCR text`,
        error
      );
      return null;
    }
  }

  /**
   * Get text items on a specific page, with their indices into doc.texts.
   * Only items whose label is in TEXT_LABELS are returned. The position of an
   * entry in the returned array is the index used in the prompt.
   */
  getPageTexts(doc, pageNo) {
    const results = [];
    for (let i = 0; i < doc.texts.length; i++) {
      const item = doc.texts[i];
      if (!TEXT_LABELS.has(item.label)) continue;
      if (item.prov.some((p) => p.page_no === pageNo)) {
        results.push({ index: i, item });
      }
    }
    return results;
  }

  /**
   * Get table items on a specific page, with their indices into doc.tables.
   */
  getPageTables(doc, pageNo) {
    const results = [];
    for (let i = 0; i < doc.tables.length; i++) {
      const item = doc.tables[i];
      if (item.prov.some((p) => p.page_no === pageNo)) {
        results.push({ index: i, item });
      }
    }
    return results;
  }

  /**
   * Build the compact user prompt for a page.
   *
   * Format:
   *   T:
   *   <promptIndex>|<typeCode>|<text>
   *   <promptIndex>|ref|<reference text>      (only when a reference matched)
   *   C:
   *   <tablePromptIndex>|<row>,<col>|<cell text>
   *   C_REF:
   *   <table context>                         (only when tableContext is given)
   */
  buildUserPrompt(pageTexts, pageTables, references, tableContext) {
    const parts = [];
    if (pageTexts.length > 0) {
      const textLines = [];
      pageTexts.forEach((entry, promptIndex) => {
        const typeCode = LABEL_TO_TYPE_CODE[entry.item.label] ?? "tx";
        textLines.push(`${promptIndex}|${typeCode}|${entry.item.text}`);
        const ref = references?.get(promptIndex);
        if (ref) {
          textLines.push(`${promptIndex}|ref|${ref}`);
        }
      });
      parts.push("T:\n" + textLines.join("\n"));
    }
    if (pageTables.length > 0) {
      const cellLines = [];
      pageTables.forEach(({ item: table }, tablePromptIndex) => {
        for (const cell of table.data.table_cells) {
          // Empty cells carry nothing to correct.
          if (!cell.text || cell.text.trim().length === 0) continue;
          cellLines.push(
            `${tablePromptIndex}|${cell.start_row_offset_idx},${cell.start_col_offset_idx}|${cell.text}`
          );
        }
      });
      if (cellLines.length > 0) {
        const cellSection = "C:\n" + cellLines.join("\n");
        parts.push(
          tableContext ? cellSection + "\nC_REF:\n" + tableContext : cellSection
        );
      }
    }
    return parts.join("\n");
  }

  /**
   * Build a language-aware system prompt by prepending language context.
   * The first entry of documentLanguages is treated as the primary language.
   * Without any languages the base system prompt is returned unchanged.
   */
  buildLanguageAwareSystemPrompt(documentLanguages) {
    if (!documentLanguages?.length) {
      return TEXT_CORRECTION_SYSTEM_PROMPT;
    }
    // Display names are keyed by the base code (the part before the first "-");
    // unknown codes are shown as given.
    const displayName = (code) => LANGUAGE_DISPLAY_NAMES[code.split("-")[0]] ?? code;
    const primaryName = displayName(documentLanguages[0]);
    const otherNames = documentLanguages.slice(1).map(displayName);
    const languageDesc = otherNames.length > 0
      ? `primarily written in ${primaryName}, with ${otherNames.join(", ")} also present`
      : `written in ${primaryName}`;
    const prefix = `LANGUAGE CONTEXT: This document is ${languageDesc}. Focus on correcting characters that do not match this language.\n\n`;
    return prefix + TEXT_CORRECTION_SYSTEM_PROMPT;
  }

  /**
   * Match pdftotext paragraph blocks to OCR elements using character multiset overlap.
   * Returns a map from prompt index to the best-matching reference block.
   */
  matchTextToReference(pageTexts, pageText) {
    return this.matchTextToReferenceWithUnused(pageTexts, pageText).references;
  }

  /**
   * Match pdftotext paragraph blocks to OCR elements and also return unused blocks.
   *
   * Matching is greedy in prompt order: each OCR element takes the
   * still-available block with the highest overlap score, provided the score
   * reaches REFERENCE_MATCH_THRESHOLD. A block identical to the OCR text is
   * consumed but not recorded as a reference.
   *
   * @returns { references, unusedBlocks } where unusedBlocks keeps the
   *          original block order
   */
  matchTextToReferenceWithUnused(pageTexts, pageText) {
    const references = /* @__PURE__ */ new Map();
    const refBlocks = this.mergeIntoBlocks(pageText);
    if (refBlocks.length === 0) {
      return { references, unusedBlocks: [] };
    }
    const available = new Set(refBlocks.map((_, i) => i));
    for (let promptIndex = 0; promptIndex < pageTexts.length; promptIndex++) {
      const ocrText = pageTexts[promptIndex].item.text;
      let bestScore = 0;
      let bestBlockIndex = -1;
      for (const blockIndex of available) {
        const score = this.computeCharOverlap(ocrText, refBlocks[blockIndex]);
        if (score > bestScore) {
          bestScore = score;
          bestBlockIndex = blockIndex;
        }
      }
      if (bestBlockIndex >= 0 && bestScore >= REFERENCE_MATCH_THRESHOLD) {
        if (refBlocks[bestBlockIndex] !== ocrText) {
          references.set(promptIndex, refBlocks[bestBlockIndex]);
        }
        available.delete(bestBlockIndex);
      }
    }
    const unusedBlocks = [...available].sort((a, b) => a - b).map((i) => refBlocks[i]);
    return { references, unusedBlocks };
  }

  /**
   * Merge pdftotext output into paragraph blocks separated by blank lines.
   * Consecutive non-empty lines are trimmed and joined with a space.
   */
  mergeIntoBlocks(pageText) {
    const blocks = [];
    let currentLines = [];
    const flush = () => {
      if (currentLines.length > 0) {
        blocks.push(currentLines.join(" "));
        currentLines = [];
      }
    };
    for (const rawLine of pageText.split("\n")) {
      const trimmed = rawLine.trim();
      if (trimmed.length === 0) {
        flush();
      } else {
        currentLines.push(trimmed);
      }
    }
    flush();
    return blocks;
  }

  /**
   * Compute character multiset overlap ratio between two strings.
   * Returns a value between 0.0 and 1.0.
   *
   * Characters are counted as Unicode code points in both the overlap and the
   * length used for normalisation, so characters outside the Basic
   * Multilingual Plane (e.g. rare CJK ideographs) are not under-weighted.
   */
  computeCharOverlap(a, b) {
    if (a.length === 0 || b.length === 0) return 0;
    const countChars = (s) => {
      const freq = /* @__PURE__ */ new Map();
      let total = 0;
      for (const ch of s) {
        freq.set(ch, (freq.get(ch) ?? 0) + 1);
        total++;
      }
      return { freq, total };
    };
    const left = countChars(a);
    const right = countChars(b);
    let overlap = 0;
    for (const [ch, countA] of left.freq) {
      overlap += Math.min(countA, right.freq.get(ch) ?? 0);
    }
    return overlap / Math.max(left.total, right.total);
  }

  /**
   * Read page image as base64.
   * Page images are 0-indexed: page_no N -> pages/page_{N-1}.png
   */
  readPageImage(outputDir, pageNo) {
    const imagePath = (0, import_node_path4.join)(outputDir, "pages", `page_${pageNo - 1}.png`);
    return (0, import_node_fs3.readFileSync)(imagePath).toString("base64");
  }

  /**
   * Apply find/replace substitutions to a text, left to right.
   *
   * Each substitution is searched from the end of the previous replacement so
   * it cannot match inside text that an earlier substitution just inserted.
   * If nothing is found there (the model returned the pairs out of order) the
   * whole string is searched. Empty find strings are skipped, because they
   * would match at position 0 and merely prepend the replacement.
   *
   * @param text - Text to correct
   * @param substitutions - Array of { f: find, r: replacement }
   * @param pageNo - Page number, used in warnings only
   * @param textIndex - Prompt index of the text element, used in warnings only
   * @returns The corrected text
   */
  applySubstitutions(text, substitutions, pageNo, textIndex) {
    let result = text;
    let cursor = 0;
    for (const { f: find, r: replacement } of substitutions) {
      if (find.length === 0) {
        this.logger.warn(
          `[VlmTextCorrector] Page ${pageNo}, text ${textIndex}: empty find string, skipping substitution`
        );
        continue;
      }
      let at = result.indexOf(find, cursor);
      if (at < 0) {
        at = result.indexOf(find);
      }
      if (at < 0) {
        this.logger.warn(
          `[VlmTextCorrector] Page ${pageNo}, text ${textIndex}: find string not found, skipping substitution`
        );
        continue;
      }
      result = result.substring(0, at) + replacement + result.substring(at + find.length);
      cursor = at + replacement.length;
    }
    return result;
  }

  /**
   * Apply VLM corrections to the DoclingDocument (mutates `doc`).
   *
   * Text corrections address elements by their prompt index on the page;
   * out-of-range indices are ignored. A changed text is written to both
   * `text` and `orig`. Cell corrections update the first table cell whose
   * start row/column offsets match, and the grid cell at the same position.
   */
  applyCorrections(doc, pageNo, corrections) {
    if (corrections.tc.length > 0) {
      const pageTexts = this.getPageTexts(doc, pageNo);
      for (const correction of corrections.tc) {
        if (correction.i < 0 || correction.i >= pageTexts.length) continue;
        const target = doc.texts[pageTexts[correction.i].index];
        const corrected = this.applySubstitutions(
          target.text,
          correction.s,
          pageNo,
          correction.i
        );
        if (corrected !== target.text) {
          target.text = corrected;
          target.orig = corrected;
        }
      }
    }
    if (corrections.cc.length > 0) {
      const pageTables = this.getPageTables(doc, pageNo);
      for (const correction of corrections.cc) {
        if (correction.ti < 0 || correction.ti >= pageTables.length) continue;
        const table = pageTables[correction.ti].item;
        const cell = table.data.table_cells.find(
          (candidate) => candidate.start_row_offset_idx === correction.r && candidate.start_col_offset_idx === correction.c
        );
        if (cell) {
          cell.text = correction.t;
        }
        const gridCell = table.data.grid[correction.r]?.[correction.c];
        if (gridCell) {
          gridCell.text = correction.t;
        }
      }
    }
  }
};
1818
+
1819
+ // src/samplers/ocr-strategy-sampler.ts
1820
+ var import_model = require("@heripo/model");
1821
+ var import_node_fs4 = require("fs");
1822
+ var import_v42 = require("zod/v4");
1823
/** DPI used when rendering pages for strategy sampling. */
var SAMPLE_DPI = 150;
var EDGE_TRIM_RATIO = 0.1;
/** Default upper bound on sampled pages when `options.maxSamplePages` is not given. */
var DEFAULT_MAX_SAMPLE_PAGES = 15;
var DEFAULT_MAX_RETRIES2 = 3;
/**
 * Matches a Hanja / CJK ideograph in the Basic Multilingual Plane:
 *   U+3400-U+4DBF  CJK Unified Ideographs Extension A
 *   U+4E00-U+9FFF  CJK Unified Ideographs
 *   U+F900-U+FAFF  CJK Compatibility Ideographs
 * The compatibility block matters for Korean text: Hanja that legacy Korean
 * character sets encode more than once are mapped there, so a text layer can
 * contain Hanja that the U+4E00-U+9FFF range alone does not match.
 */
var CJK_REGEX = /[\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]/;
/** Matches a precomposed Hangul syllable (U+AC00-U+D7AF). */
var HANGUL_REGEX = /[\uAC00-\uD7AF]/;
1829
/**
 * Schema of the structured VLM response for a sampled page:
 * a Hanja-presence flag plus the languages detected on the page.
 */
var koreanHanjaMixSchema = (() => {
  const { z } = import_v42;
  const hasKoreanHanjaMix = z.boolean().describe(
    "Whether the page contains any Hanja (\u6F22\u5B57/Chinese characters) mixed with Korean text"
  );
  const detectedLanguages = z.array(z.string()).describe(
    'BCP 47 language tags of languages found on this page, ordered by prevalence (e.g., ["ko-KR", "en-US"])'
  );
  return z.object({ hasKoreanHanjaMix, detectedLanguages });
})();
1837
+ var KOREAN_HANJA_MIX_PROMPT = `Look at this page image carefully. Does it contain any Hanja (\u6F22\u5B57/Chinese characters) mixed with Korean text?
1838
+
1839
+ Hanja examples: \u907A\u8E5F, \u767C\u6398, \u8ABF\u67FB, \u5831\u544A\u66F8, \u6587\u5316\u8CA1
1840
+ Note: Hanja are Chinese characters used in Korean documents, different from modern Korean (\uD55C\uAE00).
1841
+
1842
+ Answer whether any Hanja characters are present on this page.
1843
+
1844
+ Also identify all languages present on this page. Return an array of BCP 47 language tags ordered by prevalence (primary language first).
1845
+ Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-TW", "en-US"]`;
1846
+ var OcrStrategySampler = class {
1847
+ logger;
1848
+ pageRenderer;
1849
+ textExtractor;
1850
+ constructor(logger, pageRenderer, textExtractor) {
1851
+ this.logger = logger;
1852
+ this.pageRenderer = pageRenderer;
1853
+ this.textExtractor = textExtractor ?? new PdfTextExtractor(logger);
1854
+ }
1855
+ /**
1856
+ * Sample pages from a PDF and determine the OCR strategy.
1857
+ *
1858
+ * @param pdfPath - Path to the PDF file
1859
+ * @param outputDir - Directory for temporary rendered pages
1860
+ * @param model - Vision language model for Korean-Hanja mix detection
1861
+ * @param options - Sampling options
1862
+ * @returns OcrStrategy with method ('ocrmac' or 'vlm') and metadata
1863
+ */
1864
+ async sample(pdfPath, outputDir, model, options) {
1865
+ const maxSamplePages = options?.maxSamplePages ?? DEFAULT_MAX_SAMPLE_PAGES;
1866
+ this.logger.info("[OcrStrategySampler] Starting OCR strategy sampling...");
1867
+ const preCheckResult = await this.preCheckHanjaFromTextLayer(pdfPath);
1868
+ if (preCheckResult) {
1869
+ return preCheckResult;
1870
+ }
1871
+ const renderResult = await this.pageRenderer.renderPages(
1872
+ pdfPath,
1873
+ outputDir,
1874
+ { dpi: SAMPLE_DPI }
1875
+ );
1876
+ if (renderResult.pageCount === 0) {
1877
+ this.logger.info("[OcrStrategySampler] No pages found in PDF");
1878
+ return {
1879
+ method: "ocrmac",
1880
+ reason: "No pages found in PDF",
1881
+ sampledPages: 0,
1882
+ totalPages: 0
1883
+ };
1884
+ }
1885
+ const sampleIndices = this.selectSamplePages(
1886
+ renderResult.pageCount,
1887
+ maxSamplePages
1888
+ );
1889
+ this.logger.info(
1890
+ `[OcrStrategySampler] Sampling ${sampleIndices.length} of ${renderResult.pageCount} pages: [${sampleIndices.map((i) => i + 1).join(", ")}]`
1891
+ );
1892
+ let sampledCount = 0;
1893
+ const languageFrequency = /* @__PURE__ */ new Map();
1894
+ for (const idx of sampleIndices) {
1895
+ sampledCount++;
1896
+ const pageFile = renderResult.pageFiles[idx];
1897
+ const pageAnalysis = await this.analyzeSamplePage(
1898
+ pageFile,
1899
+ idx + 1,
1900
+ model,
1901
+ options
1902
+ );
1903
+ for (const lang of pageAnalysis.detectedLanguages) {
1904
+ languageFrequency.set(lang, (languageFrequency.get(lang) ?? 0) + 1);
1905
+ }
1906
+ if (pageAnalysis.hasKoreanHanjaMix) {
1907
+ this.logger.info(
1908
+ `[OcrStrategySampler] Korean-Hanja mix detected on page ${idx + 1} \u2192 VLM strategy`
1909
+ );
1910
+ const detectedLanguages2 = this.aggregateLanguages(languageFrequency);
1911
+ return {
1912
+ method: "vlm",
1913
+ detectedLanguages: detectedLanguages2,
1914
+ reason: `Korean-Hanja mix detected on page ${idx + 1}`,
1915
+ sampledPages: sampledCount,
1916
+ totalPages: renderResult.pageCount
1917
+ };
1918
+ }
1919
+ }
1920
+ this.logger.info(
1921
+ "[OcrStrategySampler] No Korean-Hanja mix detected \u2192 ocrmac strategy"
1922
+ );
1923
+ const detectedLanguages = this.aggregateLanguages(languageFrequency);
1924
+ return {
1925
+ method: "ocrmac",
1926
+ detectedLanguages,
1927
+ reason: `No Korean-Hanja mix detected in ${sampledCount} sampled pages`,
1928
+ sampledPages: sampledCount,
1929
+ totalPages: renderResult.pageCount
1930
+ };
1931
+ }
1932
+ /**
1933
+ * Pre-check for Hangul-Hanja mix in PDF text layer using pdftotext.
1934
+ * Extracts full document text in a single process and checks at document level.
1935
+ * Only makes a definitive decision for Korean (Hangul) documents:
1936
+ * - Hangul + Hanja (anywhere in document) → VLM (confirmed Korean-Hanja mix)
1937
+ * - Hangul only → ocrmac with ko-KR (confirmed Korean)
1938
+ * - No Hangul (English, Japanese, etc.) → null (delegates to VLM for language detection)
1939
+ */
1940
+ async preCheckHanjaFromTextLayer(pdfPath) {
1941
+ try {
1942
+ const totalPages = await this.textExtractor.getPageCount(pdfPath);
1943
+ if (totalPages === 0) return null;
1944
+ const fullText = await this.textExtractor.extractFullText(pdfPath);
1945
+ if (fullText.trim().length === 0) {
1946
+ this.logger.debug(
1947
+ "[OcrStrategySampler] No Hangul in text layer, falling back to VLM sampling"
1948
+ );
1949
+ return null;
1950
+ }
1951
+ const hasHangul = HANGUL_REGEX.test(fullText);
1952
+ const hasHanja = CJK_REGEX.test(fullText);
1953
+ if (!hasHangul) {
1954
+ this.logger.debug(
1955
+ "[OcrStrategySampler] No Hangul in text layer, falling back to VLM sampling"
1956
+ );
1957
+ return null;
1958
+ }
1959
+ if (hasHanja) {
1960
+ const pageTextArray = fullText.split("\f");
1961
+ const koreanHanjaMixPages = [];
1962
+ for (let i = 0; i < pageTextArray.length; i++) {
1963
+ if (CJK_REGEX.test(pageTextArray[i])) {
1964
+ koreanHanjaMixPages.push(i + 1);
1965
+ }
1966
+ }
1967
+ this.logger.info(
1968
+ `[OcrStrategySampler] Hangul-Hanja mix detected in text layer \u2192 VLM strategy (${koreanHanjaMixPages.length} pages with Hanja)`
1969
+ );
1970
+ return {
1971
+ method: "vlm",
1972
+ detectedLanguages: ["ko-KR"],
1973
+ reason: "Hangul-Hanja mix found in PDF text layer",
1974
+ koreanHanjaMixPages,
1975
+ sampledPages: totalPages,
1976
+ totalPages
1977
+ };
1978
+ }
1979
+ this.logger.info(
1980
+ "[OcrStrategySampler] No Hangul-Hanja mix in text layer \u2192 ocrmac strategy"
1981
+ );
1982
+ return {
1983
+ method: "ocrmac",
1984
+ detectedLanguages: ["ko-KR"],
1985
+ reason: `No Hangul-Hanja mix in PDF text layer (${totalPages} pages checked)`,
1986
+ sampledPages: totalPages,
1987
+ totalPages
1988
+ };
1989
+ } catch {
1990
+ this.logger.debug(
1991
+ "[OcrStrategySampler] Text layer pre-check failed, falling back to VLM sampling"
1992
+ );
1993
+ return null;
1994
+ }
1995
+ }
1996
+ /**
1997
+ * Select page indices for sampling.
1998
+ * Trims front/back edges and distributes samples evenly.
1999
+ *
2000
+ * @param totalPages - Total number of pages
2001
+ * @param maxSamples - Maximum number of samples
2002
+ * @returns Array of 0-based page indices
2003
+ */
2004
+ selectSamplePages(totalPages, maxSamples) {
2005
+ if (totalPages === 0) return [];
2006
+ if (totalPages <= maxSamples) {
2007
+ return Array.from({ length: totalPages }, (_, i) => i);
2008
+ }
2009
+ const trimCount = Math.max(1, Math.ceil(totalPages * EDGE_TRIM_RATIO));
2010
+ const start = trimCount;
2011
+ const end = totalPages - trimCount;
2012
+ const eligibleCount = end - start;
2013
+ if (eligibleCount <= 0) {
2014
+ return [Math.floor(totalPages / 2)];
2015
+ }
2016
+ if (eligibleCount <= maxSamples) {
2017
+ return Array.from({ length: eligibleCount }, (_, i) => start + i);
2018
+ }
2019
+ const indices = [];
2020
+ const step = eligibleCount / maxSamples;
2021
+ for (let i = 0; i < maxSamples; i++) {
2022
+ indices.push(start + Math.floor(i * step));
2023
+ }
2024
+ return indices;
2025
+ }
2026
+ /**
2027
+ * Analyze a single sample page for Korean-Hanja mixed script and primary language.
2028
+ * Normalizes raw VLM language responses to valid BCP 47 tags, filtering out invalid ones.
2029
+ *
2030
+ * @returns Object with Korean-Hanja detection result and normalized detected languages
2031
+ */
2032
+ async analyzeSamplePage(pageFile, pageNo, model, options) {
2033
+ this.logger.debug(
2034
+ `[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
2035
+ );
2036
+ const base64Image = (0, import_node_fs4.readFileSync)(pageFile).toString("base64");
2037
+ const messages = [
2038
+ {
2039
+ role: "user",
2040
+ content: [
2041
+ { type: "text", text: KOREAN_HANJA_MIX_PROMPT },
2042
+ {
2043
+ type: "image",
2044
+ image: `data:image/png;base64,${base64Image}`
2045
+ }
2046
+ ]
2047
+ }
2048
+ ];
2049
+ const result = await LLMCaller.callVision({
2050
+ schema: koreanHanjaMixSchema,
2051
+ messages,
2052
+ primaryModel: model,
2053
+ fallbackModel: options?.fallbackModel,
2054
+ maxRetries: options?.maxRetries ?? DEFAULT_MAX_RETRIES2,
2055
+ temperature: options?.temperature ?? 0,
2056
+ abortSignal: options?.abortSignal,
2057
+ component: "OcrStrategySampler",
2058
+ phase: "korean-hanja-mix-detection"
2059
+ });
2060
+ if (options?.aggregator) {
2061
+ options.aggregator.track(result.usage);
2062
+ }
2063
+ const output = result.output;
2064
+ const normalizedLanguages = output.detectedLanguages.map(import_model.normalizeToBcp47).filter((tag) => tag !== null);
2065
+ this.logger.debug(
2066
+ `[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${normalizedLanguages.join(",")}`
2067
+ );
2068
+ return {
2069
+ hasKoreanHanjaMix: output.hasKoreanHanjaMix,
2070
+ detectedLanguages: normalizedLanguages
2071
+ };
2072
+ }
2073
+ /**
2074
+ * Aggregate language frequency map into a sorted array.
2075
+ * Returns languages sorted by frequency (descending), or undefined if empty.
2076
+ */
2077
+ aggregateLanguages(frequencyMap) {
2078
+ if (frequencyMap.size === 0) return void 0;
2079
+ return [...frequencyMap.entries()].sort((a, b) => b[1] - a[1]).map(([lang]) => lang);
2080
+ }
2081
+ };
2082
+
2083
+ // src/utils/local-file-server.ts
2084
+ var import_node_fs5 = require("fs");
2085
+ var import_node_http = require("http");
2086
+ var import_node_path5 = require("path");
2087
+ var LocalFileServer = class {
2088
+ server = null;
2089
+ port = 0;
2090
+ /**
2091
+ * Start serving a file and return the URL
2092
+ *
2093
+ * @param filePath Absolute path to the file to serve
2094
+ * @returns URL to access the file
2095
+ */
2096
+ async start(filePath) {
2097
+ const filename = (0, import_node_path5.basename)(filePath);
2098
+ const stat = (0, import_node_fs5.statSync)(filePath);
2099
+ return new Promise((resolve, reject) => {
2100
+ this.server = (0, import_node_http.createServer)((req, res) => {
2101
+ if (req.url === `/${filename}`) {
2102
+ res.writeHead(200, {
2103
+ "Content-Type": "application/pdf",
2104
+ "Content-Length": stat.size
2105
+ });
2106
+ (0, import_node_fs5.createReadStream)(filePath).pipe(res);
2107
+ } else {
2108
+ res.writeHead(404);
2109
+ res.end("Not Found");
2110
+ }
2111
+ });
2112
+ this.server.on("error", reject);
2113
+ this.server.listen(0, "127.0.0.1", () => {
2114
+ const address = this.server.address();
2115
+ if (typeof address === "object" && address !== null) {
2116
+ this.port = address.port;
2117
+ resolve(`http://127.0.0.1:${this.port}/${filename}`);
2118
+ } else {
2119
+ reject(new Error("Failed to get server address"));
2120
+ }
2121
+ });
2122
+ });
925
2123
  }
926
2124
  /**
927
2125
  * Stop the server
@@ -942,9 +2140,9 @@ var LocalFileServer = class {
942
2140
  };
943
2141
 
944
2142
  // src/core/image-pdf-converter.ts
945
- var import_node_fs3 = require("fs");
946
- var import_node_os2 = require("os");
947
- var import_node_path4 = require("path");
2143
+ var import_node_fs6 = require("fs");
2144
+ var import_node_os = require("os");
2145
+ var import_node_path6 = require("path");
948
2146
  var ImagePdfConverter = class {
949
2147
  constructor(logger) {
950
2148
  this.logger = logger;
@@ -959,9 +2157,9 @@ var ImagePdfConverter = class {
959
2157
  */
960
2158
  async convert(pdfUrl, reportId) {
961
2159
  const timestamp = Date.now();
962
- const tempDir = (0, import_node_os2.tmpdir)();
963
- const inputPath = (0, import_node_path4.join)(tempDir, `${reportId}-${timestamp}-input.pdf`);
964
- const outputPath = (0, import_node_path4.join)(tempDir, `${reportId}-${timestamp}-image.pdf`);
2160
+ const tempDir = (0, import_node_os.tmpdir)();
2161
+ const inputPath = (0, import_node_path6.join)(tempDir, `${reportId}-${timestamp}-input.pdf`);
2162
+ const outputPath = (0, import_node_path6.join)(tempDir, `${reportId}-${timestamp}-image.pdf`);
965
2163
  try {
966
2164
  this.logger.info("[ImagePdfConverter] Downloading PDF from URL...");
967
2165
  await this.downloadPdf(pdfUrl, inputPath);
@@ -970,8 +2168,8 @@ var ImagePdfConverter = class {
970
2168
  this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
971
2169
  return outputPath;
972
2170
  } finally {
973
- if ((0, import_node_fs3.existsSync)(inputPath)) {
974
- (0, import_node_fs3.rmSync)(inputPath, { force: true });
2171
+ if ((0, import_node_fs6.existsSync)(inputPath)) {
2172
+ (0, import_node_fs6.rmSync)(inputPath, { force: true });
975
2173
  }
976
2174
  }
977
2175
  }
@@ -1018,22 +2216,17 @@ var ImagePdfConverter = class {
1018
2216
  * Cleanup the temporary image PDF file
1019
2217
  */
1020
2218
  cleanup(imagePdfPath) {
1021
- if ((0, import_node_fs3.existsSync)(imagePdfPath)) {
2219
+ if ((0, import_node_fs6.existsSync)(imagePdfPath)) {
1022
2220
  this.logger.info(
1023
2221
  "[ImagePdfConverter] Cleaning up temp file:",
1024
2222
  imagePdfPath
1025
2223
  );
1026
- (0, import_node_fs3.rmSync)(imagePdfPath, { force: true });
2224
+ (0, import_node_fs6.rmSync)(imagePdfPath, { force: true });
1027
2225
  }
1028
2226
  }
1029
2227
  };
1030
2228
 
1031
2229
  // src/core/pdf-converter.ts
1032
- var _origAssertValidConversionOptions = import_docling_sdk.ValidationUtils.assertValidConversionOptions.bind(import_docling_sdk.ValidationUtils);
1033
- import_docling_sdk.ValidationUtils.assertValidConversionOptions = (options) => {
1034
- const { pipeline: _pipeline, ...rest } = options;
1035
- _origAssertValidConversionOptions(rest);
1036
- };
1037
2230
  var PDFConverter = class {
1038
2231
  constructor(logger, client, enableImagePdfFallback = false, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
1039
2232
  this.logger = logger;
@@ -1043,9 +2236,232 @@ var PDFConverter = class {
1043
2236
  }
1044
2237
  async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
1045
2238
  this.logger.info("[PDFConverter] Converting:", url);
2239
+ if (options.forceImagePdf) {
2240
+ return this.convertViaImagePdf(
2241
+ url,
2242
+ reportId,
2243
+ onComplete,
2244
+ cleanupAfterCallback,
2245
+ options,
2246
+ abortSignal
2247
+ );
2248
+ }
2249
+ return this.convertWithFallback(
2250
+ url,
2251
+ reportId,
2252
+ onComplete,
2253
+ cleanupAfterCallback,
2254
+ options,
2255
+ abortSignal
2256
+ );
2257
+ }
2258
+ /**
2259
+ * Convert a PDF using OCR strategy sampling to decide between ocrmac and VLM.
2260
+ *
2261
+ * Flow:
2262
+ * 1. Determine strategy (forced, skipped, or sampled via VLM)
2263
+ * 2. If VLM → OCR pipeline + VlmTextCorrector (text correction)
2264
+ * 3. If ocrmac → existing Docling conversion
2265
+ *
2266
+ * @returns ConvertWithStrategyResult with the chosen strategy and token report
2267
+ */
2268
+ async convertWithStrategy(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
2269
+ this.logger.info("[PDFConverter] Starting strategy-based conversion:", url);
2270
+ const aggregator = options.aggregator ?? new LLMTokenUsageAggregator();
2271
+ const trackedOptions = { ...options, aggregator };
2272
+ const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
2273
+ const strategy = await this.determineStrategy(
2274
+ pdfPath,
2275
+ reportId,
2276
+ trackedOptions,
2277
+ abortSignal
2278
+ );
2279
+ this.logger.info(
2280
+ `[PDFConverter] OCR strategy: ${strategy.method} (${strategy.reason})`
2281
+ );
2282
+ if (trackedOptions.onTokenUsage) {
2283
+ const samplingReport = this.buildTokenReport(aggregator);
2284
+ if (samplingReport) {
2285
+ trackedOptions.onTokenUsage(samplingReport);
2286
+ }
2287
+ }
2288
+ if (strategy.method === "vlm") {
2289
+ await this.convertWithVlm(
2290
+ pdfPath,
2291
+ reportId,
2292
+ onComplete,
2293
+ cleanupAfterCallback,
2294
+ trackedOptions,
2295
+ abortSignal,
2296
+ strategy.detectedLanguages,
2297
+ strategy.koreanHanjaMixPages
2298
+ );
2299
+ return {
2300
+ strategy,
2301
+ tokenUsageReport: this.buildTokenReport(aggregator)
2302
+ };
2303
+ }
2304
+ const ocrmacOptions = strategy.detectedLanguages ? { ...trackedOptions, ocr_lang: strategy.detectedLanguages } : trackedOptions;
2305
+ await this.convert(
2306
+ url,
2307
+ reportId,
2308
+ onComplete,
2309
+ cleanupAfterCallback,
2310
+ ocrmacOptions,
2311
+ abortSignal
2312
+ );
2313
+ return {
2314
+ strategy,
2315
+ tokenUsageReport: this.buildTokenReport(aggregator)
2316
+ };
2317
+ }
2318
+ /**
2319
+ * Build a token usage report from the aggregator.
2320
+ * Returns null when no LLM calls were tracked (e.g. forced ocrmac without sampling).
2321
+ */
2322
+ buildTokenReport(aggregator) {
2323
+ const report = aggregator.getReport();
2324
+ if (report.components.length === 0) {
2325
+ return null;
2326
+ }
2327
+ return report;
2328
+ }
2329
+ /**
2330
+ * Determine the OCR strategy based on options and page sampling.
2331
+ *
2332
+ * When sampling is possible (strategySamplerModel + local file), it always
2333
+ * runs — even with forcedMethod — so that detectedLanguages are available
2334
+ * for OCR engine configuration. The forced method simply overrides the
2335
+ * sampled method choice.
2336
+ */
2337
+ async determineStrategy(pdfPath, reportId, options, abortSignal) {
2338
+ if (options.skipSampling || !options.strategySamplerModel || !pdfPath) {
2339
+ const method = options.forcedMethod ?? "ocrmac";
2340
+ const reason = options.forcedMethod ? `Forced: ${options.forcedMethod}` : !pdfPath ? "Non-local URL, sampling skipped" : "Sampling skipped";
2341
+ return { method, reason, sampledPages: 0, totalPages: 0 };
2342
+ }
2343
+ const samplingDir = (0, import_node_path7.join)(process.cwd(), "output", reportId, "_sampling");
2344
+ const sampler = new OcrStrategySampler(
2345
+ this.logger,
2346
+ new PageRenderer(this.logger),
2347
+ new PdfTextExtractor(this.logger)
2348
+ );
2349
+ try {
2350
+ const strategy = await sampler.sample(
2351
+ pdfPath,
2352
+ samplingDir,
2353
+ options.strategySamplerModel,
2354
+ {
2355
+ aggregator: options.aggregator,
2356
+ abortSignal
2357
+ }
2358
+ );
2359
+ if (options.forcedMethod) {
2360
+ return {
2361
+ ...strategy,
2362
+ method: options.forcedMethod,
2363
+ reason: `Forced: ${options.forcedMethod} (${strategy.reason})`
2364
+ };
2365
+ }
2366
+ return strategy;
2367
+ } finally {
2368
+ if ((0, import_node_fs7.existsSync)(samplingDir)) {
2369
+ (0, import_node_fs7.rmSync)(samplingDir, { recursive: true, force: true });
2370
+ }
2371
+ }
2372
+ }
2373
+ /**
2374
+ * Execute VLM-enhanced PDF conversion.
2375
+ *
2376
+ * Runs the standard OCR pipeline (Docling) first, then applies VLM text
2377
+ * correction to fix garbled Chinese characters (漢字/Hanja) in OCR output.
2378
+ */
2379
+ async convertWithVlm(pdfPath, reportId, onComplete, cleanupAfterCallback, options, abortSignal, detectedLanguages, koreanHanjaMixPages) {
2380
+ if (!options.vlmProcessorModel) {
2381
+ throw new Error("vlmProcessorModel is required when OCR strategy is VLM");
2382
+ }
2383
+ if (!pdfPath) {
2384
+ throw new Error("VLM conversion requires a local file (file:// URL)");
2385
+ }
2386
+ const url = `file://${pdfPath}`;
2387
+ const wrappedCallback = async (outputDir) => {
2388
+ let pageTexts;
2389
+ try {
2390
+ const resultPath2 = (0, import_node_path7.join)(outputDir, "result.json");
2391
+ const doc = JSON.parse((0, import_node_fs7.readFileSync)(resultPath2, "utf-8"));
2392
+ const totalPages = Object.keys(doc.pages).length;
2393
+ const textExtractor = new PdfTextExtractor(this.logger);
2394
+ pageTexts = await textExtractor.extractText(pdfPath, totalPages);
2395
+ } catch {
2396
+ this.logger.warn(
2397
+ "[PDFConverter] pdftotext extraction failed, proceeding without text reference"
2398
+ );
2399
+ }
2400
+ const resultPath = (0, import_node_path7.join)(outputDir, "result.json");
2401
+ const ocrOriginPath = (0, import_node_path7.join)(outputDir, "result_ocr_origin.json");
2402
+ (0, import_node_fs7.copyFileSync)(resultPath, ocrOriginPath);
2403
+ const corrector = new VlmTextCorrector(this.logger);
2404
+ await corrector.correctAndSave(outputDir, options.vlmProcessorModel, {
2405
+ concurrency: options.vlmConcurrency,
2406
+ aggregator: options.aggregator,
2407
+ abortSignal,
2408
+ onTokenUsage: options.onTokenUsage,
2409
+ documentLanguages: detectedLanguages,
2410
+ pageTexts,
2411
+ koreanHanjaMixPages
2412
+ });
2413
+ await onComplete(outputDir);
2414
+ };
2415
+ const vlmOptions = detectedLanguages ? { ...options, ocr_lang: detectedLanguages } : options;
2416
+ await this.convert(
2417
+ url,
2418
+ reportId,
2419
+ wrappedCallback,
2420
+ cleanupAfterCallback,
2421
+ vlmOptions,
2422
+ abortSignal
2423
+ );
2424
+ this.logger.info("[PDFConverter] VLM conversion completed successfully");
2425
+ }
2426
+ /**
2427
+ * Convert by first creating an image PDF, then running the conversion.
2428
+ * Used when forceImagePdf option is enabled.
2429
+ */
2430
+ async convertViaImagePdf(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
2431
+ this.logger.info(
2432
+ "[PDFConverter] Force image PDF mode: converting to image PDF first..."
2433
+ );
2434
+ const imagePdfConverter = new ImagePdfConverter(this.logger);
2435
+ let imagePdfPath = null;
2436
+ try {
2437
+ imagePdfPath = await imagePdfConverter.convert(url, reportId);
2438
+ const localUrl = `file://${imagePdfPath}`;
2439
+ this.logger.info(
2440
+ "[PDFConverter] Image PDF ready, starting conversion:",
2441
+ localUrl
2442
+ );
2443
+ return await this.performConversion(
2444
+ localUrl,
2445
+ reportId,
2446
+ onComplete,
2447
+ cleanupAfterCallback,
2448
+ options,
2449
+ abortSignal
2450
+ );
2451
+ } finally {
2452
+ if (imagePdfPath) {
2453
+ imagePdfConverter.cleanup(imagePdfPath);
2454
+ }
2455
+ }
2456
+ }
2457
+ /**
2458
+ * Convert directly with optional image PDF fallback on failure.
2459
+ * Used by standard (OCR) pipeline.
2460
+ */
2461
+ async convertWithFallback(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
1046
2462
  let originalError = null;
1047
2463
  try {
1048
- await this.performConversion(
2464
+ return await this.performConversion(
1049
2465
  url,
1050
2466
  reportId,
1051
2467
  onComplete,
@@ -1053,7 +2469,6 @@ var PDFConverter = class {
1053
2469
  options,
1054
2470
  abortSignal
1055
2471
  );
1056
- return;
1057
2472
  } catch (error) {
1058
2473
  if (abortSignal?.aborted) {
1059
2474
  throw error;
@@ -1071,7 +2486,7 @@ var PDFConverter = class {
1071
2486
  imagePdfPath = await imagePdfConverter.convert(url, reportId);
1072
2487
  const localUrl = `file://${imagePdfPath}`;
1073
2488
  this.logger.info("[PDFConverter] Retrying with image PDF:", localUrl);
1074
- await this.performConversion(
2489
+ const report = await this.performConversion(
1075
2490
  localUrl,
1076
2491
  reportId,
1077
2492
  onComplete,
@@ -1080,6 +2495,7 @@ var PDFConverter = class {
1080
2495
  abortSignal
1081
2496
  );
1082
2497
  this.logger.info("[PDFConverter] Fallback conversion succeeded");
2498
+ return report;
1083
2499
  } catch (fallbackError) {
1084
2500
  this.logger.error(
1085
2501
  "[PDFConverter] Fallback conversion also failed:",
@@ -1094,15 +2510,10 @@ var PDFConverter = class {
1094
2510
  }
1095
2511
  async performConversion(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
1096
2512
  const startTime = Date.now();
1097
- const pipelineType = options.pipeline ?? "standard";
1098
- const conversionOptions = pipelineType === "vlm" ? this.buildVlmConversionOptions(options) : this.buildConversionOptions(options);
1099
- if (pipelineType === "vlm") {
1100
- this.logger.info("[PDFConverter] Using VLM pipeline");
1101
- } else {
1102
- this.logger.info(
1103
- `[PDFConverter] OCR languages: ${JSON.stringify(conversionOptions.ocr_options?.lang)}`
1104
- );
1105
- }
2513
+ const conversionOptions = this.buildConversionOptions(options);
2514
+ this.logger.info(
2515
+ `[PDFConverter] OCR languages: ${JSON.stringify(conversionOptions.ocr_options?.lang)}`
2516
+ );
1106
2517
  this.logger.info(
1107
2518
  "[PDFConverter] Converting document with Async Source API..."
1108
2519
  );
@@ -1130,11 +2541,12 @@ var PDFConverter = class {
1130
2541
  }
1131
2542
  }
1132
2543
  const cwd = process.cwd();
1133
- const zipPath = (0, import_node_path5.join)(cwd, "result.zip");
1134
- const extractDir = (0, import_node_path5.join)(cwd, "result_extracted");
1135
- const outputDir = (0, import_node_path5.join)(cwd, "output", reportId);
2544
+ const zipPath = (0, import_node_path7.join)(cwd, "result.zip");
2545
+ const extractDir = (0, import_node_path7.join)(cwd, "result_extracted");
2546
+ const outputDir = (0, import_node_path7.join)(cwd, "output", reportId);
1136
2547
  try {
1137
2548
  await this.processConvertedFiles(zipPath, extractDir, outputDir);
2549
+ await this.renderPageImages(url, outputDir);
1138
2550
  if (abortSignal?.aborted) {
1139
2551
  this.logger.info("[PDFConverter] Conversion aborted before callback");
1140
2552
  const error = new Error("PDF conversion was aborted");
@@ -1148,28 +2560,38 @@ var PDFConverter = class {
1148
2560
  this.logger.info("[PDFConverter] Total time:", duration, "ms");
1149
2561
  } finally {
1150
2562
  this.logger.info("[PDFConverter] Cleaning up temporary files...");
1151
- if ((0, import_node_fs4.existsSync)(zipPath)) {
1152
- (0, import_node_fs4.rmSync)(zipPath, { force: true });
2563
+ if ((0, import_node_fs7.existsSync)(zipPath)) {
2564
+ (0, import_node_fs7.rmSync)(zipPath, { force: true });
1153
2565
  }
1154
- if ((0, import_node_fs4.existsSync)(extractDir)) {
1155
- (0, import_node_fs4.rmSync)(extractDir, { recursive: true, force: true });
2566
+ if ((0, import_node_fs7.existsSync)(extractDir)) {
2567
+ (0, import_node_fs7.rmSync)(extractDir, { recursive: true, force: true });
1156
2568
  }
1157
2569
  if (cleanupAfterCallback) {
1158
2570
  this.logger.info(
1159
2571
  "[PDFConverter] Cleaning up output directory:",
1160
2572
  outputDir
1161
2573
  );
1162
- if ((0, import_node_fs4.existsSync)(outputDir)) {
1163
- (0, import_node_fs4.rmSync)(outputDir, { recursive: true, force: true });
2574
+ if ((0, import_node_fs7.existsSync)(outputDir)) {
2575
+ (0, import_node_fs7.rmSync)(outputDir, { recursive: true, force: true });
1164
2576
  }
1165
2577
  } else {
1166
2578
  this.logger.info("[PDFConverter] Output preserved at:", outputDir);
1167
2579
  }
1168
2580
  }
2581
+ return null;
1169
2582
  }
1170
2583
  buildConversionOptions(options) {
1171
2584
  return {
1172
- ...(0, import_es_toolkit.omit)(options, ["num_threads", "pipeline", "vlm_model"]),
2585
+ ...(0, import_es_toolkit.omit)(options, [
2586
+ "num_threads",
2587
+ "forceImagePdf",
2588
+ "strategySamplerModel",
2589
+ "vlmProcessorModel",
2590
+ "skipSampling",
2591
+ "forcedMethod",
2592
+ "aggregator",
2593
+ "onTokenUsage"
2594
+ ]),
1173
2595
  to_formats: ["json", "html"],
1174
2596
  image_export_mode: "embedded",
1175
2597
  ocr_engine: "ocrmac",
@@ -1180,6 +2602,8 @@ var PDFConverter = class {
1180
2602
  framework: "livetext"
1181
2603
  },
1182
2604
  generate_picture_images: true,
2605
+ generate_page_images: false,
2606
+ // Page images are rendered by PageRenderer (ImageMagick) after conversion
1183
2607
  images_scale: 2,
1184
2608
  /**
1185
2609
  * While disabling this option yields the most accurate text extraction for readable PDFs,
@@ -1195,31 +2619,6 @@ var PDFConverter = class {
1195
2619
  }
1196
2620
  };
1197
2621
  }
1198
- /**
1199
- * Build conversion options for VLM pipeline.
1200
- *
1201
- * VLM pipeline uses a Vision Language Model instead of traditional OCR,
1202
- * providing better accuracy for KCJ characters and complex layouts.
1203
- */
1204
- buildVlmConversionOptions(options) {
1205
- const vlmModel = resolveVlmModel(options.vlm_model ?? DEFAULT_VLM_MODEL);
1206
- this.logger.info(
1207
- `[PDFConverter] VLM model: ${vlmModel.repo_id} (framework: ${vlmModel.inference_framework}, format: ${vlmModel.response_format})`
1208
- );
1209
- return {
1210
- ...(0, import_es_toolkit.omit)(options, ["num_threads", "pipeline", "vlm_model", "ocr_lang"]),
1211
- to_formats: ["json", "html"],
1212
- image_export_mode: "embedded",
1213
- pipeline: "vlm",
1214
- vlm_pipeline_model_local: vlmModel,
1215
- generate_picture_images: true,
1216
- images_scale: 2,
1217
- accelerator_options: {
1218
- device: "mps",
1219
- num_threads: options.num_threads
1220
- }
1221
- };
1222
- }
1223
2622
  async startConversionTask(url, conversionOptions) {
1224
2623
  const task = await this.client.convertSourceAsync({
1225
2624
  sources: [
@@ -1286,25 +2685,64 @@ var PDFConverter = class {
1286
2685
  return;
1287
2686
  }
1288
2687
  if (status.task_status === "failure") {
1289
- throw new Error("Task failed with status: failure");
2688
+ const errorDetails = await this.getTaskFailureDetails(task);
2689
+ const elapsed = Math.round((Date.now() - conversionStartTime) / 1e3);
2690
+ this.logger.error(
2691
+ `
2692
+ [PDFConverter] Task failed after ${elapsed}s: ${errorDetails}`
2693
+ );
2694
+ throw new Error(`Task failed: ${errorDetails}`);
1290
2695
  }
1291
2696
  await new Promise(
1292
2697
  (resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
1293
2698
  );
1294
2699
  }
1295
2700
  }
2701
+ /**
2702
+ * Fetch detailed error information from a failed task result.
2703
+ */
2704
+ async getTaskFailureDetails(task) {
2705
+ try {
2706
+ const result = await task.getResult();
2707
+ if (result.errors?.length) {
2708
+ return result.errors.map((e) => e.message).join("; ");
2709
+ }
2710
+ return `status: ${result.status ?? "unknown"}`;
2711
+ } catch (err) {
2712
+ this.logger.error("[PDFConverter] Failed to retrieve task result:", err);
2713
+ return "unable to retrieve error details";
2714
+ }
2715
+ }
1296
2716
  async downloadResult(taskId) {
1297
2717
  this.logger.info(
1298
2718
  "\n[PDFConverter] Task completed, downloading ZIP file..."
1299
2719
  );
1300
2720
  const zipResult = await this.client.getTaskResultFile(taskId);
1301
- if (!zipResult.success || !zipResult.fileStream) {
1302
- throw new Error("Failed to get ZIP file result");
1303
- }
1304
- const zipPath = (0, import_node_path5.join)(process.cwd(), "result.zip");
2721
+ const zipPath = (0, import_node_path7.join)(process.cwd(), "result.zip");
1305
2722
  this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
1306
- const writeStream = (0, import_node_fs4.createWriteStream)(zipPath);
1307
- await (0, import_promises.pipeline)(zipResult.fileStream, writeStream);
2723
+ if (zipResult.fileStream) {
2724
+ const writeStream = (0, import_node_fs7.createWriteStream)(zipPath);
2725
+ await (0, import_promises2.pipeline)(zipResult.fileStream, writeStream);
2726
+ return;
2727
+ }
2728
+ if (zipResult.data) {
2729
+ await (0, import_promises.writeFile)(zipPath, zipResult.data);
2730
+ return;
2731
+ }
2732
+ this.logger.warn(
2733
+ "[PDFConverter] SDK file result unavailable, falling back to direct download..."
2734
+ );
2735
+ const baseUrl = this.client.getConfig().baseUrl;
2736
+ const response = await fetch(`${baseUrl}/v1/result/${taskId}`, {
2737
+ headers: { Accept: "application/zip" }
2738
+ });
2739
+ if (!response.ok) {
2740
+ throw new Error(
2741
+ `Failed to download ZIP file: ${response.status} ${response.statusText}`
2742
+ );
2743
+ }
2744
+ const buffer = new Uint8Array(await response.arrayBuffer());
2745
+ await (0, import_promises.writeFile)(zipPath, buffer);
1308
2746
  }
1309
2747
  async processConvertedFiles(zipPath, extractDir, outputDir) {
1310
2748
  await ImageExtractor.extractAndSaveDocumentsFromZip(
@@ -1314,6 +2752,40 @@ var PDFConverter = class {
1314
2752
  outputDir
1315
2753
  );
1316
2754
  }
2755
+ /**
2756
+ * Render page images from the source PDF using ImageMagick and update result.json.
2757
+ * Replaces Docling's generate_page_images which fails on large PDFs
2758
+ * due to memory limits when embedding all page images as base64.
2759
+ */
2760
+ async renderPageImages(url, outputDir) {
2761
+ if (!url.startsWith("file://")) {
2762
+ this.logger.warn(
2763
+ "[PDFConverter] Page image rendering skipped: only supported for local files (file:// URLs)"
2764
+ );
2765
+ return;
2766
+ }
2767
+ const pdfPath = url.slice(7);
2768
+ this.logger.info(
2769
+ "[PDFConverter] Rendering page images with ImageMagick..."
2770
+ );
2771
+ const renderer = new PageRenderer(this.logger);
2772
+ const renderResult = await renderer.renderPages(pdfPath, outputDir);
2773
+ const resultPath = (0, import_node_path7.join)(outputDir, "result.json");
2774
+ const doc = JSON.parse((0, import_node_fs7.readFileSync)(resultPath, "utf-8"));
2775
+ for (const page of Object.values(doc.pages)) {
2776
+ const pageNo = page.page_no;
2777
+ const fileIndex = pageNo - 1;
2778
+ if (fileIndex >= 0 && fileIndex < renderResult.pageCount) {
2779
+ page.image.uri = `pages/page_${fileIndex}.png`;
2780
+ page.image.mimetype = "image/png";
2781
+ page.image.dpi = 300;
2782
+ }
2783
+ }
2784
+ await (0, import_promises.writeFile)(resultPath, JSON.stringify(doc, null, 2));
2785
+ this.logger.info(
2786
+ `[PDFConverter] Rendered ${renderResult.pageCount} page images`
2787
+ );
2788
+ }
1317
2789
  };
1318
2790
 
1319
2791
  // src/core/pdf-parser.ts
@@ -1344,7 +2816,7 @@ var PDFParser = class {
1344
2816
  this.baseUrl = void 0;
1345
2817
  }
1346
2818
  this.timeout = timeout;
1347
- this.venvPath = venvPath || (0, import_node_path6.join)(process.cwd(), ".venv");
2819
+ this.venvPath = venvPath || (0, import_node_path8.join)(process.cwd(), ".venv");
1348
2820
  this.killExistingProcess = killExistingProcess;
1349
2821
  this.enableImagePdfFallback = enableImagePdfFallback;
1350
2822
  }
@@ -1352,6 +2824,7 @@ var PDFParser = class {
1352
2824
  this.logger.info("[PDFParser] Initializing...");
1353
2825
  this.checkOperatingSystem();
1354
2826
  this.checkJqInstalled();
2827
+ this.checkPopplerInstalled();
1355
2828
  this.checkMacOSVersion();
1356
2829
  if (this.enableImagePdfFallback && !this.baseUrl) {
1357
2830
  this.checkImageMagickInstalled();
@@ -1363,7 +2836,7 @@ var PDFParser = class {
1363
2836
  }
1364
2837
  if (this.baseUrl) {
1365
2838
  this.logger.info("[PDFParser] Using external server:", this.baseUrl);
1366
- this.client = new import_docling_sdk2.Docling({
2839
+ this.client = new import_docling_sdk.Docling({
1367
2840
  api: { baseUrl: this.baseUrl, timeout: this.timeout }
1368
2841
  });
1369
2842
  await this.waitForServerReady();
@@ -1379,7 +2852,7 @@ var PDFParser = class {
1379
2852
  });
1380
2853
  await this.environment.setup();
1381
2854
  const clientUrl = `http://localhost:${this.port}`;
1382
- this.client = new import_docling_sdk2.Docling({
2855
+ this.client = new import_docling_sdk.Docling({
1383
2856
  api: {
1384
2857
  baseUrl: clientUrl,
1385
2858
  timeout: this.timeout
@@ -1393,9 +2866,9 @@ var PDFParser = class {
1393
2866
  }
1394
2867
  }
1395
2868
  checkOperatingSystem() {
1396
- if ((0, import_node_os3.platform)() !== "darwin") {
2869
+ if ((0, import_node_os2.platform)() !== "darwin") {
1397
2870
  throw new Error(
1398
- "PDFParser is only supported on macOS. Current platform: " + (0, import_node_os3.platform)()
2871
+ "PDFParser is only supported on macOS. Current platform: " + (0, import_node_os2.platform)()
1399
2872
  );
1400
2873
  }
1401
2874
  }
@@ -1408,6 +2881,15 @@ var PDFParser = class {
1408
2881
  );
1409
2882
  }
1410
2883
  }
2884
+ checkPopplerInstalled() {
2885
+ try {
2886
+ (0, import_node_child_process3.execSync)("which pdftotext", { stdio: "ignore" });
2887
+ } catch {
2888
+ throw new Error(
2889
+ "poppler is not installed. Please install poppler using: brew install poppler"
2890
+ );
2891
+ }
2892
+ }
1411
2893
  checkMacOSVersion() {
1412
2894
  try {
1413
2895
  const versionOutput = (0, import_node_child_process3.execSync)("sw_vers -productVersion", {
@@ -1454,8 +2936,12 @@ var PDFParser = class {
1454
2936
  */
1455
2937
  isConnectionRefusedError(error) {
1456
2938
  if (error instanceof Error) {
1457
- const errorStr = JSON.stringify(error);
1458
- return errorStr.includes("ECONNREFUSED");
2939
+ if (error.message.includes("ECONNREFUSED")) {
2940
+ return true;
2941
+ }
2942
+ if (error.cause instanceof Error && error.cause.message.includes("ECONNREFUSED")) {
2943
+ return true;
2944
+ }
1459
2945
  }
1460
2946
  return false;
1461
2947
  }
@@ -1479,7 +2965,7 @@ var PDFParser = class {
1479
2965
  });
1480
2966
  await environment.startServer();
1481
2967
  this.client?.destroy();
1482
- this.client = new import_docling_sdk2.Docling({
2968
+ this.client = new import_docling_sdk.Docling({
1483
2969
  api: {
1484
2970
  baseUrl: `http://localhost:${this.port}`,
1485
2971
  timeout: this.timeout
@@ -1523,11 +3009,20 @@ var PDFParser = class {
1523
3009
  "PDFParser is not initialized. Call init() before using parse()"
1524
3010
  );
1525
3011
  }
1526
- if (options.pipeline === "vlm" && this.environment && !this.baseUrl) {
1527
- this.logger.info(
1528
- "[PDFParser] VLM pipeline requested, ensuring VLM dependencies..."
3012
+ if (options.forceImagePdf && !this.baseUrl) {
3013
+ this.checkImageMagickInstalled();
3014
+ this.checkGhostscriptInstalled();
3015
+ }
3016
+ const useStrategyFlow = options.strategySamplerModel !== void 0 || options.forcedMethod !== void 0;
3017
+ if (useStrategyFlow) {
3018
+ return this.parseWithStrategy(
3019
+ url,
3020
+ reportId,
3021
+ onComplete,
3022
+ cleanupAfterCallback,
3023
+ options,
3024
+ abortSignal
1529
3025
  );
1530
- await this.environment.setupVlmDependencies();
1531
3026
  }
1532
3027
  const canRecover = !this.baseUrl && this.port !== void 0;
1533
3028
  const maxAttempts = PDF_PARSER.MAX_SERVER_RECOVERY_ATTEMPTS;
@@ -1564,6 +3059,53 @@ var PDFParser = class {
1564
3059
  throw error;
1565
3060
  }
1566
3061
  }
3062
+ return null;
3063
+ }
3064
+ /**
3065
+ * Parse a PDF using OCR strategy sampling to decide between ocrmac and VLM.
3066
+ * Delegates to PDFConverter.convertWithStrategy() and returns the token usage report.
3067
+ *
3068
+ * Server recovery (restart on ECONNREFUSED) is preserved because
3069
+ * the ocrmac path still uses the Docling server.
3070
+ */
3071
+ async parseWithStrategy(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
3072
+ const canRecover = !this.baseUrl && this.port !== void 0;
3073
+ const maxAttempts = PDF_PARSER.MAX_SERVER_RECOVERY_ATTEMPTS;
3074
+ let attempt = 0;
3075
+ while (attempt <= maxAttempts) {
3076
+ try {
3077
+ const effectiveFallbackEnabled = this.enableImagePdfFallback && !this.baseUrl;
3078
+ const converter = new PDFConverter(
3079
+ this.logger,
3080
+ this.client,
3081
+ effectiveFallbackEnabled,
3082
+ this.timeout
3083
+ );
3084
+ const result = await converter.convertWithStrategy(
3085
+ url,
3086
+ reportId,
3087
+ onComplete,
3088
+ cleanupAfterCallback,
3089
+ options,
3090
+ abortSignal
3091
+ );
3092
+ return result.tokenUsageReport;
3093
+ } catch (error) {
3094
+ if (abortSignal?.aborted) {
3095
+ throw error;
3096
+ }
3097
+ if (canRecover && this.isConnectionRefusedError(error) && attempt < maxAttempts) {
3098
+ this.logger.warn(
3099
+ "[PDFParser] Connection refused, attempting server recovery..."
3100
+ );
3101
+ await this.restartServer();
3102
+ attempt++;
3103
+ continue;
3104
+ }
3105
+ throw error;
3106
+ }
3107
+ }
3108
+ return null;
1567
3109
  }
1568
3110
  /**
1569
3111
  * Dispose the parser instance.
@@ -1585,12 +3127,174 @@ var PDFParser = class {
1585
3127
  }
1586
3128
  }
1587
3129
  };
3130
+
3131
+ // src/validators/vlm-response-validator.ts
3132
// Script-anomaly check is skipped when the page has fewer non-whitespace characters than this.
var MIN_CONTENT_LENGTH = 20;
// Minimum share of Hangul/CJK characters (among non-whitespace characters) expected on a Korean page.
var KOREAN_SCRIPT_RATIO_THRESHOLD = 0.1;
// Well-known filler phrases; matching any of them marks an element as placeholder text.
var PLACEHOLDER_PATTERNS = [
  /lorem\s+ipsum/i,
  /dolor\s+sit\s+amet/i,
  /consectetur\s+adipiscing/i,
  /sed\s+do\s+eiusmod/i,
  /ut\s+labore\s+et\s+dolore/i
];
// Korean phrases that talk about the image/its legibility instead of transcribing it.
var META_DESCRIPTION_PATTERNS_KO = [
  /이미지\s*해상도/,
  /판독하기?\s*어렵/,
  /해상도가?\s*(매우\s*)?(낮|부족)/,
  /텍스트를?\s*판독/,
  /글자를?\s*읽기?\s*어렵/,
  /정확한?\s*판독이?\s*(불가|어렵)/
];
// English counterparts of the meta-description phrases above.
var META_DESCRIPTION_PATTERNS_EN = [
  /the image contains/i,
  /unable to (read|transcribe)/i,
  /resolution.*(too low|insufficient)/i,
  /cannot (read|make out|decipher)/i,
  /text is (not |un)?(legible|readable)/i,
  /exact transcription is not possible/i
];
// Repetitive runs must cover at least this share of the joined content to be reported.
var REPETITIVE_PATTERN_RATIO_THRESHOLD = 0.3;
// Number of occurrences of the same character (separated by whitespace) that counts as a run.
var REPETITIVE_PATTERN_MIN_REPEATS = 5;
// U+AC00–U+D7AF and U+1100–U+11FF. Only used with String.prototype.match, so the `g` flag carries no state.
var HANGUL_REGEX2 = /[\uAC00-\uD7AF\u1100-\u11FF]/g;
// U+4E00–U+9FFF. Only used with String.prototype.match.
var CJK_REGEX2 = /[\u4E00-\u9FFF]/g;
var VlmResponseValidator = class {
  /**
   * Validate VLM page result quality.
   *
   * Picture elements and elements without non-empty string content are
   * ignored; a page with nothing left to inspect is considered valid.
   * Checks run in a fixed order (placeholder text, script anomaly, meta
   * description, repetitive pattern) and `issues` keeps that order.
   *
   * @param elements - Extracted page elements to validate
   * @param documentLanguages - BCP 47 language tags (e.g., ['ko-KR', 'en-US']);
   *   only the first tag is consulted, to decide whether the Korean script
   *   check applies
   * @returns `{ isValid, issues }` where `isValid` is true when no issue was found
   */
  static validate(elements, documentLanguages) {
    // The typeof guard keeps elements with null/undefined content from
    // raising a TypeError on `.length`.
    const textElements = elements.filter(
      (el) => el.type !== "picture" && typeof el.content === "string" && el.content.length > 0
    );
    if (textElements.length === 0) {
      return { isValid: true, issues: [] };
    }
    const issues = [
      this.detectPlaceholderText(textElements),
      this.isKoreanPrimaryLanguage(documentLanguages) ? this.detectScriptAnomaly(textElements) : null,
      this.detectMetaDescription(textElements),
      this.detectRepetitivePattern(textElements)
    ].filter((issue) => issue !== null);
    return { isValid: issues.length === 0, issues };
  }
  /**
   * Tell whether the first language tag denotes Korean.
   *
   * Compares the primary subtag case-insensitively instead of using a string
   * prefix, so `kok-IN` (a different language whose tag merely starts with
   * "ko") is not treated as Korean while `KO-kr` and `ko_KR` are.
   *
   * @param documentLanguages - Language tags, possibly undefined or empty
   * @returns true only when the first tag's primary subtag is `ko`
   */
  static isKoreanPrimaryLanguage(documentLanguages) {
    const primaryTag = documentLanguages?.[0];
    if (typeof primaryTag !== "string") {
      return false;
    }
    const [primarySubtag] = primaryTag.split(/[-_]/);
    return primarySubtag.toLowerCase() === "ko";
  }
  /**
   * Collect the `order` of every element whose content matches at least one
   * of the given patterns. Each element is reported at most once.
   *
   * @param elements - Elements to scan
   * @param patterns - Regular expressions without `g`/`y` flags (stateless `test`)
   * @returns Orders of the matching elements, in input order
   */
  static collectMatchingOrders(elements, patterns) {
    return elements.filter((el) => patterns.some((pattern) => pattern.test(el.content))).map((el) => el.order);
  }
  /**
   * Detect known placeholder / filler text in elements.
   *
   * @returns A `placeholder_text` issue, or null when nothing matched
   */
  static detectPlaceholderText(elements) {
    const affectedElements = this.collectMatchingOrders(elements, PLACEHOLDER_PATTERNS);
    if (affectedElements.length === 0) return null;
    return {
      type: "placeholder_text",
      message: `Detected placeholder text (e.g., Lorem ipsum) in ${affectedElements.length} element(s)`,
      affectedElements
    };
  }
  /**
   * Detect script anomaly: expected Korean content but found mostly other text.
   * Counts Hangul + CJK characters over all non-whitespace characters and
   * flags the page when the ratio is below the threshold. Pages with fewer
   * than MIN_CONTENT_LENGTH non-whitespace characters are not judged.
   *
   * @returns A `script_anomaly` issue covering every element, or null
   */
  static detectScriptAnomaly(elements) {
    const allContent = elements.map((el) => el.content).join("");
    const nonWhitespace = allContent.replace(/\s/g, "");
    if (nonWhitespace.length < MIN_CONTENT_LENGTH) {
      return null;
    }
    const hangulCount = allContent.match(HANGUL_REGEX2)?.length ?? 0;
    const cjkCount = allContent.match(CJK_REGEX2)?.length ?? 0;
    const ratio = (hangulCount + cjkCount) / nonWhitespace.length;
    if (ratio >= KOREAN_SCRIPT_RATIO_THRESHOLD) {
      return null;
    }
    return {
      type: "script_anomaly",
      message: `Expected Korean text but found ${(ratio * 100).toFixed(1)}% Korean/CJK characters (threshold: ${KOREAN_SCRIPT_RATIO_THRESHOLD * 100}%)`,
      affectedElements: elements.map((el) => el.order)
    };
  }
  /**
   * Detect meta description: VLM described the image/resolution instead
   * of transcribing actual text content. Both the Korean and the English
   * phrase lists are applied to every element.
   *
   * @returns A `meta_description` issue, or null when nothing matched
   */
  static detectMetaDescription(elements) {
    const affectedElements = this.collectMatchingOrders(elements, [
      ...META_DESCRIPTION_PATTERNS_KO,
      ...META_DESCRIPTION_PATTERNS_EN
    ]);
    if (affectedElements.length === 0) return null;
    return {
      type: "meta_description",
      message: `Detected meta-description of image instead of text transcription in ${affectedElements.length} element(s)`,
      affectedElements
    };
  }
  /**
   * Detect repetitive character patterns (e.g., `: : : : :` or `= = = = =`).
   * Flags when the same character repeats, separated by whitespace,
   * REPETITIVE_PATTERN_MIN_REPEATS or more times and the repetitive portion
   * reaches REPETITIVE_PATTERN_RATIO_THRESHOLD of the joined content.
   *
   * @returns A `repetitive_pattern` issue covering every element, or null
   */
  static detectRepetitivePattern(elements) {
    const allContent = elements.map((el) => el.content).join("\n");
    if (allContent.trim().length === 0) return null;
    // One character followed by (MIN_REPEATS - 1) further copies, so the
    // threshold lives in a single constant. The `u` flag makes `\S` capture a
    // whole code point; without it, characters outside the BMP (e.g. emoji)
    // are split into surrogate halves and a repeated run is never recognised.
    const repetitiveRegex = new RegExp(
      `(\\S)(?:\\s+\\1){${REPETITIVE_PATTERN_MIN_REPEATS - 1},}`,
      "gu"
    );
    let totalRepetitiveLength = 0;
    for (const match of allContent.matchAll(repetitiveRegex)) {
      totalRepetitiveLength += match[0].length;
    }
    if (totalRepetitiveLength === 0) return null;
    const ratio = totalRepetitiveLength / allContent.length;
    if (ratio < REPETITIVE_PATTERN_RATIO_THRESHOLD) return null;
    return {
      type: "repetitive_pattern",
      message: `Detected repetitive character patterns (${(ratio * 100).toFixed(0)}% of content)`,
      affectedElements: elements.map((el) => el.order)
    };
  }
};
1588
3294
  // Annotate the CommonJS export names for ESM import in node:
1589
3295
  0 && (module.exports = {
1590
- DEFAULT_VLM_MODEL,
1591
3296
  ImagePdfFallbackError,
1592
3297
  PDFParser,
1593
- VLM_MODELS,
1594
- resolveVlmModel
3298
+ VlmResponseValidator
1595
3299
  });
1596
3300
  //# sourceMappingURL=index.cjs.map