@heripo/pdf-parser 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,15 +1,8 @@
1
- import {
2
- DEFAULT_VLM_MODEL,
3
- VLM_MODELS,
4
- resolveVlmModel
5
- } from "./chunk-WWNI354M.js";
6
- import "./chunk-VUNV25KB.js";
7
-
8
1
  // src/core/pdf-parser.ts
9
2
  import { Docling } from "docling-sdk";
10
3
  import { execSync } from "child_process";
11
- import { platform as platform2 } from "os";
12
- import { join as join5 } from "path";
4
+ import { platform } from "os";
5
+ import { join as join7 } from "path";
13
6
 
14
7
  // src/config/constants.ts
15
8
  var PDF_PARSER = {
@@ -60,22 +53,48 @@ var IMAGE_PDF_CONVERTER = {
60
53
  */
61
54
  QUALITY: 100
62
55
  };
63
- var VLM_ENVIRONMENT = {
64
- /**
65
- * Timeout for VLM dependency installation (pip install) in milliseconds (3 hours).
66
- * VLM packages can be very large and may require extended download times
67
- * depending on network conditions.
68
- */
69
- SETUP_TIMEOUT_MS: 108e5,
70
- /**
71
- * Timeout for VLM model download in milliseconds (3 hours).
72
- * Large VLM models (e.g., multi-GB weights) need sufficient time to download.
73
- */
74
- MODEL_DOWNLOAD_TIMEOUT_MS: 108e5
75
- };
76
56
 
77
57
  // ../shared/dist/index.mjs
78
58
  import { spawn } from "child_process";
59
+ import {
60
+ NoObjectGeneratedError,
61
+ Output,
62
+ generateText,
63
+ hasToolCall,
64
+ tool
65
+ } from "ai";
66
var ConcurrentPool = class {
  /**
   * Process items concurrently using a worker pool pattern.
   *
   * Spawns up to `concurrency` workers that pull items from a shared queue.
   * Each worker processes one item at a time; when it finishes, it immediately
   * takes the next available item. Results maintain the original item order.
   *
   * A `concurrency` that is not a number, or is lower than 1, is treated as 1
   * so that every item is still processed (previously such values started no
   * workers and the returned array was silently left empty).
   *
   * When `processFn` or `onItemComplete` throws, the returned promise rejects
   * with that error and the remaining workers stop picking up new items.
   * Items that are already in flight cannot be cancelled and run to completion.
   *
   * @param items - Array of items to process
   * @param concurrency - Maximum number of concurrent workers
   * @param processFn - Async function to process each item; receives (item, index)
   * @param onItemComplete - Optional callback fired after each item completes; receives (result, index)
   * @returns Array of results in the same order as the input items
   */
  static async run(items, concurrency, processFn, onItemComplete) {
    const results = new Array(items.length);
    const requested = Number(concurrency);
    const workerCount = Math.min(
      items.length,
      Math.max(1, Number.isNaN(requested) ? 1 : Math.floor(requested))
    );
    let nextIndex = 0;
    // Set once any item fails so idle workers do not start work whose result
    // can never be observed by the caller.
    let failed = false;
    async function worker() {
      while (!failed && nextIndex < items.length) {
        const index = nextIndex++;
        try {
          results[index] = await processFn(items[index], index);
          onItemComplete?.(results[index], index);
        } catch (error) {
          failed = true;
          throw error;
        }
      }
    }
    await Promise.all(Array.from({ length: workerCount }, () => worker()));
    return results;
  }
};
79
98
  function spawnAsync(command, args, options = {}) {
80
99
  const {
81
100
  captureStdout = true,
@@ -102,10 +121,456 @@ function spawnAsync(command, args, options = {}) {
102
121
  proc.on("error", reject);
103
122
  });
104
123
  }
124
/**
 * Classify a language model by the substring found in its `provider` field.
 *
 * Markers are checked in a fixed order (openai, google, anthropic, together)
 * and the first match wins. A missing model, a missing provider, or a
 * provider that is not a non-empty string yields "unknown".
 *
 * @param model - Model object exposing an optional string `provider` field
 * @returns "openai" | "google" | "anthropic" | "togetherai" | "unknown"
 */
function detectProvider(model) {
  // Optional chaining keeps a null/undefined model from throwing.
  const providerId = model?.provider;
  if (typeof providerId !== "string" || providerId === "") return "unknown";
  const providerMarkers = [
    ["openai", "openai"],
    ["google", "google"],
    ["anthropic", "anthropic"],
    ["together", "togetherai"]
  ];
  for (const [marker, providerType] of providerMarkers) {
    if (providerId.includes(marker)) return providerType;
  }
  return "unknown";
}
133
var LLMCaller = class {
  /**
   * Extract model name from LanguageModel object
   *
   * Returns the first string found among `modelId`, `id`, `model` and `name`
   * (checked in that order). When none is a string (including when `model`
   * is null/undefined) the result is `String(model)`.
   */
  static extractModelName(model) {
    // `?? {}` keeps a null/undefined model from throwing on property access.
    const modelObj = model ?? {};
    for (const key of ["modelId", "id", "model", "name"]) {
      if (typeof modelObj[key] === "string") return modelObj[key];
    }
    return String(model);
  }
  /**
   * Build usage information from response
   *
   * Missing token counts default to 0. When the response carries no
   * `totalTokens`, the total is derived from input + output so the three
   * numbers stay consistent in reports.
   *
   * @param config - Call configuration providing `component` and `phase`
   * @param modelName - Display name of the model that produced the response
   * @param response - Generation result with an optional `usage` object
   * @param usedFallback - Whether the fallback model produced the response
   */
  static buildUsage(config, modelName, response, usedFallback) {
    const inputTokens = response.usage?.inputTokens ?? 0;
    const outputTokens = response.usage?.outputTokens ?? 0;
    return {
      component: config.component,
      phase: config.phase,
      model: usedFallback ? "fallback" : "primary",
      modelName,
      inputTokens,
      outputTokens,
      totalTokens: response.usage?.totalTokens ?? inputTokens + outputTokens
    };
  }
  /**
   * Maximum number of retries when structured output generation fails.
   * Total attempts = MAX_STRUCTURED_OUTPUT_RETRIES + 1.
   *
   * Applied to both:
   * - `Output.object()` path: retries on NoObjectGeneratedError (schema mismatch)
   * - Tool call path: retries when model does not produce a tool call
   */
  static MAX_STRUCTURED_OUTPUT_RETRIES = 10;
  /**
   * Generate structured output via forced tool call.
   *
   * Used for providers (Together AI, unknown) that do not reliably support
   * `Output.object()`. Forces the model to call a tool whose inputSchema
   * is the target Zod schema, then extracts the parsed input.
   *
   * Retries up to MAX_STRUCTURED_OUTPUT_RETRIES times when the model does not
   * produce a tool call, for a total of MAX_STRUCTURED_OUTPUT_RETRIES + 1 attempts.
   * Only the usage of the attempt that produced the tool call is returned.
   *
   * @throws NoObjectGeneratedError when all attempts fail to produce a tool call
   */
  static async generateViaToolCall(model, schema, promptParams) {
    const submitTool = tool({
      description: "Submit the structured result",
      inputSchema: schema
    });
    let lastResult;
    for (let attempt = 0; attempt <= this.MAX_STRUCTURED_OUTPUT_RETRIES; attempt++) {
      lastResult = await generateText({
        ...promptParams,
        model,
        tools: { submitResult: submitTool },
        toolChoice: { type: "tool", toolName: "submitResult" },
        stopWhen: hasToolCall("submitResult")
      });
      const toolCall = lastResult.toolCalls?.[0];
      if (toolCall) {
        return {
          output: toolCall.input,
          usage: lastResult.usage
        };
      }
    }
    // Every attempt finished without a tool call; report the last response.
    throw new NoObjectGeneratedError({
      message: "Model did not produce a tool call for structured output",
      text: lastResult.text ?? "",
      response: lastResult.response,
      usage: lastResult.usage,
      finishReason: lastResult.finishReason
    });
  }
  /**
   * Generate structured output with provider-aware strategy.
   *
   * Strategy per provider:
   * - OpenAI / Anthropic / Google Gemini: `Output.object()` with schema retry
   * - Together AI / unknown: forced tool call pattern
   *
   * Retries up to MAX_STRUCTURED_OUTPUT_RETRIES times on NoObjectGeneratedError
   * (schema mismatch), re-throwing the last error if all attempts fail.
   * Any other error is re-thrown immediately without retrying.
   */
  static async generateStructuredOutput(model, schema, promptParams) {
    const providerType = detectProvider(model);
    if (providerType === "togetherai" || providerType === "unknown") {
      return this.generateViaToolCall(model, schema, promptParams);
    }
    let lastError;
    for (let attempt = 0; attempt <= this.MAX_STRUCTURED_OUTPUT_RETRIES; attempt++) {
      try {
        return await generateText({
          model,
          output: Output.object({ schema }),
          ...promptParams
        });
      } catch (error) {
        if (NoObjectGeneratedError.isInstance(error)) {
          lastError = error;
          continue;
        }
        throw error;
      }
    }
    throw lastError;
  }
  /**
   * Execute LLM call with fallback support
   *
   * Common execution logic for both text and vision calls.
   *
   * Runs `generateFn` with the primary model. If that throws, the error is
   * re-thrown as-is when the abort signal has fired or no fallback model is
   * configured; otherwise `generateFn` is run once more with the fallback
   * model, and an error from that second run propagates to the caller.
   *
   * @param config - Call configuration (primaryModel, optional fallbackModel, abortSignal, component, phase)
   * @param generateFn - Function that performs the generation for a given model
   * @returns `{ output, usage, usedFallback }`
   */
  static async executeWithFallback(config, generateFn) {
    const primaryModelName = this.extractModelName(config.primaryModel);
    try {
      const response = await generateFn(config.primaryModel);
      return {
        output: response.output,
        usage: this.buildUsage(config, primaryModelName, response, false),
        usedFallback: false
      };
    } catch (primaryError) {
      // An aborted call must not trigger another (fallback) request.
      if (config.abortSignal?.aborted) {
        throw primaryError;
      }
      if (!config.fallbackModel) {
        throw primaryError;
      }
      const fallbackModelName = this.extractModelName(config.fallbackModel);
      const response = await generateFn(config.fallbackModel);
      return {
        output: response.output,
        usage: this.buildUsage(config, fallbackModelName, response, true),
        usedFallback: true
      };
    }
  }
  /**
   * Call LLM with structured-output retry and fallback support
   *
   * Strategy:
   * 1. Generate structured output with the primary model
   * 2. If that fails and fallbackModel is provided, generate with the fallback model
   * 3. Throw the error if the fallback also fails (or none is configured)
   *
   * `config.maxRetries` is forwarded unchanged to `generateText`.
   * Provider-aware strategy is automatically applied based on the model's provider field.
   *
   * @template TOutput - Output type from schema validation
   * @param config - LLM call configuration
   * @returns Result with parsed object and usage information
   * @throws Error if both primary and fallback generation fail
   */
  static async call(config) {
    return this.executeWithFallback(
      config,
      (model) => this.generateStructuredOutput(model, config.schema, {
        system: config.systemPrompt,
        prompt: config.userPrompt,
        temperature: config.temperature,
        maxRetries: config.maxRetries,
        abortSignal: config.abortSignal
      })
    );
  }
  /**
   * Call LLM for vision tasks with message format support
   *
   * Same retry and fallback logic as call(), but using message format instead of system/user prompts.
   * Provider-aware strategy is automatically applied based on the model's provider field.
   *
   * @template TOutput - Output type from schema validation
   * @param config - LLM vision call configuration
   * @returns Result with parsed object and usage information
   * @throws Error if both primary and fallback generation fail
   */
  static async callVision(config) {
    return this.executeWithFallback(
      config,
      (model) => this.generateStructuredOutput(model, config.schema, {
        messages: config.messages,
        temperature: config.temperature,
        maxRetries: config.maxRetries,
        abortSignal: config.abortSignal
      })
    );
  }
};
326
/**
 * Render a token usage record as "<input> input, <output> output, <total> total".
 *
 * @param usage - Object with inputTokens, outputTokens and totalTokens
 * @returns Human-readable one-line summary
 */
function formatTokens(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  const parts = [
    `${inputTokens} input`,
    `${outputTokens} output`,
    `${totalTokens} total`
  ];
  return parts.join(", ");
}
329
var LLMTokenUsageAggregator = class {
  /**
   * Aggregated usage keyed by component name.
   *
   * Created without a prototype (as is each component's `phases` dictionary)
   * so that component or phase names such as "constructor" or "toString"
   * are stored as ordinary keys instead of colliding with inherited members.
   */
  usage = Object.create(null);
  /** Create a zeroed token counter. */
  static #emptyTotals() {
    return { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
  }
  /** Add the token counts of `source` onto `target` in place. */
  static #accumulate(target, source) {
    target.inputTokens += source.inputTokens;
    target.outputTokens += source.outputTokens;
    target.totalTokens += source.totalTokens;
  }
  /** Copy only the three token counts of `totals` into a fresh plain object. */
  static #snapshot(totals) {
    return {
      inputTokens: totals.inputTokens,
      outputTokens: totals.outputTokens,
      totalTokens: totals.totalTokens
    };
  }
  /** Copy a per-model bucket (model name plus token counts) into a fresh plain object. */
  static #snapshotModel(bucket) {
    return {
      modelName: bucket.modelName,
      ...LLMTokenUsageAggregator.#snapshot(bucket)
    };
  }
  /**
   * Track token usage from an LLM call
   *
   * Adds the counts to the matching component and phase totals. When
   * `usage.model` is "primary" or "fallback" the counts are also added to
   * that model's bucket; the bucket keeps the model name of the first call
   * recorded for it.
   *
   * @param usage - Extended token usage with component/phase/model information
   */
  track(usage) {
    const component = this.usage[usage.component] ??= {
      component: usage.component,
      phases: Object.create(null),
      total: LLMTokenUsageAggregator.#emptyTotals()
    };
    const phase = component.phases[usage.phase] ??= {
      total: LLMTokenUsageAggregator.#emptyTotals()
    };
    if (usage.model === "primary" || usage.model === "fallback") {
      const bucket = phase[usage.model] ??= {
        modelName: usage.modelName,
        ...LLMTokenUsageAggregator.#emptyTotals()
      };
      LLMTokenUsageAggregator.#accumulate(bucket, usage);
    }
    LLMTokenUsageAggregator.#accumulate(phase.total, usage);
    LLMTokenUsageAggregator.#accumulate(component.total, usage);
  }
  /**
   * Get aggregated usage grouped by component
   *
   * The returned entries are the aggregator's own records, not copies.
   *
   * @returns Array of component aggregates with phase breakdown
   */
  getByComponent() {
    return Object.values(this.usage);
  }
  /**
   * Get token usage report in structured JSON format
   *
   * Converts internal usage data to external TokenUsageReport format suitable
   * for serialization and reporting. The report includes component breakdown,
   * phase-level details, and both primary and fallback model usage.
   * All objects in the report are fresh copies.
   *
   * @returns Structured token usage report with components and total
   */
  getReport() {
    const components = Object.values(this.usage).map((component) => {
      const phases = Object.entries(component.phases).map(
        ([phaseName, phaseData]) => {
          const phaseReport = {
            phase: phaseName,
            total: LLMTokenUsageAggregator.#snapshot(phaseData.total)
          };
          if (phaseData.primary) {
            phaseReport.primary = LLMTokenUsageAggregator.#snapshotModel(
              phaseData.primary
            );
          }
          if (phaseData.fallback) {
            phaseReport.fallback = LLMTokenUsageAggregator.#snapshotModel(
              phaseData.fallback
            );
          }
          return phaseReport;
        }
      );
      return {
        component: component.component,
        phases,
        total: LLMTokenUsageAggregator.#snapshot(component.total)
      };
    });
    return {
      components,
      total: this.getTotalUsage()
    };
  }
  /**
   * Get total usage across all components and phases
   *
   * @returns Aggregated token usage totals
   */
  getTotalUsage() {
    const total = LLMTokenUsageAggregator.#emptyTotals();
    for (const component of Object.values(this.usage)) {
      LLMTokenUsageAggregator.#accumulate(total, component.total);
    }
    return total;
  }
  /**
   * Log comprehensive token usage summary
   *
   * Outputs usage grouped by component, with phase and model breakdown.
   * Shows primary and fallback token usage separately for each phase.
   * The primary/fallback summary lines are only printed when their total
   * token count is greater than zero.
   * Call this once at the end of document processing.
   *
   * @param logger - Logger instance for output
   */
  logSummary(logger) {
    const components = this.getByComponent();
    if (components.length === 0) {
      logger.info("[DocumentProcessor] No token usage to report");
      return;
    }
    logger.info("[DocumentProcessor] Token usage summary:");
    logger.info("");
    const grandTotal = LLMTokenUsageAggregator.#emptyTotals();
    const grandPrimary = LLMTokenUsageAggregator.#emptyTotals();
    const grandFallback = LLMTokenUsageAggregator.#emptyTotals();
    for (const component of components) {
      logger.info(`${component.component}:`);
      for (const [phase, phaseData] of Object.entries(component.phases)) {
        logger.info(` - ${phase}:`);
        if (phaseData.primary) {
          logger.info(
            ` primary (${phaseData.primary.modelName}): ${formatTokens(phaseData.primary)}`
          );
          LLMTokenUsageAggregator.#accumulate(grandPrimary, phaseData.primary);
        }
        if (phaseData.fallback) {
          logger.info(
            ` fallback (${phaseData.fallback.modelName}): ${formatTokens(phaseData.fallback)}`
          );
          LLMTokenUsageAggregator.#accumulate(grandFallback, phaseData.fallback);
        }
        logger.info(` subtotal: ${formatTokens(phaseData.total)}`);
      }
      logger.info(
        ` ${component.component} total: ${formatTokens(component.total)}`
      );
      logger.info("");
      LLMTokenUsageAggregator.#accumulate(grandTotal, component.total);
    }
    logger.info("--- Summary ---");
    if (grandPrimary.totalTokens > 0) {
      logger.info(`Primary total: ${formatTokens(grandPrimary)}`);
    }
    if (grandFallback.totalTokens > 0) {
      logger.info(`Fallback total: ${formatTokens(grandFallback)}`);
    }
    logger.info(`Grand total: ${formatTokens(grandTotal)}`);
  }
  /**
   * Reset all tracked usage
   *
   * Call this at the start of a new document processing run.
   */
  reset() {
    this.usage = Object.create(null);
  }
};
105
571
 
106
572
  // src/environment/docling-environment.ts
107
573
  import { spawn as spawn2 } from "child_process";
108
- import { arch, platform } from "os";
109
574
  import { join } from "path";
110
575
 
111
576
  // src/utils/python-version.ts
@@ -147,7 +612,6 @@ var DoclingEnvironment = class _DoclingEnvironment {
147
612
  venvPath;
148
613
  port;
149
614
  killExistingProcess;
150
- vlmDependenciesInstalled = false;
151
615
  constructor(options) {
152
616
  this.logger = options.logger;
153
617
  this.venvPath = options.venvPath;
@@ -266,7 +730,11 @@ var DoclingEnvironment = class _DoclingEnvironment {
266
730
  }
267
731
  async installDoclingServe() {
268
732
  const pipPath = join(this.venvPath, "bin", "pip");
269
- const result = await spawnAsync(pipPath, ["install", "docling-serve"]);
733
+ const result = await spawnAsync(pipPath, [
734
+ "install",
735
+ "--upgrade",
736
+ "docling-serve"
737
+ ]);
270
738
  if (result.code !== 0) {
271
739
  this.logger.error(
272
740
  "[DoclingEnvironment] Failed to install docling-serve:",
@@ -277,81 +745,6 @@ var DoclingEnvironment = class _DoclingEnvironment {
277
745
  );
278
746
  }
279
747
  }
280
- /**
281
- * Install VLM-specific dependencies for the Docling VLM pipeline.
282
- *
283
- * Installs:
284
- * 1. docling-serve[vlm] - VLM model support for docling-serve
285
- * 2. mlx + mlx-lm (macOS ARM64 only) - Apple Silicon optimized inference
286
- *
287
- * This is idempotent - subsequent calls skip if already installed.
288
- */
289
- async setupVlmDependencies() {
290
- if (this.vlmDependenciesInstalled) {
291
- this.logger.info(
292
- "[DoclingEnvironment] VLM dependencies already installed, skipping"
293
- );
294
- return;
295
- }
296
- if (await this.isVlmReady()) {
297
- this.vlmDependenciesInstalled = true;
298
- this.logger.info(
299
- "[DoclingEnvironment] VLM dependencies already installed, skipping"
300
- );
301
- return;
302
- }
303
- this.logger.info("[DoclingEnvironment] Installing VLM dependencies...");
304
- const pipPath = join(this.venvPath, "bin", "pip");
305
- this.logger.info("[DoclingEnvironment] Installing docling[vlm]...");
306
- const vlmResult = await spawnAsync(
307
- pipPath,
308
- ["install", "docling-serve[vlm]"],
309
- { timeout: VLM_ENVIRONMENT.SETUP_TIMEOUT_MS }
310
- );
311
- if (vlmResult.code !== 0) {
312
- this.logger.error(
313
- "[DoclingEnvironment] Failed to install docling-serve[vlm]:",
314
- vlmResult.stderr
315
- );
316
- throw new Error(
317
- `Failed to install docling-serve[vlm]. Exit code: ${vlmResult.code}`
318
- );
319
- }
320
- if (platform() === "darwin" && arch() === "arm64") {
321
- this.logger.info(
322
- "[DoclingEnvironment] Installing mlx + mlx-lm for Apple Silicon..."
323
- );
324
- const mlxResult = await spawnAsync(
325
- pipPath,
326
- ["install", "mlx", "mlx-lm"],
327
- { timeout: VLM_ENVIRONMENT.SETUP_TIMEOUT_MS }
328
- );
329
- if (mlxResult.code !== 0) {
330
- this.logger.error(
331
- "[DoclingEnvironment] Failed to install mlx/mlx-lm:",
332
- mlxResult.stderr
333
- );
334
- throw new Error(
335
- `Failed to install mlx/mlx-lm. Exit code: ${mlxResult.code}`
336
- );
337
- }
338
- }
339
- this.vlmDependenciesInstalled = true;
340
- this.logger.info(
341
- "[DoclingEnvironment] VLM dependencies installed successfully"
342
- );
343
- }
344
- /**
345
- * Check if VLM dependencies are ready by verifying Python module imports
346
- */
347
- async isVlmReady() {
348
- const pythonPath = join(this.venvPath, "bin", "python");
349
- const result = await spawnAsync(pythonPath, [
350
- "-c",
351
- "import docling_core; import docling"
352
- ]);
353
- return result.code === 0;
354
- }
355
748
  async isPortInUse(port) {
356
749
  try {
357
750
  const result = await spawnAsync("lsof", ["-ti", `:${port}`]);
@@ -422,8 +815,13 @@ var DoclingEnvironment = class _DoclingEnvironment {
422
815
  const doclingProcess = spawn2(doclingServePath, args, {
423
816
  detached: true,
424
817
  // Detached from parent process
425
- stdio: "ignore"
818
+ stdio: "ignore",
426
819
  // Remove stdio pipes to prevent event loop from hanging
820
+ env: {
821
+ ...process.env,
822
+ // Enable remote API calls for API VLM models
823
+ DOCLING_SERVE_ENABLE_REMOTE_SERVICES: "true"
824
+ }
427
825
  });
428
826
  doclingProcess.unref();
429
827
  doclingProcess.on("error", (error) => {
@@ -438,10 +836,16 @@ var DoclingEnvironment = class _DoclingEnvironment {
438
836
  };
439
837
 
440
838
  // src/core/pdf-converter.ts
441
- import { ValidationUtils } from "docling-sdk";
442
839
  import { omit } from "es-toolkit";
443
- import { createWriteStream as createWriteStream2, existsSync as existsSync3, rmSync as rmSync3 } from "fs";
444
- import { join as join4 } from "path";
840
+ import {
841
+ copyFileSync,
842
+ createWriteStream as createWriteStream2,
843
+ existsSync as existsSync4,
844
+ readFileSync as readFileSync4,
845
+ rmSync as rmSync3
846
+ } from "fs";
847
+ import { writeFile } from "fs/promises";
848
+ import { join as join6 } from "path";
445
849
  import { pipeline } from "stream/promises";
446
850
 
447
851
  // src/errors/image-pdf-fallback-error.ts
@@ -638,28 +1042,28 @@ var ImageExtractor = class _ImageExtractor {
638
1042
  const baseName = filename.replace(extname(filename), "");
639
1043
  const jsonPath = join2(outputDir, `${baseName}.json`);
640
1044
  try {
641
- const pagesDir = join2(outputDir, "pages");
642
- if (!existsSync(pagesDir)) {
643
- mkdirSync(pagesDir, { recursive: true });
1045
+ const imagesDir = join2(outputDir, "images");
1046
+ if (!existsSync(imagesDir)) {
1047
+ mkdirSync(imagesDir, { recursive: true });
644
1048
  }
645
1049
  const base64Images = await _ImageExtractor.extractBase64ImagesFromJsonWithJq(jsonSourcePath);
646
1050
  base64Images.forEach((base64Data, index) => {
647
1051
  _ImageExtractor.extractBase64ImageToFile(
648
1052
  base64Data,
649
- pagesDir,
1053
+ imagesDir,
650
1054
  index,
651
- "page",
652
- "pages"
1055
+ "pic",
1056
+ "images"
653
1057
  );
654
1058
  });
655
1059
  logger.info(
656
- `[PDFConverter] Extracted ${base64Images.length} images from JSON to ${pagesDir}`
1060
+ `[PDFConverter] Extracted ${base64Images.length} picture images from JSON to ${imagesDir}`
657
1061
  );
658
1062
  const replacedCount = await _ImageExtractor.replaceBase64ImagesInJsonWithJq(
659
1063
  jsonSourcePath,
660
1064
  jsonPath,
661
- "pages",
662
- "page"
1065
+ "images",
1066
+ "pic"
663
1067
  );
664
1068
  logger.info(
665
1069
  `[PDFConverter] Replaced ${replacedCount} base64 images with file paths`
@@ -742,52 +1146,968 @@ var ImageExtractor = class _ImageExtractor {
742
1146
  }
743
1147
  };
744
1148
 
745
- // src/utils/local-file-server.ts
746
- import { createReadStream, statSync } from "fs";
747
- import { createServer } from "http";
748
- import { basename } from "path";
749
- var LocalFileServer = class {
750
- server = null;
751
- port = 0;
1149
+ // src/processors/page-renderer.ts
1150
+ import { existsSync as existsSync2, mkdirSync as mkdirSync2, readdirSync as readdirSync2 } from "fs";
1151
+ import { join as join3 } from "path";
1152
/** Rendering density used when the caller does not pass `options.dpi`. */
var DEFAULT_DPI = 300;
var PageRenderer = class {
  /**
   * @param logger - Logger used for progress messages (only `info` is called here)
   */
  constructor(logger) {
    this.logger = logger;
  }
  /**
   * Render all pages of a PDF to individual PNG files.
   *
   * Invokes the `magick` command with the requested density, flattening
   * transparency onto a white background, and writes the output into
   * `<outputDir>/pages` using the `page_%d.png` pattern. The directory is
   * created when missing but is not cleared first, so page images left
   * there by an earlier run are included in the result.
   *
   * Only files named exactly `page_<digits>.png` are reported; they are
   * ordered by their numeric suffix.
   *
   * @param pdfPath - Absolute path to the source PDF file
   * @param outputDir - Directory where pages/ subdirectory will be created
   * @param options - Rendering options (`dpi`, defaults to DEFAULT_DPI)
   * @returns Render result with page count and file paths
   * @throws Error when the `magick` process exits with a non-zero code
   */
  async renderPages(pdfPath, outputDir, options) {
    const dpi = options?.dpi ?? DEFAULT_DPI;
    const pagesDir = join3(outputDir, "pages");
    if (!existsSync2(pagesDir)) {
      mkdirSync2(pagesDir, { recursive: true });
    }
    this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
    const outputPattern = join3(pagesDir, "page_%d.png");
    const result = await spawnAsync("magick", [
      "-density",
      dpi.toString(),
      pdfPath,
      "-background",
      "white",
      "-alpha",
      "remove",
      "-alpha",
      "off",
      outputPattern
    ]);
    if (result.code !== 0) {
      throw new Error(
        `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
      );
    }
    // Strict match keeps unrelated files (e.g. "page_notes.png") out of the
    // result and guarantees every sort key is a real number.
    const pageFilePattern = /^page_(\d+)\.png$/;
    const pageFiles = readdirSync2(pagesDir)
      .map((fileName) => ({ fileName, match: pageFilePattern.exec(fileName) }))
      .filter(({ match }) => match !== null)
      .map(({ fileName, match }) => ({
        fileName,
        pageIndex: Number.parseInt(match[1], 10)
      }))
      .sort((a, b) => a.pageIndex - b.pageIndex)
      .map(({ fileName }) => join3(pagesDir, fileName));
    this.logger.info(
      `[PageRenderer] Rendered ${pageFiles.length} pages to ${pagesDir}`
    );
    return {
      pageCount: pageFiles.length,
      pagesDir,
      pageFiles
    };
  }
};
1205
+
1206
+ // src/processors/pdf-text-extractor.ts
1207
var PdfTextExtractor = class {
  /**
   * @param logger - Logger used for progress (`info`) and failure (`warn`) messages
   */
  constructor(logger) {
    this.logger = logger;
  }
  /**
   * Run an external PDF tool and return its stdout.
   *
   * Both a non-zero exit code and a failure to start the process (for
   * example when the binary is not installed) are logged as warnings and
   * reported as `null`, so the public methods can honour their documented
   * "returns a default on failure" contracts instead of throwing.
   *
   * @param command - Executable name
   * @param args - Command-line arguments
   * @param failurePrefix - Text placed before ": <reason>" in the warning
   * @returns stdout on success, `null` on any failure
   */
  async #runTool(command, args, failurePrefix) {
    let reason;
    try {
      const result = await spawnAsync(command, args);
      if (result.code === 0) {
        return result.stdout;
      }
      reason = result.stderr || "Unknown error";
    } catch (error) {
      reason = (error instanceof Error ? error.message : String(error)) || "Unknown error";
    }
    this.logger.warn(`[PdfTextExtractor] ${failurePrefix}: ${reason}`);
    return null;
  }
  /**
   * Extract text from all pages of a PDF.
   *
   * Pages are processed one after another; a page whose extraction fails
   * is stored as an empty string.
   *
   * @param pdfPath - Absolute path to the source PDF file
   * @param totalPages - Total number of pages in the PDF
   * @returns Map of 1-based page numbers to extracted text strings
   */
  async extractText(pdfPath, totalPages) {
    this.logger.info(
      `[PdfTextExtractor] Extracting text from ${totalPages} pages...`
    );
    const pageTexts = /* @__PURE__ */ new Map();
    let nonEmptyCount = 0;
    for (let page = 1; page <= totalPages; page++) {
      const text = await this.extractPageText(pdfPath, page);
      pageTexts.set(page, text);
      if (text.trim().length > 0) {
        nonEmptyCount++;
      }
    }
    this.logger.info(
      `[PdfTextExtractor] Extracted text from ${nonEmptyCount}/${totalPages} pages`
    );
    return pageTexts;
  }
  /**
   * Get total page count of a PDF using pdfinfo.
   * Returns 0 on failure (logged as warning) or when the output has no
   * "Pages:" line.
   */
  async getPageCount(pdfPath) {
    const stdout = await this.#runTool("pdfinfo", [pdfPath], "pdfinfo failed");
    if (stdout === null) {
      return 0;
    }
    const match = stdout.match(/^Pages:\s+(\d+)/m);
    return match ? Number.parseInt(match[1], 10) : 0;
  }
  /**
   * Extract text from the entire PDF in a single pdftotext invocation.
   * Returns empty string on failure (logged as warning).
   */
  async extractFullText(pdfPath) {
    const stdout = await this.#runTool(
      "pdftotext",
      ["-layout", pdfPath, "-"],
      "pdftotext (full) failed"
    );
    return stdout ?? "";
  }
  /**
   * Extract text from a single PDF page using pdftotext.
   * Returns empty string on failure (logged as warning).
   */
  async extractPageText(pdfPath, page) {
    const pageArg = page.toString();
    const stdout = await this.#runTool(
      "pdftotext",
      ["-f", pageArg, "-l", pageArg, "-layout", pdfPath, "-"],
      `pdftotext failed for page ${page}`
    );
    return stdout ?? "";
  }
};
1287
+
1288
+ // src/processors/vlm-text-corrector.ts
1289
+ import { readFileSync as readFileSync2, writeFileSync as writeFileSync2 } from "fs";
1290
+ import { join as join4 } from "path";
1291
+
1292
+ // src/types/vlm-text-correction-schema.ts
1293
+ import { z } from "zod/v4";
1294
/**
 * Schema of a text-correction response.
 *
 * Field names are deliberately short:
 * - `tc`: text element corrections, each with element index `i` and a list
 *   `s` of `{ f, r }` find/replace substitutions applied left-to-right
 * - `cc`: table cell corrections, each with table index `ti`, row `r`,
 *   column `c` and corrected text `t`
 */
var vlmTextCorrectionSchema = (() => {
  /** Fresh non-negative integer schema, used for every index field. */
  const index = () => z.number().int().nonnegative();

  const substitution = z.object({
    /** Exact garbled substring to find */
    f: z.string(),
    /** Corrected replacement text */
    r: z.string()
  });

  const textCorrection = z.object({
    /** Text element index (from prompt) */
    i: index(),
    /** Substitutions: find/replace pairs applied left-to-right */
    s: z.array(substitution)
  });

  const cellCorrection = z.object({
    /** Table index (within the page) */
    ti: index(),
    /** Row index */
    r: index(),
    /** Column index */
    c: index(),
    /** Corrected cell text */
    t: z.string()
  });

  return z.object({
    /** Text element corrections (substitution-based) */
    tc: z.array(textCorrection),
    /** Table cell corrections */
    cc: z.array(cellCorrection)
  });
})();
1325
+
1326
+ // src/processors/vlm-text-corrector.ts
1327
// Display names keyed by base language code; looked up when composing the
// "LANGUAGE CONTEXT" prefix of the correction prompt.
var LANGUAGE_DISPLAY_NAMES = {
  ko: "Korean (\uD55C\uAD6D\uC5B4)",
  ja: "Japanese (\u65E5\u672C\u8A9E)",
  zh: "Chinese (\u4E2D\u6587)",
  en: "English",
  fr: "French (Fran\xE7ais)",
  de: "German (Deutsch)",
  es: "Spanish (Espa\xF1ol)",
  pt: "Portuguese (Portugu\xEAs)",
  ru: "Russian (\u0420\u0443\u0441\u0441\u043A\u0438\u0439)",
  uk: "Ukrainian (\u0423\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u0430)",
  it: "Italian (Italiano)"
};
// Minimum character-overlap score a pdftotext block must reach to be attached
// to an OCR element as its reference text.
var REFERENCE_MATCH_THRESHOLD = 0.4;
// Fallbacks used when the caller's options omit the corresponding field.
var DEFAULT_CONCURRENCY = 1;
var DEFAULT_MAX_RETRIES = 3;
var DEFAULT_TEMPERATURE = 0;
// Short type codes written into the prompt in place of the full text label.
var LABEL_TO_TYPE_CODE = {
  section_header: "sh",
  text: "tx",
  caption: "ca",
  footnote: "fn",
  list_item: "li",
  page_header: "ph",
  page_footer: "pf"
};
// Labels of text items that are eligible for correction (the keys above).
var TEXT_LABELS = new Set(Object.keys(LABEL_TO_TYPE_CODE));
1354
// Instruction text sent to the VLM ahead of the per-page data. It describes
// the typical Hanja OCR failure modes, the compact input format (T: text
// elements with optional |ref| lines, C: table cells with optional C_REF),
// and the JSON the model must answer with (tc substitutions, cc cell texts).
// This is a runtime string: every character, including blank lines, is part
// of the prompt.
var TEXT_CORRECTION_SYSTEM_PROMPT = `You are a text correction engine for OCR output from Korean archaeological (\u8003\u53E4\u5B78) report PDFs. Compare OCR text against the page image and reference text to fix errors.

The OCR engine cannot read Chinese characters (\u6F22\u5B57/Hanja) correctly. These errors appear as:
- Random ASCII letters/symbols: \u718A\u6D25 \u2192 "M", \u5C0F\u4EAC\u5236 \u2192 "5\u2606", \u6545\u5740 \u2192 "Bbt"
- Meaningless Korean syllables: \u6771\u660E \u2192 "\uD587\uBC30", \u91D1\u61B2\u660C \u2192 "\uC232", \u7E3D\u7BA1 \u2192 "3\uC501"
- Number/symbol noise: \u718A\u5DDD\u5DDE \u2192 "IEJIM", \u6E6F\u4E95\u90E1 \u2192 "3#"
- Hanja dropped entirely: (\u682A)\u97D3\u570B\u7E96\u7DAD \u2192 (\uC8FC), (\u8CA1)\u5FE0\u6DF8\u6587\u5316\u8CA1\u784F\u7A76\u9662 \u2192 (\uC7AC)\uCDA9\uB0A8\uBB38\uD654\uC7AC\uC5F0\uAD6C\uC6D0
- Phonetic reading substitution (\u97F3\u8B80): \u6F22\u5B57 replaced by Korean pronunciation, e.g. \u5FE0\u6DF8\u6587\u5316\u8CA1\u784F\u7A76\u9662 \u2192 \uCDA9\uB0A8\uBB38\uD654\uC7AC\uC5F0\uAD6C\uC6D0, \u5BE6\u7389\u6D1E\u907A\u8E5F \u2192 \uC2E4\uC625\uB3D9\uC720\uC801

FIX: garbled/wrong Chinese characters, mojibake, encoding artifacts, random ASCII/Korean replacing Hanja, dropped Hanja, phonetic reading substitutions
KEEP: correct text, structure, punctuation, whitespace

Input format:
T: (text elements) index|type|text
Optional: index|ref|reference_text (PDF text layer for the above element)
C: (table cells) tableIndex|row,col|text
Optional: C_REF: (unused pdftotext blocks as table reference)

FOOTNOTE (fn) SPECIAL INSTRUCTIONS:
- Footnotes in archaeological reports contain institution names with Hanja that are severely garbled
- Common pattern: (\u8CA1)\u6A5F\u95DC\u540D\u784F\u7A76\u9662 \u2192 (W)#X1CR003T or (W): 103 or similar ASCII noise
- When OCR shows patterns like (W), (M), or random ASCII where an institution name should be, READ THE IMAGE directly
- Institution names follow patterns like: (\u8CA1)OO\u6587\u5316\u8CA1\u784F\u7A76\u9662, (\u682A)OO, (\u793E)OO\u5B78\u6703

TABLE CELL (C:) SPECIAL INSTRUCTIONS:
- Table headers often contain Hanja that OCR cannot read: \u767C\u520A\u65E5, \u6642\u4EE3, \u8ABF\u67FB\u7DE3\u7531, \u8ABF\u67FB\u6A5F\u95DC, \u907A\u8E5F\u540D, \u985E\u578B \uBC0F \u57FA\u6578
- When OCR shows garbled characters like "\u20A9 A", "#\uCA6F\uBC0F\uD45C\uBF70" in table cells, READ THE IMAGE directly
- If C_REF is present, use it as additional context for correcting table cells

When a |ref| line is present:
- It shows text extracted directly from the PDF text layer for that element
- If OCR text contains garbled characters but ref text looks correct, USE the ref text
- For long paragraphs, align OCR and ref text segment by segment to identify and fix each garbled portion
- IMPORTANT: If BOTH OCR and ref text are garbled (e.g. CJK font encoding issues), IGNORE the ref text and READ THE IMAGE directly

When NO |ref| line is present:
- The PDF text layer could not be matched to this element
- READ THE IMAGE directly to determine the correct text

Output JSON with corrections:
tc=[{i:index, s:[{f:"garbled_substring",r:"corrected_text"}, ...]}] for text
cc=[{ti:tableIndex, r:row, c:col, t:corrected}] for table cells

Substitution rules for tc:
- 'f': exact garbled/wrong substring from the input text (must match exactly)
- 'r': the corrected replacement
- Include ALL garbled portions for each element as separate s entries
- Order substitutions left-to-right as they appear in the text
- Do NOT include unchanged text \u2014 only the specific substrings that need fixing

If all correct: {"tc":[],"cc":[]}`;
1405
/**
 * Corrects OCR text stored in a DoclingDocument (`result.json`) by sending
 * each page image together with its OCR text to a vision language model and
 * applying the substitutions / cell replacements the model returns.
 */
var VlmTextCorrector = class {
  /**
   * @param logger - Logger providing `info`, `debug` and `warn`
   */
  constructor(logger) {
    this.logger = logger;
  }
  /**
   * Read DoclingDocument from output directory, correct text via VLM,
   * and save the corrected document back.
   *
   * @param outputDir - Directory containing result.json and pages/
   * @param model - Vision language model for text correction
   * @param options - Processing options (concurrency, retries, page filter,
   *                  per-page reference texts, token usage hooks, abort signal)
   * @returns Correction statistics; `pagesProcessed` counts every attempted
   *          page, including the ones reported in `pagesFailed`
   */
  async correctAndSave(outputDir, model, options) {
    this.logger.info("[VlmTextCorrector] Starting text correction...");
    const resultPath = join4(outputDir, "result.json");
    const doc = JSON.parse(readFileSync2(resultPath, "utf-8"));
    let pageNumbers = this.getPageNumbers(doc);
    if (pageNumbers.length === 0) {
      this.logger.info("[VlmTextCorrector] No pages to process");
      return {
        textCorrections: 0,
        cellCorrections: 0,
        pagesProcessed: 0,
        pagesFailed: 0
      };
    }
    // Optionally restrict the work to pages already known to contain Hanja.
    if (options?.koreanHanjaMixPages && options.koreanHanjaMixPages.length > 0) {
      const totalPageCount = pageNumbers.length;
      const hanjaSet = new Set(options.koreanHanjaMixPages);
      pageNumbers = pageNumbers.filter((p) => hanjaSet.has(p));
      this.logger.info(
        `[VlmTextCorrector] Filtering to ${pageNumbers.length} Korean-Hanja mix pages out of ${totalPageCount} total`
      );
    }
    const concurrency = options?.concurrency ?? DEFAULT_CONCURRENCY;
    this.logger.info(
      `[VlmTextCorrector] Processing ${pageNumbers.length} pages (concurrency: ${concurrency})...`
    );
    const results = await ConcurrentPool.run(
      pageNumbers,
      concurrency,
      (pageNo) => this.correctPage(outputDir, doc, pageNo, model, options),
      () => {
        // Publish the running token usage report through the pool's callback.
        if (options?.onTokenUsage && options?.aggregator) {
          options.onTokenUsage(
            options.aggregator.getReport()
          );
        }
      }
    );
    let totalTextCorrections = 0;
    let totalCellCorrections = 0;
    let pagesFailed = 0;
    // results[i] belongs to pageNumbers[i]; null marks a page whose VLM call failed.
    for (let i = 0; i < pageNumbers.length; i++) {
      const corrections = results[i];
      if (corrections === null) {
        pagesFailed++;
        continue;
      }
      totalTextCorrections += corrections.tc.length;
      totalCellCorrections += corrections.cc.length;
      this.applyCorrections(doc, pageNumbers[i], corrections);
    }
    writeFileSync2(resultPath, JSON.stringify(doc, null, 2));
    this.logger.info(
      `[VlmTextCorrector] Correction complete: ${totalTextCorrections} text, ${totalCellCorrections} cell corrections across ${pageNumbers.length} pages (${pagesFailed} failed)`
    );
    return {
      textCorrections: totalTextCorrections,
      cellCorrections: totalCellCorrections,
      pagesProcessed: pageNumbers.length,
      pagesFailed
    };
  }
  /**
   * Get sorted page numbers from the document.
   */
  getPageNumbers(doc) {
    return Object.values(doc.pages).map((p) => p.page_no).sort((a, b) => a - b);
  }
  /**
   * Correct text on a single page via VLM.
   * Returns null if VLM call fails (graceful degradation); an aborted run
   * rethrows instead so the caller can stop.
   */
  async correctPage(outputDir, doc, pageNo, model, options) {
    try {
      const pageTexts = this.getPageTexts(doc, pageNo);
      const pageTables = this.getPageTables(doc, pageNo);
      if (pageTexts.length === 0 && pageTables.length === 0) {
        this.logger.debug(
          `[VlmTextCorrector] Page ${pageNo}: no text content, skipping`
        );
        return { tc: [], cc: [] };
      }
      const imageBase64 = this.readPageImage(outputDir, pageNo);
      const pageText = options?.pageTexts?.get(pageNo);
      let references;
      let tableContext;
      if (pageText) {
        const { references: refs, unusedBlocks } = this.matchTextToReferenceWithUnused(pageTexts, pageText);
        references = refs;
        // Blocks no text element claimed are offered as context for table cells.
        if (pageTables.length > 0 && unusedBlocks.length > 0) {
          tableContext = unusedBlocks.join("\n");
        }
      }
      const userPrompt = this.buildUserPrompt(
        pageTexts,
        pageTables,
        references,
        tableContext
      );
      const systemPrompt = this.buildLanguageAwareSystemPrompt(
        options?.documentLanguages
      );
      const fullPrompt = systemPrompt + "\n\n" + userPrompt;
      const result = await LLMCaller.callVision({
        schema: vlmTextCorrectionSchema,
        messages: [
          {
            role: "user",
            content: [
              {
                type: "text",
                text: fullPrompt
              },
              {
                type: "image",
                image: `data:image/png;base64,${imageBase64}`
              }
            ]
          }
        ],
        primaryModel: model,
        maxRetries: options?.maxRetries ?? DEFAULT_MAX_RETRIES,
        temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
        abortSignal: options?.abortSignal,
        component: "VlmTextCorrector",
        phase: "text-correction"
      });
      if (options?.aggregator) {
        options.aggregator.track(result.usage);
      }
      const output = result.output;
      if (output.tc.length > 0 || output.cc.length > 0) {
        this.logger.debug(
          `[VlmTextCorrector] Page ${pageNo}: ${output.tc.length} text, ${output.cc.length} cell corrections`
        );
      }
      return output;
    } catch (error) {
      if (options?.abortSignal?.aborted) {
        throw error;
      }
      this.logger.warn(
        `[VlmTextCorrector] Page ${pageNo}: VLM correction failed, keeping OCR text`,
        error
      );
      return null;
    }
  }
  /**
   * Get text items on a specific page, with their indices for prompt building.
   * Only items whose label is in TEXT_LABELS are returned; the position of an
   * entry in the returned array is the "prompt index" the VLM refers to.
   */
  getPageTexts(doc, pageNo) {
    const results = [];
    for (let i = 0; i < doc.texts.length; i++) {
      const item = doc.texts[i];
      if (!TEXT_LABELS.has(item.label)) continue;
      if (item.prov.some((p) => p.page_no === pageNo)) {
        results.push({ index: i, item });
      }
    }
    return results;
  }
  /**
   * Get table items on a specific page, with their indices.
   */
  getPageTables(doc, pageNo) {
    const results = [];
    for (let i = 0; i < doc.tables.length; i++) {
      const item = doc.tables[i];
      if (item.prov.some((p) => p.page_no === pageNo)) {
        results.push({ index: i, item });
      }
    }
    return results;
  }
  /**
   * Build compact user prompt for a page.
   *
   * Format:
   *   T:
   *   0|sh|제1장 조사개요
   *   1|tx|본 보고서는 ...
   *   C:
   *   0|0,0|유구명
   *   0|1,0|1호 住居址
   *
   * A matched reference is emitted as an extra `index|ref|text` line right
   * after its element; `tableContext` is appended under `C_REF:`. Cells with
   * empty or whitespace-only text are left out.
   */
  buildUserPrompt(pageTexts, pageTables, references, tableContext) {
    const parts = [];
    if (pageTexts.length > 0) {
      const textLines = [];
      pageTexts.forEach((entry, promptIndex) => {
        const typeCode = LABEL_TO_TYPE_CODE[entry.item.label] ?? "tx";
        textLines.push(`${promptIndex}|${typeCode}|${entry.item.text}`);
        const ref = references?.get(promptIndex);
        if (ref) {
          textLines.push(`${promptIndex}|ref|${ref}`);
        }
      });
      parts.push("T:\n" + textLines.join("\n"));
    }
    if (pageTables.length > 0) {
      const cellLines = [];
      for (let tablePromptIndex = 0; tablePromptIndex < pageTables.length; tablePromptIndex++) {
        const table = pageTables[tablePromptIndex].item;
        for (const cell of table.data.table_cells) {
          if (!cell.text || cell.text.trim().length === 0) continue;
          cellLines.push(
            `${tablePromptIndex}|${cell.start_row_offset_idx},${cell.start_col_offset_idx}|${cell.text}`
          );
        }
      }
      if (cellLines.length > 0) {
        const cellSection = "C:\n" + cellLines.join("\n");
        if (tableContext) {
          parts.push(cellSection + "\nC_REF:\n" + tableContext);
        } else {
          parts.push(cellSection);
        }
      }
    }
    return parts.join("\n");
  }
  /**
   * Build a language-aware system prompt by prepending language context.
   * The first entry of `documentLanguages` is described as the primary
   * language; unknown codes are shown verbatim.
   */
  buildLanguageAwareSystemPrompt(documentLanguages) {
    if (!documentLanguages?.length) {
      return TEXT_CORRECTION_SYSTEM_PROMPT;
    }
    const primaryBase = documentLanguages[0].split("-")[0];
    const primaryName = LANGUAGE_DISPLAY_NAMES[primaryBase] ?? documentLanguages[0];
    const otherNames = documentLanguages.slice(1).map((code) => LANGUAGE_DISPLAY_NAMES[code.split("-")[0]] ?? code);
    const languageDesc = otherNames.length > 0 ? `primarily written in ${primaryName}, with ${otherNames.join(", ")} also present` : `written in ${primaryName}`;
    const prefix = `LANGUAGE CONTEXT: This document is ${languageDesc}. Focus on correcting characters that do not match this language.\n\n`;
    return prefix + TEXT_CORRECTION_SYSTEM_PROMPT;
  }
  /**
   * Match pdftotext paragraph blocks to OCR elements using character multiset overlap.
   * Returns a map from prompt index to the best-matching reference block.
   */
  matchTextToReference(pageTexts, pageText) {
    return this.matchTextToReferenceWithUnused(pageTexts, pageText).references;
  }
  /**
   * Match pdftotext paragraph blocks to OCR elements and also return unused blocks.
   * Unused blocks are those that were not consumed by any text element match.
   * Matching is greedy in element order: each element takes the best still
   * available block whose score reaches REFERENCE_MATCH_THRESHOLD. A block that
   * is identical to the OCR text is consumed but not recorded as a reference.
   */
  matchTextToReferenceWithUnused(pageTexts, pageText) {
    const references = /* @__PURE__ */ new Map();
    const refBlocks = this.mergeIntoBlocks(pageText);
    if (refBlocks.length === 0) {
      return { references, unusedBlocks: [] };
    }
    const available = new Set(refBlocks.map((_, i) => i));
    for (let promptIndex = 0; promptIndex < pageTexts.length; promptIndex++) {
      const ocrText = pageTexts[promptIndex].item.text;
      let bestScore = 0;
      let bestBlockIndex = -1;
      for (const blockIndex of available) {
        const score = this.computeCharOverlap(ocrText, refBlocks[blockIndex]);
        if (score > bestScore) {
          bestScore = score;
          bestBlockIndex = blockIndex;
        }
      }
      if (bestBlockIndex >= 0 && bestScore >= REFERENCE_MATCH_THRESHOLD) {
        if (refBlocks[bestBlockIndex] !== ocrText) {
          references.set(promptIndex, refBlocks[bestBlockIndex]);
        }
        available.delete(bestBlockIndex);
      }
    }
    const unusedBlocks = [...available].sort((a, b) => a - b).map((i) => refBlocks[i]);
    return { references, unusedBlocks };
  }
  /**
   * Merge pdftotext output into paragraph blocks separated by blank lines.
   * Consecutive non-empty lines are trimmed and joined with a space.
   */
  mergeIntoBlocks(pageText) {
    const blocks = [];
    let currentLines = [];
    for (const rawLine of pageText.split("\n")) {
      const trimmed = rawLine.trim();
      if (trimmed.length === 0) {
        if (currentLines.length > 0) {
          blocks.push(currentLines.join(" "));
          currentLines = [];
        }
      } else {
        currentLines.push(trimmed);
      }
    }
    if (currentLines.length > 0) {
      blocks.push(currentLines.join(" "));
    }
    return blocks;
  }
  /**
   * Compute character multiset overlap ratio between two strings.
   * Returns a value between 0.0 and 1.0.
   *
   * Both the overlap and the denominator are counted in code points, so
   * characters outside the BMP (e.g. rare Hanja) count as one character and
   * identical strings always score exactly 1.
   */
  computeCharOverlap(a, b) {
    if (a.length === 0 || b.length === 0) return 0;
    const countChars = (text) => {
      const freq = /* @__PURE__ */ new Map();
      let total = 0;
      for (const ch of text) {
        freq.set(ch, (freq.get(ch) ?? 0) + 1);
        total++;
      }
      return { freq, total };
    };
    const left = countChars(a);
    const right = countChars(b);
    let overlap = 0;
    for (const [ch, countA] of left.freq) {
      overlap += Math.min(countA, right.freq.get(ch) ?? 0);
    }
    return overlap / Math.max(left.total, right.total);
  }
  /**
   * Read page image as base64.
   * Page images are 0-indexed: page_no N → pages/page_{N-1}.png
   */
  readPageImage(outputDir, pageNo) {
    const imagePath = join4(outputDir, "pages", `page_${pageNo - 1}.png`);
    return readFileSync2(imagePath).toString("base64");
  }
  /**
   * Apply an ordered list of find/replace pairs to one text.
   *
   * Each search starts where the previous replacement ended, so a later pair
   * cannot match inside text an earlier pair just inserted. When nothing is
   * found from there, the whole text is searched as a fallback so pairs the
   * model returned out of order still apply. Pairs with an empty find string
   * are skipped (they would otherwise match at position 0 and prepend the
   * replacement); pairs that match nowhere are skipped with a warning.
   *
   * @param text - Current text of the element
   * @param substitutions - Array of `{ f, r }` pairs
   * @param pageNo - Page number, used in warnings only
   * @param promptIndex - Prompt index of the element, used in warnings only
   * @returns The text after all applicable substitutions
   */
  applySubstitutions(text, substitutions, pageNo, promptIndex) {
    let corrected = text;
    let cursor = 0;
    for (const { f: find, r: replacement } of substitutions) {
      if (find.length === 0) {
        this.logger.warn(
          `[VlmTextCorrector] Page ${pageNo}, text ${promptIndex}: empty find string, skipping substitution`
        );
        continue;
      }
      let at = corrected.indexOf(find, cursor);
      if (at < 0) {
        at = corrected.indexOf(find);
      }
      if (at < 0) {
        this.logger.warn(
          `[VlmTextCorrector] Page ${pageNo}, text ${promptIndex}: find string not found, skipping substitution`
        );
        continue;
      }
      corrected = corrected.substring(0, at) + replacement + corrected.substring(at + find.length);
      // Keep the cursor after everything replaced so far. For a fallback match
      // located before the cursor, shift the cursor by the length change.
      cursor = Math.max(
        at + replacement.length,
        cursor + replacement.length - find.length
      );
    }
    return corrected;
  }
  /**
   * Apply VLM corrections to the DoclingDocument.
   *
   * Text corrections address elements by prompt index (position within
   * getPageTexts for the page) and update both `text` and `orig`. Cell
   * corrections address tables by their position within getPageTables and
   * update the first matching entry of `table_cells` as well as the `grid`
   * cell at the same row/column when present. Out-of-range indices are ignored.
   */
  applyCorrections(doc, pageNo, corrections) {
    if (corrections.tc.length > 0) {
      const pageTexts = this.getPageTexts(doc, pageNo);
      for (const correction of corrections.tc) {
        if (correction.i < 0 || correction.i >= pageTexts.length) continue;
        const target = doc.texts[pageTexts[correction.i].index];
        const corrected = this.applySubstitutions(
          target.text,
          correction.s,
          pageNo,
          correction.i
        );
        if (corrected !== target.text) {
          target.text = corrected;
          target.orig = corrected;
        }
      }
    }
    if (corrections.cc.length > 0) {
      const pageTables = this.getPageTables(doc, pageNo);
      for (const correction of corrections.cc) {
        if (correction.ti < 0 || correction.ti >= pageTables.length) continue;
        const table = pageTables[correction.ti].item;
        const cell = table.data.table_cells.find(
          (candidate) => candidate.start_row_offset_idx === correction.r && candidate.start_col_offset_idx === correction.c
        );
        if (cell) {
          cell.text = correction.t;
        }
        // Tolerate tables without a grid (or with a shorter one).
        const gridCell = table.data.grid?.[correction.r]?.[correction.c];
        if (gridCell) {
          gridCell.text = correction.t;
        }
      }
    }
  }
};
1800
+
1801
+ // src/samplers/ocr-strategy-sampler.ts
1802
+ import { normalizeToBcp47 } from "@heripo/model";
1803
+ import { readFileSync as readFileSync3 } from "fs";
1804
+ import { z as z2 } from "zod/v4";
1805
// Resolution passed to the page renderer when rendering sample pages.
var SAMPLE_DPI = 150;
// Fraction of pages skipped at the front and at the back when picking samples.
var EDGE_TRIM_RATIO = 0.1;
// Fallbacks used when the caller's options omit the corresponding field.
var DEFAULT_MAX_SAMPLE_PAGES = 15;
var DEFAULT_MAX_RETRIES2 = 3;
// Han ideographs: CJK Extension A (U+3400–U+4DBF), CJK Unified Ideographs
// (U+4E00–U+9FFF) and CJK Compatibility Ideographs (U+F900–U+FAFF). The
// compatibility block matters for Korean text: Hanja with several readings
// are commonly encoded there, and would be missed by the unified range alone.
var CJK_REGEX = /[\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]/;
// Precomposed Hangul syllables.
var HANGUL_REGEX = /[\uAC00-\uD7AF]/;
1811
/**
 * Schema of the VLM answer for one sampled page: a Hanja-presence flag and
 * the languages seen on the page. Assembled inside an IIFE so no additional
 * module-level names are introduced.
 */
var koreanHanjaMixSchema = (() => {
  const hasKoreanHanjaMix = z2.boolean().describe(
    "Whether the page contains any Hanja (\u6F22\u5B57/Chinese characters) mixed with Korean text"
  );
  const languageTag = z2.string();
  const detectedLanguages = z2.array(languageTag).describe(
    'BCP 47 language tags of languages found on this page, ordered by prevalence (e.g., ["ko-KR", "en-US"])'
  );
  return z2.object({ hasKoreanHanjaMix, detectedLanguages });
})();
1819
// Prompt sent with each sampled page image. It asks whether Hanja appears
// alongside Korean text and for the page's languages as BCP 47 tags ordered by
// prevalence. This is a runtime string: every character is part of the prompt.
var KOREAN_HANJA_MIX_PROMPT = `Look at this page image carefully. Does it contain any Hanja (\u6F22\u5B57/Chinese characters) mixed with Korean text?

Hanja examples: \u907A\u8E5F, \u767C\u6398, \u8ABF\u67FB, \u5831\u544A\u66F8, \u6587\u5316\u8CA1
Note: Hanja are Chinese characters used in Korean documents, different from modern Korean (\uD55C\uAE00).

Answer whether any Hanja characters are present on this page.

Also identify all languages present on this page. Return an array of BCP 47 language tags ordered by prevalence (primary language first).
Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-TW", "en-US"]`;
1828
/**
 * Decides which OCR strategy ('ocrmac' or 'vlm') to use for a PDF.
 * It first inspects the PDF text layer; when that gives no definitive answer
 * it renders a sample of pages and asks a vision language model whether
 * Hanja is mixed with Korean text.
 */
var OcrStrategySampler = class {
  logger;
  pageRenderer;
  textExtractor;
  /**
   * @param logger - Logger providing `info` and `debug`
   * @param pageRenderer - Renderer exposing `renderPages(pdfPath, outputDir, options)`
   * @param textExtractor - Optional text extractor; a PdfTextExtractor is
   *                        created when omitted
   */
  constructor(logger, pageRenderer, textExtractor) {
    this.logger = logger;
    this.pageRenderer = pageRenderer;
    this.textExtractor = textExtractor ?? new PdfTextExtractor(logger);
  }
  /**
   * Sample pages from a PDF and determine the OCR strategy.
   *
   * Sampling stops at the first page on which the model reports a
   * Korean-Hanja mix.
   *
   * @param pdfPath - Path to the PDF file
   * @param outputDir - Directory for temporary rendered pages
   * @param model - Vision language model for Korean-Hanja mix detection
   * @param options - Sampling options
   * @returns OcrStrategy with method ('ocrmac' or 'vlm') and metadata
   */
  async sample(pdfPath, outputDir, model, options) {
    const maxSamplePages = options?.maxSamplePages ?? DEFAULT_MAX_SAMPLE_PAGES;
    this.logger.info("[OcrStrategySampler] Starting OCR strategy sampling...");
    const preCheckResult = await this.preCheckHanjaFromTextLayer(pdfPath);
    if (preCheckResult) {
      return preCheckResult;
    }
    const renderResult = await this.pageRenderer.renderPages(
      pdfPath,
      outputDir,
      { dpi: SAMPLE_DPI }
    );
    if (renderResult.pageCount === 0) {
      this.logger.info("[OcrStrategySampler] No pages found in PDF");
      return {
        method: "ocrmac",
        reason: "No pages found in PDF",
        sampledPages: 0,
        totalPages: 0
      };
    }
    const sampleIndices = this.selectSamplePages(
      renderResult.pageCount,
      maxSamplePages
    );
    this.logger.info(
      `[OcrStrategySampler] Sampling ${sampleIndices.length} of ${renderResult.pageCount} pages: [${sampleIndices.map((i) => i + 1).join(", ")}]`
    );
    let sampledCount = 0;
    const languageFrequency = /* @__PURE__ */ new Map();
    for (const idx of sampleIndices) {
      sampledCount++;
      const pageFile = renderResult.pageFiles[idx];
      const pageAnalysis = await this.analyzeSamplePage(
        pageFile,
        idx + 1,
        model,
        options
      );
      for (const lang of pageAnalysis.detectedLanguages) {
        languageFrequency.set(lang, (languageFrequency.get(lang) ?? 0) + 1);
      }
      if (pageAnalysis.hasKoreanHanjaMix) {
        this.logger.info(
          `[OcrStrategySampler] Korean-Hanja mix detected on page ${idx + 1} \u2192 VLM strategy`
        );
        const detectedLanguages2 = this.aggregateLanguages(languageFrequency);
        return {
          method: "vlm",
          detectedLanguages: detectedLanguages2,
          reason: `Korean-Hanja mix detected on page ${idx + 1}`,
          sampledPages: sampledCount,
          totalPages: renderResult.pageCount
        };
      }
    }
    this.logger.info(
      "[OcrStrategySampler] No Korean-Hanja mix detected \u2192 ocrmac strategy"
    );
    const detectedLanguages = this.aggregateLanguages(languageFrequency);
    return {
      method: "ocrmac",
      detectedLanguages,
      reason: `No Korean-Hanja mix detected in ${sampledCount} sampled pages`,
      sampledPages: sampledCount,
      totalPages: renderResult.pageCount
    };
  }
  /**
   * Pre-check for Hangul-Hanja mix in PDF text layer using pdftotext.
   * Extracts full document text in a single process and checks at document level.
   * Only makes a definitive decision for Korean (Hangul) documents:
   * - Hangul + Hanja (anywhere in document) → VLM (confirmed Korean-Hanja mix)
   * - Hangul only → ocrmac with ko-KR (confirmed Korean)
   * - No Hangul (English, Japanese, etc.) → null (delegates to VLM for language detection)
   *
   * Also returns null when the page count is 0, the text layer is empty, or
   * the extractor throws (the error is passed to the debug log).
   * In the VLM case `koreanHanjaMixPages` lists the 1-based positions of the
   * form-feed separated chunks of the extracted text that contain Hanja.
   */
  async preCheckHanjaFromTextLayer(pdfPath) {
    try {
      const totalPages = await this.textExtractor.getPageCount(pdfPath);
      if (totalPages === 0) return null;
      const fullText = await this.textExtractor.extractFullText(pdfPath);
      if (fullText.trim().length === 0) {
        this.logger.debug(
          "[OcrStrategySampler] Text layer is empty, falling back to VLM sampling"
        );
        return null;
      }
      const hasHangul = HANGUL_REGEX.test(fullText);
      const hasHanja = CJK_REGEX.test(fullText);
      if (!hasHangul) {
        this.logger.debug(
          "[OcrStrategySampler] No Hangul in text layer, falling back to VLM sampling"
        );
        return null;
      }
      if (hasHanja) {
        const pageTextArray = fullText.split("\f");
        const koreanHanjaMixPages = [];
        for (let i = 0; i < pageTextArray.length; i++) {
          if (CJK_REGEX.test(pageTextArray[i])) {
            koreanHanjaMixPages.push(i + 1);
          }
        }
        this.logger.info(
          `[OcrStrategySampler] Hangul-Hanja mix detected in text layer \u2192 VLM strategy (${koreanHanjaMixPages.length} pages with Hanja)`
        );
        return {
          method: "vlm",
          detectedLanguages: ["ko-KR"],
          reason: "Hangul-Hanja mix found in PDF text layer",
          koreanHanjaMixPages,
          sampledPages: totalPages,
          totalPages
        };
      }
      this.logger.info(
        "[OcrStrategySampler] No Hangul-Hanja mix in text layer \u2192 ocrmac strategy"
      );
      return {
        method: "ocrmac",
        detectedLanguages: ["ko-KR"],
        reason: `No Hangul-Hanja mix in PDF text layer (${totalPages} pages checked)`,
        sampledPages: totalPages,
        totalPages
      };
    } catch (error) {
      // Best-effort step: never fail the run here, but keep the cause visible.
      this.logger.debug(
        "[OcrStrategySampler] Text layer pre-check failed, falling back to VLM sampling",
        error
      );
      return null;
    }
  }
  /**
   * Select page indices for sampling.
   * Trims front/back edges and distributes samples evenly.
   * When the document has no more pages than `maxSamples`, every page is
   * returned and nothing is trimmed.
   *
   * @param totalPages - Total number of pages
   * @param maxSamples - Maximum number of samples
   * @returns Array of 0-based page indices
   */
  selectSamplePages(totalPages, maxSamples) {
    if (totalPages === 0) return [];
    if (totalPages <= maxSamples) {
      return Array.from({ length: totalPages }, (_, i) => i);
    }
    const trimCount = Math.max(1, Math.ceil(totalPages * EDGE_TRIM_RATIO));
    const start = trimCount;
    const end = totalPages - trimCount;
    const eligibleCount = end - start;
    if (eligibleCount <= 0) {
      // Nothing left after trimming: fall back to the middle page.
      return [Math.floor(totalPages / 2)];
    }
    if (eligibleCount <= maxSamples) {
      return Array.from({ length: eligibleCount }, (_, i) => start + i);
    }
    const indices = [];
    const step = eligibleCount / maxSamples;
    for (let i = 0; i < maxSamples; i++) {
      indices.push(start + Math.floor(i * step));
    }
    return indices;
  }
  /**
   * Analyze a single sample page for Korean-Hanja mixed script and primary language.
   * Normalizes raw VLM language responses to valid BCP 47 tags, filtering out invalid ones.
   *
   * @param pageFile - Path of the rendered page image
   * @param pageNo - 1-based page number, used in log messages
   * @param model - Vision language model to query
   * @param options - Sampling options (fallback model, retries, temperature,
   *                  abort signal, token aggregator)
   * @returns Object with Korean-Hanja detection result and normalized detected languages
   */
  async analyzeSamplePage(pageFile, pageNo, model, options) {
    this.logger.debug(
      `[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
    );
    const base64Image = readFileSync3(pageFile).toString("base64");
    const messages = [
      {
        role: "user",
        content: [
          { type: "text", text: KOREAN_HANJA_MIX_PROMPT },
          {
            type: "image",
            image: `data:image/png;base64,${base64Image}`
          }
        ]
      }
    ];
    const result = await LLMCaller.callVision({
      schema: koreanHanjaMixSchema,
      messages,
      primaryModel: model,
      fallbackModel: options?.fallbackModel,
      maxRetries: options?.maxRetries ?? DEFAULT_MAX_RETRIES2,
      temperature: options?.temperature ?? 0,
      abortSignal: options?.abortSignal,
      component: "OcrStrategySampler",
      phase: "korean-hanja-mix-detection"
    });
    if (options?.aggregator) {
      options.aggregator.track(result.usage);
    }
    const output = result.output;
    // Wrap the call so map's index/array arguments are not forwarded.
    const normalizedLanguages = output.detectedLanguages.map((tag) => normalizeToBcp47(tag)).filter((tag) => tag !== null);
    this.logger.debug(
      `[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${normalizedLanguages.join(",")}`
    );
    return {
      hasKoreanHanjaMix: output.hasKoreanHanjaMix,
      detectedLanguages: normalizedLanguages
    };
  }
  /**
   * Aggregate language frequency map into a sorted array.
   * Returns languages sorted by frequency (descending), or undefined if empty.
   */
  aggregateLanguages(frequencyMap) {
    if (frequencyMap.size === 0) return void 0;
    return [...frequencyMap.entries()].sort((a, b) => b[1] - a[1]).map(([lang]) => lang);
  }
};
2064
+
2065
+ // src/utils/local-file-server.ts
2066
+ import { createReadStream, statSync } from "fs";
2067
+ import { createServer } from "http";
2068
+ import { basename } from "path";
2069
+ var LocalFileServer = class {
2070
+ server = null;
2071
+ port = 0;
2072
+ /**
2073
+ * Start serving a file and return the URL
2074
+ *
2075
+ * @param filePath Absolute path to the file to serve
2076
+ * @returns URL to access the file
2077
+ */
2078
+ async start(filePath) {
2079
+ const filename = basename(filePath);
2080
+ const stat = statSync(filePath);
2081
+ return new Promise((resolve, reject) => {
2082
+ this.server = createServer((req, res) => {
2083
+ if (req.url === `/${filename}`) {
2084
+ res.writeHead(200, {
2085
+ "Content-Type": "application/pdf",
2086
+ "Content-Length": stat.size
2087
+ });
2088
+ createReadStream(filePath).pipe(res);
2089
+ } else {
2090
+ res.writeHead(404);
2091
+ res.end("Not Found");
2092
+ }
2093
+ });
2094
+ this.server.on("error", reject);
2095
+ this.server.listen(0, "127.0.0.1", () => {
2096
+ const address = this.server.address();
2097
+ if (typeof address === "object" && address !== null) {
2098
+ this.port = address.port;
2099
+ resolve(`http://127.0.0.1:${this.port}/${filename}`);
2100
+ } else {
2101
+ reject(new Error("Failed to get server address"));
2102
+ }
2103
+ });
2104
+ });
2105
+ }
2106
+ /**
2107
+ * Stop the server
2108
+ */
2109
+ stop() {
2110
+ return new Promise((resolve) => {
791
2111
  if (this.server) {
792
2112
  this.server.close(() => {
793
2113
  this.server = null;
@@ -802,9 +2122,9 @@ var LocalFileServer = class {
802
2122
  };
803
2123
 
804
2124
  // src/core/image-pdf-converter.ts
805
- import { existsSync as existsSync2, rmSync as rmSync2 } from "fs";
2125
+ import { existsSync as existsSync3, rmSync as rmSync2 } from "fs";
806
2126
  import { tmpdir } from "os";
807
- import { join as join3 } from "path";
2127
+ import { join as join5 } from "path";
808
2128
  var ImagePdfConverter = class {
809
2129
  constructor(logger) {
810
2130
  this.logger = logger;
@@ -820,8 +2140,8 @@ var ImagePdfConverter = class {
820
2140
  async convert(pdfUrl, reportId) {
821
2141
  const timestamp = Date.now();
822
2142
  const tempDir = tmpdir();
823
- const inputPath = join3(tempDir, `${reportId}-${timestamp}-input.pdf`);
824
- const outputPath = join3(tempDir, `${reportId}-${timestamp}-image.pdf`);
2143
+ const inputPath = join5(tempDir, `${reportId}-${timestamp}-input.pdf`);
2144
+ const outputPath = join5(tempDir, `${reportId}-${timestamp}-image.pdf`);
825
2145
  try {
826
2146
  this.logger.info("[ImagePdfConverter] Downloading PDF from URL...");
827
2147
  await this.downloadPdf(pdfUrl, inputPath);
@@ -830,7 +2150,7 @@ var ImagePdfConverter = class {
830
2150
  this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
831
2151
  return outputPath;
832
2152
  } finally {
833
- if (existsSync2(inputPath)) {
2153
+ if (existsSync3(inputPath)) {
834
2154
  rmSync2(inputPath, { force: true });
835
2155
  }
836
2156
  }
@@ -878,7 +2198,7 @@ var ImagePdfConverter = class {
878
2198
  * Cleanup the temporary image PDF file
879
2199
  */
880
2200
  cleanup(imagePdfPath) {
881
- if (existsSync2(imagePdfPath)) {
2201
+ if (existsSync3(imagePdfPath)) {
882
2202
  this.logger.info(
883
2203
  "[ImagePdfConverter] Cleaning up temp file:",
884
2204
  imagePdfPath
@@ -889,11 +2209,6 @@ var ImagePdfConverter = class {
889
2209
  };
890
2210
 
891
2211
  // src/core/pdf-converter.ts
892
- var _origAssertValidConversionOptions = ValidationUtils.assertValidConversionOptions.bind(ValidationUtils);
893
- ValidationUtils.assertValidConversionOptions = (options) => {
894
- const { pipeline: _pipeline, ...rest } = options;
895
- _origAssertValidConversionOptions(rest);
896
- };
897
2212
  var PDFConverter = class {
898
2213
  constructor(logger, client, enableImagePdfFallback = false, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
899
2214
  this.logger = logger;
@@ -903,9 +2218,232 @@ var PDFConverter = class {
903
2218
  }
904
2219
  async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
905
2220
  this.logger.info("[PDFConverter] Converting:", url);
2221
+ if (options.forceImagePdf) {
2222
+ return this.convertViaImagePdf(
2223
+ url,
2224
+ reportId,
2225
+ onComplete,
2226
+ cleanupAfterCallback,
2227
+ options,
2228
+ abortSignal
2229
+ );
2230
+ }
2231
+ return this.convertWithFallback(
2232
+ url,
2233
+ reportId,
2234
+ onComplete,
2235
+ cleanupAfterCallback,
2236
+ options,
2237
+ abortSignal
2238
+ );
2239
+ }
2240
+ /**
2241
+ * Convert a PDF using OCR strategy sampling to decide between ocrmac and VLM.
2242
+ *
2243
+ * Flow:
2244
+ * 1. Determine strategy (forced, skipped, or sampled via VLM)
2245
+ * 2. If VLM → OCR pipeline + VlmTextCorrector (text correction)
2246
+ * 3. If ocrmac → existing Docling conversion
2247
+ *
2248
+ * @returns ConvertWithStrategyResult with the chosen strategy and token report
2249
+ */
2250
+ async convertWithStrategy(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
2251
+ this.logger.info("[PDFConverter] Starting strategy-based conversion:", url);
2252
+ const aggregator = options.aggregator ?? new LLMTokenUsageAggregator();
2253
+ const trackedOptions = { ...options, aggregator };
2254
+ const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
2255
+ const strategy = await this.determineStrategy(
2256
+ pdfPath,
2257
+ reportId,
2258
+ trackedOptions,
2259
+ abortSignal
2260
+ );
2261
+ this.logger.info(
2262
+ `[PDFConverter] OCR strategy: ${strategy.method} (${strategy.reason})`
2263
+ );
2264
+ if (trackedOptions.onTokenUsage) {
2265
+ const samplingReport = this.buildTokenReport(aggregator);
2266
+ if (samplingReport) {
2267
+ trackedOptions.onTokenUsage(samplingReport);
2268
+ }
2269
+ }
2270
+ if (strategy.method === "vlm") {
2271
+ await this.convertWithVlm(
2272
+ pdfPath,
2273
+ reportId,
2274
+ onComplete,
2275
+ cleanupAfterCallback,
2276
+ trackedOptions,
2277
+ abortSignal,
2278
+ strategy.detectedLanguages,
2279
+ strategy.koreanHanjaMixPages
2280
+ );
2281
+ return {
2282
+ strategy,
2283
+ tokenUsageReport: this.buildTokenReport(aggregator)
2284
+ };
2285
+ }
2286
+ const ocrmacOptions = strategy.detectedLanguages ? { ...trackedOptions, ocr_lang: strategy.detectedLanguages } : trackedOptions;
2287
+ await this.convert(
2288
+ url,
2289
+ reportId,
2290
+ onComplete,
2291
+ cleanupAfterCallback,
2292
+ ocrmacOptions,
2293
+ abortSignal
2294
+ );
2295
+ return {
2296
+ strategy,
2297
+ tokenUsageReport: this.buildTokenReport(aggregator)
2298
+ };
2299
+ }
2300
+ /**
2301
+ * Build a token usage report from the aggregator.
2302
+ * Returns null when no LLM calls were tracked (e.g. forced ocrmac without sampling).
2303
+ */
2304
+ buildTokenReport(aggregator) {
2305
+ const report = aggregator.getReport();
2306
+ if (report.components.length === 0) {
2307
+ return null;
2308
+ }
2309
+ return report;
2310
+ }
2311
+ /**
2312
+ * Determine the OCR strategy based on options and page sampling.
2313
+ *
2314
+ * When sampling is possible (strategySamplerModel + local file), it always
2315
+ * runs — even with forcedMethod — so that detectedLanguages are available
2316
+ * for OCR engine configuration. The forced method simply overrides the
2317
+ * sampled method choice.
2318
+ */
2319
+ async determineStrategy(pdfPath, reportId, options, abortSignal) {
2320
+ if (options.skipSampling || !options.strategySamplerModel || !pdfPath) {
2321
+ const method = options.forcedMethod ?? "ocrmac";
2322
+ const reason = options.forcedMethod ? `Forced: ${options.forcedMethod}` : !pdfPath ? "Non-local URL, sampling skipped" : "Sampling skipped";
2323
+ return { method, reason, sampledPages: 0, totalPages: 0 };
2324
+ }
2325
+ const samplingDir = join6(process.cwd(), "output", reportId, "_sampling");
2326
+ const sampler = new OcrStrategySampler(
2327
+ this.logger,
2328
+ new PageRenderer(this.logger),
2329
+ new PdfTextExtractor(this.logger)
2330
+ );
2331
+ try {
2332
+ const strategy = await sampler.sample(
2333
+ pdfPath,
2334
+ samplingDir,
2335
+ options.strategySamplerModel,
2336
+ {
2337
+ aggregator: options.aggregator,
2338
+ abortSignal
2339
+ }
2340
+ );
2341
+ if (options.forcedMethod) {
2342
+ return {
2343
+ ...strategy,
2344
+ method: options.forcedMethod,
2345
+ reason: `Forced: ${options.forcedMethod} (${strategy.reason})`
2346
+ };
2347
+ }
2348
+ return strategy;
2349
+ } finally {
2350
+ if (existsSync4(samplingDir)) {
2351
+ rmSync3(samplingDir, { recursive: true, force: true });
2352
+ }
2353
+ }
2354
+ }
2355
+ /**
2356
+ * Execute VLM-enhanced PDF conversion.
2357
+ *
2358
+ * Runs the standard OCR pipeline (Docling) first, then applies VLM text
2359
+ * correction to fix garbled Chinese characters (漢字/Hanja) in OCR output.
2360
+ */
2361
+ async convertWithVlm(pdfPath, reportId, onComplete, cleanupAfterCallback, options, abortSignal, detectedLanguages, koreanHanjaMixPages) {
2362
+ if (!options.vlmProcessorModel) {
2363
+ throw new Error("vlmProcessorModel is required when OCR strategy is VLM");
2364
+ }
2365
+ if (!pdfPath) {
2366
+ throw new Error("VLM conversion requires a local file (file:// URL)");
2367
+ }
2368
+ const url = `file://${pdfPath}`;
2369
+ const wrappedCallback = async (outputDir) => {
2370
+ let pageTexts;
2371
+ try {
2372
+ const resultPath2 = join6(outputDir, "result.json");
2373
+ const doc = JSON.parse(readFileSync4(resultPath2, "utf-8"));
2374
+ const totalPages = Object.keys(doc.pages).length;
2375
+ const textExtractor = new PdfTextExtractor(this.logger);
2376
+ pageTexts = await textExtractor.extractText(pdfPath, totalPages);
2377
+ } catch {
2378
+ this.logger.warn(
2379
+ "[PDFConverter] pdftotext extraction failed, proceeding without text reference"
2380
+ );
2381
+ }
2382
+ const resultPath = join6(outputDir, "result.json");
2383
+ const ocrOriginPath = join6(outputDir, "result_ocr_origin.json");
2384
+ copyFileSync(resultPath, ocrOriginPath);
2385
+ const corrector = new VlmTextCorrector(this.logger);
2386
+ await corrector.correctAndSave(outputDir, options.vlmProcessorModel, {
2387
+ concurrency: options.vlmConcurrency,
2388
+ aggregator: options.aggregator,
2389
+ abortSignal,
2390
+ onTokenUsage: options.onTokenUsage,
2391
+ documentLanguages: detectedLanguages,
2392
+ pageTexts,
2393
+ koreanHanjaMixPages
2394
+ });
2395
+ await onComplete(outputDir);
2396
+ };
2397
+ const vlmOptions = detectedLanguages ? { ...options, ocr_lang: detectedLanguages } : options;
2398
+ await this.convert(
2399
+ url,
2400
+ reportId,
2401
+ wrappedCallback,
2402
+ cleanupAfterCallback,
2403
+ vlmOptions,
2404
+ abortSignal
2405
+ );
2406
+ this.logger.info("[PDFConverter] VLM conversion completed successfully");
2407
+ }
2408
+ /**
2409
+ * Convert by first creating an image PDF, then running the conversion.
2410
+ * Used when forceImagePdf option is enabled.
2411
+ */
2412
+ async convertViaImagePdf(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
2413
+ this.logger.info(
2414
+ "[PDFConverter] Force image PDF mode: converting to image PDF first..."
2415
+ );
2416
+ const imagePdfConverter = new ImagePdfConverter(this.logger);
2417
+ let imagePdfPath = null;
2418
+ try {
2419
+ imagePdfPath = await imagePdfConverter.convert(url, reportId);
2420
+ const localUrl = `file://${imagePdfPath}`;
2421
+ this.logger.info(
2422
+ "[PDFConverter] Image PDF ready, starting conversion:",
2423
+ localUrl
2424
+ );
2425
+ return await this.performConversion(
2426
+ localUrl,
2427
+ reportId,
2428
+ onComplete,
2429
+ cleanupAfterCallback,
2430
+ options,
2431
+ abortSignal
2432
+ );
2433
+ } finally {
2434
+ if (imagePdfPath) {
2435
+ imagePdfConverter.cleanup(imagePdfPath);
2436
+ }
2437
+ }
2438
+ }
2439
+ /**
2440
+ * Convert directly with optional image PDF fallback on failure.
2441
+ * Used by standard (OCR) pipeline.
2442
+ */
2443
+ async convertWithFallback(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
906
2444
  let originalError = null;
907
2445
  try {
908
- await this.performConversion(
2446
+ return await this.performConversion(
909
2447
  url,
910
2448
  reportId,
911
2449
  onComplete,
@@ -913,7 +2451,6 @@ var PDFConverter = class {
913
2451
  options,
914
2452
  abortSignal
915
2453
  );
916
- return;
917
2454
  } catch (error) {
918
2455
  if (abortSignal?.aborted) {
919
2456
  throw error;
@@ -931,7 +2468,7 @@ var PDFConverter = class {
931
2468
  imagePdfPath = await imagePdfConverter.convert(url, reportId);
932
2469
  const localUrl = `file://${imagePdfPath}`;
933
2470
  this.logger.info("[PDFConverter] Retrying with image PDF:", localUrl);
934
- await this.performConversion(
2471
+ const report = await this.performConversion(
935
2472
  localUrl,
936
2473
  reportId,
937
2474
  onComplete,
@@ -940,6 +2477,7 @@ var PDFConverter = class {
940
2477
  abortSignal
941
2478
  );
942
2479
  this.logger.info("[PDFConverter] Fallback conversion succeeded");
2480
+ return report;
943
2481
  } catch (fallbackError) {
944
2482
  this.logger.error(
945
2483
  "[PDFConverter] Fallback conversion also failed:",
@@ -954,15 +2492,10 @@ var PDFConverter = class {
954
2492
  }
955
2493
  async performConversion(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
956
2494
  const startTime = Date.now();
957
- const pipelineType = options.pipeline ?? "standard";
958
- const conversionOptions = pipelineType === "vlm" ? this.buildVlmConversionOptions(options) : this.buildConversionOptions(options);
959
- if (pipelineType === "vlm") {
960
- this.logger.info("[PDFConverter] Using VLM pipeline");
961
- } else {
962
- this.logger.info(
963
- `[PDFConverter] OCR languages: ${JSON.stringify(conversionOptions.ocr_options?.lang)}`
964
- );
965
- }
2495
+ const conversionOptions = this.buildConversionOptions(options);
2496
+ this.logger.info(
2497
+ `[PDFConverter] OCR languages: ${JSON.stringify(conversionOptions.ocr_options?.lang)}`
2498
+ );
966
2499
  this.logger.info(
967
2500
  "[PDFConverter] Converting document with Async Source API..."
968
2501
  );
@@ -990,11 +2523,12 @@ var PDFConverter = class {
990
2523
  }
991
2524
  }
992
2525
  const cwd = process.cwd();
993
- const zipPath = join4(cwd, "result.zip");
994
- const extractDir = join4(cwd, "result_extracted");
995
- const outputDir = join4(cwd, "output", reportId);
2526
+ const zipPath = join6(cwd, "result.zip");
2527
+ const extractDir = join6(cwd, "result_extracted");
2528
+ const outputDir = join6(cwd, "output", reportId);
996
2529
  try {
997
2530
  await this.processConvertedFiles(zipPath, extractDir, outputDir);
2531
+ await this.renderPageImages(url, outputDir);
998
2532
  if (abortSignal?.aborted) {
999
2533
  this.logger.info("[PDFConverter] Conversion aborted before callback");
1000
2534
  const error = new Error("PDF conversion was aborted");
@@ -1008,10 +2542,10 @@ var PDFConverter = class {
1008
2542
  this.logger.info("[PDFConverter] Total time:", duration, "ms");
1009
2543
  } finally {
1010
2544
  this.logger.info("[PDFConverter] Cleaning up temporary files...");
1011
- if (existsSync3(zipPath)) {
2545
+ if (existsSync4(zipPath)) {
1012
2546
  rmSync3(zipPath, { force: true });
1013
2547
  }
1014
- if (existsSync3(extractDir)) {
2548
+ if (existsSync4(extractDir)) {
1015
2549
  rmSync3(extractDir, { recursive: true, force: true });
1016
2550
  }
1017
2551
  if (cleanupAfterCallback) {
@@ -1019,17 +2553,27 @@ var PDFConverter = class {
1019
2553
  "[PDFConverter] Cleaning up output directory:",
1020
2554
  outputDir
1021
2555
  );
1022
- if (existsSync3(outputDir)) {
2556
+ if (existsSync4(outputDir)) {
1023
2557
  rmSync3(outputDir, { recursive: true, force: true });
1024
2558
  }
1025
2559
  } else {
1026
2560
  this.logger.info("[PDFConverter] Output preserved at:", outputDir);
1027
2561
  }
1028
2562
  }
2563
+ return null;
1029
2564
  }
1030
2565
  buildConversionOptions(options) {
1031
2566
  return {
1032
- ...omit(options, ["num_threads", "pipeline", "vlm_model"]),
2567
+ ...omit(options, [
2568
+ "num_threads",
2569
+ "forceImagePdf",
2570
+ "strategySamplerModel",
2571
+ "vlmProcessorModel",
2572
+ "skipSampling",
2573
+ "forcedMethod",
2574
+ "aggregator",
2575
+ "onTokenUsage"
2576
+ ]),
1033
2577
  to_formats: ["json", "html"],
1034
2578
  image_export_mode: "embedded",
1035
2579
  ocr_engine: "ocrmac",
@@ -1040,6 +2584,8 @@ var PDFConverter = class {
1040
2584
  framework: "livetext"
1041
2585
  },
1042
2586
  generate_picture_images: true,
2587
+ generate_page_images: false,
2588
+ // Page images are rendered by PageRenderer (ImageMagick) after conversion
1043
2589
  images_scale: 2,
1044
2590
  /**
1045
2591
  * While disabling this option yields the most accurate text extraction for readable PDFs,
@@ -1055,31 +2601,6 @@ var PDFConverter = class {
1055
2601
  }
1056
2602
  };
1057
2603
  }
1058
- /**
1059
- * Build conversion options for VLM pipeline.
1060
- *
1061
- * VLM pipeline uses a Vision Language Model instead of traditional OCR,
1062
- * providing better accuracy for KCJ characters and complex layouts.
1063
- */
1064
- buildVlmConversionOptions(options) {
1065
- const vlmModel = resolveVlmModel(options.vlm_model ?? DEFAULT_VLM_MODEL);
1066
- this.logger.info(
1067
- `[PDFConverter] VLM model: ${vlmModel.repo_id} (framework: ${vlmModel.inference_framework}, format: ${vlmModel.response_format})`
1068
- );
1069
- return {
1070
- ...omit(options, ["num_threads", "pipeline", "vlm_model", "ocr_lang"]),
1071
- to_formats: ["json", "html"],
1072
- image_export_mode: "embedded",
1073
- pipeline: "vlm",
1074
- vlm_pipeline_model_local: vlmModel,
1075
- generate_picture_images: true,
1076
- images_scale: 2,
1077
- accelerator_options: {
1078
- device: "mps",
1079
- num_threads: options.num_threads
1080
- }
1081
- };
1082
- }
1083
2604
  async startConversionTask(url, conversionOptions) {
1084
2605
  const task = await this.client.convertSourceAsync({
1085
2606
  sources: [
@@ -1146,25 +2667,64 @@ var PDFConverter = class {
1146
2667
  return;
1147
2668
  }
1148
2669
  if (status.task_status === "failure") {
1149
- throw new Error("Task failed with status: failure");
2670
+ const errorDetails = await this.getTaskFailureDetails(task);
2671
+ const elapsed = Math.round((Date.now() - conversionStartTime) / 1e3);
2672
+ this.logger.error(
2673
+ `
2674
+ [PDFConverter] Task failed after ${elapsed}s: ${errorDetails}`
2675
+ );
2676
+ throw new Error(`Task failed: ${errorDetails}`);
1150
2677
  }
1151
2678
  await new Promise(
1152
2679
  (resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
1153
2680
  );
1154
2681
  }
1155
2682
  }
2683
+ /**
2684
+ * Fetch detailed error information from a failed task result.
2685
+ */
2686
+ async getTaskFailureDetails(task) {
2687
+ try {
2688
+ const result = await task.getResult();
2689
+ if (result.errors?.length) {
2690
+ return result.errors.map((e) => e.message).join("; ");
2691
+ }
2692
+ return `status: ${result.status ?? "unknown"}`;
2693
+ } catch (err) {
2694
+ this.logger.error("[PDFConverter] Failed to retrieve task result:", err);
2695
+ return "unable to retrieve error details";
2696
+ }
2697
+ }
1156
2698
  async downloadResult(taskId) {
1157
2699
  this.logger.info(
1158
2700
  "\n[PDFConverter] Task completed, downloading ZIP file..."
1159
2701
  );
1160
2702
  const zipResult = await this.client.getTaskResultFile(taskId);
1161
- if (!zipResult.success || !zipResult.fileStream) {
1162
- throw new Error("Failed to get ZIP file result");
1163
- }
1164
- const zipPath = join4(process.cwd(), "result.zip");
2703
+ const zipPath = join6(process.cwd(), "result.zip");
1165
2704
  this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
1166
- const writeStream = createWriteStream2(zipPath);
1167
- await pipeline(zipResult.fileStream, writeStream);
2705
+ if (zipResult.fileStream) {
2706
+ const writeStream = createWriteStream2(zipPath);
2707
+ await pipeline(zipResult.fileStream, writeStream);
2708
+ return;
2709
+ }
2710
+ if (zipResult.data) {
2711
+ await writeFile(zipPath, zipResult.data);
2712
+ return;
2713
+ }
2714
+ this.logger.warn(
2715
+ "[PDFConverter] SDK file result unavailable, falling back to direct download..."
2716
+ );
2717
+ const baseUrl = this.client.getConfig().baseUrl;
2718
+ const response = await fetch(`${baseUrl}/v1/result/${taskId}`, {
2719
+ headers: { Accept: "application/zip" }
2720
+ });
2721
+ if (!response.ok) {
2722
+ throw new Error(
2723
+ `Failed to download ZIP file: ${response.status} ${response.statusText}`
2724
+ );
2725
+ }
2726
+ const buffer = new Uint8Array(await response.arrayBuffer());
2727
+ await writeFile(zipPath, buffer);
1168
2728
  }
1169
2729
  async processConvertedFiles(zipPath, extractDir, outputDir) {
1170
2730
  await ImageExtractor.extractAndSaveDocumentsFromZip(
@@ -1174,6 +2734,40 @@ var PDFConverter = class {
1174
2734
  outputDir
1175
2735
  );
1176
2736
  }
2737
+ /**
2738
+ * Render page images from the source PDF using ImageMagick and update result.json.
2739
+ * Replaces Docling's generate_page_images which fails on large PDFs
2740
+ * due to memory limits when embedding all page images as base64.
2741
+ */
2742
+ async renderPageImages(url, outputDir) {
2743
+ if (!url.startsWith("file://")) {
2744
+ this.logger.warn(
2745
+ "[PDFConverter] Page image rendering skipped: only supported for local files (file:// URLs)"
2746
+ );
2747
+ return;
2748
+ }
2749
+ const pdfPath = url.slice(7);
2750
+ this.logger.info(
2751
+ "[PDFConverter] Rendering page images with ImageMagick..."
2752
+ );
2753
+ const renderer = new PageRenderer(this.logger);
2754
+ const renderResult = await renderer.renderPages(pdfPath, outputDir);
2755
+ const resultPath = join6(outputDir, "result.json");
2756
+ const doc = JSON.parse(readFileSync4(resultPath, "utf-8"));
2757
+ for (const page of Object.values(doc.pages)) {
2758
+ const pageNo = page.page_no;
2759
+ const fileIndex = pageNo - 1;
2760
+ if (fileIndex >= 0 && fileIndex < renderResult.pageCount) {
2761
+ page.image.uri = `pages/page_${fileIndex}.png`;
2762
+ page.image.mimetype = "image/png";
2763
+ page.image.dpi = 300;
2764
+ }
2765
+ }
2766
+ await writeFile(resultPath, JSON.stringify(doc, null, 2));
2767
+ this.logger.info(
2768
+ `[PDFConverter] Rendered ${renderResult.pageCount} page images`
2769
+ );
2770
+ }
1177
2771
  };
1178
2772
 
1179
2773
  // src/core/pdf-parser.ts
@@ -1204,7 +2798,7 @@ var PDFParser = class {
1204
2798
  this.baseUrl = void 0;
1205
2799
  }
1206
2800
  this.timeout = timeout;
1207
- this.venvPath = venvPath || join5(process.cwd(), ".venv");
2801
+ this.venvPath = venvPath || join7(process.cwd(), ".venv");
1208
2802
  this.killExistingProcess = killExistingProcess;
1209
2803
  this.enableImagePdfFallback = enableImagePdfFallback;
1210
2804
  }
@@ -1212,6 +2806,7 @@ var PDFParser = class {
1212
2806
  this.logger.info("[PDFParser] Initializing...");
1213
2807
  this.checkOperatingSystem();
1214
2808
  this.checkJqInstalled();
2809
+ this.checkPopplerInstalled();
1215
2810
  this.checkMacOSVersion();
1216
2811
  if (this.enableImagePdfFallback && !this.baseUrl) {
1217
2812
  this.checkImageMagickInstalled();
@@ -1253,9 +2848,9 @@ var PDFParser = class {
1253
2848
  }
1254
2849
  }
1255
2850
  checkOperatingSystem() {
1256
- if (platform2() !== "darwin") {
2851
+ if (platform() !== "darwin") {
1257
2852
  throw new Error(
1258
- "PDFParser is only supported on macOS. Current platform: " + platform2()
2853
+ "PDFParser is only supported on macOS. Current platform: " + platform()
1259
2854
  );
1260
2855
  }
1261
2856
  }
@@ -1268,6 +2863,15 @@ var PDFParser = class {
1268
2863
  );
1269
2864
  }
1270
2865
  }
2866
+ checkPopplerInstalled() {
2867
+ try {
2868
+ execSync("which pdftotext", { stdio: "ignore" });
2869
+ } catch {
2870
+ throw new Error(
2871
+ "poppler is not installed. Please install poppler using: brew install poppler"
2872
+ );
2873
+ }
2874
+ }
1271
2875
  checkMacOSVersion() {
1272
2876
  try {
1273
2877
  const versionOutput = execSync("sw_vers -productVersion", {
@@ -1314,8 +2918,12 @@ var PDFParser = class {
1314
2918
  */
1315
2919
/**
 * Tell whether an error reports a refused connection (ECONNREFUSED).
 *
 * Walks the error itself, its `cause` chain, and the `errors` of any
 * aggregate error, matching either `code === "ECONNREFUSED"` or a message
 * containing "ECONNREFUSED". A wrapped refusal can sit several causes deep,
 * or be an aggregate error whose own message is empty, so checking only the
 * first two messages misses it.
 *
 * @param error Value caught from a failed request
 * @returns true when a refused connection is found; false otherwise,
 *          including when `error` is not an Error instance
 */
isConnectionRefusedError(error) {
  if (!(error instanceof Error)) {
    return false;
  }
  const visited = new Set();
  const pending = [error];
  while (pending.length > 0) {
    const current = pending.pop();
    // The visited set protects against cyclic cause chains.
    if (!(current instanceof Error) || visited.has(current)) {
      continue;
    }
    visited.add(current);
    if (current.code === "ECONNREFUSED" || String(current.message).includes("ECONNREFUSED")) {
      return true;
    }
    pending.push(current.cause);
    if (Array.isArray(current.errors)) {
      pending.push(...current.errors);
    }
  }
  return false;
}
@@ -1383,11 +2991,20 @@ var PDFParser = class {
1383
2991
  "PDFParser is not initialized. Call init() before using parse()"
1384
2992
  );
1385
2993
  }
1386
- if (options.pipeline === "vlm" && this.environment && !this.baseUrl) {
1387
- this.logger.info(
1388
- "[PDFParser] VLM pipeline requested, ensuring VLM dependencies..."
2994
+ if (options.forceImagePdf && !this.baseUrl) {
2995
+ this.checkImageMagickInstalled();
2996
+ this.checkGhostscriptInstalled();
2997
+ }
2998
+ const useStrategyFlow = options.strategySamplerModel !== void 0 || options.forcedMethod !== void 0;
2999
+ if (useStrategyFlow) {
3000
+ return this.parseWithStrategy(
3001
+ url,
3002
+ reportId,
3003
+ onComplete,
3004
+ cleanupAfterCallback,
3005
+ options,
3006
+ abortSignal
1389
3007
  );
1390
- await this.environment.setupVlmDependencies();
1391
3008
  }
1392
3009
  const canRecover = !this.baseUrl && this.port !== void 0;
1393
3010
  const maxAttempts = PDF_PARSER.MAX_SERVER_RECOVERY_ATTEMPTS;
@@ -1424,6 +3041,53 @@ var PDFParser = class {
1424
3041
  throw error;
1425
3042
  }
1426
3043
  }
3044
+ return null;
3045
+ }
3046
+ /**
3047
+ * Parse a PDF using OCR strategy sampling to decide between ocrmac and VLM.
3048
+ * Delegates to PDFConverter.convertWithStrategy() and returns the token usage report.
3049
+ *
3050
+ * Server recovery (restart on ECONNREFUSED) is preserved because
3051
+ * the ocrmac path still uses the Docling server.
3052
+ */
3053
+ async parseWithStrategy(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
3054
+ const canRecover = !this.baseUrl && this.port !== void 0;
3055
+ const maxAttempts = PDF_PARSER.MAX_SERVER_RECOVERY_ATTEMPTS;
3056
+ let attempt = 0;
3057
+ while (attempt <= maxAttempts) {
3058
+ try {
3059
+ const effectiveFallbackEnabled = this.enableImagePdfFallback && !this.baseUrl;
3060
+ const converter = new PDFConverter(
3061
+ this.logger,
3062
+ this.client,
3063
+ effectiveFallbackEnabled,
3064
+ this.timeout
3065
+ );
3066
+ const result = await converter.convertWithStrategy(
3067
+ url,
3068
+ reportId,
3069
+ onComplete,
3070
+ cleanupAfterCallback,
3071
+ options,
3072
+ abortSignal
3073
+ );
3074
+ return result.tokenUsageReport;
3075
+ } catch (error) {
3076
+ if (abortSignal?.aborted) {
3077
+ throw error;
3078
+ }
3079
+ if (canRecover && this.isConnectionRefusedError(error) && attempt < maxAttempts) {
3080
+ this.logger.warn(
3081
+ "[PDFParser] Connection refused, attempting server recovery..."
3082
+ );
3083
+ await this.restartServer();
3084
+ attempt++;
3085
+ continue;
3086
+ }
3087
+ throw error;
3088
+ }
3089
+ }
3090
+ return null;
1427
3091
  }
1428
3092
  /**
1429
3093
  * Dispose the parser instance.
@@ -1445,11 +3109,173 @@ var PDFParser = class {
1445
3109
  }
1446
3110
  }
1447
3111
  };
3112
+
3113
+ // src/validators/vlm-response-validator.ts
3114
// Script-anomaly check is skipped below this many non-whitespace characters.
var MIN_CONTENT_LENGTH = 20;
// Minimum share of Hangul/CJK characters expected in a Korean document.
var KOREAN_SCRIPT_RATIO_THRESHOLD = 0.1;
var PLACEHOLDER_PATTERNS = [
  /lorem\s+ipsum/i,
  /dolor\s+sit\s+amet/i,
  /consectetur\s+adipiscing/i,
  /sed\s+do\s+eiusmod/i,
  /ut\s+labore\s+et\s+dolore/i
];
var META_DESCRIPTION_PATTERNS_KO = [
  /이미지\s*해상도/,
  /판독하기?\s*어렵/,
  /해상도가?\s*(매우\s*)?(낮|부족)/,
  /텍스트를?\s*판독/,
  /글자를?\s*읽기?\s*어렵/,
  /정확한?\s*판독이?\s*(불가|어렵)/
];
var META_DESCRIPTION_PATTERNS_EN = [
  /the image contains/i,
  /unable to (read|transcribe)/i,
  /resolution.*(too low|insufficient)/i,
  /cannot (read|make out|decipher)/i,
  /text is (not |un)?(legible|readable)/i,
  /exact transcription is not possible/i
];
// A page is flagged when repeated-character runs reach this share of its text.
var REPETITIVE_PATTERN_RATIO_THRESHOLD = 0.3;
// Minimum number of repeats for a run to count as repetitive.
var REPETITIVE_PATTERN_MIN_REPEATS = 5;
var HANGUL_REGEX2 = /[\uAC00-\uD7AF\u1100-\u11FF]/g;
var CJK_REGEX2 = /[\u4E00-\u9FFF]/g;
var VlmResponseValidator = class {
  /**
   * Run every quality check on the text elements of one page.
   *
   * Picture elements and elements with empty content are ignored; a page
   * left with no text elements is reported as valid. The script check only
   * runs when the first document language starts with "ko".
   *
   * @param elements - Extracted page elements to validate
   * @param documentLanguages - BCP 47 language tags (e.g., ['ko-KR', 'en-US'])
   * @returns `{ isValid, issues }`, valid when no check reported an issue
   */
  static validate(elements, documentLanguages) {
    const candidates = elements.filter(
      (item) => item.type !== "picture" && item.content.length > 0
    );
    if (candidates.length === 0) {
      return { isValid: true, issues: [] };
    }
    const expectsKorean = Boolean(documentLanguages?.[0]?.startsWith("ko"));
    // Checks run (and are reported) in this fixed order.
    const findings = [
      this.detectPlaceholderText(candidates),
      expectsKorean ? this.detectScriptAnomaly(candidates) : null,
      this.detectMetaDescription(candidates),
      this.detectRepetitivePattern(candidates)
    ];
    const issues = findings.filter((finding) => finding);
    return { isValid: issues.length === 0, issues };
  }
  /**
   * Flag elements containing known filler text (Lorem ipsum and friends).
   *
   * @returns A "placeholder_text" issue listing the affected `order` values, or null
   */
  static detectPlaceholderText(elements) {
    const affectedElements = elements
      .filter((item) => PLACEHOLDER_PATTERNS.some((pattern) => pattern.test(item.content)))
      .map((item) => item.order);
    if (affectedElements.length === 0) {
      return null;
    }
    return {
      type: "placeholder_text",
      message: `Detected placeholder text (e.g., Lorem ipsum) in ${affectedElements.length} element(s)`,
      affectedElements
    };
  }
  /**
   * Flag a page whose combined text holds too few Hangul/CJK characters.
   * The ratio is taken against the non-whitespace character count; pages
   * shorter than MIN_CONTENT_LENGTH non-whitespace characters are not judged.
   *
   * @returns A "script_anomaly" issue covering every element, or null
   */
  static detectScriptAnomaly(elements) {
    const combined = elements.map((item) => item.content).join("");
    const visibleLength = combined.replace(/\s/g, "").length;
    if (visibleLength < MIN_CONTENT_LENGTH) {
      return null;
    }
    const countMatches = (pattern) => (combined.match(pattern) ?? []).length;
    const ratio = (countMatches(HANGUL_REGEX2) + countMatches(CJK_REGEX2)) / visibleLength;
    if (!(ratio < KOREAN_SCRIPT_RATIO_THRESHOLD)) {
      return null;
    }
    return {
      type: "script_anomaly",
      message: `Expected Korean text but found ${(ratio * 100).toFixed(1)}% Korean/CJK characters (threshold: ${KOREAN_SCRIPT_RATIO_THRESHOLD * 100}%)`,
      affectedElements: elements.map((item) => item.order)
    };
  }
  /**
   * Flag elements where the model described the image or its legibility
   * (Korean or English phrasing) instead of transcribing the text.
   *
   * @returns A "meta_description" issue listing the affected `order` values, or null
   */
  static detectMetaDescription(elements) {
    const describesImage = (text) =>
      META_DESCRIPTION_PATTERNS_KO.some((pattern) => pattern.test(text)) ||
      META_DESCRIPTION_PATTERNS_EN.some((pattern) => pattern.test(text));
    const affectedElements = elements
      .filter((item) => describesImage(item.content))
      .map((item) => item.order);
    if (affectedElements.length === 0) {
      return null;
    }
    return {
      type: "meta_description",
      message: `Detected meta-description of image instead of text transcription in ${affectedElements.length} element(s)`,
      affectedElements
    };
  }
  /**
   * Flag pages dominated by runs of one character separated by whitespace
   * (e.g., `: : : : :` or `= = = = =`). A run counts once the character
   * appears REPETITIVE_PATTERN_MIN_REPEATS times or more; the page is flagged
   * when such runs make up at least REPETITIVE_PATTERN_RATIO_THRESHOLD of the
   * newline-joined content.
   *
   * @returns A "repetitive_pattern" issue covering every element, or null
   */
  static detectRepetitivePattern(elements) {
    const combined = elements.map((item) => item.content).join("\n");
    if (combined.trim().length === 0) {
      return null;
    }
    let repetitiveLength = 0;
    for (const [run, repeatedChar] of combined.matchAll(/(\S)(\s+\1){4,}/g)) {
      const occurrences = run.split(/\s+/).filter((token) => token === repeatedChar).length;
      if (occurrences >= REPETITIVE_PATTERN_MIN_REPEATS) {
        repetitiveLength += run.length;
      }
    }
    if (repetitiveLength === 0) {
      return null;
    }
    const ratio = repetitiveLength / combined.length;
    if (ratio < REPETITIVE_PATTERN_RATIO_THRESHOLD) {
      return null;
    }
    return {
      type: "repetitive_pattern",
      message: `Detected repetitive character patterns (${(ratio * 100).toFixed(0)}% of content)`,
      affectedElements: elements.map((item) => item.order)
    };
  }
};
1448
3276
  export {
1449
- DEFAULT_VLM_MODEL,
1450
3277
  ImagePdfFallbackError,
1451
3278
  PDFParser,
1452
- VLM_MODELS,
1453
- resolveVlmModel
3279
+ VlmResponseValidator
1454
3280
  };
1455
3281
  //# sourceMappingURL=index.js.map