@heripo/document-processor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,4262 @@
1
+ // ../shared/dist/index.mjs
2
+ import { Output, generateText } from "ai";
3
var BatchProcessor = class {
  /**
   * Splits an array into batches of specified size.
   *
   * @param items - Array to split
   * @param batchSize - Size of each batch (must be a positive integer)
   * @returns Array of batches
   * @throws {RangeError} If batchSize is not a positive integer — a zero or
   *   negative step would otherwise make the loop below spin forever.
   *
   * @example
   * ```typescript
   * const items = [1, 2, 3, 4, 5];
   * const batches = BatchProcessor.createBatches(items, 2);
   * // [[1, 2], [3, 4], [5]]
   * ```
   */
  static createBatches(items, batchSize) {
    if (!Number.isInteger(batchSize) || batchSize <= 0) {
      throw new RangeError(`batchSize must be a positive integer, got ${batchSize}`);
    }
    const batches = [];
    for (let i = 0; i < items.length; i += batchSize) {
      batches.push(items.slice(i, i + batchSize));
    }
    return batches;
  }
  /**
   * Splits an array into batches and executes async function in parallel.
   *
   * All batches are dispatched at once via Promise.all (fail-fast: the first
   * rejected batch rejects the whole call).
   *
   * @param items - Array to process
   * @param batchSize - Size of each batch (must be a positive integer)
   * @param processFn - Async function to process each batch
   * @returns Flattened array of processed results
   * @throws {RangeError} If batchSize is not a positive integer
   *
   * @example
   * ```typescript
   * const texts = ['a', 'b', 'c', 'd', 'e'];
   * const results = await BatchProcessor.processBatch(
   *   texts,
   *   2,
   *   async (batch) => {
   *     return batch.map(t => t.toUpperCase());
   *   }
   * );
   * // ['A', 'B', 'C', 'D', 'E']
   * ```
   */
  static async processBatch(items, batchSize, processFn) {
    const batches = this.createBatches(items, batchSize);
    const results = await Promise.all(batches.map((batch) => processFn(batch)));
    return results.flat();
  }
  /**
   * Splits an array into batches and executes sync function on each batch.
   *
   * @param items - Array to process
   * @param batchSize - Size of each batch (must be a positive integer)
   * @param processFn - Sync function to process each batch
   * @returns Flattened array of processed results
   * @throws {RangeError} If batchSize is not a positive integer
   *
   * @example
   * ```typescript
   * const numbers = [1, 2, 3, 4, 5];
   * const results = BatchProcessor.processBatchSync(
   *   numbers,
   *   2,
   *   (batch) => batch.map(n => n * 2)
   * );
   * // [2, 4, 6, 8, 10]
   * ```
   */
  static processBatchSync(items, batchSize, processFn) {
    const batches = this.createBatches(items, batchSize);
    const results = batches.map((batch) => processFn(batch));
    return results.flat();
  }
};
76
var LLMCaller = class {
  /**
   * Extract a human-readable model name from a LanguageModel object.
   *
   * Probes the common identifier fields in priority order and falls back to
   * String() coercion when none of them is a string.
   */
  static extractModelName(model) {
    const source = model;
    for (const key of ["modelId", "id", "model", "name"]) {
      if (typeof source[key] === "string") {
        return source[key];
      }
    }
    return String(model);
  }
  /**
   * Build the extended usage record for a single LLM response.
   *
   * Missing token counts default to 0 so downstream aggregation can always
   * add them safely.
   */
  static buildUsage(config, modelName, response, usedFallback) {
    const tokens = response.usage ?? {};
    return {
      component: config.component,
      phase: config.phase,
      model: usedFallback ? "fallback" : "primary",
      modelName,
      inputTokens: tokens.inputTokens ?? 0,
      outputTokens: tokens.outputTokens ?? 0,
      totalTokens: tokens.totalTokens ?? 0
    };
  }
  /**
   * Execute an LLM call with fallback support.
   *
   * Runs `generateFn` against the primary model first; if that throws and the
   * call was not aborted and a fallback model is configured, the fallback is
   * tried once. Otherwise the primary error is rethrown unchanged.
   */
  static async executeWithFallback(config, generateFn) {
    const runWith = async (model, usedFallback) => {
      const response = await generateFn(model);
      return {
        output: response.output,
        usage: this.buildUsage(
          config,
          this.extractModelName(model),
          response,
          usedFallback
        ),
        usedFallback
      };
    };
    try {
      return await runWith(config.primaryModel, false);
    } catch (primaryError) {
      if (config.abortSignal?.aborted || !config.fallbackModel) {
        throw primaryError;
      }
      return await runWith(config.fallbackModel, true);
    }
  }
  /**
   * Call LLM with retry and fallback support.
   *
   * Per-model retries are handled by generateText via `maxRetries`; when the
   * primary model is exhausted, the configured fallback model (if any) is
   * attempted via executeWithFallback().
   *
   * @template TOutput - Output type from schema validation
   * @param config - LLM call configuration
   * @returns Result with parsed object and usage information
   * @throws Error if all retry attempts fail
   */
  static async call(config) {
    const generate = (model) =>
      generateText({
        model,
        output: Output.object({ schema: config.schema }),
        system: config.systemPrompt,
        prompt: config.userPrompt,
        temperature: config.temperature,
        maxRetries: config.maxRetries,
        abortSignal: config.abortSignal
      });
    return this.executeWithFallback(config, generate);
  }
  /**
   * Call LLM for vision tasks with message format support.
   *
   * Same retry and fallback behavior as call(), but sends `messages` instead
   * of system/user prompts.
   *
   * @template TOutput - Output type from schema validation
   * @param config - LLM vision call configuration
   * @returns Result with parsed object and usage information
   * @throws Error if all retry attempts fail
   */
  static async callVision(config) {
    const generate = (model) =>
      generateText({
        model,
        output: Output.object({ schema: config.schema }),
        messages: config.messages,
        temperature: config.temperature,
        maxRetries: config.maxRetries,
        abortSignal: config.abortSignal
      });
    return this.executeWithFallback(config, generate);
  }
};
189
/**
 * Render a token-count triple as a human-readable summary string.
 */
function formatTokens(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  return `${inputTokens} input, ${outputTokens} output, ${totalTokens} total`;
}
192
var LLMTokenUsageAggregator = class _LLMTokenUsageAggregator {
  // Aggregated usage keyed by component name; each entry holds per-phase
  // primary/fallback buckets plus running totals.
  usage = {};
  /**
   * Create a zeroed token-count triple.
   */
  static #zeroTotals() {
    return { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
  }
  /**
   * Add the token counts carried by `source` onto `target` in place.
   */
  static #accumulate(target, source) {
    target.inputTokens += source.inputTokens;
    target.outputTokens += source.outputTokens;
    target.totalTokens += source.totalTokens;
  }
  /**
   * Copy a token-count triple, stripping any extra fields.
   */
  static #copyTotals(totals) {
    return {
      inputTokens: totals.inputTokens,
      outputTokens: totals.outputTokens,
      totalTokens: totals.totalTokens
    };
  }
  /**
   * Copy a per-model usage record (model name plus token counts).
   */
  static #copyModelUsage(modelUsage) {
    return {
      modelName: modelUsage.modelName,
      inputTokens: modelUsage.inputTokens,
      outputTokens: modelUsage.outputTokens,
      totalTokens: modelUsage.totalTokens
    };
  }
  /**
   * Track token usage from an LLM call.
   *
   * Lazily creates the component, phase, and model buckets, then adds the
   * call's token counts to the model bucket ("primary" or "fallback"), the
   * phase total, and the component total. Unknown model kinds still count
   * toward the phase and component totals.
   *
   * @param usage - Extended token usage with component/phase/model information
   */
  track(usage) {
    if (!this.usage[usage.component]) {
      this.usage[usage.component] = {
        component: usage.component,
        phases: {},
        total: _LLMTokenUsageAggregator.#zeroTotals()
      };
    }
    const component = this.usage[usage.component];
    if (!component.phases[usage.phase]) {
      component.phases[usage.phase] = {
        total: _LLMTokenUsageAggregator.#zeroTotals()
      };
    }
    const phase = component.phases[usage.phase];
    if (usage.model === "primary" || usage.model === "fallback") {
      if (!phase[usage.model]) {
        phase[usage.model] = {
          modelName: usage.modelName,
          ..._LLMTokenUsageAggregator.#zeroTotals()
        };
      }
      _LLMTokenUsageAggregator.#accumulate(phase[usage.model], usage);
    }
    _LLMTokenUsageAggregator.#accumulate(phase.total, usage);
    _LLMTokenUsageAggregator.#accumulate(component.total, usage);
  }
  /**
   * Get aggregated usage grouped by component.
   *
   * @returns Array of component aggregates with phase breakdown
   */
  getByComponent() {
    return Object.values(this.usage);
  }
  /**
   * Get token usage report in structured JSON format.
   *
   * Converts internal usage data to the external TokenUsageReport shape
   * suitable for serialization: component breakdown, phase-level details,
   * and primary/fallback model usage where present.
   *
   * @returns Structured token usage report with components and total
   */
  getReport() {
    const components = Object.values(this.usage).map((component) => ({
      component: component.component,
      phases: Object.entries(component.phases).map(([phaseName, phaseData]) => {
        const phaseReport = {
          phase: phaseName,
          total: _LLMTokenUsageAggregator.#copyTotals(phaseData.total)
        };
        if (phaseData.primary) {
          phaseReport.primary = _LLMTokenUsageAggregator.#copyModelUsage(
            phaseData.primary
          );
        }
        if (phaseData.fallback) {
          phaseReport.fallback = _LLMTokenUsageAggregator.#copyModelUsage(
            phaseData.fallback
          );
        }
        return phaseReport;
      }),
      total: _LLMTokenUsageAggregator.#copyTotals(component.total)
    }));
    return {
      components,
      total: _LLMTokenUsageAggregator.#copyTotals(this.getTotalUsage())
    };
  }
  /**
   * Get total usage across all components and phases.
   *
   * @returns Aggregated token usage totals
   */
  getTotalUsage() {
    const total = _LLMTokenUsageAggregator.#zeroTotals();
    for (const component of Object.values(this.usage)) {
      _LLMTokenUsageAggregator.#accumulate(total, component.total);
    }
    return total;
  }
  /**
   * Log comprehensive token usage summary.
   *
   * Outputs usage grouped by component, with phase and model breakdown.
   * Primary and fallback usage are shown separately per phase, followed by
   * grand totals. Call this once at the end of document processing.
   *
   * @param logger - Logger instance for output
   */
  logSummary(logger) {
    const components = this.getByComponent();
    if (components.length === 0) {
      logger.info("[DocumentProcessor] No token usage to report");
      return;
    }
    logger.info("[DocumentProcessor] Token usage summary:");
    logger.info("");
    const grand = _LLMTokenUsageAggregator.#zeroTotals();
    const grandPrimary = _LLMTokenUsageAggregator.#zeroTotals();
    const grandFallback = _LLMTokenUsageAggregator.#zeroTotals();
    for (const component of components) {
      logger.info(`${component.component}:`);
      for (const [phase, phaseData] of Object.entries(component.phases)) {
        logger.info(` - ${phase}:`);
        if (phaseData.primary) {
          logger.info(
            ` primary (${phaseData.primary.modelName}): ${formatTokens(phaseData.primary)}`
          );
          _LLMTokenUsageAggregator.#accumulate(grandPrimary, phaseData.primary);
        }
        if (phaseData.fallback) {
          logger.info(
            ` fallback (${phaseData.fallback.modelName}): ${formatTokens(phaseData.fallback)}`
          );
          _LLMTokenUsageAggregator.#accumulate(grandFallback, phaseData.fallback);
        }
        logger.info(` subtotal: ${formatTokens(phaseData.total)}`);
      }
      logger.info(
        ` ${component.component} total: ${formatTokens(component.total)}`
      );
      logger.info("");
      _LLMTokenUsageAggregator.#accumulate(grand, component.total);
    }
    logger.info("--- Summary ---");
    if (grandPrimary.totalTokens > 0) {
      logger.info(`Primary total: ${formatTokens(grandPrimary)}`);
    }
    if (grandFallback.totalTokens > 0) {
      logger.info(`Fallback total: ${formatTokens(grandFallback)}`);
    }
    logger.info(`Grand total: ${formatTokens(grand)}`);
  }
  /**
   * Reset all tracked usage.
   *
   * Call this at the start of a new document processing run.
   */
  reset() {
    this.usage = {};
  }
};
434
+
435
+ // src/utils/ref-resolver.ts
436
var RefResolver = class {
  logger;
  textMap;
  pictureMap;
  tableMap;
  groupMap;
  /**
   * Index all referenceable collections of a DoclingDocument by self_ref
   * so $ref strings can be resolved in O(1).
   */
  constructor(logger, doc) {
    this.logger = logger;
    this.logger.info("[RefResolver] Initializing reference resolver...");
    this.textMap = this.buildIndex(doc.texts, "texts");
    this.pictureMap = this.buildIndex(doc.pictures, "pictures");
    this.tableMap = this.buildIndex(doc.tables, "tables");
    this.groupMap = this.buildIndex(doc.groups, "groups");
    this.logger.info(
      `[RefResolver] Indexed ${this.textMap.size} texts, ${this.pictureMap.size} pictures, ${this.tableMap.size} tables, ${this.groupMap.size} groups`
    );
  }
  /**
   * Build an index mapping self_ref to the actual item
   */
  buildIndex(items, _prefix) {
    const map = /* @__PURE__ */ new Map();
    for (const item of items) {
      map.set(item.self_ref, item);
    }
    return map;
  }
  /**
   * Map a collection name to its index and the label used in warnings.
   * Returns null for unknown collection names.
   */
  #lookup(collection) {
    switch (collection) {
      case "texts":
        return { map: this.textMap, label: "Text" };
      case "pictures":
        return { map: this.pictureMap, label: "Picture" };
      case "tables":
        return { map: this.tableMap, label: "Table" };
      case "groups":
        return { map: this.groupMap, label: "Group" };
      default:
        return null;
    }
  }
  /**
   * Resolve a $ref string to the actual item
   * @param ref - Reference string (e.g., "#/texts/0")
   * @returns The resolved item, or null if not found
   */
  resolve(ref) {
    const match = ref.match(/^#\/(\w+)\//);
    if (!match) {
      this.logger.warn(`[RefResolver] Invalid reference format: ${ref}`);
      return null;
    }
    const collection = match[1];
    const entry = this.#lookup(collection);
    if (!entry) {
      this.logger.warn(`[RefResolver] Unknown collection type: ${collection}`);
      return null;
    }
    const result = entry.map.get(ref) ?? null;
    if (!result) {
      this.logger.warn(`[RefResolver] ${entry.label} reference not found: ${ref}`);
    }
    return result;
  }
  /**
   * Resolve a text reference
   * @param ref - Reference string (e.g., "#/texts/0")
   * @returns The resolved text item, or null if not found
   */
  resolveText(ref) {
    return this.textMap.get(ref) ?? null;
  }
  /**
   * Resolve a picture reference
   * @param ref - Reference string (e.g., "#/pictures/0")
   * @returns The resolved picture item, or null if not found
   */
  resolvePicture(ref) {
    return this.pictureMap.get(ref) ?? null;
  }
  /**
   * Resolve a table reference
   * @param ref - Reference string (e.g., "#/tables/0")
   * @returns The resolved table item, or null if not found
   */
  resolveTable(ref) {
    return this.tableMap.get(ref) ?? null;
  }
  /**
   * Resolve a group reference
   * @param ref - Reference string (e.g., "#/groups/0")
   * @returns The resolved group item, or null if not found
   */
  resolveGroup(ref) {
    return this.groupMap.get(ref) ?? null;
  }
  /**
   * Resolve multiple references at once
   * @param refs - Array of reference objects with $ref property
   * @returns Array of resolved items (null for unresolved references)
   */
  resolveMany(refs) {
    return refs.map((ref) => this.resolve(ref.$ref));
  }
};
547
+
548
+ // src/utils/id-generator.ts
549
var IdGenerator = class {
  // Independent monotonically increasing counters, one per ID family.
  chapterCounter = 0;
  imageCounter = 0;
  tableCounter = 0;
  footnoteCounter = 0;
  /**
   * Generate a chapter ID
   * @returns A chapter ID in the format "ch-001"
   */
  generateChapterId() {
    this.chapterCounter += 1;
    return `ch-${this.padNumber(this.chapterCounter)}`;
  }
  /**
   * Generate an image ID
   * @returns An image ID in the format "img-001"
   */
  generateImageId() {
    this.imageCounter += 1;
    return `img-${this.padNumber(this.imageCounter)}`;
  }
  /**
   * Generate a table ID
   * @returns A table ID in the format "tbl-001"
   */
  generateTableId() {
    this.tableCounter += 1;
    return `tbl-${this.padNumber(this.tableCounter)}`;
  }
  /**
   * Generate a footnote ID
   * @returns A footnote ID in the format "ftn-001"
   */
  generateFootnoteId() {
    this.footnoteCounter += 1;
    return `ftn-${this.padNumber(this.footnoteCounter)}`;
  }
  /**
   * Reset all counters to zero
   */
  reset() {
    this.chapterCounter = 0;
    this.imageCounter = 0;
    this.tableCounter = 0;
    this.footnoteCounter = 0;
  }
  /**
   * Get current counter values (for testing/debugging)
   */
  getCounters() {
    const { chapterCounter, imageCounter, tableCounter, footnoteCounter } = this;
    return {
      chapter: chapterCounter,
      image: imageCounter,
      table: tableCounter,
      footnote: footnoteCounter
    };
  }
  /**
   * Pad a number to 3 digits with leading zeros
   */
  padNumber(num) {
    return String(num).padStart(3, "0");
  }
};
613
+
614
+ // src/utils/text-cleaner.ts
615
var TextCleaner = class {
  /**
   * Normalizes text:
   * - Unicode NFC normalization
   * - tabs / non-breaking / zero-width-range spaces become regular spaces
   * - line breaks and runs of whitespace collapse to a single space
   * - leading/trailing whitespace is trimmed
   */
  static normalize(text) {
    if (!text) return "";
    return text
      .normalize("NFC")
      .replace(/[\t\u00A0\u2000-\u200B]/g, " ")
      .replace(/[\r\n]+/g, " ")
      .replace(/\s+/g, " ")
      .trim();
  }
  /**
   * Clean text starting/ending with punctuation marks:
   * - strips leading punctuation (plus following spaces)
   * - strips trailing whitespace together with any punctuation after it
   */
  static cleanPunctuation(text) {
    if (!text) return "";
    return text
      .replace(/^[,.:;!?]+\s*/, "")
      .replace(/\s+[,.:;!?]*$/, "");
  }
  /**
   * Reject empty text and text consisting only of digits and spaces.
   */
  static isValidText(text) {
    if (!text) return false;
    return !/^\s*[\d\s]*$/.test(this.normalize(text));
  }
  /**
   * Normalize every string in the array (for bulk processing).
   */
  static normalizeBatch(texts) {
    return texts.map((text) => this.normalize(text));
  }
  /**
   * Keep only strings that pass isValidText.
   */
  static filterValidTexts(texts) {
    return texts.filter((text) => this.isValidText(text));
  }
  /**
   * Batch normalization + filtering (stage 1 + stage 2 combined).
   *
   * Normalizes every string, then drops the ones that are empty or
   * digits-and-spaces only. With a positive batchSize the work is routed
   * through BatchProcessor; with batchSize 0 the items are processed
   * directly without batching.
   *
   * @param texts - Original text array
   * @param batchSize - Batch size (default: 10). Set to 0 for sequential processing without batching.
   * @returns Normalized and filtered text array
   *
   * @example
   * ```typescript
   * const rawTexts = [' text 1 ', '123', 'text 2\n'];
   * const cleaned = TextCleaner.normalizeAndFilterBatch(rawTexts, 10);
   * // ['text 1', 'text 2']
   *
   * // Sequential processing (no batching)
   * const cleanedSequential = TextCleaner.normalizeAndFilterBatch(rawTexts, 0);
   * // ['text 1', 'text 2']
   * ```
   */
  static normalizeAndFilterBatch(texts, batchSize = 10) {
    if (batchSize === 0) {
      return texts
        .map((text) => this.normalize(text))
        .filter((text) => this.isValidText(text));
    }
    return BatchProcessor.processBatchSync(texts, batchSize, (batch) =>
      this.filterValidTexts(this.normalizeBatch(batch))
    );
  }
};
702
+
703
+ // src/utils/markdown-converter.ts
704
var MarkdownConverter = class Self {
  /**
   * Convert TOC items (groups/tables/texts) to a Markdown string.
   *
   * Each resolved item is rendered according to its shape: group-like items
   * become nested lists, table items become Markdown tables, and plain text
   * items become single list lines. Unresolvable refs are skipped.
   *
   * @param refs - Array of item references from TocAreaResult
   * @param refResolver - RefResolver for resolving references
   * @returns Markdown string representation of TOC
   */
  static convert(refs, refResolver) {
    if (refs.length === 0) {
      return "";
    }
    const rendered = [];
    for (const ref of refs) {
      const item = refResolver.resolve(ref);
      if (!item) {
        continue;
      }
      let markdown = "";
      if ("name" in item && (item.name === "list" || item.name === "group")) {
        markdown = Self.groupToMarkdown(item, refResolver, 0);
      } else if ("data" in item && "grid" in item.data) {
        markdown = Self.tableToMarkdown(item);
      } else if ("text" in item && "orig" in item) {
        markdown = Self.textToMarkdown(item, 0);
      }
      if (markdown) {
        rendered.push(markdown);
      }
    }
    return rendered.join("\n\n");
  }
  /**
   * Convert a group item to Markdown list format.
   *
   * Handles nested lists and preserves hierarchy: nested groups recurse with
   * an increased indent level, text children render at the current level.
   *
   * @example
   * Output:
   * - Chapter 1 Introduction ..... 1
   *   - 1.1 Background ..... 3
   *   - 1.2 Objectives ..... 5
   * - Chapter 2 Methodology ..... 10
   */
  static groupToMarkdown(group, refResolver, indentLevel = 0) {
    const rendered = [];
    for (const childRef of group.children) {
      const child = refResolver.resolve(childRef.$ref);
      if (!child) {
        continue;
      }
      let markdown = "";
      if ("name" in child && (child.name === "list" || child.name === "group")) {
        markdown = Self.groupToMarkdown(child, refResolver, indentLevel + 1);
      } else if ("text" in child && "orig" in child) {
        markdown = Self.textToMarkdown(child, indentLevel);
      }
      if (markdown) {
        rendered.push(markdown);
      }
    }
    return rendered.join("\n");
  }
  /**
   * Convert a table item to Markdown table format.
   *
   * The first non-empty row is treated as the header and followed by a
   * separator row.
   *
   * @example
   * Output:
   * | Chapter | Page |
   * |---------|------|
   * | Chapter 1 Introduction | 1 |
   * | Chapter 2 Methodology | 10 |
   */
  static tableToMarkdown(table) {
    const { grid } = table.data;
    if (!grid || grid.length === 0) {
      return "";
    }
    const rendered = [];
    grid.forEach((row, rowIdx) => {
      if (!row || row.length === 0) {
        return;
      }
      const cells = row.map((cell) => Self.escapeTableCell(cell.text));
      rendered.push(`| ${cells.join(" | ")} |`);
      if (rowIdx === 0) {
        rendered.push(`| ${row.map(() => "---").join(" | ")} |`);
      }
    });
    return rendered.join("\n");
  }
  /**
   * Convert a text item to a single Markdown list line.
   * Returns "" for whitespace-only content.
   */
  static textToMarkdown(text, indentLevel = 0) {
    const content = text.text.trim();
    if (!content) {
      return "";
    }
    const prefix = Self.getIndent(indentLevel) + Self.getListMarker(text.enumerated, text.marker);
    return `${prefix}${content}`;
  }
  /**
   * Generate list marker based on enumeration flag and explicit marker.
   * An explicit marker wins; otherwise "1. " for enumerated items, "- " for
   * everything else.
   */
  static getListMarker(enumerated, marker) {
    if (marker) {
      return `${marker} `;
    }
    return enumerated === true ? "1. " : "- ";
  }
  /**
   * Generate the indent string for the given nesting level.
   */
  static getIndent(level) {
    return " ".repeat(level);
  }
  /**
   * Escape pipe characters, flatten newlines, and trim a table cell's text.
   */
  static escapeTableCell(text) {
    return text.replace(/\|/g, "\\|").replace(/\n/g, " ").trim();
  }
};
865
+
866
+ // src/converters/chapter-converter.ts
867
+ var ChapterConverter = class _ChapterConverter {
868
+ static FRONT_MATTER_ID = "ch-000";
869
+ static FRONT_MATTER_TITLE = "Front Matter";
870
+ logger;
871
+ idGenerator;
872
+ constructor(logger, idGenerator) {
873
+ this.logger = logger;
874
+ this.idGenerator = idGenerator;
875
+ }
876
+ /**
877
+ * Convert TocEntry[] to Chapter[]
878
+ *
879
+ * @param tocEntries - Table of contents entries
880
+ * @param textItems - DoclingDocument.texts (with prov for page numbers)
881
+ * @param pageRangeMap - PDF page to actual page mapping
882
+ * @param images - Converted images
883
+ * @param tables - Converted tables
884
+ * @param footnotes - Converted footnotes
885
+ * @returns Converted chapters with text blocks and resource references
886
+ */
887
+ convert(tocEntries, textItems, pageRangeMap, images, tables, footnotes) {
888
+ this.logger.info("[ChapterConverter] Starting chapter conversion...");
889
+ const frontMatter = this.createFrontMatterChapter();
890
+ const tocChapters = this.buildChapterTree(tocEntries);
891
+ this.logger.info(
892
+ `[ChapterConverter] Built ${tocChapters.length} TOC chapters + Front Matter`
893
+ );
894
+ const allChapters = [frontMatter, ...tocChapters];
895
+ const flatChapters = this.flattenChapters(allChapters);
896
+ const chapterRanges = this.calculatePageRanges(flatChapters, tocEntries);
897
+ this.logger.info(
898
+ `[ChapterConverter] Calculated ranges for ${chapterRanges.size} chapters`
899
+ );
900
+ const textBlocks = this.convertTextBlocks(textItems, pageRangeMap);
901
+ this.assignTextBlocks(allChapters, textBlocks, chapterRanges, pageRangeMap);
902
+ this.logger.info(
903
+ `[ChapterConverter] Assigned ${textBlocks.length} text blocks`
904
+ );
905
+ this.linkResources(
906
+ allChapters,
907
+ images,
908
+ tables,
909
+ footnotes,
910
+ chapterRanges,
911
+ pageRangeMap
912
+ );
913
+ this.logger.info(
914
+ `[ChapterConverter] Linked ${images.length} images, ${tables.length} tables, and ${footnotes.length} footnotes`
915
+ );
916
+ return allChapters;
917
+ }
918
+ /**
919
+ * Create Front Matter chapter for pre-TOC content
920
+ */
921
+ createFrontMatterChapter() {
922
+ return {
923
+ id: _ChapterConverter.FRONT_MATTER_ID,
924
+ originTitle: _ChapterConverter.FRONT_MATTER_TITLE,
925
+ title: _ChapterConverter.FRONT_MATTER_TITLE,
926
+ pageNo: 1,
927
+ level: 1,
928
+ textBlocks: [],
929
+ imageIds: [],
930
+ tableIds: [],
931
+ footnoteIds: []
932
+ };
933
+ }
934
+ /**
935
+ * Build chapter tree from TocEntry[]
936
+ * Recursively processes children
937
+ */
938
+ buildChapterTree(entries) {
939
+ return entries.map((entry) => {
940
+ const chapterId = this.idGenerator.generateChapterId();
941
+ const chapter = {
942
+ id: chapterId,
943
+ originTitle: entry.title,
944
+ title: TextCleaner.normalize(entry.title),
945
+ pageNo: entry.pageNo,
946
+ level: entry.level,
947
+ textBlocks: [],
948
+ imageIds: [],
949
+ tableIds: [],
950
+ footnoteIds: []
951
+ };
952
+ if (entry.children && entry.children.length > 0) {
953
+ chapter.children = this.buildChapterTree(entry.children);
954
+ }
955
+ return chapter;
956
+ });
957
+ }
958
+ /**
959
+ * Flatten chapter tree for page range calculation
960
+ * Preserves original TOC page numbers
961
+ */
962
+ flattenChapters(chapters) {
963
+ const result = [];
964
+ const flatten = (chapterList) => {
965
+ for (const chapter of chapterList) {
966
+ result.push({
967
+ chapter,
968
+ tocPageNo: chapter.pageNo
969
+ });
970
+ if (chapter.children && chapter.children.length > 0) {
971
+ flatten(chapter.children);
972
+ }
973
+ }
974
+ };
975
+ flatten(chapters);
976
+ return result;
977
+ }
978
+ /**
979
+ * Calculate page range for each chapter
980
+ * Uses next chapter's start page as end boundary
981
+ *
982
+ * Front Matter (ch-000) gets special handling:
983
+ * - startPage: 1
984
+ * - endPage: first TOC entry's page - 1 (or 0 if TOC starts at page 1)
985
+ */
986
+ calculatePageRanges(flatChapters, tocEntries) {
987
+ const ranges = /* @__PURE__ */ new Map();
988
+ if (flatChapters.length === 0) {
989
+ return ranges;
990
+ }
991
+ const firstTocPage = tocEntries.length > 0 ? Math.min(...tocEntries.map((e) => e.pageNo)) : Number.MAX_SAFE_INTEGER;
992
+ const tocChapters = flatChapters.filter(
993
+ (fc) => fc.chapter.id !== _ChapterConverter.FRONT_MATTER_ID
994
+ );
995
+ const sorted = [...tocChapters].sort((a, b) => a.tocPageNo - b.tocPageNo);
996
+ ranges.set(_ChapterConverter.FRONT_MATTER_ID, {
997
+ startPage: 1,
998
+ endPage: firstTocPage - 1
999
+ });
1000
+ for (let i = 0; i < sorted.length; i++) {
1001
+ const current = sorted[i];
1002
+ const next = sorted[i + 1];
1003
+ ranges.set(current.chapter.id, {
1004
+ startPage: current.tocPageNo,
1005
+ endPage: next ? next.tocPageNo - 1 : Number.MAX_SAFE_INTEGER
1006
+ });
1007
+ }
1008
+ return ranges;
1009
+ }
1010
  /**
   * Valid labels for text blocks.
   * Only items carrying one of these labels are turned into chapter text
   * blocks by convertTextBlocks; everything else (captions, footers, etc.)
   * is filtered out.
   */
  static VALID_TEXT_LABELS = /* @__PURE__ */ new Set([
    "text",
    "section_header",
    "list_item"
  ]);
1019
+ /**
1020
+ * Check if text item has a picture parent
1021
+ * Items with parent.$ref starting with "#/pictures/" are excluded
1022
+ */
1023
+ static hasPictureParent(item) {
1024
+ const parentRef = item.parent?.$ref;
1025
+ return typeof parentRef === "string" && parentRef.startsWith("#/pictures/");
1026
+ }
1027
+ /**
1028
+ * Convert text items to text blocks
1029
+ * Filters by label (text, section_header, list_item), excludes picture children,
1030
+ * and extracts page numbers from prov
1031
+ */
1032
+ convertTextBlocks(textItems, _pageRangeMap) {
1033
+ return textItems.filter(
1034
+ (item) => _ChapterConverter.VALID_TEXT_LABELS.has(item.label) && !_ChapterConverter.hasPictureParent(item) && TextCleaner.isValidText(item.text)
1035
+ ).map((item) => {
1036
+ const pdfPageNo = item.prov?.[0]?.page_no ?? 1;
1037
+ return {
1038
+ text: TextCleaner.normalize(item.text),
1039
+ pdfPageNo
1040
+ };
1041
+ });
1042
+ }
1043
+ /**
1044
+ * Convert PDF page number to actual document page number
1045
+ * Falls back to pdfPageNo if mapping is missing
1046
+ */
1047
+ pdfPageToActualPage(pdfPageNo, pageRangeMap) {
1048
+ const range = pageRangeMap[pdfPageNo];
1049
+ if (!range) {
1050
+ return pdfPageNo;
1051
+ }
1052
+ return range.startPageNo;
1053
+ }
1054
+ /**
1055
+ * Find chapter ID for a given actual page number
1056
+ * Uses "start page first" strategy
1057
+ */
1058
+ findChapterForPage(actualPageNo, chapterRanges) {
1059
+ let bestMatch = null;
1060
+ let bestStartPage = -1;
1061
+ for (const [chapterId, range] of chapterRanges) {
1062
+ if (actualPageNo >= range.startPage && actualPageNo <= range.endPage) {
1063
+ if (range.startPage > bestStartPage) {
1064
+ bestStartPage = range.startPage;
1065
+ bestMatch = chapterId;
1066
+ }
1067
+ }
1068
+ }
1069
+ return bestMatch;
1070
+ }
1071
+ /**
1072
+ * Assign text blocks to chapters based on page ranges
1073
+ */
1074
+ assignTextBlocks(chapters, textBlocks, chapterRanges, pageRangeMap) {
1075
+ const chapterMap = this.buildChapterMap(chapters);
1076
+ for (const textBlock of textBlocks) {
1077
+ const actualPageNo = this.pdfPageToActualPage(
1078
+ textBlock.pdfPageNo,
1079
+ pageRangeMap
1080
+ );
1081
+ const chapterId = this.findChapterForPage(actualPageNo, chapterRanges);
1082
+ if (chapterId && chapterMap.has(chapterId)) {
1083
+ chapterMap.get(chapterId).textBlocks.push(textBlock);
1084
+ }
1085
+ }
1086
+ }
1087
+ /**
1088
+ * Link images, tables, and footnotes to chapters based on page ranges
1089
+ */
1090
+ linkResources(chapters, images, tables, footnotes, chapterRanges, pageRangeMap) {
1091
+ const chapterMap = this.buildChapterMap(chapters);
1092
+ for (const image of images) {
1093
+ const actualPageNo = this.pdfPageToActualPage(
1094
+ image.pdfPageNo,
1095
+ pageRangeMap
1096
+ );
1097
+ const chapterId = this.findChapterForPage(actualPageNo, chapterRanges);
1098
+ if (chapterId && chapterMap.has(chapterId)) {
1099
+ chapterMap.get(chapterId).imageIds.push(image.id);
1100
+ }
1101
+ }
1102
+ for (const table of tables) {
1103
+ const actualPageNo = this.pdfPageToActualPage(
1104
+ table.pdfPageNo,
1105
+ pageRangeMap
1106
+ );
1107
+ const chapterId = this.findChapterForPage(actualPageNo, chapterRanges);
1108
+ if (chapterId && chapterMap.has(chapterId)) {
1109
+ chapterMap.get(chapterId).tableIds.push(table.id);
1110
+ }
1111
+ }
1112
+ for (const footnote of footnotes) {
1113
+ const actualPageNo = this.pdfPageToActualPage(
1114
+ footnote.pdfPageNo,
1115
+ pageRangeMap
1116
+ );
1117
+ const chapterId = this.findChapterForPage(actualPageNo, chapterRanges);
1118
+ if (chapterId && chapterMap.has(chapterId)) {
1119
+ chapterMap.get(chapterId).footnoteIds.push(footnote.id);
1120
+ }
1121
+ }
1122
+ }
1123
+ /**
1124
+ * Build flat chapter map for O(1) lookup
1125
+ */
1126
+ buildChapterMap(chapters) {
1127
+ const map = /* @__PURE__ */ new Map();
1128
+ const addToMap = (chapterList) => {
1129
+ for (const chapter of chapterList) {
1130
+ map.set(chapter.id, chapter);
1131
+ if (chapter.children && chapter.children.length > 0) {
1132
+ addToMap(chapter.children);
1133
+ }
1134
+ }
1135
+ };
1136
+ addToMap(chapters);
1137
+ return map;
1138
+ }
1139
+ };
1140
+
1141
// src/extractors/toc-extract-error.ts
/**
 * Base error for all TOC extraction failures.
 * Subclasses (not-found / parse / validation) refine the failure mode.
 */
var TocExtractError = class _TocExtractError extends Error {
  /**
   * @param message - Human-readable description of the failure
   * @param options - Standard ErrorOptions (supports `cause`)
   */
  constructor(message, options) {
    super(message, options);
    this.name = "TocExtractError";
  }
  /**
   * Safely derive a message string from an arbitrary thrown value.
   */
  static getErrorMessage(error) {
    if (error instanceof Error) {
      return error.message;
    }
    return String(error);
  }
  /**
   * Wrap an arbitrary thrown value in a TocExtractError, prefixing the
   * given context and preserving the original value via `cause`.
   */
  static fromError(context, error) {
    const detail = _TocExtractError.getErrorMessage(error);
    return new _TocExtractError(`${context}: ${detail}`, { cause: error });
  }
};
1163
/**
 * Thrown when no table-of-contents area can be located in the document
 * (raised by TocFinder.find after both search stages fail).
 */
var TocNotFoundError = class extends TocExtractError {
  constructor(message = "Table of contents not found in the document") {
    super(message);
    this.name = "TocNotFoundError";
  }
};
1169
/**
 * Thrown when the LLM response cannot be parsed into a TOC structure
 * (wraps the underlying failure via ErrorOptions `cause`).
 */
var TocParseError = class extends TocExtractError {
  constructor(message, options) {
    super(message, options);
    this.name = "TocParseError";
  }
};
1175
/**
 * Thrown when extracted TOC entries fail structural validation.
 * Carries the full validation result so callers can inspect individual
 * issues programmatically or render them via getSummary().
 */
var TocValidationError = class extends TocExtractError {
  /**
   * Validation result with detailed issues.
   */
  validationResult;
  constructor(message, validationResult) {
    super(message);
    this.name = "TocValidationError";
    this.validationResult = validationResult;
  }
  /**
   * Render a multi-line, human-readable report of every validation issue.
   */
  getSummary() {
    const { errorCount, issues } = this.validationResult;
    const header = [
      `TOC validation failed: ${errorCount} error(s)`,
      "",
      "Issues:"
    ];
    const detail = issues.flatMap((issue) => [
      ` [${issue.code}] ${issue.message}`,
      ` Path: ${issue.path}`,
      ` Entry: "${issue.entry.title}" (page ${issue.entry.pageNo})`
    ]);
    return [...header, ...detail].join("\n");
  }
};
1205
+
1206
// src/extractors/toc-validator.ts
// Defaults applied when TocValidator options are omitted:
// - totalPages: Infinity disables the upper page-bound check (V002)
// - maxTitleLength: cap enforced by the title-length check (V004)
var DEFAULT_OPTIONS = {
  totalPages: Infinity,
  maxTitleLength: 200
};
1211
/**
 * Structural validator for extracted TOC entry trees.
 * Runs rule checks V001-V006 over every entry (page order, page bounds,
 * title presence/length, parent-child ordering, duplicates) and collects
 * all issues rather than stopping at the first.
 */
var TocValidator = class {
  // Effective configuration (user options merged over DEFAULT_OPTIONS).
  options;
  // Issues accumulated by the validation pass currently in progress.
  issues;
  constructor(options) {
    this.options = { ...DEFAULT_OPTIONS, ...options };
    this.issues = [];
  }
  /**
   * Validate TocEntry array.
   *
   * @param entries - TOC entries to validate
   * @returns Validation result with `valid` flag, issue list, error count
   */
  validate(entries) {
    this.issues = [];
    this.validateEntries(entries, "", null, /* @__PURE__ */ new Set());
    const errorCount = this.issues.length;
    return {
      valid: errorCount === 0,
      issues: [...this.issues],
      errorCount
    };
  }
  /**
   * Validate and throw if invalid.
   *
   * @param entries - TOC entries to validate
   * @throws {TocValidationError} When validation fails
   */
  validateOrThrow(entries) {
    const result = this.validate(entries);
    if (result.valid) {
      return;
    }
    throw new TocValidationError(
      `TOC validation failed with ${result.errorCount} error(s)`,
      result
    );
  }
  /**
   * Walk one sibling list, applying every rule, then recurse into children
   * with the current entry as parent. `seenKeys` is shared across the whole
   * tree so duplicates are detected globally.
   */
  validateEntries(entries, parentPath, parentEntry, seenKeys) {
    let prevPageNo = parentEntry?.pageNo ?? 0;
    entries.forEach((entry, index) => {
      const entryPath = parentPath ? `${parentPath}.children[${index}]` : `[${index}]`;
      this.validateTitle(entry, entryPath);
      this.validateTitleLength(entry, entryPath);
      this.validatePageRange(entry, entryPath);
      this.validatePageOrder(entry, entryPath, prevPageNo);
      prevPageNo = entry.pageNo;
      if (parentEntry) {
        this.validateParentChildPage(entry, entryPath, parentEntry);
      }
      const key = `${entry.title}:${entry.pageNo}`;
      this.validateDuplicate(entry, entryPath, key, seenKeys);
      seenKeys.add(key);
      if (entry.children && entry.children.length > 0) {
        this.validateEntries(entry.children, entryPath, entry, seenKeys);
      }
    });
  }
  /**
   * V003: Title must contain at least one non-whitespace character.
   */
  validateTitle(entry, entryPath) {
    if (entry.title && entry.title.trim() !== "") {
      return;
    }
    this.addIssue({
      code: "V003",
      message: "Title is empty or contains only whitespace",
      path: entryPath,
      entry
    });
  }
  /**
   * V004: Title must not exceed the configured maximum length.
   */
  validateTitleLength(entry, entryPath) {
    if (entry.title.length <= this.options.maxTitleLength) {
      return;
    }
    this.addIssue({
      code: "V004",
      message: `Title exceeds ${this.options.maxTitleLength} characters (${entry.title.length})`,
      path: entryPath,
      entry
    });
  }
  /**
   * V002: Page number must lie within [1, totalPages]. Both bounds are
   * checked independently, so a wildly invalid value may add two issues.
   */
  validatePageRange(entry, entryPath) {
    if (entry.pageNo < 1) {
      this.addIssue({
        code: "V002",
        message: `Page number must be >= 1, got ${entry.pageNo}`,
        path: entryPath,
        entry
      });
    }
    if (entry.pageNo > this.options.totalPages) {
      this.addIssue({
        code: "V002",
        message: `Page number ${entry.pageNo} exceeds document total pages (${this.options.totalPages})`,
        path: entryPath,
        entry
      });
    }
  }
  /**
   * V001: Siblings must appear in non-decreasing page order.
   */
  validatePageOrder(entry, entryPath, prevPageNo) {
    if (entry.pageNo >= prevPageNo) {
      return;
    }
    this.addIssue({
      code: "V001",
      message: `Page number decreased from ${prevPageNo} to ${entry.pageNo}`,
      path: entryPath,
      entry
    });
  }
  /**
   * V005: A child section may not start before its parent.
   */
  validateParentChildPage(entry, entryPath, parent) {
    if (entry.pageNo >= parent.pageNo) {
      return;
    }
    this.addIssue({
      code: "V005",
      message: `Child page (${entry.pageNo}) is before parent page (${parent.pageNo})`,
      path: entryPath,
      entry
    });
  }
  /**
   * V006: The (title, pageNo) pair must be unique across the whole tree.
   */
  validateDuplicate(entry, entryPath, key, seenKeys) {
    if (!seenKeys.has(key)) {
      return;
    }
    this.addIssue({
      code: "V006",
      message: `Duplicate entry: "${entry.title}" at page ${entry.pageNo}`,
      path: entryPath,
      entry
    });
  }
  /**
   * Record a single validation issue.
   */
  addIssue(issue) {
    this.issues.push(issue);
  }
};
1369
+
1370
// src/extractors/toc-finder.ts
// Heading keywords that mark a TOC page. Escaped sequences cover Korean
// (\uBAA9\uCC28 / \uCC28\uB840), Simplified and Traditional Chinese and
// Japanese variants (\u76EE\u5F55 / \u5185\u5BB9 / \u5167\u5BB9 /
// \u76EE\u6B21), plus common English forms. Matching is case-insensitive
// substring search (see TocFinder.containsTocKeyword).
var TOC_KEYWORDS = [
  "\uBAA9\uCC28",
  "\uCC28\uB840",
  "\uBAA9 \uCC28",
  "\u76EE\u5F55",
  "\u76EE \u5F55",
  "\u5185\u5BB9",
  "\u5167\u5BB9",
  "\u76EE\u6B21",
  "\u76EE \u6B21",
  "Contents",
  "Table of Contents",
  "TABLE OF CONTENTS",
  "CONTENTS"
];
// Markers indicating a TOC continues onto the next page, e.g. Korean
// "(\uACC4\uC18D)", Chinese "(\u7EED)", Japanese "(\u7D9A)", and English
// "(continued)" variants. Used by TocFinder.hasContinuationMarker.
var CONTINUATION_MARKERS = [
  "\uBAA9\uCC28(\uACC4\uC18D)",
  "\uBAA9\uCC28 (\uACC4\uC18D)",
  "(\uACC4\uC18D)",
  "\u76EE\u5F55(\u7EED)",
  "\u76EE\u5F55 (\u7EED)",
  "(\u7EED)",
  "\u7EED\u8868",
  "\u76EE\u6B21(\u7D9A)",
  "\u76EE\u6B21 (\u7D9A)",
  "(\u7D9A)",
  "(continued)",
  "(Continued)",
  "(CONTINUED)",
  "continued"
];
// Matches a TOC-style trailing page number: dot leaders (".... 12"),
// ellipsis leaders ("… 12"), or plain whitespace before a final number.
var PAGE_NUMBER_PATTERN = /\.{2,}\s*\d+\s*$|…+\s*\d+\s*$|\s+\d+\s*$/;
1403
/**
 * Locates the table-of-contents area inside a parsed document.
 *
 * Two-stage strategy:
 *  1. Keyword search over text items (TOC_KEYWORDS + user additions).
 *  2. Structural analysis: score groups/tables whose rows end in page
 *     numbers and pick the best candidate.
 * Either result is then expanded across consecutive continuation pages.
 */
var TocFinder = class {
  /**
   * @param logger - Logger for progress/diagnostic messages
   * @param refResolver - Resolves `$ref` strings to document items
   * @param options - Optional maxSearchPages (default 10) and additionalKeywords
   */
  constructor(logger, refResolver, options) {
    this.logger = logger;
    this.refResolver = refResolver;
    this.maxSearchPages = options?.maxSearchPages ?? 10;
    this.keywords = [...TOC_KEYWORDS, ...options?.additionalKeywords ?? []];
  }
  // Pages beyond this index are never considered TOC candidates.
  maxSearchPages;
  // Built-in TOC keywords plus caller-supplied additions.
  keywords;
  /**
   * Find TOC area in the document.
   * Keyword search is tried first; structure analysis is the fallback.
   *
   * @throws {TocNotFoundError} When no TOC area is found
   */
  find(doc) {
    this.logger.info("[TocFinder] Starting TOC search...");
    const keywordResult = this.findByKeywords(doc);
    if (keywordResult) {
      this.logger.info(
        `[TocFinder] Found TOC by keyword search: pages ${keywordResult.startPage}-${keywordResult.endPage}`
      );
      return keywordResult;
    }
    const structureResult = this.findByStructure(doc);
    if (structureResult) {
      this.logger.info(
        `[TocFinder] Found TOC by structure analysis: pages ${structureResult.startPage}-${structureResult.endPage}`
      );
      return structureResult;
    }
    this.logger.warn("[TocFinder] No TOC found in document");
    throw new TocNotFoundError();
  }
  /**
   * Stage 1: Search by keywords in text items.
   * On a hit, walks up to the enclosing group/table container; a keyword
   * with no parent is returned as a single-item area.
   */
  findByKeywords(doc) {
    for (const text of doc.texts) {
      if (!this.containsTocKeyword(text.text)) {
        continue;
      }
      const pageNo = text.prov[0]?.page_no;
      if (pageNo === void 0 || pageNo > this.maxSearchPages) {
        continue;
      }
      this.logger.info(
        `[TocFinder] Found TOC keyword "${text.text}" on page ${pageNo}`
      );
      const parentRef = text.parent?.$ref;
      if (!parentRef) {
        // Keyword item stands alone; treat it as the whole TOC area.
        return {
          itemRefs: [text.self_ref],
          startPage: pageNo,
          endPage: pageNo
        };
      }
      const result = this.findTocContainer(doc, parentRef, pageNo);
      if (result) {
        return this.expandToConsecutivePages(result, doc);
      }
    }
    return null;
  }
  /**
   * Stage 2: Search by structure (lists/tables with page numbers).
   * Scores every TOC-like group and table within the search window and
   * expands the highest-scoring candidate.
   */
  findByStructure(doc) {
    const candidates = [];
    for (const group of doc.groups) {
      const pageNo = this.getGroupFirstPage(group);
      if (pageNo === void 0 || pageNo > this.maxSearchPages) {
        continue;
      }
      if (this.isGroupTocLike(group, doc)) {
        const score = this.calculateScore(group, pageNo);
        candidates.push({
          result: {
            itemRefs: [group.self_ref],
            startPage: pageNo,
            endPage: pageNo
          },
          score
        });
      }
    }
    for (const table of doc.tables) {
      const pageNo = table.prov[0]?.page_no;
      if (pageNo === void 0 || pageNo > this.maxSearchPages) {
        continue;
      }
      if (this.isTableTocLike(table)) {
        const score = this.calculateTableScore(table, pageNo);
        candidates.push({
          result: {
            itemRefs: [table.self_ref],
            startPage: pageNo,
            endPage: pageNo
          },
          score
        });
      }
    }
    if (candidates.length === 0) {
      return null;
    }
    // Highest score wins.
    candidates.sort((a, b) => b.score - a.score);
    const best = candidates[0];
    return this.expandToConsecutivePages(best.result, doc);
  }
  /**
   * Find the TOC container (group or table) from a parent reference,
   * recursing upward through intermediate items until one resolves.
   */
  findTocContainer(doc, parentRef, pageNo) {
    const group = this.refResolver.resolveGroup(parentRef);
    if (group) {
      return {
        itemRefs: [group.self_ref],
        startPage: pageNo,
        endPage: pageNo
      };
    }
    const table = this.refResolver.resolveTable(parentRef);
    if (table) {
      return {
        itemRefs: [table.self_ref],
        startPage: pageNo,
        endPage: pageNo
      };
    }
    const item = this.refResolver.resolve(parentRef);
    if (item && item.parent?.$ref) {
      return this.findTocContainer(doc, item.parent.$ref, pageNo);
    }
    return null;
  }
  /**
   * Check if a group contains TOC-like structure.
   * Heuristic: at least 3 children ending in a page number, or a majority
   * (>50%) of resolvable children doing so.
   */
  isGroupTocLike(group, _doc) {
    if (group.name !== "list" && group.name !== "group") {
      return false;
    }
    let pageNumberCount = 0;
    const children = this.refResolver.resolveMany(group.children);
    for (const child of children) {
      if (!child) continue;
      if ("text" in child && "orig" in child) {
        const textItem = child;
        if (PAGE_NUMBER_PATTERN.test(textItem.text)) {
          pageNumberCount++;
        }
      }
    }
    const total = children.filter((c) => c !== null).length;
    return pageNumberCount >= 3 || total > 0 && pageNumberCount / total > 0.5;
  }
  /**
   * Check if a table contains TOC-like structure.
   * A "document_index" label is accepted outright; otherwise the table
   * needs >=3 rows, >=2 columns, and a majority of data rows whose last
   * cell is purely numeric (row 0 is assumed to be a header).
   */
  isTableTocLike(table) {
    if (table.label === "document_index") {
      return true;
    }
    const { grid, num_rows, num_cols } = table.data;
    if (num_rows < 3 || num_cols < 2) {
      return false;
    }
    let numberCount = 0;
    for (let row = 1; row < grid.length; row++) {
      const lastCell = grid[row]?.[num_cols - 1];
      if (lastCell && /^\d+$/.test(lastCell.text.trim())) {
        numberCount++;
      }
    }
    return numberCount > 0 && numberCount / (num_rows - 1) > 0.5;
  }
  /**
   * Expand TOC area to consecutive pages.
   * Stops at the first page (within the search window) that contains no
   * continuation items.
   */
  expandToConsecutivePages(initial, doc) {
    const itemRefs = [...initial.itemRefs];
    let endPage = initial.endPage;
    for (let pageNo = initial.endPage + 1; pageNo <= this.maxSearchPages; pageNo++) {
      const continuationItems = this.findContinuationOnPage(doc, pageNo);
      if (continuationItems.length === 0) {
        break;
      }
      itemRefs.push(...continuationItems);
      endPage = pageNo;
    }
    return {
      itemRefs,
      startPage: initial.startPage,
      endPage
    };
  }
  /**
   * Find TOC continuation items on a specific page: groups holding an
   * explicit continuation marker, any TOC-like group, and any TOC-like
   * table — deduplicated by self_ref.
   */
  findContinuationOnPage(doc, pageNo) {
    const refs = [];
    for (const text of doc.texts) {
      if (text.prov[0]?.page_no !== pageNo) {
        continue;
      }
      if (this.hasContinuationMarker(text.text)) {
        const parentRef = text.parent?.$ref;
        if (parentRef) {
          const group = this.refResolver.resolveGroup(parentRef);
          if (group) {
            refs.push(group.self_ref);
          }
        }
      }
    }
    for (const group of doc.groups) {
      const groupPage = this.getGroupFirstPage(group);
      if (groupPage !== pageNo) {
        continue;
      }
      if (this.isGroupTocLike(group, doc) && !refs.includes(group.self_ref)) {
        refs.push(group.self_ref);
      }
    }
    for (const table of doc.tables) {
      if (table.prov[0]?.page_no !== pageNo) {
        continue;
      }
      if (this.isTableTocLike(table) && !refs.includes(table.self_ref)) {
        refs.push(table.self_ref);
      }
    }
    return refs;
  }
  /**
   * Check if text contains a TOC keyword (case-insensitive substring).
   */
  containsTocKeyword(text) {
    const normalizedText = text.trim().toLowerCase();
    return this.keywords.some(
      (keyword) => normalizedText.includes(keyword.toLowerCase())
    );
  }
  /**
   * Check for continuation markers (case-insensitive substring).
   */
  hasContinuationMarker(text) {
    const normalizedText = text.trim().toLowerCase();
    return CONTINUATION_MARKERS.some(
      (marker) => normalizedText.includes(marker.toLowerCase())
    );
  }
  /**
   * Get first page number of a group by checking its children; returns
   * undefined when no resolvable child carries prov info.
   */
  getGroupFirstPage(group) {
    for (const childRef of group.children) {
      const child = this.refResolver.resolve(childRef.$ref);
      if (child && "prov" in child) {
        const prov = child.prov;
        if (prov && prov[0]?.page_no !== void 0) {
          return prov[0].page_no;
        }
      }
    }
    return void 0;
  }
  /**
   * Calculate score for a group candidate. Higher score = better match:
   * earlier pages and more children weigh in, plus a bonus per child line
   * ending in a page number.
   */
  calculateScore(group, pageNo) {
    let score = 0;
    score += (this.maxSearchPages - pageNo + 1) * 10;
    score += group.children.length * 2;
    const children = this.refResolver.resolveMany(group.children);
    for (const child of children) {
      if (child && "text" in child) {
        const textItem = child;
        if (PAGE_NUMBER_PATTERN.test(textItem.text)) {
          score += 5;
        }
      }
    }
    return score;
  }
  /**
   * Calculate score for a table candidate; a "document_index" label gets
   * a large fixed bonus.
   */
  calculateTableScore(table, pageNo) {
    let score = 0;
    score += (this.maxSearchPages - pageNo + 1) * 10;
    score += table.data.num_rows * 2;
    if (table.label === "document_index") {
      score += 50;
    }
    return score;
  }
};
1702
+
1703
+ // src/extractors/toc-extractor.ts
1704
+ import { z } from "zod";
1705
+
1706
// src/core/base-llm-component.ts
/**
 * Common base for LLM-backed components: holds the models, retry and
 * temperature settings, an optional usage aggregator, and provides
 * prefixed logging plus usage-tracking helpers for subclasses.
 */
var BaseLLMComponent = class {
  logger;
  model;
  fallbackModel;
  maxRetries;
  temperature;
  componentName;
  aggregator;
  abortSignal;
  /**
   * @param logger - Logger instance for logging
   * @param model - Primary language model for LLM calls
   * @param componentName - Component name used as a log prefix (e.g. "TocExtractor")
   * @param options - Optional maxRetries (default 3), temperature (default 0), abortSignal
   * @param fallbackModel - Optional fallback model for retry on failure
   * @param aggregator - Optional token usage aggregator
   */
  constructor(logger, model, componentName, options, fallbackModel, aggregator) {
    this.logger = logger;
    this.model = model;
    this.componentName = componentName;
    this.fallbackModel = fallbackModel;
    this.aggregator = aggregator;
    this.maxRetries = options?.maxRetries ?? 3;
    this.temperature = options?.temperature ?? 0;
    this.abortSignal = options?.abortSignal;
  }
  /**
   * Log a message with the "[ComponentName]" prefix.
   *
   * @param level - Logger method to invoke ('info', 'warn', 'error')
   * @param message - Message to log (without prefix)
   * @param args - Additional arguments forwarded to the logger
   */
  log(level, message, ...args) {
    this.logger[level](`[${this.componentName}] ${message}`, ...args);
  }
  /**
   * Forward token usage to the aggregator when one was supplied.
   *
   * @param usage - Token usage information to track
   */
  trackUsage(usage) {
    if (this.aggregator) {
      this.aggregator.track(usage);
    }
  }
  /**
   * Build a zeroed usage record for edge cases (e.g. empty input).
   *
   * @param phase - Phase name recorded on the usage object
   * @returns Zero-valued usage entry attributed to this component
   */
  createEmptyUsage(phase) {
    return {
      component: this.componentName,
      phase,
      model: "primary",
      modelName: "none",
      inputTokens: 0,
      outputTokens: 0,
      totalTokens: 0
    };
  }
};
1775
+
1776
// src/core/text-llm-component.ts
/**
 * Base class for components that issue text-only (non-vision) LLM calls,
 * delegating the actual invocation to LLMCaller.call().
 */
var TextLLMComponent = class extends BaseLLMComponent {
  constructor(logger, model, componentName, options, fallbackModel, aggregator) {
    super(logger, model, componentName, options, fallbackModel, aggregator);
  }
  /**
   * Call the LLM with text prompts, track token usage, and return the
   * schema-validated output.
   *
   * @template TSchema - Zod schema type for response validation
   * @param schema - Zod schema for response validation
   * @param systemPrompt - System prompt for the LLM
   * @param userPrompt - User prompt for the LLM
   * @param phase - Phase name for tracking (e.g. 'extraction', 'validation')
   * @returns Promise with parsed `output` and `usage` information
   */
  async callTextLLM(schema, systemPrompt, userPrompt, phase) {
    const { output, usage } = await LLMCaller.call({
      schema,
      systemPrompt,
      userPrompt,
      primaryModel: this.model,
      fallbackModel: this.fallbackModel,
      maxRetries: this.maxRetries,
      temperature: this.temperature,
      abortSignal: this.abortSignal,
      component: this.componentName,
      phase
    });
    this.trackUsage(usage);
    return { output, usage };
  }
};
1811
+
1812
// src/extractors/toc-extractor.ts
// Recursive Zod schema for a single TOC entry; z.lazy defers the
// self-reference so `children` can nest entries to arbitrary depth.
var TocEntrySchema = z.lazy(
  () => z.object({
    title: z.string().describe("Chapter or section title"),
    level: z.number().int().min(1).describe("Hierarchy depth (1 = top level)"),
    pageNo: z.number().int().min(1).describe("Starting page number"),
    children: z.array(TocEntrySchema).optional().describe("Child sections")
  })
);
// Shape of the structured LLM response: an array of root-level entries.
var TocResponseSchema = z.object({
  entries: z.array(TocEntrySchema).describe("Extracted TOC entries")
});
1824
/**
 * LLM-based extractor that turns a TOC markdown snippet into a validated
 * TocEntry tree. Levels are normalized after extraction so children are
 * always exactly one level deeper than their parent; validation can be
 * skipped via options.skipValidation.
 */
var TocExtractor = class extends TextLLMComponent {
  // Options forwarded to TocValidator (e.g. totalPages, maxTitleLength).
  validationOptions;
  // When true, extracted entries are returned without running validation.
  skipValidation;
  constructor(logger, model, options, fallbackModel, abortSignal) {
    super(
      logger,
      model,
      "TocExtractor",
      { ...options, abortSignal },
      fallbackModel
    );
    this.validationOptions = options?.validation;
    this.skipValidation = options?.skipValidation ?? false;
  }
  /**
   * Extract TOC structure from Markdown.
   *
   * @param markdown - Markdown representation of TOC area
   * @returns Object with entries array and token usage information
   * @throws {TocParseError} When LLM fails to parse structure
   * @throws {TocValidationError} When validation fails
   */
  async extract(markdown) {
    this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
    if (!markdown.trim()) {
      // Nothing to parse: short-circuit with an empty result and zero usage.
      this.log("info", "Empty markdown, returning empty array");
      return {
        entries: [],
        usage: this.createEmptyUsage("extraction")
      };
    }
    try {
      const result = await this.callTextLLM(
        TocResponseSchema,
        this.buildSystemPrompt(),
        this.buildUserPrompt(markdown),
        "extraction"
      );
      const entries = this.normalizeEntries(result.output.entries);
      if (!this.skipValidation) {
        this.validateEntries(entries);
      }
      this.log(
        "info",
        `Extraction completed: ${entries.length} top-level entries`
      );
      return { entries, usage: result.usage };
    } catch (error) {
      // Validation errors pass through untouched; everything else is
      // wrapped in a TocParseError with the original as `cause`.
      if (error instanceof TocValidationError) {
        this.log("error", `Validation failed: ${error.message}`);
        throw error;
      }
      const message = error instanceof Error ? error.message : String(error);
      this.log("error", `Extraction failed: ${message}`);
      throw new TocParseError(`Failed to extract TOC structure: ${message}`, {
        cause: error
      });
    }
  }
  /**
   * Validate extracted entries (no-op for an empty result).
   *
   * @throws {TocValidationError} When validation fails
   */
  validateEntries(entries) {
    if (entries.length === 0) {
      return;
    }
    const validator = new TocValidator(this.validationOptions);
    validator.validateOrThrow(entries);
  }
  /**
   * Build the system prompt instructing the model how to parse a TOC
   * (title cleanup, level inference, page numbers, nesting, and exclusion
   * of supplementary indices). Includes a worked example.
   */
  buildSystemPrompt() {
    return `You are a document structure extraction assistant. Your task is to parse a table of contents (TOC) from markdown format and extract structured entries.

## Instructions

1. **Title**: Extract the exact chapter/section title from each line. Remove page number indicators like "..... 10" or "... 5" at the end.

2. **Level**: Determine the hierarchy depth:
- Level 1: Top-level chapters (e.g., "\uC81C1\uC7A5", "Chapter 1", "I.", "Part 1")
- Level 2: Main sections within chapters (e.g., "1.", "1.1", "A.")
- Level 3: Subsections (e.g., "1.1.1", "a.", "(1)")
- Use indentation and numbering patterns to infer level

3. **Page Number**: Extract the page number from each entry. Convert Roman numerals to Arabic numerals if present (e.g., "iv" \u2192 4).

4. **Children**: Nest child entries under parent entries based on their hierarchy level.

5. **IMPORTANT - Extract Main TOC Only**: Only extract the main document table of contents. EXCLUDE the following supplementary indices:
- Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, \uD654\uBCF4 \uBAA9\uCC28, Photo Index, List of Photos, List of Figures)
- Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, \uC0BD\uB3C4 \uBAA9\uCC28, Drawing Index, List of Drawings)
- Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
- Appendix indices (\uBD80\uB85D \uBAA9\uCC28, Appendix Index)
- Any other supplementary material indices

## Output Format

Return a flat array of top-level entries. Each entry at level 1 should contain its children (level 2+) nested properly.

## Example

Input:
- \uC81C1\uC7A5 \uC11C\uB860 ..... 1
- 1. \uC5F0\uAD6C \uBC30\uACBD ..... 3
- 2. \uC5F0\uAD6C \uBAA9\uC801 ..... 5
- \uC81C2\uC7A5 \uBC29\uBC95\uB860 ..... 10

Output:
{
"entries": [
{
"title": "\uC81C1\uC7A5 \uC11C\uB860",
"level": 1,
"pageNo": 1,
"children": [
{ "title": "1. \uC5F0\uAD6C \uBC30\uACBD", "level": 2, "pageNo": 3 },
{ "title": "2. \uC5F0\uAD6C \uBAA9\uC801", "level": 2, "pageNo": 5 }
]
},
{ "title": "\uC81C2\uC7A5 \uBC29\uBC95\uB860", "level": 1, "pageNo": 10 }
]
}`;
  }
  /**
   * Build the user prompt embedding the TOC markdown verbatim.
   */
  buildUserPrompt(markdown) {
    return `Extract the table of contents structure from the following markdown:

${markdown}`;
  }
  /**
   * Normalize extracted entries; empty input yields an empty array.
   */
  normalizeEntries(entries) {
    if (entries.length === 0) {
      return [];
    }
    return this.normalizeLevel(entries, 1);
  }
  /**
   * Recursively enforce level consistency: every entry is assigned
   * `expectedLevel` (overriding whatever the LLM returned) and children
   * always get parent level + 1. Titles are trimmed along the way.
   */
  normalizeLevel(entries, expectedLevel) {
    return entries.map((entry) => {
      const normalizedEntry = {
        title: entry.title.trim(),
        level: expectedLevel,
        pageNo: entry.pageNo
      };
      if (entry.children && entry.children.length > 0) {
        normalizedEntry.children = this.normalizeLevel(
          entry.children,
          expectedLevel + 1
        );
      }
      return normalizedEntry;
    });
  }
};
1989
+
1990
+ // src/extractors/vision-toc-extractor.ts
1991
+ import * as fs2 from "fs";
1992
+ import * as path2 from "path";
1993
+ import { z as z2 } from "zod";
1994
+
1995
+ // src/core/vision-llm-component.ts
1996
+ import * as fs from "fs";
1997
+ import * as path from "path";
1998
+ var VisionLLMComponent = class extends BaseLLMComponent {
1999
+ outputPath;
2000
+ constructor(logger, model, componentName, outputPath, options, fallbackModel, aggregator) {
2001
+ super(logger, model, componentName, options, fallbackModel, aggregator);
2002
+ this.outputPath = outputPath;
2003
+ }
2004
+ /**
2005
+ * Call LLM with vision capabilities using LLMCaller.callVision()
2006
+ *
2007
+ * @template TSchema - Zod schema type for response validation
2008
+ * @param schema - Zod schema for response validation
2009
+ * @param messages - Messages array including image content
2010
+ * @param phase - Phase name for tracking (e.g., 'extraction', 'sampling')
2011
+ * @returns Promise with parsed object and usage information
2012
+ */
2013
+ async callVisionLLM(schema, messages, phase) {
2014
+ const result = await LLMCaller.callVision({
2015
+ schema,
2016
+ messages,
2017
+ primaryModel: this.model,
2018
+ fallbackModel: this.fallbackModel,
2019
+ maxRetries: this.maxRetries,
2020
+ temperature: this.temperature,
2021
+ abortSignal: this.abortSignal,
2022
+ component: this.componentName,
2023
+ phase
2024
+ });
2025
+ this.trackUsage(result.usage);
2026
+ return {
2027
+ output: result.output,
2028
+ usage: result.usage
2029
+ };
2030
+ }
2031
+ /**
2032
+ * Load an image file and encode it as base64
2033
+ *
2034
+ * @param imagePath - Absolute path to the image file
2035
+ * @returns Base64 encoded image string
2036
+ */
2037
+ loadImageAsBase64(imagePath) {
2038
+ const imageBuffer = fs.readFileSync(imagePath);
2039
+ return imageBuffer.toString("base64");
2040
+ }
2041
+ /**
2042
+ * Build image content object for vision LLM messages
2043
+ *
2044
+ * @param imagePath - Path to the image file (relative to outputPath or absolute)
2045
+ * @param mimeType - MIME type of the image (default: 'image/png')
2046
+ * @returns ImageContent object for LLM message
2047
+ */
2048
+ buildImageContent(imagePath, mimeType = "image/png") {
2049
+ const absolutePath = path.isAbsolute(imagePath) ? imagePath : path.resolve(this.outputPath, imagePath);
2050
+ const base64Image = this.loadImageAsBase64(absolutePath);
2051
+ return {
2052
+ type: "image",
2053
+ image: `data:${mimeType};base64,${base64Image}`
2054
+ };
2055
+ }
2056
+ };
2057
+
2058
+ // src/extractors/vision-toc-extractor.ts
2059
+ var VisionTocExtractionSchema = z2.object({
2060
+ hasToc: z2.boolean().describe("Whether a TOC is visible on these pages"),
2061
+ tocMarkdown: z2.string().nullable().describe("Extracted TOC in markdown format, null if not found"),
2062
+ continuesOnNextPage: z2.boolean().describe("Whether TOC continues beyond these pages")
2063
+ });
2064
+ var VisionTocExtractor = class extends VisionLLMComponent {
2065
+ firstBatchSize;
2066
+ secondBatchSize;
2067
+ constructor(logger, model, outputPath, options, fallbackModel, aggregator) {
2068
+ super(
2069
+ logger,
2070
+ model,
2071
+ "VisionTocExtractor",
2072
+ outputPath,
2073
+ options,
2074
+ fallbackModel,
2075
+ aggregator ?? new LLMTokenUsageAggregator()
2076
+ );
2077
+ this.firstBatchSize = options?.firstBatchSize ?? 10;
2078
+ this.secondBatchSize = options?.secondBatchSize ?? 10;
2079
+ }
2080
+ /**
2081
+ * Extract TOC from page images
2082
+ *
2083
+ * Searches pages 1-10 first, then 11-20 if not found.
2084
+ *
2085
+ * @param totalPages - Total number of pages in the document
2086
+ * @returns Extracted TOC markdown or null if not found
2087
+ */
2088
+ async extract(totalPages) {
2089
+ this.log("info", `Starting TOC extraction from ${totalPages} pages`);
2090
+ if (totalPages === 0) {
2091
+ this.log("info", "No pages to search");
2092
+ return null;
2093
+ }
2094
+ const firstBatchEnd = Math.min(this.firstBatchSize, totalPages);
2095
+ this.log("info", `Searching first batch: pages 1-${firstBatchEnd}`);
2096
+ const firstResult = await this.extractFromBatch(1, firstBatchEnd);
2097
+ if (firstResult.hasToc && firstResult.tocMarkdown) {
2098
+ if (firstResult.continuesOnNextPage && firstBatchEnd < totalPages) {
2099
+ this.log("info", "TOC continues on next pages, extracting more");
2100
+ const continuationEnd = Math.min(
2101
+ firstBatchEnd + this.secondBatchSize,
2102
+ totalPages
2103
+ );
2104
+ const continuationResult = await this.extractFromBatch(
2105
+ firstBatchEnd + 1,
2106
+ continuationEnd
2107
+ );
2108
+ if (continuationResult.hasToc && continuationResult.tocMarkdown) {
2109
+ const merged = this.mergeMarkdown(
2110
+ firstResult.tocMarkdown,
2111
+ continuationResult.tocMarkdown
2112
+ );
2113
+ this.aggregator.logSummary(this.logger);
2114
+ this.log(
2115
+ "info",
2116
+ `TOC extracted with continuation (${merged.length} chars)`
2117
+ );
2118
+ return merged;
2119
+ }
2120
+ }
2121
+ this.aggregator.logSummary(this.logger);
2122
+ this.log(
2123
+ "info",
2124
+ `TOC found in first batch (${firstResult.tocMarkdown.length} chars)`
2125
+ );
2126
+ return firstResult.tocMarkdown;
2127
+ }
2128
+ if (firstBatchEnd < totalPages) {
2129
+ const secondBatchStart = firstBatchEnd + 1;
2130
+ const secondBatchEnd = Math.min(
2131
+ firstBatchEnd + this.secondBatchSize,
2132
+ totalPages
2133
+ );
2134
+ this.log(
2135
+ "info",
2136
+ `Searching second batch: pages ${secondBatchStart}-${secondBatchEnd}`
2137
+ );
2138
+ const secondResult = await this.extractFromBatch(
2139
+ secondBatchStart,
2140
+ secondBatchEnd
2141
+ );
2142
+ if (secondResult.hasToc && secondResult.tocMarkdown) {
2143
+ this.aggregator.logSummary(this.logger);
2144
+ this.log(
2145
+ "info",
2146
+ `TOC found in second batch (${secondResult.tocMarkdown.length} chars)`
2147
+ );
2148
+ return secondResult.tocMarkdown;
2149
+ }
2150
+ }
2151
+ this.aggregator.logSummary(this.logger);
2152
+ this.log("info", "TOC not found in any batch");
2153
+ return null;
2154
+ }
2155
+ /**
2156
+ * Extract TOC from a specific batch of pages
2157
+ */
2158
+ async extractFromBatch(startPage, endPage) {
2159
+ this.log("info", `Extracting from pages ${startPage}-${endPage}`);
2160
+ const imageContents = this.loadPageImages(startPage, endPage);
2161
+ const result = await LLMCaller.callVision({
2162
+ schema: VisionTocExtractionSchema,
2163
+ messages: [
2164
+ {
2165
+ role: "user",
2166
+ content: [
2167
+ {
2168
+ type: "text",
2169
+ text: this.buildUserPrompt(startPage, endPage)
2170
+ },
2171
+ ...imageContents
2172
+ ]
2173
+ }
2174
+ ],
2175
+ primaryModel: this.model,
2176
+ fallbackModel: this.fallbackModel,
2177
+ maxRetries: this.maxRetries,
2178
+ temperature: this.temperature,
2179
+ abortSignal: this.abortSignal,
2180
+ component: "VisionTocExtractor",
2181
+ phase: "extraction"
2182
+ });
2183
+ this.trackUsage(result.usage);
2184
+ return result.output;
2185
+ }
2186
+ /**
2187
+ * Load page images and build message content
2188
+ */
2189
+ loadPageImages(startPage, endPage) {
2190
+ const imageContents = [];
2191
+ for (let pageNo = startPage; pageNo <= endPage; pageNo++) {
2192
+ const imagePath = path2.resolve(
2193
+ this.outputPath,
2194
+ `pages/page_${pageNo - 1}.png`
2195
+ );
2196
+ const imageBuffer = fs2.readFileSync(imagePath);
2197
+ const base64Image = imageBuffer.toString("base64");
2198
+ imageContents.push({
2199
+ type: "image",
2200
+ image: `data:image/png;base64,${base64Image}`
2201
+ });
2202
+ }
2203
+ return imageContents;
2204
+ }
2205
+ /**
2206
+ * Merge markdown from multiple batches
2207
+ */
2208
+ mergeMarkdown(first, continuation) {
2209
+ return `${first.trim()}
2210
+ ${continuation.trim()}`;
2211
+ }
2212
+ /**
2213
+ * Build system prompt for vision LLM (not used, but required by abstract class)
2214
+ */
2215
+ buildSystemPrompt() {
2216
+ return "";
2217
+ }
2218
+ /**
2219
+ * Build user prompt with page range information
2220
+ */
2221
+ buildUserPrompt(startPage, endPage) {
2222
+ const pageCount = endPage - startPage + 1;
2223
+ return `You are a document analysis specialist. Your task is to find and extract the Table of Contents (TOC) from document page images.
2224
+
2225
+ I am providing ${pageCount} document page images (pages ${startPage}-${endPage}).
2226
+
2227
+ ## Where to Look for TOC:
2228
+ - TOC typically appears in the first 10-20 pages of a document
2229
+ - Look for pages with headings like "\uBAA9\uCC28", "\uCC28\uB840", "Contents", "Table of Contents"
2230
+ - Look for structured lists with chapter titles and page numbers
2231
+
2232
+ ## What to Extract:
2233
+ Extract the TOC content as markdown format that matches this exact structure:
2234
+ - Use "- " prefix for each list item
2235
+ - Use 2-space indentation for hierarchy levels
2236
+ - Include "..... " followed by page number at the end of each entry
2237
+ - Preserve original chapter/section numbering from the document
2238
+
2239
+ ## Output Format Example:
2240
+ \`\`\`
2241
+ - \uC81C1\uC7A5 \uC11C\uB860 ..... 1
2242
+ - 1. \uC5F0\uAD6C \uBC30\uACBD ..... 3
2243
+ - 2. \uC5F0\uAD6C \uBAA9\uC801 ..... 5
2244
+ - \uC81C2\uC7A5 \uC5F0\uAD6C \uBC29\uBC95 ..... 10
2245
+ - 1. \uC870\uC0AC \uC9C0\uC5ED ..... 10
2246
+ - 2. \uC870\uC0AC \uBC29\uBC95 ..... 15
2247
+ - \uC81C3\uC7A5 \uC5F0\uAD6C \uACB0\uACFC ..... 25
2248
+ \`\`\`
2249
+
2250
+ ## Important Rules:
2251
+ 1. Extract ONLY the main document TOC
2252
+ 2. DO NOT include supplementary indices:
2253
+ - Photo indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28)
2254
+ - Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28)
2255
+ - Figure indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28)
2256
+ 3. If no TOC is found, set hasToc to false and tocMarkdown to null
2257
+ 4. Set continuesOnNextPage to true if the TOC appears to continue beyond the visible pages
2258
+
2259
+ Please examine these pages and:
2260
+ 1. Determine if any page contains a Table of Contents (TOC)
2261
+ 2. If found, extract the complete TOC in markdown format
2262
+ 3. Indicate if the TOC continues beyond these pages
2263
+
2264
+ Remember: Extract the main document TOC only. Ignore photo/table/figure indices.`;
2265
+ }
2266
+ };
2267
+
2268
+ // src/parsers/caption-parser.ts
2269
+ import { z as z3 } from "zod";
2270
+ var CaptionSingleSchema = z3.object({
2271
+ num: z3.string().nullable().describe('Extracted caption prefix + number (e.g., "\uB3C4\uD310 1", "Figure 2")')
2272
+ });
2273
+ var CaptionExtractionSchema = z3.object({
2274
+ index: z3.number().int().describe("Index of the caption in the input array"),
2275
+ num: z3.string().nullable().describe('Extracted caption prefix + number (e.g., "\uB3C4\uD310 1", "Figure 2")')
2276
+ });
2277
+ var CaptionBatchSchema = z3.object({
2278
+ results: z3.array(CaptionExtractionSchema)
2279
+ });
2280
+ var CaptionParser = class extends TextLLMComponent {
2281
+ constructor(logger, model, options, fallbackModel, aggregator) {
2282
+ super(
2283
+ logger,
2284
+ model,
2285
+ options?.componentName ?? "CaptionParser",
2286
+ options,
2287
+ fallbackModel,
2288
+ aggregator ?? new LLMTokenUsageAggregator()
2289
+ );
2290
+ }
2291
+ /**
2292
+ * Parse batch of captions
2293
+ *
2294
+ * @param captions - Array of caption full texts
2295
+ * @param batchSize - Batch size for processing. Set to 0 for sequential processing without batching.
2296
+ * @param overrideModel - Optional model to use instead of the default model
2297
+ * @returns Array of Caption objects with num extracted (maintains original order)
2298
+ */
2299
+ async parseBatch(captions, batchSize, overrideModel) {
2300
+ const effectiveModel = overrideModel ?? this.model;
2301
+ const isOverride = overrideModel !== void 0;
2302
+ const modelName = effectiveModel.modelId ?? effectiveModel.id ?? "unknown";
2303
+ this.log(
2304
+ "info",
2305
+ `Starting caption parsing for ${captions.length} captions with ${isOverride ? "override " : ""}model: ${modelName}`
2306
+ );
2307
+ if (captions.length === 0) {
2308
+ this.log("info", "No captions to parse");
2309
+ return [];
2310
+ }
2311
+ try {
2312
+ if (batchSize === 0) {
2313
+ this.log("info", "Using sequential processing (batchSize=0)");
2314
+ const results2 = [];
2315
+ for (let i = 0; i < captions.length; i++) {
2316
+ const fullText = captions[i];
2317
+ this.log("info", `Processing ${i + 1} / ${captions.length}...`);
2318
+ const result = await LLMCaller.call({
2319
+ schema: CaptionSingleSchema,
2320
+ systemPrompt: this.buildSystemPrompt("single"),
2321
+ userPrompt: this.buildUserPromptSingle(fullText),
2322
+ primaryModel: effectiveModel,
2323
+ fallbackModel: this.fallbackModel,
2324
+ maxRetries: this.maxRetries,
2325
+ temperature: this.temperature,
2326
+ abortSignal: this.abortSignal,
2327
+ component: this.componentName,
2328
+ phase: "caption-extraction"
2329
+ });
2330
+ this.trackUsage(result.usage);
2331
+ const finalNum = this.extractNumFromFullText(
2332
+ fullText,
2333
+ result.output.num
2334
+ );
2335
+ results2.push({ fullText, num: finalNum });
2336
+ }
2337
+ this.aggregator.logSummary(this.logger);
2338
+ this.log(
2339
+ "info",
2340
+ `Completed: ${results2.length} captions parsed, ${results2.filter((r) => r.num).length} with extracted numbers`
2341
+ );
2342
+ return results2;
2343
+ }
2344
+ const indexedCaptions = captions.map((text, index) => ({ index, text }));
2345
+ const batchResults = await BatchProcessor.processBatch(
2346
+ indexedCaptions,
2347
+ batchSize,
2348
+ async (batch) => this.parseBatchInternal(batch, effectiveModel)
2349
+ );
2350
+ batchResults.sort((a, b) => a.index - b.index);
2351
+ const results = batchResults.map((r) => r.caption);
2352
+ this.aggregator.logSummary(this.logger);
2353
+ this.log(
2354
+ "info",
2355
+ `Completed: ${results.length} captions parsed, ${results.filter((r) => r.num).length} with extracted numbers`
2356
+ );
2357
+ return results;
2358
+ } catch (error) {
2359
+ const message = error instanceof Error ? error.message : String(error);
2360
+ this.log("error", `Parsing failed: ${message}`);
2361
+ throw new CaptionParseError(`Failed to parse captions: ${message}`, {
2362
+ cause: error
2363
+ });
2364
+ }
2365
+ }
2366
+ /**
2367
+ * Internal: Parse batch of captions using LLM
2368
+ *
2369
+ * @param captions - Batch of caption texts with original indices
2370
+ * @param model - Effective model to use
2371
+ * @returns Array of Caption objects indexed correctly
2372
+ */
2373
+ async parseBatchInternal(captions, model) {
2374
+ const result = await LLMCaller.call({
2375
+ schema: CaptionBatchSchema,
2376
+ systemPrompt: this.buildSystemPrompt(),
2377
+ userPrompt: this.buildUserPrompt(captions),
2378
+ primaryModel: model,
2379
+ fallbackModel: this.fallbackModel,
2380
+ maxRetries: this.maxRetries,
2381
+ temperature: this.temperature,
2382
+ abortSignal: this.abortSignal,
2383
+ component: this.componentName,
2384
+ phase: "caption-extraction"
2385
+ });
2386
+ this.trackUsage(result.usage);
2387
+ if (result.output.results.length !== captions.length) {
2388
+ this.log(
2389
+ "warn",
2390
+ `LLM returned ${result.output.results.length} results for ${captions.length} captions. This may cause index mismatch.`
2391
+ );
2392
+ }
2393
+ const captionMap = new Map(captions.map((c) => [c.index, c.text]));
2394
+ return result.output.results.map((resultItem) => {
2395
+ const originalCaption = captions[resultItem.index];
2396
+ const originalIndex = originalCaption?.index ?? resultItem.index;
2397
+ const fullText = captionMap.get(originalIndex) || "";
2398
+ const finalNum = this.extractNumFromFullText(fullText, resultItem.num);
2399
+ return {
2400
+ index: originalIndex,
2401
+ caption: {
2402
+ fullText,
2403
+ num: finalNum
2404
+ }
2405
+ };
2406
+ });
2407
+ }
2408
+ /**
2409
+ * Extract and normalize caption number from full text
2410
+ *
2411
+ * Finds the extracted num pattern in the full text and extracts it
2412
+ * with original casing. Handles case-insensitive matching.
2413
+ *
2414
+ * @param fullText - The full caption text
2415
+ * @param extractedNum - The num extracted by LLM (may have different casing)
2416
+ * @returns Normalized num or undefined if no match
2417
+ */
2418
+ extractNumFromFullText(fullText, extractedNum) {
2419
+ if (!extractedNum) return void 0;
2420
+ let matchIndex = fullText.indexOf(extractedNum);
2421
+ if (matchIndex === -1) {
2422
+ const lowerFullText = fullText.toLowerCase();
2423
+ const lowerNum = extractedNum.toLowerCase();
2424
+ matchIndex = lowerFullText.indexOf(lowerNum);
2425
+ if (matchIndex !== -1) {
2426
+ return fullText.substring(matchIndex, matchIndex + extractedNum.length);
2427
+ }
2428
+ return extractedNum;
2429
+ }
2430
+ return fullText.substring(matchIndex, matchIndex + extractedNum.length);
2431
+ }
2432
+ /**
2433
+ * Build system prompt for caption parsing
2434
+ *
2435
+ * @param mode - 'batch' for multiple captions, 'single' for single caption
2436
+ */
2437
+ buildSystemPrompt(mode = "batch") {
2438
+ const intro = mode === "batch" ? 'Extract the caption prefix and number (e.g., "\uB3C4\uD310 1", "Figure 2") from image/table captions.\nReturn the prefix + number part as a string, or null if no number exists.' : 'Extract the caption prefix and number (e.g., "\uB3C4\uD310 1", "Figure 2") from an image/table caption.\nReturn the prefix + number part as a string, or null if no number exists.';
2439
+ return `You are a caption prefix extractor for archaeological excavation reports.
2440
+
2441
+ ${intro}
2442
+
2443
+ Rules:
2444
+ 1. Extract if the text follows a caption pattern: <prefix word(s)> <number>
2445
+ - The prefix can be ANY Korean/English word(s) that label images/tables/figures
2446
+ - Common examples: \uB3C4\uD310, \uC0AC\uC9C4, \uADF8\uB9BC, \uB3C4\uBA74, \uD45C, \uC6D0\uC0C9\uC0AC\uC9C4, \uD751\uBC31\uC0AC\uC9C4, Figure, Photo, Plate, etc.
2447
+ - The key is the PATTERN (text followed by number), not a specific word list
2448
+ - "\uC6D0\uC0C9\uC0AC\uC9C4 1. \uC870\uC0AC\uC9C0\uC5ED" \u2192 "\uC6D0\uC0C9\uC0AC\uC9C4 1" (valid: prefix + number pattern)
2449
+ - "\uD751\uBC31\uC0AC\uC9C4 2 \uCD9C\uD1A0\uC720\uBB3C" \u2192 "\uD751\uBC31\uC0AC\uC9C4 2" (valid: prefix + number pattern)
2450
+ 2. IGNORE leading punctuation/brackets when extracting:
2451
+ - "(\uC0AC\uC9C4 16> \uB290\uD2F0\uB098\uBB34" \u2192 "\uC0AC\uC9C4 16" (ignore leading '(' and extract the pattern inside)
2452
+ - "<\uB3C4\uD310 1> \uC720\uC801" \u2192 "\uB3C4\uD310 1" (ignore angle brackets)
2453
+ - "[\uADF8\uB9BC 2] \uC804\uACBD" \u2192 "\uADF8\uB9BC 2" (ignore square brackets)
2454
+ 3. Do NOT extract (return null) if:
2455
+ - It's a numbered list item starting with just a number: "1. \uC720\uC801 \uC804\uACBD" \u2192 null
2456
+ - It's a date/time reference: "39 3\uC6D4 28\uC77C..." \u2192 null
2457
+ - It's a year reference: "2024\uB144 \uC870\uC0AC \uD604\uD669" \u2192 null
2458
+ - It starts with a number without a prefix: "123 \uC124\uBA85" \u2192 null
2459
+ 4. PRESERVE original spacing from the input text exactly (after ignoring leading punctuation)
2460
+ 5. Include the full number (e.g., "1-2", "3a") not just the first digit
2461
+ 6. Include period/dot after number if it directly follows (e.g., "3.6" \u2192 "\uB3C4\uD310 3.6")
2462
+ - "\uADF8\uB9BC 3.6. \uD55C\uBC18\uB3C4 \uC911\uBD80" \u2192 "\uADF8\uB9BC 3.6" (period after decimal number included)
2463
+ - "\uB3C4\uD310 2. \uC720\uC801" \u2192 "\uB3C4\uD310 2" (period after space NOT included)
2464
+ 7. Stop at the first punctuation (except decimal point), whitespace, or underscore after the number
2465
+ - "\uC0AC\uC9C4 1_\u3147\u3147\u3147" \u2192 "\uC0AC\uC9C4 1" (stop at underscore)
2466
+ - "\uC0AC\uC9C4 1 \u3147\u3147\u3147" \u2192 "\uC0AC\uC9C4 1" (stop at space)
2467
+ - "\uADF8\uB9BC 3.6. \uD55C\uBC18\uB3C4" \u2192 "\uADF8\uB9BC 3.6" (period included as decimal separator)
2468
+
2469
+ Examples:
2470
+ - "\uB3C4\uD310 1 \uC720\uC801 \uC804\uACBD" \u2192 "\uB3C4\uD310 1"
2471
+ - "\uC6D0\uC0C9\uC0AC\uC9C4 1. \uC870\uC0AC\uC9C0\uC5ED \uC6D0\uACBD" \u2192 "\uC6D0\uC0C9\uC0AC\uC9C4 1"
2472
+ - "\uD751\uBC31\uC0AC\uC9C4 2 \uCD9C\uD1A0\uC720\uBB3C" \u2192 "\uD751\uBC31\uC0AC\uC9C4 2"
2473
+ - "(\uC0AC\uC9C4 16> \uB290\uD2F0\uB098\uBB34\uC758 \uC811\uC120\uB2E8\uBA74" \u2192 "\uC0AC\uC9C4 16" (ignore leading punctuation)
2474
+ - "<\uB3C4\uD310 3> \uC720\uBB3C \uC0AC\uC9C4" \u2192 "\uB3C4\uD310 3" (ignore angle brackets)
2475
+ - "\uB3C4\uD3101 \uC5B4\uCA4C\uAD6C" \u2192 "\uB3C4\uD3101" (no space preserved)
2476
+ - "\uC0AC\uC9C4 2. \uCD9C\uD1A0 \uC720\uBB3C" \u2192 "\uC0AC\uC9C4 2" (period after space, not included)
2477
+ - "\uADF8\uB9BC 3.6. \uD55C\uBC18\uB3C4 \uC911\uBD80 \uBC0F \uB0A8\uBD80\uC758 \u3147\u3147\u3147" \u2192 "\uADF8\uB9BC 3.6" (period as decimal included)
2478
+ - "Figure 3: Site plan" \u2192 "Figure 3"
2479
+ - "Table 4a. Artifact list" \u2192 "Table 4a"
2480
+ - "\uB3C4\uD310 5-2 \uCE35\uC704 \uB2E8\uBA74" \u2192 "\uB3C4\uD310 5-2"
2481
+ - "\uC124\uBA85 \uC5C6\uB294 \uC774\uBBF8\uC9C0" \u2192 null
2482
+ - "39 3\uC6D4 28\uC77C(\uBC31\uC81C \uB3C4\uB85C\uC720\uAD6C \uB0B4\uBD80 \uC870\uC0AC)" \u2192 null (starts with number, no prefix)
2483
+ - "1. \uC720\uAD6C \uD604\uD669" \u2192 null (numbered list, not caption)
2484
+ - "2024-05-01 \uCD2C\uC601" \u2192 null (date, not caption)`;
2485
+ }
2486
+ /**
2487
+ * Build user prompt for caption parsing
2488
+ */
2489
+ buildUserPrompt(captions) {
2490
+ const captionList = captions.map((c) => `[${c.index}] ${c.text}`).join("\n");
2491
+ return `Extract caption prefix and number from the following captions:
2492
+
2493
+ ${captionList}
2494
+
2495
+ Return the results as JSON array with "index" (original position) and "num" (extracted prefix + number or null).
2496
+
2497
+ Example format:
2498
+ [
2499
+ { "index": 0, "num": "\uB3C4\uD310 1" },
2500
+ { "index": 1, "num": "Figure 2" },
2501
+ { "index": 2, "num": null }
2502
+ ]`;
2503
+ }
2504
+ /**
2505
+ * Build user prompt for single caption parsing
2506
+ */
2507
+ buildUserPromptSingle(caption) {
2508
+ return `Extract caption prefix and number from the following caption:
2509
+
2510
+ "${caption}"
2511
+
2512
+ CRITICAL: Return ONLY the JSON object directly with a "num" field.
2513
+ - DO NOT wrap the JSON in quotes or additional formatting
2514
+ - DO NOT output "final:", "result:", or any prefix labels
2515
+ - DO NOT wrap in backticks or code blocks
2516
+ - Return ONLY valid JSON: { "num": value }
2517
+
2518
+ The value must be:
2519
+ - A string with the extracted caption prefix + number (e.g., "\uB3C4\uD310 1", "Figure 2")
2520
+ - null if no number exists
2521
+
2522
+ Valid outputs:
2523
+ { "num": "\uB3C4\uD310 1" }
2524
+ { "num": null }
2525
+
2526
+ Invalid outputs (NEVER do these):
2527
+ - { "final": "..." } \u274C
2528
+ - \`\`\`json { "num": "..." } \`\`\` \u274C
2529
+ - "{ "num": "..." }" \u274C
2530
+ - { "num": { "value": "..." } } \u274C`;
2531
+ }
2532
+ };
2533
+ var CaptionParseError = class extends Error {
2534
+ constructor(message, options) {
2535
+ super(message, options);
2536
+ this.name = "CaptionParseError";
2537
+ }
2538
+ };
2539
+
2540
+ // src/parsers/page-range-parse-error.ts
2541
+ var PageRangeParseError = class _PageRangeParseError extends Error {
2542
+ constructor(message, options) {
2543
+ super(message, options);
2544
+ this.name = "PageRangeParseError";
2545
+ }
2546
+ /**
2547
+ * Extract error message from unknown error type
2548
+ */
2549
+ static getErrorMessage(error) {
2550
+ return error instanceof Error ? error.message : String(error);
2551
+ }
2552
+ /**
2553
+ * Create PageRangeParseError from unknown error with context
2554
+ */
2555
+ static fromError(context, error) {
2556
+ return new _PageRangeParseError(
2557
+ `${context}: ${_PageRangeParseError.getErrorMessage(error)}`,
2558
+ { cause: error }
2559
+ );
2560
+ }
2561
+ };
2562
+
2563
+ // src/parsers/page-range-parser.ts
2564
+ import * as fs3 from "fs";
2565
+ import * as path3 from "path";
2566
+ import { z as z4 } from "zod";
2567
+ var PagePattern = /* @__PURE__ */ ((PagePattern2) => {
2568
+ PagePattern2["SIMPLE_INCREMENT"] = "simple_increment";
2569
+ PagePattern2["DOUBLE_SIDED"] = "double_sided";
2570
+ PagePattern2["OFFSET"] = "offset";
2571
+ PagePattern2["UNKNOWN"] = "unknown";
2572
+ return PagePattern2;
2573
+ })(PagePattern || {});
2574
+ var PageRangeParser = class extends VisionLLMComponent {
2575
+ // Configuration constants
2576
+ SAMPLE_SIZE = 3;
2577
+ MAX_PATTERN_RETRIES = 6;
2578
+ SIZE_TOLERANCE = 5;
2579
+ constructor(logger, model, outputPath, maxRetries = 3, fallbackModel, aggregator, abortSignal) {
2580
+ super(
2581
+ logger,
2582
+ model,
2583
+ "PageRangeParser",
2584
+ outputPath,
2585
+ { maxRetries, abortSignal },
2586
+ fallbackModel,
2587
+ aggregator ?? new LLMTokenUsageAggregator()
2588
+ );
2589
+ }
2590
+ /**
2591
+ * Main parse method
2592
+ *
2593
+ * Extracts page range mapping from DoclingDocument using Vision LLM.
2594
+ * Automatically tracks token usage in the aggregator if one was provided.
2595
+ *
2596
+ * @param doclingDoc - DoclingDocument to extract page ranges from
2597
+ * @returns Object with page range mapping and token usage information
2598
+ */
2599
+ async parse(doclingDoc) {
2600
+ this.log("info", "Starting page range parsing...");
2601
+ const pages = this.extractPages(doclingDoc);
2602
+ if (pages.length === 0) {
2603
+ this.log("warn", "No pages found");
2604
+ const emptyUsage = this.createEmptyUsage("sampling");
2605
+ this.trackUsage(emptyUsage);
2606
+ return {
2607
+ pageRangeMap: {},
2608
+ usage: [emptyUsage]
2609
+ };
2610
+ }
2611
+ const sizeGroups = this.analyzeSizes(pages);
2612
+ this.log(
2613
+ "info",
2614
+ `Found ${sizeGroups.length} size group(s), total ${pages.length} pages`
2615
+ );
2616
+ const pageRangeMap = {};
2617
+ const usageList = [];
2618
+ for (let i = 0; i < sizeGroups.length; i++) {
2619
+ const group = sizeGroups[i];
2620
+ this.log(
2621
+ "info",
2622
+ `Processing group ${i + 1}/${sizeGroups.length}: ${group.pageNos.length} pages`
2623
+ );
2624
+ const groupResult = await this.processGroup(pages, group, this.model);
2625
+ Object.assign(pageRangeMap, groupResult.pageRangeMap);
2626
+ usageList.push(...groupResult.usage);
2627
+ }
2628
+ for (const usage of usageList) {
2629
+ this.trackUsage(usage);
2630
+ }
2631
+ this.postProcess(pageRangeMap);
2632
+ this.log(
2633
+ "info",
2634
+ `Completed: ${Object.keys(pageRangeMap).length} pages mapped`
2635
+ );
2636
+ return { pageRangeMap, usage: usageList };
2637
+ }
2638
+ /**
2639
+ * Extract pages array from DoclingDocument
2640
+ */
2641
+ extractPages(doclingDoc) {
2642
+ const pageKeys = Object.keys(doclingDoc.pages).map(Number).filter((n) => !Number.isNaN(n)).sort((a, b) => a - b);
2643
+ return pageKeys.map((key) => doclingDoc.pages[String(key)]);
2644
+ }
2645
+ /**
2646
+ * Analyze page sizes and group consecutive pages with same dimensions
2647
+ */
2648
+ analyzeSizes(pages) {
2649
+ const groups = [];
2650
+ let currentGroup = null;
2651
+ for (const page of pages) {
2652
+ const sizeKey = this.createSizeKey(page.size.width, page.size.height);
2653
+ if (!currentGroup || currentGroup.sizeKey !== sizeKey) {
2654
+ currentGroup = { sizeKey, pageNos: [page.page_no] };
2655
+ groups.push(currentGroup);
2656
+ } else {
2657
+ currentGroup.pageNos.push(page.page_no);
2658
+ }
2659
+ }
2660
+ return groups;
2661
+ }
2662
+ /**
2663
+ * Create size key with tolerance for floating point comparison
2664
+ */
2665
+ createSizeKey(width, height) {
2666
+ const roundedWidth = Math.round(width / this.SIZE_TOLERANCE);
2667
+ const roundedHeight = Math.round(height / this.SIZE_TOLERANCE);
2668
+ return `${roundedWidth}x${roundedHeight}`;
2669
+ }
2670
+ /**
2671
+ * Process a single size group
2672
+ */
2673
+ async processGroup(pages, group, model) {
2674
+ const { pageNos } = group;
2675
+ const usageList = [];
2676
+ if (pageNos.length <= this.SAMPLE_SIZE) {
2677
+ this.log(
2678
+ "info",
2679
+ `Small group (${pageNos.length} pages), extracting all at once`
2680
+ );
2681
+ const result = await this.extractMultiplePages(pages, pageNos, model);
2682
+ usageList.push(result.usage);
2683
+ return {
2684
+ pageRangeMap: this.samplesToMap(result.samples),
2685
+ usage: usageList
2686
+ };
2687
+ }
2688
+ const sampledPages = /* @__PURE__ */ new Set();
2689
+ for (let attempt = 0; attempt <= this.MAX_PATTERN_RETRIES; attempt++) {
2690
+ const samplePageNos = this.selectRandomSamples(
2691
+ pageNos,
2692
+ this.SAMPLE_SIZE,
2693
+ sampledPages
2694
+ );
2695
+ for (const p of samplePageNos) {
2696
+ sampledPages.add(p);
2697
+ }
2698
+ this.log(
2699
+ "info",
2700
+ `Attempt ${attempt + 1}/${this.MAX_PATTERN_RETRIES + 1}: sampling pages ${samplePageNos.join(", ")}`
2701
+ );
2702
+ const result = await this.extractMultiplePages(
2703
+ pages,
2704
+ samplePageNos,
2705
+ model
2706
+ );
2707
+ usageList.push(result.usage);
2708
+ const samples = result.samples;
2709
+ const pattern = this.detectPattern(samples);
2710
+ if (pattern.pattern !== "unknown" /* UNKNOWN */) {
2711
+ this.log(
2712
+ "info",
2713
+ `Pattern detected: ${pattern.pattern} (offset=${pattern.offset}, increment=${pattern.increment})`
2714
+ );
2715
+ return {
2716
+ pageRangeMap: this.applyPattern(pageNos, pattern),
2717
+ usage: usageList
2718
+ };
2719
+ }
2720
+ this.log(
2721
+ "warn",
2722
+ `Pattern detection failed, attempt ${attempt + 1}/${this.MAX_PATTERN_RETRIES + 1}`
2723
+ );
2724
+ }
2725
+ throw new PageRangeParseError(
2726
+ `Failed to detect page pattern after ${this.MAX_PATTERN_RETRIES + 1} attempts for size group with ${pageNos.length} pages`
2727
+ );
2728
+ }
2729
+ /**
2730
+ * Select random samples from page numbers
2731
+ */
2732
+ selectRandomSamples(pageNos, count, exclude = /* @__PURE__ */ new Set()) {
2733
+ const available = pageNos.filter((p) => !exclude.has(p));
2734
+ const pool = available.length >= count ? available : pageNos;
2735
+ const shuffled = [...pool];
2736
+ for (let i = shuffled.length - 1; i > 0; i--) {
2737
+ const j = Math.floor(Math.random() * (i + 1));
2738
+ [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
2739
+ }
2740
+ return shuffled.slice(0, count).sort((a, b) => a - b);
2741
+ }
2742
+ /**
2743
+ * Extract page numbers from multiple pages in a single LLM call
2744
+ */
2745
+ async extractMultiplePages(pages, pageNos, model) {
2746
+ this.log("info", `Extracting ${pageNos.length} pages in single LLM call`);
2747
+ const imageContents = [];
2748
+ for (const pageNo of pageNos) {
2749
+ const page = pages[pageNo - 1];
2750
+ const imagePath = path3.resolve(this.outputPath, page.image.uri);
2751
+ const imageBuffer = fs3.readFileSync(imagePath);
2752
+ const base64Image = imageBuffer.toString("base64");
2753
+ const mimeType = page.image.mimetype || "image/png";
2754
+ imageContents.push({
2755
+ type: "image",
2756
+ image: `data:${mimeType};base64,${base64Image}`
2757
+ });
2758
+ }
2759
+ const schema = z4.object({
2760
+ pages: z4.array(
2761
+ z4.object({
2762
+ imageIndex: z4.number().describe("0-based index of the image in the request"),
2763
+ startPageNo: z4.number().nullable().describe("Start page number (null if not found)"),
2764
+ endPageNo: z4.number().nullable().describe(
2765
+ "End page number for double-sided scans (null for single page)"
2766
+ )
2767
+ })
2768
+ ).describe("Extracted page numbers for each image")
2769
+ });
2770
+ try {
2771
+ const result = await LLMCaller.callVision({
2772
+ schema,
2773
+ messages: [
2774
+ {
2775
+ role: "user",
2776
+ content: [
2777
+ { type: "text", text: this.buildUserPrompt(pageNos) },
2778
+ ...imageContents
2779
+ ]
2780
+ }
2781
+ ],
2782
+ primaryModel: model,
2783
+ fallbackModel: this.fallbackModel,
2784
+ maxRetries: this.maxRetries,
2785
+ temperature: 0,
2786
+ abortSignal: this.abortSignal,
2787
+ component: "PageRangeParser",
2788
+ phase: "sampling"
2789
+ });
2790
+ const samples = result.output.pages.map((p) => ({
2791
+ pdfPageNo: pageNos[p.imageIndex],
2792
+ startPageNo: p.startPageNo,
2793
+ endPageNo: p.endPageNo
2794
+ }));
2795
+ return { samples, usage: result.usage };
2796
+ } catch (error) {
2797
+ this.log("error", "Multi-image extraction failed:", error);
2798
+ throw PageRangeParseError.fromError(
2799
+ "Multi-image extraction failed",
2800
+ error
2801
+ );
2802
+ }
2803
+ }
2804
+ /**
2805
+ * Detect pattern from sample results
2806
+ */
2807
+ detectPattern(samples) {
2808
+ const validSamples = samples.filter((s) => s.startPageNo !== null);
2809
+ if (validSamples.length < 2) {
2810
+ return { pattern: "unknown" /* UNKNOWN */, offset: 0, increment: 1 };
2811
+ }
2812
+ validSamples.sort((a, b) => a.pdfPageNo - b.pdfPageNo);
2813
+ const isSimple = validSamples.every((s, i) => {
2814
+ if (s.endPageNo !== null && s.startPageNo !== s.endPageNo) return false;
2815
+ if (i === 0) return true;
2816
+ const prev = validSamples[i - 1];
2817
+ const expectedIncrease = s.pdfPageNo - prev.pdfPageNo;
2818
+ return s.startPageNo === prev.startPageNo + expectedIncrease;
2819
+ });
2820
+ if (isSimple) {
2821
+ const firstSample = validSamples[0];
2822
+ const offset = firstSample.startPageNo - firstSample.pdfPageNo;
2823
+ return { pattern: "simple_increment" /* SIMPLE_INCREMENT */, offset, increment: 1 };
2824
+ }
2825
+ const isDoubleSided = validSamples.every((s, i) => {
2826
+ if (s.endPageNo === null) return false;
2827
+ if (s.endPageNo !== s.startPageNo + 1) return false;
2828
+ if (i === 0) return true;
2829
+ const prev = validSamples[i - 1];
2830
+ const pdfDiff = s.pdfPageNo - prev.pdfPageNo;
2831
+ const expectedStartDiff = pdfDiff * 2;
2832
+ const actualStartDiff = s.startPageNo - prev.startPageNo;
2833
+ return actualStartDiff === expectedStartDiff;
2834
+ });
2835
+ if (isDoubleSided) {
2836
+ const firstSample = validSamples[0];
2837
+ const offset = firstSample.startPageNo - firstSample.pdfPageNo * 2;
2838
+ return { pattern: "double_sided" /* DOUBLE_SIDED */, offset, increment: 2 };
2839
+ }
2840
+ const offsets = validSamples.map((s) => s.startPageNo - s.pdfPageNo);
2841
+ const avgOffset = Math.round(
2842
+ offsets.reduce((a, b) => a + b, 0) / offsets.length
2843
+ );
2844
+ const isConsistentOffset = offsets.every(
2845
+ (o) => Math.abs(o - avgOffset) <= 1
2846
+ );
2847
+ if (isConsistentOffset) {
2848
+ return { pattern: "offset" /* OFFSET */, offset: avgOffset, increment: 1 };
2849
+ }
2850
+ return { pattern: "unknown" /* UNKNOWN */, offset: 0, increment: 1 };
2851
+ }
2852
+ /**
2853
+ * Apply detected pattern to generate page range map
2854
+ */
2855
+ applyPattern(pageNos, pattern) {
2856
+ const result = {};
2857
+ for (const pdfPageNo of pageNos) {
2858
+ switch (pattern.pattern) {
2859
+ case "simple_increment" /* SIMPLE_INCREMENT */:
2860
+ case "offset" /* OFFSET */: {
2861
+ const pageNo = pdfPageNo + pattern.offset;
2862
+ result[pdfPageNo] = {
2863
+ startPageNo: pageNo,
2864
+ endPageNo: pageNo
2865
+ };
2866
+ break;
2867
+ }
2868
+ case "double_sided" /* DOUBLE_SIDED */: {
2869
+ const start = pdfPageNo * 2 + pattern.offset;
2870
+ result[pdfPageNo] = {
2871
+ startPageNo: start,
2872
+ endPageNo: start + 1
2873
+ };
2874
+ break;
2875
+ }
2876
+ default:
2877
+ result[pdfPageNo] = { startPageNo: 0, endPageNo: 0 };
2878
+ }
2879
+ }
2880
+ return result;
2881
+ }
2882
+ /**
2883
+ * Convert sample results to page range map (for small groups)
2884
+ */
2885
+ samplesToMap(samples) {
2886
+ const result = {};
2887
+ for (const sample of samples) {
2888
+ if (sample.startPageNo !== null) {
2889
+ result[sample.pdfPageNo] = {
2890
+ startPageNo: sample.startPageNo,
2891
+ endPageNo: sample.endPageNo ?? sample.startPageNo
2892
+ };
2893
+ } else {
2894
+ result[sample.pdfPageNo] = { startPageNo: 0, endPageNo: 0 };
2895
+ }
2896
+ }
2897
+ return result;
2898
+ }
2899
+ /**
2900
+ * Post-process the page range map
2901
+ */
2902
+ postProcess(pageRangeMap) {
2903
+ this.detectAndHandleOutliers(pageRangeMap);
2904
+ this.detectAndHandleDrops(pageRangeMap);
2905
+ this.normalizeNegatives(pageRangeMap);
2906
+ this.backfillFailedPages(pageRangeMap);
2907
+ }
2908
+ /**
2909
+ * Detect and handle outlier page numbers at the beginning of document
2910
+ *
2911
+ * When early PDF pages have abnormally high page numbers compared to
2912
+ * subsequent pages (e.g., PDF 1-9 = 75-83, but PDF 10+ = 2,3,4...),
2913
+ * the LLM likely misread figure/photo numbers as page numbers.
2914
+ *
2915
+ * Detection: If page numbers at the beginning are significantly higher
2916
+ * than subsequent pages (which follow a normal pattern), mark them as failed.
2917
+ */
2918
+ detectAndHandleOutliers(pageRangeMap) {
2919
+ const pdfPages = Object.keys(pageRangeMap).map(Number).sort((a, b) => a - b);
2920
+ if (pdfPages.length < 3) return;
2921
+ const normalSequenceStart = this.findNormalSequenceStart(
2922
+ pageRangeMap,
2923
+ pdfPages
2924
+ );
2925
+ if (normalSequenceStart === null || normalSequenceStart <= 0) return;
2926
+ const normalStartPdfPage = pdfPages[normalSequenceStart];
2927
+ const normalStartPageNo = pageRangeMap[normalStartPdfPage].startPageNo;
2928
+ let hasOutliers = false;
2929
+ for (let i = 0; i < normalSequenceStart; i++) {
2930
+ const pdfPage = pdfPages[i];
2931
+ const pageNo = pageRangeMap[pdfPage].startPageNo;
2932
+ if (pageNo === 0) continue;
2933
+ const pdfDiff = normalStartPdfPage - pdfPage;
2934
+ const isDoubleSided = this.isDoubleSidedRange(
2935
+ pageRangeMap[normalStartPdfPage]
2936
+ );
2937
+ const expectedPageNo = isDoubleSided ? normalStartPageNo - pdfDiff * 2 : normalStartPageNo - pdfDiff;
2938
+ if (pageNo > expectedPageNo + 10) {
2939
+ this.log(
2940
+ "info",
2941
+ `Outlier detected: PDF ${pdfPage}=${pageNo} (expected ~${expectedPageNo})`
2942
+ );
2943
+ pageRangeMap[pdfPage] = { startPageNo: 0, endPageNo: 0 };
2944
+ hasOutliers = true;
2945
+ }
2946
+ }
2947
+ if (hasOutliers) {
2948
+ this.log("info", `Outliers marked as failed, will be backfilled later`);
2949
+ }
2950
+ }
2951
+ /**
2952
+ * Find the start index of a "normal" sequence in the page range map
2953
+ *
2954
+ * A normal sequence is defined as at least 3 consecutive PDF pages where:
2955
+ * - Page numbers are increasing (for single-page) or increasing by 2 (for double-sided)
2956
+ * - The pattern is consistent
2957
+ *
2958
+ * Returns the index in pdfPages array, or null if not found.
2959
+ */
2960
+ findNormalSequenceStart(pageRangeMap, pdfPages) {
2961
+ const MIN_SEQUENCE_LENGTH = 3;
2962
+ for (let startIdx = 0; startIdx <= pdfPages.length - MIN_SEQUENCE_LENGTH; startIdx++) {
2963
+ let isValidSequence = true;
2964
+ let expectedIncrement = null;
2965
+ for (let i = 0; i < MIN_SEQUENCE_LENGTH - 1; i++) {
2966
+ const currPdfPage = pdfPages[startIdx + i];
2967
+ const nextPdfPage = pdfPages[startIdx + i + 1];
2968
+ const currRange = pageRangeMap[currPdfPage];
2969
+ const nextRange = pageRangeMap[nextPdfPage];
2970
+ if (currRange.startPageNo === 0 || nextRange.startPageNo === 0) {
2971
+ isValidSequence = false;
2972
+ break;
2973
+ }
2974
+ const pageIncrement = nextRange.startPageNo - currRange.startPageNo;
2975
+ const pdfIncrement = nextPdfPage - currPdfPage;
2976
+ const isDoubleSided = this.isDoubleSidedRange(currRange);
2977
+ const expectedIncrementPerPdf = isDoubleSided ? 2 : 1;
2978
+ const expected = pdfIncrement * expectedIncrementPerPdf;
2979
+ if (expectedIncrement === null) {
2980
+ expectedIncrement = pageIncrement;
2981
+ }
2982
+ if (pageIncrement !== expected) {
2983
+ isValidSequence = false;
2984
+ break;
2985
+ }
2986
+ }
2987
+ if (isValidSequence) {
2988
+ return startIdx;
2989
+ }
2990
+ }
2991
+ return null;
2992
+ }
2993
+ /**
2994
+ * Check if a page range represents a double-sided scan
2995
+ */
2996
+ isDoubleSidedRange(range) {
2997
+ return range.endPageNo !== null && range.endPageNo !== range.startPageNo && range.endPageNo === range.startPageNo + 1;
2998
+ }
2999
+ /**
3000
+ * Detect and handle page number drops
3001
+ *
3002
+ * When page numbers suddenly decrease (e.g., 8,9 -> 3,4),
3003
+ * recalculate previous pages based on the drop point.
3004
+ */
3005
+ detectAndHandleDrops(pageRangeMap) {
3006
+ const pdfPages = Object.keys(pageRangeMap).map(Number).sort((a, b) => a - b);
3007
+ if (pdfPages.length < 2) return;
3008
+ for (let i = 1; i < pdfPages.length; i++) {
3009
+ const prevPdfPage = pdfPages[i - 1];
3010
+ const currPdfPage = pdfPages[i];
3011
+ const prevPageNo = pageRangeMap[prevPdfPage].startPageNo;
3012
+ const currPageNo = pageRangeMap[currPdfPage].startPageNo;
3013
+ if (prevPageNo === 0 || currPageNo === 0) continue;
3014
+ if (currPageNo > 0 && prevPageNo > currPageNo && prevPageNo - currPageNo > 1) {
3015
+ this.log(
3016
+ "info",
3017
+ `Page drop detected: PDF ${prevPdfPage}=${prevPageNo} -> PDF ${currPdfPage}=${currPageNo}`
3018
+ );
3019
+ const isDoubleSided = this.isDoubleSidedRange(
3020
+ pageRangeMap[currPdfPage]
3021
+ );
3022
+ for (let j = i - 1; j >= 0; j--) {
3023
+ const pdfPage = pdfPages[j];
3024
+ const distance = currPdfPage - pdfPage;
3025
+ if (isDoubleSided) {
3026
+ const expectedStartPageNo = currPageNo - distance * 2;
3027
+ if (expectedStartPageNo < 1) {
3028
+ pageRangeMap[pdfPage] = { startPageNo: 0, endPageNo: 0 };
3029
+ } else {
3030
+ pageRangeMap[pdfPage] = {
3031
+ startPageNo: expectedStartPageNo,
3032
+ endPageNo: expectedStartPageNo + 1
3033
+ };
3034
+ }
3035
+ } else {
3036
+ const expectedPageNo = currPageNo - distance;
3037
+ if (expectedPageNo < 1) {
3038
+ pageRangeMap[pdfPage] = { startPageNo: 0, endPageNo: 0 };
3039
+ } else {
3040
+ pageRangeMap[pdfPage] = {
3041
+ startPageNo: expectedPageNo,
3042
+ endPageNo: expectedPageNo
3043
+ };
3044
+ }
3045
+ }
3046
+ this.log(
3047
+ "info",
3048
+ `Recalculated PDF ${pdfPage} -> ${pageRangeMap[pdfPage].startPageNo}`
3049
+ );
3050
+ }
3051
+ }
3052
+ }
3053
+ }
3054
+ /**
3055
+ * Normalize negative page numbers to 0
3056
+ */
3057
+ normalizeNegatives(pageRangeMap) {
3058
+ for (const [pdfPageStr, range] of Object.entries(pageRangeMap)) {
3059
+ if (range.startPageNo < 0 || range.endPageNo < 0) {
3060
+ this.log("info", `Normalizing negative: PDF ${pdfPageStr} -> 0`);
3061
+ pageRangeMap[Number(pdfPageStr)] = { startPageNo: 0, endPageNo: 0 };
3062
+ }
3063
+ }
3064
+ }
3065
+ /**
3066
+ * Backfill pages marked with 0 using detected pattern
3067
+ */
3068
+ backfillFailedPages(pageRangeMap) {
3069
+ const pdfPages = Object.keys(pageRangeMap).map(Number).sort((a, b) => a - b);
3070
+ const failedPages = pdfPages.filter(
3071
+ (p) => pageRangeMap[p].startPageNo === 0
3072
+ );
3073
+ if (failedPages.length === 0) return;
3074
+ const successfulPages = pdfPages.filter((p) => pageRangeMap[p].startPageNo > 0).map((p) => ({
3075
+ pdfPage: p,
3076
+ pageNo: pageRangeMap[p].startPageNo,
3077
+ isDoubleSided: this.isDoubleSidedRange(pageRangeMap[p])
3078
+ }));
3079
+ if (successfulPages.length < 2) {
3080
+ this.log("warn", "Not enough successful pages for backfill");
3081
+ return;
3082
+ }
3083
+ const doubleSidedCount = successfulPages.filter(
3084
+ (s) => s.isDoubleSided
3085
+ ).length;
3086
+ const isDoubleSided = doubleSidedCount > successfulPages.length / 2;
3087
+ if (isDoubleSided) {
3088
+ const offsets = successfulPages.map((s) => s.pageNo - s.pdfPage * 2);
3089
+ const avgOffset = Math.round(
3090
+ offsets.reduce((a, b) => a + b, 0) / offsets.length
3091
+ );
3092
+ this.log(
3093
+ "info",
3094
+ `Backfilling ${failedPages.length} pages with double-sided pattern (offset=${avgOffset})`
3095
+ );
3096
+ for (const pdfPage of failedPages) {
3097
+ const expectedStartPageNo = pdfPage * 2 + avgOffset;
3098
+ if (expectedStartPageNo < 1) {
3099
+ this.log(
3100
+ "info",
3101
+ `Backfill skipped for PDF ${pdfPage} (would be ${expectedStartPageNo})`
3102
+ );
3103
+ continue;
3104
+ }
3105
+ this.log(
3106
+ "info",
3107
+ `Backfill PDF ${pdfPage}: 0 -> ${expectedStartPageNo}-${expectedStartPageNo + 1}`
3108
+ );
3109
+ pageRangeMap[pdfPage] = {
3110
+ startPageNo: expectedStartPageNo,
3111
+ endPageNo: expectedStartPageNo + 1
3112
+ };
3113
+ }
3114
+ } else {
3115
+ const offsets = successfulPages.map((s) => s.pageNo - s.pdfPage);
3116
+ const avgOffset = Math.round(
3117
+ offsets.reduce((a, b) => a + b, 0) / offsets.length
3118
+ );
3119
+ this.log(
3120
+ "info",
3121
+ `Backfilling ${failedPages.length} pages with offset ${avgOffset}`
3122
+ );
3123
+ for (const pdfPage of failedPages) {
3124
+ const expectedPageNo = pdfPage + avgOffset;
3125
+ if (expectedPageNo < 1) {
3126
+ this.log(
3127
+ "info",
3128
+ `Backfill skipped for PDF ${pdfPage} (would be ${expectedPageNo})`
3129
+ );
3130
+ continue;
3131
+ }
3132
+ this.log("info", `Backfill PDF ${pdfPage}: 0 -> ${expectedPageNo}`);
3133
+ pageRangeMap[pdfPage] = {
3134
+ startPageNo: expectedPageNo,
3135
+ endPageNo: expectedPageNo
3136
+ };
3137
+ }
3138
+ }
3139
+ }
3140
+ /**
3141
+ * Build system prompt for Vision LLM
3142
+ */
3143
+ buildSystemPrompt() {
3144
+ return `You are a page number extraction specialist for document images.
3145
+ You will receive multiple document page images. For EACH image, extract the visible page number(s).
3146
+
3147
+ **SCAN TYPES:**
3148
+ 1. SINGLE PAGE: One document page per image. Return startPageNo only, endPageNo should be null.
3149
+ 2. DOUBLE-SIDED: Two document pages per image (spread). Return startPageNo (left) and endPageNo (right).
3150
+
3151
+ **WHERE TO LOOK:**
3152
+ - Bottom center, bottom corners (most common)
3153
+ - Top corners (less common)
3154
+ - Page numbers are SMALL numbers in MARGINS, NOT in content area
3155
+
3156
+ **WHAT TO IGNORE - These are NOT page numbers:**
3157
+ - Roman numerals (i, ii, iii, iv, v...) - return null
3158
+ - Figure numbers: "Figure 5", "Fig. 5", "\uB3C4 5", "\uADF8\uB9BC 5"
3159
+ - Table numbers: "Table 3", "\uD45C 3"
3160
+ - Photo numbers: "Photo 8", "\uC0AC\uC9C4 8", "Plate 4", "\uB3C4\uD310 4"
3161
+ - Years in content: "2015", "(1998)"
3162
+ - Any numbers with text prefix or inside content area
3163
+
3164
+ **RESPONSE FORMAT:**
3165
+ For each image (in order), provide:
3166
+ - imageIndex: 0-based index of the image
3167
+ - startPageNo: The page number found (null if not visible/readable)
3168
+ - endPageNo: Right page number for double-sided scans (null for single pages)`;
3169
+ }
3170
+ /**
3171
+ * Build user prompt for Vision LLM
3172
+ */
3173
+ buildUserPrompt(pageNos) {
3174
+ return `I am providing ${pageNos.length} document page images.
3175
+ These are PDF pages: ${pageNos.join(", ")}.
3176
+
3177
+ For each image (in order), extract the visible page number(s).
3178
+ Return null for pages where no page number is visible or readable.
3179
+
3180
+ Remember: Look for SMALL numbers in MARGINS only. Ignore figure/table/photo numbers.`;
3181
+ }
3182
+ };
3183
+
3184
+ // src/validators/base-validator.ts
3185
+ var BaseValidator = class extends TextLLMComponent {
3186
+ /**
3187
+ * Validator name for logging (kept for backwards compatibility)
3188
+ */
3189
+ validatorName;
3190
+ /**
3191
+ * Constructor for BaseValidator
3192
+ *
3193
+ * @param logger - Logger instance
3194
+ * @param model - Language model to use for validation
3195
+ * @param validatorName - Name of the validator for logging (e.g., "TocContentValidator")
3196
+ * @param options - Optional configuration (maxRetries, temperature)
3197
+ * @param fallbackModel - Optional fallback model for retry on failure
3198
+ * @param aggregator - Optional token usage aggregator for tracking LLM calls
3199
+ */
3200
+ constructor(logger, model, validatorName, options, fallbackModel, aggregator) {
3201
+ super(logger, model, validatorName, options, fallbackModel, aggregator);
3202
+ this.validatorName = validatorName;
3203
+ }
3204
+ /**
3205
+ * Call LLM with LLMCaller
3206
+ *
3207
+ * This method provides backwards compatibility with existing validators.
3208
+ * It wraps the parent callTextLLM method but allows passing a custom aggregator.
3209
+ *
3210
+ * @param schema - Zod schema for response validation
3211
+ * @param systemPrompt - System prompt
3212
+ * @param userPrompt - User prompt
3213
+ * @param phase - Phase name for tracking (e.g., 'validation', 'batch-validation')
3214
+ * @param aggregator - Optional token usage aggregator for tracking this call
3215
+ * @returns Parsed and validated LLM response with usage information
3216
+ */
3217
+ async callLLM(schema, systemPrompt, userPrompt, phase, aggregator) {
3218
+ const result = await LLMCaller.call({
3219
+ schema,
3220
+ systemPrompt,
3221
+ userPrompt,
3222
+ primaryModel: this.model,
3223
+ fallbackModel: this.fallbackModel,
3224
+ maxRetries: this.maxRetries,
3225
+ temperature: this.temperature,
3226
+ abortSignal: this.abortSignal,
3227
+ component: this.validatorName,
3228
+ phase
3229
+ });
3230
+ if (aggregator) {
3231
+ aggregator.track(result.usage);
3232
+ } else {
3233
+ this.trackUsage(result.usage);
3234
+ }
3235
+ return {
3236
+ output: result.output,
3237
+ usage: result.usage
3238
+ };
3239
+ }
3240
+ };
3241
+
3242
+ // src/validators/toc-content-validator.ts
3243
+ import { z as z5 } from "zod";
3244
+ var TocContentValidationSchema = z5.object({
3245
+ isToc: z5.boolean().describe("Whether the content is a table of contents"),
3246
+ confidence: z5.number().min(0).max(1).describe("Confidence score between 0 and 1"),
3247
+ reason: z5.string().describe("Brief explanation for the decision")
3248
+ });
3249
+ var TocContentValidator = class extends BaseValidator {
3250
+ confidenceThreshold;
3251
+ constructor(logger, model, options, fallbackModel, aggregator) {
3252
+ super(
3253
+ logger,
3254
+ model,
3255
+ "TocContentValidator",
3256
+ options,
3257
+ fallbackModel,
3258
+ aggregator
3259
+ );
3260
+ this.confidenceThreshold = options?.confidenceThreshold ?? 0.7;
3261
+ }
3262
+ /**
3263
+ * Validate if the markdown content is a table of contents
3264
+ *
3265
+ * @param markdown - Markdown content to validate
3266
+ * @returns Validation result with isToc, confidence, and reason
3267
+ */
3268
+ async validate(markdown) {
3269
+ this.logger.info(
3270
+ `[TocContentValidator] Validating content (${markdown.length} chars)`
3271
+ );
3272
+ if (!markdown.trim()) {
3273
+ this.logger.info(
3274
+ "[TocContentValidator] Empty markdown, returning invalid"
3275
+ );
3276
+ return {
3277
+ isToc: false,
3278
+ confidence: 1,
3279
+ reason: "Empty content"
3280
+ };
3281
+ }
3282
+ const { output: result } = await this.callLLM(
3283
+ TocContentValidationSchema,
3284
+ this.buildSystemPrompt(),
3285
+ this.buildUserPrompt(markdown),
3286
+ "validation",
3287
+ this.aggregator
3288
+ );
3289
+ this.logger.info(
3290
+ `[TocContentValidator] Result: isToc=${result.isToc}, confidence=${result.confidence}`
3291
+ );
3292
+ return result;
3293
+ }
3294
+ /**
3295
+ * Check if validation result passes threshold
3296
+ *
3297
+ * @param result - Validation result from validate()
3298
+ * @returns true if content is valid TOC with sufficient confidence
3299
+ */
3300
+ isValid(result) {
3301
+ return result.isToc && result.confidence >= this.confidenceThreshold;
3302
+ }
3303
+ /**
3304
+ * Build system prompt for TOC content validation
3305
+ */
3306
+ buildSystemPrompt() {
3307
+ return `You are a document structure analyst. Your task is to determine if the provided content is a Table of Contents (TOC).
3308
+
3309
+ ## What IS a Table of Contents:
3310
+ - A structured list of chapters/sections with corresponding page numbers
3311
+ - Contains hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction", etc.)
3312
+ - Has page number references for each entry (e.g., "..... 10", "... 5", or just a number at the end)
3313
+ - Multiple entries organized by document structure
3314
+ - Main document outline listing major chapters and sections
3315
+
3316
+ ## What is NOT a Table of Contents:
3317
+ - Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, Photo Index, List of Figures, List of Photos)
3318
+ - Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
3319
+ - Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, Drawing Index, List of Drawings)
3320
+ - Appendix indices (\uBD80\uB85D \uBAA9\uCC28, Appendix Index)
3321
+ - Random body text from the document
3322
+ - Single entries or incomplete lists (fewer than 3 items)
3323
+ - Reference lists or bibliographies
3324
+ - Index pages (alphabetical keyword lists)
3325
+
3326
+ ## Response Guidelines:
3327
+ - Set isToc to true ONLY if content is clearly a main document TOC
3328
+ - Set confidence between 0.0 and 1.0 based on your certainty
3329
+ - Provide a brief reason explaining your decision (1-2 sentences)`;
3330
+ }
3331
+ /**
3332
+ * Build user prompt with markdown content
3333
+ */
3334
+ buildUserPrompt(markdown) {
3335
+ return `Determine if the following content is a Table of Contents:
3336
+
3337
+ ${markdown}`;
3338
+ }
3339
+ };
3340
+
3341
+ // src/validators/caption-validator.ts
3342
+ import { z as z6 } from "zod";
3343
+ var CaptionValidationItemSchema = z6.object({
3344
+ index: z6.number().int().describe("Index of the caption in the input array"),
3345
+ isValid: z6.boolean().describe("Whether the parsed caption is correct"),
3346
+ reason: z6.string().nullable().describe("Brief explanation if invalid, null if valid")
3347
+ });
3348
+ var CaptionValidationBatchSchema = z6.object({
3349
+ results: z6.array(CaptionValidationItemSchema)
3350
+ });
3351
+ var CaptionValidator = class extends BaseValidator {
3352
+ constructor(logger, model, options, fallbackModel, aggregator) {
3353
+ super(
3354
+ logger,
3355
+ model,
3356
+ "CaptionValidator",
3357
+ options,
3358
+ fallbackModel,
3359
+ aggregator
3360
+ );
3361
+ }
3362
+ /**
3363
+ * Validate batch of parsed captions against original texts
3364
+ *
3365
+ * @param captions - Array of parsed Caption objects
3366
+ * @param originalTexts - Array of original caption texts (same order as captions)
3367
+ * @param batchSize - Batch size for processing. Set to 0 to skip validation (assume all valid).
3368
+ * @returns Array of validation results (boolean) maintaining original order
3369
+ */
3370
+ async validateBatch(captions, originalTexts, batchSize) {
3371
+ this.logger.info(
3372
+ `[CaptionValidator] Validating ${captions.length} captions with batch size ${batchSize}...`
3373
+ );
3374
+ if (captions.length !== originalTexts.length) {
3375
+ throw new Error(
3376
+ `[CaptionValidator] Captions and originalTexts length mismatch: ${captions.length} vs ${originalTexts.length}`
3377
+ );
3378
+ }
3379
+ if (captions.length === 0) {
3380
+ this.logger.info("[CaptionValidator] No captions to validate");
3381
+ return [];
3382
+ }
3383
+ if (batchSize === 0) {
3384
+ this.logger.info(
3385
+ "[CaptionValidator] Skipping validation (batchSize=0), assuming all captions are valid"
3386
+ );
3387
+ return new Array(captions.length).fill(true);
3388
+ }
3389
+ try {
3390
+ const indexedItems = captions.map((caption, index) => ({
3391
+ index,
3392
+ caption,
3393
+ originalText: originalTexts[index]
3394
+ }));
3395
+ const batchResults = await BatchProcessor.processBatch(
3396
+ indexedItems,
3397
+ batchSize,
3398
+ async (batch) => this.validateBatchInternal(batch, this.model)
3399
+ );
3400
+ batchResults.sort((a, b) => a.index - b.index);
3401
+ const results = batchResults.map((r) => r.isValid);
3402
+ const validCount = results.filter((r) => r).length;
3403
+ this.logger.info(
3404
+ `[CaptionValidator] Completed: ${validCount}/${results.length} captions validated as correct`
3405
+ );
3406
+ if (this.aggregator) {
3407
+ this.aggregator.logSummary(this.logger);
3408
+ }
3409
+ return results;
3410
+ } catch (error) {
3411
+ const message = error instanceof Error ? error.message : String(error);
3412
+ this.logger.error(`[CaptionValidator] Validation failed: ${message}`);
3413
+ throw new CaptionValidationError(
3414
+ `Failed to validate captions: ${message}`,
3415
+ { cause: error }
3416
+ );
3417
+ }
3418
+ }
3419
+ /**
3420
+ * Internal: Validate batch of captions using LLM
3421
+ *
3422
+ * @param items - Batch of caption items with original indices
3423
+ * @param model - Effective model to use
3424
+ * @returns Array of validation results indexed correctly
3425
+ */
3426
+ async validateBatchInternal(items, model) {
3427
+ const result = await LLMCaller.call({
3428
+ schema: CaptionValidationBatchSchema,
3429
+ systemPrompt: this.buildSystemPrompt(),
3430
+ userPrompt: this.buildUserPrompt(items),
3431
+ primaryModel: model,
3432
+ fallbackModel: this.fallbackModel,
3433
+ maxRetries: this.maxRetries,
3434
+ temperature: this.temperature,
3435
+ abortSignal: this.abortSignal,
3436
+ component: "CaptionValidator",
3437
+ phase: "validation"
3438
+ });
3439
+ if (this.aggregator) {
3440
+ this.aggregator.track(result.usage);
3441
+ }
3442
+ return result.output.results.map((item) => ({
3443
+ index: item.index,
3444
+ isValid: item.isValid
3445
+ }));
3446
+ }
3447
+ buildSystemPrompt() {
3448
+ return `You are a caption validation expert for archaeological excavation reports.
3449
+
3450
+ Your task is to validate whether parsed caption prefixes (num field) are correctly extracted from original caption texts.
3451
+
3452
+ ## Caption Pattern Recognition
3453
+
3454
+ A valid caption follows the pattern: <prefix word(s)> <number>
3455
+ - The prefix can be ANY Korean/English word(s) that label images/tables/figures
3456
+ - Common examples: \uB3C4\uD310, \uC0AC\uC9C4, \uADF8\uB9BC, \uC6D0\uC0C9\uC0AC\uC9C4, \uD751\uBC31\uC0AC\uC9C4, Figure, Photo, Plate, etc.
3457
+ - The key is the PATTERN (text followed by number), not a specific word list
3458
+ - Leading punctuation/brackets should be IGNORED when extracting
3459
+
3460
+ Valid caption patterns:
3461
+ - "\uC6D0\uC0C9\uC0AC\uC9C4 1. \uC870\uC0AC\uC9C0\uC5ED" \u2192 num="\uC6D0\uC0C9\uC0AC\uC9C4 1" \u2713
3462
+ - "\uD751\uBC31\uC0AC\uC9C4 2 \uCD9C\uD1A0\uC720\uBB3C" \u2192 num="\uD751\uBC31\uC0AC\uC9C4 2" \u2713
3463
+ - "\uB3C4\uD310 1 \uC720\uC801 \uC804\uACBD" \u2192 num="\uB3C4\uD310 1" \u2713
3464
+ - "(\uC0AC\uC9C4 16> \uB290\uD2F0\uB098\uBB34" \u2192 num="\uC0AC\uC9C4 16" \u2713 (ignore leading punctuation)
3465
+ - "<\uB3C4\uD310 3> \uC720\uBB3C \uC0AC\uC9C4" \u2192 num="\uB3C4\uD310 3" \u2713 (ignore angle brackets)
3466
+
3467
+ Invalid patterns (num MUST be null):
3468
+ - "39 3\uC6D4 28\uC77C(\uBC31\uC81C \uB3C4\uB85C\uC720\uAD6C)" \u2192 null \u2713 (starts with number, no prefix)
3469
+ - "1. \uC720\uC801 \uC804\uACBD" \u2192 null \u2713 (numbered list item, not a caption)
3470
+ - "2024\uB144 \uC870\uC0AC \uD604\uD669" \u2192 null \u2713 (year reference, not a caption)
3471
+
3472
+ ## Extraction Algorithm:
3473
+
3474
+ 1. Extract prefix + number from the caption
3475
+ - The prefix is the text portion before the number
3476
+ - Full extraction: "\uC6D0\uC0C9\uC0AC\uC9C4 1", "\uB3C4\uD310 2-3", "\uADF8\uB9BC 3.6", "Figure 4a"
3477
+
3478
+ 2. **Decimal point handling**: Include period/dot after number if directly following
3479
+ - "\uADF8\uB9BC 3.6. \uD55C\uBC18\uB3C4" \u2192 "\uADF8\uB9BC 3.6" (period as decimal separator included)
3480
+ - "\uB3C4\uD310 2. \uC720\uC801" \u2192 "\uB3C4\uD310 2" (period after space, NOT included)
3481
+
3482
+ 3. **Stop rules** (extraction must stop at first occurrence of):
3483
+ - Punctuation (except decimal point): , : ; ! ? ~ ( ) [ ] { }
3484
+ - Whitespace: space, tab, newline
3485
+ - Underscore: _
3486
+ - Exception: Periods directly after digits are included as decimal separators
3487
+ - Exception: Hyphens within numbers are included (e.g., "2-3")
3488
+
3489
+ ## Validation Rules:
3490
+
3491
+ 1. **Pattern requirement**: The original text MUST follow <prefix> <number> pattern
3492
+ - "\uC6D0\uC0C9\uC0AC\uC9C4 1. \uC870\uC0AC\uC9C0\uC5ED" \u2192 num="\uC6D0\uC0C9\uC0AC\uC9C4 1" \u2713 (valid pattern)
3493
+ - "39 3\uC6D4 28\uC77C(\uBC31\uC81C)" \u2192 num="39" \u2717 (starts with number, should be null)
3494
+ - "1. \uC870\uC0AC \uAC1C\uC694" \u2192 num="1" \u2717 (numbered list, should be null)
3495
+
3496
+ 2. **Correctness**: The parsed "num" must contain the actual prefix+number
3497
+ - "\uB3C4\uD310 1 \uC720\uC801 \uC804\uACBD" \u2192 num="\uB3C4\uD310 1" \u2713
3498
+ - "\uB3C4\uD310 1 \uC720\uC801 \uC804\uACBD" \u2192 num="\uB3C4\uD310" \u2717 (incomplete)
3499
+
3500
+ 3. **Spacing**: The spacing in "num" must match the original text exactly
3501
+ - "\uB3C4\uD310 1" \u2192 num="\uB3C4\uD310 1" \u2713
3502
+ - "\uB3C4\uD3101" \u2192 num="\uB3C4\uD3101" \u2713
3503
+ - "\uB3C4\uD310 1" \u2192 num="\uB3C4\uD3101" \u2717 (spacing mismatch)
3504
+
3505
+ 4. **Completeness**: The number part must be fully extracted
3506
+ - "Figure 2-3" \u2192 num="Figure 2-3" \u2713
3507
+ - "Figure 2-3" \u2192 num="Figure 2" \u2717 (incomplete number)
3508
+
3509
+ 5. **Null handling**: If "num" is null, verify:
3510
+ - Either the original text has no number
3511
+ - OR the text starts with a number (no prefix)
3512
+ - "\uC720\uC801 \uC804\uACBD \uC0AC\uC9C4" \u2192 num=null \u2713 (no number in caption position)
3513
+ - "\uC6D0\uC0C9\uC0AC\uC9C4 1 \uC870\uC0AC" \u2192 num=null \u2717 (should extract "\uC6D0\uC0C9\uC0AC\uC9C4 1")
3514
+
3515
+ ## Response:
3516
+ For each caption, return:
3517
+ - index: original position
3518
+ - isValid: true if parsing is correct, false otherwise
3519
+ - reason: null if valid, brief explanation if invalid`;
3520
+ }
3521
+ buildUserPrompt(items) {
3522
+ const captionList = items.map(
3523
+ (item) => `[${item.index}] Original: "${item.originalText}" | Parsed num: ${item.caption.num !== void 0 ? `"${item.caption.num}"` : "null"}`
3524
+ ).join("\n");
3525
+ return `Validate the following caption parsing results:
3526
+
3527
+ ${captionList}
3528
+
3529
+ Return the results as JSON array with "index", "isValid", and "reason" (null if valid, explanation if invalid).
3530
+
3531
+ Example format:
3532
+ {
3533
+ "results": [
3534
+ { "index": 0, "isValid": true, "reason": null },
3535
+ { "index": 1, "isValid": false, "reason": "Number incomplete: expected '1-2' but got '1'" },
3536
+ { "index": 2, "isValid": true, "reason": null }
3537
+ ]
3538
+ }`;
3539
+ }
3540
+ };
3541
+ var CaptionValidationError = class extends Error {
3542
+ constructor(message, options) {
3543
+ super(message, options);
3544
+ this.name = "CaptionValidationError";
3545
+ }
3546
+ };
3547
+
3548
+ // src/document-processor.ts
3549
/**
 * Orchestrates conversion of a DoclingDocument into a ProcessedDocument.
 *
 * Pipeline: text normalization/filtering, page-range parsing (vision LLM),
 * TOC extraction (rule-based with LLM validation and vision fallback),
 * image/table/footnote conversion with caption parsing + validation, and
 * final chapter assembly. All LLM token usage is folded into a shared
 * LLMTokenUsageAggregator. Cooperative cancellation via an AbortSignal.
 */
var DocumentProcessor = class {
  logger;
  fallbackModel;
  pageRangeParserModel;
  tocExtractorModel;
  validatorModel;
  visionTocExtractorModel;
  captionParserModel;
  textCleanerBatchSize;
  captionParserBatchSize;
  captionValidatorBatchSize;
  maxRetries;
  enableFallbackRetry;
  abortSignal;
  idGenerator = new IdGenerator();
  refResolver;
  pageRangeParser;
  tocFinder;
  tocExtractor;
  tocContentValidator;
  captionValidator;
  visionTocExtractor;
  captionParser;
  chapterConverter;
  textCleaner = TextCleaner;
  usageAggregator = new LLMTokenUsageAggregator();
  /**
   * @param options - Configuration: logger, models (each role falls back to
   *   `fallbackModel` when unset), batch sizes, retry policy, abort signal.
   */
  constructor(options) {
    this.logger = options.logger;
    this.fallbackModel = options.fallbackModel;
    // Each specialized model defaults to the shared fallback model.
    this.pageRangeParserModel = options.pageRangeParserModel ?? options.fallbackModel;
    this.tocExtractorModel = options.tocExtractorModel ?? options.fallbackModel;
    this.validatorModel = options.validatorModel ?? options.fallbackModel;
    this.visionTocExtractorModel = options.visionTocExtractorModel ?? options.fallbackModel;
    this.captionParserModel = options.captionParserModel ?? options.fallbackModel;
    this.textCleanerBatchSize = options.textCleanerBatchSize;
    this.captionParserBatchSize = options.captionParserBatchSize;
    this.captionValidatorBatchSize = options.captionValidatorBatchSize;
    this.maxRetries = options.maxRetries ?? 3;
    this.enableFallbackRetry = options.enableFallbackRetry ?? false;
    this.abortSignal = options.abortSignal;
  }
  /**
   * Check if abort has been requested and throw error if so.
   *
   * @throws {Error} with name 'AbortError' if aborted
   */
  checkAborted() {
    if (this.abortSignal?.aborted) {
      const error = new Error("Document processing was aborted");
      error.name = "AbortError";
      throw error;
    }
  }
  /**
   * Converts DoclingDocument to ProcessedDocument with token usage tracking.
   *
   * Conversion process:
   * 1. Initialize processors and resolvers
   * 2. Normalize and filter texts
   * 3. Parse page ranges (vision LLM)
   * 4. Extract table of contents
   * 5. Convert images, tables, and footnotes
   * 6. Convert chapters and link resources
   * 7. Assemble final ProcessedDocument
   * 8. Collect and report token usage
   *
   * @param doclingDoc - Original document extracted from Docling SDK
   * @param reportId - Report unique identifier
   * @param outputPath - Path containing images and pages subdirectories
   *   (images/image_0.png, pages/page_0.png, etc.)
   * @returns Document processing result with ProcessedDocument and token usage report
   *
   * @throws {TocExtractError} When TOC extraction fails
   * @throws {PageRangeParseError} When page range parsing fails
   * @throws {ConversionError} When error occurs during conversion
   */
  async process(doclingDoc, reportId, outputPath) {
    this.logger.info("[DocumentProcessor] Starting document processing...");
    this.logger.info("[DocumentProcessor] Report ID:", reportId);
    this.usageAggregator.reset();
    this.checkAborted();
    this.initializeProcessors(doclingDoc, outputPath);
    const startTimeFilter = Date.now();
    const filtered = this.normalizeAndFilterTexts(doclingDoc);
    const filteringTime = Date.now() - startTimeFilter;
    this.logger.info(
      `[DocumentProcessor] Text filtering took ${filteringTime}ms`
    );
    this.checkAborted();
    const startTimePageRange = Date.now();
    const pageRangeMap = await this.parsePageRanges(doclingDoc);
    const pageRangeTime = Date.now() - startTimePageRange;
    this.logger.info(
      `[DocumentProcessor] Page range parsing took ${pageRangeTime}ms`
    );
    this.checkAborted();
    const startTimeToc = Date.now();
    const tocEntries = await this.extractTableOfContents(doclingDoc, filtered);
    const tocTime = Date.now() - startTimeToc;
    this.logger.info(`[DocumentProcessor] TOC extraction took ${tocTime}ms`);
    this.checkAborted();
    const startTimeResources = Date.now();
    const { images, tables, footnotes } = await this.convertResources(
      doclingDoc,
      outputPath
    );
    const resourcesTime = Date.now() - startTimeResources;
    this.logger.info(
      `[DocumentProcessor] Resource conversion took ${resourcesTime}ms`
    );
    this.checkAborted();
    const startTimeChapters = Date.now();
    const chapters = await this.convertChapters(
      doclingDoc,
      tocEntries,
      pageRangeMap,
      images,
      tables,
      footnotes
    );
    const chaptersTime = Date.now() - startTimeChapters;
    this.logger.info(
      `[DocumentProcessor] Chapter conversion took ${chaptersTime}ms`
    );
    const startTimeAssemble = Date.now();
    const processedDoc = this.assembleProcessedDocument(
      reportId,
      pageRangeMap,
      chapters,
      images,
      tables,
      footnotes
    );
    const assembleTime = Date.now() - startTimeAssemble;
    this.logger.info(
      `[DocumentProcessor] Document assembly took ${assembleTime}ms`
    );
    this.logger.info("[DocumentProcessor] Document processing completed");
    return {
      document: processedDoc,
      usage: this.usageAggregator.getReport()
    };
  }
  /**
   * Initialize all processors and resolvers.
   *
   * Sets up RefResolver, PageRangeParser, TocFinder, TocExtractor,
   * TocContentValidator, CaptionValidator, VisionTocExtractor,
   * CaptionParser, and ChapterConverter. Components that accept a fallback
   * model only receive it when `enableFallbackRetry` is on.
   */
  initializeProcessors(doclingDoc, outputPath) {
    this.logger.info("[DocumentProcessor] Initializing processors...");
    this.logger.info("[DocumentProcessor] - RefResolver");
    this.refResolver = new RefResolver(this.logger, doclingDoc);
    this.logger.info("[DocumentProcessor] - PageRangeParser");
    this.pageRangeParser = new PageRangeParser(
      this.logger,
      this.pageRangeParserModel,
      outputPath,
      this.maxRetries,
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.usageAggregator,
      this.abortSignal
    );
    this.logger.info("[DocumentProcessor] - TocFinder");
    this.tocFinder = new TocFinder(this.logger, this.refResolver);
    this.logger.info("[DocumentProcessor] - TocExtractor");
    this.tocExtractor = new TocExtractor(
      this.logger,
      this.tocExtractorModel,
      {
        maxRetries: this.maxRetries
      },
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.abortSignal
    );
    this.logger.info("[DocumentProcessor] - TocContentValidator");
    this.tocContentValidator = new TocContentValidator(
      this.logger,
      this.validatorModel,
      { maxRetries: this.maxRetries, abortSignal: this.abortSignal },
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.usageAggregator
    );
    this.logger.info("[DocumentProcessor] - CaptionValidator");
    this.captionValidator = new CaptionValidator(
      this.logger,
      this.validatorModel,
      { maxRetries: this.maxRetries, abortSignal: this.abortSignal },
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.usageAggregator
    );
    this.logger.info("[DocumentProcessor] - VisionTocExtractor");
    this.visionTocExtractor = new VisionTocExtractor(
      this.logger,
      this.visionTocExtractorModel,
      outputPath,
      { maxRetries: this.maxRetries, abortSignal: this.abortSignal },
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.usageAggregator
    );
    this.logger.info("[DocumentProcessor] - CaptionParser");
    this.captionParser = new CaptionParser(
      this.logger,
      this.captionParserModel,
      { maxRetries: this.maxRetries, abortSignal: this.abortSignal },
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.usageAggregator
    );
    this.logger.info("[DocumentProcessor] - ChapterConverter");
    this.chapterConverter = new ChapterConverter(this.logger, this.idGenerator);
    this.logger.info("[DocumentProcessor] All processors initialized");
  }
  /**
   * Normalize and filter texts using TextCleaner.
   *
   * Performs basic text normalization (unicode, whitespace, punctuation)
   * and filters out invalid texts (empty, numbers-only, etc.)
   */
  normalizeAndFilterTexts(doclingDoc) {
    this.logger.info("[DocumentProcessor] Normalizing and filtering texts...");
    const texts = doclingDoc.texts.map((text) => text.text);
    const filtered = this.textCleaner.normalizeAndFilterBatch(
      texts,
      this.textCleanerBatchSize
    );
    this.logger.info(
      `[DocumentProcessor] Filtered ${filtered.length} texts from ${texts.length} original texts`
    );
    return filtered;
  }
  /**
   * Parse page ranges using Vision LLM.
   *
   * Extracts actual page numbers from page images and creates mapping.
   * Token usage is automatically tracked by PageRangeParser into the shared aggregator.
   */
  async parsePageRanges(doclingDoc) {
    this.logger.info("[DocumentProcessor] Starting page range parsing...");
    const result = await this.pageRangeParser.parse(doclingDoc);
    const pageRangeMap = result.pageRangeMap;
    this.logger.info(
      `[DocumentProcessor] Page range map entries: ${Object.keys(pageRangeMap).length}`
    );
    return pageRangeMap;
  }
  /**
   * Convert images, tables, and footnotes.
   *
   * Runs conversions:
   * - Images conversion (with caption extraction)
   * - Tables conversion (with caption extraction, excluding TOC tables)
   * - Footnotes conversion (synchronous, from text items with label='footnote')
   */
  async convertResources(doclingDoc, outputPath) {
    this.logger.info(
      "[DocumentProcessor] Converting images, tables, and footnotes..."
    );
    // Images and tables each involve LLM caption work, so run them in parallel.
    const [images, tables] = await Promise.all([
      this.convertImages(doclingDoc, outputPath),
      this.convertTables(doclingDoc)
    ]);
    const footnotes = this.convertFootnotes(doclingDoc);
    this.logger.info(
      `[DocumentProcessor] Converted ${images.length} images, ${tables.length} tables, and ${footnotes.length} footnotes`
    );
    return { images, tables, footnotes };
  }
  /**
   * Convert footnotes.
   *
   * Extracts footnotes from DoclingDocument text items with label='footnote'.
   */
  convertFootnotes(doclingDoc) {
    const footnoteItems = doclingDoc.texts.filter(
      (item) => item.label === "footnote"
    );
    this.logger.info(
      `[DocumentProcessor] Converting ${footnoteItems.length} footnotes...`
    );
    const footnotes = [];
    for (const item of footnoteItems) {
      if (!this.textCleaner.isValidText(item.text)) {
        continue;
      }
      // NOTE(review): footnotes default to page 1 while images/tables default
      // to page 0 — presumably intentional, but worth confirming.
      const pdfPageNo = item.prov?.[0]?.page_no ?? 1;
      const footnoteId = this.idGenerator.generateFootnoteId();
      footnotes.push({
        id: footnoteId,
        text: this.textCleaner.normalize(item.text),
        pdfPageNo
      });
    }
    this.logger.info(
      `[DocumentProcessor] Converted ${footnotes.length} valid footnotes`
    );
    return footnotes;
  }
  /**
   * Assemble the final ProcessedDocument.
   *
   * Creates the ProcessedDocument structure with all converted components.
   */
  assembleProcessedDocument(reportId, pageRangeMap, chapters, images, tables, footnotes) {
    this.logger.info("[DocumentProcessor] Assembling ProcessedDocument...");
    const processedDoc = {
      reportId,
      pageRangeMap,
      chapters,
      images,
      tables,
      footnotes
    };
    this.logger.info(
      `[DocumentProcessor] Assembled document with ${chapters.length} chapters, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
    );
    return processedDoc;
  }
  /**
   * Extract table of contents (TOC).
   *
   * Uses rule-based extraction with LLM validation and vision fallback:
   * 1. TocFinder - find TOC area in document (rule-based)
   * 2. MarkdownConverter - convert TOC items to Markdown
   * 3. TocContentValidator - validate if content is actually a TOC (LLM)
   * 4. If invalid: VisionTocExtractor - extract from page images (vision LLM fallback)
   * 5. TocExtractor - LLM-based structured extraction
   */
  async extractTableOfContents(doclingDoc, _filteredTexts) {
    this.logger.info("[DocumentProcessor] Extracting TOC...");
    let markdown = null;
    try {
      const tocArea = this.tocFinder.find(doclingDoc);
      this.logger.info(
        `[DocumentProcessor] Found TOC area: pages ${tocArea.startPage}-${tocArea.endPage}`
      );
      markdown = MarkdownConverter.convert(tocArea.itemRefs, this.refResolver);
      this.logger.info(
        `[DocumentProcessor] Converted TOC to Markdown (${markdown.length} chars)`
      );
      const validation = await this.tocContentValidator.validate(markdown);
      if (!this.tocContentValidator.isValid(validation)) {
        this.logger.warn(
          `[DocumentProcessor] TOC validation failed: ${validation.reason}`
        );
        // Discard invalid rule-based result so the vision fallback runs.
        markdown = null;
      } else {
        this.logger.info(
          `[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
        );
      }
    } catch (error) {
      // Only a missing TOC triggers the fallback; other errors propagate.
      if (error instanceof TocNotFoundError) {
        this.logger.info(
          "[DocumentProcessor] Rule-based TOC not found, will try vision fallback"
        );
      } else {
        throw error;
      }
    }
    if (!markdown) {
      this.logger.info("[DocumentProcessor] Using vision fallback for TOC");
      const totalPages = Object.keys(doclingDoc.pages).length;
      markdown = await this.visionTocExtractor.extract(totalPages);
      if (!markdown) {
        this.logger.warn(
          "[DocumentProcessor] TOC not found in any method, returning empty"
        );
        return [];
      }
      this.logger.info(
        `[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
      );
    }
    const tocResult = await this.tocExtractor.extract(markdown);
    this.usageAggregator.track(tocResult.usage);
    this.logger.info(
      `[DocumentProcessor] Extracted ${tocResult.entries.length} top-level TOC entries`
    );
    return tocResult.entries;
  }
  /**
   * Process resource captions (for images and tables).
   *
   * Common caption processing pipeline:
   * 1. Parse captions in batch
   * 2. Validate parsed captions
   * 3. Reparse failed captions with fallback model
   *
   * @param captionTexts - Array of caption texts to process (sparse: entries
   *   may be undefined when a resource has no caption)
   * @param resourceType - Type of resource for logging (e.g., 'image', 'table')
   * @returns Map from resource index to parsed caption
   */
  async processResourceCaptions(captionTexts, resourceType) {
    const captionsByIndex = /* @__PURE__ */ new Map();
    // Keep only resources that actually have a caption, remembering both the
    // original resource index and the position in the filtered list.
    const validCaptionData = [];
    for (let i = 0; i < captionTexts.length; i++) {
      const text = captionTexts[i];
      if (text !== void 0) {
        validCaptionData.push({
          resourceIndex: i,
          filteredIndex: validCaptionData.length,
          text
        });
      }
    }
    const validCaptionTexts = validCaptionData.map((item) => item.text);
    const parsedCaptions = validCaptionTexts.length > 0 ? await this.captionParser.parseBatch(
      validCaptionTexts,
      this.captionParserBatchSize
    ) : [];
    let finalValidCaptionData = validCaptionData;
    let finalParsedCaptions = parsedCaptions;
    if (parsedCaptions.length !== validCaptionData.length) {
      // The LLM may drop entries; recover by matching each input text against
      // the parsed captions' fullText. Unmatched inputs are skipped.
      // NOTE(review): duplicate caption texts collapse in this Map — verify
      // duplicates cannot occur or are acceptable to merge.
      this.logger.warn(
        `[DocumentProcessor] Caption parsing length mismatch for ${resourceType}: expected ${validCaptionData.length}, got ${parsedCaptions.length}. Attempting recovery by matching fullText...`
      );
      const parsedMap = /* @__PURE__ */ new Map();
      for (const parsed of parsedCaptions) {
        parsedMap.set(parsed.fullText, parsed);
      }
      const recoveredData = [];
      for (const item of validCaptionData) {
        if (parsedMap.has(item.text)) {
          recoveredData.push(item);
        } else {
          this.logger.warn(
            `[DocumentProcessor] Skipping ${resourceType} caption at index ${item.resourceIndex}: "${item.text}" (not found in parsed results)`
          );
        }
      }
      const recoveredCaptions = [];
      for (const item of recoveredData) {
        const caption = parsedMap.get(item.text);
        if (caption) {
          recoveredCaptions.push(caption);
        }
      }
      if (recoveredCaptions.length !== recoveredData.length) {
        throw new Error(
          `[DocumentProcessor] Failed to recover from length mismatch: recovered ${recoveredCaptions.length} captions for ${recoveredData.length} valid items`
        );
      }
      finalValidCaptionData = recoveredData;
      finalParsedCaptions = recoveredCaptions;
      this.logger.info(
        `[DocumentProcessor] Successfully recovered ${finalParsedCaptions.length} ${resourceType} captions after length mismatch`
      );
    }
    for (let i = 0; i < finalParsedCaptions.length; i++) {
      const resourceIndex = finalValidCaptionData[i].resourceIndex;
      captionsByIndex.set(resourceIndex, finalParsedCaptions[i]);
    }
    if (finalParsedCaptions.length > 0) {
      const finalValidCaptionTexts = finalValidCaptionData.map(
        (item) => item.text
      );
      const validationResults = await this.captionValidator.validateBatch(
        finalParsedCaptions,
        finalValidCaptionTexts,
        this.captionValidatorBatchSize
      );
      const failedIndices = validationResults.map((isValid, index) => isValid ? -1 : index).filter((index) => index !== -1);
      if (failedIndices.length > 0) {
        for (const filteredIndex of failedIndices) {
          const captionData = finalValidCaptionData[filteredIndex];
          const originalText = captionData.text;
          const parsedNum = finalParsedCaptions[filteredIndex].num;
          const resourceIndex = captionData.resourceIndex;
          this.logger.warn(
            `[DocumentProcessor] Invalid ${resourceType} caption [${resourceIndex}]: "${originalText}" | parsed num="${parsedNum}"`
          );
        }
        if (this.enableFallbackRetry) {
          this.logger.info(
            `[DocumentProcessor] Reparsing ${failedIndices.length} failed ${resourceType} captions with fallback model...`
          );
          const failedCaptionTexts = failedIndices.map(
            (filteredIndex) => finalValidCaptionData[filteredIndex].text
          );
          const fallbackCaptionParser = new CaptionParser(
            this.logger,
            this.fallbackModel,
            {
              maxRetries: this.maxRetries,
              componentName: "CaptionParser-fallback",
              abortSignal: this.abortSignal
            },
            void 0,
            // no fallback for the fallback
            this.usageAggregator
          );
          const reparsedCaptions = await fallbackCaptionParser.parseBatch(
            failedCaptionTexts,
            0
            // sequential processing
          );
          for (let i = 0; i < failedIndices.length; i++) {
            const filteredIndex = failedIndices[i];
            const resourceIndex = finalValidCaptionData[filteredIndex].resourceIndex;
            // Fix: guard against a short fallback result — previously a
            // missing entry overwrote the original parse with undefined.
            if (reparsedCaptions[i] !== void 0) {
              captionsByIndex.set(resourceIndex, reparsedCaptions[i]);
            }
          }
          this.logger.info(
            `[DocumentProcessor] Reparsed ${reparsedCaptions.length} ${resourceType} captions`
          );
        } else {
          this.logger.warn(
            `[DocumentProcessor] ${failedIndices.length} ${resourceType} captions failed validation (kept as-is, fallback retry disabled)`
          );
        }
      }
    }
    return captionsByIndex;
  }
  /**
   * Extract caption text from resource.
   *
   * Handles both string references and $ref resolution.
   */
  extractCaptionText(captions) {
    if (!captions?.[0]) {
      return void 0;
    }
    const captionRef = captions[0];
    if (typeof captionRef === "string") {
      return captionRef;
    }
    if (this.refResolver && "$ref" in captionRef) {
      const resolved = this.refResolver.resolveText(captionRef.$ref);
      return resolved?.text;
    }
    return void 0;
  }
  /**
   * Convert images.
   *
   * Converts pictures from DoclingDocument to ProcessedImage.
   */
  async convertImages(doclingDoc, outputPath) {
    this.logger.info(
      `[DocumentProcessor] Converting ${doclingDoc.pictures.length} images...`
    );
    const images = [];
    const captionTexts = [];
    for (const picture of doclingDoc.pictures) {
      const pdfPageNo = picture.prov?.[0]?.page_no ?? 0;
      const imageId = this.idGenerator?.generateImageId() ?? `img-${images.length + 1}`;
      const captionText = this.extractCaptionText(picture.captions);
      captionTexts.push(captionText);
      images.push({
        id: imageId,
        // images.length is the current picture's 0-based index (push happens after).
        path: `${outputPath}/images/image_${images.length}.png`,
        pdfPageNo
        // caption will be assigned later
      });
    }
    const captionsByIndex = await this.processResourceCaptions(
      captionTexts,
      "image"
    );
    for (let i = 0; i < images.length; i++) {
      if (captionsByIndex.has(i)) {
        images[i].caption = captionsByIndex.get(i);
      }
    }
    return images;
  }
  /**
   * Convert tables.
   *
   * Converts tables from DoclingDocument to ProcessedTable.
   */
  async convertTables(doclingDoc) {
    this.logger.info(
      `[DocumentProcessor] Converting ${doclingDoc.tables.length} tables...`
    );
    const tables = [];
    const captionTexts = [];
    for (const table of doclingDoc.tables) {
      const pdfPageNo = table.prov?.[0]?.page_no ?? 0;
      const tableId = this.idGenerator?.generateTableId() ?? `tbl-${tables.length + 1}`;
      const grid = table.data.grid.map(
        (row) => row.map((cell) => ({
          text: cell.text,
          rowSpan: cell.row_span ?? 1,
          colSpan: cell.col_span ?? 1,
          isHeader: cell.column_header || cell.row_header || false
        }))
      );
      const captionText = this.extractCaptionText(table.captions);
      captionTexts.push(captionText);
      tables.push({
        id: tableId,
        pdfPageNo,
        numRows: grid.length,
        numCols: grid[0]?.length ?? 0,
        grid
        // caption will be assigned later
      });
    }
    const captionsByIndex = await this.processResourceCaptions(
      captionTexts,
      "table"
    );
    for (let i = 0; i < tables.length; i++) {
      if (captionsByIndex.has(i)) {
        tables[i].caption = captionsByIndex.get(i);
      }
    }
    return tables;
  }
  /**
   * Convert chapters and link resources.
   *
   * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
   * Falls back to single "Document" chapter when TOC is empty.
   */
  async convertChapters(doclingDoc, tocEntries, pageRangeMap, images, tables, footnotes) {
    this.logger.info("[DocumentProcessor] Converting chapters...");
    if (tocEntries.length === 0) {
      this.logger.info(
        "[DocumentProcessor] No TOC entries, creating fallback chapter"
      );
      return this.createFallbackChapter(
        doclingDoc,
        pageRangeMap,
        images,
        tables,
        footnotes
      );
    }
    const chapters = this.chapterConverter.convert(
      tocEntries,
      doclingDoc.texts,
      pageRangeMap,
      images,
      tables,
      footnotes
    );
    this.logger.info(
      `[DocumentProcessor] Converted ${chapters.length} top-level chapters`
    );
    return chapters;
  }
  /**
   * Create a fallback chapter when TOC is not available.
   *
   * Creates a single "Document" chapter containing all text blocks,
   * images, tables, and footnotes from the document.
   */
  createFallbackChapter(doclingDoc, pageRangeMap, images, tables, footnotes) {
    const textBlocks = doclingDoc.texts.filter(
      (item) => item.label !== "footnote" && this.textCleaner.isValidText(item.text)
    ).map((item) => ({
      text: this.textCleaner.normalize(item.text),
      pdfPageNo: item.prov?.[0]?.page_no ?? 1
    }));
    if (textBlocks.length === 0 && images.length === 0 && tables.length === 0 && footnotes.length === 0) {
      this.logger.info(
        "[DocumentProcessor] No content found for fallback chapter"
      );
      return [];
    }
    // Fix: the previous `Math.min(...pageNumbers, 1)` included the literal 1
    // in the min, so the result could never exceed 1 regardless of the actual
    // first page key. Use 1 only as the default for an empty key set.
    const pageNumbers = Object.keys(pageRangeMap).map(Number).filter((n) => !Number.isNaN(n));
    const firstPdfPage = pageNumbers.length > 0 ? Math.min(...pageNumbers) : 1;
    const firstPageRange = pageRangeMap[firstPdfPage];
    const pageNo = firstPageRange?.startPageNo ?? 1;
    const fallbackChapter = {
      id: this.idGenerator.generateChapterId(),
      originTitle: "Document",
      title: "Document",
      pageNo,
      level: 1,
      textBlocks,
      imageIds: images.map((img) => img.id),
      tableIds: tables.map((tbl) => tbl.id),
      footnoteIds: footnotes.map((ftn) => ftn.id),
      children: []
    };
    this.logger.info(
      `[DocumentProcessor] Created fallback chapter with ${textBlocks.length} text blocks, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
    );
    return [fallbackChapter];
  }
};
4233
+ export {
4234
+ BaseLLMComponent,
4235
+ BaseValidator,
4236
+ CONTINUATION_MARKERS,
4237
+ CaptionParseError,
4238
+ CaptionParser,
4239
+ CaptionValidationError,
4240
+ CaptionValidator,
4241
+ ChapterConverter,
4242
+ DocumentProcessor,
4243
+ PAGE_NUMBER_PATTERN,
4244
+ PagePattern,
4245
+ PageRangeParseError,
4246
+ PageRangeParser,
4247
+ TOC_KEYWORDS,
4248
+ TextLLMComponent,
4249
+ TocContentValidationSchema,
4250
+ TocContentValidator,
4251
+ TocEntrySchema,
4252
+ TocExtractError,
4253
+ TocExtractor,
4254
+ TocFinder,
4255
+ TocNotFoundError,
4256
+ TocParseError,
4257
+ TocResponseSchema,
4258
+ VisionLLMComponent,
4259
+ VisionTocExtractionSchema,
4260
+ VisionTocExtractor
4261
+ };
4262
+ //# sourceMappingURL=index.js.map