@heripo/document-processor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,4325 @@
1
+ "use strict";
2
+ var __create = Object.create;
3
+ var __defProp = Object.defineProperty;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
7
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
8
+ var __export = (target, all) => {
9
+ for (var name in all)
10
+ __defProp(target, name, { get: all[name], enumerable: true });
11
+ };
12
+ var __copyProps = (to, from, except, desc) => {
13
+ if (from && typeof from === "object" || typeof from === "function") {
14
+ for (let key of __getOwnPropNames(from))
15
+ if (!__hasOwnProp.call(to, key) && key !== except)
16
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
17
+ }
18
+ return to;
19
+ };
20
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
28
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
+
30
+ // src/index.ts
31
+ var index_exports = {};
32
+ __export(index_exports, {
33
+ BaseLLMComponent: () => BaseLLMComponent,
34
+ BaseValidator: () => BaseValidator,
35
+ CONTINUATION_MARKERS: () => CONTINUATION_MARKERS,
36
+ CaptionParseError: () => CaptionParseError,
37
+ CaptionParser: () => CaptionParser,
38
+ CaptionValidationError: () => CaptionValidationError,
39
+ CaptionValidator: () => CaptionValidator,
40
+ ChapterConverter: () => ChapterConverter,
41
+ DocumentProcessor: () => DocumentProcessor,
42
+ PAGE_NUMBER_PATTERN: () => PAGE_NUMBER_PATTERN,
43
+ PagePattern: () => PagePattern,
44
+ PageRangeParseError: () => PageRangeParseError,
45
+ PageRangeParser: () => PageRangeParser,
46
+ TOC_KEYWORDS: () => TOC_KEYWORDS,
47
+ TextLLMComponent: () => TextLLMComponent,
48
+ TocContentValidationSchema: () => TocContentValidationSchema,
49
+ TocContentValidator: () => TocContentValidator,
50
+ TocEntrySchema: () => TocEntrySchema,
51
+ TocExtractError: () => TocExtractError,
52
+ TocExtractor: () => TocExtractor,
53
+ TocFinder: () => TocFinder,
54
+ TocNotFoundError: () => TocNotFoundError,
55
+ TocParseError: () => TocParseError,
56
+ TocResponseSchema: () => TocResponseSchema,
57
+ VisionLLMComponent: () => VisionLLMComponent,
58
+ VisionTocExtractionSchema: () => VisionTocExtractionSchema,
59
+ VisionTocExtractor: () => VisionTocExtractor
60
+ });
61
+ module.exports = __toCommonJS(index_exports);
62
+
63
+ // ../shared/dist/index.mjs
64
+ var import_ai = require("ai");
65
+ var BatchProcessor = class {
66
+ /**
67
+ * Splits an array into batches of specified size.
68
+ *
69
+ * @param items - Array to split
70
+ * @param batchSize - Size of each batch
71
+ * @returns Array of batches
72
+ *
73
+ * @example
74
+ * ```typescript
75
+ * const items = [1, 2, 3, 4, 5];
76
+ * const batches = BatchProcessor.createBatches(items, 2);
77
+ * // [[1, 2], [3, 4], [5]]
78
+ * ```
79
+ */
80
+ static createBatches(items, batchSize) {
81
+ const batches = [];
82
+ for (let i = 0; i < items.length; i += batchSize) {
83
+ batches.push(items.slice(i, i + batchSize));
84
+ }
85
+ return batches;
86
+ }
87
+ /**
88
+ * Splits an array into batches and executes async function in parallel.
89
+ *
90
+ * @param items - Array to process
91
+ * @param batchSize - Size of each batch
92
+ * @param processFn - Async function to process each batch
93
+ * @returns Flattened array of processed results
94
+ *
95
+ * @example
96
+ * ```typescript
97
+ * const texts = ['a', 'b', 'c', 'd', 'e'];
98
+ * const results = await BatchProcessor.processBatch(
99
+ * texts,
100
+ * 2,
101
+ * async (batch) => {
102
+ * return batch.map(t => t.toUpperCase());
103
+ * }
104
+ * );
105
+ * // ['A', 'B', 'C', 'D', 'E']
106
+ * ```
107
+ */
108
+ static async processBatch(items, batchSize, processFn) {
109
+ const batches = this.createBatches(items, batchSize);
110
+ const results = await Promise.all(batches.map((batch) => processFn(batch)));
111
+ return results.flat();
112
+ }
113
+ /**
114
+ * Splits an array into batches and executes sync function in parallel.
115
+ *
116
+ * @param items - Array to process
117
+ * @param batchSize - Size of each batch
118
+ * @param processFn - Sync function to process each batch
119
+ * @returns Flattened array of processed results
120
+ *
121
+ * @example
122
+ * ```typescript
123
+ * const numbers = [1, 2, 3, 4, 5];
124
+ * const results = BatchProcessor.processBatchSync(
125
+ * numbers,
126
+ * 2,
127
+ * (batch) => batch.map(n => n * 2)
128
+ * );
129
+ * // [2, 4, 6, 8, 10]
130
+ * ```
131
+ */
132
+ static processBatchSync(items, batchSize, processFn) {
133
+ const batches = this.createBatches(items, batchSize);
134
+ const results = batches.map((batch) => processFn(batch));
135
+ return results.flat();
136
+ }
137
+ };
138
+ var LLMCaller = class {
139
+ /**
140
+ * Extract model name from LanguageModel object
141
+ *
142
+ * Attempts to get model ID from various possible fields in the LanguageModel object.
143
+ */
144
+ static extractModelName(model) {
145
+ const modelObj = model;
146
+ if (typeof modelObj.modelId === "string") return modelObj.modelId;
147
+ if (typeof modelObj.id === "string") return modelObj.id;
148
+ if (typeof modelObj.model === "string") return modelObj.model;
149
+ if (typeof modelObj.name === "string") return modelObj.name;
150
+ return String(model);
151
+ }
152
+ /**
153
+ * Build usage information from response
154
+ */
155
+ static buildUsage(config, modelName, response, usedFallback) {
156
+ return {
157
+ component: config.component,
158
+ phase: config.phase,
159
+ model: usedFallback ? "fallback" : "primary",
160
+ modelName,
161
+ inputTokens: response.usage?.inputTokens ?? 0,
162
+ outputTokens: response.usage?.outputTokens ?? 0,
163
+ totalTokens: response.usage?.totalTokens ?? 0
164
+ };
165
+ }
166
+ /**
167
+ * Execute LLM call with fallback support
168
+ *
169
+ * Common execution logic for both text and vision calls.
170
+ */
171
+ static async executeWithFallback(config, generateFn) {
172
+ const primaryModelName = this.extractModelName(config.primaryModel);
173
+ try {
174
+ const response = await generateFn(config.primaryModel);
175
+ return {
176
+ output: response.output,
177
+ usage: this.buildUsage(config, primaryModelName, response, false),
178
+ usedFallback: false
179
+ };
180
+ } catch (primaryError) {
181
+ if (config.abortSignal?.aborted) {
182
+ throw primaryError;
183
+ }
184
+ if (!config.fallbackModel) {
185
+ throw primaryError;
186
+ }
187
+ const fallbackModelName = this.extractModelName(config.fallbackModel);
188
+ const response = await generateFn(config.fallbackModel);
189
+ return {
190
+ output: response.output,
191
+ usage: this.buildUsage(config, fallbackModelName, response, true),
192
+ usedFallback: true
193
+ };
194
+ }
195
+ }
196
+ /**
197
+ * Call LLM with retry and fallback support
198
+ *
199
+ * Retry Strategy:
200
+ * 1. Try primary model up to maxRetries times
201
+ * 2. If all fail and fallbackModel provided, try fallback up to maxRetries times
202
+ * 3. Throw error if all attempts exhausted
203
+ *
204
+ * @template TOutput - Output type from schema validation
205
+ * @param config - LLM call configuration
206
+ * @returns Result with parsed object and usage information
207
+ * @throws Error if all retry attempts fail
208
+ */
209
+ static async call(config) {
210
+ return this.executeWithFallback(
211
+ config,
212
+ (model) => (0, import_ai.generateText)({
213
+ model,
214
+ output: import_ai.Output.object({
215
+ schema: config.schema
216
+ }),
217
+ system: config.systemPrompt,
218
+ prompt: config.userPrompt,
219
+ temperature: config.temperature,
220
+ maxRetries: config.maxRetries,
221
+ abortSignal: config.abortSignal
222
+ })
223
+ );
224
+ }
225
+ /**
226
+ * Call LLM for vision tasks with message format support
227
+ *
228
+ * Same retry and fallback logic as call(), but using message format instead of system/user prompts.
229
+ *
230
+ * @template TOutput - Output type from schema validation
231
+ * @param config - LLM vision call configuration
232
+ * @returns Result with parsed object and usage information
233
+ * @throws Error if all retry attempts fail
234
+ */
235
+ static async callVision(config) {
236
+ return this.executeWithFallback(
237
+ config,
238
+ (model) => (0, import_ai.generateText)({
239
+ model,
240
+ output: import_ai.Output.object({
241
+ schema: config.schema
242
+ }),
243
+ messages: config.messages,
244
+ temperature: config.temperature,
245
+ maxRetries: config.maxRetries,
246
+ abortSignal: config.abortSignal
247
+ })
248
+ );
249
+ }
250
+ };
251
+ function formatTokens(usage) {
252
+ return `${usage.inputTokens} input, ${usage.outputTokens} output, ${usage.totalTokens} total`;
253
+ }
254
+ var LLMTokenUsageAggregator = class {
255
+ usage = {};
256
+ /**
257
+ * Track token usage from an LLM call
258
+ *
259
+ * @param usage - Extended token usage with component/phase/model information
260
+ */
261
+ track(usage) {
262
+ if (!this.usage[usage.component]) {
263
+ this.usage[usage.component] = {
264
+ component: usage.component,
265
+ phases: {},
266
+ total: {
267
+ inputTokens: 0,
268
+ outputTokens: 0,
269
+ totalTokens: 0
270
+ }
271
+ };
272
+ }
273
+ const component = this.usage[usage.component];
274
+ if (!component.phases[usage.phase]) {
275
+ component.phases[usage.phase] = {
276
+ total: {
277
+ inputTokens: 0,
278
+ outputTokens: 0,
279
+ totalTokens: 0
280
+ }
281
+ };
282
+ }
283
+ const phase = component.phases[usage.phase];
284
+ if (usage.model === "primary") {
285
+ if (!phase.primary) {
286
+ phase.primary = {
287
+ modelName: usage.modelName,
288
+ inputTokens: 0,
289
+ outputTokens: 0,
290
+ totalTokens: 0
291
+ };
292
+ }
293
+ phase.primary.inputTokens += usage.inputTokens;
294
+ phase.primary.outputTokens += usage.outputTokens;
295
+ phase.primary.totalTokens += usage.totalTokens;
296
+ } else if (usage.model === "fallback") {
297
+ if (!phase.fallback) {
298
+ phase.fallback = {
299
+ modelName: usage.modelName,
300
+ inputTokens: 0,
301
+ outputTokens: 0,
302
+ totalTokens: 0
303
+ };
304
+ }
305
+ phase.fallback.inputTokens += usage.inputTokens;
306
+ phase.fallback.outputTokens += usage.outputTokens;
307
+ phase.fallback.totalTokens += usage.totalTokens;
308
+ }
309
+ phase.total.inputTokens += usage.inputTokens;
310
+ phase.total.outputTokens += usage.outputTokens;
311
+ phase.total.totalTokens += usage.totalTokens;
312
+ component.total.inputTokens += usage.inputTokens;
313
+ component.total.outputTokens += usage.outputTokens;
314
+ component.total.totalTokens += usage.totalTokens;
315
+ }
316
+ /**
317
+ * Get aggregated usage grouped by component
318
+ *
319
+ * @returns Array of component aggregates with phase breakdown
320
+ */
321
+ getByComponent() {
322
+ return Object.values(this.usage);
323
+ }
324
+ /**
325
+ * Get token usage report in structured JSON format
326
+ *
327
+ * Converts internal usage data to external TokenUsageReport format suitable
328
+ * for serialization and reporting. The report includes component breakdown,
329
+ * phase-level details, and both primary and fallback model usage.
330
+ *
331
+ * @returns Structured token usage report with components and total
332
+ */
333
+ getReport() {
334
+ const components = [];
335
+ for (const component of Object.values(this.usage)) {
336
+ const phases = [];
337
+ for (const [phaseName, phaseData] of Object.entries(component.phases)) {
338
+ const phaseReport = {
339
+ phase: phaseName,
340
+ total: {
341
+ inputTokens: phaseData.total.inputTokens,
342
+ outputTokens: phaseData.total.outputTokens,
343
+ totalTokens: phaseData.total.totalTokens
344
+ }
345
+ };
346
+ if (phaseData.primary) {
347
+ phaseReport.primary = {
348
+ modelName: phaseData.primary.modelName,
349
+ inputTokens: phaseData.primary.inputTokens,
350
+ outputTokens: phaseData.primary.outputTokens,
351
+ totalTokens: phaseData.primary.totalTokens
352
+ };
353
+ }
354
+ if (phaseData.fallback) {
355
+ phaseReport.fallback = {
356
+ modelName: phaseData.fallback.modelName,
357
+ inputTokens: phaseData.fallback.inputTokens,
358
+ outputTokens: phaseData.fallback.outputTokens,
359
+ totalTokens: phaseData.fallback.totalTokens
360
+ };
361
+ }
362
+ phases.push(phaseReport);
363
+ }
364
+ components.push({
365
+ component: component.component,
366
+ phases,
367
+ total: {
368
+ inputTokens: component.total.inputTokens,
369
+ outputTokens: component.total.outputTokens,
370
+ totalTokens: component.total.totalTokens
371
+ }
372
+ });
373
+ }
374
+ const totalUsage = this.getTotalUsage();
375
+ return {
376
+ components,
377
+ total: {
378
+ inputTokens: totalUsage.inputTokens,
379
+ outputTokens: totalUsage.outputTokens,
380
+ totalTokens: totalUsage.totalTokens
381
+ }
382
+ };
383
+ }
384
+ /**
385
+ * Get total usage across all components and phases
386
+ *
387
+ * @returns Aggregated token usage totals
388
+ */
389
+ getTotalUsage() {
390
+ let totalInput = 0;
391
+ let totalOutput = 0;
392
+ let totalTokens = 0;
393
+ for (const component of Object.values(this.usage)) {
394
+ totalInput += component.total.inputTokens;
395
+ totalOutput += component.total.outputTokens;
396
+ totalTokens += component.total.totalTokens;
397
+ }
398
+ return {
399
+ inputTokens: totalInput,
400
+ outputTokens: totalOutput,
401
+ totalTokens
402
+ };
403
+ }
404
+ /**
405
+ * Log comprehensive token usage summary
406
+ *
407
+ * Outputs usage grouped by component, with phase and model breakdown.
408
+ * Shows primary and fallback token usage separately for each phase.
409
+ * Call this once at the end of document processing.
410
+ *
411
+ * @param logger - Logger instance for output
412
+ */
413
+ logSummary(logger) {
414
+ const components = this.getByComponent();
415
+ if (components.length === 0) {
416
+ logger.info("[DocumentProcessor] No token usage to report");
417
+ return;
418
+ }
419
+ logger.info("[DocumentProcessor] Token usage summary:");
420
+ logger.info("");
421
+ let grandInputTokens = 0;
422
+ let grandOutputTokens = 0;
423
+ let grandTotalTokens = 0;
424
+ let grandPrimaryInputTokens = 0;
425
+ let grandPrimaryOutputTokens = 0;
426
+ let grandPrimaryTotalTokens = 0;
427
+ let grandFallbackInputTokens = 0;
428
+ let grandFallbackOutputTokens = 0;
429
+ let grandFallbackTotalTokens = 0;
430
+ for (const component of components) {
431
+ logger.info(`${component.component}:`);
432
+ for (const [phase, phaseData] of Object.entries(component.phases)) {
433
+ logger.info(` - ${phase}:`);
434
+ if (phaseData.primary) {
435
+ logger.info(
436
+ ` primary (${phaseData.primary.modelName}): ${formatTokens(phaseData.primary)}`
437
+ );
438
+ grandPrimaryInputTokens += phaseData.primary.inputTokens;
439
+ grandPrimaryOutputTokens += phaseData.primary.outputTokens;
440
+ grandPrimaryTotalTokens += phaseData.primary.totalTokens;
441
+ }
442
+ if (phaseData.fallback) {
443
+ logger.info(
444
+ ` fallback (${phaseData.fallback.modelName}): ${formatTokens(phaseData.fallback)}`
445
+ );
446
+ grandFallbackInputTokens += phaseData.fallback.inputTokens;
447
+ grandFallbackOutputTokens += phaseData.fallback.outputTokens;
448
+ grandFallbackTotalTokens += phaseData.fallback.totalTokens;
449
+ }
450
+ logger.info(` subtotal: ${formatTokens(phaseData.total)}`);
451
+ }
452
+ logger.info(
453
+ ` ${component.component} total: ${formatTokens(component.total)}`
454
+ );
455
+ logger.info("");
456
+ grandInputTokens += component.total.inputTokens;
457
+ grandOutputTokens += component.total.outputTokens;
458
+ grandTotalTokens += component.total.totalTokens;
459
+ }
460
+ logger.info("--- Summary ---");
461
+ if (grandPrimaryTotalTokens > 0) {
462
+ logger.info(
463
+ `Primary total: ${formatTokens({
464
+ inputTokens: grandPrimaryInputTokens,
465
+ outputTokens: grandPrimaryOutputTokens,
466
+ totalTokens: grandPrimaryTotalTokens
467
+ })}`
468
+ );
469
+ }
470
+ if (grandFallbackTotalTokens > 0) {
471
+ logger.info(
472
+ `Fallback total: ${formatTokens({
473
+ inputTokens: grandFallbackInputTokens,
474
+ outputTokens: grandFallbackOutputTokens,
475
+ totalTokens: grandFallbackTotalTokens
476
+ })}`
477
+ );
478
+ }
479
+ logger.info(
480
+ `Grand total: ${formatTokens({
481
+ inputTokens: grandInputTokens,
482
+ outputTokens: grandOutputTokens,
483
+ totalTokens: grandTotalTokens
484
+ })}`
485
+ );
486
+ }
487
+ /**
488
+ * Reset all tracked usage
489
+ *
490
+ * Call this at the start of a new document processing run.
491
+ */
492
+ reset() {
493
+ this.usage = {};
494
+ }
495
+ };
496
+
497
+ // src/utils/ref-resolver.ts
498
+ var RefResolver = class {
499
+ logger;
500
+ textMap;
501
+ pictureMap;
502
+ tableMap;
503
+ groupMap;
504
+ constructor(logger, doc) {
505
+ this.logger = logger;
506
+ this.logger.info("[RefResolver] Initializing reference resolver...");
507
+ this.textMap = this.buildIndex(doc.texts, "texts");
508
+ this.pictureMap = this.buildIndex(doc.pictures, "pictures");
509
+ this.tableMap = this.buildIndex(doc.tables, "tables");
510
+ this.groupMap = this.buildIndex(doc.groups, "groups");
511
+ this.logger.info(
512
+ `[RefResolver] Indexed ${this.textMap.size} texts, ${this.pictureMap.size} pictures, ${this.tableMap.size} tables, ${this.groupMap.size} groups`
513
+ );
514
+ }
515
+ /**
516
+ * Build an index mapping self_ref to the actual item
517
+ */
518
+ buildIndex(items, _prefix) {
519
+ const map = /* @__PURE__ */ new Map();
520
+ for (const item of items) {
521
+ map.set(item.self_ref, item);
522
+ }
523
+ return map;
524
+ }
525
+ /**
526
+ * Resolve a $ref string to the actual item
527
+ * @param ref - Reference string (e.g., "#/texts/0")
528
+ * @returns The resolved item, or null if not found
529
+ */
530
+ resolve(ref) {
531
+ const match = ref.match(/^#\/(\w+)\//);
532
+ if (!match) {
533
+ this.logger.warn(`[RefResolver] Invalid reference format: ${ref}`);
534
+ return null;
535
+ }
536
+ const collection = match[1];
537
+ if (collection === "texts") {
538
+ const result = this.textMap.get(ref) ?? null;
539
+ if (!result) {
540
+ this.logger.warn(`[RefResolver] Text reference not found: ${ref}`);
541
+ }
542
+ return result;
543
+ }
544
+ if (collection === "pictures") {
545
+ const result = this.pictureMap.get(ref) ?? null;
546
+ if (!result) {
547
+ this.logger.warn(`[RefResolver] Picture reference not found: ${ref}`);
548
+ }
549
+ return result;
550
+ }
551
+ if (collection === "tables") {
552
+ const result = this.tableMap.get(ref) ?? null;
553
+ if (!result) {
554
+ this.logger.warn(`[RefResolver] Table reference not found: ${ref}`);
555
+ }
556
+ return result;
557
+ }
558
+ if (collection === "groups") {
559
+ const result = this.groupMap.get(ref) ?? null;
560
+ if (!result) {
561
+ this.logger.warn(`[RefResolver] Group reference not found: ${ref}`);
562
+ }
563
+ return result;
564
+ }
565
+ this.logger.warn(`[RefResolver] Unknown collection type: ${collection}`);
566
+ return null;
567
+ }
568
+ /**
569
+ * Resolve a text reference
570
+ * @param ref - Reference string (e.g., "#/texts/0")
571
+ * @returns The resolved text item, or null if not found
572
+ */
573
+ resolveText(ref) {
574
+ return this.textMap.get(ref) ?? null;
575
+ }
576
+ /**
577
+ * Resolve a picture reference
578
+ * @param ref - Reference string (e.g., "#/pictures/0")
579
+ * @returns The resolved picture item, or null if not found
580
+ */
581
+ resolvePicture(ref) {
582
+ return this.pictureMap.get(ref) ?? null;
583
+ }
584
+ /**
585
+ * Resolve a table reference
586
+ * @param ref - Reference string (e.g., "#/tables/0")
587
+ * @returns The resolved table item, or null if not found
588
+ */
589
+ resolveTable(ref) {
590
+ return this.tableMap.get(ref) ?? null;
591
+ }
592
+ /**
593
+ * Resolve a group reference
594
+ * @param ref - Reference string (e.g., "#/groups/0")
595
+ * @returns The resolved group item, or null if not found
596
+ */
597
+ resolveGroup(ref) {
598
+ return this.groupMap.get(ref) ?? null;
599
+ }
600
+ /**
601
+ * Resolve multiple references at once
602
+ * @param refs - Array of reference objects with $ref property
603
+ * @returns Array of resolved items (null for unresolved references)
604
+ */
605
+ resolveMany(refs) {
606
+ return refs.map((ref) => this.resolve(ref.$ref));
607
+ }
608
+ };
609
+
610
+ // src/utils/id-generator.ts
611
+ var IdGenerator = class {
612
+ chapterCounter = 0;
613
+ imageCounter = 0;
614
+ tableCounter = 0;
615
+ footnoteCounter = 0;
616
+ /**
617
+ * Generate a chapter ID
618
+ * @returns A chapter ID in the format "ch-001"
619
+ */
620
+ generateChapterId() {
621
+ this.chapterCounter++;
622
+ return `ch-${this.padNumber(this.chapterCounter)}`;
623
+ }
624
+ /**
625
+ * Generate an image ID
626
+ * @returns An image ID in the format "img-001"
627
+ */
628
+ generateImageId() {
629
+ this.imageCounter++;
630
+ return `img-${this.padNumber(this.imageCounter)}`;
631
+ }
632
+ /**
633
+ * Generate a table ID
634
+ * @returns A table ID in the format "tbl-001"
635
+ */
636
+ generateTableId() {
637
+ this.tableCounter++;
638
+ return `tbl-${this.padNumber(this.tableCounter)}`;
639
+ }
640
+ /**
641
+ * Generate a footnote ID
642
+ * @returns A footnote ID in the format "ftn-001"
643
+ */
644
+ generateFootnoteId() {
645
+ this.footnoteCounter++;
646
+ return `ftn-${this.padNumber(this.footnoteCounter)}`;
647
+ }
648
+ /**
649
+ * Reset all counters to zero
650
+ */
651
+ reset() {
652
+ this.chapterCounter = 0;
653
+ this.imageCounter = 0;
654
+ this.tableCounter = 0;
655
+ this.footnoteCounter = 0;
656
+ }
657
+ /**
658
+ * Get current counter values (for testing/debugging)
659
+ */
660
+ getCounters() {
661
+ return {
662
+ chapter: this.chapterCounter,
663
+ image: this.imageCounter,
664
+ table: this.tableCounter,
665
+ footnote: this.footnoteCounter
666
+ };
667
+ }
668
+ /**
669
+ * Pad a number to 3 digits with leading zeros
670
+ */
671
+ padNumber(num) {
672
+ return num.toString().padStart(3, "0");
673
+ }
674
+ };
675
+
676
+ // src/utils/text-cleaner.ts
677
+ var TextCleaner = class {
678
+ /**
679
+ * Normalizes text
680
+ * - Converts consecutive spaces/line breaks to single space
681
+ * - Trims leading and trailing spaces
682
+ * - Normalizes special whitespace characters (tabs, non-breaking spaces, etc.)
683
+ */
684
+ static normalize(text) {
685
+ if (!text) return "";
686
+ let normalized = text.normalize("NFC");
687
+ normalized = normalized.replace(/[\t\u00A0\u2000-\u200B]/g, " ");
688
+ normalized = normalized.replace(/[\r\n]+/g, " ");
689
+ normalized = normalized.replace(/\s+/g, " ");
690
+ normalized = normalized.trim();
691
+ return normalized;
692
+ }
693
+ /**
694
+ * Clean text starting/ending with punctuation marks
695
+ * - Remove commas/periods at sentence start
696
+ * - Clean spaces and punctuation at sentence end
697
+ */
698
+ static cleanPunctuation(text) {
699
+ if (!text) return "";
700
+ let cleaned = text.replace(/^[,.:;!?]+\s*/, "");
701
+ cleaned = cleaned.replace(/\s+[,.:;!?]*$/, "");
702
+ return cleaned;
703
+ }
704
+ /**
705
+ * Filter text consisting only of numbers and spaces
706
+ */
707
+ static isValidText(text) {
708
+ if (!text) return false;
709
+ const cleaned = this.normalize(text);
710
+ return !/^\s*[\d\s]*$/.test(cleaned);
711
+ }
712
+ /**
713
+ * Batch normalization (for bulk processing)
714
+ */
715
+ static normalizeBatch(texts) {
716
+ return texts.map((text) => this.normalize(text));
717
+ }
718
+ /**
719
+ * Batch filtering (returns only valid text)
720
+ */
721
+ static filterValidTexts(texts) {
722
+ return texts.filter((text) => this.isValidText(text));
723
+ }
724
+ /**
725
+ * Batch normalization + filtering (stage 1 + stage 2 combined)
726
+ *
727
+ * Performs TextCleaner's basic normalization and filtering in batch processing at once.
728
+ * Splits large amounts of text into batches for efficient processing.
729
+ *
730
+ * If batchSize is 0, processes items sequentially without batch processing.
731
+ *
732
+ * @param texts - Original text array
733
+ * @param batchSize - Batch size (default: 10). Set to 0 for sequential processing without batching.
734
+ * @returns Normalized and filtered text array
735
+ *
736
+ * @example
737
+ * ```typescript
738
+ * const rawTexts = [' text 1 ', '123', 'text 2\n'];
739
+ * const cleaned = TextCleaner.normalizeAndFilterBatch(rawTexts, 10);
740
+ * // ['text 1', 'text 2']
741
+ *
742
+ * // Sequential processing (no batching)
743
+ * const cleanedSequential = TextCleaner.normalizeAndFilterBatch(rawTexts, 0);
744
+ * // ['text 1', 'text 2']
745
+ * ```
746
+ */
747
+ static normalizeAndFilterBatch(texts, batchSize = 10) {
748
+ if (batchSize === 0) {
749
+ const results = [];
750
+ for (const text of texts) {
751
+ const normalized = this.normalize(text);
752
+ if (this.isValidText(normalized)) {
753
+ results.push(normalized);
754
+ }
755
+ }
756
+ return results;
757
+ }
758
+ return BatchProcessor.processBatchSync(texts, batchSize, (batch) => {
759
+ const normalized = this.normalizeBatch(batch);
760
+ return this.filterValidTexts(normalized);
761
+ });
762
+ }
763
+ };
764
+
765
+ // src/utils/markdown-converter.ts
766
+ var MarkdownConverter = class _MarkdownConverter {
767
+ /**
768
+ * Convert TOC items (groups/tables) to Markdown string
769
+ *
770
+ * @param refs - Array of item references from TocAreaResult
771
+ * @param refResolver - RefResolver for resolving references
772
+ * @returns Markdown string representation of TOC
773
+ */
774
+ static convert(refs, refResolver) {
775
+ if (refs.length === 0) {
776
+ return "";
777
+ }
778
+ const lines = [];
779
+ for (const ref of refs) {
780
+ const item = refResolver.resolve(ref);
781
+ if (!item) {
782
+ continue;
783
+ }
784
+ if ("name" in item && (item.name === "list" || item.name === "group")) {
785
+ const groupMarkdown = _MarkdownConverter.groupToMarkdown(
786
+ item,
787
+ refResolver,
788
+ 0
789
+ );
790
+ if (groupMarkdown) {
791
+ lines.push(groupMarkdown);
792
+ }
793
+ } else if ("data" in item && "grid" in item.data) {
794
+ const tableMarkdown = _MarkdownConverter.tableToMarkdown(
795
+ item
796
+ );
797
+ if (tableMarkdown) {
798
+ lines.push(tableMarkdown);
799
+ }
800
+ } else if ("text" in item && "orig" in item) {
801
+ const textMarkdown = _MarkdownConverter.textToMarkdown(
802
+ item,
803
+ 0
804
+ );
805
+ if (textMarkdown) {
806
+ lines.push(textMarkdown);
807
+ }
808
+ }
809
+ }
810
+ return lines.join("\n\n");
811
+ }
812
+ /**
813
+ * Convert a group item to Markdown list format
814
+ *
815
+ * Handles nested lists and preserves hierarchy.
816
+ *
817
+ * @example
818
+ * Output:
819
+ * - Chapter 1 Introduction ..... 1
820
+ * - 1.1 Background ..... 3
821
+ * - 1.2 Objectives ..... 5
822
+ * - Chapter 2 Methodology ..... 10
823
+ */
824
+ static groupToMarkdown(group, refResolver, indentLevel = 0) {
825
+ const lines = [];
826
+ for (const childRef of group.children) {
827
+ const child = refResolver.resolve(childRef.$ref);
828
+ if (!child) {
829
+ continue;
830
+ }
831
+ if ("name" in child && (child.name === "list" || child.name === "group")) {
832
+ const nestedMarkdown = _MarkdownConverter.groupToMarkdown(
833
+ child,
834
+ refResolver,
835
+ indentLevel + 1
836
+ );
837
+ if (nestedMarkdown) {
838
+ lines.push(nestedMarkdown);
839
+ }
840
+ } else if ("text" in child && "orig" in child) {
841
+ const textMarkdown = _MarkdownConverter.textToMarkdown(
842
+ child,
843
+ indentLevel
844
+ );
845
+ if (textMarkdown) {
846
+ lines.push(textMarkdown);
847
+ }
848
+ }
849
+ }
850
+ return lines.join("\n");
851
+ }
852
+ /**
853
+ * Convert a table item to Markdown table format
854
+ *
855
+ * @example
856
+ * Output:
857
+ * | Chapter | Page |
858
+ * |---------|------|
859
+ * | Chapter 1 Introduction | 1 |
860
+ * | Chapter 2 Methodology | 10 |
861
+ */
862
+ static tableToMarkdown(table) {
863
+ const { grid } = table.data;
864
+ if (!grid || grid.length === 0) {
865
+ return "";
866
+ }
867
+ const lines = [];
868
+ for (let rowIdx = 0; rowIdx < grid.length; rowIdx++) {
869
+ const row = grid[rowIdx];
870
+ if (!row || row.length === 0) {
871
+ continue;
872
+ }
873
+ const cells = row.map(
874
+ (cell) => _MarkdownConverter.escapeTableCell(cell.text)
875
+ );
876
+ lines.push(`| ${cells.join(" | ")} |`);
877
+ if (rowIdx === 0) {
878
+ const separator = row.map(() => "---").join(" | ");
879
+ lines.push(`| ${separator} |`);
880
+ }
881
+ }
882
+ return lines.join("\n");
883
+ }
884
+ /**
885
+ * Convert a text item to Markdown line
886
+ */
887
+ static textToMarkdown(text, indentLevel = 0) {
888
+ const content = text.text.trim();
889
+ if (!content) {
890
+ return "";
891
+ }
892
+ const indent = _MarkdownConverter.getIndent(indentLevel);
893
+ const marker = _MarkdownConverter.getListMarker(
894
+ text.enumerated,
895
+ text.marker
896
+ );
897
+ return `${indent}${marker}${content}`;
898
+ }
899
+ /**
900
+ * Generate list marker based on enumeration and marker
901
+ */
902
+ static getListMarker(enumerated, marker) {
903
+ if (marker) {
904
+ return `${marker} `;
905
+ }
906
+ if (enumerated === true) {
907
+ return "1. ";
908
+ }
909
+ if (enumerated === false) {
910
+ return "- ";
911
+ }
912
+ return "- ";
913
+ }
914
+ /**
915
+ * Generate indent string (2 spaces per level)
916
+ */
917
+ static getIndent(level) {
918
+ return " ".repeat(level);
919
+ }
920
+ /**
921
+ * Escape special characters in table cell content
922
+ */
923
+ static escapeTableCell(text) {
924
+ return text.replace(/\|/g, "\\|").replace(/\n/g, " ").trim();
925
+ }
926
+ };
927
+
928
+ // src/converters/chapter-converter.ts
929
var ChapterConverter = class _ChapterConverter {
  // Synthetic chapter that receives all content appearing before the first
  // TOC entry (cover, preface, the TOC itself, ...).
  static FRONT_MATTER_ID = "ch-000";
  static FRONT_MATTER_TITLE = "Front Matter";
  logger;
  idGenerator;
  constructor(logger, idGenerator) {
    this.logger = logger;
    this.idGenerator = idGenerator;
  }
  /**
   * Convert TocEntry[] to Chapter[]
   *
   * Pipeline: build the chapter tree (plus a Front Matter chapter),
   * compute per-chapter page ranges, then distribute text blocks,
   * images, tables, and footnotes into chapters by page number.
   *
   * @param tocEntries - Table of contents entries
   * @param textItems - DoclingDocument.texts (with prov for page numbers)
   * @param pageRangeMap - PDF page to actual page mapping
   * @param images - Converted images
   * @param tables - Converted tables
   * @param footnotes - Converted footnotes
   * @returns Converted chapters with text blocks and resource references
   */
  convert(tocEntries, textItems, pageRangeMap, images, tables, footnotes) {
    this.logger.info("[ChapterConverter] Starting chapter conversion...");
    const frontMatter = this.createFrontMatterChapter();
    const tocChapters = this.buildChapterTree(tocEntries);
    this.logger.info(
      `[ChapterConverter] Built ${tocChapters.length} TOC chapters + Front Matter`
    );
    const allChapters = [frontMatter, ...tocChapters];
    // Ranges are computed over the flattened tree so nested chapters compete
    // for pages on equal footing with top-level ones.
    const flatChapters = this.flattenChapters(allChapters);
    const chapterRanges = this.calculatePageRanges(flatChapters, tocEntries);
    this.logger.info(
      `[ChapterConverter] Calculated ranges for ${chapterRanges.size} chapters`
    );
    const textBlocks = this.convertTextBlocks(textItems, pageRangeMap);
    this.assignTextBlocks(allChapters, textBlocks, chapterRanges, pageRangeMap);
    this.logger.info(
      `[ChapterConverter] Assigned ${textBlocks.length} text blocks`
    );
    this.linkResources(
      allChapters,
      images,
      tables,
      footnotes,
      chapterRanges,
      pageRangeMap
    );
    this.logger.info(
      `[ChapterConverter] Linked ${images.length} images, ${tables.length} tables, and ${footnotes.length} footnotes`
    );
    return allChapters;
  }
  /**
   * Create Front Matter chapter for pre-TOC content
   */
  createFrontMatterChapter() {
    return {
      id: _ChapterConverter.FRONT_MATTER_ID,
      originTitle: _ChapterConverter.FRONT_MATTER_TITLE,
      title: _ChapterConverter.FRONT_MATTER_TITLE,
      pageNo: 1,
      level: 1,
      textBlocks: [],
      imageIds: [],
      tableIds: [],
      footnoteIds: []
    };
  }
  /**
   * Build chapter tree from TocEntry[]
   * Recursively processes children
   */
  buildChapterTree(entries) {
    return entries.map((entry) => {
      const chapterId = this.idGenerator.generateChapterId();
      const chapter = {
        id: chapterId,
        // originTitle keeps the raw TOC text; title is the normalized form.
        originTitle: entry.title,
        title: TextCleaner.normalize(entry.title),
        pageNo: entry.pageNo,
        level: entry.level,
        textBlocks: [],
        imageIds: [],
        tableIds: [],
        footnoteIds: []
      };
      if (entry.children && entry.children.length > 0) {
        chapter.children = this.buildChapterTree(entry.children);
      }
      return chapter;
    });
  }
  /**
   * Flatten chapter tree for page range calculation
   * Preserves original TOC page numbers
   */
  flattenChapters(chapters) {
    const result = [];
    const flatten = (chapterList) => {
      for (const chapter of chapterList) {
        result.push({
          chapter,
          tocPageNo: chapter.pageNo
        });
        if (chapter.children && chapter.children.length > 0) {
          flatten(chapter.children);
        }
      }
    };
    flatten(chapters);
    return result;
  }
  /**
   * Calculate page range for each chapter
   * Uses next chapter's start page as end boundary
   *
   * Front Matter (ch-000) gets special handling:
   * - startPage: 1
   * - endPage: first TOC entry's page - 1 (or 0 if TOC starts at page 1)
   */
  calculatePageRanges(flatChapters, tocEntries) {
    const ranges = /* @__PURE__ */ new Map();
    if (flatChapters.length === 0) {
      return ranges;
    }
    // With no TOC entries, front matter effectively spans the whole document.
    const firstTocPage = tocEntries.length > 0 ? Math.min(...tocEntries.map((e) => e.pageNo)) : Number.MAX_SAFE_INTEGER;
    const tocChapters = flatChapters.filter(
      (fc) => fc.chapter.id !== _ChapterConverter.FRONT_MATTER_ID
    );
    const sorted = [...tocChapters].sort((a, b) => a.tocPageNo - b.tocPageNo);
    ranges.set(_ChapterConverter.FRONT_MATTER_ID, {
      startPage: 1,
      endPage: firstTocPage - 1
    });
    // Each chapter ends one page before the next one starts; the last chapter
    // extends to the end of the document.
    for (let i = 0; i < sorted.length; i++) {
      const current = sorted[i];
      const next = sorted[i + 1];
      ranges.set(current.chapter.id, {
        startPage: current.tocPageNo,
        endPage: next ? next.tocPageNo - 1 : Number.MAX_SAFE_INTEGER
      });
    }
    return ranges;
  }
  /**
   * Valid labels for text blocks
   * Only these labels are included in chapter text blocks
   */
  static VALID_TEXT_LABELS = /* @__PURE__ */ new Set([
    "text",
    "section_header",
    "list_item"
  ]);
  /**
   * Check if text item has a picture parent
   * Items with parent.$ref starting with "#/pictures/" are excluded
   * (their text belongs to a figure, not to body text).
   */
  static hasPictureParent(item) {
    const parentRef = item.parent?.$ref;
    return typeof parentRef === "string" && parentRef.startsWith("#/pictures/");
  }
  /**
   * Convert text items to text blocks
   * Filters by label (text, section_header, list_item), excludes picture children,
   * and extracts page numbers from prov
   */
  convertTextBlocks(textItems, _pageRangeMap) {
    return textItems.filter(
      (item) => _ChapterConverter.VALID_TEXT_LABELS.has(item.label) && !_ChapterConverter.hasPictureParent(item) && TextCleaner.isValidText(item.text)
    ).map((item) => {
      // Items without provenance default to page 1.
      const pdfPageNo = item.prov?.[0]?.page_no ?? 1;
      return {
        text: TextCleaner.normalize(item.text),
        pdfPageNo
      };
    });
  }
  /**
   * Convert PDF page number to actual document page number
   * Falls back to pdfPageNo if mapping is missing
   */
  pdfPageToActualPage(pdfPageNo, pageRangeMap) {
    const range = pageRangeMap[pdfPageNo];
    if (!range) {
      return pdfPageNo;
    }
    return range.startPageNo;
  }
  /**
   * Find chapter ID for a given actual page number
   * Uses "start page first" strategy: among all chapters whose range contains
   * the page, the one with the latest start page (most specific) wins.
   */
  findChapterForPage(actualPageNo, chapterRanges) {
    let bestMatch = null;
    let bestStartPage = -1;
    for (const [chapterId, range] of chapterRanges) {
      if (actualPageNo >= range.startPage && actualPageNo <= range.endPage) {
        if (range.startPage > bestStartPage) {
          bestStartPage = range.startPage;
          bestMatch = chapterId;
        }
      }
    }
    return bestMatch;
  }
  /**
   * Assign text blocks to chapters based on page ranges
   * Blocks that fall outside every chapter range are silently dropped.
   */
  assignTextBlocks(chapters, textBlocks, chapterRanges, pageRangeMap) {
    const chapterMap = this.buildChapterMap(chapters);
    for (const textBlock of textBlocks) {
      const actualPageNo = this.pdfPageToActualPage(
        textBlock.pdfPageNo,
        pageRangeMap
      );
      const chapterId = this.findChapterForPage(actualPageNo, chapterRanges);
      if (chapterId && chapterMap.has(chapterId)) {
        chapterMap.get(chapterId).textBlocks.push(textBlock);
      }
    }
  }
  /**
   * Link images, tables, and footnotes to chapters based on page ranges
   * Same page-based assignment as text blocks; only the resource ids are stored.
   */
  linkResources(chapters, images, tables, footnotes, chapterRanges, pageRangeMap) {
    const chapterMap = this.buildChapterMap(chapters);
    for (const image of images) {
      const actualPageNo = this.pdfPageToActualPage(
        image.pdfPageNo,
        pageRangeMap
      );
      const chapterId = this.findChapterForPage(actualPageNo, chapterRanges);
      if (chapterId && chapterMap.has(chapterId)) {
        chapterMap.get(chapterId).imageIds.push(image.id);
      }
    }
    for (const table of tables) {
      const actualPageNo = this.pdfPageToActualPage(
        table.pdfPageNo,
        pageRangeMap
      );
      const chapterId = this.findChapterForPage(actualPageNo, chapterRanges);
      if (chapterId && chapterMap.has(chapterId)) {
        chapterMap.get(chapterId).tableIds.push(table.id);
      }
    }
    for (const footnote of footnotes) {
      const actualPageNo = this.pdfPageToActualPage(
        footnote.pdfPageNo,
        pageRangeMap
      );
      const chapterId = this.findChapterForPage(actualPageNo, chapterRanges);
      if (chapterId && chapterMap.has(chapterId)) {
        chapterMap.get(chapterId).footnoteIds.push(footnote.id);
      }
    }
  }
  /**
   * Build flat chapter map for O(1) lookup
   */
  buildChapterMap(chapters) {
    const map = /* @__PURE__ */ new Map();
    const addToMap = (chapterList) => {
      for (const chapter of chapterList) {
        map.set(chapter.id, chapter);
        if (chapter.children && chapter.children.length > 0) {
          addToMap(chapter.children);
        }
      }
    };
    addToMap(chapters);
    return map;
  }
};
1202
+
1203
+ // src/extractors/toc-extract-error.ts
1204
var TocExtractError = class _TocExtractError extends Error {
  /**
   * Base class for all TOC extraction errors.
   */
  constructor(message, options) {
    super(message, options);
    this.name = "TocExtractError";
  }
  /**
   * Extract error message from unknown error type
   */
  static getErrorMessage(error) {
    if (error instanceof Error) {
      return error.message;
    }
    return String(error);
  }
  /**
   * Create TocExtractError from unknown error with context
   * The original error is preserved as `cause`.
   */
  static fromError(context, error) {
    const detail = _TocExtractError.getErrorMessage(error);
    return new _TocExtractError(`${context}: ${detail}`, { cause: error });
  }
};
1225
// Raised when no table-of-contents area can be located in the document.
// Callers may supply a custom message; otherwise the default is used.
var TocNotFoundError = class extends TocExtractError {
  constructor(message = "Table of contents not found in the document") {
    super(message);
    this.name = "TocNotFoundError";
  }
};
1231
// Raised when the LLM (or downstream parsing) fails to turn the TOC area
// into structured entries. `options.cause` carries the underlying error.
var TocParseError = class extends TocExtractError {
  constructor(message, options) {
    super(message, options);
    this.name = "TocParseError";
  }
};
1237
var TocValidationError = class extends TocExtractError {
  /**
   * Validation result with detailed issues
   */
  validationResult;
  constructor(message, validationResult) {
    super(message);
    this.name = "TocValidationError";
    this.validationResult = validationResult;
  }
  /**
   * Get formatted error summary
   * Renders a header followed by three indented lines per issue.
   */
  getSummary() {
    const { errorCount, issues } = this.validationResult;
    const header = [
      `TOC validation failed: ${errorCount} error(s)`,
      "",
      "Issues:"
    ];
    const details = issues.flatMap((issue) => [
      `  [${issue.code}] ${issue.message}`,
      `    Path: ${issue.path}`,
      `    Entry: "${issue.entry.title}" (page ${issue.entry.pageNo})`
    ]);
    return [...header, ...details].join("\n");
  }
};
1267
+
1268
+ // src/extractors/toc-validator.ts
1269
// Default validator options, overlaid by caller-supplied options.
var DEFAULT_OPTIONS = {
  // Upper bound for valid page numbers; Infinity disables the V002 upper check.
  totalPages: Infinity,
  // Maximum accepted title length before V004 is reported.
  maxTitleLength: 200
};
1273
var TocValidator = class {
  // Effective options (defaults merged with caller overrides).
  options;
  // Issues accumulated during the current validate() run.
  issues;
  constructor(options) {
    this.options = { ...DEFAULT_OPTIONS, ...options };
    this.issues = [];
  }
  /**
   * Validate TocEntry array
   *
   * @param entries - TOC entries to validate
   * @returns Validation result
   */
  validate(entries) {
    this.issues = [];
    this.validateEntries(entries, "", null, /* @__PURE__ */ new Set());
    const errorCount = this.issues.length;
    return {
      valid: errorCount === 0,
      issues: [...this.issues],
      errorCount
    };
  }
  /**
   * Validate and throw if invalid
   *
   * @param entries - TOC entries to validate
   * @throws {TocValidationError} When validation fails
   */
  validateOrThrow(entries) {
    const result = this.validate(entries);
    if (result.valid) {
      return;
    }
    throw new TocValidationError(
      `TOC validation failed with ${result.errorCount} error(s)`,
      result
    );
  }
  /**
   * Recursively validate entries.
   * `seenKeys` is shared across the whole tree so duplicates are
   * detected globally, not just among siblings.
   */
  validateEntries(entries, parentPath, parentEntry, seenKeys) {
    let prevPageNo = parentEntry?.pageNo ?? 0;
    entries.forEach((entry, i) => {
      const entryPath = parentPath === "" ? `[${i}]` : `${parentPath}.children[${i}]`;
      this.validateTitle(entry, entryPath);
      this.validateTitleLength(entry, entryPath);
      this.validatePageRange(entry, entryPath);
      this.validatePageOrder(entry, entryPath, prevPageNo);
      prevPageNo = entry.pageNo;
      if (parentEntry) {
        this.validateParentChildPage(entry, entryPath, parentEntry);
      }
      const key = `${entry.title}:${entry.pageNo}`;
      this.validateDuplicate(entry, entryPath, key, seenKeys);
      seenKeys.add(key);
      if (entry.children && entry.children.length > 0) {
        this.validateEntries(entry.children, entryPath, entry, seenKeys);
      }
    });
  }
  /**
   * V003: title must contain non-whitespace characters.
   */
  validateTitle(entry, entryPath) {
    if (entry.title && entry.title.trim() !== "") {
      return;
    }
    this.addIssue({
      code: "V003",
      message: "Title is empty or contains only whitespace",
      path: entryPath,
      entry
    });
  }
  /**
   * V004: title must not exceed the configured maximum length.
   */
  validateTitleLength(entry, entryPath) {
    if (entry.title.length <= this.options.maxTitleLength) {
      return;
    }
    this.addIssue({
      code: "V004",
      message: `Title exceeds ${this.options.maxTitleLength} characters (${entry.title.length})`,
      path: entryPath,
      entry
    });
  }
  /**
   * V002: page number must lie within [1, totalPages].
   * Both bounds are checked independently, so an entry can raise two issues.
   */
  validatePageRange(entry, entryPath) {
    if (entry.pageNo < 1) {
      this.addIssue({
        code: "V002",
        message: `Page number must be >= 1, got ${entry.pageNo}`,
        path: entryPath,
        entry
      });
    }
    if (entry.pageNo > this.options.totalPages) {
      this.addIssue({
        code: "V002",
        message: `Page number ${entry.pageNo} exceeds document total pages (${this.options.totalPages})`,
        path: entryPath,
        entry
      });
    }
  }
  /**
   * V001: sibling page numbers must be non-decreasing.
   */
  validatePageOrder(entry, entryPath, prevPageNo) {
    if (entry.pageNo >= prevPageNo) {
      return;
    }
    this.addIssue({
      code: "V001",
      message: `Page number decreased from ${prevPageNo} to ${entry.pageNo}`,
      path: entryPath,
      entry
    });
  }
  /**
   * V005: a child may not start before its parent.
   */
  validateParentChildPage(entry, entryPath, parent) {
    if (entry.pageNo >= parent.pageNo) {
      return;
    }
    this.addIssue({
      code: "V005",
      message: `Child page (${entry.pageNo}) is before parent page (${parent.pageNo})`,
      path: entryPath,
      entry
    });
  }
  /**
   * V006: identical (title, page) pairs anywhere in the tree are duplicates.
   */
  validateDuplicate(entry, entryPath, key, seenKeys) {
    if (!seenKeys.has(key)) {
      return;
    }
    this.addIssue({
      code: "V006",
      message: `Duplicate entry: "${entry.title}" at page ${entry.pageNo}`,
      path: entryPath,
      entry
    });
  }
  /**
   * Record a single validation issue.
   */
  addIssue(issue) {
    this.issues.push(issue);
  }
};
1431
+
1432
+ // src/extractors/toc-finder.ts
1433
// TOC heading keywords in Korean, Chinese (simplified/traditional),
// Japanese, and English. Matching is case-insensitive substring
// (see TocFinder.containsTocKeyword), so the all-caps variants are
// redundant safety nets.
var TOC_KEYWORDS = [
  "\uBAA9\uCC28",
  "\uCC28\uB840",
  "\uBAA9 \uCC28",
  "\u76EE\u5F55",
  "\u76EE \u5F55",
  "\u5185\u5BB9",
  "\u5167\u5BB9",
  "\u76EE\u6B21",
  "\u76EE \u6B21",
  "Contents",
  "Table of Contents",
  "TABLE OF CONTENTS",
  "CONTENTS"
];
// Markers indicating a TOC continues onto the following page
// ("continued" in the same set of languages).
var CONTINUATION_MARKERS = [
  "\uBAA9\uCC28(\uACC4\uC18D)",
  "\uBAA9\uCC28 (\uACC4\uC18D)",
  "(\uACC4\uC18D)",
  "\u76EE\u5F55(\u7EED)",
  "\u76EE\u5F55 (\u7EED)",
  "(\u7EED)",
  "\u7EED\u8868",
  "\u76EE\u6B21(\u7D9A)",
  "\u76EE\u6B21 (\u7D9A)",
  "(\u7D9A)",
  "(continued)",
  "(Continued)",
  "(CONTINUED)",
  "continued"
];
// Matches a trailing page number at end of line in any of three forms:
// dotted leaders ("..... 12"), ellipsis leaders ("… 12"), or plain
// whitespace followed by digits (" 12").
var PAGE_NUMBER_PATTERN = /\.{2,}\s*\d+\s*$|…+\s*\d+\s*$|\s+\d+\s*$/;
1465
var TocFinder = class {
  constructor(logger, refResolver, options) {
    this.logger = logger;
    this.refResolver = refResolver;
    this.maxSearchPages = options?.maxSearchPages ?? 10;
    // Built-in multilingual keywords plus any caller-supplied extras.
    this.keywords = [...TOC_KEYWORDS, ...options?.additionalKeywords ?? []];
  }
  // Only the first N pages of the document are searched for a TOC.
  maxSearchPages;
  keywords;
  /**
   * Find TOC area in the document
   *
   * Two-stage strategy: keyword match first (stage 1), then structural
   * analysis of lists/tables with trailing page numbers (stage 2).
   *
   * @throws {TocNotFoundError} When no TOC area is found
   */
  find(doc) {
    this.logger.info("[TocFinder] Starting TOC search...");
    const keywordResult = this.findByKeywords(doc);
    if (keywordResult) {
      this.logger.info(
        `[TocFinder] Found TOC by keyword search: pages ${keywordResult.startPage}-${keywordResult.endPage}`
      );
      return keywordResult;
    }
    const structureResult = this.findByStructure(doc);
    if (structureResult) {
      this.logger.info(
        `[TocFinder] Found TOC by structure analysis: pages ${structureResult.startPage}-${structureResult.endPage}`
      );
      return structureResult;
    }
    this.logger.warn("[TocFinder] No TOC found in document");
    throw new TocNotFoundError();
  }
  /**
   * Stage 1: Search by keywords in text items
   * Returns the first keyword hit within the search window, expanded to
   * its containing group/table and any continuation pages; null if none.
   */
  findByKeywords(doc) {
    for (const text of doc.texts) {
      if (!this.containsTocKeyword(text.text)) {
        continue;
      }
      const pageNo = text.prov[0]?.page_no;
      if (pageNo === void 0 || pageNo > this.maxSearchPages) {
        continue;
      }
      this.logger.info(
        `[TocFinder] Found TOC keyword "${text.text}" on page ${pageNo}`
      );
      const parentRef = text.parent?.$ref;
      if (!parentRef) {
        // Keyword item has no container: treat the item itself as the TOC area.
        return {
          itemRefs: [text.self_ref],
          startPage: pageNo,
          endPage: pageNo
        };
      }
      const result = this.findTocContainer(doc, parentRef, pageNo);
      if (result) {
        return this.expandToConsecutivePages(result, doc);
      }
    }
    return null;
  }
  /**
   * Stage 2: Search by structure (lists/tables with page numbers)
   * Scores every candidate and expands the best one; null if none qualify.
   */
  findByStructure(doc) {
    const candidates = [];
    for (const group of doc.groups) {
      const pageNo = this.getGroupFirstPage(group);
      if (pageNo === void 0 || pageNo > this.maxSearchPages) {
        continue;
      }
      if (this.isGroupTocLike(group, doc)) {
        const score = this.calculateScore(group, pageNo);
        candidates.push({
          result: {
            itemRefs: [group.self_ref],
            startPage: pageNo,
            endPage: pageNo
          },
          score
        });
      }
    }
    for (const table of doc.tables) {
      const pageNo = table.prov[0]?.page_no;
      if (pageNo === void 0 || pageNo > this.maxSearchPages) {
        continue;
      }
      if (this.isTableTocLike(table)) {
        const score = this.calculateTableScore(table, pageNo);
        candidates.push({
          result: {
            itemRefs: [table.self_ref],
            startPage: pageNo,
            endPage: pageNo
          },
          score
        });
      }
    }
    if (candidates.length === 0) {
      return null;
    }
    // Highest score wins.
    candidates.sort((a, b) => b.score - a.score);
    const best = candidates[0];
    return this.expandToConsecutivePages(best.result, doc);
  }
  /**
   * Find the TOC container (group or table) from a parent reference
   * Walks up the parent chain until a group or table is found.
   */
  findTocContainer(doc, parentRef, pageNo) {
    const group = this.refResolver.resolveGroup(parentRef);
    if (group) {
      return {
        itemRefs: [group.self_ref],
        startPage: pageNo,
        endPage: pageNo
      };
    }
    const table = this.refResolver.resolveTable(parentRef);
    if (table) {
      return {
        itemRefs: [table.self_ref],
        startPage: pageNo,
        endPage: pageNo
      };
    }
    const item = this.refResolver.resolve(parentRef);
    if (item && item.parent?.$ref) {
      return this.findTocContainer(doc, item.parent.$ref, pageNo);
    }
    return null;
  }
  /**
   * Check if a group contains TOC-like structure
   * Heuristic: at least 3 children ending in page numbers, or more than
   * half of resolvable children ending in page numbers.
   */
  isGroupTocLike(group, _doc) {
    if (group.name !== "list" && group.name !== "group") {
      return false;
    }
    let pageNumberCount = 0;
    const children = this.refResolver.resolveMany(group.children);
    for (const child of children) {
      if (!child) continue;
      if ("text" in child && "orig" in child) {
        const textItem = child;
        if (PAGE_NUMBER_PATTERN.test(textItem.text)) {
          pageNumberCount++;
        }
      }
    }
    const total = children.filter((c) => c !== null).length;
    return pageNumberCount >= 3 || total > 0 && pageNumberCount / total > 0.5;
  }
  /**
   * Check if a table contains TOC-like structure
   * A docling "document_index" label is an immediate match; otherwise the
   * table needs >= 3 rows, >= 2 columns, and a numeric last column in more
   * than half of the non-header rows.
   */
  isTableTocLike(table) {
    if (table.label === "document_index") {
      return true;
    }
    const { grid, num_rows, num_cols } = table.data;
    if (num_rows < 3 || num_cols < 2) {
      return false;
    }
    let numberCount = 0;
    // Row 0 is assumed to be a header and skipped.
    for (let row = 1; row < grid.length; row++) {
      const lastCell = grid[row]?.[num_cols - 1];
      if (lastCell && /^\d+$/.test(lastCell.text.trim())) {
        numberCount++;
      }
    }
    return numberCount > 0 && numberCount / (num_rows - 1) > 0.5;
  }
  /**
   * Expand TOC area to consecutive pages
   * Stops at the first page (within the search window) with no continuation.
   */
  expandToConsecutivePages(initial, doc) {
    const itemRefs = [...initial.itemRefs];
    let endPage = initial.endPage;
    for (let pageNo = initial.endPage + 1; pageNo <= this.maxSearchPages; pageNo++) {
      const continuationItems = this.findContinuationOnPage(doc, pageNo);
      if (continuationItems.length === 0) {
        break;
      }
      itemRefs.push(...continuationItems);
      endPage = pageNo;
    }
    return {
      itemRefs,
      startPage: initial.startPage,
      endPage
    };
  }
  /**
   * Find TOC continuation items on a specific page
   * Collects (deduplicated) groups flagged by continuation markers,
   * TOC-like groups, and TOC-like tables on that page.
   */
  findContinuationOnPage(doc, pageNo) {
    const refs = [];
    for (const text of doc.texts) {
      if (text.prov[0]?.page_no !== pageNo) {
        continue;
      }
      if (this.hasContinuationMarker(text.text)) {
        const parentRef = text.parent?.$ref;
        if (parentRef) {
          const group = this.refResolver.resolveGroup(parentRef);
          if (group) {
            refs.push(group.self_ref);
          }
        }
      }
    }
    for (const group of doc.groups) {
      const groupPage = this.getGroupFirstPage(group);
      if (groupPage !== pageNo) {
        continue;
      }
      if (this.isGroupTocLike(group, doc) && !refs.includes(group.self_ref)) {
        refs.push(group.self_ref);
      }
    }
    for (const table of doc.tables) {
      if (table.prov[0]?.page_no !== pageNo) {
        continue;
      }
      if (this.isTableTocLike(table) && !refs.includes(table.self_ref)) {
        refs.push(table.self_ref);
      }
    }
    return refs;
  }
  /**
   * Check if text contains TOC keyword (case-insensitive substring match)
   */
  containsTocKeyword(text) {
    const normalizedText = text.trim().toLowerCase();
    return this.keywords.some(
      (keyword) => normalizedText.includes(keyword.toLowerCase())
    );
  }
  /**
   * Check for continuation markers (case-insensitive substring match)
   */
  hasContinuationMarker(text) {
    const normalizedText = text.trim().toLowerCase();
    return CONTINUATION_MARKERS.some(
      (marker) => normalizedText.includes(marker.toLowerCase())
    );
  }
  /**
   * Get first page number of a group by checking its children
   * Returns undefined when no child carries provenance.
   */
  getGroupFirstPage(group) {
    for (const childRef of group.children) {
      const child = this.refResolver.resolve(childRef.$ref);
      if (child && "prov" in child) {
        const prov = child.prov;
        if (prov && prov[0]?.page_no !== void 0) {
          return prov[0].page_no;
        }
      }
    }
    return void 0;
  }
  /**
   * Calculate score for a group candidate
   * Higher score = better match: earlier page, more children,
   * more children ending in page numbers.
   */
  calculateScore(group, pageNo) {
    let score = 0;
    score += (this.maxSearchPages - pageNo + 1) * 10;
    score += group.children.length * 2;
    const children = this.refResolver.resolveMany(group.children);
    for (const child of children) {
      if (child && "text" in child) {
        const textItem = child;
        if (PAGE_NUMBER_PATTERN.test(textItem.text)) {
          score += 5;
        }
      }
    }
    return score;
  }
  /**
   * Calculate score for a table candidate
   * Earlier page and more rows score higher; an explicit
   * "document_index" label gets a large bonus.
   */
  calculateTableScore(table, pageNo) {
    let score = 0;
    score += (this.maxSearchPages - pageNo + 1) * 10;
    score += table.data.num_rows * 2;
    if (table.label === "document_index") {
      score += 50;
    }
    return score;
  }
};
1764
+
1765
+ // src/extractors/toc-extractor.ts
1766
+ var import_zod = require("zod");
1767
+
1768
+ // src/core/base-llm-component.ts
1769
var BaseLLMComponent = class {
  logger;
  model;
  fallbackModel;
  maxRetries;
  temperature;
  componentName;
  aggregator;
  abortSignal;
  /**
   * Constructor for BaseLLMComponent
   *
   * @param logger - Logger instance for logging
   * @param model - Primary language model for LLM calls
   * @param componentName - Name of the component for logging (e.g., "TocExtractor")
   * @param options - Optional configuration (maxRetries, temperature, abortSignal)
   * @param fallbackModel - Optional fallback model for retry on failure
   * @param aggregator - Optional token usage aggregator for tracking LLM calls
   */
  constructor(logger, model, componentName, options, fallbackModel, aggregator) {
    this.logger = logger;
    this.model = model;
    this.componentName = componentName;
    this.fallbackModel = fallbackModel;
    this.aggregator = aggregator;
    // Defaults: 3 retries, deterministic temperature 0, no abort signal.
    this.maxRetries = options?.maxRetries ?? 3;
    this.temperature = options?.temperature ?? 0;
    this.abortSignal = options?.abortSignal;
  }
  /**
   * Log a message through the injected logger, prefixing it with the
   * component name so log lines are attributable.
   *
   * @param level - Log level ('info', 'warn', 'error')
   * @param message - Message to log (without prefix)
   * @param args - Additional arguments to pass to logger
   */
  log(level, message, ...args) {
    this.logger[level](`[${this.componentName}] ${message}`, ...args);
  }
  /**
   * Forward token usage to the aggregator; a no-op when none was supplied.
   *
   * @param usage - Token usage information to track
   */
  trackUsage(usage) {
    if (!this.aggregator) {
      return;
    }
    this.aggregator.track(usage);
  }
  /**
   * Build a zeroed usage record for paths that never reach the LLM
   * (e.g. empty-input short circuits).
   *
   * @param phase - Phase name for the usage record
   * @returns Empty ExtendedTokenUsage object
   */
  createEmptyUsage(phase) {
    return {
      component: this.componentName,
      phase,
      model: "primary",
      modelName: "none",
      inputTokens: 0,
      outputTokens: 0,
      totalTokens: 0
    };
  }
};
1837
+
1838
+ // src/core/text-llm-component.ts
1839
var TextLLMComponent = class extends BaseLLMComponent {
  constructor(logger, model, componentName, options, fallbackModel, aggregator) {
    super(logger, model, componentName, options, fallbackModel, aggregator);
  }
  /**
   * Call LLM with text-based prompts using LLMCaller.call()
   *
   * Token usage from the call is forwarded to the aggregator (if any)
   * before the result is returned.
   *
   * @template TSchema - Zod schema type for response validation
   * @param schema - Zod schema for response validation
   * @param systemPrompt - System prompt for LLM
   * @param userPrompt - User prompt for LLM
   * @param phase - Phase name for tracking (e.g., 'extraction', 'validation')
   * @returns Promise with parsed object and usage information
   */
  async callTextLLM(schema, systemPrompt, userPrompt, phase) {
    const request = {
      schema,
      systemPrompt,
      userPrompt,
      primaryModel: this.model,
      fallbackModel: this.fallbackModel,
      maxRetries: this.maxRetries,
      temperature: this.temperature,
      abortSignal: this.abortSignal,
      component: this.componentName,
      phase
    };
    const { output, usage } = await LLMCaller.call(request);
    this.trackUsage(usage);
    return { output, usage };
  }
};
1873
+
1874
+ // src/extractors/toc-extractor.ts
1875
// Recursive schema: a TOC entry may nest arbitrarily deep via `children`.
// z.lazy defers evaluation so the self-reference resolves at parse time.
var TocEntrySchema = import_zod.z.lazy(
  () => import_zod.z.object({
    title: import_zod.z.string().describe("Chapter or section title"),
    level: import_zod.z.number().int().min(1).describe("Hierarchy depth (1 = top level)"),
    pageNo: import_zod.z.number().int().min(1).describe("Starting page number"),
    children: import_zod.z.array(TocEntrySchema).optional().describe("Child sections")
  })
);
// Expected top-level LLM response shape: a flat array of level-1 entries.
var TocResponseSchema = import_zod.z.object({
  entries: import_zod.z.array(TocEntrySchema).describe("Extracted TOC entries")
});
1886
+ var TocExtractor = class extends TextLLMComponent {
1887
+ validationOptions;
1888
+ skipValidation;
1889
+ constructor(logger, model, options, fallbackModel, abortSignal) {
1890
+ super(
1891
+ logger,
1892
+ model,
1893
+ "TocExtractor",
1894
+ { ...options, abortSignal },
1895
+ fallbackModel
1896
+ );
1897
+ this.validationOptions = options?.validation;
1898
+ this.skipValidation = options?.skipValidation ?? false;
1899
+ }
1900
+ /**
1901
+ * Extract TOC structure from Markdown
1902
+ *
1903
+ * @param markdown - Markdown representation of TOC area
1904
+ * @returns Object with entries array and token usage information
1905
+ * @throws {TocParseError} When LLM fails to parse structure
1906
+ * @throws {TocValidationError} When validation fails
1907
+ */
1908
+ async extract(markdown) {
1909
+ this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
1910
+ if (!markdown.trim()) {
1911
+ this.log("info", "Empty markdown, returning empty array");
1912
+ return {
1913
+ entries: [],
1914
+ usage: this.createEmptyUsage("extraction")
1915
+ };
1916
+ }
1917
+ try {
1918
+ const result = await this.callTextLLM(
1919
+ TocResponseSchema,
1920
+ this.buildSystemPrompt(),
1921
+ this.buildUserPrompt(markdown),
1922
+ "extraction"
1923
+ );
1924
+ const entries = this.normalizeEntries(result.output.entries);
1925
+ if (!this.skipValidation) {
1926
+ this.validateEntries(entries);
1927
+ }
1928
+ this.log(
1929
+ "info",
1930
+ `Extraction completed: ${entries.length} top-level entries`
1931
+ );
1932
+ return { entries, usage: result.usage };
1933
+ } catch (error) {
1934
+ if (error instanceof TocValidationError) {
1935
+ this.log("error", `Validation failed: ${error.message}`);
1936
+ throw error;
1937
+ }
1938
+ const message = error instanceof Error ? error.message : String(error);
1939
+ this.log("error", `Extraction failed: ${message}`);
1940
+ throw new TocParseError(`Failed to extract TOC structure: ${message}`, {
1941
+ cause: error
1942
+ });
1943
+ }
1944
+ }
1945
+ /**
1946
+ * Validate extracted entries
1947
+ *
1948
+ * @throws {TocValidationError} When validation fails
1949
+ */
1950
+ validateEntries(entries) {
1951
+ if (entries.length === 0) {
1952
+ return;
1953
+ }
1954
+ const validator = new TocValidator(this.validationOptions);
1955
+ validator.validateOrThrow(entries);
1956
+ }
1957
  /**
   * Build system prompt for TOC extraction
   *
   * Returns a constant prompt instructing the model to: strip page-number
   * dot leaders from titles, infer hierarchy level from numbering and
   * indentation, convert Roman page numbers to Arabic, nest children under
   * parents, and extract only the main TOC (photo/drawing/table/appendix
   * indices are explicitly excluded). The escaped \uXXXX sequences are
   * Korean keywords for those supplementary indices.
   */
  buildSystemPrompt() {
    return `You are a document structure extraction assistant. Your task is to parse a table of contents (TOC) from markdown format and extract structured entries.

## Instructions

1. **Title**: Extract the exact chapter/section title from each line. Remove page number indicators like "..... 10" or "... 5" at the end.

2. **Level**: Determine the hierarchy depth:
- Level 1: Top-level chapters (e.g., "\uC81C1\uC7A5", "Chapter 1", "I.", "Part 1")
- Level 2: Main sections within chapters (e.g., "1.", "1.1", "A.")
- Level 3: Subsections (e.g., "1.1.1", "a.", "(1)")
- Use indentation and numbering patterns to infer level

3. **Page Number**: Extract the page number from each entry. Convert Roman numerals to Arabic numerals if present (e.g., "iv" \u2192 4).

4. **Children**: Nest child entries under parent entries based on their hierarchy level.

5. **IMPORTANT - Extract Main TOC Only**: Only extract the main document table of contents. EXCLUDE the following supplementary indices:
- Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, \uD654\uBCF4 \uBAA9\uCC28, Photo Index, List of Photos, List of Figures)
- Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, \uC0BD\uB3C4 \uBAA9\uCC28, Drawing Index, List of Drawings)
- Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
- Appendix indices (\uBD80\uB85D \uBAA9\uCC28, Appendix Index)
- Any other supplementary material indices

## Output Format

Return a flat array of top-level entries. Each entry at level 1 should contain its children (level 2+) nested properly.

## Example

Input:
- \uC81C1\uC7A5 \uC11C\uB860 ..... 1
- 1. \uC5F0\uAD6C \uBC30\uACBD ..... 3
- 2. \uC5F0\uAD6C \uBAA9\uC801 ..... 5
- \uC81C2\uC7A5 \uBC29\uBC95\uB860 ..... 10

Output:
{
"entries": [
{
"title": "\uC81C1\uC7A5 \uC11C\uB860",
"level": 1,
"pageNo": 1,
"children": [
{ "title": "1. \uC5F0\uAD6C \uBC30\uACBD", "level": 2, "pageNo": 3 },
{ "title": "2. \uC5F0\uAD6C \uBAA9\uC801", "level": 2, "pageNo": 5 }
]
},
{ "title": "\uC81C2\uC7A5 \uBC29\uBC95\uB860", "level": 1, "pageNo": 10 }
]
}`;
  }
2012
+ /**
2013
+ * Build user prompt with Markdown content
2014
+ */
2015
+ buildUserPrompt(markdown) {
2016
+ return `Extract the table of contents structure from the following markdown:
2017
+
2018
+ ${markdown}`;
2019
+ }
2020
+ /**
2021
+ * Normalize and validate extracted entries
2022
+ */
2023
+ normalizeEntries(entries) {
2024
+ if (entries.length === 0) {
2025
+ return [];
2026
+ }
2027
+ return this.normalizeLevel(entries, 1);
2028
+ }
2029
+ /**
2030
+ * Recursively ensure level consistency
2031
+ *
2032
+ * Children must have level = parent.level + 1
2033
+ */
2034
+ normalizeLevel(entries, expectedLevel) {
2035
+ return entries.map((entry) => {
2036
+ const normalizedEntry = {
2037
+ title: entry.title.trim(),
2038
+ level: expectedLevel,
2039
+ pageNo: entry.pageNo
2040
+ };
2041
+ if (entry.children && entry.children.length > 0) {
2042
+ normalizedEntry.children = this.normalizeLevel(
2043
+ entry.children,
2044
+ expectedLevel + 1
2045
+ );
2046
+ }
2047
+ return normalizedEntry;
2048
+ });
2049
+ }
2050
+ };
2051
+
2052
+ // src/extractors/vision-toc-extractor.ts
2053
+ var fs2 = __toESM(require("fs"), 1);
2054
+ var path2 = __toESM(require("path"), 1);
2055
+ var import_zod2 = require("zod");
2056
+
2057
+ // src/core/vision-llm-component.ts
2058
+ var fs = __toESM(require("fs"), 1);
2059
+ var path = __toESM(require("path"), 1);
2060
var VisionLLMComponent = class extends BaseLLMComponent {
  // Base directory against which relative image paths are resolved.
  outputPath;
  /**
   * @param outputPath - Directory used to resolve relative image paths.
   * All other parameters are forwarded unchanged to BaseLLMComponent.
   */
  constructor(logger, model, componentName, outputPath, options, fallbackModel, aggregator) {
    super(logger, model, componentName, options, fallbackModel, aggregator);
    this.outputPath = outputPath;
  }
  /**
   * Call the vision-capable LLM through LLMCaller.callVision() and record
   * token usage before returning.
   *
   * @param schema - Zod schema for response validation
   * @param messages - Messages array including image content
   * @param phase - Phase name for tracking (e.g., 'extraction', 'sampling')
   * @returns Promise with parsed object and usage information
   */
  async callVisionLLM(schema, messages, phase) {
    const { output, usage } = await LLMCaller.callVision({
      schema,
      messages,
      primaryModel: this.model,
      fallbackModel: this.fallbackModel,
      maxRetries: this.maxRetries,
      temperature: this.temperature,
      abortSignal: this.abortSignal,
      component: this.componentName,
      phase
    });
    this.trackUsage(usage);
    return { output, usage };
  }
  /**
   * Read an image file from disk and return its base64 encoding.
   *
   * @param imagePath - Absolute path to the image file
   * @returns Base64 encoded image string
   */
  loadImageAsBase64(imagePath) {
    return fs.readFileSync(imagePath).toString("base64");
  }
  /**
   * Build an image content object for vision LLM messages. Relative
   * paths are resolved against this.outputPath.
   *
   * @param imagePath - Path to the image file (relative to outputPath or absolute)
   * @param mimeType - MIME type of the image (default: 'image/png')
   * @returns ImageContent object for LLM message
   */
  buildImageContent(imagePath, mimeType = "image/png") {
    const resolvedPath = path.isAbsolute(imagePath)
      ? imagePath
      : path.resolve(this.outputPath, imagePath);
    const encoded = this.loadImageAsBase64(resolvedPath);
    return {
      type: "image",
      image: `data:${mimeType};base64,${encoded}`
    };
  }
};
2119
+
2120
+ // src/extractors/vision-toc-extractor.ts
2121
// Zod schema for a single vision-extraction response: whether a TOC was
// visible, the extracted markdown (null when absent), and whether the TOC
// appears to continue beyond the pages shown.
var VisionTocExtractionSchema = import_zod2.z.object({
  hasToc: import_zod2.z.boolean().describe("Whether a TOC is visible on these pages"),
  tocMarkdown: import_zod2.z.string().nullable().describe("Extracted TOC in markdown format, null if not found"),
  continuesOnNextPage: import_zod2.z.boolean().describe("Whether TOC continues beyond these pages")
});
2126
var VisionTocExtractor = class extends VisionLLMComponent {
  // Number of pages scanned in the first search window (default 10).
  firstBatchSize;
  // Number of pages scanned in the second/continuation window (default 10).
  secondBatchSize;
  constructor(logger, model, outputPath, options, fallbackModel, aggregator) {
    super(
      logger,
      model,
      "VisionTocExtractor",
      outputPath,
      options,
      fallbackModel,
      aggregator ?? new LLMTokenUsageAggregator()
    );
    this.firstBatchSize = options?.firstBatchSize ?? 10;
    this.secondBatchSize = options?.secondBatchSize ?? 10;
  }
  /**
   * Extract TOC from page images
   *
   * Searches the first batch of pages, then a second batch if nothing was
   * found. When a TOC is found in the first batch and reported as
   * continuing, exactly one continuation batch is extracted and merged.
   *
   * @param totalPages - Total number of pages in the document
   * @returns Extracted TOC markdown or null if not found
   */
  async extract(totalPages) {
    this.log("info", `Starting TOC extraction from ${totalPages} pages`);
    if (totalPages === 0) {
      this.log("info", "No pages to search");
      return null;
    }
    const firstBatchEnd = Math.min(this.firstBatchSize, totalPages);
    this.log("info", `Searching first batch: pages 1-${firstBatchEnd}`);
    const firstResult = await this.extractFromBatch(1, firstBatchEnd);
    if (firstResult.hasToc && firstResult.tocMarkdown) {
      if (firstResult.continuesOnNextPage && firstBatchEnd < totalPages) {
        this.log("info", "TOC continues on next pages, extracting more");
        const continuationEnd = Math.min(
          firstBatchEnd + this.secondBatchSize,
          totalPages
        );
        const continuationResult = await this.extractFromBatch(
          firstBatchEnd + 1,
          continuationEnd
        );
        if (continuationResult.hasToc && continuationResult.tocMarkdown) {
          const merged = this.mergeMarkdown(
            firstResult.tocMarkdown,
            continuationResult.tocMarkdown
          );
          this.aggregator.logSummary(this.logger);
          this.log(
            "info",
            `TOC extracted with continuation (${merged.length} chars)`
          );
          return merged;
        }
        // Continuation came back empty: fall through and return the
        // first-batch TOC on its own.
      }
      this.aggregator.logSummary(this.logger);
      this.log(
        "info",
        `TOC found in first batch (${firstResult.tocMarkdown.length} chars)`
      );
      return firstResult.tocMarkdown;
    }
    if (firstBatchEnd < totalPages) {
      const secondBatchStart = firstBatchEnd + 1;
      const secondBatchEnd = Math.min(
        firstBatchEnd + this.secondBatchSize,
        totalPages
      );
      this.log(
        "info",
        `Searching second batch: pages ${secondBatchStart}-${secondBatchEnd}`
      );
      const secondResult = await this.extractFromBatch(
        secondBatchStart,
        secondBatchEnd
      );
      if (secondResult.hasToc && secondResult.tocMarkdown) {
        this.aggregator.logSummary(this.logger);
        this.log(
          "info",
          `TOC found in second batch (${secondResult.tocMarkdown.length} chars)`
        );
        return secondResult.tocMarkdown;
      }
    }
    this.aggregator.logSummary(this.logger);
    this.log("info", "TOC not found in any batch");
    return null;
  }
  /**
   * Extract TOC from a specific batch of pages.
   *
   * Delegates the LLM call (and token-usage tracking) to the inherited
   * callVisionLLM() helper instead of duplicating the LLMCaller.callVision
   * plumbing inline; this.componentName is "VisionTocExtractor", matching
   * the previously hard-coded component tag.
   */
  async extractFromBatch(startPage, endPage) {
    this.log("info", `Extracting from pages ${startPage}-${endPage}`);
    const imageContents = this.loadPageImages(startPage, endPage);
    const result = await this.callVisionLLM(
      VisionTocExtractionSchema,
      [
        {
          role: "user",
          content: [
            {
              type: "text",
              text: this.buildUserPrompt(startPage, endPage)
            },
            ...imageContents
          ]
        }
      ],
      "extraction"
    );
    return result.output;
  }
  /**
   * Load page images for a 1-based page range and build message content.
   *
   * Page image files are 0-indexed ("pages/page_<n-1>.png") and resolved
   * against outputPath with the default "image/png" MIME type by the
   * inherited buildImageContent() helper (identical to the previous
   * inline fs/path logic).
   */
  loadPageImages(startPage, endPage) {
    const imageContents = [];
    for (let pageNo = startPage; pageNo <= endPage; pageNo++) {
      imageContents.push(
        this.buildImageContent(`pages/page_${pageNo - 1}.png`)
      );
    }
    return imageContents;
  }
  /**
   * Merge markdown from two batches: trim each side, join with a newline.
   */
  mergeMarkdown(first, continuation) {
    return `${first.trim()}
${continuation.trim()}`;
  }
  /**
   * Build system prompt (unused in the vision flow, but required by the
   * abstract base class).
   */
  buildSystemPrompt() {
    return "";
  }
  /**
   * Build user prompt with page range information. The escaped \uXXXX
   * sequences are Korean keywords for TOC headings and supplementary
   * indices.
   */
  buildUserPrompt(startPage, endPage) {
    const pageCount = endPage - startPage + 1;
    return `You are a document analysis specialist. Your task is to find and extract the Table of Contents (TOC) from document page images.

I am providing ${pageCount} document page images (pages ${startPage}-${endPage}).

## Where to Look for TOC:
- TOC typically appears in the first 10-20 pages of a document
- Look for pages with headings like "\uBAA9\uCC28", "\uCC28\uB840", "Contents", "Table of Contents"
- Look for structured lists with chapter titles and page numbers

## What to Extract:
Extract the TOC content as markdown format that matches this exact structure:
- Use "- " prefix for each list item
- Use 2-space indentation for hierarchy levels
- Include "..... " followed by page number at the end of each entry
- Preserve original chapter/section numbering from the document

## Output Format Example:
\`\`\`
- \uC81C1\uC7A5 \uC11C\uB860 ..... 1
- 1. \uC5F0\uAD6C \uBC30\uACBD ..... 3
- 2. \uC5F0\uAD6C \uBAA9\uC801 ..... 5
- \uC81C2\uC7A5 \uC5F0\uAD6C \uBC29\uBC95 ..... 10
- 1. \uC870\uC0AC \uC9C0\uC5ED ..... 10
- 2. \uC870\uC0AC \uBC29\uBC95 ..... 15
- \uC81C3\uC7A5 \uC5F0\uAD6C \uACB0\uACFC ..... 25
\`\`\`

## Important Rules:
1. Extract ONLY the main document TOC
2. DO NOT include supplementary indices:
- Photo indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28)
- Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28)
- Figure indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28)
3. If no TOC is found, set hasToc to false and tocMarkdown to null
4. Set continuesOnNextPage to true if the TOC appears to continue beyond the visible pages

Please examine these pages and:
1. Determine if any page contains a Table of Contents (TOC)
2. If found, extract the complete TOC in markdown format
3. Indicate if the TOC continues beyond these pages

Remember: Extract the main document TOC only. Ignore photo/table/figure indices.`;
  }
};
2329
+
2330
+ // src/parsers/caption-parser.ts
2331
+ var import_zod3 = require("zod");
2332
// Response schema for single-caption mode: just the extracted prefix+number.
var CaptionSingleSchema = import_zod3.z.object({
  num: import_zod3.z.string().nullable().describe('Extracted caption prefix + number (e.g., "\uB3C4\uD310 1", "Figure 2")')
});
// Response schema for one item of a batch, keyed by the caption's original
// position in the input array.
var CaptionExtractionSchema = import_zod3.z.object({
  index: import_zod3.z.number().int().describe("Index of the caption in the input array"),
  num: import_zod3.z.string().nullable().describe('Extracted caption prefix + number (e.g., "\uB3C4\uD310 1", "Figure 2")')
});
// Response schema for a whole batch of captions.
var CaptionBatchSchema = import_zod3.z.object({
  results: import_zod3.z.array(CaptionExtractionSchema)
});
2342
var CaptionParser = class extends TextLLMComponent {
  constructor(logger, model, options, fallbackModel, aggregator) {
    super(
      logger,
      model,
      options?.componentName ?? "CaptionParser",
      options,
      fallbackModel,
      aggregator ?? new LLMTokenUsageAggregator()
    );
  }
  /**
   * Parse batch of captions
   *
   * @param captions - Array of caption full texts
   * @param batchSize - Batch size for processing. Set to 0 for sequential processing without batching.
   * @param overrideModel - Optional model to use instead of the default model
   * @returns Array of Caption objects with num extracted (maintains original order)
   * @throws {CaptionParseError} When the underlying LLM call fails
   */
  async parseBatch(captions, batchSize, overrideModel) {
    const effectiveModel = overrideModel ?? this.model;
    const isOverride = overrideModel !== void 0;
    const modelName = effectiveModel.modelId ?? effectiveModel.id ?? "unknown";
    this.log(
      "info",
      `Starting caption parsing for ${captions.length} captions with ${isOverride ? "override " : ""}model: ${modelName}`
    );
    if (captions.length === 0) {
      this.log("info", "No captions to parse");
      return [];
    }
    try {
      if (batchSize === 0) {
        // Sequential mode: one LLM call per caption, preserving order.
        this.log("info", "Using sequential processing (batchSize=0)");
        const results2 = [];
        for (let i = 0; i < captions.length; i++) {
          const fullText = captions[i];
          this.log("info", `Processing ${i + 1} / ${captions.length}...`);
          const result = await LLMCaller.call({
            schema: CaptionSingleSchema,
            systemPrompt: this.buildSystemPrompt("single"),
            userPrompt: this.buildUserPromptSingle(fullText),
            primaryModel: effectiveModel,
            fallbackModel: this.fallbackModel,
            maxRetries: this.maxRetries,
            temperature: this.temperature,
            abortSignal: this.abortSignal,
            component: this.componentName,
            phase: "caption-extraction"
          });
          this.trackUsage(result.usage);
          const finalNum = this.extractNumFromFullText(
            fullText,
            result.output.num
          );
          results2.push({ fullText, num: finalNum });
        }
        this.aggregator.logSummary(this.logger);
        this.log(
          "info",
          `Completed: ${results2.length} captions parsed, ${results2.filter((r) => r.num).length} with extracted numbers`
        );
        return results2;
      }
      // Batch mode: tag each caption with its original index so results can
      // be reassembled in input order after batch processing.
      const indexedCaptions = captions.map((text, index) => ({ index, text }));
      const batchResults = await BatchProcessor.processBatch(
        indexedCaptions,
        batchSize,
        async (batch) => this.parseBatchInternal(batch, effectiveModel)
      );
      batchResults.sort((a, b) => a.index - b.index);
      const results = batchResults.map((r) => r.caption);
      this.aggregator.logSummary(this.logger);
      this.log(
        "info",
        `Completed: ${results.length} captions parsed, ${results.filter((r) => r.num).length} with extracted numbers`
      );
      return results;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      this.log("error", `Parsing failed: ${message}`);
      throw new CaptionParseError(`Failed to parse captions: ${message}`, {
        cause: error
      });
    }
  }
  /**
   * Internal: Parse batch of captions using LLM
   *
   * @param captions - Batch of caption texts with original indices
   * @param model - Effective model to use
   * @returns Array of Caption objects indexed correctly
   */
  async parseBatchInternal(captions, model) {
    const result = await LLMCaller.call({
      schema: CaptionBatchSchema,
      systemPrompt: this.buildSystemPrompt(),
      userPrompt: this.buildUserPrompt(captions),
      primaryModel: model,
      fallbackModel: this.fallbackModel,
      maxRetries: this.maxRetries,
      temperature: this.temperature,
      abortSignal: this.abortSignal,
      component: this.componentName,
      phase: "caption-extraction"
    });
    this.trackUsage(result.usage);
    if (result.output.results.length !== captions.length) {
      this.log(
        "warn",
        `LLM returned ${result.output.results.length} results for ${captions.length} captions. This may cause index mismatch.`
      );
    }
    const captionMap = new Map(captions.map((c) => [c.index, c.text]));
    return result.output.results.map((resultItem) => {
      // FIX: the LLM echoes back the ORIGINAL index shown in the prompt
      // ("[<index>] <text>"), so the text must be resolved through the
      // index map. The previous positional lookup `captions[resultItem.index]`
      // was only correct for the first batch (where position == index) and
      // could silently remap an entry to the wrong caption whenever the
      // model returned an index that happened to be a valid position in a
      // later batch's slice.
      const originalIndex = resultItem.index;
      const fullText = captionMap.get(originalIndex) || "";
      const finalNum = this.extractNumFromFullText(fullText, resultItem.num);
      return {
        index: originalIndex,
        caption: {
          fullText,
          num: finalNum
        }
      };
    });
  }
  /**
   * Extract and normalize caption number from full text
   *
   * Finds the extracted num pattern in the full text (first exact, then
   * case-insensitive) and returns the substring with the text's original
   * casing; falls back to the LLM's value when no match is found.
   *
   * @param fullText - The full caption text
   * @param extractedNum - The num extracted by LLM (may have different casing)
   * @returns Normalized num or undefined if the LLM returned null/empty
   */
  extractNumFromFullText(fullText, extractedNum) {
    if (!extractedNum) return void 0;
    let matchIndex = fullText.indexOf(extractedNum);
    if (matchIndex === -1) {
      const lowerFullText = fullText.toLowerCase();
      const lowerNum = extractedNum.toLowerCase();
      matchIndex = lowerFullText.indexOf(lowerNum);
      if (matchIndex !== -1) {
        return fullText.substring(matchIndex, matchIndex + extractedNum.length);
      }
      return extractedNum;
    }
    return fullText.substring(matchIndex, matchIndex + extractedNum.length);
  }
  /**
   * Build system prompt for caption parsing. The escaped \uXXXX sequences
   * are Korean caption prefixes (e.g. plate/photo/figure/table words).
   *
   * @param mode - 'batch' for multiple captions, 'single' for single caption
   */
  buildSystemPrompt(mode = "batch") {
    const intro = mode === "batch" ? 'Extract the caption prefix and number (e.g., "\uB3C4\uD310 1", "Figure 2") from image/table captions.\nReturn the prefix + number part as a string, or null if no number exists.' : 'Extract the caption prefix and number (e.g., "\uB3C4\uD310 1", "Figure 2") from an image/table caption.\nReturn the prefix + number part as a string, or null if no number exists.';
    return `You are a caption prefix extractor for archaeological excavation reports.

${intro}

Rules:
1. Extract if the text follows a caption pattern: <prefix word(s)> <number>
- The prefix can be ANY Korean/English word(s) that label images/tables/figures
- Common examples: \uB3C4\uD310, \uC0AC\uC9C4, \uADF8\uB9BC, \uB3C4\uBA74, \uD45C, \uC6D0\uC0C9\uC0AC\uC9C4, \uD751\uBC31\uC0AC\uC9C4, Figure, Photo, Plate, etc.
- The key is the PATTERN (text followed by number), not a specific word list
- "\uC6D0\uC0C9\uC0AC\uC9C4 1. \uC870\uC0AC\uC9C0\uC5ED" \u2192 "\uC6D0\uC0C9\uC0AC\uC9C4 1" (valid: prefix + number pattern)
- "\uD751\uBC31\uC0AC\uC9C4 2 \uCD9C\uD1A0\uC720\uBB3C" \u2192 "\uD751\uBC31\uC0AC\uC9C4 2" (valid: prefix + number pattern)
2. IGNORE leading punctuation/brackets when extracting:
- "(\uC0AC\uC9C4 16> \uB290\uD2F0\uB098\uBB34" \u2192 "\uC0AC\uC9C4 16" (ignore leading '(' and extract the pattern inside)
- "<\uB3C4\uD310 1> \uC720\uC801" \u2192 "\uB3C4\uD310 1" (ignore angle brackets)
- "[\uADF8\uB9BC 2] \uC804\uACBD" \u2192 "\uADF8\uB9BC 2" (ignore square brackets)
3. Do NOT extract (return null) if:
- It's a numbered list item starting with just a number: "1. \uC720\uC801 \uC804\uACBD" \u2192 null
- It's a date/time reference: "39 3\uC6D4 28\uC77C..." \u2192 null
- It's a year reference: "2024\uB144 \uC870\uC0AC \uD604\uD669" \u2192 null
- It starts with a number without a prefix: "123 \uC124\uBA85" \u2192 null
4. PRESERVE original spacing from the input text exactly (after ignoring leading punctuation)
5. Include the full number (e.g., "1-2", "3a") not just the first digit
6. Include period/dot after number if it directly follows (e.g., "3.6" \u2192 "\uB3C4\uD310 3.6")
- "\uADF8\uB9BC 3.6. \uD55C\uBC18\uB3C4 \uC911\uBD80" \u2192 "\uADF8\uB9BC 3.6" (period after decimal number included)
- "\uB3C4\uD310 2. \uC720\uC801" \u2192 "\uB3C4\uD310 2" (period after space NOT included)
7. Stop at the first punctuation (except decimal point), whitespace, or underscore after the number
- "\uC0AC\uC9C4 1_\u3147\u3147\u3147" \u2192 "\uC0AC\uC9C4 1" (stop at underscore)
- "\uC0AC\uC9C4 1 \u3147\u3147\u3147" \u2192 "\uC0AC\uC9C4 1" (stop at space)
- "\uADF8\uB9BC 3.6. \uD55C\uBC18\uB3C4" \u2192 "\uADF8\uB9BC 3.6" (period included as decimal separator)

Examples:
- "\uB3C4\uD310 1 \uC720\uC801 \uC804\uACBD" \u2192 "\uB3C4\uD310 1"
- "\uC6D0\uC0C9\uC0AC\uC9C4 1. \uC870\uC0AC\uC9C0\uC5ED \uC6D0\uACBD" \u2192 "\uC6D0\uC0C9\uC0AC\uC9C4 1"
- "\uD751\uBC31\uC0AC\uC9C4 2 \uCD9C\uD1A0\uC720\uBB3C" \u2192 "\uD751\uBC31\uC0AC\uC9C4 2"
- "(\uC0AC\uC9C4 16> \uB290\uD2F0\uB098\uBB34\uC758 \uC811\uC120\uB2E8\uBA74" \u2192 "\uC0AC\uC9C4 16" (ignore leading punctuation)
- "<\uB3C4\uD310 3> \uC720\uBB3C \uC0AC\uC9C4" \u2192 "\uB3C4\uD310 3" (ignore angle brackets)
- "\uB3C4\uD3101 \uC5B4\uCA4C\uAD6C" \u2192 "\uB3C4\uD3101" (no space preserved)
- "\uC0AC\uC9C4 2. \uCD9C\uD1A0 \uC720\uBB3C" \u2192 "\uC0AC\uC9C4 2" (period after space, not included)
- "\uADF8\uB9BC 3.6. \uD55C\uBC18\uB3C4 \uC911\uBD80 \uBC0F \uB0A8\uBD80\uC758 \u3147\u3147\u3147" \u2192 "\uADF8\uB9BC 3.6" (period as decimal included)
- "Figure 3: Site plan" \u2192 "Figure 3"
- "Table 4a. Artifact list" \u2192 "Table 4a"
- "\uB3C4\uD310 5-2 \uCE35\uC704 \uB2E8\uBA74" \u2192 "\uB3C4\uD310 5-2"
- "\uC124\uBA85 \uC5C6\uB294 \uC774\uBBF8\uC9C0" \u2192 null
- "39 3\uC6D4 28\uC77C(\uBC31\uC81C \uB3C4\uB85C\uC720\uAD6C \uB0B4\uBD80 \uC870\uC0AC)" \u2192 null (starts with number, no prefix)
- "1. \uC720\uAD6C \uD604\uD669" \u2192 null (numbered list, not caption)
- "2024-05-01 \uCD2C\uC601" \u2192 null (date, not caption)`;
  }
  /**
   * Build user prompt for batch caption parsing; each caption is listed
   * as "[<original index>] <text>" so the model can echo the index back.
   */
  buildUserPrompt(captions) {
    const captionList = captions.map((c) => `[${c.index}] ${c.text}`).join("\n");
    return `Extract caption prefix and number from the following captions:

${captionList}

Return the results as JSON array with "index" (original position) and "num" (extracted prefix + number or null).

Example format:
[
{ "index": 0, "num": "\uB3C4\uD310 1" },
{ "index": 1, "num": "Figure 2" },
{ "index": 2, "num": null }
]`;
  }
  /**
   * Build user prompt for single caption parsing.
   */
  buildUserPromptSingle(caption) {
    return `Extract caption prefix and number from the following caption:

"${caption}"

CRITICAL: Return ONLY the JSON object directly with a "num" field.
- DO NOT wrap the JSON in quotes or additional formatting
- DO NOT output "final:", "result:", or any prefix labels
- DO NOT wrap in backticks or code blocks
- Return ONLY valid JSON: { "num": value }

The value must be:
- A string with the extracted caption prefix + number (e.g., "\uB3C4\uD310 1", "Figure 2")
- null if no number exists

Valid outputs:
{ "num": "\uB3C4\uD310 1" }
{ "num": null }

Invalid outputs (NEVER do these):
- { "final": "..." } \u274C
- \`\`\`json { "num": "..." } \`\`\` \u274C
- "{ "num": "..." }" \u274C
- { "num": { "value": "..." } } \u274C`;
  }
};
2595
/**
 * Error raised when caption parsing fails.
 *
 * Carries the standard ErrorOptions (including `cause`) through to the
 * base Error constructor and tags the error with a distinct name.
 */
var CaptionParseError = class extends Error {
  constructor(message, options) {
    super(message, options);
    this.name = "CaptionParseError";
  }
};
2601
+
2602
+ // src/parsers/page-range-parse-error.ts
2603
/**
 * Error raised when page-range parsing fails.
 */
var PageRangeParseError = class _PageRangeParseError extends Error {
  constructor(message, options) {
    super(message, options);
    this.name = "PageRangeParseError";
  }
  /**
   * Extract a readable message from an unknown thrown value.
   */
  static getErrorMessage(error) {
    if (error instanceof Error) {
      return error.message;
    }
    return String(error);
  }
  /**
   * Wrap an unknown error with a context prefix, preserving the original
   * as `cause`.
   */
  static fromError(context, error) {
    const detail = _PageRangeParseError.getErrorMessage(error);
    return new _PageRangeParseError(`${context}: ${detail}`, { cause: error });
  }
};
2624
+
2625
+ // src/parsers/page-range-parser.ts
2626
+ var fs3 = __toESM(require("fs"), 1);
2627
+ var path3 = __toESM(require("path"), 1);
2628
+ var import_zod4 = require("zod");
2629
// String-valued enum of detected page-numbering patterns (shaped like a
// transpiled TS enum: an IIFE populating a plain object).
var PagePattern = /* @__PURE__ */ ((values) => {
  values["SIMPLE_INCREMENT"] = "simple_increment";
  values["DOUBLE_SIDED"] = "double_sided";
  values["OFFSET"] = "offset";
  values["UNKNOWN"] = "unknown";
  return values;
})(PagePattern || {});
2636
+ var PageRangeParser = class extends VisionLLMComponent {
2637
  // Configuration constants
  // Pages sampled per pattern-detection attempt.
  SAMPLE_SIZE = 3;
  // Extra attempts allowed after the first failed pattern detection.
  MAX_PATTERN_RETRIES = 6;
  // Quantization step used when bucketing page dimensions for comparison
  // (see createSizeKey).
  SIZE_TOLERANCE = 5;
  /**
   * @param maxRetries - Per-LLM-call retry limit (default 3)
   *
   * Remaining parameters are forwarded to VisionLLMComponent; a fresh
   * LLMTokenUsageAggregator is created when none is supplied.
   */
  constructor(logger, model, outputPath, maxRetries = 3, fallbackModel, aggregator, abortSignal) {
    super(
      logger,
      model,
      "PageRangeParser",
      outputPath,
      { maxRetries, abortSignal },
      fallbackModel,
      aggregator ?? new LLMTokenUsageAggregator()
    );
  }
2652
  /**
   * Main parse method
   *
   * Extracts page range mapping from DoclingDocument using Vision LLM.
   * Automatically tracks token usage in the aggregator if one was provided.
   *
   * Pages are grouped by (tolerance-bucketed) physical size; each group is
   * processed independently and the per-group maps are merged into one
   * result.
   *
   * @param doclingDoc - DoclingDocument to extract page ranges from
   * @returns Object with page range mapping and token usage information
   */
  async parse(doclingDoc) {
    this.log("info", "Starting page range parsing...");
    const pages = this.extractPages(doclingDoc);
    if (pages.length === 0) {
      this.log("warn", "No pages found");
      // Record a zero-work usage entry so downstream accounting still sees
      // a "sampling" phase for this run.
      const emptyUsage = this.createEmptyUsage("sampling");
      this.trackUsage(emptyUsage);
      return {
        pageRangeMap: {},
        usage: [emptyUsage]
      };
    }
    const sizeGroups = this.analyzeSizes(pages);
    this.log(
      "info",
      `Found ${sizeGroups.length} size group(s), total ${pages.length} pages`
    );
    const pageRangeMap = {};
    const usageList = [];
    for (let i = 0; i < sizeGroups.length; i++) {
      const group = sizeGroups[i];
      this.log(
        "info",
        `Processing group ${i + 1}/${sizeGroups.length}: ${group.pageNos.length} pages`
      );
      const groupResult = await this.processGroup(pages, group, this.model);
      // Later groups win on key collisions, but groups cover disjoint
      // page numbers so collisions should not occur in practice.
      Object.assign(pageRangeMap, groupResult.pageRangeMap);
      usageList.push(...groupResult.usage);
    }
    for (const usage of usageList) {
      this.trackUsage(usage);
    }
    // NOTE(review): postProcess is a sibling method not shown here; it
    // appears to adjust the map in place — confirm its contract.
    this.postProcess(pageRangeMap);
    this.log(
      "info",
      `Completed: ${Object.keys(pageRangeMap).length} pages mapped`
    );
    return { pageRangeMap, usage: usageList };
  }
2700
+ /**
2701
+ * Extract pages array from DoclingDocument
2702
+ */
2703
+ extractPages(doclingDoc) {
2704
+ const pageKeys = Object.keys(doclingDoc.pages).map(Number).filter((n) => !Number.isNaN(n)).sort((a, b) => a - b);
2705
+ return pageKeys.map((key) => doclingDoc.pages[String(key)]);
2706
+ }
2707
+ /**
2708
+ * Analyze page sizes and group consecutive pages with same dimensions
2709
+ */
2710
+ analyzeSizes(pages) {
2711
+ const groups = [];
2712
+ let currentGroup = null;
2713
+ for (const page of pages) {
2714
+ const sizeKey = this.createSizeKey(page.size.width, page.size.height);
2715
+ if (!currentGroup || currentGroup.sizeKey !== sizeKey) {
2716
+ currentGroup = { sizeKey, pageNos: [page.page_no] };
2717
+ groups.push(currentGroup);
2718
+ } else {
2719
+ currentGroup.pageNos.push(page.page_no);
2720
+ }
2721
+ }
2722
+ return groups;
2723
+ }
2724
+ /**
2725
+ * Create size key with tolerance for floating point comparison
2726
+ */
2727
+ createSizeKey(width, height) {
2728
+ const roundedWidth = Math.round(width / this.SIZE_TOLERANCE);
2729
+ const roundedHeight = Math.round(height / this.SIZE_TOLERANCE);
2730
+ return `${roundedWidth}x${roundedHeight}`;
2731
+ }
2732
+ /**
2733
+ * Process a single size group
2734
+ */
2735
+ async processGroup(pages, group, model) {
2736
+ const { pageNos } = group;
2737
+ const usageList = [];
2738
+ if (pageNos.length <= this.SAMPLE_SIZE) {
2739
+ this.log(
2740
+ "info",
2741
+ `Small group (${pageNos.length} pages), extracting all at once`
2742
+ );
2743
+ const result = await this.extractMultiplePages(pages, pageNos, model);
2744
+ usageList.push(result.usage);
2745
+ return {
2746
+ pageRangeMap: this.samplesToMap(result.samples),
2747
+ usage: usageList
2748
+ };
2749
+ }
2750
+ const sampledPages = /* @__PURE__ */ new Set();
2751
+ for (let attempt = 0; attempt <= this.MAX_PATTERN_RETRIES; attempt++) {
2752
+ const samplePageNos = this.selectRandomSamples(
2753
+ pageNos,
2754
+ this.SAMPLE_SIZE,
2755
+ sampledPages
2756
+ );
2757
+ for (const p of samplePageNos) {
2758
+ sampledPages.add(p);
2759
+ }
2760
+ this.log(
2761
+ "info",
2762
+ `Attempt ${attempt + 1}/${this.MAX_PATTERN_RETRIES + 1}: sampling pages ${samplePageNos.join(", ")}`
2763
+ );
2764
+ const result = await this.extractMultiplePages(
2765
+ pages,
2766
+ samplePageNos,
2767
+ model
2768
+ );
2769
+ usageList.push(result.usage);
2770
+ const samples = result.samples;
2771
+ const pattern = this.detectPattern(samples);
2772
+ if (pattern.pattern !== "unknown" /* UNKNOWN */) {
2773
+ this.log(
2774
+ "info",
2775
+ `Pattern detected: ${pattern.pattern} (offset=${pattern.offset}, increment=${pattern.increment})`
2776
+ );
2777
+ return {
2778
+ pageRangeMap: this.applyPattern(pageNos, pattern),
2779
+ usage: usageList
2780
+ };
2781
+ }
2782
+ this.log(
2783
+ "warn",
2784
+ `Pattern detection failed, attempt ${attempt + 1}/${this.MAX_PATTERN_RETRIES + 1}`
2785
+ );
2786
+ }
2787
+ throw new PageRangeParseError(
2788
+ `Failed to detect page pattern after ${this.MAX_PATTERN_RETRIES + 1} attempts for size group with ${pageNos.length} pages`
2789
+ );
2790
+ }
2791
+ /**
2792
+ * Select random samples from page numbers
2793
+ */
2794
+ selectRandomSamples(pageNos, count, exclude = /* @__PURE__ */ new Set()) {
2795
+ const available = pageNos.filter((p) => !exclude.has(p));
2796
+ const pool = available.length >= count ? available : pageNos;
2797
+ const shuffled = [...pool];
2798
+ for (let i = shuffled.length - 1; i > 0; i--) {
2799
+ const j = Math.floor(Math.random() * (i + 1));
2800
+ [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
2801
+ }
2802
+ return shuffled.slice(0, count).sort((a, b) => a - b);
2803
+ }
2804
  /**
   * Extract page numbers from multiple pages in a single LLM vision call.
   *
   * Reads each page image from disk, inlines it as a base64 data URL, and
   * asks the vision model for a structured per-image result.
   *
   * @param pages - All parsed pages; page N is read from pages[N - 1]
   * @param pageNos - 1-based PDF page numbers to extract
   * @param model - Primary vision model
   * @returns { samples, usage } where each sample maps an image back to its
   *          PDF page number via the model-reported imageIndex
   * @throws PageRangeParseError wrapping any extraction failure
   */
  async extractMultiplePages(pages, pageNos, model) {
    this.log("info", `Extracting ${pageNos.length} pages in single LLM call`);
    const imageContents = [];
    for (const pageNo of pageNos) {
      // pageNos are 1-based; pages[] is 0-based.
      const page = pages[pageNo - 1];
      const imagePath = path3.resolve(this.outputPath, page.image.uri);
      const imageBuffer = fs3.readFileSync(imagePath);
      const base64Image = imageBuffer.toString("base64");
      const mimeType = page.image.mimetype || "image/png";
      imageContents.push({
        type: "image",
        image: `data:${mimeType};base64,${base64Image}`
      });
    }
    // Structured-output schema: one entry per supplied image.
    const schema = import_zod4.z.object({
      pages: import_zod4.z.array(
        import_zod4.z.object({
          imageIndex: import_zod4.z.number().describe("0-based index of the image in the request"),
          startPageNo: import_zod4.z.number().nullable().describe("Start page number (null if not found)"),
          endPageNo: import_zod4.z.number().nullable().describe(
            "End page number for double-sided scans (null for single page)"
          )
        })
      ).describe("Extracted page numbers for each image")
    });
    try {
      const result = await LLMCaller.callVision({
        schema,
        messages: [
          {
            role: "user",
            content: [
              { type: "text", text: this.buildUserPrompt(pageNos) },
              ...imageContents
            ]
          }
        ],
        primaryModel: model,
        fallbackModel: this.fallbackModel,
        maxRetries: this.maxRetries,
        temperature: 0,
        abortSignal: this.abortSignal,
        component: "PageRangeParser",
        phase: "sampling"
      });
      // Map model output back to PDF page numbers via imageIndex.
      // NOTE(review): an out-of-range imageIndex would yield undefined here
      // — relies on the schema/model honoring the 0-based contract.
      const samples = result.output.pages.map((p) => ({
        pdfPageNo: pageNos[p.imageIndex],
        startPageNo: p.startPageNo,
        endPageNo: p.endPageNo
      }));
      return { samples, usage: result.usage };
    } catch (error) {
      this.log("error", "Multi-image extraction failed:", error);
      throw PageRangeParseError.fromError(
        "Multi-image extraction failed",
        error
      );
    }
  }
2866
  /**
   * Detect a page-numbering pattern from sample results.
   *
   * Tries, in order: simple increment (one document page per PDF page),
   * double-sided (two document pages per PDF page), then a tolerant
   * constant-offset fit (each sample within +/-1 of the average offset).
   * Falls back to UNKNOWN when fewer than 2 usable samples exist or no
   * rule fits.
   *
   * @param samples - Per-page extraction results (startPageNo may be null)
   * @returns { pattern, offset, increment }
   */
  detectPattern(samples) {
    const validSamples = samples.filter((s) => s.startPageNo !== null);
    if (validSamples.length < 2) {
      return { pattern: "unknown" /* UNKNOWN */, offset: 0, increment: 1 };
    }
    // Sort in place by PDF page so consecutive comparisons are meaningful.
    validSamples.sort((a, b) => a.pdfPageNo - b.pdfPageNo);
    const isSimple = validSamples.every((s, i) => {
      // A distinct endPageNo means a spread, which breaks "simple".
      if (s.endPageNo !== null && s.startPageNo !== s.endPageNo) return false;
      if (i === 0) return true;
      const prev = validSamples[i - 1];
      const expectedIncrease = s.pdfPageNo - prev.pdfPageNo;
      return s.startPageNo === prev.startPageNo + expectedIncrease;
    });
    if (isSimple) {
      const firstSample = validSamples[0];
      const offset = firstSample.startPageNo - firstSample.pdfPageNo;
      return { pattern: "simple_increment" /* SIMPLE_INCREMENT */, offset, increment: 1 };
    }
    const isDoubleSided = validSamples.every((s, i) => {
      // Every sample must be a two-page spread (start, start+1).
      if (s.endPageNo === null) return false;
      if (s.endPageNo !== s.startPageNo + 1) return false;
      if (i === 0) return true;
      const prev = validSamples[i - 1];
      const pdfDiff = s.pdfPageNo - prev.pdfPageNo;
      const expectedStartDiff = pdfDiff * 2;
      const actualStartDiff = s.startPageNo - prev.startPageNo;
      return actualStartDiff === expectedStartDiff;
    });
    if (isDoubleSided) {
      const firstSample = validSamples[0];
      const offset = firstSample.startPageNo - firstSample.pdfPageNo * 2;
      return { pattern: "double_sided" /* DOUBLE_SIDED */, offset, increment: 2 };
    }
    // Tolerant fallback: accept a constant offset within +/-1 per sample.
    const offsets = validSamples.map((s) => s.startPageNo - s.pdfPageNo);
    const avgOffset = Math.round(
      offsets.reduce((a, b) => a + b, 0) / offsets.length
    );
    const isConsistentOffset = offsets.every(
      (o) => Math.abs(o - avgOffset) <= 1
    );
    if (isConsistentOffset) {
      return { pattern: "offset" /* OFFSET */, offset: avgOffset, increment: 1 };
    }
    return { pattern: "unknown" /* UNKNOWN */, offset: 0, increment: 1 };
  }
2914
+ /**
2915
+ * Apply detected pattern to generate page range map
2916
+ */
2917
+ applyPattern(pageNos, pattern) {
2918
+ const result = {};
2919
+ for (const pdfPageNo of pageNos) {
2920
+ switch (pattern.pattern) {
2921
+ case "simple_increment" /* SIMPLE_INCREMENT */:
2922
+ case "offset" /* OFFSET */: {
2923
+ const pageNo = pdfPageNo + pattern.offset;
2924
+ result[pdfPageNo] = {
2925
+ startPageNo: pageNo,
2926
+ endPageNo: pageNo
2927
+ };
2928
+ break;
2929
+ }
2930
+ case "double_sided" /* DOUBLE_SIDED */: {
2931
+ const start = pdfPageNo * 2 + pattern.offset;
2932
+ result[pdfPageNo] = {
2933
+ startPageNo: start,
2934
+ endPageNo: start + 1
2935
+ };
2936
+ break;
2937
+ }
2938
+ default:
2939
+ result[pdfPageNo] = { startPageNo: 0, endPageNo: 0 };
2940
+ }
2941
+ }
2942
+ return result;
2943
+ }
2944
+ /**
2945
+ * Convert sample results to page range map (for small groups)
2946
+ */
2947
+ samplesToMap(samples) {
2948
+ const result = {};
2949
+ for (const sample of samples) {
2950
+ if (sample.startPageNo !== null) {
2951
+ result[sample.pdfPageNo] = {
2952
+ startPageNo: sample.startPageNo,
2953
+ endPageNo: sample.endPageNo ?? sample.startPageNo
2954
+ };
2955
+ } else {
2956
+ result[sample.pdfPageNo] = { startPageNo: 0, endPageNo: 0 };
2957
+ }
2958
+ }
2959
+ return result;
2960
+ }
2961
  /**
   * Post-process the page range map in place.
   *
   * Order matters: outliers and drops are first corrected or marked as
   * failed (0), negatives are then normalized to 0, and finally all
   * failed entries are backfilled from the surviving pattern.
   */
  postProcess(pageRangeMap) {
    this.detectAndHandleOutliers(pageRangeMap);
    this.detectAndHandleDrops(pageRangeMap);
    this.normalizeNegatives(pageRangeMap);
    this.backfillFailedPages(pageRangeMap);
  }
2970
  /**
   * Detect and handle outlier page numbers at the beginning of document.
   *
   * When early PDF pages have abnormally high page numbers compared to
   * subsequent pages (e.g., PDF 1-9 = 75-83, but PDF 10+ = 2,3,4...),
   * the LLM likely misread figure/photo numbers as page numbers.
   *
   * Detection: If page numbers at the beginning are significantly higher
   * than subsequent pages (which follow a normal pattern), mark them as failed.
   */
  detectAndHandleOutliers(pageRangeMap) {
    const pdfPages = Object.keys(pageRangeMap).map(Number).sort((a, b) => a - b);
    if (pdfPages.length < 3) return;
    const normalSequenceStart = this.findNormalSequenceStart(
      pageRangeMap,
      pdfPages
    );
    // No normal run, or it starts at index 0 (nothing precedes it): done.
    if (normalSequenceStart === null || normalSequenceStart <= 0) return;
    const normalStartPdfPage = pdfPages[normalSequenceStart];
    const normalStartPageNo = pageRangeMap[normalStartPdfPage].startPageNo;
    let hasOutliers = false;
    // Walk the pages BEFORE the normal sequence and compare each against
    // the value extrapolated backwards from the sequence start.
    for (let i = 0; i < normalSequenceStart; i++) {
      const pdfPage = pdfPages[i];
      const pageNo = pageRangeMap[pdfPage].startPageNo;
      if (pageNo === 0) continue;
      const pdfDiff = normalStartPdfPage - pdfPage;
      const isDoubleSided = this.isDoubleSidedRange(
        pageRangeMap[normalStartPdfPage]
      );
      const expectedPageNo = isDoubleSided ? normalStartPageNo - pdfDiff * 2 : normalStartPageNo - pdfDiff;
      // Tolerance of 10: only clearly inflated values are treated as misreads.
      if (pageNo > expectedPageNo + 10) {
        this.log(
          "info",
          `Outlier detected: PDF ${pdfPage}=${pageNo} (expected ~${expectedPageNo})`
        );
        pageRangeMap[pdfPage] = { startPageNo: 0, endPageNo: 0 };
        hasOutliers = true;
      }
    }
    if (hasOutliers) {
      this.log("info", `Outliers marked as failed, will be backfilled later`);
    }
  }
3013
+ /**
3014
+ * Find the start index of a "normal" sequence in the page range map
3015
+ *
3016
+ * A normal sequence is defined as at least 3 consecutive PDF pages where:
3017
+ * - Page numbers are increasing (for single-page) or increasing by 2 (for double-sided)
3018
+ * - The pattern is consistent
3019
+ *
3020
+ * Returns the index in pdfPages array, or null if not found.
3021
+ */
3022
+ findNormalSequenceStart(pageRangeMap, pdfPages) {
3023
+ const MIN_SEQUENCE_LENGTH = 3;
3024
+ for (let startIdx = 0; startIdx <= pdfPages.length - MIN_SEQUENCE_LENGTH; startIdx++) {
3025
+ let isValidSequence = true;
3026
+ let expectedIncrement = null;
3027
+ for (let i = 0; i < MIN_SEQUENCE_LENGTH - 1; i++) {
3028
+ const currPdfPage = pdfPages[startIdx + i];
3029
+ const nextPdfPage = pdfPages[startIdx + i + 1];
3030
+ const currRange = pageRangeMap[currPdfPage];
3031
+ const nextRange = pageRangeMap[nextPdfPage];
3032
+ if (currRange.startPageNo === 0 || nextRange.startPageNo === 0) {
3033
+ isValidSequence = false;
3034
+ break;
3035
+ }
3036
+ const pageIncrement = nextRange.startPageNo - currRange.startPageNo;
3037
+ const pdfIncrement = nextPdfPage - currPdfPage;
3038
+ const isDoubleSided = this.isDoubleSidedRange(currRange);
3039
+ const expectedIncrementPerPdf = isDoubleSided ? 2 : 1;
3040
+ const expected = pdfIncrement * expectedIncrementPerPdf;
3041
+ if (expectedIncrement === null) {
3042
+ expectedIncrement = pageIncrement;
3043
+ }
3044
+ if (pageIncrement !== expected) {
3045
+ isValidSequence = false;
3046
+ break;
3047
+ }
3048
+ }
3049
+ if (isValidSequence) {
3050
+ return startIdx;
3051
+ }
3052
+ }
3053
+ return null;
3054
+ }
3055
+ /**
3056
+ * Check if a page range represents a double-sided scan
3057
+ */
3058
+ isDoubleSidedRange(range) {
3059
+ return range.endPageNo !== null && range.endPageNo !== range.startPageNo && range.endPageNo === range.startPageNo + 1;
3060
+ }
3061
  /**
   * Detect and handle page number drops.
   *
   * When page numbers suddenly decrease (e.g., 8,9 -> 3,4),
   * recalculate previous pages based on the drop point.
   */
  detectAndHandleDrops(pageRangeMap) {
    const pdfPages = Object.keys(pageRangeMap).map(Number).sort((a, b) => a - b);
    if (pdfPages.length < 2) return;
    for (let i = 1; i < pdfPages.length; i++) {
      const prevPdfPage = pdfPages[i - 1];
      const currPdfPage = pdfPages[i];
      const prevPageNo = pageRangeMap[prevPdfPage].startPageNo;
      const currPageNo = pageRangeMap[currPdfPage].startPageNo;
      // Failed entries (0) carry no signal for drop detection.
      if (prevPageNo === 0 || currPageNo === 0) continue;
      // A decrease of more than 1 counts as a drop (a decrease of exactly 1
      // could be legitimate layout noise and is left alone).
      if (currPageNo > 0 && prevPageNo > currPageNo && prevPageNo - currPageNo > 1) {
        this.log(
          "info",
          `Page drop detected: PDF ${prevPdfPage}=${prevPageNo} -> PDF ${currPdfPage}=${currPageNo}`
        );
        const isDoubleSided = this.isDoubleSidedRange(
          pageRangeMap[currPdfPage]
        );
        // Rewrite ALL earlier pages by extrapolating backwards from the
        // post-drop page, trusting the later (lower) numbering.
        for (let j = i - 1; j >= 0; j--) {
          const pdfPage = pdfPages[j];
          const distance = currPdfPage - pdfPage;
          if (isDoubleSided) {
            const expectedStartPageNo = currPageNo - distance * 2;
            // Extrapolation below 1 is impossible -> mark as failed.
            if (expectedStartPageNo < 1) {
              pageRangeMap[pdfPage] = { startPageNo: 0, endPageNo: 0 };
            } else {
              pageRangeMap[pdfPage] = {
                startPageNo: expectedStartPageNo,
                endPageNo: expectedStartPageNo + 1
              };
            }
          } else {
            const expectedPageNo = currPageNo - distance;
            if (expectedPageNo < 1) {
              pageRangeMap[pdfPage] = { startPageNo: 0, endPageNo: 0 };
            } else {
              pageRangeMap[pdfPage] = {
                startPageNo: expectedPageNo,
                endPageNo: expectedPageNo
              };
            }
          }
          this.log(
            "info",
            `Recalculated PDF ${pdfPage} -> ${pageRangeMap[pdfPage].startPageNo}`
          );
        }
      }
    }
  }
3116
+ /**
3117
+ * Normalize negative page numbers to 0
3118
+ */
3119
+ normalizeNegatives(pageRangeMap) {
3120
+ for (const [pdfPageStr, range] of Object.entries(pageRangeMap)) {
3121
+ if (range.startPageNo < 0 || range.endPageNo < 0) {
3122
+ this.log("info", `Normalizing negative: PDF ${pdfPageStr} -> 0`);
3123
+ pageRangeMap[Number(pdfPageStr)] = { startPageNo: 0, endPageNo: 0 };
3124
+ }
3125
+ }
3126
+ }
3127
  /**
   * Backfill pages marked with 0 using the pattern of successful pages.
   *
   * Picks single- vs double-sided by majority vote over successful pages,
   * averages their offsets, and fills each failed page from that average.
   * Pages whose extrapolated number would fall below 1 stay failed.
   * Requires at least 2 successful pages; otherwise does nothing.
   */
  backfillFailedPages(pageRangeMap) {
    const pdfPages = Object.keys(pageRangeMap).map(Number).sort((a, b) => a - b);
    const failedPages = pdfPages.filter(
      (p) => pageRangeMap[p].startPageNo === 0
    );
    if (failedPages.length === 0) return;
    const successfulPages = pdfPages.filter((p) => pageRangeMap[p].startPageNo > 0).map((p) => ({
      pdfPage: p,
      pageNo: pageRangeMap[p].startPageNo,
      isDoubleSided: this.isDoubleSidedRange(pageRangeMap[p])
    }));
    if (successfulPages.length < 2) {
      this.log("warn", "Not enough successful pages for backfill");
      return;
    }
    // Majority vote decides whether the document is double-sided.
    const doubleSidedCount = successfulPages.filter(
      (s) => s.isDoubleSided
    ).length;
    const isDoubleSided = doubleSidedCount > successfulPages.length / 2;
    if (isDoubleSided) {
      // Double-sided: startPageNo ~= pdfPage * 2 + offset.
      const offsets = successfulPages.map((s) => s.pageNo - s.pdfPage * 2);
      const avgOffset = Math.round(
        offsets.reduce((a, b) => a + b, 0) / offsets.length
      );
      this.log(
        "info",
        `Backfilling ${failedPages.length} pages with double-sided pattern (offset=${avgOffset})`
      );
      for (const pdfPage of failedPages) {
        const expectedStartPageNo = pdfPage * 2 + avgOffset;
        if (expectedStartPageNo < 1) {
          this.log(
            "info",
            `Backfill skipped for PDF ${pdfPage} (would be ${expectedStartPageNo})`
          );
          continue;
        }
        this.log(
          "info",
          `Backfill PDF ${pdfPage}: 0 -> ${expectedStartPageNo}-${expectedStartPageNo + 1}`
        );
        pageRangeMap[pdfPage] = {
          startPageNo: expectedStartPageNo,
          endPageNo: expectedStartPageNo + 1
        };
      }
    } else {
      // Single-sided: startPageNo ~= pdfPage + offset.
      const offsets = successfulPages.map((s) => s.pageNo - s.pdfPage);
      const avgOffset = Math.round(
        offsets.reduce((a, b) => a + b, 0) / offsets.length
      );
      this.log(
        "info",
        `Backfilling ${failedPages.length} pages with offset ${avgOffset}`
      );
      for (const pdfPage of failedPages) {
        const expectedPageNo = pdfPage + avgOffset;
        if (expectedPageNo < 1) {
          this.log(
            "info",
            `Backfill skipped for PDF ${pdfPage} (would be ${expectedPageNo})`
          );
          continue;
        }
        this.log("info", `Backfill PDF ${pdfPage}: 0 -> ${expectedPageNo}`);
        pageRangeMap[pdfPage] = {
          startPageNo: expectedPageNo,
          endPageNo: expectedPageNo
        };
      }
    }
  }
3202
  /**
   * Build the system prompt for the Vision LLM page-number extraction.
   *
   * The prompt string is runtime behavior and is kept byte-for-byte;
   * escaped \uXXXX sequences encode Korean label words (figure/photo/plate).
   */
  buildSystemPrompt() {
    return `You are a page number extraction specialist for document images.
You will receive multiple document page images. For EACH image, extract the visible page number(s).

**SCAN TYPES:**
1. SINGLE PAGE: One document page per image. Return startPageNo only, endPageNo should be null.
2. DOUBLE-SIDED: Two document pages per image (spread). Return startPageNo (left) and endPageNo (right).

**WHERE TO LOOK:**
- Bottom center, bottom corners (most common)
- Top corners (less common)
- Page numbers are SMALL numbers in MARGINS, NOT in content area

**WHAT TO IGNORE - These are NOT page numbers:**
- Roman numerals (i, ii, iii, iv, v...) - return null
- Figure numbers: "Figure 5", "Fig. 5", "\uB3C4 5", "\uADF8\uB9BC 5"
- Table numbers: "Table 3", "\uD45C 3"
- Photo numbers: "Photo 8", "\uC0AC\uC9C4 8", "Plate 4", "\uB3C4\uD310 4"
- Years in content: "2015", "(1998)"
- Any numbers with text prefix or inside content area

**RESPONSE FORMAT:**
For each image (in order), provide:
- imageIndex: 0-based index of the image
- startPageNo: The page number found (null if not visible/readable)
- endPageNo: Right page number for double-sided scans (null for single pages)`;
  }
3232
  /**
   * Build the user prompt for the Vision LLM.
   *
   * Lists the PDF page numbers so the model can echo them back by index;
   * the template literal is runtime behavior and kept byte-for-byte.
   */
  buildUserPrompt(pageNos) {
    return `I am providing ${pageNos.length} document page images.
These are PDF pages: ${pageNos.join(", ")}.

For each image (in order), extract the visible page number(s).
Return null for pages where no page number is visible or readable.

Remember: Look for SMALL numbers in MARGINS only. Ignore figure/table/photo numbers.`;
  }
3244
+ };
3245
+
3246
+ // src/validators/base-validator.ts
3247
// Shared base class for LLM-backed validators; adds aggregator routing on
// top of TextLLMComponent.
var BaseValidator = class extends TextLLMComponent {
  /**
   * Validator name for logging (kept for backwards compatibility)
   */
  validatorName;
  /**
   * Constructor for BaseValidator
   *
   * @param logger - Logger instance
   * @param model - Language model to use for validation
   * @param validatorName - Name of the validator for logging (e.g., "TocContentValidator")
   * @param options - Optional configuration (maxRetries, temperature)
   * @param fallbackModel - Optional fallback model for retry on failure
   * @param aggregator - Optional token usage aggregator for tracking LLM calls
   */
  constructor(logger, model, validatorName, options, fallbackModel, aggregator) {
    super(logger, model, validatorName, options, fallbackModel, aggregator);
    this.validatorName = validatorName;
  }
  /**
   * Call LLM with LLMCaller
   *
   * This method provides backwards compatibility with existing validators.
   * It wraps the parent callTextLLM method but allows passing a custom aggregator.
   *
   * @param schema - Zod schema for response validation
   * @param systemPrompt - System prompt
   * @param userPrompt - User prompt
   * @param phase - Phase name for tracking (e.g., 'validation', 'batch-validation')
   * @param aggregator - Optional token usage aggregator for tracking this call
   * @returns Parsed and validated LLM response with usage information
   */
  async callLLM(schema, systemPrompt, userPrompt, phase, aggregator) {
    const result = await LLMCaller.call({
      schema,
      systemPrompt,
      userPrompt,
      primaryModel: this.model,
      fallbackModel: this.fallbackModel,
      maxRetries: this.maxRetries,
      temperature: this.temperature,
      abortSignal: this.abortSignal,
      component: this.validatorName,
      phase
    });
    // Route usage to the explicit per-call aggregator when provided;
    // otherwise fall back to the component-level tracking.
    if (aggregator) {
      aggregator.track(result.usage);
    } else {
      this.trackUsage(result.usage);
    }
    return {
      output: result.output,
      usage: result.usage
    };
  }
};
3303
+
3304
+ // src/validators/toc-content-validator.ts
3305
+ var import_zod5 = require("zod");
3306
// Zod schema for the structured TOC-validation response returned by the LLM.
var TocContentValidationSchema = import_zod5.z.object({
  isToc: import_zod5.z.boolean().describe("Whether the content is a table of contents"),
  confidence: import_zod5.z.number().min(0).max(1).describe("Confidence score between 0 and 1"),
  reason: import_zod5.z.string().describe("Brief explanation for the decision")
});
3311
// Validates whether extracted markdown content is a genuine table of
// contents, with a configurable confidence threshold (default 0.7).
var TocContentValidator = class extends BaseValidator {
  // Minimum confidence required by isValid(); set from options in the constructor.
  confidenceThreshold;
  constructor(logger, model, options, fallbackModel, aggregator) {
    super(
      logger,
      model,
      "TocContentValidator",
      options,
      fallbackModel,
      aggregator
    );
    this.confidenceThreshold = options?.confidenceThreshold ?? 0.7;
  }
  /**
   * Validate if the markdown content is a table of contents
   *
   * @param markdown - Markdown content to validate
   * @returns Validation result with isToc, confidence, and reason
   */
  async validate(markdown) {
    this.logger.info(
      `[TocContentValidator] Validating content (${markdown.length} chars)`
    );
    // Short-circuit: empty/whitespace-only content never reaches the LLM.
    if (!markdown.trim()) {
      this.logger.info(
        "[TocContentValidator] Empty markdown, returning invalid"
      );
      return {
        isToc: false,
        confidence: 1,
        reason: "Empty content"
      };
    }
    const { output: result } = await this.callLLM(
      TocContentValidationSchema,
      this.buildSystemPrompt(),
      this.buildUserPrompt(markdown),
      "validation",
      this.aggregator
    );
    this.logger.info(
      `[TocContentValidator] Result: isToc=${result.isToc}, confidence=${result.confidence}`
    );
    return result;
  }
  /**
   * Check if validation result passes threshold
   *
   * @param result - Validation result from validate()
   * @returns true if content is valid TOC with sufficient confidence
   */
  isValid(result) {
    return result.isToc && result.confidence >= this.confidenceThreshold;
  }
  /**
   * Build system prompt for TOC content validation.
   * Escaped \uXXXX sequences encode Korean index-type labels; the string is
   * runtime behavior and kept byte-for-byte.
   */
  buildSystemPrompt() {
    return `You are a document structure analyst. Your task is to determine if the provided content is a Table of Contents (TOC).

## What IS a Table of Contents:
- A structured list of chapters/sections with corresponding page numbers
- Contains hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction", etc.)
- Has page number references for each entry (e.g., "..... 10", "... 5", or just a number at the end)
- Multiple entries organized by document structure
- Main document outline listing major chapters and sections

## What is NOT a Table of Contents:
- Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, Photo Index, List of Figures, List of Photos)
- Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
- Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, Drawing Index, List of Drawings)
- Appendix indices (\uBD80\uB85D \uBAA9\uCC28, Appendix Index)
- Random body text from the document
- Single entries or incomplete lists (fewer than 3 items)
- Reference lists or bibliographies
- Index pages (alphabetical keyword lists)

## Response Guidelines:
- Set isToc to true ONLY if content is clearly a main document TOC
- Set confidence between 0.0 and 1.0 based on your certainty
- Provide a brief reason explaining your decision (1-2 sentences)`;
  }
  /**
   * Build user prompt with markdown content
   */
  buildUserPrompt(markdown) {
    return `Determine if the following content is a Table of Contents:

${markdown}`;
  }
};
3402
+
3403
+ // src/validators/caption-validator.ts
3404
+ var import_zod6 = require("zod");
3405
// Zod schema for one per-caption validation verdict from the LLM.
var CaptionValidationItemSchema = import_zod6.z.object({
  index: import_zod6.z.number().int().describe("Index of the caption in the input array"),
  isValid: import_zod6.z.boolean().describe("Whether the parsed caption is correct"),
  reason: import_zod6.z.string().nullable().describe("Brief explanation if invalid, null if valid")
});
// Zod schema for a whole batch response (array of per-caption verdicts).
var CaptionValidationBatchSchema = import_zod6.z.object({
  results: import_zod6.z.array(CaptionValidationItemSchema)
});
3413
// Validates parsed caption prefixes against their original texts in
// batches via BatchProcessor + LLMCaller.
var CaptionValidator = class extends BaseValidator {
  constructor(logger, model, options, fallbackModel, aggregator) {
    super(
      logger,
      model,
      "CaptionValidator",
      options,
      fallbackModel,
      aggregator
    );
  }
  /**
   * Validate batch of parsed captions against original texts
   *
   * @param captions - Array of parsed Caption objects
   * @param originalTexts - Array of original caption texts (same order as captions)
   * @param batchSize - Batch size for processing. Set to 0 to skip validation (assume all valid).
   * @returns Array of validation results (boolean) maintaining original order
   * @throws CaptionValidationError wrapping any LLM/batch failure
   */
  async validateBatch(captions, originalTexts, batchSize) {
    this.logger.info(
      `[CaptionValidator] Validating ${captions.length} captions with batch size ${batchSize}...`
    );
    if (captions.length !== originalTexts.length) {
      throw new Error(
        `[CaptionValidator] Captions and originalTexts length mismatch: ${captions.length} vs ${originalTexts.length}`
      );
    }
    if (captions.length === 0) {
      this.logger.info("[CaptionValidator] No captions to validate");
      return [];
    }
    // batchSize === 0 is the explicit "skip validation" switch.
    if (batchSize === 0) {
      this.logger.info(
        "[CaptionValidator] Skipping validation (batchSize=0), assuming all captions are valid"
      );
      return new Array(captions.length).fill(true);
    }
    try {
      // Carry the original index through batching so results can be
      // restored to input order afterwards.
      const indexedItems = captions.map((caption, index) => ({
        index,
        caption,
        originalText: originalTexts[index]
      }));
      const batchResults = await BatchProcessor.processBatch(
        indexedItems,
        batchSize,
        async (batch) => this.validateBatchInternal(batch, this.model)
      );
      batchResults.sort((a, b) => a.index - b.index);
      const results = batchResults.map((r) => r.isValid);
      const validCount = results.filter((r) => r).length;
      this.logger.info(
        `[CaptionValidator] Completed: ${validCount}/${results.length} captions validated as correct`
      );
      if (this.aggregator) {
        this.aggregator.logSummary(this.logger);
      }
      return results;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.error(`[CaptionValidator] Validation failed: ${message}`);
      // Wrap with the original error preserved as `cause`.
      throw new CaptionValidationError(
        `Failed to validate captions: ${message}`,
        { cause: error }
      );
    }
  }
  /**
   * Internal: Validate batch of captions using LLM
   *
   * @param items - Batch of caption items with original indices
   * @param model - Effective model to use
   * @returns Array of validation results indexed correctly
   */
  async validateBatchInternal(items, model) {
    const result = await LLMCaller.call({
      schema: CaptionValidationBatchSchema,
      systemPrompt: this.buildSystemPrompt(),
      userPrompt: this.buildUserPrompt(items),
      primaryModel: model,
      fallbackModel: this.fallbackModel,
      maxRetries: this.maxRetries,
      temperature: this.temperature,
      abortSignal: this.abortSignal,
      component: "CaptionValidator",
      phase: "validation"
    });
    if (this.aggregator) {
      this.aggregator.track(result.usage);
    }
    // Drop `reason`; callers only consume index + verdict.
    return result.output.results.map((item) => ({
      index: item.index,
      isValid: item.isValid
    }));
  }
  // System prompt with worked examples; escaped \uXXXX sequences encode
  // Korean caption labels. Runtime string, kept byte-for-byte.
  buildSystemPrompt() {
    return `You are a caption validation expert for archaeological excavation reports.

Your task is to validate whether parsed caption prefixes (num field) are correctly extracted from original caption texts.

## Caption Pattern Recognition

A valid caption follows the pattern: <prefix word(s)> <number>
- The prefix can be ANY Korean/English word(s) that label images/tables/figures
- Common examples: \uB3C4\uD310, \uC0AC\uC9C4, \uADF8\uB9BC, \uC6D0\uC0C9\uC0AC\uC9C4, \uD751\uBC31\uC0AC\uC9C4, Figure, Photo, Plate, etc.
- The key is the PATTERN (text followed by number), not a specific word list
- Leading punctuation/brackets should be IGNORED when extracting

Valid caption patterns:
- "\uC6D0\uC0C9\uC0AC\uC9C4 1. \uC870\uC0AC\uC9C0\uC5ED" \u2192 num="\uC6D0\uC0C9\uC0AC\uC9C4 1" \u2713
- "\uD751\uBC31\uC0AC\uC9C4 2 \uCD9C\uD1A0\uC720\uBB3C" \u2192 num="\uD751\uBC31\uC0AC\uC9C4 2" \u2713
- "\uB3C4\uD310 1 \uC720\uC801 \uC804\uACBD" \u2192 num="\uB3C4\uD310 1" \u2713
- "(\uC0AC\uC9C4 16> \uB290\uD2F0\uB098\uBB34" \u2192 num="\uC0AC\uC9C4 16" \u2713 (ignore leading punctuation)
- "<\uB3C4\uD310 3> \uC720\uBB3C \uC0AC\uC9C4" \u2192 num="\uB3C4\uD310 3" \u2713 (ignore angle brackets)

Invalid patterns (num MUST be null):
- "39 3\uC6D4 28\uC77C(\uBC31\uC81C \uB3C4\uB85C\uC720\uAD6C)" \u2192 null \u2713 (starts with number, no prefix)
- "1. \uC720\uC801 \uC804\uACBD" \u2192 null \u2713 (numbered list item, not a caption)
- "2024\uB144 \uC870\uC0AC \uD604\uD669" \u2192 null \u2713 (year reference, not a caption)

## Extraction Algorithm:

1. Extract prefix + number from the caption
- The prefix is the text portion before the number
- Full extraction: "\uC6D0\uC0C9\uC0AC\uC9C4 1", "\uB3C4\uD310 2-3", "\uADF8\uB9BC 3.6", "Figure 4a"

2. **Decimal point handling**: Include period/dot after number if directly following
- "\uADF8\uB9BC 3.6. \uD55C\uBC18\uB3C4" \u2192 "\uADF8\uB9BC 3.6" (period as decimal separator included)
- "\uB3C4\uD310 2. \uC720\uC801" \u2192 "\uB3C4\uD310 2" (period after space, NOT included)

3. **Stop rules** (extraction must stop at first occurrence of):
- Punctuation (except decimal point): , : ; ! ? ~ ( ) [ ] { }
- Whitespace: space, tab, newline
- Underscore: _
- Exception: Periods directly after digits are included as decimal separators
- Exception: Hyphens within numbers are included (e.g., "2-3")

## Validation Rules:

1. **Pattern requirement**: The original text MUST follow <prefix> <number> pattern
- "\uC6D0\uC0C9\uC0AC\uC9C4 1. \uC870\uC0AC\uC9C0\uC5ED" \u2192 num="\uC6D0\uC0C9\uC0AC\uC9C4 1" \u2713 (valid pattern)
- "39 3\uC6D4 28\uC77C(\uBC31\uC81C)" \u2192 num="39" \u2717 (starts with number, should be null)
- "1. \uC870\uC0AC \uAC1C\uC694" \u2192 num="1" \u2717 (numbered list, should be null)

2. **Correctness**: The parsed "num" must contain the actual prefix+number
- "\uB3C4\uD310 1 \uC720\uC801 \uC804\uACBD" \u2192 num="\uB3C4\uD310 1" \u2713
- "\uB3C4\uD310 1 \uC720\uC801 \uC804\uACBD" \u2192 num="\uB3C4\uD310" \u2717 (incomplete)

3. **Spacing**: The spacing in "num" must match the original text exactly
- "\uB3C4\uD310 1" \u2192 num="\uB3C4\uD310 1" \u2713
- "\uB3C4\uD3101" \u2192 num="\uB3C4\uD3101" \u2713
- "\uB3C4\uD310 1" \u2192 num="\uB3C4\uD3101" \u2717 (spacing mismatch)

4. **Completeness**: The number part must be fully extracted
- "Figure 2-3" \u2192 num="Figure 2-3" \u2713
- "Figure 2-3" \u2192 num="Figure 2" \u2717 (incomplete number)

5. **Null handling**: If "num" is null, verify:
- Either the original text has no number
- OR the text starts with a number (no prefix)
- "\uC720\uC801 \uC804\uACBD \uC0AC\uC9C4" \u2192 num=null \u2713 (no number in caption position)
- "\uC6D0\uC0C9\uC0AC\uC9C4 1 \uC870\uC0AC" \u2192 num=null \u2717 (should extract "\uC6D0\uC0C9\uC0AC\uC9C4 1")

## Response:
For each caption, return:
- index: original position
- isValid: true if parsing is correct, false otherwise
- reason: null if valid, brief explanation if invalid`;
  }
  // Lists each caption with its parsed `num` (or "null") for the model.
  buildUserPrompt(items) {
    const captionList = items.map(
      (item) => `[${item.index}] Original: "${item.originalText}" | Parsed num: ${item.caption.num !== void 0 ? `"${item.caption.num}"` : "null"}`
    ).join("\n");
    return `Validate the following caption parsing results:

${captionList}

Return the results as JSON array with "index", "isValid", and "reason" (null if valid, explanation if invalid).

Example format:
{
  "results": [
    { "index": 0, "isValid": true, "reason": null },
    { "index": 1, "isValid": false, "reason": "Number incomplete: expected '1-2' but got '1'" },
    { "index": 2, "isValid": true, "reason": null }
  ]
}`;
  }
};
3603
/**
 * Error thrown when caption validation fails.
 *
 * Standard Error subclass: accepts the usual `(message, options)` pair, so a
 * `cause` can be attached via `options.cause` for error chaining.
 */
var CaptionValidationError = class extends Error {
  /**
   * @param {string} message - Human-readable description of the failure.
   * @param {ErrorOptions} [options] - Standard error options (e.g. `cause`).
   */
  constructor(message, options) {
    super(message, options);
    // Override the inherited "Error" name so logs and `err.name` checks
    // identify this error type.
    this.name = "CaptionValidationError";
  }
};
3609
+
3610
// src/document-processor.ts
/**
 * Orchestrates the full DoclingDocument -> ProcessedDocument pipeline:
 * text normalization/filtering, page-range parsing, TOC extraction
 * (rule-based with LLM validation and a vision fallback), image/table/
 * footnote conversion with LLM caption parsing + validation, and final
 * chapter assembly. All LLM token usage is funneled into one shared
 * LLMTokenUsageAggregator and reported from process().
 */
var DocumentProcessor = class {
  // --- injected configuration (assigned in constructor) ---
  logger;
  fallbackModel; // default model; also used for fallback retries when enabled
  pageRangeParserModel; // per-task models below each default to fallbackModel
  tocExtractorModel;
  validatorModel;
  visionTocExtractorModel;
  captionParserModel;
  textCleanerBatchSize; // batch sizes forwarded to the respective batch APIs
  captionParserBatchSize;
  captionValidatorBatchSize;
  maxRetries; // per-component LLM retry budget (default 3)
  enableFallbackRetry; // when true, failed work is retried on fallbackModel
  abortSignal; // optional AbortSignal for cooperative cancellation
  // --- collaborators (sub-processors are created in initializeProcessors) ---
  idGenerator = new IdGenerator();
  refResolver;
  pageRangeParser;
  tocFinder;
  tocExtractor;
  tocContentValidator;
  captionValidator;
  visionTocExtractor;
  captionParser;
  chapterConverter;
  // Reference to the TextCleaner utility (used via normalize/isValidText/
  // normalizeAndFilterBatch calls below).
  textCleaner = TextCleaner;
  usageAggregator = new LLMTokenUsageAggregator(); // shared token-usage sink
  /**
   * @param options - Processor configuration: logger, models (task-specific
   *   models fall back to `fallbackModel` when omitted), batch sizes, retry
   *   settings, and an optional abort signal.
   */
  constructor(options) {
    this.logger = options.logger;
    this.fallbackModel = options.fallbackModel;
    this.pageRangeParserModel = options.pageRangeParserModel ?? options.fallbackModel;
    this.tocExtractorModel = options.tocExtractorModel ?? options.fallbackModel;
    this.validatorModel = options.validatorModel ?? options.fallbackModel;
    this.visionTocExtractorModel = options.visionTocExtractorModel ?? options.fallbackModel;
    this.captionParserModel = options.captionParserModel ?? options.fallbackModel;
    this.textCleanerBatchSize = options.textCleanerBatchSize;
    this.captionParserBatchSize = options.captionParserBatchSize;
    this.captionValidatorBatchSize = options.captionValidatorBatchSize;
    this.maxRetries = options.maxRetries ?? 3;
    this.enableFallbackRetry = options.enableFallbackRetry ?? false;
    this.abortSignal = options.abortSignal;
  }
  /**
   * Check if abort has been requested and throw error if so
   *
   * @throws {Error} with name 'AbortError' if aborted
   */
  checkAborted() {
    if (this.abortSignal?.aborted) {
      const error = new Error("Document processing was aborted");
      error.name = "AbortError";
      throw error;
    }
  }
  /**
   * Converts DoclingDocument to ProcessedDocument with token usage tracking.
   *
   * Conversion process:
   * 1. Initialize processors and resolvers
   * 2. Normalize and filter texts
   * 3. Clean texts and parse page ranges (parallel)
   * 4. Extract table of contents
   * 5. Convert images and tables (parallel)
   * 6. Convert chapters and link resources
   * 7. Assemble final ProcessedDocument
   * 8. Collect and report token usage
   *
   * @param doclingDoc - Original document extracted from Docling SDK
   * @param reportId - Report unique identifier
   * @param outputPath - Path containing images and pages subdirectories (images/image_0.png, pages/page_0.png, etc.)
   * @returns Document processing result with ProcessedDocument and token usage report
   *
   * @throws {TocExtractError} When TOC extraction fails
   * @throws {PageRangeParseError} When page range parsing fails
   * @throws {ConversionError} When error occurs during conversion
   */
  async process(doclingDoc, reportId, outputPath) {
    this.logger.info("[DocumentProcessor] Starting document processing...");
    this.logger.info("[DocumentProcessor] Report ID:", reportId);
    // Fresh usage totals per process() call — the aggregator is shared by all
    // sub-components created in initializeProcessors().
    this.usageAggregator.reset();
    this.checkAborted();
    this.initializeProcessors(doclingDoc, outputPath);
    // Each stage below is timed and logged; checkAborted() between stages
    // provides cooperative cancellation points.
    const startTimeFilter = Date.now();
    const filtered = this.normalizeAndFilterTexts(doclingDoc);
    const filteringTime = Date.now() - startTimeFilter;
    this.logger.info(
      `[DocumentProcessor] Text filtering took ${filteringTime}ms`
    );
    this.checkAborted();
    const startTimePageRange = Date.now();
    const pageRangeMap = await this.parsePageRanges(doclingDoc);
    const pageRangeTime = Date.now() - startTimePageRange;
    this.logger.info(
      `[DocumentProcessor] Page range parsing took ${pageRangeTime}ms`
    );
    this.checkAborted();
    const startTimeToc = Date.now();
    const tocEntries = await this.extractTableOfContents(doclingDoc, filtered);
    const tocTime = Date.now() - startTimeToc;
    this.logger.info(`[DocumentProcessor] TOC extraction took ${tocTime}ms`);
    this.checkAborted();
    const startTimeResources = Date.now();
    const { images, tables, footnotes } = await this.convertResources(
      doclingDoc,
      outputPath
    );
    const resourcesTime = Date.now() - startTimeResources;
    this.logger.info(
      `[DocumentProcessor] Resource conversion took ${resourcesTime}ms`
    );
    this.checkAborted();
    const startTimeChapters = Date.now();
    const chapters = await this.convertChapters(
      doclingDoc,
      tocEntries,
      pageRangeMap,
      images,
      tables,
      footnotes
    );
    const chaptersTime = Date.now() - startTimeChapters;
    this.logger.info(
      `[DocumentProcessor] Chapter conversion took ${chaptersTime}ms`
    );
    const startTimeAssemble = Date.now();
    const processedDoc = this.assembleProcessedDocument(
      reportId,
      pageRangeMap,
      chapters,
      images,
      tables,
      footnotes
    );
    const assembleTime = Date.now() - startTimeAssemble;
    this.logger.info(
      `[DocumentProcessor] Document assembly took ${assembleTime}ms`
    );
    this.logger.info("[DocumentProcessor] Document processing completed");
    return {
      document: processedDoc,
      usage: this.usageAggregator.getReport()
    };
  }
  /**
   * Initialize all processors and resolvers
   *
   * Sets up RefResolver, PageRangeParser, TocFinder, and TocExtractor
   * (plus validators, vision extractor, caption parser, and chapter
   * converter). Components that accept a fallback model receive it only
   * when enableFallbackRetry is set.
   */
  initializeProcessors(doclingDoc, outputPath) {
    this.logger.info("[DocumentProcessor] Initializing processors...");
    this.logger.info("[DocumentProcessor] - RefResolver");
    this.refResolver = new RefResolver(this.logger, doclingDoc);
    this.logger.info("[DocumentProcessor] - PageRangeParser");
    this.pageRangeParser = new PageRangeParser(
      this.logger,
      this.pageRangeParserModel,
      outputPath,
      this.maxRetries,
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.usageAggregator,
      this.abortSignal
    );
    this.logger.info("[DocumentProcessor] - TocFinder");
    this.tocFinder = new TocFinder(this.logger, this.refResolver);
    this.logger.info("[DocumentProcessor] - TocExtractor");
    this.tocExtractor = new TocExtractor(
      this.logger,
      this.tocExtractorModel,
      {
        maxRetries: this.maxRetries
      },
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.abortSignal
    );
    this.logger.info("[DocumentProcessor] - TocContentValidator");
    this.tocContentValidator = new TocContentValidator(
      this.logger,
      this.validatorModel,
      { maxRetries: this.maxRetries, abortSignal: this.abortSignal },
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.usageAggregator
    );
    this.logger.info("[DocumentProcessor] - CaptionValidator");
    this.captionValidator = new CaptionValidator(
      this.logger,
      this.validatorModel,
      { maxRetries: this.maxRetries, abortSignal: this.abortSignal },
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.usageAggregator
    );
    this.logger.info("[DocumentProcessor] - VisionTocExtractor");
    this.visionTocExtractor = new VisionTocExtractor(
      this.logger,
      this.visionTocExtractorModel,
      outputPath,
      { maxRetries: this.maxRetries, abortSignal: this.abortSignal },
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.usageAggregator
    );
    this.logger.info("[DocumentProcessor] - CaptionParser");
    this.captionParser = new CaptionParser(
      this.logger,
      this.captionParserModel,
      { maxRetries: this.maxRetries, abortSignal: this.abortSignal },
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.usageAggregator
    );
    this.logger.info("[DocumentProcessor] - ChapterConverter");
    this.chapterConverter = new ChapterConverter(this.logger, this.idGenerator);
    this.logger.info("[DocumentProcessor] All processors initialized");
  }
  /**
   * Normalize and filter texts using TextCleaner
   *
   * Performs basic text normalization (unicode, whitespace, punctuation)
   * and filters out invalid texts (empty, numbers-only, etc.)
   */
  normalizeAndFilterTexts(doclingDoc) {
    this.logger.info("[DocumentProcessor] Normalizing and filtering texts...");
    const texts = doclingDoc.texts.map((text) => text.text);
    const filtered = this.textCleaner.normalizeAndFilterBatch(
      texts,
      this.textCleanerBatchSize
    );
    this.logger.info(
      `[DocumentProcessor] Filtered ${filtered.length} texts from ${texts.length} original texts`
    );
    return filtered;
  }
  /**
   * Parse page ranges using Vision LLM
   *
   * Extracts actual page numbers from page images and creates mapping.
   * Token usage is automatically tracked by PageRangeParser into the shared aggregator.
   */
  async parsePageRanges(doclingDoc) {
    this.logger.info("[DocumentProcessor] Starting page range parsing...");
    const result = await this.pageRangeParser.parse(doclingDoc);
    const pageRangeMap = result.pageRangeMap;
    this.logger.info(
      `[DocumentProcessor] Page range map entries: ${Object.keys(pageRangeMap).length}`
    );
    return pageRangeMap;
  }
  /**
   * Convert images, tables, and footnotes
   *
   * Runs conversions:
   * - Images conversion (with caption extraction)
   * - Tables conversion (with caption extraction, excluding TOC tables)
   * - Footnotes conversion (synchronous, from text items with label='footnote')
   */
  async convertResources(doclingDoc, outputPath) {
    this.logger.info(
      "[DocumentProcessor] Converting images, tables, and footnotes..."
    );
    // Images and tables each involve LLM caption processing, so run them in
    // parallel; footnotes are pure text work and run synchronously after.
    const [images, tables] = await Promise.all([
      this.convertImages(doclingDoc, outputPath),
      this.convertTables(doclingDoc)
    ]);
    const footnotes = this.convertFootnotes(doclingDoc);
    this.logger.info(
      `[DocumentProcessor] Converted ${images.length} images, ${tables.length} tables, and ${footnotes.length} footnotes`
    );
    return { images, tables, footnotes };
  }
  /**
   * Convert footnotes
   *
   * Extracts footnotes from DoclingDocument text items with label='footnote'.
   * Invalid texts (per TextCleaner.isValidText) are skipped; page numbers
   * default to 1 when provenance is missing.
   */
  convertFootnotes(doclingDoc) {
    const footnoteItems = doclingDoc.texts.filter(
      (item) => item.label === "footnote"
    );
    this.logger.info(
      `[DocumentProcessor] Converting ${footnoteItems.length} footnotes...`
    );
    const footnotes = [];
    for (const item of footnoteItems) {
      if (!this.textCleaner.isValidText(item.text)) {
        continue;
      }
      // First provenance entry carries the PDF page; default to page 1.
      const pdfPageNo = item.prov?.[0]?.page_no ?? 1;
      const footnoteId = this.idGenerator.generateFootnoteId();
      footnotes.push({
        id: footnoteId,
        text: this.textCleaner.normalize(item.text),
        pdfPageNo
      });
    }
    this.logger.info(
      `[DocumentProcessor] Converted ${footnotes.length} valid footnotes`
    );
    return footnotes;
  }
  /**
   * Assemble the final ProcessedDocument
   *
   * Creates the ProcessedDocument structure with all converted components
   */
  assembleProcessedDocument(reportId, pageRangeMap, chapters, images, tables, footnotes) {
    this.logger.info("[DocumentProcessor] Assembling ProcessedDocument...");
    const processedDoc = {
      reportId,
      pageRangeMap,
      chapters,
      images,
      tables,
      footnotes
    };
    this.logger.info(
      `[DocumentProcessor] Assembled document with ${chapters.length} chapters, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
    );
    return processedDoc;
  }
  /**
   * Extract table of contents (TOC)
   *
   * Uses rule-based extraction with LLM validation and vision fallback:
   * 1. TocFinder - find TOC area in document (rule-based)
   * 2. MarkdownConverter - convert TOC items to Markdown
   * 3. TocContentValidator - validate if content is actually a TOC (LLM)
   * 4. If invalid: VisionTocExtractor - extract from page images (vision LLM fallback)
   * 5. TocExtractor - LLM-based structured extraction
   *
   * Returns [] when neither the rule-based path nor the vision fallback
   * yields TOC markdown.
   */
  async extractTableOfContents(doclingDoc, _filteredTexts) {
    this.logger.info("[DocumentProcessor] Extracting TOC...");
    let markdown = null;
    try {
      const tocArea = this.tocFinder.find(doclingDoc);
      this.logger.info(
        `[DocumentProcessor] Found TOC area: pages ${tocArea.startPage}-${tocArea.endPage}`
      );
      markdown = MarkdownConverter.convert(tocArea.itemRefs, this.refResolver);
      this.logger.info(
        `[DocumentProcessor] Converted TOC to Markdown (${markdown.length} chars)`
      );
      const validation = await this.tocContentValidator.validate(markdown);
      if (!this.tocContentValidator.isValid(validation)) {
        this.logger.warn(
          `[DocumentProcessor] TOC validation failed: ${validation.reason}`
        );
        // Null markdown triggers the vision fallback below.
        markdown = null;
      } else {
        this.logger.info(
          `[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
        );
      }
    } catch (error) {
      // Only "TOC not found" is recoverable (via vision fallback);
      // anything else propagates.
      if (error instanceof TocNotFoundError) {
        this.logger.info(
          "[DocumentProcessor] Rule-based TOC not found, will try vision fallback"
        );
      } else {
        throw error;
      }
    }
    if (!markdown) {
      this.logger.info("[DocumentProcessor] Using vision fallback for TOC");
      const totalPages = Object.keys(doclingDoc.pages).length;
      markdown = await this.visionTocExtractor.extract(totalPages);
      if (!markdown) {
        this.logger.warn(
          "[DocumentProcessor] TOC not found in any method, returning empty"
        );
        return [];
      }
      this.logger.info(
        `[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
      );
    }
    const tocResult = await this.tocExtractor.extract(markdown);
    // TocExtractor returns usage rather than tracking it itself.
    this.usageAggregator.track(tocResult.usage);
    this.logger.info(
      `[DocumentProcessor] Extracted ${tocResult.entries.length} top-level TOC entries`
    );
    return tocResult.entries;
  }
  /**
   * Process resource captions (for images and tables)
   *
   * Common caption processing pipeline:
   * 1. Parse captions in batch
   * 2. Validate parsed captions
   * 3. Reparse failed captions with fallback model
   *
   * @param captionTexts - Array of caption texts to process (sparse: entries
   *   may be undefined for resources without a caption)
   * @param resourceType - Type of resource for logging (e.g., 'image', 'table')
   * @returns Parsed captions with index mapping (resource index -> caption)
   */
  async processResourceCaptions(captionTexts, resourceType) {
    const captionsByIndex = /* @__PURE__ */ new Map();
    // Compact the sparse input while remembering each caption's original
    // resource index so results can be mapped back.
    const validCaptionData = [];
    for (let i = 0; i < captionTexts.length; i++) {
      const text = captionTexts[i];
      if (text !== void 0) {
        validCaptionData.push({
          resourceIndex: i,
          filteredIndex: validCaptionData.length,
          text
        });
      }
    }
    const validCaptionTexts = validCaptionData.map((item) => item.text);
    const parsedCaptions = validCaptionTexts.length > 0 ? await this.captionParser.parseBatch(
      validCaptionTexts,
      this.captionParserBatchSize
    ) : [];
    let finalValidCaptionData = validCaptionData;
    let finalParsedCaptions = parsedCaptions;
    if (parsedCaptions.length !== validCaptionData.length) {
      // The LLM returned a different number of results than inputs. Recover
      // by matching each input text against the parsed results' fullText;
      // inputs with no match are dropped (logged), keeping data and results
      // aligned by position.
      this.logger.warn(
        `[DocumentProcessor] Caption parsing length mismatch for ${resourceType}: expected ${validCaptionData.length}, got ${parsedCaptions.length}. Attempting recovery by matching fullText...`
      );
      const parsedMap = /* @__PURE__ */ new Map();
      for (const parsed of parsedCaptions) {
        parsedMap.set(parsed.fullText, parsed);
      }
      const recoveredData = [];
      for (const item of validCaptionData) {
        if (parsedMap.has(item.text)) {
          recoveredData.push(item);
        } else {
          this.logger.warn(
            `[DocumentProcessor] Skipping ${resourceType} caption at index ${item.resourceIndex}: "${item.text}" (not found in parsed results)`
          );
        }
      }
      const recoveredCaptions = [];
      for (const item of recoveredData) {
        const caption = parsedMap.get(item.text);
        if (caption) {
          recoveredCaptions.push(caption);
        }
      }
      // Sanity check: every recovered item must have a caption, otherwise
      // alignment is broken and we abort rather than mis-assign captions.
      if (recoveredCaptions.length !== recoveredData.length) {
        throw new Error(
          `[DocumentProcessor] Failed to recover from length mismatch: recovered ${recoveredCaptions.length} captions for ${recoveredData.length} valid items`
        );
      }
      finalValidCaptionData = recoveredData;
      finalParsedCaptions = recoveredCaptions;
      this.logger.info(
        `[DocumentProcessor] Successfully recovered ${finalParsedCaptions.length} ${resourceType} captions after length mismatch`
      );
    }
    // Provisionally assign all parsed captions; failed validations may be
    // overwritten by fallback reparse results below.
    for (let i = 0; i < finalParsedCaptions.length; i++) {
      const resourceIndex = finalValidCaptionData[i].resourceIndex;
      captionsByIndex.set(resourceIndex, finalParsedCaptions[i]);
    }
    if (finalParsedCaptions.length > 0) {
      const finalValidCaptionTexts = finalValidCaptionData.map(
        (item) => item.text
      );
      const validationResults = await this.captionValidator.validateBatch(
        finalParsedCaptions,
        finalValidCaptionTexts,
        this.captionValidatorBatchSize
      );
      // Indices (into the filtered arrays) of captions that failed validation.
      const failedIndices = validationResults.map((isValid, index) => isValid ? -1 : index).filter((index) => index !== -1);
      if (failedIndices.length > 0) {
        for (const filteredIndex of failedIndices) {
          const captionData = finalValidCaptionData[filteredIndex];
          const originalText = captionData.text;
          const parsedNum = finalParsedCaptions[filteredIndex].num;
          const resourceIndex = captionData.resourceIndex;
          this.logger.warn(
            `[DocumentProcessor] Invalid ${resourceType} caption [${resourceIndex}]: "${originalText}" | parsed num="${parsedNum}"`
          );
        }
        if (this.enableFallbackRetry) {
          this.logger.info(
            `[DocumentProcessor] Reparsing ${failedIndices.length} failed ${resourceType} captions with fallback model...`
          );
          const failedCaptionTexts = failedIndices.map(
            (filteredIndex) => finalValidCaptionData[filteredIndex].text
          );
          const fallbackCaptionParser = new CaptionParser(
            this.logger,
            this.fallbackModel,
            {
              maxRetries: this.maxRetries,
              componentName: "CaptionParser-fallback",
              abortSignal: this.abortSignal
            },
            void 0,
            // no fallback for the fallback
            this.usageAggregator
          );
          const reparsedCaptions = await fallbackCaptionParser.parseBatch(
            failedCaptionTexts,
            0
            // sequential processing
          );
          // Overwrite the failed entries with the fallback results.
          // NOTE(review): assumes reparsedCaptions aligns 1:1 with
          // failedCaptionTexts — a second length mismatch here would leave
          // some entries undefined; confirm parseBatch guarantees alignment.
          for (let i = 0; i < failedIndices.length; i++) {
            const filteredIndex = failedIndices[i];
            const resourceIndex = finalValidCaptionData[filteredIndex].resourceIndex;
            captionsByIndex.set(resourceIndex, reparsedCaptions[i]);
          }
          this.logger.info(
            `[DocumentProcessor] Reparsed ${reparsedCaptions.length} ${resourceType} captions`
          );
        } else {
          this.logger.warn(
            `[DocumentProcessor] ${failedIndices.length} ${resourceType} captions failed validation (kept as-is, fallback retry disabled)`
          );
        }
      }
    }
    return captionsByIndex;
  }
  /**
   * Extract caption text from resource
   *
   * Handles both string references and $ref resolution.
   * Only the FIRST caption entry is considered; returns undefined when
   * there is none or the reference cannot be resolved.
   */
  extractCaptionText(captions) {
    if (!captions?.[0]) {
      return void 0;
    }
    const captionRef = captions[0];
    if (typeof captionRef === "string") {
      return captionRef;
    }
    if (this.refResolver && "$ref" in captionRef) {
      const resolved = this.refResolver.resolveText(captionRef.$ref);
      return resolved?.text;
    }
    return void 0;
  }
  /**
   * Convert images
   *
   * Converts pictures from DoclingDocument to ProcessedImage.
   * Image file paths follow the outputPath/images/image_<n>.png convention
   * (0-based, matching extraction order).
   */
  async convertImages(doclingDoc, outputPath) {
    this.logger.info(
      `[DocumentProcessor] Converting ${doclingDoc.pictures.length} images...`
    );
    const images = [];
    const captionTexts = [];
    for (const picture of doclingDoc.pictures) {
      const pdfPageNo = picture.prov?.[0]?.page_no ?? 0;
      const imageId = this.idGenerator?.generateImageId() ?? `img-${images.length + 1}`;
      // May be undefined — processResourceCaptions handles sparse entries.
      const captionText = this.extractCaptionText(picture.captions);
      captionTexts.push(captionText);
      images.push({
        id: imageId,
        path: `${outputPath}/images/image_${images.length}.png`,
        pdfPageNo
        // caption will be assigned later
      });
    }
    const captionsByIndex = await this.processResourceCaptions(
      captionTexts,
      "image"
    );
    for (let i = 0; i < images.length; i++) {
      if (captionsByIndex.has(i)) {
        images[i].caption = captionsByIndex.get(i);
      }
    }
    return images;
  }
  /**
   * Convert tables
   *
   * Converts tables from DoclingDocument to ProcessedTable,
   * flattening each cell to text/rowSpan/colSpan/isHeader.
   */
  async convertTables(doclingDoc) {
    this.logger.info(
      `[DocumentProcessor] Converting ${doclingDoc.tables.length} tables...`
    );
    const tables = [];
    const captionTexts = [];
    for (const table of doclingDoc.tables) {
      const pdfPageNo = table.prov?.[0]?.page_no ?? 0;
      const tableId = this.idGenerator?.generateTableId() ?? `tbl-${tables.length + 1}`;
      const grid = table.data.grid.map(
        (row) => row.map((cell) => ({
          text: cell.text,
          rowSpan: cell.row_span ?? 1,
          colSpan: cell.col_span ?? 1,
          // A cell is a header if Docling marks it as either a column or row header.
          isHeader: cell.column_header || cell.row_header || false
        }))
      );
      const captionText = this.extractCaptionText(table.captions);
      captionTexts.push(captionText);
      tables.push({
        id: tableId,
        pdfPageNo,
        numRows: grid.length,
        numCols: grid[0]?.length ?? 0,
        grid
        // caption will be assigned later
      });
    }
    const captionsByIndex = await this.processResourceCaptions(
      captionTexts,
      "table"
    );
    for (let i = 0; i < tables.length; i++) {
      if (captionsByIndex.has(i)) {
        tables[i].caption = captionsByIndex.get(i);
      }
    }
    return tables;
  }
  /**
   * Convert chapters and link resources
   *
   * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
   * Falls back to single "Document" chapter when TOC is empty.
   */
  async convertChapters(doclingDoc, tocEntries, pageRangeMap, images, tables, footnotes) {
    this.logger.info("[DocumentProcessor] Converting chapters...");
    if (tocEntries.length === 0) {
      this.logger.info(
        "[DocumentProcessor] No TOC entries, creating fallback chapter"
      );
      return this.createFallbackChapter(
        doclingDoc,
        pageRangeMap,
        images,
        tables,
        footnotes
      );
    }
    const chapters = this.chapterConverter.convert(
      tocEntries,
      doclingDoc.texts,
      pageRangeMap,
      images,
      tables,
      footnotes
    );
    this.logger.info(
      `[DocumentProcessor] Converted ${chapters.length} top-level chapters`
    );
    return chapters;
  }
  /**
   * Create a fallback chapter when TOC is not available
   *
   * Creates a single "Document" chapter containing all text blocks,
   * images, tables, and footnotes from the document.
   * Returns [] when the document has no content at all.
   */
  createFallbackChapter(doclingDoc, pageRangeMap, images, tables, footnotes) {
    // All non-footnote, valid texts become the chapter's text blocks.
    const textBlocks = doclingDoc.texts.filter(
      (item) => item.label !== "footnote" && this.textCleaner.isValidText(item.text)
    ).map((item) => ({
      text: this.textCleaner.normalize(item.text),
      pdfPageNo: item.prov?.[0]?.page_no ?? 1
    }));
    if (textBlocks.length === 0 && images.length === 0 && tables.length === 0 && footnotes.length === 0) {
      this.logger.info(
        "[DocumentProcessor] No content found for fallback chapter"
      );
      return [];
    }
    // NOTE(review): including 1 in Math.min caps the result at 1, so
    // firstPdfPage is always <= 1 — presumably meant as a guard against an
    // empty key set (Math.min() with no args is Infinity); confirm whether
    // page keys can legitimately start above 1.
    const firstPdfPage = Math.min(
      ...Object.keys(pageRangeMap).map(Number).filter((n) => !isNaN(n)),
      1
    );
    const firstPageRange = pageRangeMap[firstPdfPage];
    const pageNo = firstPageRange?.startPageNo ?? 1;
    const fallbackChapter = {
      id: this.idGenerator.generateChapterId(),
      originTitle: "Document",
      title: "Document",
      pageNo,
      level: 1,
      textBlocks,
      imageIds: images.map((img) => img.id),
      tableIds: tables.map((tbl) => tbl.id),
      footnoteIds: footnotes.map((ftn) => ftn.id),
      children: []
    };
    this.logger.info(
      `[DocumentProcessor] Created fallback chapter with ${textBlocks.length} text blocks, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
    );
    return [fallbackChapter];
  }
};
4295
// Annotate the CommonJS export names for ESM import in node:
// (Dead code by design: `0 &&` never executes. Node statically scans this
// pattern so these names are available as named ESM exports of the CJS file.
// Do not remove or "simplify" it.)
0 && (module.exports = {
  BaseLLMComponent,
  BaseValidator,
  CONTINUATION_MARKERS,
  CaptionParseError,
  CaptionParser,
  CaptionValidationError,
  CaptionValidator,
  ChapterConverter,
  DocumentProcessor,
  PAGE_NUMBER_PATTERN,
  PagePattern,
  PageRangeParseError,
  PageRangeParser,
  TOC_KEYWORDS,
  TextLLMComponent,
  TocContentValidationSchema,
  TocContentValidator,
  TocEntrySchema,
  TocExtractError,
  TocExtractor,
  TocFinder,
  TocNotFoundError,
  TocParseError,
  TocResponseSchema,
  VisionLLMComponent,
  VisionTocExtractionSchema,
  VisionTocExtractor
});
4325
+ //# sourceMappingURL=index.cjs.map