@heripo/document-processor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1420 @@
1
+ import { LoggerMethods } from '@heripo/logger';
2
+ import { DoclingDocument, DocumentProcessResult, Caption, PageRange, DoclingTextItem, DoclingPictureItem, DoclingTableItem, DoclingGroupItem, ProcessedImage, ProcessedTable, ProcessedFootnote, Chapter } from '@heripo/model';
3
+ import { LanguageModel } from 'ai';
4
+ import { LLMTokenUsageAggregator, ExtendedTokenUsage } from '@heripo/shared';
5
+ import { z } from 'zod';
6
+
7
+ /**
8
+ * Configuration options accepted by the DocumentProcessor constructor.
9
+ */
10
+ interface DocumentProcessorOptions {
11
+ /**
12
+ * Logger instance (required)
13
+ */
14
+ logger: LoggerMethods;
15
+ /**
16
+ * Fallback model - used as fallback when component-specific models are not provided or fail.
17
+ * This is the only required model. Should be set to a frontier model (e.g., Claude Opus 4.5, GPT-5.2)
18
+ * to ensure reliable fallback performance across all components.
19
+ */
20
+ fallbackModel: LanguageModel;
21
+ /**
22
+ * Model for PageRangeParser - extracts page numbers from page images.
23
+ * Requires vision capabilities. Falls back to 'fallbackModel' if not provided.
24
+ */
25
+ pageRangeParserModel?: LanguageModel;
26
+ /**
27
+ * Model for TocExtractor - extracts structured TOC from Markdown representation.
28
+ * Falls back to 'fallbackModel' if not provided.
29
+ */
30
+ tocExtractorModel?: LanguageModel;
31
+ /**
32
+ * Model for validators (TOC content validation, caption validation).
33
+ * Falls back to 'fallbackModel' if not provided.
34
+ */
35
+ validatorModel?: LanguageModel;
36
+ /**
37
+ * Model for VisionTocExtractor - extracts TOC directly from page images.
38
+ * Requires vision capabilities. Falls back to 'fallbackModel' if not provided.
39
+ */
40
+ visionTocExtractorModel?: LanguageModel;
41
+ /**
42
+ * Model for CaptionParser - extracts caption prefix and number from image/table captions.
43
+ * Falls back to 'fallbackModel' if not provided.
44
+ */
45
+ captionParserModel?: LanguageModel;
46
+ /**
47
+ * Batch size for TextCleaner text normalization (synchronous processing). Required.
48
+ */
49
+ textCleanerBatchSize: number;
50
+ /**
51
+ * Batch size for CaptionParser LLM parsing (async parallel processing). Required.
52
+ */
53
+ captionParserBatchSize: number;
54
+ /**
55
+ * Batch size for CaptionValidator LLM validation (async parallel processing). Required.
56
+ */
57
+ captionValidatorBatchSize: number;
58
+ /**
59
+ * Maximum retry count (optional, default: 3)
60
+ */
61
+ maxRetries?: number;
62
+ /**
63
+ * Enable fallback retry mechanism - automatically retries with fallback model on failure (default: true)
64
+ * Set to false to disable automatic fallback retry and fail immediately on component-specific model errors
65
+ */
66
+ enableFallbackRetry?: boolean;
67
+ /**
68
+ * Abort signal for cancellation support (optional).
69
+ * When aborted, processing stops at the next checkpoint between stages.
70
+ */
71
+ abortSignal?: AbortSignal;
72
+ }
73
+ /**
74
+ * DocumentProcessor
75
+ *
76
+ * Main class that converts DoclingDocument to ProcessedDocument.
77
+ *
78
+ * ## Conversion Process
79
+ *
80
+ * 1. Initialize RefResolver - indexing for $ref resolution
81
+ * 2. Initialize IdGenerator - unique ID generator
82
+ * 3. Text filtering and PageRangeMap generation (pageRangeParserModel)
83
+ * 4. TOC extraction (tocExtractorModel; validatorModel and visionTocExtractorModel for validation/vision fallback) - core step
84
+ * 5. Parallel processing block:
85
+ * - Images conversion (caption extraction)
86
+ * - Tables conversion (excluding TOC tables); footnotes are also converted in this step
87
+ * 6. Chapters conversion (based on TOC)
88
+ * 7. Assemble ProcessedDocument
89
+ *
90
+ * @example
91
+ * ```typescript
92
+ * import { openai } from '@ai-sdk/openai';
93
+ * import { anthropic } from '@ai-sdk/anthropic';
94
+ * import { DocumentProcessor } from '@heripo/document-processor';
95
+ * import { getLogger } from '@heripo/logger';
96
+ *
97
+ * const logger = getLogger();
98
+ *
99
+ * // Basic usage - all components use the fallback model
100
+ * const processor = new DocumentProcessor({
101
+ * logger, fallbackModel: anthropic('claude-opus-4-5-20251101'), // Frontier model for reliable fallback
102
+ * textCleanerBatchSize: 20, captionParserBatchSize: 10, captionValidatorBatchSize: 10, // Batch sizes are required options
103
+ * });
104
+ *
105
+ * // Advanced usage - component-specific models with frontier fallback
106
+ * const advancedProcessor = new DocumentProcessor({
107
+ * logger,
108
+ * fallbackModel: anthropic('claude-opus-4-5-20251101'), // Frontier model for fallback
109
+ * pageRangeParserModel: openai('gpt-5.2'), // Vision-capable
110
+ * tocExtractorModel: openai('gpt-5-mini'), // Structured output
111
+ * validatorModel: openai('gpt-5.2'), // Validation (TOC + caption)
112
+ * visionTocExtractorModel: openai('gpt-5.1'), // Vision-capable
113
+ * captionParserModel: openai('gpt-5-mini'),
114
+ * textCleanerBatchSize: 20, // Sync text processing
115
+ * captionParserBatchSize: 10, // LLM caption parsing
116
+ * captionValidatorBatchSize: 10, // LLM caption validation
117
+ * maxRetries: 3,
118
+ * });
119
+ *
120
+ * const result = await processor.process(
121
+ * doclingDoc,
122
+ * 'report-001',
123
+ * outputPath
124
+ * );
125
+ * ```
126
+ */
127
+ declare class DocumentProcessor {
128
+ private readonly logger;
129
+ private readonly fallbackModel;
130
+ private readonly pageRangeParserModel;
131
+ private readonly tocExtractorModel;
132
+ private readonly validatorModel;
133
+ private readonly visionTocExtractorModel;
134
+ private readonly captionParserModel;
135
+ private readonly textCleanerBatchSize;
136
+ private readonly captionParserBatchSize;
137
+ private readonly captionValidatorBatchSize;
138
+ private readonly maxRetries;
139
+ private readonly enableFallbackRetry;
140
+ private readonly abortSignal?;
141
+ private idGenerator;
142
+ private refResolver?;
143
+ private pageRangeParser?;
144
+ private tocFinder?;
145
+ private tocExtractor?;
146
+ private tocContentValidator?;
147
+ private captionValidator?;
148
+ private visionTocExtractor?;
149
+ private captionParser?;
150
+ private chapterConverter?;
151
+ private textCleaner;
152
+ private readonly usageAggregator;
153
+ constructor(options: DocumentProcessorOptions);
154
+ /**
155
+ * Check if abort has been requested and throw error if so
156
+ *
157
+ * @throws {Error} with name 'AbortError' if aborted
158
+ */
159
+ private checkAborted;
160
+ /**
161
+ * Converts DoclingDocument to ProcessedDocument with token usage tracking.
162
+ *
163
+ * Conversion process:
164
+ * 1. Initialize processors and resolvers
165
+ * 2. Normalize and filter texts
166
+ * 3. Clean texts and parse page ranges (parallel)
167
+ * 4. Extract table of contents
168
+ * 5. Convert images and tables (parallel)
169
+ * 6. Convert chapters and link resources
170
+ * 7. Assemble final ProcessedDocument
171
+ * 8. Collect and report token usage
172
+ *
173
+ * @param doclingDoc - Original document extracted from Docling SDK
174
+ * @param reportId - Report unique identifier
175
+ * @param outputPath - Path containing images and pages subdirectories (images/image_0.png, pages/page_0.png, etc.)
176
+ * @returns Promise resolving to the document processing result with ProcessedDocument and token usage report
177
+ *
178
+ * @throws {TocExtractError} When TOC extraction fails
179
+ * @throws {PageRangeParseError} When page range parsing fails
180
+ * @throws {ConversionError} When error occurs during conversion
181
+ */
182
+ process(doclingDoc: DoclingDocument, reportId: string, outputPath: string): Promise<DocumentProcessResult>;
183
+ /**
184
+ * Initialize all processors and resolvers
185
+ *
186
+ * Sets up RefResolver, PageRangeParser, TocFinder, and TocExtractor
187
+ */
188
+ private initializeProcessors;
189
+ /**
190
+ * Normalize and filter texts using TextCleaner
191
+ *
192
+ * Performs basic text normalization (unicode, whitespace, punctuation)
193
+ * and filters out invalid texts (empty, numbers-only, etc.)
194
+ */
195
+ private normalizeAndFilterTexts;
196
+ /**
197
+ * Parse page ranges using Vision LLM
198
+ *
199
+ * Extracts actual page numbers from page images and creates mapping.
200
+ * Token usage is automatically tracked by PageRangeParser into the shared aggregator.
201
+ */
202
+ private parsePageRanges;
203
+ /**
204
+ * Convert images, tables, and footnotes
205
+ *
206
+ * Runs conversions:
207
+ * - Images conversion (with caption extraction)
208
+ * - Tables conversion (with caption extraction, excluding TOC tables)
209
+ * - Footnotes conversion (synchronous, from text items with label='footnote')
210
+ */
211
+ private convertResources;
212
+ /**
213
+ * Convert footnotes
214
+ *
215
+ * Extracts footnotes from DoclingDocument text items with label='footnote'
216
+ */
217
+ private convertFootnotes;
218
+ /**
219
+ * Assemble the final ProcessedDocument
220
+ *
221
+ * Creates the ProcessedDocument structure with all converted components
222
+ */
223
+ private assembleProcessedDocument;
224
+ /**
225
+ * Extract table of contents (TOC)
226
+ *
227
+ * Uses rule-based extraction with LLM validation and vision fallback:
228
+ * 1. TocFinder - find TOC area in document (rule-based)
229
+ * 2. MarkdownConverter - convert TOC items to Markdown
230
+ * 3. TocContentValidator - validate if content is actually a TOC (LLM)
231
+ * 4. If invalid: VisionTocExtractor - extract from page images (vision LLM fallback)
232
+ * 5. TocExtractor - LLM-based structured extraction
233
+ */
234
+ private extractTableOfContents;
235
+ /**
236
+ * Process resource captions (for images and tables)
237
+ *
238
+ * Common caption processing pipeline:
239
+ * 1. Parse captions in batch
240
+ * 2. Validate parsed captions
241
+ * 3. Reparse failed captions with fallback model
242
+ *
243
+ * @param captionTexts - Array of caption texts to process
244
+ * @param resourceType - Type of resource for logging (e.g., 'image', 'table')
245
+ * @returns Parsed captions with index mapping
246
+ */
247
+ private processResourceCaptions;
248
+ /**
249
+ * Extract caption text from resource
250
+ *
251
+ * Handles both string references and $ref resolution
252
+ */
253
+ private extractCaptionText;
254
+ /**
255
+ * Convert images
256
+ *
257
+ * Converts pictures from DoclingDocument to ProcessedImage
258
+ */
259
+ private convertImages;
260
+ /**
261
+ * Convert tables
262
+ *
263
+ * Converts tables from DoclingDocument to ProcessedTable
264
+ */
265
+ private convertTables;
266
+ /**
267
+ * Convert chapters and link resources
268
+ *
269
+ * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
270
+ * Falls back to single "Document" chapter when TOC is empty.
271
+ */
272
+ private convertChapters;
273
+ /**
274
+ * Create a fallback chapter when TOC is not available
275
+ *
276
+ * Creates a single "Document" chapter containing all text blocks,
277
+ * images, tables, and footnotes from the document.
278
+ */
279
+ private createFallbackChapter;
280
+ }
281
+
282
+ /**
283
+ * Base options for all LLM-based components (extended by VisionLLMComponentOptions and CaptionParserOptions)
284
+ */
285
+ interface BaseLLMComponentOptions {
286
+ /**
287
+ * Maximum retry count for LLM API (default: 3)
288
+ */
289
+ maxRetries?: number;
290
+ /**
291
+ * Temperature for LLM generation (default: 0)
292
+ */
293
+ temperature?: number;
294
+ /**
295
+ * Abort signal for cancellation support (optional)
296
+ */
297
+ abortSignal?: AbortSignal;
298
+ }
299
+ /**
300
+ * Abstract base class for all LLM-based components
301
+ *
302
+ * Provides common functionality:
303
+ * - Consistent logging with component name prefix
304
+ * - Token usage tracking via optional aggregator
305
+ * - Standard configuration (model, fallback, retries, temperature, abort signal)
306
+ *
307
+ * Subclasses must implement buildSystemPrompt() and buildUserPrompt().
308
+ */
309
+ declare abstract class BaseLLMComponent {
310
+ protected readonly logger: LoggerMethods;
311
+ protected readonly model: LanguageModel;
312
+ protected readonly fallbackModel?: LanguageModel;
313
+ protected readonly maxRetries: number;
314
+ protected readonly temperature: number;
315
+ protected readonly componentName: string;
316
+ protected readonly aggregator?: LLMTokenUsageAggregator;
317
+ protected readonly abortSignal?: AbortSignal;
318
+ /**
319
+ * Constructor for BaseLLMComponent
320
+ *
321
+ * @param logger - Logger instance for logging
322
+ * @param model - Primary language model for LLM calls
323
+ * @param componentName - Name of the component for logging (e.g., "TocExtractor")
324
+ * @param options - Optional configuration (maxRetries, temperature, abortSignal)
325
+ * @param fallbackModel - Optional fallback model for retry on failure
326
+ * @param aggregator - Optional token usage aggregator for tracking LLM calls
327
+ */
328
+ constructor(logger: LoggerMethods, model: LanguageModel, componentName: string, options?: BaseLLMComponentOptions, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator);
329
+ /**
330
+ * Log a message with consistent component name prefix
331
+ *
332
+ * @param level - Log level ('info', 'warn', 'error')
333
+ * @param message - Message to log (without prefix)
334
+ * @param args - Additional arguments to pass to logger
335
+ */
336
+ protected log(level: 'info' | 'warn' | 'error', message: string, ...args: unknown[]): void;
337
+ /**
338
+ * Track token usage to aggregator if available
339
+ *
340
+ * @param usage - Token usage information to track
341
+ */
342
+ protected trackUsage(usage: ExtendedTokenUsage): void;
343
+ /**
344
+ * Create an empty usage record for edge cases (e.g., empty input)
345
+ *
346
+ * @param phase - Phase name for the usage record
347
+ * @returns Empty ExtendedTokenUsage object
348
+ */
349
+ protected createEmptyUsage(phase: string): ExtendedTokenUsage;
350
+ /**
351
+ * Build system prompt for LLM call
352
+ *
353
+ * Subclasses must implement this to provide component-specific system prompts.
354
+ */
355
+ protected abstract buildSystemPrompt(...args: unknown[]): string;
356
+ /**
357
+ * Build user prompt for LLM call
358
+ *
359
+ * Subclasses must implement this to construct prompts from input data.
360
+ */
361
+ protected abstract buildUserPrompt(...args: unknown[]): string;
362
+ }
363
+
364
+ /**
365
+ * Abstract base class for text-based LLM components
366
+ *
367
+ * Extends BaseLLMComponent with helper method for text-based LLM calls
368
+ * using LLMCaller.call() (non-vision).
369
+ *
370
+ * Subclasses: TocExtractor, CaptionParser, BaseValidator
371
+ */
372
+ declare abstract class TextLLMComponent extends BaseLLMComponent {
373
+ constructor(logger: LoggerMethods, model: LanguageModel, componentName: string, options?: BaseLLMComponentOptions, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator);
374
+ /**
375
+ * Call LLM with text-based prompts using LLMCaller.call()
376
+ *
377
+ * @template TSchema - Zod schema type for response validation
378
+ * @param schema - Zod schema for response validation
379
+ * @param systemPrompt - System prompt for LLM
380
+ * @param userPrompt - User prompt for LLM
381
+ * @param phase - Phase name for tracking (e.g., 'extraction', 'validation')
382
+ * @returns Promise resolving to `{ output, usage }`: the output typed by the schema and the token usage information
383
+ */
384
+ protected callTextLLM<TSchema extends z.ZodType>(schema: TSchema, systemPrompt: string, userPrompt: string, phase: string): Promise<{
385
+ output: z.infer<TSchema>;
386
+ usage: ExtendedTokenUsage;
387
+ }>;
388
+ }
389
+
390
+ /**
391
+ * Options for VisionLLMComponent (currently adds no fields beyond BaseLLMComponentOptions)
392
+ */
393
+ interface VisionLLMComponentOptions extends BaseLLMComponentOptions {
394
+ }
395
+ /**
396
+ * Image content structure for vision LLM messages (the return type of VisionLLMComponent.buildImageContent)
397
+ */
398
+ interface ImageContent {
399
+ type: 'image';
400
+ image: string;
401
+ }
402
+ /**
403
+ * Abstract base class for vision-based LLM components
404
+ *
405
+ * Extends BaseLLMComponent with helper methods for vision-based LLM calls
406
+ * using LLMCaller.callVision(). The constructor additionally takes an outputPath.
407
+ *
408
+ * Subclasses: PageRangeParser, VisionTocExtractor
409
+ */
410
+ declare abstract class VisionLLMComponent extends BaseLLMComponent {
411
+ protected readonly outputPath: string;
412
+ constructor(logger: LoggerMethods, model: LanguageModel, componentName: string, outputPath: string, options?: VisionLLMComponentOptions, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator);
413
+ /**
414
+ * Call LLM with vision capabilities using LLMCaller.callVision()
415
+ *
416
+ * @template TSchema - Zod schema type for response validation
417
+ * @param schema - Zod schema for response validation
418
+ * @param messages - Messages array ('user' or 'assistant' roles) including image content
419
+ * @param phase - Phase name for tracking (e.g., 'extraction', 'sampling')
420
+ * @returns Promise resolving to `{ output, usage }`: the output typed by the schema and the token usage information
421
+ */
422
+ protected callVisionLLM<TSchema extends z.ZodType>(schema: TSchema, messages: Array<{
423
+ role: 'user' | 'assistant';
424
+ content: unknown[] | string;
425
+ }>, phase: string): Promise<{
426
+ output: z.infer<TSchema>;
427
+ usage: ExtendedTokenUsage;
428
+ }>;
429
+ /**
430
+ * Load an image file and encode it as base64
431
+ *
432
+ * @param imagePath - Absolute path to the image file
433
+ * @returns Base64 encoded image string
434
+ */
435
+ protected loadImageAsBase64(imagePath: string): string;
436
+ /**
437
+ * Build image content object for vision LLM messages
438
+ *
439
+ * @param imagePath - Path to the image file (relative to outputPath or absolute)
440
+ * @param mimeType - Optional MIME type of the image (default: 'image/png')
441
+ * @returns ImageContent object for LLM message
442
+ */
443
+ protected buildImageContent(imagePath: string, mimeType?: string): ImageContent;
444
+ }
445
+
446
+ /**
447
+ * Table of Contents Entry
448
+ *
449
+ * Recursive tree node (via `children`) representing the table of contents structure of a document.
450
+ */
451
+ interface TocEntry {
452
+ /**
453
+ * Chapter title
454
+ */
455
+ title: string;
456
+ /**
457
+ * Hierarchy depth (1, 2, 3...)
458
+ */
459
+ level: number;
460
+ /**
461
+ * Starting page number
462
+ */
463
+ pageNo: number;
464
+ /**
465
+ * Child TOC entries (optional)
466
+ */
467
+ children?: TocEntry[];
468
+ }
469
+ /**
470
+ * TOC Area Search Result: the item references and page span of a located TOC area
471
+ */
472
+ interface TocAreaResult {
473
+ /**
474
+ * Group or table item references corresponding to the table of contents
475
+ */
476
+ itemRefs: string[];
477
+ /**
478
+ * Page on which the TOC area starts
479
+ */
480
+ startPage: number;
481
+ /**
482
+ * Page on which the TOC area ends
483
+ */
484
+ endPage: number;
485
+ }
486
+ /**
487
+ * Page size group: a size identifier together with the PDF pages that share that size
488
+ */
489
+ interface PageSizeGroup {
490
+ /**
491
+ * Size identifier (width x height)
492
+ */
493
+ sizeKey: string;
494
+ /**
495
+ * PDF page numbers with this size specification
496
+ */
497
+ pageNos: number[];
498
+ }
499
+
500
+ /**
501
+ * CaptionParser options (BaseLLMComponentOptions plus an optional component name)
502
+ */
503
+ interface CaptionParserOptions extends BaseLLMComponentOptions {
504
+ /**
505
+ * Custom component name for token usage tracking (optional).
506
+ * Defaults to 'CaptionParser'.
507
+ */
508
+ componentName?: string;
509
+ }
510
+ /**
511
+ * CaptionParser
512
+ *
513
+ * Extracts caption prefix and number from image/table captions using LLM.
514
+ * Preserves original spacing from input text.
515
+ * Extends TextLLMComponent for standardized LLM call handling.
516
+ *
517
+ * ## Algorithm
518
+ *
519
+ * 1. Collect caption texts
520
+ * 2. Split into batches based on batchSize
521
+ * 3. For each batch: call LLM to extract caption prefix + number
522
+ * 4. Flatten results and return
523
+ */
524
+ declare class CaptionParser extends TextLLMComponent {
525
+ constructor(logger: LoggerMethods, model: LanguageModel, options?: CaptionParserOptions, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator);
526
+ /**
527
+ * Parse batch of captions
528
+ *
529
+ * @param captions - Array of caption full texts
530
+ * @param batchSize - Batch size for processing. Set to 0 for sequential processing without batching.
531
+ * @param overrideModel - Optional model to use instead of the default model
532
+ * @returns Promise resolving to an array of Caption objects with num extracted (maintains original order)
533
+ */
534
+ parseBatch(captions: string[], batchSize: number, overrideModel?: LanguageModel): Promise<Caption[]>;
535
+ /**
536
+ * Internal: Parse batch of captions using LLM
537
+ *
538
+ * @param captions - Batch of caption texts with original indices
539
+ * @param model - Effective model to use
540
+ * @returns Array of Caption objects indexed correctly
541
+ */
542
+ private parseBatchInternal;
543
+ /**
544
+ * Extract and normalize caption number from full text
545
+ *
546
+ * Finds the extracted num pattern in the full text and extracts it
547
+ * with original casing. Handles case-insensitive matching.
548
+ *
549
+ * @param fullText - The full caption text
550
+ * @param extractedNum - The num extracted by LLM (may have different casing)
551
+ * @returns Normalized num or undefined if no match
552
+ */
553
+ private extractNumFromFullText;
554
+ /**
555
+ * Build system prompt for caption parsing
556
+ *
557
+ * @param mode - Optional: 'batch' for multiple captions, 'single' for single caption
558
+ */
559
+ protected buildSystemPrompt(mode?: 'batch' | 'single'): string;
560
+ /**
561
+ * Build user prompt for batch caption parsing from caption texts paired with their indices
562
+ */
563
+ protected buildUserPrompt(captions: Array<{
564
+ index: number;
565
+ text: string;
566
+ }>): string;
567
+ /**
568
+ * Build user prompt for single caption parsing
569
+ */
570
+ private buildUserPromptSingle;
571
+ }
572
+ /**
573
+ * Error thrown when caption parsing fails; accepts standard ErrorOptions (e.g. `cause`)
574
+ */
575
+ declare class CaptionParseError extends Error {
576
+ constructor(message: string, options?: ErrorOptions);
577
+ }
578
+
579
+ /**
580
+ * PageRangeParseError
581
+ *
582
+ * Custom error thrown when page range parsing fails.
583
+ */
584
+ declare class PageRangeParseError extends Error {
585
+ constructor(message: string, options?: ErrorOptions);
586
+ /**
587
+ * Extract an error message string from a value of unknown error type
588
+ */
589
+ static getErrorMessage(error: unknown): string;
590
+ /**
591
+ * Create a PageRangeParseError from an unknown error value plus a context string
592
+ */
593
+ static fromError(context: string, error: unknown): PageRangeParseError;
594
+ }
595
+
596
+ /**
597
+ * Pattern types for page number sequences (string-valued enum)
598
+ */
599
+ declare enum PagePattern {
600
+ /** Simple increment: [1, 2, 3, 4, ...] */
601
+ SIMPLE_INCREMENT = "simple_increment",
602
+ /** Double-sided scan: [1-2, 3-4, 5-6, ...] */
603
+ DOUBLE_SIDED = "double_sided",
604
+ /** Offset pattern: PDF page != actual page (consistent offset) */
605
+ OFFSET = "offset",
606
+ /** No clear pattern detected */
607
+ UNKNOWN = "unknown"
608
+ }
609
+ /**
610
+ * PageRangeParser
611
+ *
612
+ * Extracts actual document page numbers from PDF page images using Vision LLM.
613
+ * Uses random sampling + pattern detection to minimize LLM calls.
614
+ * Extends VisionLLMComponent; unlike the base class, its constructor takes maxRetries and abortSignal as positional arguments.
615
+ *
616
+ * ## Algorithm
617
+ *
618
+ * 1. Group pages by size (consecutive pages with same dimensions)
619
+ * 2. For each group:
620
+ * - If ≤3 pages: send all to LLM at once
621
+ * - If >3 pages: random sample 3 pages, detect pattern, apply to all
622
+ * 3. Post-process: handle drops, normalize negatives, backfill failed pages
623
+ */
624
+ declare class PageRangeParser extends VisionLLMComponent {
625
+ private readonly SAMPLE_SIZE;
626
+ private readonly MAX_PATTERN_RETRIES;
627
+ private readonly SIZE_TOLERANCE;
628
+ constructor(logger: LoggerMethods, model: LanguageModel, outputPath: string, maxRetries?: number, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator, abortSignal?: AbortSignal);
629
+ /**
630
+ * Main parse method
631
+ *
632
+ * Extracts page range mapping from DoclingDocument using Vision LLM.
633
+ * Automatically tracks token usage in the aggregator if one was provided.
634
+ *
635
+ * @param doclingDoc - DoclingDocument to extract page ranges from
636
+ * @returns Promise resolving to `pageRangeMap` (PageRange values keyed by number) and `usage` (token usage records)
637
+ */
638
+ parse(doclingDoc: DoclingDocument): Promise<{
639
+ pageRangeMap: Record<number, PageRange>;
640
+ usage: ExtendedTokenUsage[];
641
+ }>;
642
+ /**
643
+ * Extract pages array from DoclingDocument
644
+ */
645
+ private extractPages;
646
+ /**
647
+ * Analyze page sizes and group consecutive pages with same dimensions
648
+ */
649
+ private analyzeSizes;
650
+ /**
651
+ * Create size key with tolerance for floating point comparison
652
+ */
653
+ private createSizeKey;
654
+ /**
655
+ * Process a single size group
656
+ */
657
+ private processGroup;
658
+ /**
659
+ * Select random samples from page numbers
660
+ */
661
+ private selectRandomSamples;
662
+ /**
663
+ * Extract page numbers from multiple pages in a single LLM call
664
+ */
665
+ private extractMultiplePages;
666
+ /**
667
+ * Detect pattern from sample results
668
+ */
669
+ private detectPattern;
670
+ /**
671
+ * Apply detected pattern to generate page range map
672
+ */
673
+ private applyPattern;
674
+ /**
675
+ * Convert sample results to page range map (for small groups)
676
+ */
677
+ private samplesToMap;
678
+ /**
679
+ * Post-process the page range map
680
+ */
681
+ private postProcess;
682
+ /**
683
+ * Detect and handle outlier page numbers at the beginning of document
684
+ *
685
+ * When early PDF pages have abnormally high page numbers compared to
686
+ * subsequent pages (e.g., PDF 1-9 = 75-83, but PDF 10+ = 2,3,4...),
687
+ * the LLM likely misread figure/photo numbers as page numbers.
688
+ *
689
+ * Detection: If page numbers at the beginning are significantly higher
690
+ * than subsequent pages (which follow a normal pattern), mark them as failed.
691
+ */
692
+ private detectAndHandleOutliers;
693
+ /**
694
+ * Find the start index of a "normal" sequence in the page range map
695
+ *
696
+ * A normal sequence is defined as at least 3 consecutive PDF pages where:
697
+ * - Page numbers are increasing (for single-page) or increasing by 2 (for double-sided)
698
+ * - The pattern is consistent
699
+ *
700
+ * Returns the index in pdfPages array, or null if not found.
701
+ */
702
+ private findNormalSequenceStart;
703
+ /**
704
+ * Check if a page range represents a double-sided scan
705
+ */
706
+ private isDoubleSidedRange;
707
+ /**
708
+ * Detect and handle page number drops
709
+ *
710
+ * When page numbers suddenly decrease (e.g., 8,9 -> 3,4),
711
+ * recalculate previous pages based on the drop point.
712
+ */
713
+ private detectAndHandleDrops;
714
+ /**
715
+ * Normalize negative page numbers to 0
716
+ */
717
+ private normalizeNegatives;
718
+ /**
719
+ * Backfill pages marked with 0 using detected pattern
720
+ */
721
+ private backfillFailedPages;
722
+ /**
723
+ * Build system prompt for Vision LLM
724
+ */
725
+ protected buildSystemPrompt(): string;
726
+ /**
727
+ * Build user prompt for Vision LLM from the given page numbers
728
+ */
729
+ protected buildUserPrompt(pageNos: number[]): string;
730
+ }
731
+
732
+ /**
733
+ * TocExtractError
734
+ *
735
+ * Base error class for TOC extraction failures (extended by TocNotFoundError and TocParseError).
736
+ */
737
+ declare class TocExtractError extends Error {
738
+ constructor(message: string, options?: ErrorOptions);
739
+ /**
740
+ * Extract an error message string from a value of unknown error type
741
+ */
742
+ static getErrorMessage(error: unknown): string;
743
+ /**
744
+ * Create a TocExtractError from an unknown error value plus a context string
745
+ */
746
+ static fromError(context: string, error: unknown): TocExtractError;
747
+ }
748
+ /**
749
+ * TocNotFoundError
750
+ *
751
+ * Error thrown when TOC area cannot be found in the document. The message argument is optional.
752
+ */
753
+ declare class TocNotFoundError extends TocExtractError {
754
+ constructor(message?: string);
755
+ }
756
+ /**
757
+ * TocParseError
758
+ *
759
+ * Error thrown when LLM fails to parse TOC structure; accepts standard ErrorOptions (e.g. `cause`).
760
+ */
761
+ declare class TocParseError extends TocExtractError {
762
+ constructor(message: string, options?: ErrorOptions);
763
+ }
764
+
765
+ /**
766
+ * Validation options for TocValidator (all fields are optional)
767
+ */
768
+ interface TocValidationOptions {
769
+ /**
770
+ * Total page count of the document (for range validation).
771
+ * If not provided, page range upper bound validation is skipped.
772
+ */
773
+ totalPages?: number;
774
+ /**
775
+ * Maximum allowed title length (default: 200)
776
+ */
777
+ maxTitleLength?: number;
778
+ }
779
+
780
+ /**
781
+ * Resolves $ref references in DoclingDocument to actual objects.
782
+ *
783
+ * DoclingDocument uses JSON references (e.g., "#/texts/0") to link nodes.
784
+ * This class builds an index for quick lookups of texts, pictures, tables, and groups.
785
+ */
786
+ declare class RefResolver {
787
+ private readonly logger;
788
+ private readonly textMap;
789
+ private readonly pictureMap;
790
+ private readonly tableMap;
791
+ private readonly groupMap;
792
+ constructor(logger: LoggerMethods, doc: DoclingDocument);
793
+ /**
794
+ * Build an index mapping self_ref to the actual item
795
+ */
796
+ private buildIndex;
797
+ /**
798
+ * Resolve a $ref string to the actual item (text, picture, table, or group)
799
+ * @param ref - Reference string (e.g., "#/texts/0")
800
+ * @returns The resolved item, or null if not found
801
+ */
802
+ resolve(ref: string): DoclingTextItem | DoclingPictureItem | DoclingTableItem | DoclingGroupItem | null;
803
+ /**
804
+ * Resolve a text reference
805
+ * @param ref - Reference string (e.g., "#/texts/0")
806
+ * @returns The resolved text item, or null if not found
807
+ */
808
+ resolveText(ref: string): DoclingTextItem | null;
809
+ /**
810
+ * Resolve a picture reference
811
+ * @param ref - Reference string (e.g., "#/pictures/0")
812
+ * @returns The resolved picture item, or null if not found
813
+ */
814
+ resolvePicture(ref: string): DoclingPictureItem | null;
815
+ /**
816
+ * Resolve a table reference
817
+ * @param ref - Reference string (e.g., "#/tables/0")
818
+ * @returns The resolved table item, or null if not found
819
+ */
820
+ resolveTable(ref: string): DoclingTableItem | null;
821
+ /**
822
+ * Resolve a group reference
823
+ * @param ref - Reference string (e.g., "#/groups/0")
824
+ * @returns The resolved group item, or null if not found
825
+ */
826
+ resolveGroup(ref: string): DoclingGroupItem | null;
827
+ /**
828
+ * Resolve multiple references at once
829
+ * @param refs - Array of reference objects with $ref property
830
+ * @returns Array of resolved items (null for unresolved references)
831
+ */
832
+ resolveMany(refs: Array<{
833
+ $ref: string;
834
+ }>): Array<DoclingTextItem | DoclingPictureItem | DoclingTableItem | DoclingGroupItem | null>;
835
+ }
836
+
837
+ /**
838
+ * Generates sequential IDs for different types of items.
839
+ *
840
+ * IDs are formatted as: `{prefix}-{number}` where number is zero-padded to 3 digits.
841
+ * - Chapters: ch-001, ch-002, ...
842
+ * - Images: img-001, img-002, ...
843
+ * - Tables: tbl-001, tbl-002, ...
844
+ * - Footnotes: ftn-001, ftn-002, ...
845
+ * Each type maintains its own independent counter.
846
+ */
847
+ declare class IdGenerator {
848
+ private chapterCounter;
849
+ private imageCounter;
850
+ private tableCounter;
851
+ private footnoteCounter;
852
+ /**
853
+ * Generate a chapter ID
854
+ * @returns A chapter ID in the format "ch-001"
855
+ */
856
+ generateChapterId(): string;
857
+ /**
858
+ * Generate an image ID
859
+ * @returns An image ID in the format "img-001"
860
+ */
861
+ generateImageId(): string;
862
+ /**
863
+ * Generate a table ID
864
+ * @returns A table ID in the format "tbl-001"
865
+ */
866
+ generateTableId(): string;
867
+ /**
868
+ * Generate a footnote ID
869
+ * @returns A footnote ID in the format "ftn-001"
870
+ */
871
+ generateFootnoteId(): string;
872
+ /**
873
+ * Reset all counters to zero
874
+ */
875
+ reset(): void;
876
+ /**
877
+ * Get current counter values for chapter, image, table, and footnote (for testing/debugging)
878
+ */
879
+ getCounters(): {
880
+ chapter: number;
881
+ image: number;
882
+ table: number;
883
+ footnote: number;
884
+ };
885
+ /**
886
+ * Pad a number to 3 digits with leading zeros
887
+ */
888
+ private padNumber;
889
+ }
890
+
891
+ /**
892
+ * TOC heading keywords for different languages (spacing and case variants are listed explicitly)
893
+ * Korean: 목차, 차례, 목 차
894
+ * Chinese: 目录, 目 录, 内容, 內容
895
+ * Japanese: 目次, 目 次
896
+ * English: Contents, Table of Contents, TABLE OF CONTENTS, CONTENTS
897
+ */
898
+ declare const TOC_KEYWORDS: readonly ["목차", "차례", "목 차", "目录", "目 录", "内容", "內容", "目次", "目 次", "Contents", "Table of Contents", "TABLE OF CONTENTS", "CONTENTS"];
899
+ /**
900
+ * Continuation markers that indicate a TOC spanning multiple pages
901
+ * Korean: 목차(계속), 목차 (계속), (계속)
902
+ * Chinese: 目录(续), 目录 (续), (续), 续表
903
+ * Japanese: 目次(続), 目次 (続), (続)
904
+ * English: (continued), (Continued), (CONTINUED), continued
905
+ */
906
+ declare const CONTINUATION_MARKERS: readonly ["목차(계속)", "목차 (계속)", "(계속)", "目录(续)", "目录 (续)", "(续)", "续表", "目次(続)", "目次 (続)", "(続)", "(continued)", "(Continued)", "(CONTINUED)", "continued"];
907
+ /**
908
+ * Page number regex used to detect TOC-like structures
909
+ * Matches text ending with a number, such as "... 123" or ".... 45"
910
+ */
911
+ declare const PAGE_NUMBER_PATTERN: RegExp;
912
+ /**
913
+ * Options for TocFinder
914
+ */
915
+ interface TocFinderOptions {
916
+ /**
917
+ * Maximum number of pages to search for the TOC (default: 10)
918
+ */
919
+ maxSearchPages?: number;
920
+ /**
921
+ * Additional custom TOC keywords (optional)
922
+ */
923
+ additionalKeywords?: string[];
924
+ }
925
+ /**
926
+ * TocFinder
927
+ *
928
+ * Finds the TOC area in a DoclingDocument using a multi-stage search strategy:
929
+ * 1. Keyword search in texts (section_header, list_item labels)
930
+ * 2. Structure analysis for lists/tables with page number patterns
931
+ * 3. Position heuristic (prioritize early pages)
932
+ */
933
+ declare class TocFinder {
934
+ private readonly logger;
935
+ private readonly refResolver;
936
+ private readonly maxSearchPages;
937
+ private readonly keywords;
938
+ constructor(logger: LoggerMethods, refResolver: RefResolver, options?: TocFinderOptions);
939
+ /**
940
+ * Find the TOC area in the given document and return it
941
+ *
942
+ * @throws {TocNotFoundError} When no TOC area is found
943
+ */
944
+ find(doc: DoclingDocument): TocAreaResult;
945
+ /**
946
+ * Stage 1: Search by keywords in text items
947
+ */
948
+ private findByKeywords;
949
+ /**
950
+ * Stage 2: Search by structure (lists/tables with page numbers)
951
+ */
952
+ private findByStructure;
953
+ /**
954
+ * Find the TOC container (group or table) from a parent reference
955
+ */
956
+ private findTocContainer;
957
+ /**
958
+ * Check if a group contains a TOC-like structure
959
+ */
960
+ private isGroupTocLike;
961
+ /**
962
+ * Check if a table contains a TOC-like structure
963
+ */
964
+ private isTableTocLike;
965
+ /**
966
+ * Expand the TOC area to consecutive pages
967
+ */
968
+ private expandToConsecutivePages;
969
+ /**
970
+ * Find TOC continuation items on a specific page
971
+ */
972
+ private findContinuationOnPage;
973
+ /**
974
+ * Check if text contains a TOC keyword
975
+ */
976
+ private containsTocKeyword;
977
+ /**
978
+ * Check if text contains a continuation marker
979
+ */
980
+ private hasContinuationMarker;
981
+ /**
982
+ * Get the first page number of a group by checking its children
983
+ */
984
+ private getGroupFirstPage;
985
+ /**
986
+ * Calculate the score for a group candidate
987
+ * Higher score = better match
988
+ */
989
+ private calculateScore;
990
+ /**
991
+ * Calculate the score for a table candidate
992
+ */
993
+ private calculateTableScore;
994
+ }
995
+
996
+ /**
997
+ * Zod schema for the recursive TocEntry structure
998
+ */
999
+ declare const TocEntrySchema: z.ZodType<TocEntry>;
1000
+ /**
1001
+ * Zod schema for the LLM response: an object with an `entries` array of TocEntry items
1002
+ */
1003
+ declare const TocResponseSchema: z.ZodObject<{
1004
+ entries: z.ZodArray<z.ZodType<TocEntry, unknown, z.core.$ZodTypeInternals<TocEntry, unknown>>>;
1005
+ }, z.core.$strip>;
1006
+ type TocResponse = z.infer<typeof TocResponseSchema>;
1007
+ /**
1008
+ * Options for TocExtractor
1009
+ */
1010
+ interface TocExtractorOptions extends BaseLLMComponentOptions {
1011
+ /**
1012
+ * Options for validating the extracted TOC (optional)
1013
+ * If not provided, validation is performed with default settings
1014
+ */
1015
+ validation?: TocValidationOptions;
1016
+ /**
1017
+ * Whether to skip validation entirely (default: false)
1018
+ */
1019
+ skipValidation?: boolean;
1020
+ }
1021
+ /**
1022
+ * TocExtractor
1023
+ *
1024
+ * Uses a high-performance LLM to extract a structured TOC from its Markdown representation.
1025
+ * Extends TextLLMComponent for standardized LLM call handling.
1026
+ */
1027
+ declare class TocExtractor extends TextLLMComponent {
1028
+ private readonly validationOptions?;
1029
+ private readonly skipValidation;
1030
+ constructor(logger: LoggerMethods, model: LanguageModel, options?: TocExtractorOptions, fallbackModel?: LanguageModel, abortSignal?: AbortSignal);
1031
+ /**
1032
+ * Extract the TOC structure from Markdown
1033
+ *
1034
+ * @param markdown - Markdown representation of the TOC area
1035
+ * @returns Object with the entries array and token usage information
1036
+ * @throws {TocParseError} When the LLM fails to parse the structure
1037
+ * @throws {TocValidationError} When validation fails
1038
+ */
1039
+ extract(markdown: string): Promise<{
1040
+ entries: TocEntry[];
1041
+ usage: ExtendedTokenUsage;
1042
+ }>;
1043
+ /**
1044
+ * Validate the extracted entries
1045
+ *
1046
+ * @throws {TocValidationError} When validation fails
1047
+ */
1048
+ private validateEntries;
1049
+ /**
1050
+ * Build the system prompt for TOC extraction
1051
+ */
1052
+ protected buildSystemPrompt(): string;
1053
+ /**
1054
+ * Build the user prompt from the Markdown content
1055
+ */
1056
+ protected buildUserPrompt(markdown: string): string;
1057
+ /**
1058
+ * Normalize and validate the extracted entries
1059
+ */
1060
+ private normalizeEntries;
1061
+ /**
1062
+ * Recursively ensure level consistency
1063
+ *
1064
+ * Children must have level = parent.level + 1
1065
+ */
1066
+ private normalizeLevel;
1067
+ }
1068
+
1069
+ /**
1070
+ * Zod schema for the vision-based TOC extraction response (hasToc, tocMarkdown, continuesOnNextPage)
1071
+ */
1072
+ declare const VisionTocExtractionSchema: z.ZodObject<{
1073
+ hasToc: z.ZodBoolean;
1074
+ tocMarkdown: z.ZodNullable<z.ZodString>;
1075
+ continuesOnNextPage: z.ZodBoolean;
1076
+ }, z.core.$strip>;
1077
+ type VisionTocExtractionResult = z.infer<typeof VisionTocExtractionSchema>;
1078
+ /**
1079
+ * Options for VisionTocExtractor
1080
+ */
1081
+ interface VisionTocExtractorOptions extends VisionLLMComponentOptions {
1082
+ /**
1083
+ * Number of pages in the first batch (default: 10)
1084
+ */
1085
+ firstBatchSize?: number;
1086
+ /**
1087
+ * Number of pages in the second batch (default: 10)
1088
+ */
1089
+ secondBatchSize?: number;
1090
+ }
1091
+ /**
1092
+ * VisionTocExtractor
1093
+ *
1094
+ * Uses a vision LLM to find and extract the TOC directly from page images.
1095
+ * Fallback strategy when rule-based extraction fails or produces invalid content.
1096
+ * Extends VisionLLMComponent for standardized vision LLM call handling.
1097
+ *
1098
+ * Output format matches MarkdownConverter.convert() for consistency.
1099
+ */
1100
+ declare class VisionTocExtractor extends VisionLLMComponent {
1101
+ private readonly firstBatchSize;
1102
+ private readonly secondBatchSize;
1103
+ constructor(logger: LoggerMethods, model: LanguageModel, outputPath: string, options?: VisionTocExtractorOptions, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator);
1104
+ /**
1105
+ * Extract the TOC from page images
1106
+ *
1107
+ * Searches the first batch of pages, then the second batch if not found (pages 1-10, then 11-20, with the default batch sizes).
1108
+ *
1109
+ * @param totalPages - Total number of pages in the document
1110
+ * @returns Extracted TOC markdown or null if not found
1111
+ */
1112
+ extract(totalPages: number): Promise<string | null>;
1113
+ /**
1114
+ * Extract the TOC from a specific batch of pages
1115
+ */
1116
+ private extractFromBatch;
1117
+ /**
1118
+ * Load page images and build the message content
1119
+ */
1120
+ private loadPageImages;
1121
+ /**
1122
+ * Merge markdown from multiple batches
1123
+ */
1124
+ private mergeMarkdown;
1125
+ /**
1126
+ * Build the system prompt for the vision LLM (not used, but required by the abstract class)
1127
+ */
1128
+ protected buildSystemPrompt(): string;
1129
+ /**
1130
+ * Build the user prompt with page range information
1131
+ */
1132
+ protected buildUserPrompt(startPage: number, endPage: number): string;
1133
+ }
1134
+
1135
+ /**
1136
+ * Base options for all validators
1137
+ *
1138
+ * Alias of BaseLLMComponentOptions, kept for backwards compatibility.
1139
+ */
1140
+ type BaseValidatorOptions = BaseLLMComponentOptions;
1141
+ /**
1142
+ * Abstract base class for LLM-based validators
1143
+ *
1144
+ * Extends TextLLMComponent to provide common functionality for validators
1145
+ * that use an LLM to validate/analyze content:
1146
+ * - LLM API call wrapper with LLMCaller (via callLLM method)
1147
+ * - Standard logging patterns (via log method from base class)
1148
+ * - Retry and fallback configuration
1149
+ *
1150
+ * Token usage is tracked by LLMCaller and should be aggregated by DocumentProcessor.
1151
+ *
1152
+ * @template TSchema - Zod schema type for validation
1153
+ * @template TResult - Result type after parsing with the schema (defaults to z.infer<TSchema>)
1154
+ */
1155
+ declare abstract class BaseValidator<TSchema extends z.ZodType, TResult = z.infer<TSchema>> extends TextLLMComponent {
1156
+ /**
1157
+ * Validator name for logging (kept for backwards compatibility)
1158
+ */
1159
+ protected readonly validatorName: string;
1160
+ /**
1161
+ * Constructor for BaseValidator
1162
+ *
1163
+ * @param logger - Logger instance
1164
+ * @param model - Language model to use for validation
1165
+ * @param validatorName - Name of the validator for logging (e.g., "TocContentValidator")
1166
+ * @param options - Optional configuration (maxRetries, temperature)
1167
+ * @param fallbackModel - Optional fallback model for retry on failure
1168
+ * @param aggregator - Optional token usage aggregator for tracking LLM calls
1169
+ */
1170
+ constructor(logger: LoggerMethods, model: LanguageModel, validatorName: string, options?: BaseValidatorOptions, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator);
1171
+ /**
1172
+ * Call the LLM with LLMCaller
1173
+ *
1174
+ * This method provides backwards compatibility with existing validators.
1175
+ * It wraps the parent callTextLLM method but allows passing a custom aggregator.
1176
+ *
1177
+ * @param schema - Zod schema for response validation
1178
+ * @param systemPrompt - System prompt
1179
+ * @param userPrompt - User prompt
1180
+ * @param phase - Phase name for tracking (e.g., 'validation', 'batch-validation')
1181
+ * @param aggregator - Optional token usage aggregator for tracking this call
1182
+ * @returns Parsed and validated LLM response with usage information
1183
+ */
1184
+ protected callLLM(schema: TSchema, systemPrompt: string, userPrompt: string, phase: string, aggregator?: LLMTokenUsageAggregator): Promise<{
1185
+ output: TResult;
1186
+ usage: ExtendedTokenUsage;
1187
+ }>;
1188
+ }
1189
+
1190
+ /**
1191
+ * Zod schema for the TOC content validation response (isToc, confidence, reason)
1192
+ */
1193
+ declare const TocContentValidationSchema: z.ZodObject<{
1194
+ isToc: z.ZodBoolean;
1195
+ confidence: z.ZodNumber;
1196
+ reason: z.ZodString;
1197
+ }, z.core.$strip>;
1198
+ type TocContentValidationResult = z.infer<typeof TocContentValidationSchema>;
1199
+ /**
1200
+ * Options for TocContentValidator
1201
+ */
1202
+ interface TocContentValidatorOptions extends BaseValidatorOptions {
1203
+ /**
1204
+ * Minimum confidence required to consider the content a valid TOC (default: 0.7)
1205
+ */
1206
+ confidenceThreshold?: number;
1207
+ }
1208
+ /**
1209
+ * TocContentValidator
1210
+ *
1211
+ * Uses an LLM to validate whether extracted markdown content is actually a TOC.
1212
+ * This is a semantic validation, not a structural validation.
1213
+ */
1214
+ declare class TocContentValidator extends BaseValidator<typeof TocContentValidationSchema, TocContentValidationResult> {
1215
+ private readonly confidenceThreshold;
1216
+ constructor(logger: LoggerMethods, model: LanguageModel, options?: TocContentValidatorOptions, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator);
1217
+ /**
1218
+ * Validate whether the markdown content is a table of contents
1219
+ *
1220
+ * @param markdown - Markdown content to validate
1221
+ * @returns Validation result with isToc, confidence, and reason
1222
+ */
1223
+ validate(markdown: string): Promise<TocContentValidationResult>;
1224
+ /**
1225
+ * Check if a validation result passes the confidence threshold
1226
+ *
1227
+ * @param result - Validation result from validate()
1228
+ * @returns true if the content is a valid TOC with sufficient confidence
1229
+ */
1230
+ isValid(result: TocContentValidationResult): boolean;
1231
+ /**
1232
+ * Build the system prompt for TOC content validation
1233
+ */
1234
+ protected buildSystemPrompt(): string;
1235
+ /**
1236
+ * Build the user prompt from the markdown content
1237
+ */
1238
+ protected buildUserPrompt(markdown: string): string;
1239
+ }
1240
+
1241
+ /**
1242
+ * Zod schema for the batch caption validation response: a `results` array of { index, isValid, reason } items
1243
+ */
1244
+ declare const CaptionValidationBatchSchema: z.ZodObject<{
1245
+ results: z.ZodArray<z.ZodObject<{
1246
+ index: z.ZodNumber;
1247
+ isValid: z.ZodBoolean;
1248
+ reason: z.ZodNullable<z.ZodString>;
1249
+ }, z.core.$strip>>;
1250
+ }, z.core.$strip>;
1251
+ type CaptionValidationBatch = z.infer<typeof CaptionValidationBatchSchema>;
1252
+ /**
1253
+ * Options for CaptionValidator (currently adds nothing beyond BaseValidatorOptions)
1254
+ */
1255
+ interface CaptionValidatorOptions extends BaseValidatorOptions {
1256
+ }
1257
+ /**
1258
+ * CaptionValidator
1259
+ *
1260
+ * Validates parsed captions against their original text using an LLM.
1261
+ * Processes captions in batches to optimize LLM API calls.
1262
+ *
1263
+ * ## Validation Rules
1264
+ *
1265
+ * Checks if the parsed "num" field correctly extracts the prefix + number from original text:
1266
+ * 1. **Correctness**: The "num" must contain the actual prefix+number from the original text
1267
+ * - Example: "도판 1 유적 전경" → num="도판 1" ✓
1268
+ * - Example: "도판 1 유적 전경" → num="도판" ✗ (incomplete)
1269
+ *
1270
+ * 2. **Spacing**: The spacing in "num" must match the original text exactly
1271
+ * - Example: "도판 1" → num="도판 1" ✓
1272
+ * - Example: "도판1" → num="도판1" ✓
1273
+ * - Example: "도판 1" → num="도판1" ✗ (spacing mismatch)
1274
+ *
1275
+ * 3. **Completeness**: The number part must be fully extracted
1276
+ * - Example: "Figure 2-3" → num="Figure 2-3" ✓
1277
+ * - Example: "Figure 2-3" → num="Figure 2" ✗ (incomplete number)
1278
+ *
1279
+ * 4. **Null handling**: If "num" is null, verify that the original text has no number prefix
1280
+ * - Example: "유적 전경 사진" → num=null ✓
1281
+ * - Example: "도판 1 전경" → num=null ✗ (should extract "도판 1")
1282
+ */
1283
+ declare class CaptionValidator extends BaseValidator<typeof CaptionValidationBatchSchema, CaptionValidationBatch> {
1284
+ constructor(logger: LoggerMethods, model: LanguageModel, options?: CaptionValidatorOptions, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator);
1285
+ /**
1286
+ * Validate a batch of parsed captions against their original texts
1287
+ *
1288
+ * @param captions - Array of parsed Caption objects
1289
+ * @param originalTexts - Array of original caption texts (same order as captions)
1290
+ * @param batchSize - Batch size for processing. Set to 0 to skip validation (assume all valid).
1291
+ * @returns Array of validation results (boolean) maintaining original order
1292
+ */
1293
+ validateBatch(captions: Caption[], originalTexts: string[], batchSize: number): Promise<boolean[]>;
1294
+ /**
1295
+ * Internal: Validate a batch of captions using the LLM
1296
+ *
1297
+ * @param items - Batch of caption items with original indices
1298
+ * @param model - Effective model to use
1299
+ * @returns Array of validation results indexed correctly
1300
+ */
1301
+ private validateBatchInternal;
1302
+ protected buildSystemPrompt(): string;
1303
+ protected buildUserPrompt(items: Array<{
1304
+ index: number;
1305
+ caption: Caption;
1306
+ originalText: string;
1307
+ }>): string;
1308
+ }
1309
+ /**
1310
+ * Error thrown when caption validation fails; the constructor accepts standard ErrorOptions
1311
+ */
1312
+ declare class CaptionValidationError extends Error {
1313
+ constructor(message: string, options?: ErrorOptions);
1314
+ }
1315
+
1316
+ /**
1317
+ * ChapterConverter
1318
+ *
1319
+ * Converts TocEntry[] to Chapter[] with text blocks, images, tables, and footnotes.
1320
+ *
1321
+ * ## Conversion Process
1322
+ *
1323
+ * 1. Create Front Matter chapter (ch-000) for pre-TOC content
1324
+ * 2. Build chapter tree from TocEntry[] (recursive)
1325
+ * 3. Calculate page ranges for each chapter
1326
+ * 4. Assign text blocks to chapters based on page ranges
1327
+ * 5. Link images/tables/footnotes to chapters based on page ranges
1328
+ *
1329
+ * ## Page Assignment Strategy
1330
+ *
1331
+ * Uses "start page first" strategy: resources are assigned to the chapter
1332
+ * whose startPage is the largest value that is still <= the resource's page.
1333
+ *
1334
+ * ## Front Matter
1335
+ *
1336
+ * A special chapter (ch-000) is always created to hold content that appears
1337
+ * before the first TOC entry (e.g., cover, preface, table of contents itself).
1338
+ */
1339
+ declare class ChapterConverter {
1340
+ private static readonly FRONT_MATTER_ID;
1341
+ private static readonly FRONT_MATTER_TITLE;
1342
+ private readonly logger;
1343
+ private readonly idGenerator;
1344
+ constructor(logger: LoggerMethods, idGenerator: IdGenerator);
1345
+ /**
1346
+ * Convert TocEntry[] to Chapter[]
1347
+ *
1348
+ * @param tocEntries - Table of contents entries
1349
+ * @param textItems - DoclingDocument.texts (with prov for page numbers)
1350
+ * @param pageRangeMap - PDF page to actual page mapping
1351
+ * @param images - Converted images
1352
+ * @param tables - Converted tables
1353
+ * @param footnotes - Converted footnotes
1354
+ * @returns Converted chapters with text blocks and resource references
1355
+ */
1356
+ convert(tocEntries: TocEntry[], textItems: DoclingTextItem[], pageRangeMap: Record<number, PageRange>, images: ProcessedImage[], tables: ProcessedTable[], footnotes: ProcessedFootnote[]): Chapter[];
1357
+ /**
1358
+ * Create the Front Matter chapter for pre-TOC content
1359
+ */
1360
+ private createFrontMatterChapter;
1361
+ /**
1362
+ * Build the chapter tree from TocEntry[]
1363
+ * Recursively processes children
1364
+ */
1365
+ private buildChapterTree;
1366
+ /**
1367
+ * Flatten the chapter tree for page range calculation
1368
+ * Preserves original TOC page numbers
1369
+ */
1370
+ private flattenChapters;
1371
+ /**
1372
+ * Calculate the page range for each chapter
1373
+ * Uses the next chapter's start page as the end boundary
1374
+ *
1375
+ * Front Matter (ch-000) gets special handling:
1376
+ * - startPage: 1
1377
+ * - endPage: first TOC entry's page - 1 (or 0 if TOC starts at page 1)
1378
+ */
1379
+ private calculatePageRanges;
1380
+ /**
1381
+ * Valid labels for text blocks
1382
+ * Only these labels are included in chapter text blocks
1383
+ */
1384
+ private static readonly VALID_TEXT_LABELS;
1385
+ /**
1386
+ * Check if a text item has a picture parent
1387
+ * Items with parent.$ref starting with "#/pictures/" are excluded
1388
+ */
1389
+ private static hasPictureParent;
1390
+ /**
1391
+ * Convert text items to text blocks
1392
+ * Filters by label (text, section_header, list_item), excludes picture children,
1393
+ * and extracts page numbers from prov
1394
+ */
1395
+ private convertTextBlocks;
1396
+ /**
1397
+ * Convert a PDF page number to the actual document page number
1398
+ * Falls back to pdfPageNo if the mapping is missing
1399
+ */
1400
+ private pdfPageToActualPage;
1401
+ /**
1402
+ * Find the chapter ID for a given actual page number
1403
+ * Uses "start page first" strategy
1404
+ */
1405
+ private findChapterForPage;
1406
+ /**
1407
+ * Assign text blocks to chapters based on page ranges
1408
+ */
1409
+ private assignTextBlocks;
1410
+ /**
1411
+ * Link images, tables, and footnotes to chapters based on page ranges
1412
+ */
1413
+ private linkResources;
1414
+ /**
1415
+ * Build a flat chapter map for O(1) lookup
1416
+ */
1417
+ private buildChapterMap;
1418
+ }
1419
+
1420
+ export { BaseLLMComponent, type BaseLLMComponentOptions, BaseValidator, type BaseValidatorOptions, CONTINUATION_MARKERS, CaptionParseError, CaptionParser, type CaptionParserOptions, CaptionValidationError, CaptionValidator, type CaptionValidatorOptions, ChapterConverter, DocumentProcessor, type DocumentProcessorOptions, type ImageContent, PAGE_NUMBER_PATTERN, PagePattern, PageRangeParseError, PageRangeParser, type PageSizeGroup, TOC_KEYWORDS, TextLLMComponent, type TocAreaResult, type TocContentValidationResult, TocContentValidationSchema, TocContentValidator, type TocContentValidatorOptions, type TocEntry, TocEntrySchema, TocExtractError, TocExtractor, type TocExtractorOptions, TocFinder, type TocFinderOptions, TocNotFoundError, TocParseError, type TocResponse, TocResponseSchema, VisionLLMComponent, type VisionLLMComponentOptions, type VisionTocExtractionResult, VisionTocExtractionSchema, VisionTocExtractor, type VisionTocExtractorOptions };