@heripo/document-processor 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.ko.md +332 -0
- package/README.md +332 -0
- package/dist/index.cjs +4325 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +1420 -0
- package/dist/index.d.ts +1420 -0
- package/dist/index.js +4262 -0
- package/dist/index.js.map +1 -0
- package/package.json +89 -0
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,1420 @@
|
|
|
1
|
+
import { LoggerMethods } from '@heripo/logger';
|
|
2
|
+
import { DoclingDocument, DocumentProcessResult, Caption, PageRange, DoclingTextItem, DoclingPictureItem, DoclingTableItem, DoclingGroupItem, ProcessedImage, ProcessedTable, ProcessedFootnote, Chapter } from '@heripo/model';
|
|
3
|
+
import { LanguageModel } from 'ai';
|
|
4
|
+
import { LLMTokenUsageAggregator, ExtendedTokenUsage } from '@heripo/shared';
|
|
5
|
+
import { z } from 'zod';
|
|
6
|
+
|
|
7
|
+
/**
 * DocumentProcessor Options
 *
 * `logger`, `fallbackModel` and the three batch sizes are required;
 * every other field is optional.
 */
interface DocumentProcessorOptions {
    /**
     * Logger instance
     */
    logger: LoggerMethods;
    /**
     * Fallback model - used as fallback when component-specific models are not provided or fail.
     * This is the only required model. Should be set to a frontier model (e.g., Claude Opus 4.5, GPT-5.2)
     * to ensure reliable fallback performance across all components.
     */
    fallbackModel: LanguageModel;
    /**
     * Model for PageRangeParser - extracts page numbers from page images.
     * Requires vision capabilities. Falls back to 'fallbackModel' if not provided.
     */
    pageRangeParserModel?: LanguageModel;
    /**
     * Model for TocExtractor - extracts structured TOC from Markdown representation.
     * Falls back to 'fallbackModel' if not provided.
     */
    tocExtractorModel?: LanguageModel;
    /**
     * Model for validators (TOC content validation, caption validation).
     * Falls back to 'fallbackModel' if not provided.
     */
    validatorModel?: LanguageModel;
    /**
     * Model for VisionTocExtractor - extracts TOC directly from page images.
     * Requires vision capabilities. Falls back to 'fallbackModel' if not provided.
     */
    visionTocExtractorModel?: LanguageModel;
    /**
     * Model for CaptionParser - extracts caption prefix and number from image/table captions.
     * Falls back to 'fallbackModel' if not provided.
     */
    captionParserModel?: LanguageModel;
    /**
     * Batch size for TextCleaner text normalization (synchronous processing).
     * Required - no default is declared in this type.
     */
    textCleanerBatchSize: number;
    /**
     * Batch size for CaptionParser LLM parsing (async parallel processing).
     * Required - no default is declared in this type.
     */
    captionParserBatchSize: number;
    /**
     * Batch size for CaptionValidator LLM validation (async parallel processing).
     * Required - no default is declared in this type.
     */
    captionValidatorBatchSize: number;
    /**
     * Maximum retry count (default: 3)
     */
    maxRetries?: number;
    /**
     * Enable fallback retry mechanism - automatically retries with fallback model on failure (default: true)
     * Set to false to disable automatic fallback retry and fail immediately on component-specific model errors
     */
    enableFallbackRetry?: boolean;
    /**
     * Abort signal for cancellation support.
     * When aborted, processing stops at the next checkpoint between stages.
     */
    abortSignal?: AbortSignal;
}
|
|
73
|
+
/**
 * DocumentProcessor
 *
 * Main class that converts DoclingDocument to ProcessedDocument.
 *
 * ## Conversion Process
 *
 * 1. Initialize RefResolver - indexing for $ref resolution
 * 2. Initialize IdGenerator - unique ID generator
 * 3. Text filtering and PageRangeMap generation (vision LLM, see `pageRangeParserModel`)
 * 4. TOC extraction (LLM) - core step
 * 5. Resource conversion block:
 *    - Images conversion (caption extraction)
 *    - Tables conversion (excluding TOC tables)
 *    - Footnotes conversion
 * 6. Chapters conversion (based on TOC)
 * 7. Assemble ProcessedDocument
 *
 * @example
 * ```typescript
 * import { openai } from '@ai-sdk/openai';
 * import { anthropic } from '@ai-sdk/anthropic';
 * import { DocumentProcessor } from '@heripo/document-processor';
 * import { getLogger } from '@heripo/logger';
 *
 * const logger = getLogger();
 *
 * // Basic usage - all components use the fallback model.
 * // The three batch sizes are required by DocumentProcessorOptions.
 * const processor = new DocumentProcessor({
 *   logger,
 *   fallbackModel: anthropic('claude-opus-4-5-20251101'), // Frontier model for reliable fallback
 *   textCleanerBatchSize: 20,
 *   captionParserBatchSize: 10,
 *   captionValidatorBatchSize: 10,
 * });
 *
 * // Advanced usage - component-specific models with frontier fallback
 * const advancedProcessor = new DocumentProcessor({
 *   logger,
 *   fallbackModel: anthropic('claude-opus-4-5-20251101'), // Frontier model for fallback
 *   pageRangeParserModel: openai('gpt-5.2'), // Vision-capable
 *   tocExtractorModel: openai('gpt-5-mini'), // Structured output
 *   validatorModel: openai('gpt-5.2'), // Validation (TOC + caption)
 *   visionTocExtractorModel: openai('gpt-5.1'), // Vision-capable
 *   captionParserModel: openai('gpt-5-mini'),
 *   textCleanerBatchSize: 20, // Sync text processing
 *   captionParserBatchSize: 10, // LLM caption parsing
 *   captionValidatorBatchSize: 10, // LLM caption validation
 *   maxRetries: 3,
 * });
 *
 * const result = await processor.process(
 *   doclingDoc,
 *   'report-001',
 *   outputPath
 * );
 * ```
 */
declare class DocumentProcessor {
    // Private members appear without types: declaration files keep only
    // their names so that subclasses cannot redeclare them.
    private readonly logger;
    private readonly fallbackModel;
    private readonly pageRangeParserModel;
    private readonly tocExtractorModel;
    private readonly validatorModel;
    private readonly visionTocExtractorModel;
    private readonly captionParserModel;
    private readonly textCleanerBatchSize;
    private readonly captionParserBatchSize;
    private readonly captionValidatorBatchSize;
    private readonly maxRetries;
    private readonly enableFallbackRetry;
    private readonly abortSignal?;
    private idGenerator;
    private refResolver?;
    private pageRangeParser?;
    private tocFinder?;
    private tocExtractor?;
    private tocContentValidator?;
    private captionValidator?;
    private visionTocExtractor?;
    private captionParser?;
    private chapterConverter?;
    private textCleaner;
    private readonly usageAggregator;
    /**
     * Create a processor from the given options.
     *
     * @param options - Logger, models, batch sizes and retry/abort settings
     */
    constructor(options: DocumentProcessorOptions);
    /**
     * Check if abort has been requested and throw error if so
     *
     * @throws {Error} with name 'AbortError' if aborted
     */
    private checkAborted;
    /**
     * Converts DoclingDocument to ProcessedDocument with token usage tracking.
     *
     * Conversion process:
     * 1. Initialize processors and resolvers
     * 2. Normalize and filter texts
     * 3. Clean texts and parse page ranges (parallel)
     * 4. Extract table of contents
     * 5. Convert images and tables (parallel)
     * 6. Convert chapters and link resources
     * 7. Assemble final ProcessedDocument
     * 8. Collect and report token usage
     *
     * @param doclingDoc - Original document extracted from Docling SDK
     * @param reportId - Report unique identifier
     * @param outputPath - Path containing images and pages subdirectories (images/image_0.png, pages/page_0.png, etc.)
     * @returns Document processing result with ProcessedDocument and token usage report
     *
     * @throws {TocExtractError} When TOC extraction fails
     * @throws {PageRangeParseError} When page range parsing fails
     * @throws {ConversionError} When error occurs during conversion
     */
    process(doclingDoc: DoclingDocument, reportId: string, outputPath: string): Promise<DocumentProcessResult>;
    /**
     * Initialize all processors and resolvers
     *
     * Sets up RefResolver, PageRangeParser, TocFinder, and TocExtractor
     */
    private initializeProcessors;
    /**
     * Normalize and filter texts using TextCleaner
     *
     * Performs basic text normalization (unicode, whitespace, punctuation)
     * and filters out invalid texts (empty, numbers-only, etc.)
     */
    private normalizeAndFilterTexts;
    /**
     * Parse page ranges using Vision LLM
     *
     * Extracts actual page numbers from page images and creates mapping.
     * Token usage is automatically tracked by PageRangeParser into the shared aggregator.
     */
    private parsePageRanges;
    /**
     * Convert images, tables, and footnotes
     *
     * Runs conversions:
     * - Images conversion (with caption extraction)
     * - Tables conversion (with caption extraction, excluding TOC tables)
     * - Footnotes conversion (synchronous, from text items with label='footnote')
     */
    private convertResources;
    /**
     * Convert footnotes
     *
     * Extracts footnotes from DoclingDocument text items with label='footnote'
     */
    private convertFootnotes;
    /**
     * Assemble the final ProcessedDocument
     *
     * Creates the ProcessedDocument structure with all converted components
     */
    private assembleProcessedDocument;
    /**
     * Extract table of contents (TOC)
     *
     * Uses rule-based extraction with LLM validation and vision fallback:
     * 1. TocFinder - find TOC area in document (rule-based)
     * 2. MarkdownConverter - convert TOC items to Markdown
     * 3. TocContentValidator - validate if content is actually a TOC (LLM)
     * 4. If invalid: VisionTocExtractor - extract from page images (vision LLM fallback)
     * 5. TocExtractor - LLM-based structured extraction
     */
    private extractTableOfContents;
    /**
     * Process resource captions (for images and tables)
     *
     * Common caption processing pipeline:
     * 1. Parse captions in batch
     * 2. Validate parsed captions
     * 3. Reparse failed captions with fallback model
     *
     * @param captionTexts - Array of caption texts to process
     * @param resourceType - Type of resource for logging (e.g., 'image', 'table')
     * @returns Parsed captions with index mapping
     */
    private processResourceCaptions;
    /**
     * Extract caption text from resource
     *
     * Handles both string references and $ref resolution
     */
    private extractCaptionText;
    /**
     * Convert images
     *
     * Converts pictures from DoclingDocument to ProcessedImage
     */
    private convertImages;
    /**
     * Convert tables
     *
     * Converts tables from DoclingDocument to ProcessedTable
     */
    private convertTables;
    /**
     * Convert chapters and link resources
     *
     * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
     * Falls back to single "Document" chapter when TOC is empty.
     */
    private convertChapters;
    /**
     * Create a fallback chapter when TOC is not available
     *
     * Creates a single "Document" chapter containing all text blocks,
     * images, tables, and footnotes from the document.
     */
    private createFallbackChapter;
}
|
|
281
|
+
|
|
282
|
+
/**
 * Base options for all LLM-based components.
 *
 * Every field is optional.
 */
interface BaseLLMComponentOptions {
    /**
     * Maximum retry count for LLM API (default: 3)
     */
    maxRetries?: number;
    /**
     * Temperature for LLM generation (default: 0)
     */
    temperature?: number;
    /**
     * Abort signal for cancellation support
     */
    abortSignal?: AbortSignal;
}
|
|
299
|
+
/**
 * Abstract base class for all LLM-based components
 *
 * Provides common functionality:
 * - Consistent logging with component name prefix
 * - Token usage tracking via optional aggregator
 * - Standard configuration (model, fallback, retries, temperature, abort signal)
 *
 * Subclasses must implement buildSystemPrompt() and buildUserPrompt().
 */
declare abstract class BaseLLMComponent {
    protected readonly logger: LoggerMethods;
    protected readonly model: LanguageModel;
    protected readonly fallbackModel?: LanguageModel;
    protected readonly maxRetries: number;
    protected readonly temperature: number;
    protected readonly componentName: string;
    protected readonly aggregator?: LLMTokenUsageAggregator;
    protected readonly abortSignal?: AbortSignal;
    /**
     * Constructor for BaseLLMComponent
     *
     * @param logger - Logger instance for logging
     * @param model - Primary language model for LLM calls
     * @param componentName - Name of the component for logging (e.g., "TocExtractor")
     * @param options - Optional configuration; see BaseLLMComponentOptions (maxRetries, temperature, abortSignal)
     * @param fallbackModel - Optional fallback model for retry on failure
     * @param aggregator - Optional token usage aggregator for tracking LLM calls
     */
    constructor(logger: LoggerMethods, model: LanguageModel, componentName: string, options?: BaseLLMComponentOptions, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator);
    /**
     * Log a message with consistent component name prefix
     *
     * @param level - Log level ('info', 'warn', 'error')
     * @param message - Message to log (without prefix)
     * @param args - Additional arguments to pass to logger
     */
    protected log(level: 'info' | 'warn' | 'error', message: string, ...args: unknown[]): void;
    /**
     * Track token usage to aggregator if available
     *
     * @param usage - Token usage information to track
     */
    protected trackUsage(usage: ExtendedTokenUsage): void;
    /**
     * Create an empty usage record for edge cases (e.g., empty input)
     *
     * @param phase - Phase name for the usage record
     * @returns Empty ExtendedTokenUsage object
     */
    protected createEmptyUsage(phase: string): ExtendedTokenUsage;
    /**
     * Build system prompt for LLM call
     *
     * Subclasses must implement this to provide component-specific system prompts.
     *
     * @returns System prompt text
     */
    protected abstract buildSystemPrompt(...args: unknown[]): string;
    /**
     * Build user prompt for LLM call
     *
     * Subclasses must implement this to construct prompts from input data.
     *
     * @returns User prompt text
     */
    protected abstract buildUserPrompt(...args: unknown[]): string;
}
|
|
363
|
+
|
|
364
|
+
/**
 * Abstract base class for text-based LLM components
 *
 * Extends BaseLLMComponent with helper method for text-based LLM calls
 * using LLMCaller.call() (non-vision).
 *
 * Subclasses: TocExtractor, CaptionParser, BaseValidator
 */
declare abstract class TextLLMComponent extends BaseLLMComponent {
    /**
     * Same parameter list as the BaseLLMComponent constructor.
     */
    constructor(logger: LoggerMethods, model: LanguageModel, componentName: string, options?: BaseLLMComponentOptions, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator);
    /**
     * Call LLM with text-based prompts using LLMCaller.call()
     *
     * @template TSchema - Zod schema type for response validation
     * @param schema - Zod schema for response validation
     * @param systemPrompt - System prompt for LLM
     * @param userPrompt - User prompt for LLM
     * @param phase - Phase name for tracking (e.g., 'extraction', 'validation')
     * @returns Promise with parsed object (`output`) and usage information (`usage`)
     */
    protected callTextLLM<TSchema extends z.ZodType>(schema: TSchema, systemPrompt: string, userPrompt: string, phase: string): Promise<{
        output: z.infer<TSchema>;
        usage: ExtendedTokenUsage;
    }>;
}
|
|
389
|
+
|
|
390
|
+
/**
 * Options for VisionLLMComponent
 *
 * Currently adds no fields of its own on top of BaseLLMComponentOptions.
 */
interface VisionLLMComponentOptions extends BaseLLMComponentOptions {
}
|
|
395
|
+
/**
 * Image content structure for vision LLM messages
 */
interface ImageContent {
    /**
     * Discriminator - always the literal 'image'
     */
    type: 'image';
    /**
     * Image payload as a string.
     * NOTE(review): presumably base64 data or a data URL produced by
     * VisionLLMComponent.buildImageContent - confirm against the implementation.
     */
    image: string;
}
|
|
402
|
+
/**
 * Abstract base class for vision-based LLM components
 *
 * Extends BaseLLMComponent with helper methods for vision-based LLM calls
 * using LLMCaller.callVision().
 *
 * Subclasses: PageRangeParser, VisionTocExtractor
 */
declare abstract class VisionLLMComponent extends BaseLLMComponent {
    protected readonly outputPath: string;
    /**
     * Same parameters as the BaseLLMComponent constructor, with `outputPath`
     * inserted after `componentName`.
     */
    constructor(logger: LoggerMethods, model: LanguageModel, componentName: string, outputPath: string, options?: VisionLLMComponentOptions, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator);
    /**
     * Call LLM with vision capabilities using LLMCaller.callVision()
     *
     * @template TSchema - Zod schema type for response validation
     * @param schema - Zod schema for response validation
     * @param messages - Messages array including image content
     * @param phase - Phase name for tracking (e.g., 'extraction', 'sampling')
     * @returns Promise with parsed object (`output`) and usage information (`usage`)
     */
    protected callVisionLLM<TSchema extends z.ZodType>(schema: TSchema, messages: Array<{
        role: 'user' | 'assistant';
        content: unknown[] | string;
    }>, phase: string): Promise<{
        output: z.infer<TSchema>;
        usage: ExtendedTokenUsage;
    }>;
    /**
     * Load an image file and encode it as base64
     *
     * @param imagePath - Absolute path to the image file
     * @returns Base64 encoded image string
     */
    protected loadImageAsBase64(imagePath: string): string;
    /**
     * Build image content object for vision LLM messages
     *
     * @param imagePath - Path to the image file (relative to outputPath or absolute)
     * @param mimeType - MIME type of the image (default: 'image/png')
     * @returns ImageContent object for LLM message
     */
    protected buildImageContent(imagePath: string, mimeType?: string): ImageContent;
}
|
|
445
|
+
|
|
446
|
+
/**
 * Table of Contents Entry
 *
 * Tree node representing the table of contents structure of a document.
 * Nesting is expressed through the optional `children` array.
 */
interface TocEntry {
    /**
     * Chapter title
     */
    title: string;
    /**
     * Hierarchy depth (1, 2, 3...)
     */
    level: number;
    /**
     * Starting page number
     */
    pageNo: number;
    /**
     * Child TOC entries
     */
    children?: TocEntry[];
}
|
|
469
|
+
/**
 * TOC Area Search Result
 */
interface TocAreaResult {
    /**
     * Group or table item references corresponding to the table of contents
     */
    itemRefs: string[];
    /**
     * TOC start page
     */
    startPage: number;
    /**
     * TOC end page
     */
    endPage: number;
}
|
|
486
|
+
/**
 * Page Size Information
 *
 * Associates one page-size key with the PDF pages that have that size.
 */
interface PageSizeGroup {
    /**
     * Size identifier (width x height)
     */
    sizeKey: string;
    /**
     * PDF page numbers with this size specification
     */
    pageNos: number[];
}
|
|
499
|
+
|
|
500
|
+
/**
 * CaptionParser options
 *
 * Adds an optional component name to the shared BaseLLMComponentOptions.
 */
interface CaptionParserOptions extends BaseLLMComponentOptions {
    /**
     * Custom component name for token usage tracking.
     * Defaults to 'CaptionParser'.
     */
    componentName?: string;
}
|
|
510
|
+
/**
 * CaptionParser
 *
 * Extracts caption prefix and number from image/table captions using LLM.
 * Preserves original spacing from input text.
 * Extends TextLLMComponent for standardized LLM call handling.
 *
 * ## Algorithm
 *
 * 1. Collect caption texts
 * 2. Split into batches based on batchSize
 * 3. For each batch: call LLM to extract caption prefix + number
 * 4. Flatten results and return
 */
declare class CaptionParser extends TextLLMComponent {
    /**
     * @param logger - Logger instance
     * @param model - Primary language model for caption parsing
     * @param options - Optional configuration, including a custom component name
     * @param fallbackModel - Optional fallback model
     * @param aggregator - Optional token usage aggregator
     */
    constructor(logger: LoggerMethods, model: LanguageModel, options?: CaptionParserOptions, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator);
    /**
     * Parse batch of captions
     *
     * @param captions - Array of caption full texts
     * @param batchSize - Batch size for processing. Set to 0 for sequential processing without batching.
     * @param overrideModel - Optional model to use instead of the default model
     * @returns Array of Caption objects with num extracted (maintains original order)
     */
    parseBatch(captions: string[], batchSize: number, overrideModel?: LanguageModel): Promise<Caption[]>;
    /**
     * Internal: Parse batch of captions using LLM
     *
     * @param captions - Batch of caption texts with original indices
     * @param model - Effective model to use
     * @returns Array of Caption objects indexed correctly
     */
    private parseBatchInternal;
    /**
     * Extract and normalize caption number from full text
     *
     * Finds the extracted num pattern in the full text and extracts it
     * with original casing. Handles case-insensitive matching.
     *
     * @param fullText - The full caption text
     * @param extractedNum - The num extracted by LLM (may have different casing)
     * @returns Normalized num or undefined if no match
     */
    private extractNumFromFullText;
    /**
     * Build system prompt for caption parsing
     *
     * @param mode - 'batch' for multiple captions, 'single' for single caption
     * @returns System prompt text
     */
    protected buildSystemPrompt(mode?: 'batch' | 'single'): string;
    /**
     * Build user prompt for caption parsing
     *
     * @param captions - Caption texts, each paired with an `index`
     * @returns User prompt text
     */
    protected buildUserPrompt(captions: Array<{
        index: number;
        text: string;
    }>): string;
    /**
     * Build user prompt for single caption parsing
     */
    private buildUserPromptSingle;
}
|
|
572
|
+
/**
 * Error thrown when caption parsing fails
 */
declare class CaptionParseError extends Error {
    /**
     * @param message - Human-readable description of the failure
     * @param options - Standard ErrorOptions (e.g., `cause`)
     */
    constructor(message: string, options?: ErrorOptions);
}
|
|
578
|
+
|
|
579
|
+
/**
 * PageRangeParseError
 *
 * Custom error thrown when page range parsing fails.
 */
declare class PageRangeParseError extends Error {
    /**
     * @param message - Human-readable description of the failure
     * @param options - Standard ErrorOptions (e.g., `cause`)
     */
    constructor(message: string, options?: ErrorOptions);
    /**
     * Extract error message from unknown error type
     *
     * @param error - Value of unknown type, e.g. from a catch clause
     * @returns Message text derived from `error`
     */
    static getErrorMessage(error: unknown): string;
    /**
     * Create PageRangeParseError from unknown error with context
     *
     * @param context - Description of the operation that failed
     * @param error - Original value of unknown type
     * @returns New PageRangeParseError
     */
    static fromError(context: string, error: unknown): PageRangeParseError;
}
|
|
595
|
+
|
|
596
|
+
/**
 * Pattern types for page number sequences
 *
 * String enum: each member's runtime value is the lower-case name shown.
 */
declare enum PagePattern {
    /** Simple increment: [1, 2, 3, 4, ...] */
    SIMPLE_INCREMENT = "simple_increment",
    /** Double-sided scan: [1-2, 3-4, 5-6, ...] */
    DOUBLE_SIDED = "double_sided",
    /** Offset pattern: PDF page != actual page (consistent offset) */
    OFFSET = "offset",
    /** No clear pattern detected */
    UNKNOWN = "unknown"
}
|
|
609
|
+
/**
 * PageRangeParser
 *
 * Extracts actual document page numbers from PDF page images using Vision LLM.
 * Uses random sampling + pattern detection to minimize LLM calls.
 * Extends VisionLLMComponent for standardized vision LLM call handling.
 *
 * ## Algorithm
 *
 * 1. Group pages by size (consecutive pages with same dimensions)
 * 2. For each group:
 *    - If ≤3 pages: send all to LLM at once
 *    - If >3 pages: random sample 3 pages, detect pattern, apply to all
 * 3. Post-process: handle drops, normalize negatives, backfill failed pages
 */
declare class PageRangeParser extends VisionLLMComponent {
    private readonly SAMPLE_SIZE;
    private readonly MAX_PATTERN_RETRIES;
    private readonly SIZE_TOLERANCE;
    /**
     * NOTE(review): unlike CaptionParser, which takes an options object,
     * `maxRetries` and `abortSignal` are passed positionally here.
     *
     * @param logger - Logger instance
     * @param model - Primary (vision-capable) language model
     * @param outputPath - Output path used to locate page images
     * @param maxRetries - Optional maximum retry count
     * @param fallbackModel - Optional fallback model
     * @param aggregator - Optional token usage aggregator
     * @param abortSignal - Optional abort signal for cancellation
     */
    constructor(logger: LoggerMethods, model: LanguageModel, outputPath: string, maxRetries?: number, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator, abortSignal?: AbortSignal);
    /**
     * Main parse method
     *
     * Extracts page range mapping from DoclingDocument using Vision LLM.
     * Automatically tracks token usage in the aggregator if one was provided.
     *
     * @param doclingDoc - DoclingDocument to extract page ranges from
     * @returns Object with page range mapping and token usage information
     */
    parse(doclingDoc: DoclingDocument): Promise<{
        pageRangeMap: Record<number, PageRange>;
        usage: ExtendedTokenUsage[];
    }>;
    /**
     * Extract pages array from DoclingDocument
     */
    private extractPages;
    /**
     * Analyze page sizes and group consecutive pages with same dimensions
     */
    private analyzeSizes;
    /**
     * Create size key with tolerance for floating point comparison
     */
    private createSizeKey;
    /**
     * Process a single size group
     */
    private processGroup;
    /**
     * Select random samples from page numbers
     */
    private selectRandomSamples;
    /**
     * Extract page numbers from multiple pages in a single LLM call
     */
    private extractMultiplePages;
    /**
     * Detect pattern from sample results
     */
    private detectPattern;
    /**
     * Apply detected pattern to generate page range map
     */
    private applyPattern;
    /**
     * Convert sample results to page range map (for small groups)
     */
    private samplesToMap;
    /**
     * Post-process the page range map
     */
    private postProcess;
    /**
     * Detect and handle outlier page numbers at the beginning of document
     *
     * When early PDF pages have abnormally high page numbers compared to
     * subsequent pages (e.g., PDF 1-9 = 75-83, but PDF 10+ = 2,3,4...),
     * the LLM likely misread figure/photo numbers as page numbers.
     *
     * Detection: If page numbers at the beginning are significantly higher
     * than subsequent pages (which follow a normal pattern), mark them as failed.
     */
    private detectAndHandleOutliers;
    /**
     * Find the start index of a "normal" sequence in the page range map
     *
     * A normal sequence is defined as at least 3 consecutive PDF pages where:
     * - Page numbers are increasing (for single-page) or increasing by 2 (for double-sided)
     * - The pattern is consistent
     *
     * Returns the index in pdfPages array, or null if not found.
     */
    private findNormalSequenceStart;
    /**
     * Check if a page range represents a double-sided scan
     */
    private isDoubleSidedRange;
    /**
     * Detect and handle page number drops
     *
     * When page numbers suddenly decrease (e.g., 8,9 -> 3,4),
     * recalculate previous pages based on the drop point.
     */
    private detectAndHandleDrops;
    /**
     * Normalize negative page numbers to 0
     */
    private normalizeNegatives;
    /**
     * Backfill pages marked with 0 using detected pattern
     */
    private backfillFailedPages;
    /**
     * Build system prompt for Vision LLM
     *
     * @returns System prompt text
     */
    protected buildSystemPrompt(): string;
    /**
     * Build user prompt for Vision LLM
     *
     * @param pageNos - Page numbers of the pages included in the call
     * @returns User prompt text
     */
    protected buildUserPrompt(pageNos: number[]): string;
}
|
|
731
|
+
|
|
732
|
+
/**
|
|
733
|
+
* TocExtractError
|
|
734
|
+
*
|
|
735
|
+
* Base error class for TOC extraction failures.
|
|
736
|
+
*/
|
|
737
|
+
declare class TocExtractError extends Error {
|
|
738
|
+
constructor(message: string, options?: ErrorOptions);
|
|
739
|
+
/**
|
|
740
|
+
* Extract error message from unknown error type
|
|
741
|
+
*/
|
|
742
|
+
static getErrorMessage(error: unknown): string;
|
|
743
|
+
/**
|
|
744
|
+
* Create TocExtractError from unknown error with context
|
|
745
|
+
*/
|
|
746
|
+
static fromError(context: string, error: unknown): TocExtractError;
|
|
747
|
+
}
|
|
748
|
+
/**
|
|
749
|
+
* TocNotFoundError
|
|
750
|
+
*
|
|
751
|
+
* Error thrown when TOC area cannot be found in the document.
|
|
752
|
+
*/
|
|
753
|
+
declare class TocNotFoundError extends TocExtractError {
|
|
754
|
+
constructor(message?: string);
|
|
755
|
+
}
|
|
756
|
+
/**
|
|
757
|
+
* TocParseError
|
|
758
|
+
*
|
|
759
|
+
* Error thrown when LLM fails to parse TOC structure.
|
|
760
|
+
*/
|
|
761
|
+
declare class TocParseError extends TocExtractError {
|
|
762
|
+
constructor(message: string, options?: ErrorOptions);
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
/**
|
|
766
|
+
* Validation options for TocValidator
|
|
767
|
+
*/
|
|
768
|
+
interface TocValidationOptions {
|
|
769
|
+
/**
|
|
770
|
+
* Total page count of the document (for range validation)
|
|
771
|
+
* If not provided, page range upper bound validation is skipped
|
|
772
|
+
*/
|
|
773
|
+
totalPages?: number;
|
|
774
|
+
/**
|
|
775
|
+
* Maximum allowed title length (default: 200)
|
|
776
|
+
*/
|
|
777
|
+
maxTitleLength?: number;
|
|
778
|
+
}
|
|
779
|
+
|
|
780
|
+
/**
|
|
781
|
+
* Resolves $ref references in DoclingDocument to actual objects.
|
|
782
|
+
*
|
|
783
|
+
* DoclingDocument uses JSON references (e.g., "#/texts/0") to link nodes.
|
|
784
|
+
* This class builds an index for quick lookups of texts, pictures, tables, and groups.
|
|
785
|
+
*/
|
|
786
|
+
declare class RefResolver {
|
|
787
|
+
private readonly logger;
|
|
788
|
+
private readonly textMap;
|
|
789
|
+
private readonly pictureMap;
|
|
790
|
+
private readonly tableMap;
|
|
791
|
+
private readonly groupMap;
|
|
792
|
+
constructor(logger: LoggerMethods, doc: DoclingDocument);
|
|
793
|
+
/**
|
|
794
|
+
* Build an index mapping self_ref to the actual item
|
|
795
|
+
*/
|
|
796
|
+
private buildIndex;
|
|
797
|
+
/**
|
|
798
|
+
* Resolve a $ref string to the actual item
|
|
799
|
+
* @param ref - Reference string (e.g., "#/texts/0")
|
|
800
|
+
* @returns The resolved item, or null if not found
|
|
801
|
+
*/
|
|
802
|
+
resolve(ref: string): DoclingTextItem | DoclingPictureItem | DoclingTableItem | DoclingGroupItem | null;
|
|
803
|
+
/**
|
|
804
|
+
* Resolve a text reference
|
|
805
|
+
* @param ref - Reference string (e.g., "#/texts/0")
|
|
806
|
+
* @returns The resolved text item, or null if not found
|
|
807
|
+
*/
|
|
808
|
+
resolveText(ref: string): DoclingTextItem | null;
|
|
809
|
+
/**
|
|
810
|
+
* Resolve a picture reference
|
|
811
|
+
* @param ref - Reference string (e.g., "#/pictures/0")
|
|
812
|
+
* @returns The resolved picture item, or null if not found
|
|
813
|
+
*/
|
|
814
|
+
resolvePicture(ref: string): DoclingPictureItem | null;
|
|
815
|
+
/**
|
|
816
|
+
* Resolve a table reference
|
|
817
|
+
* @param ref - Reference string (e.g., "#/tables/0")
|
|
818
|
+
* @returns The resolved table item, or null if not found
|
|
819
|
+
*/
|
|
820
|
+
resolveTable(ref: string): DoclingTableItem | null;
|
|
821
|
+
/**
|
|
822
|
+
* Resolve a group reference
|
|
823
|
+
* @param ref - Reference string (e.g., "#/groups/0")
|
|
824
|
+
* @returns The resolved group item, or null if not found
|
|
825
|
+
*/
|
|
826
|
+
resolveGroup(ref: string): DoclingGroupItem | null;
|
|
827
|
+
/**
|
|
828
|
+
* Resolve multiple references at once
|
|
829
|
+
* @param refs - Array of reference objects with $ref property
|
|
830
|
+
* @returns Array of resolved items (null for unresolved references)
|
|
831
|
+
*/
|
|
832
|
+
resolveMany(refs: Array<{
|
|
833
|
+
$ref: string;
|
|
834
|
+
}>): Array<DoclingTextItem | DoclingPictureItem | DoclingTableItem | DoclingGroupItem | null>;
|
|
835
|
+
}
|
|
836
|
+
|
|
837
|
+
/**
|
|
838
|
+
* Generates sequential IDs for different types of items.
|
|
839
|
+
*
|
|
840
|
+
* IDs are formatted as: `{prefix}-{number}` where number is zero-padded to 3 digits.
|
|
841
|
+
* - Chapters: ch-001, ch-002, ...
|
|
842
|
+
* - Images: img-001, img-002, ...
|
|
843
|
+
* - Tables: tbl-001, tbl-002, ... (footnotes likewise: ftn-001, ftn-002, ...)
|
|
844
|
+
*
|
|
845
|
+
* Each type maintains its own independent counter.
|
|
846
|
+
*/
|
|
847
|
+
declare class IdGenerator {
|
|
848
|
+
private chapterCounter;
|
|
849
|
+
private imageCounter;
|
|
850
|
+
private tableCounter;
|
|
851
|
+
private footnoteCounter;
|
|
852
|
+
/**
|
|
853
|
+
* Generate a chapter ID
|
|
854
|
+
* @returns A chapter ID in the format "ch-001"
|
|
855
|
+
*/
|
|
856
|
+
generateChapterId(): string;
|
|
857
|
+
/**
|
|
858
|
+
* Generate an image ID
|
|
859
|
+
* @returns An image ID in the format "img-001"
|
|
860
|
+
*/
|
|
861
|
+
generateImageId(): string;
|
|
862
|
+
/**
|
|
863
|
+
* Generate a table ID
|
|
864
|
+
* @returns A table ID in the format "tbl-001"
|
|
865
|
+
*/
|
|
866
|
+
generateTableId(): string;
|
|
867
|
+
/**
|
|
868
|
+
* Generate a footnote ID
|
|
869
|
+
* @returns A footnote ID in the format "ftn-001"
|
|
870
|
+
*/
|
|
871
|
+
generateFootnoteId(): string;
|
|
872
|
+
/**
|
|
873
|
+
* Reset all counters to zero
|
|
874
|
+
*/
|
|
875
|
+
reset(): void;
|
|
876
|
+
/**
|
|
877
|
+
* Get current counter values (for testing/debugging)
|
|
878
|
+
*/
|
|
879
|
+
getCounters(): {
|
|
880
|
+
chapter: number;
|
|
881
|
+
image: number;
|
|
882
|
+
table: number;
|
|
883
|
+
footnote: number;
|
|
884
|
+
};
|
|
885
|
+
/**
|
|
886
|
+
* Pad a number to 3 digits with leading zeros
|
|
887
|
+
*/
|
|
888
|
+
private padNumber;
|
|
889
|
+
}
|
|
890
|
+
|
|
891
|
+
/**
|
|
892
|
+
* TOC keyword patterns for different languages
|
|
893
|
+
* Korean: 목차, 차례, 목 차
|
|
894
|
+
* Chinese: 目录, 目 录, 内容, 內容
|
|
895
|
+
* Japanese: 目次, 目 次
|
|
896
|
+
* English: Contents, Table of Contents, etc.
|
|
897
|
+
*/
|
|
898
|
+
declare const TOC_KEYWORDS: readonly ["목차", "차례", "목 차", "目录", "目 录", "内容", "內容", "目次", "目 次", "Contents", "Table of Contents", "TABLE OF CONTENTS", "CONTENTS"];
|
|
899
|
+
/**
|
|
900
|
+
* Continuation marker patterns for multi-page TOC
|
|
901
|
+
* Korean: 목차(계속), 목차 (계속), (계속)
|
|
902
|
+
* Chinese: 目录(续), 目录 (续), (续), 续表
|
|
903
|
+
* Japanese: 目次(続), 目次 (続), (続)
|
|
904
|
+
* English: (continued), (Continued), etc.
|
|
905
|
+
*/
|
|
906
|
+
declare const CONTINUATION_MARKERS: readonly ["목차(계속)", "목차 (계속)", "(계속)", "目录(续)", "目录 (续)", "(续)", "续表", "目次(続)", "目次 (続)", "(続)", "(continued)", "(Continued)", "(CONTINUED)", "continued"];
|
|
907
|
+
/**
|
|
908
|
+
* Page number pattern regex for detecting TOC-like structures
|
|
909
|
+
* Matches patterns like "... 123", ".... 45", ending with numbers
|
|
910
|
+
*/
|
|
911
|
+
declare const PAGE_NUMBER_PATTERN: RegExp;
|
|
912
|
+
/**
|
|
913
|
+
* TocFinder options
|
|
914
|
+
*/
|
|
915
|
+
interface TocFinderOptions {
|
|
916
|
+
/**
|
|
917
|
+
* Maximum pages to search for TOC (default: 10)
|
|
918
|
+
*/
|
|
919
|
+
maxSearchPages?: number;
|
|
920
|
+
/**
|
|
921
|
+
* Custom TOC keywords to add (optional)
|
|
922
|
+
*/
|
|
923
|
+
additionalKeywords?: string[];
|
|
924
|
+
}
|
|
925
|
+
/**
|
|
926
|
+
* TocFinder
|
|
927
|
+
*
|
|
928
|
+
* Finds TOC area in DoclingDocument using multi-stage search strategy:
|
|
929
|
+
* 1. Keyword search in texts (section_header, list_item labels)
|
|
930
|
+
* 2. Structure analysis for lists/tables with page number patterns
|
|
931
|
+
* 3. Position heuristic (prioritize early pages)
|
|
932
|
+
*/
|
|
933
|
+
declare class TocFinder {
|
|
934
|
+
private readonly logger;
|
|
935
|
+
private readonly refResolver;
|
|
936
|
+
private readonly maxSearchPages;
|
|
937
|
+
private readonly keywords;
|
|
938
|
+
constructor(logger: LoggerMethods, refResolver: RefResolver, options?: TocFinderOptions);
|
|
939
|
+
/**
|
|
940
|
+
* Find TOC area in the document
|
|
941
|
+
*
|
|
942
|
+
* @throws {TocNotFoundError} When no TOC area is found
|
|
943
|
+
*/
|
|
944
|
+
find(doc: DoclingDocument): TocAreaResult;
|
|
945
|
+
/**
|
|
946
|
+
* Stage 1: Search by keywords in text items
|
|
947
|
+
*/
|
|
948
|
+
private findByKeywords;
|
|
949
|
+
/**
|
|
950
|
+
* Stage 2: Search by structure (lists/tables with page numbers)
|
|
951
|
+
*/
|
|
952
|
+
private findByStructure;
|
|
953
|
+
/**
|
|
954
|
+
* Find the TOC container (group or table) from a parent reference
|
|
955
|
+
*/
|
|
956
|
+
private findTocContainer;
|
|
957
|
+
/**
|
|
958
|
+
* Check if a group contains TOC-like structure
|
|
959
|
+
*/
|
|
960
|
+
private isGroupTocLike;
|
|
961
|
+
/**
|
|
962
|
+
* Check if a table contains TOC-like structure
|
|
963
|
+
*/
|
|
964
|
+
private isTableTocLike;
|
|
965
|
+
/**
|
|
966
|
+
* Expand TOC area to consecutive pages
|
|
967
|
+
*/
|
|
968
|
+
private expandToConsecutivePages;
|
|
969
|
+
/**
|
|
970
|
+
* Find TOC continuation items on a specific page
|
|
971
|
+
*/
|
|
972
|
+
private findContinuationOnPage;
|
|
973
|
+
/**
|
|
974
|
+
* Check if text contains TOC keyword
|
|
975
|
+
*/
|
|
976
|
+
private containsTocKeyword;
|
|
977
|
+
/**
|
|
978
|
+
* Check for continuation markers
|
|
979
|
+
*/
|
|
980
|
+
private hasContinuationMarker;
|
|
981
|
+
/**
|
|
982
|
+
* Get first page number of a group by checking its children
|
|
983
|
+
*/
|
|
984
|
+
private getGroupFirstPage;
|
|
985
|
+
/**
|
|
986
|
+
* Calculate score for a group candidate
|
|
987
|
+
* Higher score = better match
|
|
988
|
+
*/
|
|
989
|
+
private calculateScore;
|
|
990
|
+
/**
|
|
991
|
+
* Calculate score for a table candidate
|
|
992
|
+
*/
|
|
993
|
+
private calculateTableScore;
|
|
994
|
+
}
|
|
995
|
+
|
|
996
|
+
/**
|
|
997
|
+
* Zod schema for recursive TocEntry structure
|
|
998
|
+
*/
|
|
999
|
+
declare const TocEntrySchema: z.ZodType<TocEntry>;
|
|
1000
|
+
/**
|
|
1001
|
+
* Schema for LLM response
|
|
1002
|
+
*/
|
|
1003
|
+
declare const TocResponseSchema: z.ZodObject<{
|
|
1004
|
+
entries: z.ZodArray<z.ZodType<TocEntry, unknown, z.core.$ZodTypeInternals<TocEntry, unknown>>>;
|
|
1005
|
+
}, z.core.$strip>;
|
|
1006
|
+
type TocResponse = z.infer<typeof TocResponseSchema>;
|
|
1007
|
+
/**
|
|
1008
|
+
* TocExtractor options
|
|
1009
|
+
*/
|
|
1010
|
+
interface TocExtractorOptions extends BaseLLMComponentOptions {
|
|
1011
|
+
/**
|
|
1012
|
+
* Validation options (optional)
|
|
1013
|
+
* If not provided, validation is performed with default settings
|
|
1014
|
+
*/
|
|
1015
|
+
validation?: TocValidationOptions;
|
|
1016
|
+
/**
|
|
1017
|
+
* Whether to skip validation entirely (default: false)
|
|
1018
|
+
*/
|
|
1019
|
+
skipValidation?: boolean;
|
|
1020
|
+
}
|
|
1021
|
+
/**
|
|
1022
|
+
* TocExtractor
|
|
1023
|
+
*
|
|
1024
|
+
* Uses high-performance LLM to extract structured TOC from Markdown representation.
|
|
1025
|
+
* Extends TextLLMComponent for standardized LLM call handling.
|
|
1026
|
+
*/
|
|
1027
|
+
declare class TocExtractor extends TextLLMComponent {
|
|
1028
|
+
private readonly validationOptions?;
|
|
1029
|
+
private readonly skipValidation;
|
|
1030
|
+
constructor(logger: LoggerMethods, model: LanguageModel, options?: TocExtractorOptions, fallbackModel?: LanguageModel, abortSignal?: AbortSignal);
|
|
1031
|
+
/**
|
|
1032
|
+
* Extract TOC structure from Markdown
|
|
1033
|
+
*
|
|
1034
|
+
* @param markdown - Markdown representation of TOC area
|
|
1035
|
+
* @returns Object with entries array and token usage information
|
|
1036
|
+
* @throws {TocParseError} When LLM fails to parse structure
|
|
1037
|
+
* @throws {TocValidationError} When validation fails
|
|
1038
|
+
*/
|
|
1039
|
+
extract(markdown: string): Promise<{
|
|
1040
|
+
entries: TocEntry[];
|
|
1041
|
+
usage: ExtendedTokenUsage;
|
|
1042
|
+
}>;
|
|
1043
|
+
/**
|
|
1044
|
+
* Validate extracted entries
|
|
1045
|
+
*
|
|
1046
|
+
* @throws {TocValidationError} When validation fails
|
|
1047
|
+
*/
|
|
1048
|
+
private validateEntries;
|
|
1049
|
+
/**
|
|
1050
|
+
* Build system prompt for TOC extraction
|
|
1051
|
+
*/
|
|
1052
|
+
protected buildSystemPrompt(): string;
|
|
1053
|
+
/**
|
|
1054
|
+
* Build user prompt with Markdown content
|
|
1055
|
+
*/
|
|
1056
|
+
protected buildUserPrompt(markdown: string): string;
|
|
1057
|
+
/**
|
|
1058
|
+
* Normalize and validate extracted entries
|
|
1059
|
+
*/
|
|
1060
|
+
private normalizeEntries;
|
|
1061
|
+
/**
|
|
1062
|
+
* Recursively ensure level consistency
|
|
1063
|
+
*
|
|
1064
|
+
* Children must have level = parent.level + 1
|
|
1065
|
+
*/
|
|
1066
|
+
private normalizeLevel;
|
|
1067
|
+
}
|
|
1068
|
+
|
|
1069
|
+
/**
|
|
1070
|
+
* Schema for vision-based TOC extraction response
|
|
1071
|
+
*/
|
|
1072
|
+
declare const VisionTocExtractionSchema: z.ZodObject<{
|
|
1073
|
+
hasToc: z.ZodBoolean;
|
|
1074
|
+
tocMarkdown: z.ZodNullable<z.ZodString>;
|
|
1075
|
+
continuesOnNextPage: z.ZodBoolean;
|
|
1076
|
+
}, z.core.$strip>;
|
|
1077
|
+
type VisionTocExtractionResult = z.infer<typeof VisionTocExtractionSchema>;
|
|
1078
|
+
/**
|
|
1079
|
+
* Options for VisionTocExtractor
|
|
1080
|
+
*/
|
|
1081
|
+
interface VisionTocExtractorOptions extends VisionLLMComponentOptions {
|
|
1082
|
+
/**
|
|
1083
|
+
* Number of pages for first batch (default: 10)
|
|
1084
|
+
*/
|
|
1085
|
+
firstBatchSize?: number;
|
|
1086
|
+
/**
|
|
1087
|
+
* Number of pages for second batch (default: 10)
|
|
1088
|
+
*/
|
|
1089
|
+
secondBatchSize?: number;
|
|
1090
|
+
}
|
|
1091
|
+
/**
|
|
1092
|
+
* VisionTocExtractor
|
|
1093
|
+
*
|
|
1094
|
+
* Uses vision LLM to find and extract TOC directly from page images.
|
|
1095
|
+
* Fallback strategy when rule-based extraction fails or produces invalid content.
|
|
1096
|
+
* Extends VisionLLMComponent for standardized vision LLM call handling.
|
|
1097
|
+
*
|
|
1098
|
+
* Output format matches MarkdownConverter.convert() for consistency.
|
|
1099
|
+
*/
|
|
1100
|
+
declare class VisionTocExtractor extends VisionLLMComponent {
|
|
1101
|
+
private readonly firstBatchSize;
|
|
1102
|
+
private readonly secondBatchSize;
|
|
1103
|
+
constructor(logger: LoggerMethods, model: LanguageModel, outputPath: string, options?: VisionTocExtractorOptions, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator);
|
|
1104
|
+
/**
|
|
1105
|
+
* Extract TOC from page images
|
|
1106
|
+
*
|
|
1107
|
+
* With the default batch sizes, searches pages 1-10 first, then pages 11-20 if no TOC is found.
|
|
1108
|
+
*
|
|
1109
|
+
* @param totalPages - Total number of pages in the document
|
|
1110
|
+
* @returns Extracted TOC markdown or null if not found
|
|
1111
|
+
*/
|
|
1112
|
+
extract(totalPages: number): Promise<string | null>;
|
|
1113
|
+
/**
|
|
1114
|
+
* Extract TOC from a specific batch of pages
|
|
1115
|
+
*/
|
|
1116
|
+
private extractFromBatch;
|
|
1117
|
+
/**
|
|
1118
|
+
* Load page images and build message content
|
|
1119
|
+
*/
|
|
1120
|
+
private loadPageImages;
|
|
1121
|
+
/**
|
|
1122
|
+
* Merge markdown from multiple batches
|
|
1123
|
+
*/
|
|
1124
|
+
private mergeMarkdown;
|
|
1125
|
+
/**
|
|
1126
|
+
* Build system prompt for vision LLM (not used, but required by abstract class)
|
|
1127
|
+
*/
|
|
1128
|
+
protected buildSystemPrompt(): string;
|
|
1129
|
+
/**
|
|
1130
|
+
* Build user prompt with page range information
|
|
1131
|
+
*/
|
|
1132
|
+
protected buildUserPrompt(startPage: number, endPage: number): string;
|
|
1133
|
+
}
|
|
1134
|
+
|
|
1135
|
+
/**
|
|
1136
|
+
* Base options for all validators
|
|
1137
|
+
*
|
|
1138
|
+
* Alias of BaseLLMComponentOptions, kept for backwards compatibility.
|
|
1139
|
+
*/
|
|
1140
|
+
type BaseValidatorOptions = BaseLLMComponentOptions;
|
|
1141
|
+
/**
|
|
1142
|
+
* Abstract base class for LLM-based validators
|
|
1143
|
+
*
|
|
1144
|
+
* Extends TextLLMComponent to provide common functionality for validators
|
|
1145
|
+
* that use LLM to validate/analyze content:
|
|
1146
|
+
* - LLM API call wrapper with LLMCaller (via callLLM method)
|
|
1147
|
+
* - Standard logging patterns (via log method from base class)
|
|
1148
|
+
* - Retry and fallback configuration
|
|
1149
|
+
*
|
|
1150
|
+
* Token usage is tracked by LLMCaller and should be aggregated by DocumentProcessor.
|
|
1151
|
+
*
|
|
1152
|
+
* @template TSchema - Zod schema type for validation
|
|
1153
|
+
* @template TResult - Result type after parsing with schema
|
|
1154
|
+
*/
|
|
1155
|
+
declare abstract class BaseValidator<TSchema extends z.ZodType, TResult = z.infer<TSchema>> extends TextLLMComponent {
|
|
1156
|
+
/**
|
|
1157
|
+
* Validator name for logging (kept for backwards compatibility)
|
|
1158
|
+
*/
|
|
1159
|
+
protected readonly validatorName: string;
|
|
1160
|
+
/**
|
|
1161
|
+
* Constructor for BaseValidator
|
|
1162
|
+
*
|
|
1163
|
+
* @param logger - Logger instance
|
|
1164
|
+
* @param model - Language model to use for validation
|
|
1165
|
+
* @param validatorName - Name of the validator for logging (e.g., "TocContentValidator")
|
|
1166
|
+
* @param options - Optional configuration (maxRetries, temperature)
|
|
1167
|
+
* @param fallbackModel - Optional fallback model for retry on failure
|
|
1168
|
+
* @param aggregator - Optional token usage aggregator for tracking LLM calls
|
|
1169
|
+
*/
|
|
1170
|
+
constructor(logger: LoggerMethods, model: LanguageModel, validatorName: string, options?: BaseValidatorOptions, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator);
|
|
1171
|
+
/**
|
|
1172
|
+
* Call LLM with LLMCaller
|
|
1173
|
+
*
|
|
1174
|
+
* This method provides backwards compatibility with existing validators.
|
|
1175
|
+
* It wraps the parent callTextLLM method but allows passing a custom aggregator.
|
|
1176
|
+
*
|
|
1177
|
+
* @param schema - Zod schema for response validation
|
|
1178
|
+
* @param systemPrompt - System prompt
|
|
1179
|
+
* @param userPrompt - User prompt
|
|
1180
|
+
* @param phase - Phase name for tracking (e.g., 'validation', 'batch-validation')
|
|
1181
|
+
* @param aggregator - Optional token usage aggregator for tracking this call
|
|
1182
|
+
* @returns Parsed and validated LLM response with usage information
|
|
1183
|
+
*/
|
|
1184
|
+
protected callLLM(schema: TSchema, systemPrompt: string, userPrompt: string, phase: string, aggregator?: LLMTokenUsageAggregator): Promise<{
|
|
1185
|
+
output: TResult;
|
|
1186
|
+
usage: ExtendedTokenUsage;
|
|
1187
|
+
}>;
|
|
1188
|
+
}
|
|
1189
|
+
|
|
1190
|
+
/**
|
|
1191
|
+
* Schema for TOC content validation response
|
|
1192
|
+
*/
|
|
1193
|
+
declare const TocContentValidationSchema: z.ZodObject<{
|
|
1194
|
+
isToc: z.ZodBoolean;
|
|
1195
|
+
confidence: z.ZodNumber;
|
|
1196
|
+
reason: z.ZodString;
|
|
1197
|
+
}, z.core.$strip>;
|
|
1198
|
+
type TocContentValidationResult = z.infer<typeof TocContentValidationSchema>;
|
|
1199
|
+
/**
|
|
1200
|
+
* Options for TocContentValidator
|
|
1201
|
+
*/
|
|
1202
|
+
interface TocContentValidatorOptions extends BaseValidatorOptions {
|
|
1203
|
+
/**
|
|
1204
|
+
* Minimum confidence to consider valid (default: 0.7)
|
|
1205
|
+
*/
|
|
1206
|
+
confidenceThreshold?: number;
|
|
1207
|
+
}
|
|
1208
|
+
/**
|
|
1209
|
+
* TocContentValidator
|
|
1210
|
+
*
|
|
1211
|
+
* Uses LLM to validate whether extracted markdown content is actually a TOC.
|
|
1212
|
+
* This is a semantic validation, not structural validation.
|
|
1213
|
+
*/
|
|
1214
|
+
declare class TocContentValidator extends BaseValidator<typeof TocContentValidationSchema, TocContentValidationResult> {
|
|
1215
|
+
private readonly confidenceThreshold;
|
|
1216
|
+
constructor(logger: LoggerMethods, model: LanguageModel, options?: TocContentValidatorOptions, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator);
|
|
1217
|
+
/**
|
|
1218
|
+
* Validate if the markdown content is a table of contents
|
|
1219
|
+
*
|
|
1220
|
+
* @param markdown - Markdown content to validate
|
|
1221
|
+
* @returns Validation result with isToc, confidence, and reason
|
|
1222
|
+
*/
|
|
1223
|
+
validate(markdown: string): Promise<TocContentValidationResult>;
|
|
1224
|
+
/**
|
|
1225
|
+
* Check if validation result passes threshold
|
|
1226
|
+
*
|
|
1227
|
+
* @param result - Validation result from validate()
|
|
1228
|
+
* @returns true if content is valid TOC with sufficient confidence
|
|
1229
|
+
*/
|
|
1230
|
+
isValid(result: TocContentValidationResult): boolean;
|
|
1231
|
+
/**
|
|
1232
|
+
* Build system prompt for TOC content validation
|
|
1233
|
+
*/
|
|
1234
|
+
protected buildSystemPrompt(): string;
|
|
1235
|
+
/**
|
|
1236
|
+
* Build user prompt with markdown content
|
|
1237
|
+
*/
|
|
1238
|
+
protected buildUserPrompt(markdown: string): string;
|
|
1239
|
+
}
|
|
1240
|
+
|
|
1241
|
+
/**
|
|
1242
|
+
* Schema for batch caption validation response
|
|
1243
|
+
*/
|
|
1244
|
+
declare const CaptionValidationBatchSchema: z.ZodObject<{
|
|
1245
|
+
results: z.ZodArray<z.ZodObject<{
|
|
1246
|
+
index: z.ZodNumber;
|
|
1247
|
+
isValid: z.ZodBoolean;
|
|
1248
|
+
reason: z.ZodNullable<z.ZodString>;
|
|
1249
|
+
}, z.core.$strip>>;
|
|
1250
|
+
}, z.core.$strip>;
|
|
1251
|
+
type CaptionValidationBatch = z.infer<typeof CaptionValidationBatchSchema>;
|
|
1252
|
+
/**
|
|
1253
|
+
* Options for CaptionValidator
|
|
1254
|
+
*/
|
|
1255
|
+
interface CaptionValidatorOptions extends BaseValidatorOptions {
|
|
1256
|
+
}
|
|
1257
|
+
/**
|
|
1258
|
+
* CaptionValidator
|
|
1259
|
+
*
|
|
1260
|
+
* Validates parsed captions against original text using LLM.
|
|
1261
|
+
* Processes captions in batches to optimize LLM API calls.
|
|
1262
|
+
*
|
|
1263
|
+
* ## Validation Rules
|
|
1264
|
+
*
|
|
1265
|
+
* Checks if the parsed "num" field correctly extracts the prefix + number from original text:
|
|
1266
|
+
* 1. **Correctness**: The "num" must contain the actual prefix+number from the original text
|
|
1267
|
+
* - Example: "도판 1 유적 전경" → num="도판 1" ✓
|
|
1268
|
+
* - Example: "도판 1 유적 전경" → num="도판" ✗ (incomplete)
|
|
1269
|
+
*
|
|
1270
|
+
* 2. **Spacing**: The spacing in "num" must match the original text exactly
|
|
1271
|
+
* - Example: "도판 1" → num="도판 1" ✓
|
|
1272
|
+
* - Example: "도판1" → num="도판1" ✓
|
|
1273
|
+
* - Example: "도판 1" → num="도판1" ✗ (spacing mismatch)
|
|
1274
|
+
*
|
|
1275
|
+
* 3. **Completeness**: The number part must be fully extracted
|
|
1276
|
+
* - Example: "Figure 2-3" → num="Figure 2-3" ✓
|
|
1277
|
+
* - Example: "Figure 2-3" → num="Figure 2" ✗ (incomplete number)
|
|
1278
|
+
*
|
|
1279
|
+
* 4. **Null handling**: If "num" is null, verify that the original text has no number prefix
|
|
1280
|
+
* - Example: "유적 전경 사진" → num=null ✓
|
|
1281
|
+
* - Example: "도판 1 전경" → num=null ✗ (should extract "도판 1")
|
|
1282
|
+
*/
|
|
1283
|
+
declare class CaptionValidator extends BaseValidator<typeof CaptionValidationBatchSchema, CaptionValidationBatch> {
|
|
1284
|
+
constructor(logger: LoggerMethods, model: LanguageModel, options?: CaptionValidatorOptions, fallbackModel?: LanguageModel, aggregator?: LLMTokenUsageAggregator);
|
|
1285
|
+
/**
|
|
1286
|
+
* Validate batch of parsed captions against original texts
|
|
1287
|
+
*
|
|
1288
|
+
* @param captions - Array of parsed Caption objects
|
|
1289
|
+
* @param originalTexts - Array of original caption texts (same order as captions)
|
|
1290
|
+
* @param batchSize - Batch size for processing. Set to 0 to skip validation (assume all valid).
|
|
1291
|
+
* @returns Array of validation results (boolean) maintaining original order
|
|
1292
|
+
*/
|
|
1293
|
+
validateBatch(captions: Caption[], originalTexts: string[], batchSize: number): Promise<boolean[]>;
|
|
1294
|
+
/**
|
|
1295
|
+
* Internal: Validate batch of captions using LLM
|
|
1296
|
+
*
|
|
1297
|
+
* @param items - Batch of caption items with original indices
|
|
1298
|
+
* @param model - Effective model to use
|
|
1299
|
+
* @returns Array of validation results indexed correctly
|
|
1300
|
+
*/
|
|
1301
|
+
private validateBatchInternal;
|
|
1302
|
+
protected buildSystemPrompt(): string;
|
|
1303
|
+
protected buildUserPrompt(items: Array<{
|
|
1304
|
+
index: number;
|
|
1305
|
+
caption: Caption;
|
|
1306
|
+
originalText: string;
|
|
1307
|
+
}>): string;
|
|
1308
|
+
}
|
|
1309
|
+
/**
|
|
1310
|
+
* Error thrown when caption validation fails
|
|
1311
|
+
*/
|
|
1312
|
+
declare class CaptionValidationError extends Error {
|
|
1313
|
+
constructor(message: string, options?: ErrorOptions);
|
|
1314
|
+
}
|
|
1315
|
+
|
|
1316
|
+
/**
|
|
1317
|
+
* ChapterConverter
|
|
1318
|
+
*
|
|
1319
|
+
* Converts TocEntry[] to Chapter[] with text blocks, images, tables, and footnotes.
|
|
1320
|
+
*
|
|
1321
|
+
* ## Conversion Process
|
|
1322
|
+
*
|
|
1323
|
+
* 1. Create Front Matter chapter (ch-000) for pre-TOC content
|
|
1324
|
+
* 2. Build chapter tree from TocEntry[] (recursive)
|
|
1325
|
+
* 3. Calculate page ranges for each chapter
|
|
1326
|
+
* 4. Assign text blocks to chapters based on page ranges
|
|
1327
|
+
* 5. Link images, tables, and footnotes to chapters based on page ranges
|
|
1328
|
+
*
|
|
1329
|
+
* ## Page Assignment Strategy
|
|
1330
|
+
*
|
|
1331
|
+
* Uses "start page first" strategy: resources are assigned to the chapter
|
|
1332
|
+
* whose startPage is the largest value that is still <= the resource's page.
|
|
1333
|
+
*
|
|
1334
|
+
* ## Front Matter
|
|
1335
|
+
*
|
|
1336
|
+
* A special chapter (ch-000) is always created to hold content that appears
|
|
1337
|
+
* before the first TOC entry (e.g., cover, preface, table of contents itself).
|
|
1338
|
+
*/
|
|
1339
|
+
declare class ChapterConverter {
|
|
1340
|
+
private static readonly FRONT_MATTER_ID;
|
|
1341
|
+
private static readonly FRONT_MATTER_TITLE;
|
|
1342
|
+
private readonly logger;
|
|
1343
|
+
private readonly idGenerator;
|
|
1344
|
+
constructor(logger: LoggerMethods, idGenerator: IdGenerator);
|
|
1345
|
+
/**
|
|
1346
|
+
* Convert TocEntry[] to Chapter[]
|
|
1347
|
+
*
|
|
1348
|
+
* @param tocEntries - Table of contents entries
|
|
1349
|
+
* @param textItems - DoclingDocument.texts (with prov for page numbers)
|
|
1350
|
+
* @param pageRangeMap - PDF page to actual page mapping
|
|
1351
|
+
* @param images - Converted images
|
|
1352
|
+
* @param tables - Converted tables
|
|
1353
|
+
* @param footnotes - Converted footnotes
|
|
1354
|
+
* @returns Converted chapters with text blocks and resource references
|
|
1355
|
+
*/
|
|
1356
|
+
convert(tocEntries: TocEntry[], textItems: DoclingTextItem[], pageRangeMap: Record<number, PageRange>, images: ProcessedImage[], tables: ProcessedTable[], footnotes: ProcessedFootnote[]): Chapter[];
|
|
1357
|
+
/**
|
|
1358
|
+
* Create Front Matter chapter for pre-TOC content
|
|
1359
|
+
*/
|
|
1360
|
+
private createFrontMatterChapter;
|
|
1361
|
+
/**
|
|
1362
|
+
* Build chapter tree from TocEntry[]
|
|
1363
|
+
* Recursively processes children
|
|
1364
|
+
*/
|
|
1365
|
+
private buildChapterTree;
|
|
1366
|
+
/**
|
|
1367
|
+
* Flatten chapter tree for page range calculation
|
|
1368
|
+
* Preserves original TOC page numbers
|
|
1369
|
+
*/
|
|
1370
|
+
private flattenChapters;
|
|
1371
|
+
/**
|
|
1372
|
+
* Calculate page range for each chapter
|
|
1373
|
+
* Uses next chapter's start page as end boundary
|
|
1374
|
+
*
|
|
1375
|
+
* Front Matter (ch-000) gets special handling:
|
|
1376
|
+
* - startPage: 1
|
|
1377
|
+
* - endPage: first TOC entry's page - 1 (or 0 if TOC starts at page 1)
|
|
1378
|
+
*/
|
|
1379
|
+
private calculatePageRanges;
|
|
1380
|
+
/**
|
|
1381
|
+
* Valid labels for text blocks
|
|
1382
|
+
* Only these labels are included in chapter text blocks
|
|
1383
|
+
*/
|
|
1384
|
+
private static readonly VALID_TEXT_LABELS;
|
|
1385
|
+
/**
|
|
1386
|
+
* Check if text item has a picture parent
|
|
1387
|
+
* Items with parent.$ref starting with "#/pictures/" are excluded
|
|
1388
|
+
*/
|
|
1389
|
+
private static hasPictureParent;
|
|
1390
|
+
/**
|
|
1391
|
+
* Convert text items to text blocks
|
|
1392
|
+
* Filters by label (text, section_header, list_item), excludes picture children,
|
|
1393
|
+
* and extracts page numbers from prov
|
|
1394
|
+
*/
|
|
1395
|
+
private convertTextBlocks;
|
|
1396
|
+
/**
|
|
1397
|
+
* Convert PDF page number to actual document page number
|
|
1398
|
+
* Falls back to pdfPageNo if mapping is missing
|
|
1399
|
+
*/
|
|
1400
|
+
private pdfPageToActualPage;
|
|
1401
|
+
/**
|
|
1402
|
+
* Find chapter ID for a given actual page number
|
|
1403
|
+
* Uses "start page first" strategy
|
|
1404
|
+
*/
|
|
1405
|
+
private findChapterForPage;
|
|
1406
|
+
/**
|
|
1407
|
+
* Assign text blocks to chapters based on page ranges
|
|
1408
|
+
*/
|
|
1409
|
+
private assignTextBlocks;
|
|
1410
|
+
/**
|
|
1411
|
+
* Link images, tables, and footnotes to chapters based on page ranges
|
|
1412
|
+
*/
|
|
1413
|
+
private linkResources;
|
|
1414
|
+
/**
|
|
1415
|
+
* Build flat chapter map for O(1) lookup
|
|
1416
|
+
*/
|
|
1417
|
+
private buildChapterMap;
|
|
1418
|
+
}
|
|
1419
|
+
|
|
1420
|
+
export { BaseLLMComponent, type BaseLLMComponentOptions, BaseValidator, type BaseValidatorOptions, CONTINUATION_MARKERS, CaptionParseError, CaptionParser, type CaptionParserOptions, CaptionValidationError, CaptionValidator, type CaptionValidatorOptions, ChapterConverter, DocumentProcessor, type DocumentProcessorOptions, type ImageContent, PAGE_NUMBER_PATTERN, PagePattern, PageRangeParseError, PageRangeParser, type PageSizeGroup, TOC_KEYWORDS, TextLLMComponent, type TocAreaResult, type TocContentValidationResult, TocContentValidationSchema, TocContentValidator, type TocContentValidatorOptions, type TocEntry, TocEntrySchema, TocExtractError, TocExtractor, type TocExtractorOptions, TocFinder, type TocFinderOptions, TocNotFoundError, TocParseError, type TocResponse, TocResponseSchema, VisionLLMComponent, type VisionLLMComponentOptions, type VisionTocExtractionResult, VisionTocExtractionSchema, VisionTocExtractor, type VisionTocExtractorOptions };
|