@heripo/document-processor 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.ko.md +332 -0
- package/README.md +332 -0
- package/dist/index.cjs +4325 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +1420 -0
- package/dist/index.d.ts +1420 -0
- package/dist/index.js +4262 -0
- package/dist/index.js.map +1 -0
- package/package.json +89 -0
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,4325 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// esbuild CommonJS/ESM interop helpers (restyled for readability).
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Register every named export on `target` as a lazy, enumerable getter.
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};
// Copy own properties of `from` onto `to` as lazy getters, skipping `except`
// and keys `to` already owns; enumerability mirrors the source descriptor.
var __copyProps = (to, from, except, desc) => {
  if (from && (typeof from === "object" || typeof from === "function")) {
    for (let key of __getOwnPropNames(from)) {
      if (!__hasOwnProp.call(to, key) && key !== except) {
        __defProp(to, key, {
          get: () => from[key],
          enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
        });
      }
    }
  }
  return to;
};
// Wrap a CommonJS module so it can be consumed like an ES module namespace.
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  const base = isNodeMode || !mod || !mod.__esModule
    ? __defProp(target, "default", { value: mod, enumerable: true })
    : target;
  return __copyProps(base, mod);
};
// Mark a namespace object as an ES module and mirror its exports onto it.
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
|
+
|
|
30
|
+
// src/index.ts
|
|
31
|
+
// src/index.ts
// Public entry-point exports for the CJS build. Each value is a thunk so
// __export can register it as a lazy getter (symbols are defined later in
// this bundle).
var index_exports = {};
const publicExports = {
  BaseLLMComponent: () => BaseLLMComponent,
  BaseValidator: () => BaseValidator,
  CONTINUATION_MARKERS: () => CONTINUATION_MARKERS,
  CaptionParseError: () => CaptionParseError,
  CaptionParser: () => CaptionParser,
  CaptionValidationError: () => CaptionValidationError,
  CaptionValidator: () => CaptionValidator,
  ChapterConverter: () => ChapterConverter,
  DocumentProcessor: () => DocumentProcessor,
  PAGE_NUMBER_PATTERN: () => PAGE_NUMBER_PATTERN,
  PagePattern: () => PagePattern,
  PageRangeParseError: () => PageRangeParseError,
  PageRangeParser: () => PageRangeParser,
  TOC_KEYWORDS: () => TOC_KEYWORDS,
  TextLLMComponent: () => TextLLMComponent,
  TocContentValidationSchema: () => TocContentValidationSchema,
  TocContentValidator: () => TocContentValidator,
  TocEntrySchema: () => TocEntrySchema,
  TocExtractError: () => TocExtractError,
  TocExtractor: () => TocExtractor,
  TocFinder: () => TocFinder,
  TocNotFoundError: () => TocNotFoundError,
  TocParseError: () => TocParseError,
  TocResponseSchema: () => TocResponseSchema,
  VisionLLMComponent: () => VisionLLMComponent,
  VisionTocExtractionSchema: () => VisionTocExtractionSchema,
  VisionTocExtractor: () => VisionTocExtractor
};
__export(index_exports, publicExports);
module.exports = __toCommonJS(index_exports);
|
|
62
|
+
|
|
63
|
+
// ../shared/dist/index.mjs
|
|
64
|
+
var import_ai = require("ai");
|
|
65
|
+
var BatchProcessor = class {
  /**
   * Splits an array into batches of specified size.
   *
   * @param items - Array to split
   * @param batchSize - Size of each batch; must be a positive integer
   * @returns Array of batches
   * @throws {RangeError} if batchSize is not a positive integer. Previously a
   *   non-positive size made the loop below spin forever (`i += 0`); failing
   *   fast is strictly safer for callers.
   *
   * @example
   * ```typescript
   * const items = [1, 2, 3, 4, 5];
   * const batches = BatchProcessor.createBatches(items, 2);
   * // [[1, 2], [3, 4], [5]]
   * ```
   */
  static createBatches(items, batchSize) {
    if (!Number.isInteger(batchSize) || batchSize <= 0) {
      throw new RangeError(`batchSize must be a positive integer, got ${batchSize}`);
    }
    const batches = [];
    for (let i = 0; i < items.length; i += batchSize) {
      batches.push(items.slice(i, i + batchSize));
    }
    return batches;
  }
  /**
   * Splits an array into batches and executes async function in parallel.
   *
   * All batches run concurrently via Promise.all; results are flattened back
   * into a single array in input order.
   *
   * @param items - Array to process
   * @param batchSize - Size of each batch; must be a positive integer
   * @param processFn - Async function to process each batch
   * @returns Flattened array of processed results
   * @throws {RangeError} if batchSize is not a positive integer
   *
   * @example
   * ```typescript
   * const texts = ['a', 'b', 'c', 'd', 'e'];
   * const results = await BatchProcessor.processBatch(
   *   texts,
   *   2,
   *   async (batch) => {
   *     return batch.map(t => t.toUpperCase());
   *   }
   * );
   * // ['A', 'B', 'C', 'D', 'E']
   * ```
   */
  static async processBatch(items, batchSize, processFn) {
    const batches = this.createBatches(items, batchSize);
    const results = await Promise.all(batches.map((batch) => processFn(batch)));
    return results.flat();
  }
  /**
   * Splits an array into batches and executes sync function on each batch.
   *
   * @param items - Array to process
   * @param batchSize - Size of each batch; must be a positive integer
   * @param processFn - Sync function to process each batch
   * @returns Flattened array of processed results
   * @throws {RangeError} if batchSize is not a positive integer
   *
   * @example
   * ```typescript
   * const numbers = [1, 2, 3, 4, 5];
   * const results = BatchProcessor.processBatchSync(
   *   numbers,
   *   2,
   *   (batch) => batch.map(n => n * 2)
   * );
   * // [2, 4, 6, 8, 10]
   * ```
   */
  static processBatchSync(items, batchSize, processFn) {
    const batches = this.createBatches(items, batchSize);
    const results = batches.map((batch) => processFn(batch));
    return results.flat();
  }
};
|
|
138
|
+
var LLMCaller = class {
  /**
   * Extract model name from LanguageModel object.
   *
   * Probes, in order, the fields a LanguageModel implementation may expose
   * (`modelId`, `id`, `model`, `name`) and returns the first string value;
   * falls back to String(model) when none is present.
   */
  static extractModelName(model) {
    const source = model;
    for (const field of ["modelId", "id", "model", "name"]) {
      const value = source[field];
      if (typeof value === "string") return value;
    }
    return String(model);
  }
  /**
   * Build usage information from a generateText response.
   *
   * Missing usage fields default to 0 via nullish coalescing.
   */
  static buildUsage(config, modelName, response, usedFallback) {
    const usage = response.usage;
    return {
      component: config.component,
      phase: config.phase,
      model: usedFallback ? "fallback" : "primary",
      modelName,
      inputTokens: usage?.inputTokens ?? 0,
      outputTokens: usage?.outputTokens ?? 0,
      totalTokens: usage?.totalTokens ?? 0
    };
  }
  /**
   * Execute LLM call with fallback support.
   *
   * Tries the primary model first; on failure (and only when the call was not
   * aborted and a fallback model is configured) retries once with the
   * fallback model. Errors from the fallback propagate to the caller.
   */
  static async executeWithFallback(config, generateFn) {
    const primaryName = this.extractModelName(config.primaryModel);
    try {
      const primaryResponse = await generateFn(config.primaryModel);
      return {
        output: primaryResponse.output,
        usage: this.buildUsage(config, primaryName, primaryResponse, false),
        usedFallback: false
      };
    } catch (primaryError) {
      // An aborted request or a missing fallback both surface the original error.
      if (config.abortSignal?.aborted || !config.fallbackModel) {
        throw primaryError;
      }
      const fallbackName = this.extractModelName(config.fallbackModel);
      const fallbackResponse = await generateFn(config.fallbackModel);
      return {
        output: fallbackResponse.output,
        usage: this.buildUsage(config, fallbackName, fallbackResponse, true),
        usedFallback: true
      };
    }
  }
  /**
   * Call LLM with retry and fallback support.
   *
   * Retry Strategy:
   * 1. Try primary model up to maxRetries times (handled by generateText)
   * 2. If all fail and fallbackModel provided, try fallback up to maxRetries times
   * 3. Throw error if all attempts exhausted
   *
   * @template TOutput - Output type from schema validation
   * @param config - LLM call configuration
   * @returns Result with parsed object and usage information
   * @throws Error if all retry attempts fail
   */
  static async call(config) {
    const generate = (model) => (0, import_ai.generateText)({
      model,
      output: import_ai.Output.object({ schema: config.schema }),
      system: config.systemPrompt,
      prompt: config.userPrompt,
      temperature: config.temperature,
      maxRetries: config.maxRetries,
      abortSignal: config.abortSignal
    });
    return this.executeWithFallback(config, generate);
  }
  /**
   * Call LLM for vision tasks with message format support.
   *
   * Same retry and fallback logic as call(), but using message format
   * instead of system/user prompts.
   *
   * @template TOutput - Output type from schema validation
   * @param config - LLM vision call configuration
   * @returns Result with parsed object and usage information
   * @throws Error if all retry attempts fail
   */
  static async callVision(config) {
    const generate = (model) => (0, import_ai.generateText)({
      model,
      output: import_ai.Output.object({ schema: config.schema }),
      messages: config.messages,
      temperature: config.temperature,
      maxRetries: config.maxRetries,
      abortSignal: config.abortSignal
    });
    return this.executeWithFallback(config, generate);
  }
};
|
|
251
|
+
// Render a token-usage triple as "<in> input, <out> output, <total> total".
function formatTokens(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  return `${inputTokens} input, ${outputTokens} output, ${totalTokens} total`;
}
|
|
254
|
+
var LLMTokenUsageAggregator = class {
  // component name -> { component, phases, total } accumulator
  usage = {};
  // Add one usage sample's token counts into a mutable bucket.
  #add(bucket, delta) {
    bucket.inputTokens += delta.inputTokens;
    bucket.outputTokens += delta.outputTokens;
    bucket.totalTokens += delta.totalTokens;
  }
  /**
   * Track token usage from an LLM call.
   *
   * Lazily creates the component/phase/model buckets on first sight; the
   * modelName recorded for a phase's primary/fallback bucket is the one from
   * the first sample (later samples only add token counts).
   *
   * @param usage - Extended token usage with component/phase/model information
   */
  track(usage) {
    const componentEntry = this.usage[usage.component] ??= {
      component: usage.component,
      phases: {},
      total: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
    };
    const phaseEntry = componentEntry.phases[usage.phase] ??= {
      total: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
    };
    if (usage.model === "primary" || usage.model === "fallback") {
      const modelEntry = phaseEntry[usage.model] ??= {
        modelName: usage.modelName,
        inputTokens: 0,
        outputTokens: 0,
        totalTokens: 0
      };
      this.#add(modelEntry, usage);
    }
    this.#add(phaseEntry.total, usage);
    this.#add(componentEntry.total, usage);
  }
  /**
   * Get aggregated usage grouped by component.
   *
   * @returns Array of component aggregates with phase breakdown
   */
  getByComponent() {
    return Object.values(this.usage);
  }
  /**
   * Get token usage report in structured JSON format.
   *
   * Converts internal usage data to the external TokenUsageReport shape
   * suitable for serialization: component breakdown, per-phase details, and
   * primary/fallback model usage where present.
   *
   * @returns Structured token usage report with components and total
   */
  getReport() {
    const components = Object.values(this.usage).map((component) => {
      const phases = Object.entries(component.phases).map(([phaseName, phaseData]) => {
        const phaseReport = {
          phase: phaseName,
          total: { ...phaseData.total }
        };
        if (phaseData.primary) {
          phaseReport.primary = { ...phaseData.primary };
        }
        if (phaseData.fallback) {
          phaseReport.fallback = { ...phaseData.fallback };
        }
        return phaseReport;
      });
      return {
        component: component.component,
        phases,
        total: { ...component.total }
      };
    });
    return {
      components,
      total: { ...this.getTotalUsage() }
    };
  }
  /**
   * Get total usage across all components and phases.
   *
   * @returns Aggregated token usage totals
   */
  getTotalUsage() {
    const total = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
    for (const component of Object.values(this.usage)) {
      this.#add(total, component.total);
    }
    return total;
  }
  /**
   * Log comprehensive token usage summary.
   *
   * Outputs usage grouped by component, with phase and model breakdown;
   * primary and fallback totals are reported separately when non-zero.
   * Call this once at the end of document processing.
   *
   * @param logger - Logger instance for output
   */
  logSummary(logger) {
    const components = this.getByComponent();
    if (components.length === 0) {
      logger.info("[DocumentProcessor] No token usage to report");
      return;
    }
    logger.info("[DocumentProcessor] Token usage summary:");
    logger.info("");
    const grand = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
    const grandPrimary = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
    const grandFallback = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
    for (const component of components) {
      logger.info(`${component.component}:`);
      for (const [phaseName, phaseData] of Object.entries(component.phases)) {
        logger.info(`  - ${phaseName}:`);
        if (phaseData.primary) {
          logger.info(
            `      primary (${phaseData.primary.modelName}): ${formatTokens(phaseData.primary)}`
          );
          this.#add(grandPrimary, phaseData.primary);
        }
        if (phaseData.fallback) {
          logger.info(
            `      fallback (${phaseData.fallback.modelName}): ${formatTokens(phaseData.fallback)}`
          );
          this.#add(grandFallback, phaseData.fallback);
        }
        logger.info(`      subtotal: ${formatTokens(phaseData.total)}`);
      }
      logger.info(
        `  ${component.component} total: ${formatTokens(component.total)}`
      );
      logger.info("");
      this.#add(grand, component.total);
    }
    logger.info("--- Summary ---");
    if (grandPrimary.totalTokens > 0) {
      logger.info(`Primary total: ${formatTokens(grandPrimary)}`);
    }
    if (grandFallback.totalTokens > 0) {
      logger.info(`Fallback total: ${formatTokens(grandFallback)}`);
    }
    logger.info(`Grand total: ${formatTokens(grand)}`);
  }
  /**
   * Reset all tracked usage.
   *
   * Call this at the start of a new document processing run.
   */
  reset() {
    this.usage = {};
  }
};
|
|
496
|
+
|
|
497
|
+
// src/utils/ref-resolver.ts
|
|
498
|
+
var RefResolver = class {
  logger;
  textMap;
  pictureMap;
  tableMap;
  groupMap;
  /**
   * Index a document's texts/pictures/tables/groups by self_ref so that
   * "$ref" strings can be resolved in O(1).
   */
  constructor(logger, doc) {
    this.logger = logger;
    this.logger.info("[RefResolver] Initializing reference resolver...");
    this.textMap = this.buildIndex(doc.texts, "texts");
    this.pictureMap = this.buildIndex(doc.pictures, "pictures");
    this.tableMap = this.buildIndex(doc.tables, "tables");
    this.groupMap = this.buildIndex(doc.groups, "groups");
    this.logger.info(
      `[RefResolver] Indexed ${this.textMap.size} texts, ${this.pictureMap.size} pictures, ${this.tableMap.size} tables, ${this.groupMap.size} groups`
    );
  }
  /**
   * Build an index mapping self_ref to the actual item
   */
  buildIndex(items, _prefix) {
    return new Map(items.map((item) => [item.self_ref, item]));
  }
  /**
   * Resolve a $ref string to the actual item
   * @param ref - Reference string (e.g., "#/texts/0")
   * @returns The resolved item, or null if not found
   */
  resolve(ref) {
    const match = ref.match(/^#\/(\w+)\//);
    if (!match) {
      this.logger.warn(`[RefResolver] Invalid reference format: ${ref}`);
      return null;
    }
    // Map (not a plain object) so arbitrary collection names like
    // "constructor" cannot hit inherited prototype keys.
    const lookup = new Map([
      ["texts", { map: this.textMap, label: "Text" }],
      ["pictures", { map: this.pictureMap, label: "Picture" }],
      ["tables", { map: this.tableMap, label: "Table" }],
      ["groups", { map: this.groupMap, label: "Group" }]
    ]);
    const entry = lookup.get(match[1]);
    if (!entry) {
      this.logger.warn(`[RefResolver] Unknown collection type: ${match[1]}`);
      return null;
    }
    const resolved = entry.map.get(ref) ?? null;
    if (!resolved) {
      this.logger.warn(`[RefResolver] ${entry.label} reference not found: ${ref}`);
    }
    return resolved;
  }
  /**
   * Resolve a text reference
   * @param ref - Reference string (e.g., "#/texts/0")
   * @returns The resolved text item, or null if not found
   */
  resolveText(ref) {
    return this.textMap.get(ref) ?? null;
  }
  /**
   * Resolve a picture reference
   * @param ref - Reference string (e.g., "#/pictures/0")
   * @returns The resolved picture item, or null if not found
   */
  resolvePicture(ref) {
    return this.pictureMap.get(ref) ?? null;
  }
  /**
   * Resolve a table reference
   * @param ref - Reference string (e.g., "#/tables/0")
   * @returns The resolved table item, or null if not found
   */
  resolveTable(ref) {
    return this.tableMap.get(ref) ?? null;
  }
  /**
   * Resolve a group reference
   * @param ref - Reference string (e.g., "#/groups/0")
   * @returns The resolved group item, or null if not found
   */
  resolveGroup(ref) {
    return this.groupMap.get(ref) ?? null;
  }
  /**
   * Resolve multiple references at once
   * @param refs - Array of reference objects with $ref property
   * @returns Array of resolved items (null for unresolved references)
   */
  resolveMany(refs) {
    return refs.map((ref) => this.resolve(ref.$ref));
  }
};
|
|
609
|
+
|
|
610
|
+
// src/utils/id-generator.ts
|
|
611
|
+
var IdGenerator = class {
  chapterCounter = 0;
  imageCounter = 0;
  tableCounter = 0;
  footnoteCounter = 0;
  /**
   * Generate a chapter ID
   * @returns A chapter ID in the format "ch-001"
   */
  generateChapterId() {
    return `ch-${this.padNumber(++this.chapterCounter)}`;
  }
  /**
   * Generate an image ID
   * @returns An image ID in the format "img-001"
   */
  generateImageId() {
    return `img-${this.padNumber(++this.imageCounter)}`;
  }
  /**
   * Generate a table ID
   * @returns A table ID in the format "tbl-001"
   */
  generateTableId() {
    return `tbl-${this.padNumber(++this.tableCounter)}`;
  }
  /**
   * Generate a footnote ID
   * @returns A footnote ID in the format "ftn-001"
   */
  generateFootnoteId() {
    return `ftn-${this.padNumber(++this.footnoteCounter)}`;
  }
  /**
   * Reset all counters to zero
   */
  reset() {
    this.chapterCounter = 0;
    this.imageCounter = 0;
    this.tableCounter = 0;
    this.footnoteCounter = 0;
  }
  /**
   * Get current counter values (for testing/debugging)
   */
  getCounters() {
    return {
      chapter: this.chapterCounter,
      image: this.imageCounter,
      table: this.tableCounter,
      footnote: this.footnoteCounter
    };
  }
  /**
   * Pad a number to 3 digits with leading zeros
   */
  padNumber(num) {
    return String(num).padStart(3, "0");
  }
};
|
|
675
|
+
|
|
676
|
+
// src/utils/text-cleaner.ts
|
|
677
|
+
var TextCleaner = class {
  /**
   * Normalizes text:
   * - Unicode NFC normalization
   * - tabs / non-breaking / typographic spaces become regular spaces
   * - line breaks and whitespace runs collapse to a single space
   * - leading and trailing whitespace is trimmed
   */
  static normalize(text) {
    if (!text) return "";
    return text
      .normalize("NFC")
      .replace(/[\t\u00A0\u2000-\u200B]/g, " ")
      .replace(/[\r\n]+/g, " ")
      .replace(/\s+/g, " ")
      .trim();
  }
  /**
   * Clean text starting/ending with punctuation marks:
   * - strip commas/periods/etc. (plus following spaces) at sentence start
   * - strip trailing whitespace and any punctuation after it at sentence end
   */
  static cleanPunctuation(text) {
    if (!text) return "";
    return text.replace(/^[,.:;!?]+\s*/, "").replace(/\s+[,.:;!?]*$/, "");
  }
  /**
   * Reject empty text and text consisting only of digits and whitespace.
   */
  static isValidText(text) {
    if (!text) return false;
    return !/^\s*[\d\s]*$/.test(this.normalize(text));
  }
  /**
   * Batch normalization (for bulk processing)
   */
  static normalizeBatch(texts) {
    return texts.map((text) => this.normalize(text));
  }
  /**
   * Batch filtering (returns only valid text)
   */
  static filterValidTexts(texts) {
    return texts.filter((text) => this.isValidText(text));
  }
  /**
   * Batch normalization + filtering (stage 1 + stage 2 combined).
   *
   * Normalizes every entry, then drops invalid ones. Large inputs are split
   * into batches via BatchProcessor; a batchSize of 0 processes the items
   * sequentially without batching.
   *
   * @param texts - Original text array
   * @param batchSize - Batch size (default: 10). Set to 0 for sequential processing without batching.
   * @returns Normalized and filtered text array
   *
   * @example
   * ```typescript
   * const rawTexts = [' text 1 ', '123', 'text 2\n'];
   * TextCleaner.normalizeAndFilterBatch(rawTexts, 10); // ['text 1', 'text 2']
   * TextCleaner.normalizeAndFilterBatch(rawTexts, 0);  // ['text 1', 'text 2']
   * ```
   */
  static normalizeAndFilterBatch(texts, batchSize = 10) {
    if (batchSize === 0) {
      const kept = [];
      for (const raw of texts) {
        const normalized = this.normalize(raw);
        if (this.isValidText(normalized)) {
          kept.push(normalized);
        }
      }
      return kept;
    }
    return BatchProcessor.processBatchSync(texts, batchSize, (batch) =>
      this.filterValidTexts(this.normalizeBatch(batch))
    );
  }
};
|
|
764
|
+
|
|
765
|
+
// src/utils/markdown-converter.ts
|
|
766
|
+
// src/utils/markdown-converter.ts
var MarkdownConverter = class _MarkdownConverter {
  /**
   * Render TOC item references (groups/lists, tables, or bare text items)
   * as a single Markdown string.
   *
   * @param refs - Array of item references from TocAreaResult
   * @param refResolver - RefResolver used to turn each ref into a document item
   * @returns Markdown representation of the TOC ("" when refs is empty)
   */
  static convert(refs, refResolver) {
    if (refs.length === 0) {
      return "";
    }
    const parts = [];
    for (const ref of refs) {
      const item = refResolver.resolve(ref);
      if (!item) {
        continue;
      }
      // Dispatch on item shape: group/list -> nested list, table -> Markdown
      // table, plain text item -> single list line.
      if ("name" in item && (item.name === "list" || item.name === "group")) {
        const rendered = _MarkdownConverter.groupToMarkdown(item, refResolver, 0);
        if (rendered) {
          parts.push(rendered);
        }
        continue;
      }
      if ("data" in item && "grid" in item.data) {
        const rendered = _MarkdownConverter.tableToMarkdown(item);
        if (rendered) {
          parts.push(rendered);
        }
        continue;
      }
      if ("text" in item && "orig" in item) {
        const rendered = _MarkdownConverter.textToMarkdown(item, 0);
        if (rendered) {
          parts.push(rendered);
        }
      }
    }
    return parts.join("\n\n");
  }
  /**
   * Convert a group item to a Markdown list, recursing into nested
   * groups/lists and increasing the indent level per nesting step.
   *
   * @example
   * Output:
   * - Chapter 1 Introduction ..... 1
   *  - 1.1 Background ..... 3
   *  - 1.2 Objectives ..... 5
   * - Chapter 2 Methodology ..... 10
   */
  static groupToMarkdown(group, refResolver, indentLevel = 0) {
    const rendered = [];
    for (const childRef of group.children) {
      const child = refResolver.resolve(childRef.$ref);
      if (!child) {
        continue;
      }
      if ("name" in child && (child.name === "list" || child.name === "group")) {
        const nested = _MarkdownConverter.groupToMarkdown(
          child,
          refResolver,
          indentLevel + 1
        );
        if (nested) {
          rendered.push(nested);
        }
      } else if ("text" in child && "orig" in child) {
        // Only text items are emitted here; other child kinds are ignored.
        const line = _MarkdownConverter.textToMarkdown(child, indentLevel);
        if (line) {
          rendered.push(line);
        }
      }
    }
    return rendered.join("\n");
  }
  /**
   * Convert a table item to a Markdown table. A header separator row is
   * emitted right after the first (non-empty) grid row.
   *
   * @example
   * Output:
   * | Chapter | Page |
   * |---------|------|
   * | Chapter 1 Introduction | 1 |
   */
  static tableToMarkdown(table) {
    const { grid } = table.data;
    if (!grid || grid.length === 0) {
      return "";
    }
    const out = [];
    grid.forEach((row, rowIdx) => {
      if (!row || row.length === 0) {
        return;
      }
      const cells = row.map(
        (cell) => _MarkdownConverter.escapeTableCell(cell.text)
      );
      out.push(`| ${cells.join(" | ")} |`);
      if (rowIdx === 0) {
        // Separator width follows the header row's cell count.
        out.push(`| ${row.map(() => "---").join(" | ")} |`);
      }
    });
    return out.join("\n");
  }
  /**
   * Convert a single text item to one Markdown list line.
   * Returns "" when the trimmed content is empty.
   */
  static textToMarkdown(text, indentLevel = 0) {
    const content = text.text.trim();
    if (!content) {
      return "";
    }
    const prefix = _MarkdownConverter.getIndent(indentLevel) + _MarkdownConverter.getListMarker(text.enumerated, text.marker);
    return prefix + content;
  }
  /**
   * Pick the list marker: an explicit marker wins, enumerated items get
   * "1. ", everything else falls back to "- ".
   */
  static getListMarker(enumerated, marker) {
    if (marker) {
      return `${marker} `;
    }
    return enumerated === true ? "1. " : "- ";
  }
  /**
   * Build the indent prefix for a nesting level.
   */
  static getIndent(level) {
    return " ".repeat(level);
  }
  /**
   * Make cell text safe for a Markdown table: escape pipes, flatten
   * newlines to spaces, and trim.
   */
  static escapeTableCell(text) {
    return text.replace(/\|/g, "\\|").replace(/\n/g, " ").trim();
  }
};
|
|
927
|
+
|
|
928
|
+
// src/converters/chapter-converter.ts
|
|
929
|
+
// src/converters/chapter-converter.ts
var ChapterConverter = class _ChapterConverter {
  static FRONT_MATTER_ID = "ch-000";
  static FRONT_MATTER_TITLE = "Front Matter";
  /**
   * Valid labels for text blocks.
   * Only items carrying one of these labels become chapter text blocks.
   */
  static VALID_TEXT_LABELS = /* @__PURE__ */ new Set([
    "text",
    "section_header",
    "list_item"
  ]);
  logger;
  idGenerator;
  constructor(logger, idGenerator) {
    this.logger = logger;
    this.idGenerator = idGenerator;
  }
  /**
   * Convert TocEntry[] to Chapter[].
   *
   * @param tocEntries - Table of contents entries
   * @param textItems - DoclingDocument.texts (with prov for page numbers)
   * @param pageRangeMap - PDF page to actual page mapping
   * @param images - Converted images
   * @param tables - Converted tables
   * @param footnotes - Converted footnotes
   * @returns Converted chapters with text blocks and resource references
   */
  convert(tocEntries, textItems, pageRangeMap, images, tables, footnotes) {
    this.logger.info("[ChapterConverter] Starting chapter conversion...");
    const frontMatter = this.createFrontMatterChapter();
    const tocChapters = this.buildChapterTree(tocEntries);
    this.logger.info(
      `[ChapterConverter] Built ${tocChapters.length} TOC chapters + Front Matter`
    );
    const allChapters = [frontMatter, ...tocChapters];
    const flatChapters = this.flattenChapters(allChapters);
    const chapterRanges = this.calculatePageRanges(flatChapters, tocEntries);
    this.logger.info(
      `[ChapterConverter] Calculated ranges for ${chapterRanges.size} chapters`
    );
    const textBlocks = this.convertTextBlocks(textItems, pageRangeMap);
    this.assignTextBlocks(allChapters, textBlocks, chapterRanges, pageRangeMap);
    this.logger.info(
      `[ChapterConverter] Assigned ${textBlocks.length} text blocks`
    );
    this.linkResources(
      allChapters,
      images,
      tables,
      footnotes,
      chapterRanges,
      pageRangeMap
    );
    this.logger.info(
      `[ChapterConverter] Linked ${images.length} images, ${tables.length} tables, and ${footnotes.length} footnotes`
    );
    return allChapters;
  }
  /**
   * Create the synthetic Front Matter chapter that collects pre-TOC content.
   */
  createFrontMatterChapter() {
    return {
      id: _ChapterConverter.FRONT_MATTER_ID,
      originTitle: _ChapterConverter.FRONT_MATTER_TITLE,
      title: _ChapterConverter.FRONT_MATTER_TITLE,
      pageNo: 1,
      level: 1,
      textBlocks: [],
      imageIds: [],
      tableIds: [],
      footnoteIds: []
    };
  }
  /**
   * Build a chapter tree from TocEntry[], recursively processing children.
   * Titles are normalized via TextCleaner; the original title is preserved.
   */
  buildChapterTree(entries) {
    return entries.map((entry) => {
      const chapterId = this.idGenerator.generateChapterId();
      const chapter = {
        id: chapterId,
        originTitle: entry.title,
        title: TextCleaner.normalize(entry.title),
        pageNo: entry.pageNo,
        level: entry.level,
        textBlocks: [],
        imageIds: [],
        tableIds: [],
        footnoteIds: []
      };
      if (entry.children && entry.children.length > 0) {
        chapter.children = this.buildChapterTree(entry.children);
      }
      return chapter;
    });
  }
  /**
   * Flatten the chapter tree (depth-first) for page range calculation,
   * preserving each chapter's original TOC page number.
   */
  flattenChapters(chapters) {
    const result = [];
    const flatten = (chapterList) => {
      for (const chapter of chapterList) {
        result.push({
          chapter,
          tocPageNo: chapter.pageNo
        });
        if (chapter.children && chapter.children.length > 0) {
          flatten(chapter.children);
        }
      }
    };
    flatten(chapters);
    return result;
  }
  /**
   * Calculate the page range for each chapter, using the next chapter's
   * start page as the exclusive end boundary.
   *
   * Front Matter (ch-000) gets special handling:
   * - startPage: 1
   * - endPage: first TOC entry's page - 1 (or 0 if TOC starts at page 1)
   * The last chapter's range is open-ended (Number.MAX_SAFE_INTEGER).
   */
  calculatePageRanges(flatChapters, tocEntries) {
    const ranges = /* @__PURE__ */ new Map();
    if (flatChapters.length === 0) {
      return ranges;
    }
    const firstTocPage = tocEntries.length > 0 ? Math.min(...tocEntries.map((e) => e.pageNo)) : Number.MAX_SAFE_INTEGER;
    const tocChapters = flatChapters.filter(
      (fc) => fc.chapter.id !== _ChapterConverter.FRONT_MATTER_ID
    );
    const sorted = [...tocChapters].sort((a, b) => a.tocPageNo - b.tocPageNo);
    ranges.set(_ChapterConverter.FRONT_MATTER_ID, {
      startPage: 1,
      endPage: firstTocPage - 1
    });
    for (let i = 0; i < sorted.length; i++) {
      const current = sorted[i];
      const next = sorted[i + 1];
      ranges.set(current.chapter.id, {
        startPage: current.tocPageNo,
        endPage: next ? next.tocPageNo - 1 : Number.MAX_SAFE_INTEGER
      });
    }
    return ranges;
  }
  /**
   * Check if a text item has a picture parent.
   * Items with parent.$ref starting with "#/pictures/" are excluded from
   * chapter text blocks (they are captions/labels owned by the picture).
   */
  static hasPictureParent(item) {
    const parentRef = item.parent?.$ref;
    return typeof parentRef === "string" && parentRef.startsWith("#/pictures/");
  }
  /**
   * Convert text items to text blocks.
   * Filters by label (text, section_header, list_item), excludes picture
   * children, and extracts the PDF page number from prov (defaulting to 1).
   */
  convertTextBlocks(textItems, _pageRangeMap) {
    return textItems.filter(
      (item) => _ChapterConverter.VALID_TEXT_LABELS.has(item.label) && !_ChapterConverter.hasPictureParent(item) && TextCleaner.isValidText(item.text)
    ).map((item) => {
      const pdfPageNo = item.prov?.[0]?.page_no ?? 1;
      return {
        text: TextCleaner.normalize(item.text),
        pdfPageNo
      };
    });
  }
  /**
   * Convert a PDF page number to the actual document page number.
   * Falls back to pdfPageNo when the mapping has no entry.
   */
  pdfPageToActualPage(pdfPageNo, pageRangeMap) {
    const range = pageRangeMap[pdfPageNo];
    if (!range) {
      return pdfPageNo;
    }
    return range.startPageNo;
  }
  /**
   * Find the chapter ID owning a given actual page number.
   * Among all ranges containing the page, the one with the greatest start
   * page wins ("start page first" strategy), so nested chapters beat parents.
   */
  findChapterForPage(actualPageNo, chapterRanges) {
    let bestMatch = null;
    let bestStartPage = -1;
    for (const [chapterId, range] of chapterRanges) {
      if (actualPageNo >= range.startPage && actualPageNo <= range.endPage) {
        if (range.startPage > bestStartPage) {
          bestStartPage = range.startPage;
          bestMatch = chapterId;
        }
      }
    }
    return bestMatch;
  }
  /**
   * Shared assignment walk used by assignTextBlocks and linkResources:
   * for every item, map its PDF page to the actual page, find the owning
   * chapter, and invoke `apply(chapter, item)` when one exists.
   * Items with no owning chapter are silently skipped (unchanged behavior).
   */
  assignByPage(items, chapterRanges, pageRangeMap, chapterMap, apply) {
    for (const item of items) {
      const actualPageNo = this.pdfPageToActualPage(item.pdfPageNo, pageRangeMap);
      const chapterId = this.findChapterForPage(actualPageNo, chapterRanges);
      if (chapterId && chapterMap.has(chapterId)) {
        apply(chapterMap.get(chapterId), item);
      }
    }
  }
  /**
   * Assign text blocks to chapters based on page ranges.
   */
  assignTextBlocks(chapters, textBlocks, chapterRanges, pageRangeMap) {
    const chapterMap = this.buildChapterMap(chapters);
    this.assignByPage(textBlocks, chapterRanges, pageRangeMap, chapterMap, (chapter, block) => {
      chapter.textBlocks.push(block);
    });
  }
  /**
   * Link images, tables, and footnotes to chapters based on page ranges.
   * (Previously three copy-pasted loops; now delegates to assignByPage.)
   */
  linkResources(chapters, images, tables, footnotes, chapterRanges, pageRangeMap) {
    const chapterMap = this.buildChapterMap(chapters);
    this.assignByPage(images, chapterRanges, pageRangeMap, chapterMap, (chapter, image) => {
      chapter.imageIds.push(image.id);
    });
    this.assignByPage(tables, chapterRanges, pageRangeMap, chapterMap, (chapter, table) => {
      chapter.tableIds.push(table.id);
    });
    this.assignByPage(footnotes, chapterRanges, pageRangeMap, chapterMap, (chapter, footnote) => {
      chapter.footnoteIds.push(footnote.id);
    });
  }
  /**
   * Build a flat id -> chapter map (including nested chapters) for O(1) lookup.
   */
  buildChapterMap(chapters) {
    const map = /* @__PURE__ */ new Map();
    const addToMap = (chapterList) => {
      for (const chapter of chapterList) {
        map.set(chapter.id, chapter);
        if (chapter.children && chapter.children.length > 0) {
          addToMap(chapter.children);
        }
      }
    };
    addToMap(chapters);
    return map;
  }
};
|
|
1202
|
+
|
|
1203
|
+
// src/extractors/toc-extract-error.ts
|
|
1204
|
+
// src/extractors/toc-extract-error.ts
var TocExtractError = class _TocExtractError extends Error {
  /**
   * Base error type for the TOC extraction pipeline.
   * Forwards ErrorOptions (e.g. `cause`) to Error and tags the name.
   */
  constructor(message, options) {
    super(message, options);
    this.name = "TocExtractError";
  }
  /**
   * Normalize an unknown thrown value into a readable message string.
   */
  static getErrorMessage(error) {
    if (error instanceof Error) {
      return error.message;
    }
    return String(error);
  }
  /**
   * Wrap an unknown error in a TocExtractError, prefixing the message with
   * context and preserving the original error via the `cause` option.
   */
  static fromError(context, error) {
    const detail = _TocExtractError.getErrorMessage(error);
    return new _TocExtractError(`${context}: ${detail}`, { cause: error });
  }
};
|
|
1225
|
+
var TocNotFoundError = class extends TocExtractError {
  /**
   * Raised when no table of contents can be located in the document.
   *
   * @param message - Optional override for the default not-found message.
   */
  constructor(message = "Table of contents not found in the document") {
    super(message);
    this.name = "TocNotFoundError";
  }
};
|
|
1231
|
+
var TocParseError = class extends TocExtractError {
  /**
   * Raised when a located TOC cannot be parsed into entries.
   * Forwards ErrorOptions (e.g. `cause`) to the base class.
   */
  constructor(message, options) {
    super(message, options);
    this.name = "TocParseError";
  }
};
|
|
1237
|
+
var TocValidationError = class extends TocExtractError {
  /**
   * Validation result with the detailed list of issues that failed.
   */
  validationResult;
  constructor(message, validationResult) {
    super(message);
    this.name = "TocValidationError";
    this.validationResult = validationResult;
  }
  /**
   * Build a human-readable, multi-line summary of the validation failure:
   * a header with the error count followed by one entry per issue.
   */
  getSummary() {
    const { errorCount, issues } = this.validationResult;
    const lines = [
      `TOC validation failed: ${errorCount} error(s)`,
      "",
      "Issues:"
    ];
    for (const issue of issues) {
      lines.push(` [${issue.code}] ${issue.message}`);
      lines.push(` Path: ${issue.path}`);
      lines.push(` Entry: "${issue.entry.title}" (page ${issue.entry.pageNo})`);
    }
    return lines.join("\n");
  }
};
|
|
1267
|
+
|
|
1268
|
+
// src/extractors/toc-validator.ts
|
|
1269
|
+
// src/extractors/toc-validator.ts
// Fallback TocValidator options: no upper bound on page numbers
// (Infinity) and a 200-character title length limit.
var DEFAULT_OPTIONS = {
  totalPages: Infinity,
  maxTitleLength: 200
};
|
|
1273
|
+
var TocValidator = class {
  // Effective options: DEFAULT_OPTIONS merged with caller overrides.
  options;
  // Issues accumulated during the current validate() run.
  issues;
  constructor(options) {
    this.options = {
      ...DEFAULT_OPTIONS,
      ...options
    };
    this.issues = [];
  }
  /**
   * Validate a TocEntry array.
   *
   * @param entries - TOC entries to validate
   * @returns Validation result ({ valid, issues, errorCount })
   */
  validate(entries) {
    this.issues = [];
    this.validateEntries(entries, "", null, /* @__PURE__ */ new Set());
    const errorCount = this.issues.length;
    return {
      valid: errorCount === 0,
      issues: [...this.issues],
      errorCount
    };
  }
  /**
   * Validate and throw if invalid.
   *
   * @param entries - TOC entries to validate
   * @throws {TocValidationError} When validation fails
   */
  validateOrThrow(entries) {
    const result = this.validate(entries);
    if (!result.valid) {
      throw new TocValidationError(
        `TOC validation failed with ${result.errorCount} error(s)`,
        result
      );
    }
  }
  /**
   * Recursively validate entries, tracking the previous sibling's page
   * (seeded from the parent) and a global title:page duplicate set.
   */
  validateEntries(entries, parentPath, parentEntry, seenKeys) {
    let prevPageNo = parentEntry?.pageNo ?? 0;
    for (let i = 0; i < entries.length; i++) {
      const entry = entries[i];
      const entryPath = parentPath ? `${parentPath}.children[${i}]` : `[${i}]`;
      this.validateTitle(entry, entryPath);
      this.validateTitleLength(entry, entryPath);
      this.validatePageRange(entry, entryPath);
      this.validatePageOrder(entry, entryPath, prevPageNo);
      prevPageNo = entry.pageNo;
      if (parentEntry) {
        this.validateParentChildPage(entry, entryPath, parentEntry);
      }
      const key = `${entry.title}:${entry.pageNo}`;
      this.validateDuplicate(entry, entryPath, key, seenKeys);
      seenKeys.add(key);
      if (entry.children && entry.children.length > 0) {
        this.validateEntries(entry.children, entryPath, entry, seenKeys);
      }
    }
  }
  /**
   * V003: Title must be non-empty and not whitespace-only.
   */
  validateTitle(entry, entryPath) {
    if (!entry.title || entry.title.trim() === "") {
      this.addIssue({
        code: "V003",
        message: "Title is empty or contains only whitespace",
        path: entryPath,
        entry
      });
    }
  }
  /**
   * V004: Title must not exceed the configured length limit.
   * Guarded with ?. so a missing title (already reported by V003) cannot
   * crash the validator with a TypeError.
   */
  validateTitleLength(entry, entryPath) {
    const titleLength = entry.title?.length ?? 0;
    if (titleLength > this.options.maxTitleLength) {
      this.addIssue({
        code: "V004",
        message: `Title exceeds ${this.options.maxTitleLength} characters (${titleLength})`,
        path: entryPath,
        entry
      });
    }
  }
  /**
   * V002: Page number must lie in [1, totalPages].
   */
  validatePageRange(entry, entryPath) {
    if (entry.pageNo < 1) {
      this.addIssue({
        code: "V002",
        message: `Page number must be >= 1, got ${entry.pageNo}`,
        path: entryPath,
        entry
      });
    }
    if (entry.pageNo > this.options.totalPages) {
      this.addIssue({
        code: "V002",
        message: `Page number ${entry.pageNo} exceeds document total pages (${this.options.totalPages})`,
        path: entryPath,
        entry
      });
    }
  }
  /**
   * V001: Page numbers must be non-decreasing within the same level.
   */
  validatePageOrder(entry, entryPath, prevPageNo) {
    if (entry.pageNo < prevPageNo) {
      this.addIssue({
        code: "V001",
        message: `Page number decreased from ${prevPageNo} to ${entry.pageNo}`,
        path: entryPath,
        entry
      });
    }
  }
  /**
   * V005: A child entry must not start before its parent.
   */
  validateParentChildPage(entry, entryPath, parent) {
    if (entry.pageNo < parent.pageNo) {
      this.addIssue({
        code: "V005",
        message: `Child page (${entry.pageNo}) is before parent page (${parent.pageNo})`,
        path: entryPath,
        entry
      });
    }
  }
  /**
   * V006: No duplicate title:page pairs anywhere in the tree.
   */
  validateDuplicate(entry, entryPath, key, seenKeys) {
    if (seenKeys.has(key)) {
      this.addIssue({
        code: "V006",
        message: `Duplicate entry: "${entry.title}" at page ${entry.pageNo}`,
        path: entryPath,
        entry
      });
    }
  }
  /**
   * Append an issue to the current run's list.
   */
  addIssue(issue) {
    this.issues.push(issue);
  }
};
|
|
1431
|
+
|
|
1432
|
+
// src/extractors/toc-finder.ts
|
|
1433
|
+
// src/extractors/toc-finder.ts
// Headings that identify a table-of-contents page, across Korean, Chinese
// (simplified and traditional), Japanese, and English documents.
var TOC_KEYWORDS = [
  "\uBAA9\uCC28",
  // Korean 목차
  "\uCC28\uB840",
  // Korean 차례
  "\uBAA9 \uCC28",
  // Korean 목 차 (spaced variant)
  "\u76EE\u5F55",
  // Simplified Chinese 目录
  "\u76EE \u5F55",
  // Simplified Chinese 目 录 (spaced variant)
  "\u5185\u5BB9",
  // Simplified Chinese 内容 ("contents")
  "\u5167\u5BB9",
  // Traditional Chinese 內容 ("contents")
  "\u76EE\u6B21",
  // Japanese 目次
  "\u76EE \u6B21",
  // Japanese 目 次 (spaced variant)
  "Contents",
  "Table of Contents",
  "TABLE OF CONTENTS",
  "CONTENTS"
];
// Markers that flag a TOC continuation on a following page, in the same
// language set as TOC_KEYWORDS ("계속" / "续" / "続" = "continued").
var CONTINUATION_MARKERS = [
  "\uBAA9\uCC28(\uACC4\uC18D)",
  // Korean 목차(계속)
  "\uBAA9\uCC28 (\uACC4\uC18D)",
  // Korean 목차 (계속)
  "(\uACC4\uC18D)",
  // Korean (계속)
  "\u76EE\u5F55(\u7EED)",
  // Simplified Chinese 目录(续)
  "\u76EE\u5F55 (\u7EED)",
  // Simplified Chinese 目录 (续)
  "(\u7EED)",
  // Simplified Chinese (续)
  "\u7EED\u8868",
  // Simplified Chinese 续表 ("continued table")
  "\u76EE\u6B21(\u7D9A)",
  // Japanese 目次(続)
  "\u76EE\u6B21 (\u7D9A)",
  // Japanese 目次 (続)
  "(\u7D9A)",
  // Japanese (続)
  "(continued)",
  "(Continued)",
  "(CONTINUED)",
  "continued"
];
// Matches a trailing TOC page number in a line: dot leaders ("....12"),
// ellipsis leaders ("…12"), or plain whitespace before the number ("  12").
var PAGE_NUMBER_PATTERN = /\.{2,}\s*\d+\s*$|…+\s*\d+\s*$|\s+\d+\s*$/;
|
|
1465
|
+
var TocFinder = class {
|
|
1466
|
+
constructor(logger, refResolver, options) {
|
|
1467
|
+
this.logger = logger;
|
|
1468
|
+
this.refResolver = refResolver;
|
|
1469
|
+
this.maxSearchPages = options?.maxSearchPages ?? 10;
|
|
1470
|
+
this.keywords = [...TOC_KEYWORDS, ...options?.additionalKeywords ?? []];
|
|
1471
|
+
}
|
|
1472
|
+
maxSearchPages;
|
|
1473
|
+
keywords;
|
|
1474
|
+
/**
|
|
1475
|
+
* Find TOC area in the document
|
|
1476
|
+
*
|
|
1477
|
+
* @throws {TocNotFoundError} When no TOC area is found
|
|
1478
|
+
*/
|
|
1479
|
+
find(doc) {
|
|
1480
|
+
this.logger.info("[TocFinder] Starting TOC search...");
|
|
1481
|
+
const keywordResult = this.findByKeywords(doc);
|
|
1482
|
+
if (keywordResult) {
|
|
1483
|
+
this.logger.info(
|
|
1484
|
+
`[TocFinder] Found TOC by keyword search: pages ${keywordResult.startPage}-${keywordResult.endPage}`
|
|
1485
|
+
);
|
|
1486
|
+
return keywordResult;
|
|
1487
|
+
}
|
|
1488
|
+
const structureResult = this.findByStructure(doc);
|
|
1489
|
+
if (structureResult) {
|
|
1490
|
+
this.logger.info(
|
|
1491
|
+
`[TocFinder] Found TOC by structure analysis: pages ${structureResult.startPage}-${structureResult.endPage}`
|
|
1492
|
+
);
|
|
1493
|
+
return structureResult;
|
|
1494
|
+
}
|
|
1495
|
+
this.logger.warn("[TocFinder] No TOC found in document");
|
|
1496
|
+
throw new TocNotFoundError();
|
|
1497
|
+
}
|
|
1498
|
+
/**
|
|
1499
|
+
* Stage 1: Search by keywords in text items
|
|
1500
|
+
*/
|
|
1501
|
+
findByKeywords(doc) {
|
|
1502
|
+
for (const text of doc.texts) {
|
|
1503
|
+
if (!this.containsTocKeyword(text.text)) {
|
|
1504
|
+
continue;
|
|
1505
|
+
}
|
|
1506
|
+
const pageNo = text.prov[0]?.page_no;
|
|
1507
|
+
if (pageNo === void 0 || pageNo > this.maxSearchPages) {
|
|
1508
|
+
continue;
|
|
1509
|
+
}
|
|
1510
|
+
this.logger.info(
|
|
1511
|
+
`[TocFinder] Found TOC keyword "${text.text}" on page ${pageNo}`
|
|
1512
|
+
);
|
|
1513
|
+
const parentRef = text.parent?.$ref;
|
|
1514
|
+
if (!parentRef) {
|
|
1515
|
+
return {
|
|
1516
|
+
itemRefs: [text.self_ref],
|
|
1517
|
+
startPage: pageNo,
|
|
1518
|
+
endPage: pageNo
|
|
1519
|
+
};
|
|
1520
|
+
}
|
|
1521
|
+
const result = this.findTocContainer(doc, parentRef, pageNo);
|
|
1522
|
+
if (result) {
|
|
1523
|
+
return this.expandToConsecutivePages(result, doc);
|
|
1524
|
+
}
|
|
1525
|
+
}
|
|
1526
|
+
return null;
|
|
1527
|
+
}
|
|
1528
|
+
/**
|
|
1529
|
+
* Stage 2: Search by structure (lists/tables with page numbers)
|
|
1530
|
+
*/
|
|
1531
|
+
findByStructure(doc) {
|
|
1532
|
+
const candidates = [];
|
|
1533
|
+
for (const group of doc.groups) {
|
|
1534
|
+
const pageNo = this.getGroupFirstPage(group);
|
|
1535
|
+
if (pageNo === void 0 || pageNo > this.maxSearchPages) {
|
|
1536
|
+
continue;
|
|
1537
|
+
}
|
|
1538
|
+
if (this.isGroupTocLike(group, doc)) {
|
|
1539
|
+
const score = this.calculateScore(group, pageNo);
|
|
1540
|
+
candidates.push({
|
|
1541
|
+
result: {
|
|
1542
|
+
itemRefs: [group.self_ref],
|
|
1543
|
+
startPage: pageNo,
|
|
1544
|
+
endPage: pageNo
|
|
1545
|
+
},
|
|
1546
|
+
score
|
|
1547
|
+
});
|
|
1548
|
+
}
|
|
1549
|
+
}
|
|
1550
|
+
for (const table of doc.tables) {
|
|
1551
|
+
const pageNo = table.prov[0]?.page_no;
|
|
1552
|
+
if (pageNo === void 0 || pageNo > this.maxSearchPages) {
|
|
1553
|
+
continue;
|
|
1554
|
+
}
|
|
1555
|
+
if (this.isTableTocLike(table)) {
|
|
1556
|
+
const score = this.calculateTableScore(table, pageNo);
|
|
1557
|
+
candidates.push({
|
|
1558
|
+
result: {
|
|
1559
|
+
itemRefs: [table.self_ref],
|
|
1560
|
+
startPage: pageNo,
|
|
1561
|
+
endPage: pageNo
|
|
1562
|
+
},
|
|
1563
|
+
score
|
|
1564
|
+
});
|
|
1565
|
+
}
|
|
1566
|
+
}
|
|
1567
|
+
if (candidates.length === 0) {
|
|
1568
|
+
return null;
|
|
1569
|
+
}
|
|
1570
|
+
candidates.sort((a, b) => b.score - a.score);
|
|
1571
|
+
const best = candidates[0];
|
|
1572
|
+
return this.expandToConsecutivePages(best.result, doc);
|
|
1573
|
+
}
|
|
1574
|
+
/**
|
|
1575
|
+
* Find the TOC container (group or table) from a parent reference
|
|
1576
|
+
*/
|
|
1577
|
+
findTocContainer(doc, parentRef, pageNo) {
|
|
1578
|
+
const group = this.refResolver.resolveGroup(parentRef);
|
|
1579
|
+
if (group) {
|
|
1580
|
+
return {
|
|
1581
|
+
itemRefs: [group.self_ref],
|
|
1582
|
+
startPage: pageNo,
|
|
1583
|
+
endPage: pageNo
|
|
1584
|
+
};
|
|
1585
|
+
}
|
|
1586
|
+
const table = this.refResolver.resolveTable(parentRef);
|
|
1587
|
+
if (table) {
|
|
1588
|
+
return {
|
|
1589
|
+
itemRefs: [table.self_ref],
|
|
1590
|
+
startPage: pageNo,
|
|
1591
|
+
endPage: pageNo
|
|
1592
|
+
};
|
|
1593
|
+
}
|
|
1594
|
+
const item = this.refResolver.resolve(parentRef);
|
|
1595
|
+
if (item && item.parent?.$ref) {
|
|
1596
|
+
return this.findTocContainer(doc, item.parent.$ref, pageNo);
|
|
1597
|
+
}
|
|
1598
|
+
return null;
|
|
1599
|
+
}
|
|
1600
|
+
/**
|
|
1601
|
+
* Check if a group contains TOC-like structure
|
|
1602
|
+
*/
|
|
1603
|
+
isGroupTocLike(group, _doc) {
|
|
1604
|
+
if (group.name !== "list" && group.name !== "group") {
|
|
1605
|
+
return false;
|
|
1606
|
+
}
|
|
1607
|
+
let pageNumberCount = 0;
|
|
1608
|
+
const children = this.refResolver.resolveMany(group.children);
|
|
1609
|
+
for (const child of children) {
|
|
1610
|
+
if (!child) continue;
|
|
1611
|
+
if ("text" in child && "orig" in child) {
|
|
1612
|
+
const textItem = child;
|
|
1613
|
+
if (PAGE_NUMBER_PATTERN.test(textItem.text)) {
|
|
1614
|
+
pageNumberCount++;
|
|
1615
|
+
}
|
|
1616
|
+
}
|
|
1617
|
+
}
|
|
1618
|
+
const total = children.filter((c) => c !== null).length;
|
|
1619
|
+
return pageNumberCount >= 3 || total > 0 && pageNumberCount / total > 0.5;
|
|
1620
|
+
}
|
|
1621
|
+
/**
|
|
1622
|
+
* Check if a table contains TOC-like structure
|
|
1623
|
+
*/
|
|
1624
|
+
isTableTocLike(table) {
|
|
1625
|
+
if (table.label === "document_index") {
|
|
1626
|
+
return true;
|
|
1627
|
+
}
|
|
1628
|
+
const { grid, num_rows, num_cols } = table.data;
|
|
1629
|
+
if (num_rows < 3 || num_cols < 2) {
|
|
1630
|
+
return false;
|
|
1631
|
+
}
|
|
1632
|
+
let numberCount = 0;
|
|
1633
|
+
for (let row = 1; row < grid.length; row++) {
|
|
1634
|
+
const lastCell = grid[row]?.[num_cols - 1];
|
|
1635
|
+
if (lastCell && /^\d+$/.test(lastCell.text.trim())) {
|
|
1636
|
+
numberCount++;
|
|
1637
|
+
}
|
|
1638
|
+
}
|
|
1639
|
+
return numberCount > 0 && numberCount / (num_rows - 1) > 0.5;
|
|
1640
|
+
}
|
|
1641
|
+
/**
|
|
1642
|
+
* Expand TOC area to consecutive pages
|
|
1643
|
+
*/
|
|
1644
|
+
expandToConsecutivePages(initial, doc) {
|
|
1645
|
+
const itemRefs = [...initial.itemRefs];
|
|
1646
|
+
let endPage = initial.endPage;
|
|
1647
|
+
for (let pageNo = initial.endPage + 1; pageNo <= this.maxSearchPages; pageNo++) {
|
|
1648
|
+
const continuationItems = this.findContinuationOnPage(doc, pageNo);
|
|
1649
|
+
if (continuationItems.length === 0) {
|
|
1650
|
+
break;
|
|
1651
|
+
}
|
|
1652
|
+
itemRefs.push(...continuationItems);
|
|
1653
|
+
endPage = pageNo;
|
|
1654
|
+
}
|
|
1655
|
+
return {
|
|
1656
|
+
itemRefs,
|
|
1657
|
+
startPage: initial.startPage,
|
|
1658
|
+
endPage
|
|
1659
|
+
};
|
|
1660
|
+
}
|
|
1661
|
+
  /**
   * Find TOC continuation items on a specific page.
   *
   * Collects self_refs from three sources, in order:
   *  1. texts on the page carrying a continuation marker — their parent
   *     group is resolved and added;
   *  2. groups whose first child sits on the page and that look TOC-like;
   *  3. tables on the page that look TOC-like.
   * Duplicate refs are avoided via refs.includes before pushing.
   *
   * @param doc - Document with texts/groups/tables collections
   * @param pageNo - 1-based page number to scan
   * @returns Array of unique self_ref strings found on that page
   */
  findContinuationOnPage(doc, pageNo) {
    const refs = [];
    for (const text of doc.texts) {
      // Only the first provenance entry determines the text's page.
      if (text.prov[0]?.page_no !== pageNo) {
        continue;
      }
      if (this.hasContinuationMarker(text.text)) {
        const parentRef = text.parent?.$ref;
        if (parentRef) {
          const group = this.refResolver.resolveGroup(parentRef);
          if (group) {
            refs.push(group.self_ref);
          }
        }
      }
    }
    for (const group of doc.groups) {
      const groupPage = this.getGroupFirstPage(group);
      if (groupPage !== pageNo) {
        continue;
      }
      if (this.isGroupTocLike(group, doc) && !refs.includes(group.self_ref)) {
        refs.push(group.self_ref);
      }
    }
    for (const table of doc.tables) {
      if (table.prov[0]?.page_no !== pageNo) {
        continue;
      }
      if (this.isTableTocLike(table) && !refs.includes(table.self_ref)) {
        refs.push(table.self_ref);
      }
    }
    return refs;
  }
|
|
1699
|
+
/**
|
|
1700
|
+
* Check if text contains TOC keyword
|
|
1701
|
+
*/
|
|
1702
|
+
containsTocKeyword(text) {
|
|
1703
|
+
const normalizedText = text.trim().toLowerCase();
|
|
1704
|
+
return this.keywords.some(
|
|
1705
|
+
(keyword) => normalizedText.includes(keyword.toLowerCase())
|
|
1706
|
+
);
|
|
1707
|
+
}
|
|
1708
|
+
/**
|
|
1709
|
+
* Check for continuation markers
|
|
1710
|
+
*/
|
|
1711
|
+
hasContinuationMarker(text) {
|
|
1712
|
+
const normalizedText = text.trim().toLowerCase();
|
|
1713
|
+
return CONTINUATION_MARKERS.some(
|
|
1714
|
+
(marker) => normalizedText.includes(marker.toLowerCase())
|
|
1715
|
+
);
|
|
1716
|
+
}
|
|
1717
|
+
/**
|
|
1718
|
+
* Get first page number of a group by checking its children
|
|
1719
|
+
*/
|
|
1720
|
+
getGroupFirstPage(group) {
|
|
1721
|
+
for (const childRef of group.children) {
|
|
1722
|
+
const child = this.refResolver.resolve(childRef.$ref);
|
|
1723
|
+
if (child && "prov" in child) {
|
|
1724
|
+
const prov = child.prov;
|
|
1725
|
+
if (prov && prov[0]?.page_no !== void 0) {
|
|
1726
|
+
return prov[0].page_no;
|
|
1727
|
+
}
|
|
1728
|
+
}
|
|
1729
|
+
}
|
|
1730
|
+
return void 0;
|
|
1731
|
+
}
|
|
1732
|
+
/**
|
|
1733
|
+
* Calculate score for a group candidate
|
|
1734
|
+
* Higher score = better match
|
|
1735
|
+
*/
|
|
1736
|
+
calculateScore(group, pageNo) {
|
|
1737
|
+
let score = 0;
|
|
1738
|
+
score += (this.maxSearchPages - pageNo + 1) * 10;
|
|
1739
|
+
score += group.children.length * 2;
|
|
1740
|
+
const children = this.refResolver.resolveMany(group.children);
|
|
1741
|
+
for (const child of children) {
|
|
1742
|
+
if (child && "text" in child) {
|
|
1743
|
+
const textItem = child;
|
|
1744
|
+
if (PAGE_NUMBER_PATTERN.test(textItem.text)) {
|
|
1745
|
+
score += 5;
|
|
1746
|
+
}
|
|
1747
|
+
}
|
|
1748
|
+
}
|
|
1749
|
+
return score;
|
|
1750
|
+
}
|
|
1751
|
+
/**
|
|
1752
|
+
* Calculate score for a table candidate
|
|
1753
|
+
*/
|
|
1754
|
+
calculateTableScore(table, pageNo) {
|
|
1755
|
+
let score = 0;
|
|
1756
|
+
score += (this.maxSearchPages - pageNo + 1) * 10;
|
|
1757
|
+
score += table.data.num_rows * 2;
|
|
1758
|
+
if (table.label === "document_index") {
|
|
1759
|
+
score += 50;
|
|
1760
|
+
}
|
|
1761
|
+
return score;
|
|
1762
|
+
}
|
|
1763
|
+
};
|
|
1764
|
+
|
|
1765
|
+
// src/extractors/toc-extractor.ts
|
|
1766
|
+
var import_zod = require("zod");
|
|
1767
|
+
|
|
1768
|
+
// src/core/base-llm-component.ts
|
|
1769
|
+
var BaseLLMComponent = class {
  logger;
  model;
  fallbackModel;
  maxRetries;
  temperature;
  componentName;
  aggregator;
  abortSignal;
  /**
   * Shared base for all LLM-backed components: holds the models, retry and
   * temperature settings, and optional usage tracking.
   *
   * @param logger - Logger instance used for all component output
   * @param model - Primary language model for LLM calls
   * @param componentName - Component name used as the log prefix (e.g. "TocExtractor")
   * @param options - Optional settings: maxRetries (default 3), temperature (default 0), abortSignal
   * @param fallbackModel - Optional model retried on primary failure
   * @param aggregator - Optional token-usage aggregator for tracking LLM calls
   */
  constructor(logger, model, componentName, options, fallbackModel, aggregator) {
    this.logger = logger;
    this.model = model;
    this.fallbackModel = fallbackModel;
    this.componentName = componentName;
    this.aggregator = aggregator;
    this.maxRetries = options?.maxRetries ?? 3;
    this.temperature = options?.temperature ?? 0;
    this.abortSignal = options?.abortSignal;
  }
  /**
   * Emit a log line prefixed with "[<componentName>]".
   *
   * @param level - Logger method to invoke ('info', 'warn', 'error')
   * @param message - Message text (prefix is added here)
   * @param args - Extra arguments forwarded to the logger
   */
  log(level, message, ...args) {
    this.logger[level](`[${this.componentName}] ${message}`, ...args);
  }
  /**
   * Forward a token-usage record to the aggregator, when one was provided.
   *
   * @param usage - Token usage information to track
   */
  trackUsage(usage) {
    if (this.aggregator) {
      this.aggregator.track(usage);
    }
  }
  /**
   * Build a zeroed usage record for edge cases (e.g. empty input) so
   * callers always receive a usage object of the same shape.
   *
   * @param phase - Phase name stamped on the record
   * @returns Usage object with all token counts at 0
   */
  createEmptyUsage(phase) {
    return {
      component: this.componentName,
      phase,
      model: "primary",
      modelName: "none",
      inputTokens: 0,
      outputTokens: 0,
      totalTokens: 0
    };
  }
};
|
|
1837
|
+
|
|
1838
|
+
// src/core/text-llm-component.ts
|
|
1839
|
+
var TextLLMComponent = class extends BaseLLMComponent {
  /** Forwards every argument unchanged to BaseLLMComponent. */
  constructor(logger, model, componentName, options, fallbackModel, aggregator) {
    super(logger, model, componentName, options, fallbackModel, aggregator);
  }
  /**
   * Run a system/user text prompt pair through LLMCaller.call(), record the
   * token usage, and hand back the schema-validated output plus that usage.
   *
   * @template TSchema - Zod schema type for response validation
   * @param schema - Zod schema the LLM response must satisfy
   * @param systemPrompt - System prompt for the LLM
   * @param userPrompt - User prompt for the LLM
   * @param phase - Phase label for usage tracking (e.g. 'extraction')
   * @returns Promise resolving to { output, usage }
   */
  async callTextLLM(schema, systemPrompt, userPrompt, phase) {
    const response = await LLMCaller.call({
      schema,
      systemPrompt,
      userPrompt,
      primaryModel: this.model,
      fallbackModel: this.fallbackModel,
      maxRetries: this.maxRetries,
      temperature: this.temperature,
      abortSignal: this.abortSignal,
      component: this.componentName,
      phase
    });
    this.trackUsage(response.usage);
    const { output, usage } = response;
    return { output, usage };
  }
};
|
|
1873
|
+
|
|
1874
|
+
// src/extractors/toc-extractor.ts
|
|
1875
|
+
// Recursive schema for a single TOC entry. z.lazy defers evaluation so the
// schema can reference itself for the nested `children` array.
var TocEntrySchema = import_zod.z.lazy(
  () => import_zod.z.object({
    title: import_zod.z.string().describe("Chapter or section title"),
    level: import_zod.z.number().int().min(1).describe("Hierarchy depth (1 = top level)"),
    pageNo: import_zod.z.number().int().min(1).describe("Starting page number"),
    children: import_zod.z.array(TocEntrySchema).optional().describe("Child sections")
  })
);
// Envelope the text LLM must return: a flat array of top-level entries.
var TocResponseSchema = import_zod.z.object({
  entries: import_zod.z.array(TocEntrySchema).describe("Extracted TOC entries")
});
|
|
1886
|
+
// LLM-backed extractor that turns a TOC rendered as markdown into a
// hierarchical entry list (title / level / pageNo / children), with optional
// post-extraction validation via TocValidator.
var TocExtractor = class extends TextLLMComponent {
  // Options forwarded to TocValidator when validation runs.
  validationOptions;
  // When true, extracted entries are returned without validation.
  skipValidation;
  constructor(logger, model, options, fallbackModel, abortSignal) {
    super(
      logger,
      model,
      "TocExtractor",
      { ...options, abortSignal },
      fallbackModel
    );
    this.validationOptions = options?.validation;
    this.skipValidation = options?.skipValidation ?? false;
  }
  /**
   * Extract TOC structure from Markdown.
   *
   * @param markdown - Markdown representation of TOC area
   * @returns Object with entries array and token usage information
   * @throws {TocParseError} When LLM fails to parse structure
   * @throws {TocValidationError} When validation fails
   */
  async extract(markdown) {
    this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
    // Blank input short-circuits: no LLM call, zeroed usage record.
    if (!markdown.trim()) {
      this.log("info", "Empty markdown, returning empty array");
      return {
        entries: [],
        usage: this.createEmptyUsage("extraction")
      };
    }
    try {
      const result = await this.callTextLLM(
        TocResponseSchema,
        this.buildSystemPrompt(),
        this.buildUserPrompt(markdown),
        "extraction"
      );
      // Force level numbering to agree with actual nesting depth.
      const entries = this.normalizeEntries(result.output.entries);
      if (!this.skipValidation) {
        this.validateEntries(entries);
      }
      this.log(
        "info",
        `Extraction completed: ${entries.length} top-level entries`
      );
      return { entries, usage: result.usage };
    } catch (error) {
      // Validation failures propagate unchanged; everything else is wrapped
      // in TocParseError with the original error preserved as `cause`.
      if (error instanceof TocValidationError) {
        this.log("error", `Validation failed: ${error.message}`);
        throw error;
      }
      const message = error instanceof Error ? error.message : String(error);
      this.log("error", `Extraction failed: ${message}`);
      throw new TocParseError(`Failed to extract TOC structure: ${message}`, {
        cause: error
      });
    }
  }
  /**
   * Validate extracted entries (no-op for an empty list).
   *
   * @throws {TocValidationError} When validation fails
   */
  validateEntries(entries) {
    if (entries.length === 0) {
      return;
    }
    const validator = new TocValidator(this.validationOptions);
    validator.validateOrThrow(entries);
  }
  /**
   * Build the system prompt for TOC extraction. Returned verbatim to the
   * LLM; do not reformat — the exact wording is part of the contract.
   */
  buildSystemPrompt() {
    return `You are a document structure extraction assistant. Your task is to parse a table of contents (TOC) from markdown format and extract structured entries.

## Instructions

1. **Title**: Extract the exact chapter/section title from each line. Remove page number indicators like "..... 10" or "... 5" at the end.

2. **Level**: Determine the hierarchy depth:
   - Level 1: Top-level chapters (e.g., "\uC81C1\uC7A5", "Chapter 1", "I.", "Part 1")
   - Level 2: Main sections within chapters (e.g., "1.", "1.1", "A.")
   - Level 3: Subsections (e.g., "1.1.1", "a.", "(1)")
   - Use indentation and numbering patterns to infer level

3. **Page Number**: Extract the page number from each entry. Convert Roman numerals to Arabic numerals if present (e.g., "iv" \u2192 4).

4. **Children**: Nest child entries under parent entries based on their hierarchy level.

5. **IMPORTANT - Extract Main TOC Only**: Only extract the main document table of contents. EXCLUDE the following supplementary indices:
   - Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, \uD654\uBCF4 \uBAA9\uCC28, Photo Index, List of Photos, List of Figures)
   - Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, \uC0BD\uB3C4 \uBAA9\uCC28, Drawing Index, List of Drawings)
   - Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
   - Appendix indices (\uBD80\uB85D \uBAA9\uCC28, Appendix Index)
   - Any other supplementary material indices

## Output Format

Return a flat array of top-level entries. Each entry at level 1 should contain its children (level 2+) nested properly.

## Example

Input:
- \uC81C1\uC7A5 \uC11C\uB860 ..... 1
  - 1. \uC5F0\uAD6C \uBC30\uACBD ..... 3
  - 2. \uC5F0\uAD6C \uBAA9\uC801 ..... 5
- \uC81C2\uC7A5 \uBC29\uBC95\uB860 ..... 10

Output:
{
  "entries": [
    {
      "title": "\uC81C1\uC7A5 \uC11C\uB860",
      "level": 1,
      "pageNo": 1,
      "children": [
        { "title": "1. \uC5F0\uAD6C \uBC30\uACBD", "level": 2, "pageNo": 3 },
        { "title": "2. \uC5F0\uAD6C \uBAA9\uC801", "level": 2, "pageNo": 5 }
      ]
    },
    { "title": "\uC81C2\uC7A5 \uBC29\uBC95\uB860", "level": 1, "pageNo": 10 }
  ]
}`;
  }
  /**
   * Build the user prompt embedding the raw Markdown content.
   */
  buildUserPrompt(markdown) {
    return `Extract the table of contents structure from the following markdown:

${markdown}`;
  }
  /**
   * Normalize extracted entries, anchoring the top level at 1.
   */
  normalizeEntries(entries) {
    if (entries.length === 0) {
      return [];
    }
    return this.normalizeLevel(entries, 1);
  }
  /**
   * Recursively ensure level consistency: each entry's level is overwritten
   * with its nesting depth, so children always have level = parent.level + 1
   * regardless of what the LLM reported.
   */
  normalizeLevel(entries, expectedLevel) {
    return entries.map((entry) => {
      // Rebuild each entry: trimmed title, depth-derived level, original page.
      const normalizedEntry = {
        title: entry.title.trim(),
        level: expectedLevel,
        pageNo: entry.pageNo
      };
      if (entry.children && entry.children.length > 0) {
        normalizedEntry.children = this.normalizeLevel(
          entry.children,
          expectedLevel + 1
        );
      }
      return normalizedEntry;
    });
  }
};
|
|
2051
|
+
|
|
2052
|
+
// src/extractors/vision-toc-extractor.ts
|
|
2053
|
+
var fs2 = __toESM(require("fs"), 1);
|
|
2054
|
+
var path2 = __toESM(require("path"), 1);
|
|
2055
|
+
var import_zod2 = require("zod");
|
|
2056
|
+
|
|
2057
|
+
// src/core/vision-llm-component.ts
|
|
2058
|
+
var fs = __toESM(require("fs"), 1);
|
|
2059
|
+
var path = __toESM(require("path"), 1);
|
|
2060
|
+
var VisionLLMComponent = class extends BaseLLMComponent {
  // Base directory against which relative image paths are resolved.
  outputPath;
  /**
   * Base class for components that send page images to a vision-capable LLM.
   *
   * @param outputPath - Directory used to resolve relative image paths;
   *   remaining parameters are forwarded unchanged to BaseLLMComponent.
   */
  constructor(logger, model, componentName, outputPath, options, fallbackModel, aggregator) {
    super(logger, model, componentName, options, fallbackModel, aggregator);
    this.outputPath = outputPath;
  }
  /**
   * Invoke the vision LLM via LLMCaller.callVision(), record token usage,
   * and return the schema-validated output together with that usage.
   *
   * @template TSchema - Zod schema type for response validation
   * @param schema - Zod schema the response must satisfy
   * @param messages - Messages array, including image content parts
   * @param phase - Phase label for usage tracking (e.g. 'extraction')
   * @returns Promise resolving to { output, usage }
   */
  async callVisionLLM(schema, messages, phase) {
    const response = await LLMCaller.callVision({
      schema,
      messages,
      primaryModel: this.model,
      fallbackModel: this.fallbackModel,
      maxRetries: this.maxRetries,
      temperature: this.temperature,
      abortSignal: this.abortSignal,
      component: this.componentName,
      phase
    });
    this.trackUsage(response.usage);
    const { output, usage } = response;
    return { output, usage };
  }
  /**
   * Read an image file from disk and return its bytes as a base64 string.
   *
   * @param imagePath - Absolute path to the image file
   */
  loadImageAsBase64(imagePath) {
    return fs.readFileSync(imagePath).toString("base64");
  }
  /**
   * Build an image content part (data URL) for a vision LLM message.
   * Relative paths are resolved against this.outputPath.
   *
   * @param imagePath - Image path, relative to outputPath or absolute
   * @param mimeType - MIME type of the image (default 'image/png')
   */
  buildImageContent(imagePath, mimeType = "image/png") {
    let resolvedPath = imagePath;
    if (!path.isAbsolute(imagePath)) {
      resolvedPath = path.resolve(this.outputPath, imagePath);
    }
    const encoded = this.loadImageAsBase64(resolvedPath);
    return {
      type: "image",
      image: `data:${mimeType};base64,${encoded}`
    };
  }
};
|
|
2119
|
+
|
|
2120
|
+
// src/extractors/vision-toc-extractor.ts
|
|
2121
|
+
// Structured response the vision LLM must return for one batch of page
// images: found-or-not flag, the TOC as markdown (or null), and whether the
// TOC appears to spill past the supplied pages.
var VisionTocExtractionSchema = import_zod2.z.object({
  hasToc: import_zod2.z.boolean().describe("Whether a TOC is visible on these pages"),
  tocMarkdown: import_zod2.z.string().nullable().describe("Extracted TOC in markdown format, null if not found"),
  continuesOnNextPage: import_zod2.z.boolean().describe("Whether TOC continues beyond these pages")
});
|
|
2126
|
+
// Vision-based TOC extractor: feeds rendered page images to a vision LLM in
// two batches (pages 1..firstBatchSize, then the next secondBatchSize pages)
// and returns the extracted TOC as markdown, or null when none is found.
var VisionTocExtractor = class extends VisionLLMComponent {
  // Pages scanned in the initial batch (default 10).
  firstBatchSize;
  // Pages scanned in the second/continuation batch (default 10).
  secondBatchSize;
  constructor(logger, model, outputPath, options, fallbackModel, aggregator) {
    super(
      logger,
      model,
      "VisionTocExtractor",
      outputPath,
      options,
      fallbackModel,
      aggregator ?? new LLMTokenUsageAggregator()
    );
    this.firstBatchSize = options?.firstBatchSize ?? 10;
    this.secondBatchSize = options?.secondBatchSize ?? 10;
  }
  /**
   * Extract TOC from page images.
   *
   * Searches the first batch of pages; if the TOC is found and flagged as
   * continuing, one continuation batch is fetched and merged. If the first
   * batch has no TOC, a second batch is tried. A usage summary is logged on
   * every exit path.
   *
   * @param totalPages - Total number of pages in the document
   * @returns Extracted TOC markdown or null if not found
   */
  async extract(totalPages) {
    this.log("info", `Starting TOC extraction from ${totalPages} pages`);
    if (totalPages === 0) {
      this.log("info", "No pages to search");
      return null;
    }
    const firstBatchEnd = Math.min(this.firstBatchSize, totalPages);
    this.log("info", `Searching first batch: pages 1-${firstBatchEnd}`);
    const firstResult = await this.extractFromBatch(1, firstBatchEnd);
    if (firstResult.hasToc && firstResult.tocMarkdown) {
      if (firstResult.continuesOnNextPage && firstBatchEnd < totalPages) {
        this.log("info", "TOC continues on next pages, extracting more");
        const continuationEnd = Math.min(
          firstBatchEnd + this.secondBatchSize,
          totalPages
        );
        const continuationResult = await this.extractFromBatch(
          firstBatchEnd + 1,
          continuationEnd
        );
        if (continuationResult.hasToc && continuationResult.tocMarkdown) {
          const merged = this.mergeMarkdown(
            firstResult.tocMarkdown,
            continuationResult.tocMarkdown
          );
          this.aggregator.logSummary(this.logger);
          this.log(
            "info",
            `TOC extracted with continuation (${merged.length} chars)`
          );
          return merged;
        }
      }
      // Continuation absent or empty: fall back to the first batch alone.
      this.aggregator.logSummary(this.logger);
      this.log(
        "info",
        `TOC found in first batch (${firstResult.tocMarkdown.length} chars)`
      );
      return firstResult.tocMarkdown;
    }
    if (firstBatchEnd < totalPages) {
      const secondBatchStart = firstBatchEnd + 1;
      const secondBatchEnd = Math.min(
        firstBatchEnd + this.secondBatchSize,
        totalPages
      );
      this.log(
        "info",
        `Searching second batch: pages ${secondBatchStart}-${secondBatchEnd}`
      );
      const secondResult = await this.extractFromBatch(
        secondBatchStart,
        secondBatchEnd
      );
      if (secondResult.hasToc && secondResult.tocMarkdown) {
        this.aggregator.logSummary(this.logger);
        this.log(
          "info",
          `TOC found in second batch (${secondResult.tocMarkdown.length} chars)`
        );
        return secondResult.tocMarkdown;
      }
    }
    this.aggregator.logSummary(this.logger);
    this.log("info", "TOC not found in any batch");
    return null;
  }
  /**
   * Extract TOC from a specific inclusive page range via one vision call.
   * All page images for the range are sent in a single user message.
   */
  async extractFromBatch(startPage, endPage) {
    this.log("info", `Extracting from pages ${startPage}-${endPage}`);
    const imageContents = this.loadPageImages(startPage, endPage);
    const result = await LLMCaller.callVision({
      schema: VisionTocExtractionSchema,
      messages: [
        {
          role: "user",
          content: [
            {
              type: "text",
              text: this.buildUserPrompt(startPage, endPage)
            },
            ...imageContents
          ]
        }
      ],
      primaryModel: this.model,
      fallbackModel: this.fallbackModel,
      maxRetries: this.maxRetries,
      temperature: this.temperature,
      abortSignal: this.abortSignal,
      component: "VisionTocExtractor",
      phase: "extraction"
    });
    this.trackUsage(result.usage);
    return result.output;
  }
  /**
   * Load page images for the range and build image message parts.
   * Files on disk are 0-indexed (document page N maps to pages/page_{N-1}.png).
   */
  loadPageImages(startPage, endPage) {
    const imageContents = [];
    for (let pageNo = startPage; pageNo <= endPage; pageNo++) {
      const imagePath = path2.resolve(
        this.outputPath,
        `pages/page_${pageNo - 1}.png`
      );
      const imageBuffer = fs2.readFileSync(imagePath);
      const base64Image = imageBuffer.toString("base64");
      imageContents.push({
        type: "image",
        image: `data:image/png;base64,${base64Image}`
      });
    }
    return imageContents;
  }
  /**
   * Merge two markdown fragments: trimmed, joined by a single newline.
   */
  mergeMarkdown(first, continuation) {
    return `${first.trim()}
${continuation.trim()}`;
  }
  /**
   * Build system prompt for vision LLM (not used; all instructions live in
   * the user prompt, but the method satisfies the base-class contract).
   */
  buildSystemPrompt() {
    return "";
  }
  /**
   * Build the user prompt with page-range information. Returned verbatim to
   * the LLM; do not reformat — the exact wording is part of the contract.
   */
  buildUserPrompt(startPage, endPage) {
    const pageCount = endPage - startPage + 1;
    return `You are a document analysis specialist. Your task is to find and extract the Table of Contents (TOC) from document page images.

I am providing ${pageCount} document page images (pages ${startPage}-${endPage}).

## Where to Look for TOC:
- TOC typically appears in the first 10-20 pages of a document
- Look for pages with headings like "\uBAA9\uCC28", "\uCC28\uB840", "Contents", "Table of Contents"
- Look for structured lists with chapter titles and page numbers

## What to Extract:
Extract the TOC content as markdown format that matches this exact structure:
- Use "- " prefix for each list item
- Use 2-space indentation for hierarchy levels
- Include "..... " followed by page number at the end of each entry
- Preserve original chapter/section numbering from the document

## Output Format Example:
\`\`\`
- \uC81C1\uC7A5 \uC11C\uB860 ..... 1
  - 1. \uC5F0\uAD6C \uBC30\uACBD ..... 3
  - 2. \uC5F0\uAD6C \uBAA9\uC801 ..... 5
- \uC81C2\uC7A5 \uC5F0\uAD6C \uBC29\uBC95 ..... 10
  - 1. \uC870\uC0AC \uC9C0\uC5ED ..... 10
  - 2. \uC870\uC0AC \uBC29\uBC95 ..... 15
- \uC81C3\uC7A5 \uC5F0\uAD6C \uACB0\uACFC ..... 25
\`\`\`

## Important Rules:
1. Extract ONLY the main document TOC
2. DO NOT include supplementary indices:
   - Photo indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28)
   - Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28)
   - Figure indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28)
3. If no TOC is found, set hasToc to false and tocMarkdown to null
4. Set continuesOnNextPage to true if the TOC appears to continue beyond the visible pages

Please examine these pages and:
1. Determine if any page contains a Table of Contents (TOC)
2. If found, extract the complete TOC in markdown format
3. Indicate if the TOC continues beyond these pages

Remember: Extract the main document TOC only. Ignore photo/table/figure indices.`;
  }
};
|
|
2329
|
+
|
|
2330
|
+
// src/parsers/caption-parser.ts
|
|
2331
|
+
var import_zod3 = require("zod");
|
|
2332
|
+
// Response shape for single-caption extraction (the batchSize === 0 path).
var CaptionSingleSchema = import_zod3.z.object({
  num: import_zod3.z.string().nullable().describe('Extracted caption prefix + number (e.g., "\uB3C4\uD310 1", "Figure 2")')
});
// One batched extraction result; `index` maps it back to the input caption.
var CaptionExtractionSchema = import_zod3.z.object({
  index: import_zod3.z.number().int().describe("Index of the caption in the input array"),
  num: import_zod3.z.string().nullable().describe('Extracted caption prefix + number (e.g., "\uB3C4\uD310 1", "Figure 2")')
});
// Envelope for a batch of caption extraction results.
var CaptionBatchSchema = import_zod3.z.object({
  results: import_zod3.z.array(CaptionExtractionSchema)
});
|
|
2342
|
+
var CaptionParser = class extends TextLLMComponent {
|
|
2343
|
+
  /**
   * @param logger - Logger instance
   * @param model - Primary language model
   * @param options - Optional settings; options.componentName overrides the
   *   default "CaptionParser" log prefix
   * @param fallbackModel - Optional fallback model
   * @param aggregator - Token-usage aggregator; a fresh
   *   LLMTokenUsageAggregator is created when none is supplied
   */
  constructor(logger, model, options, fallbackModel, aggregator) {
    super(
      logger,
      model,
      options?.componentName ?? "CaptionParser",
      options,
      fallbackModel,
      aggregator ?? new LLMTokenUsageAggregator()
    );
  }
|
|
2353
|
+
  /**
   * Parse a batch of captions, extracting the numbering prefix from each.
   *
   * Two execution paths:
   *  - batchSize === 0: one sequential LLM call per caption (CaptionSingleSchema);
   *  - batchSize > 0: BatchProcessor fans captions out in index-tagged batches,
   *    then results are re-sorted by index to restore input order.
   *
   * @param captions - Array of caption full texts
   * @param batchSize - Batch size for processing; 0 means sequential, unbatched
   * @param overrideModel - Optional model to use instead of the default model
   * @returns Array of Caption objects with num extracted (original order preserved)
   * @throws {CaptionParseError} When any underlying LLM call fails
   */
  async parseBatch(captions, batchSize, overrideModel) {
    const effectiveModel = overrideModel ?? this.model;
    const isOverride = overrideModel !== void 0;
    // Best-effort model name for logging; falls back to "unknown".
    const modelName = effectiveModel.modelId ?? effectiveModel.id ?? "unknown";
    this.log(
      "info",
      `Starting caption parsing for ${captions.length} captions with ${isOverride ? "override " : ""}model: ${modelName}`
    );
    if (captions.length === 0) {
      this.log("info", "No captions to parse");
      return [];
    }
    try {
      if (batchSize === 0) {
        // Sequential path: one LLM round-trip per caption, in input order.
        this.log("info", "Using sequential processing (batchSize=0)");
        const results2 = [];
        for (let i = 0; i < captions.length; i++) {
          const fullText = captions[i];
          this.log("info", `Processing ${i + 1} / ${captions.length}...`);
          const result = await LLMCaller.call({
            schema: CaptionSingleSchema,
            systemPrompt: this.buildSystemPrompt("single"),
            userPrompt: this.buildUserPromptSingle(fullText),
            primaryModel: effectiveModel,
            fallbackModel: this.fallbackModel,
            maxRetries: this.maxRetries,
            temperature: this.temperature,
            abortSignal: this.abortSignal,
            component: this.componentName,
            phase: "caption-extraction"
          });
          this.trackUsage(result.usage);
          // Post-process the LLM answer against the raw caption text.
          const finalNum = this.extractNumFromFullText(
            fullText,
            result.output.num
          );
          results2.push({ fullText, num: finalNum });
        }
        this.aggregator.logSummary(this.logger);
        this.log(
          "info",
          `Completed: ${results2.length} captions parsed, ${results2.filter((r) => r.num).length} with extracted numbers`
        );
        return results2;
      }
      // Batched path: tag each caption with its index so order can be
      // restored after parallel batch processing.
      const indexedCaptions = captions.map((text, index) => ({ index, text }));
      const batchResults = await BatchProcessor.processBatch(
        indexedCaptions,
        batchSize,
        async (batch) => this.parseBatchInternal(batch, effectiveModel)
      );
      batchResults.sort((a, b) => a.index - b.index);
      const results = batchResults.map((r) => r.caption);
      this.aggregator.logSummary(this.logger);
      this.log(
        "info",
        `Completed: ${results.length} captions parsed, ${results.filter((r) => r.num).length} with extracted numbers`
      );
      return results;
    } catch (error) {
      // Wrap every failure, preserving the original error as `cause`.
      const message = error instanceof Error ? error.message : String(error);
      this.log("error", `Parsing failed: ${message}`);
      throw new CaptionParseError(`Failed to parse captions: ${message}`, {
        cause: error
      });
    }
  }
|
|
2428
|
+
/**
|
|
2429
|
+
* Internal: Parse batch of captions using LLM
|
|
2430
|
+
*
|
|
2431
|
+
* @param captions - Batch of caption texts with original indices
|
|
2432
|
+
* @param model - Effective model to use
|
|
2433
|
+
* @returns Array of Caption objects indexed correctly
|
|
2434
|
+
*/
|
|
2435
|
+
async parseBatchInternal(captions, model) {
|
|
2436
|
+
const result = await LLMCaller.call({
|
|
2437
|
+
schema: CaptionBatchSchema,
|
|
2438
|
+
systemPrompt: this.buildSystemPrompt(),
|
|
2439
|
+
userPrompt: this.buildUserPrompt(captions),
|
|
2440
|
+
primaryModel: model,
|
|
2441
|
+
fallbackModel: this.fallbackModel,
|
|
2442
|
+
maxRetries: this.maxRetries,
|
|
2443
|
+
temperature: this.temperature,
|
|
2444
|
+
abortSignal: this.abortSignal,
|
|
2445
|
+
component: this.componentName,
|
|
2446
|
+
phase: "caption-extraction"
|
|
2447
|
+
});
|
|
2448
|
+
this.trackUsage(result.usage);
|
|
2449
|
+
if (result.output.results.length !== captions.length) {
|
|
2450
|
+
this.log(
|
|
2451
|
+
"warn",
|
|
2452
|
+
`LLM returned ${result.output.results.length} results for ${captions.length} captions. This may cause index mismatch.`
|
|
2453
|
+
);
|
|
2454
|
+
}
|
|
2455
|
+
const captionMap = new Map(captions.map((c) => [c.index, c.text]));
|
|
2456
|
+
return result.output.results.map((resultItem) => {
|
|
2457
|
+
const originalCaption = captions[resultItem.index];
|
|
2458
|
+
const originalIndex = originalCaption?.index ?? resultItem.index;
|
|
2459
|
+
const fullText = captionMap.get(originalIndex) || "";
|
|
2460
|
+
const finalNum = this.extractNumFromFullText(fullText, resultItem.num);
|
|
2461
|
+
return {
|
|
2462
|
+
index: originalIndex,
|
|
2463
|
+
caption: {
|
|
2464
|
+
fullText,
|
|
2465
|
+
num: finalNum
|
|
2466
|
+
}
|
|
2467
|
+
};
|
|
2468
|
+
});
|
|
2469
|
+
}
|
|
2470
|
+
/**
|
|
2471
|
+
* Extract and normalize caption number from full text
|
|
2472
|
+
*
|
|
2473
|
+
* Finds the extracted num pattern in the full text and extracts it
|
|
2474
|
+
* with original casing. Handles case-insensitive matching.
|
|
2475
|
+
*
|
|
2476
|
+
* @param fullText - The full caption text
|
|
2477
|
+
* @param extractedNum - The num extracted by LLM (may have different casing)
|
|
2478
|
+
* @returns Normalized num or undefined if no match
|
|
2479
|
+
*/
|
|
2480
|
+
extractNumFromFullText(fullText, extractedNum) {
|
|
2481
|
+
if (!extractedNum) return void 0;
|
|
2482
|
+
let matchIndex = fullText.indexOf(extractedNum);
|
|
2483
|
+
if (matchIndex === -1) {
|
|
2484
|
+
const lowerFullText = fullText.toLowerCase();
|
|
2485
|
+
const lowerNum = extractedNum.toLowerCase();
|
|
2486
|
+
matchIndex = lowerFullText.indexOf(lowerNum);
|
|
2487
|
+
if (matchIndex !== -1) {
|
|
2488
|
+
return fullText.substring(matchIndex, matchIndex + extractedNum.length);
|
|
2489
|
+
}
|
|
2490
|
+
return extractedNum;
|
|
2491
|
+
}
|
|
2492
|
+
return fullText.substring(matchIndex, matchIndex + extractedNum.length);
|
|
2493
|
+
}
|
|
2494
|
+
/**
 * Build system prompt for caption parsing
 *
 * Returns the instruction block sent as the system message. The rules and
 * examples (Korean caption prefixes encoded as \uXXXX escapes) define the
 * extraction contract that CaptionSingleSchema/CaptionBatchSchema validate.
 *
 * @param mode - 'batch' for multiple captions, 'single' for single caption;
 *               only the intro sentence differs between the two modes.
 */
buildSystemPrompt(mode = "batch") {
  // Intro varies by mode; the rule/example body below is shared verbatim.
  const intro = mode === "batch" ? 'Extract the caption prefix and number (e.g., "\uB3C4\uD310 1", "Figure 2") from image/table captions.\nReturn the prefix + number part as a string, or null if no number exists.' : 'Extract the caption prefix and number (e.g., "\uB3C4\uD310 1", "Figure 2") from an image/table caption.\nReturn the prefix + number part as a string, or null if no number exists.';
  // NOTE: everything inside this template literal is runtime prompt text —
  // do not edit it cosmetically; the LLM's output format depends on it.
  return `You are a caption prefix extractor for archaeological excavation reports.

${intro}

Rules:
1. Extract if the text follows a caption pattern: <prefix word(s)> <number>
- The prefix can be ANY Korean/English word(s) that label images/tables/figures
- Common examples: \uB3C4\uD310, \uC0AC\uC9C4, \uADF8\uB9BC, \uB3C4\uBA74, \uD45C, \uC6D0\uC0C9\uC0AC\uC9C4, \uD751\uBC31\uC0AC\uC9C4, Figure, Photo, Plate, etc.
- The key is the PATTERN (text followed by number), not a specific word list
- "\uC6D0\uC0C9\uC0AC\uC9C4 1. \uC870\uC0AC\uC9C0\uC5ED" \u2192 "\uC6D0\uC0C9\uC0AC\uC9C4 1" (valid: prefix + number pattern)
- "\uD751\uBC31\uC0AC\uC9C4 2 \uCD9C\uD1A0\uC720\uBB3C" \u2192 "\uD751\uBC31\uC0AC\uC9C4 2" (valid: prefix + number pattern)
2. IGNORE leading punctuation/brackets when extracting:
- "(\uC0AC\uC9C4 16> \uB290\uD2F0\uB098\uBB34" \u2192 "\uC0AC\uC9C4 16" (ignore leading '(' and extract the pattern inside)
- "<\uB3C4\uD310 1> \uC720\uC801" \u2192 "\uB3C4\uD310 1" (ignore angle brackets)
- "[\uADF8\uB9BC 2] \uC804\uACBD" \u2192 "\uADF8\uB9BC 2" (ignore square brackets)
3. Do NOT extract (return null) if:
- It's a numbered list item starting with just a number: "1. \uC720\uC801 \uC804\uACBD" \u2192 null
- It's a date/time reference: "39 3\uC6D4 28\uC77C..." \u2192 null
- It's a year reference: "2024\uB144 \uC870\uC0AC \uD604\uD669" \u2192 null
- It starts with a number without a prefix: "123 \uC124\uBA85" \u2192 null
4. PRESERVE original spacing from the input text exactly (after ignoring leading punctuation)
5. Include the full number (e.g., "1-2", "3a") not just the first digit
6. Include period/dot after number if it directly follows (e.g., "3.6" \u2192 "\uB3C4\uD310 3.6")
- "\uADF8\uB9BC 3.6. \uD55C\uBC18\uB3C4 \uC911\uBD80" \u2192 "\uADF8\uB9BC 3.6" (period after decimal number included)
- "\uB3C4\uD310 2. \uC720\uC801" \u2192 "\uB3C4\uD310 2" (period after space NOT included)
7. Stop at the first punctuation (except decimal point), whitespace, or underscore after the number
- "\uC0AC\uC9C4 1_\u3147\u3147\u3147" \u2192 "\uC0AC\uC9C4 1" (stop at underscore)
- "\uC0AC\uC9C4 1 \u3147\u3147\u3147" \u2192 "\uC0AC\uC9C4 1" (stop at space)
- "\uADF8\uB9BC 3.6. \uD55C\uBC18\uB3C4" \u2192 "\uADF8\uB9BC 3.6" (period included as decimal separator)

Examples:
- "\uB3C4\uD310 1 \uC720\uC801 \uC804\uACBD" \u2192 "\uB3C4\uD310 1"
- "\uC6D0\uC0C9\uC0AC\uC9C4 1. \uC870\uC0AC\uC9C0\uC5ED \uC6D0\uACBD" \u2192 "\uC6D0\uC0C9\uC0AC\uC9C4 1"
- "\uD751\uBC31\uC0AC\uC9C4 2 \uCD9C\uD1A0\uC720\uBB3C" \u2192 "\uD751\uBC31\uC0AC\uC9C4 2"
- "(\uC0AC\uC9C4 16> \uB290\uD2F0\uB098\uBB34\uC758 \uC811\uC120\uB2E8\uBA74" \u2192 "\uC0AC\uC9C4 16" (ignore leading punctuation)
- "<\uB3C4\uD310 3> \uC720\uBB3C \uC0AC\uC9C4" \u2192 "\uB3C4\uD310 3" (ignore angle brackets)
- "\uB3C4\uD3101 \uC5B4\uCA4C\uAD6C" \u2192 "\uB3C4\uD3101" (no space preserved)
- "\uC0AC\uC9C4 2. \uCD9C\uD1A0 \uC720\uBB3C" \u2192 "\uC0AC\uC9C4 2" (period after space, not included)
- "\uADF8\uB9BC 3.6. \uD55C\uBC18\uB3C4 \uC911\uBD80 \uBC0F \uB0A8\uBD80\uC758 \u3147\u3147\u3147" \u2192 "\uADF8\uB9BC 3.6" (period as decimal included)
- "Figure 3: Site plan" \u2192 "Figure 3"
- "Table 4a. Artifact list" \u2192 "Table 4a"
- "\uB3C4\uD310 5-2 \uCE35\uC704 \uB2E8\uBA74" \u2192 "\uB3C4\uD310 5-2"
- "\uC124\uBA85 \uC5C6\uB294 \uC774\uBBF8\uC9C0" \u2192 null
- "39 3\uC6D4 28\uC77C(\uBC31\uC81C \uB3C4\uB85C\uC720\uAD6C \uB0B4\uBD80 \uC870\uC0AC)" \u2192 null (starts with number, no prefix)
- "1. \uC720\uAD6C \uD604\uD669" \u2192 null (numbered list, not caption)
- "2024-05-01 \uCD2C\uC601" \u2192 null (date, not caption)`;
}
|
|
2548
|
+
/**
|
|
2549
|
+
* Build user prompt for caption parsing
|
|
2550
|
+
*/
|
|
2551
|
+
buildUserPrompt(captions) {
|
|
2552
|
+
const captionList = captions.map((c) => `[${c.index}] ${c.text}`).join("\n");
|
|
2553
|
+
return `Extract caption prefix and number from the following captions:
|
|
2554
|
+
|
|
2555
|
+
${captionList}
|
|
2556
|
+
|
|
2557
|
+
Return the results as JSON array with "index" (original position) and "num" (extracted prefix + number or null).
|
|
2558
|
+
|
|
2559
|
+
Example format:
|
|
2560
|
+
[
|
|
2561
|
+
{ "index": 0, "num": "\uB3C4\uD310 1" },
|
|
2562
|
+
{ "index": 1, "num": "Figure 2" },
|
|
2563
|
+
{ "index": 2, "num": null }
|
|
2564
|
+
]`;
|
|
2565
|
+
}
|
|
2566
|
+
/**
 * Build user prompt for single caption parsing
 *
 * Unlike the batch prompt, this one aggressively constrains the output shape
 * ({ "num": value } only) because single-caption calls are the fallback path
 * and were observed to attract wrappers/labels from the model.
 *
 * @param caption - raw caption text to analyze
 */
buildUserPromptSingle(caption) {
  // Runtime prompt text — keep byte-for-byte; escaped backticks render a
  // code-fence example inside the literal.
  return `Extract caption prefix and number from the following caption:

"${caption}"

CRITICAL: Return ONLY the JSON object directly with a "num" field.
- DO NOT wrap the JSON in quotes or additional formatting
- DO NOT output "final:", "result:", or any prefix labels
- DO NOT wrap in backticks or code blocks
- Return ONLY valid JSON: { "num": value }

The value must be:
- A string with the extracted caption prefix + number (e.g., "\uB3C4\uD310 1", "Figure 2")
- null if no number exists

Valid outputs:
{ "num": "\uB3C4\uD310 1" }
{ "num": null }

Invalid outputs (NEVER do these):
- { "final": "..." } \u274C
- \`\`\`json { "num": "..." } \`\`\` \u274C
- "{ "num": "..." }" \u274C
- { "num": { "value": "..." } } \u274C`;
}
|
|
2594
|
+
};
|
|
2595
|
+
/**
 * Error thrown when caption parsing fails; wraps the underlying failure via
 * the standard `cause` option.
 */
var CaptionParseError = class extends Error {
  // Distinguishes this error in logs and `err.name` checks.
  name = "CaptionParseError";
  constructor(message, options) {
    super(message, options);
  }
};
|
|
2601
|
+
|
|
2602
|
+
// src/parsers/page-range-parse-error.ts
|
|
2603
|
+
/**
 * Error type for page-range parsing failures, with helpers to wrap arbitrary
 * thrown values while preserving them as `cause`.
 */
var PageRangeParseError = class _PageRangeParseError extends Error {
  constructor(message, options) {
    super(message, options);
    this.name = "PageRangeParseError";
  }
  /**
   * Stringify any thrown value, preferring Error#message when available.
   */
  static getErrorMessage(error) {
    if (error instanceof Error) {
      return error.message;
    }
    return String(error);
  }
  /**
   * Build a PageRangeParseError of the form "<context>: <detail>", keeping
   * the original error reachable through `cause`.
   */
  static fromError(context, error) {
    const detail = _PageRangeParseError.getErrorMessage(error);
    return new _PageRangeParseError(`${context}: ${detail}`, { cause: error });
  }
};
|
|
2624
|
+
|
|
2625
|
+
// src/parsers/page-range-parser.ts
|
|
2626
|
+
var fs3 = __toESM(require("fs"), 1);
|
|
2627
|
+
var path3 = __toESM(require("path"), 1);
|
|
2628
|
+
var import_zod4 = require("zod");
|
|
2629
|
+
// Enum-style registry of recognized page-numbering patterns (transpiled
// TypeScript enum shape: string keys mapping to string values).
var PagePattern = /* @__PURE__ */ (function (registry) {
  registry.SIMPLE_INCREMENT = "simple_increment"; // printed page = pdf page + constant
  registry.DOUBLE_SIDED = "double_sided"; // one scan holds two printed pages
  registry.OFFSET = "offset"; // near-constant offset with slight jitter
  registry.UNKNOWN = "unknown"; // no pattern detected
  return registry;
})(PagePattern || {});
|
|
2636
|
+
var PageRangeParser = class extends VisionLLMComponent {
|
|
2637
|
+
// Configuration constants
// Pages sampled per LLM probe when inferring the numbering pattern.
SAMPLE_SIZE = 3;
// Extra re-sampling attempts after the first (total attempts = MAX_PATTERN_RETRIES + 1).
MAX_PATTERN_RETRIES = 6;
// Quantization bucket used by createSizeKey when grouping pages by physical
// size (units are whatever page.size uses — presumably points; confirm).
SIZE_TOLERANCE = 5;
/**
 * @param logger - sink passed through to the VisionLLMComponent base
 * @param model - primary vision model identifier
 * @param outputPath - directory that page image URIs are resolved against
 * @param maxRetries - per-call LLM retry budget (default 3)
 * @param fallbackModel - optional secondary model
 * @param aggregator - token-usage aggregator; a fresh one is created if omitted
 * @param abortSignal - optional cancellation signal
 */
constructor(logger, model, outputPath, maxRetries = 3, fallbackModel, aggregator, abortSignal) {
  // NOTE: argument order here must match the VisionLLMComponent base class
  // (component name is fixed to "PageRangeParser").
  super(
    logger,
    model,
    "PageRangeParser",
    outputPath,
    { maxRetries, abortSignal },
    fallbackModel,
    aggregator ?? new LLMTokenUsageAggregator()
  );
}
|
|
2652
|
+
/**
 * Main parse method
 *
 * Extracts page range mapping from DoclingDocument using Vision LLM.
 * Automatically tracks token usage in the aggregator if one was provided.
 *
 * Pipeline: group pages by physical size -> per-group pattern detection via
 * sampled LLM calls -> merge per-group maps -> heuristic post-processing.
 *
 * @param doclingDoc - DoclingDocument to extract page ranges from
 * @returns Object with page range mapping and token usage information
 */
async parse(doclingDoc) {
  this.log("info", "Starting page range parsing...");
  const pages = this.extractPages(doclingDoc);
  if (pages.length === 0) {
    // Empty document: still record a zero-usage entry so reporting stays consistent.
    this.log("warn", "No pages found");
    const emptyUsage = this.createEmptyUsage("sampling");
    this.trackUsage(emptyUsage);
    return {
      pageRangeMap: {},
      usage: [emptyUsage]
    };
  }
  // Pages with different physical dimensions may follow different numbering
  // schemes (e.g. plates vs. text), so each size group is processed independently.
  const sizeGroups = this.analyzeSizes(pages);
  this.log(
    "info",
    `Found ${sizeGroups.length} size group(s), total ${pages.length} pages`
  );
  const pageRangeMap = {};
  const usageList = [];
  for (let i = 0; i < sizeGroups.length; i++) {
    const group = sizeGroups[i];
    this.log(
      "info",
      `Processing group ${i + 1}/${sizeGroups.length}: ${group.pageNos.length} pages`
    );
    const groupResult = await this.processGroup(pages, group, this.model);
    // Later groups never share pdf page numbers with earlier ones, so a
    // shallow merge is safe.
    Object.assign(pageRangeMap, groupResult.pageRangeMap);
    usageList.push(...groupResult.usage);
  }
  // Usage is accumulated per call inside processGroup but only tracked here,
  // once, after all groups finish.
  for (const usage of usageList) {
    this.trackUsage(usage);
  }
  // Mutates pageRangeMap in place (outlier/drop/negative/backfill fix-ups).
  this.postProcess(pageRangeMap);
  this.log(
    "info",
    `Completed: ${Object.keys(pageRangeMap).length} pages mapped`
  );
  return { pageRangeMap, usage: usageList };
}
|
|
2700
|
+
/**
|
|
2701
|
+
* Extract pages array from DoclingDocument
|
|
2702
|
+
*/
|
|
2703
|
+
extractPages(doclingDoc) {
|
|
2704
|
+
const pageKeys = Object.keys(doclingDoc.pages).map(Number).filter((n) => !Number.isNaN(n)).sort((a, b) => a - b);
|
|
2705
|
+
return pageKeys.map((key) => doclingDoc.pages[String(key)]);
|
|
2706
|
+
}
|
|
2707
|
+
/**
|
|
2708
|
+
* Analyze page sizes and group consecutive pages with same dimensions
|
|
2709
|
+
*/
|
|
2710
|
+
analyzeSizes(pages) {
|
|
2711
|
+
const groups = [];
|
|
2712
|
+
let currentGroup = null;
|
|
2713
|
+
for (const page of pages) {
|
|
2714
|
+
const sizeKey = this.createSizeKey(page.size.width, page.size.height);
|
|
2715
|
+
if (!currentGroup || currentGroup.sizeKey !== sizeKey) {
|
|
2716
|
+
currentGroup = { sizeKey, pageNos: [page.page_no] };
|
|
2717
|
+
groups.push(currentGroup);
|
|
2718
|
+
} else {
|
|
2719
|
+
currentGroup.pageNos.push(page.page_no);
|
|
2720
|
+
}
|
|
2721
|
+
}
|
|
2722
|
+
return groups;
|
|
2723
|
+
}
|
|
2724
|
+
/**
|
|
2725
|
+
* Create size key with tolerance for floating point comparison
|
|
2726
|
+
*/
|
|
2727
|
+
createSizeKey(width, height) {
|
|
2728
|
+
const roundedWidth = Math.round(width / this.SIZE_TOLERANCE);
|
|
2729
|
+
const roundedHeight = Math.round(height / this.SIZE_TOLERANCE);
|
|
2730
|
+
return `${roundedWidth}x${roundedHeight}`;
|
|
2731
|
+
}
|
|
2732
|
+
/**
 * Process a single size group
 *
 * Small groups (<= SAMPLE_SIZE pages) are sent to the LLM wholesale. Larger
 * groups are probed with random samples until a numbering pattern is
 * detected, retrying up to MAX_PATTERN_RETRIES additional times with fresh
 * samples; failure to converge raises PageRangeParseError.
 *
 * @returns { pageRangeMap, usage } for this group only (caller merges/tracks)
 */
async processGroup(pages, group, model) {
  const { pageNos } = group;
  const usageList = [];
  if (pageNos.length <= this.SAMPLE_SIZE) {
    // Group is no bigger than one sample: read every page directly, no
    // pattern inference needed.
    this.log(
      "info",
      `Small group (${pageNos.length} pages), extracting all at once`
    );
    const result = await this.extractMultiplePages(pages, pageNos, model);
    usageList.push(result.usage);
    return {
      pageRangeMap: this.samplesToMap(result.samples),
      usage: usageList
    };
  }
  // Pages already probed in earlier attempts, so retries prefer new pages.
  const sampledPages = /* @__PURE__ */ new Set();
  for (let attempt = 0; attempt <= this.MAX_PATTERN_RETRIES; attempt++) {
    const samplePageNos = this.selectRandomSamples(
      pageNos,
      this.SAMPLE_SIZE,
      sampledPages
    );
    for (const p of samplePageNos) {
      sampledPages.add(p);
    }
    this.log(
      "info",
      `Attempt ${attempt + 1}/${this.MAX_PATTERN_RETRIES + 1}: sampling pages ${samplePageNos.join(", ")}`
    );
    const result = await this.extractMultiplePages(
      pages,
      samplePageNos,
      model
    );
    // Usage is recorded even for attempts whose pattern detection fails.
    usageList.push(result.usage);
    const samples = result.samples;
    const pattern = this.detectPattern(samples);
    if (pattern.pattern !== "unknown" /* UNKNOWN */) {
      this.log(
        "info",
        `Pattern detected: ${pattern.pattern} (offset=${pattern.offset}, increment=${pattern.increment})`
      );
      // Extrapolate the detected pattern across the whole group.
      return {
        pageRangeMap: this.applyPattern(pageNos, pattern),
        usage: usageList
      };
    }
    this.log(
      "warn",
      `Pattern detection failed, attempt ${attempt + 1}/${this.MAX_PATTERN_RETRIES + 1}`
    );
  }
  throw new PageRangeParseError(
    `Failed to detect page pattern after ${this.MAX_PATTERN_RETRIES + 1} attempts for size group with ${pageNos.length} pages`
  );
}
|
|
2791
|
+
/**
|
|
2792
|
+
* Select random samples from page numbers
|
|
2793
|
+
*/
|
|
2794
|
+
selectRandomSamples(pageNos, count, exclude = /* @__PURE__ */ new Set()) {
|
|
2795
|
+
const available = pageNos.filter((p) => !exclude.has(p));
|
|
2796
|
+
const pool = available.length >= count ? available : pageNos;
|
|
2797
|
+
const shuffled = [...pool];
|
|
2798
|
+
for (let i = shuffled.length - 1; i > 0; i--) {
|
|
2799
|
+
const j = Math.floor(Math.random() * (i + 1));
|
|
2800
|
+
[shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
|
|
2801
|
+
}
|
|
2802
|
+
return shuffled.slice(0, count).sort((a, b) => a - b);
|
|
2803
|
+
}
|
|
2804
|
+
/**
 * Extract page numbers from multiple pages in a single LLM call
 *
 * Loads each sampled page image from disk, inlines them as base64 data URLs,
 * and asks the vision model to read the printed page number(s) off each
 * image. Results are re-keyed from the model's 0-based imageIndex back to
 * PDF page numbers.
 *
 * @param pages - full ordered pages array from extractPages
 * @param pageNos - the PDF page numbers to sample
 * @param model - primary model for this call
 * @returns { samples: [{ pdfPageNo, startPageNo, endPageNo }], usage }
 * @throws PageRangeParseError wrapping any failure (file I/O or LLM call)
 */
async extractMultiplePages(pages, pageNos, model) {
  this.log("info", `Extracting ${pageNos.length} pages in single LLM call`);
  const imageContents = [];
  for (const pageNo of pageNos) {
    // NOTE(review): assumes pages[] is dense and 1-based by page_no — i.e.
    // pages[pageNo - 1].page_no === pageNo; confirm against extractPages.
    const page = pages[pageNo - 1];
    const imagePath = path3.resolve(this.outputPath, page.image.uri);
    // Synchronous read: sample sizes are small (SAMPLE_SIZE images).
    const imageBuffer = fs3.readFileSync(imagePath);
    const base64Image = imageBuffer.toString("base64");
    const mimeType = page.image.mimetype || "image/png";
    imageContents.push({
      type: "image",
      image: `data:${mimeType};base64,${base64Image}`
    });
  }
  // Response schema: one entry per image; nulls mean "no page number visible".
  const schema = import_zod4.z.object({
    pages: import_zod4.z.array(
      import_zod4.z.object({
        imageIndex: import_zod4.z.number().describe("0-based index of the image in the request"),
        startPageNo: import_zod4.z.number().nullable().describe("Start page number (null if not found)"),
        endPageNo: import_zod4.z.number().nullable().describe(
          "End page number for double-sided scans (null for single page)"
        )
      })
    ).describe("Extracted page numbers for each image")
  });
  try {
    const result = await LLMCaller.callVision({
      schema,
      messages: [
        {
          role: "user",
          content: [
            // NOTE(review): this is PageRangeParser's own buildUserPrompt
            // (takes pageNos; defined outside this excerpt), not the
            // caption-parser method of the same name.
            { type: "text", text: this.buildUserPrompt(pageNos) },
            ...imageContents
          ]
        }
      ],
      primaryModel: model,
      fallbackModel: this.fallbackModel,
      maxRetries: this.maxRetries,
      temperature: 0,
      abortSignal: this.abortSignal,
      component: "PageRangeParser",
      phase: "sampling"
    });
    // Translate model-side image indices back to PDF page numbers.
    const samples = result.output.pages.map((p) => ({
      pdfPageNo: pageNos[p.imageIndex],
      startPageNo: p.startPageNo,
      endPageNo: p.endPageNo
    }));
    return { samples, usage: result.usage };
  } catch (error) {
    this.log("error", "Multi-image extraction failed:", error);
    throw PageRangeParseError.fromError(
      "Multi-image extraction failed",
      error
    );
  }
}
|
|
2866
|
+
/**
 * Detect pattern from sample results
 *
 * Tests three hypotheses in order of strictness against the non-null samples:
 *  1. simple_increment — each scan is one printed page and page numbers grow
 *     1:1 with PDF page numbers (exact match required);
 *  2. double_sided — each scan spans two consecutive printed pages
 *     (start..start+1) and numbers grow 2:1 with PDF pages;
 *  3. offset — page numbers differ from PDF numbers by a near-constant
 *     offset (each sample within ±1 of the rounded mean offset).
 * Returns UNKNOWN when fewer than 2 usable samples exist or nothing fits.
 */
detectPattern(samples) {
  const validSamples = samples.filter((s) => s.startPageNo !== null);
  if (validSamples.length < 2) {
    // Not enough evidence to extrapolate anything.
    return { pattern: "unknown" /* UNKNOWN */, offset: 0, increment: 1 };
  }
  // Sort by PDF page so adjacent-sample differences are meaningful.
  validSamples.sort((a, b) => a.pdfPageNo - b.pdfPageNo);
  // Hypothesis 1: single printed page per scan, strict 1:1 growth.
  const isSimple = validSamples.every((s, i) => {
    if (s.endPageNo !== null && s.startPageNo !== s.endPageNo) return false;
    if (i === 0) return true;
    const prev = validSamples[i - 1];
    const expectedIncrease = s.pdfPageNo - prev.pdfPageNo;
    return s.startPageNo === prev.startPageNo + expectedIncrease;
  });
  if (isSimple) {
    const firstSample = validSamples[0];
    const offset = firstSample.startPageNo - firstSample.pdfPageNo;
    return { pattern: "simple_increment" /* SIMPLE_INCREMENT */, offset, increment: 1 };
  }
  // Hypothesis 2: two printed pages per scan, strict 2:1 growth.
  const isDoubleSided = validSamples.every((s, i) => {
    if (s.endPageNo === null) return false;
    if (s.endPageNo !== s.startPageNo + 1) return false;
    if (i === 0) return true;
    const prev = validSamples[i - 1];
    const pdfDiff = s.pdfPageNo - prev.pdfPageNo;
    const expectedStartDiff = pdfDiff * 2;
    const actualStartDiff = s.startPageNo - prev.startPageNo;
    return actualStartDiff === expectedStartDiff;
  });
  if (isDoubleSided) {
    const firstSample = validSamples[0];
    // Offset is defined against pdfPageNo * 2 (see applyPattern).
    const offset = firstSample.startPageNo - firstSample.pdfPageNo * 2;
    return { pattern: "double_sided" /* DOUBLE_SIDED */, offset, increment: 2 };
  }
  // Hypothesis 3: tolerate ±1 jitter around the mean offset (handles the
  // occasional misread digit).
  const offsets = validSamples.map((s) => s.startPageNo - s.pdfPageNo);
  const avgOffset = Math.round(
    offsets.reduce((a, b) => a + b, 0) / offsets.length
  );
  const isConsistentOffset = offsets.every(
    (o) => Math.abs(o - avgOffset) <= 1
  );
  if (isConsistentOffset) {
    return { pattern: "offset" /* OFFSET */, offset: avgOffset, increment: 1 };
  }
  return { pattern: "unknown" /* UNKNOWN */, offset: 0, increment: 1 };
}
|
|
2914
|
+
/**
|
|
2915
|
+
* Apply detected pattern to generate page range map
|
|
2916
|
+
*/
|
|
2917
|
+
applyPattern(pageNos, pattern) {
|
|
2918
|
+
const result = {};
|
|
2919
|
+
for (const pdfPageNo of pageNos) {
|
|
2920
|
+
switch (pattern.pattern) {
|
|
2921
|
+
case "simple_increment" /* SIMPLE_INCREMENT */:
|
|
2922
|
+
case "offset" /* OFFSET */: {
|
|
2923
|
+
const pageNo = pdfPageNo + pattern.offset;
|
|
2924
|
+
result[pdfPageNo] = {
|
|
2925
|
+
startPageNo: pageNo,
|
|
2926
|
+
endPageNo: pageNo
|
|
2927
|
+
};
|
|
2928
|
+
break;
|
|
2929
|
+
}
|
|
2930
|
+
case "double_sided" /* DOUBLE_SIDED */: {
|
|
2931
|
+
const start = pdfPageNo * 2 + pattern.offset;
|
|
2932
|
+
result[pdfPageNo] = {
|
|
2933
|
+
startPageNo: start,
|
|
2934
|
+
endPageNo: start + 1
|
|
2935
|
+
};
|
|
2936
|
+
break;
|
|
2937
|
+
}
|
|
2938
|
+
default:
|
|
2939
|
+
result[pdfPageNo] = { startPageNo: 0, endPageNo: 0 };
|
|
2940
|
+
}
|
|
2941
|
+
}
|
|
2942
|
+
return result;
|
|
2943
|
+
}
|
|
2944
|
+
/**
|
|
2945
|
+
* Convert sample results to page range map (for small groups)
|
|
2946
|
+
*/
|
|
2947
|
+
samplesToMap(samples) {
|
|
2948
|
+
const result = {};
|
|
2949
|
+
for (const sample of samples) {
|
|
2950
|
+
if (sample.startPageNo !== null) {
|
|
2951
|
+
result[sample.pdfPageNo] = {
|
|
2952
|
+
startPageNo: sample.startPageNo,
|
|
2953
|
+
endPageNo: sample.endPageNo ?? sample.startPageNo
|
|
2954
|
+
};
|
|
2955
|
+
} else {
|
|
2956
|
+
result[sample.pdfPageNo] = { startPageNo: 0, endPageNo: 0 };
|
|
2957
|
+
}
|
|
2958
|
+
}
|
|
2959
|
+
return result;
|
|
2960
|
+
}
|
|
2961
|
+
/**
 * Post-process the page range map (mutates it in place)
 *
 * Order matters: outliers are neutralized before drop detection so drops are
 * measured against plausible values; negatives are normalized to the {0,0}
 * failure marker before the final backfill pass fills those markers in.
 */
postProcess(pageRangeMap) {
  this.detectAndHandleOutliers(pageRangeMap); // early pages with absurdly high numbers -> mark failed
  this.detectAndHandleDrops(pageRangeMap); // sudden decreases -> recompute earlier pages
  this.normalizeNegatives(pageRangeMap); // negative page numbers -> {0, 0}
  this.backfillFailedPages(pageRangeMap); // fill {0, 0} entries (defined below this excerpt)
}
|
|
2970
|
+
/**
 * Detect and handle outlier page numbers at the beginning of document
 *
 * When early PDF pages have abnormally high page numbers compared to
 * subsequent pages (e.g., PDF 1-9 = 75-83, but PDF 10+ = 2,3,4...),
 * the LLM likely misread figure/photo numbers as page numbers.
 *
 * Detection: find the first run of >= 3 consistently-numbered pages, then
 * extrapolate backwards from its start; any earlier page whose number
 * exceeds that extrapolation by more than 10 is reset to the {0,0} failure
 * marker for later backfill. Mutates pageRangeMap in place.
 */
detectAndHandleOutliers(pageRangeMap) {
  const pdfPages = Object.keys(pageRangeMap).map(Number).sort((a, b) => a - b);
  if (pdfPages.length < 3) return;
  const normalSequenceStart = this.findNormalSequenceStart(
    pageRangeMap,
    pdfPages
  );
  // Nothing to do if no normal run exists, or it starts at the very first page.
  if (normalSequenceStart === null || normalSequenceStart <= 0) return;
  const normalStartPdfPage = pdfPages[normalSequenceStart];
  const normalStartPageNo = pageRangeMap[normalStartPdfPage].startPageNo;
  let hasOutliers = false;
  for (let i = 0; i < normalSequenceStart; i++) {
    const pdfPage = pdfPages[i];
    const pageNo = pageRangeMap[pdfPage].startPageNo;
    if (pageNo === 0) continue; // already marked failed
    const pdfDiff = normalStartPdfPage - pdfPage;
    // Step size for the backwards extrapolation depends on whether the
    // anchor page is a double-sided scan (2 printed pages per PDF page).
    const isDoubleSided = this.isDoubleSidedRange(
      pageRangeMap[normalStartPdfPage]
    );
    const expectedPageNo = isDoubleSided ? normalStartPageNo - pdfDiff * 2 : normalStartPageNo - pdfDiff;
    // +10 slack tolerates unnumbered front-matter before flagging an outlier.
    if (pageNo > expectedPageNo + 10) {
      this.log(
        "info",
        `Outlier detected: PDF ${pdfPage}=${pageNo} (expected ~${expectedPageNo})`
      );
      pageRangeMap[pdfPage] = { startPageNo: 0, endPageNo: 0 };
      hasOutliers = true;
    }
  }
  if (hasOutliers) {
    this.log("info", `Outliers marked as failed, will be backfilled later`);
  }
}
|
|
3013
|
+
/**
|
|
3014
|
+
* Find the start index of a "normal" sequence in the page range map
|
|
3015
|
+
*
|
|
3016
|
+
* A normal sequence is defined as at least 3 consecutive PDF pages where:
|
|
3017
|
+
* - Page numbers are increasing (for single-page) or increasing by 2 (for double-sided)
|
|
3018
|
+
* - The pattern is consistent
|
|
3019
|
+
*
|
|
3020
|
+
* Returns the index in pdfPages array, or null if not found.
|
|
3021
|
+
*/
|
|
3022
|
+
findNormalSequenceStart(pageRangeMap, pdfPages) {
|
|
3023
|
+
const MIN_SEQUENCE_LENGTH = 3;
|
|
3024
|
+
for (let startIdx = 0; startIdx <= pdfPages.length - MIN_SEQUENCE_LENGTH; startIdx++) {
|
|
3025
|
+
let isValidSequence = true;
|
|
3026
|
+
let expectedIncrement = null;
|
|
3027
|
+
for (let i = 0; i < MIN_SEQUENCE_LENGTH - 1; i++) {
|
|
3028
|
+
const currPdfPage = pdfPages[startIdx + i];
|
|
3029
|
+
const nextPdfPage = pdfPages[startIdx + i + 1];
|
|
3030
|
+
const currRange = pageRangeMap[currPdfPage];
|
|
3031
|
+
const nextRange = pageRangeMap[nextPdfPage];
|
|
3032
|
+
if (currRange.startPageNo === 0 || nextRange.startPageNo === 0) {
|
|
3033
|
+
isValidSequence = false;
|
|
3034
|
+
break;
|
|
3035
|
+
}
|
|
3036
|
+
const pageIncrement = nextRange.startPageNo - currRange.startPageNo;
|
|
3037
|
+
const pdfIncrement = nextPdfPage - currPdfPage;
|
|
3038
|
+
const isDoubleSided = this.isDoubleSidedRange(currRange);
|
|
3039
|
+
const expectedIncrementPerPdf = isDoubleSided ? 2 : 1;
|
|
3040
|
+
const expected = pdfIncrement * expectedIncrementPerPdf;
|
|
3041
|
+
if (expectedIncrement === null) {
|
|
3042
|
+
expectedIncrement = pageIncrement;
|
|
3043
|
+
}
|
|
3044
|
+
if (pageIncrement !== expected) {
|
|
3045
|
+
isValidSequence = false;
|
|
3046
|
+
break;
|
|
3047
|
+
}
|
|
3048
|
+
}
|
|
3049
|
+
if (isValidSequence) {
|
|
3050
|
+
return startIdx;
|
|
3051
|
+
}
|
|
3052
|
+
}
|
|
3053
|
+
return null;
|
|
3054
|
+
}
|
|
3055
|
+
/**
|
|
3056
|
+
* Check if a page range represents a double-sided scan
|
|
3057
|
+
*/
|
|
3058
|
+
isDoubleSidedRange(range) {
|
|
3059
|
+
return range.endPageNo !== null && range.endPageNo !== range.startPageNo && range.endPageNo === range.startPageNo + 1;
|
|
3060
|
+
}
|
|
3061
|
+
/**
 * Detect and handle page number drops
 *
 * When page numbers suddenly decrease (e.g., 8,9 -> 3,4),
 * recalculate previous pages based on the drop point.
 *
 * At the first drop of more than 1, every EARLIER page is re-derived by
 * extrapolating backwards from the post-drop value (step 2 when the
 * post-drop entry is double-sided, else 1); results below 1 become the
 * {0,0} failure marker. Mutates pageRangeMap in place.
 */
detectAndHandleDrops(pageRangeMap) {
  const pdfPages = Object.keys(pageRangeMap).map(Number).sort((a, b) => a - b);
  if (pdfPages.length < 2) return;
  for (let i = 1; i < pdfPages.length; i++) {
    const prevPdfPage = pdfPages[i - 1];
    const currPdfPage = pdfPages[i];
    const prevPageNo = pageRangeMap[prevPdfPage].startPageNo;
    const currPageNo = pageRangeMap[currPdfPage].startPageNo;
    // Skip pairs involving failed entries; they carry no ordering signal.
    if (prevPageNo === 0 || currPageNo === 0) continue;
    // A drop of exactly 1 is ignored (plausible numbering quirk).
    if (currPageNo > 0 && prevPageNo > currPageNo && prevPageNo - currPageNo > 1) {
      this.log(
        "info",
        `Page drop detected: PDF ${prevPdfPage}=${prevPageNo} -> PDF ${currPdfPage}=${currPageNo}`
      );
      // Backwards step size comes from the post-drop (trusted) entry.
      const isDoubleSided = this.isDoubleSidedRange(
        pageRangeMap[currPdfPage]
      );
      // Rewrite ALL pages before the drop point, walking backwards.
      for (let j = i - 1; j >= 0; j--) {
        const pdfPage = pdfPages[j];
        const distance = currPdfPage - pdfPage;
        if (isDoubleSided) {
          const expectedStartPageNo = currPageNo - distance * 2;
          if (expectedStartPageNo < 1) {
            // Would go below page 1: mark failed instead of extrapolating.
            pageRangeMap[pdfPage] = { startPageNo: 0, endPageNo: 0 };
          } else {
            pageRangeMap[pdfPage] = {
              startPageNo: expectedStartPageNo,
              endPageNo: expectedStartPageNo + 1
            };
          }
        } else {
          const expectedPageNo = currPageNo - distance;
          if (expectedPageNo < 1) {
            pageRangeMap[pdfPage] = { startPageNo: 0, endPageNo: 0 };
          } else {
            pageRangeMap[pdfPage] = {
              startPageNo: expectedPageNo,
              endPageNo: expectedPageNo
            };
          }
        }
        this.log(
          "info",
          `Recalculated PDF ${pdfPage} -> ${pageRangeMap[pdfPage].startPageNo}`
        );
      }
    }
  }
}
|
|
3116
|
+
/**
|
|
3117
|
+
* Normalize negative page numbers to 0
|
|
3118
|
+
*/
|
|
3119
|
+
normalizeNegatives(pageRangeMap) {
|
|
3120
|
+
for (const [pdfPageStr, range] of Object.entries(pageRangeMap)) {
|
|
3121
|
+
if (range.startPageNo < 0 || range.endPageNo < 0) {
|
|
3122
|
+
this.log("info", `Normalizing negative: PDF ${pdfPageStr} -> 0`);
|
|
3123
|
+
pageRangeMap[Number(pdfPageStr)] = { startPageNo: 0, endPageNo: 0 };
|
|
3124
|
+
}
|
|
3125
|
+
}
|
|
3126
|
+
}
|
|
3127
|
+
/**
 * Backfill pages marked with 0 using detected pattern
 *
 * Pages whose startPageNo is 0 (extraction failed) are re-estimated from the
 * successfully numbered pages: a majority vote decides whether the document is
 * double-sided, then an average offset maps PDF page index -> logical page.
 * Requires at least 2 successful pages; mutates `pageRangeMap` in place.
 */
backfillFailedPages(pageRangeMap) {
  const pdfPages = Object.keys(pageRangeMap).map(Number).sort((a, b) => a - b);
  // Failed pages carry the sentinel startPageNo === 0.
  const failedPages = pdfPages.filter(
    (p) => pageRangeMap[p].startPageNo === 0
  );
  if (failedPages.length === 0) return;
  const successfulPages = pdfPages.filter((p) => pageRangeMap[p].startPageNo > 0).map((p) => ({
    pdfPage: p,
    pageNo: pageRangeMap[p].startPageNo,
    isDoubleSided: this.isDoubleSidedRange(pageRangeMap[p])
  }));
  // With fewer than 2 anchors there is no pattern to extrapolate from.
  if (successfulPages.length < 2) {
    this.log("warn", "Not enough successful pages for backfill");
    return;
  }
  // Majority vote: treat the whole document as double-sided if more than
  // half of the successful pages are double-sided spreads.
  const doubleSidedCount = successfulPages.filter(
    (s) => s.isDoubleSided
  ).length;
  const isDoubleSided = doubleSidedCount > successfulPages.length / 2;
  if (isDoubleSided) {
    // Double-sided model: pageNo ≈ pdfPage * 2 + offset; average the offset
    // over all anchors to absorb per-page extraction noise.
    const offsets = successfulPages.map((s) => s.pageNo - s.pdfPage * 2);
    const avgOffset = Math.round(
      offsets.reduce((a, b) => a + b, 0) / offsets.length
    );
    this.log(
      "info",
      `Backfilling ${failedPages.length} pages with double-sided pattern (offset=${avgOffset})`
    );
    for (const pdfPage of failedPages) {
      const expectedStartPageNo = pdfPage * 2 + avgOffset;
      // Never backfill below logical page 1 (e.g., front matter).
      if (expectedStartPageNo < 1) {
        this.log(
          "info",
          `Backfill skipped for PDF ${pdfPage} (would be ${expectedStartPageNo})`
        );
        continue;
      }
      this.log(
        "info",
        `Backfill PDF ${pdfPage}: 0 -> ${expectedStartPageNo}-${expectedStartPageNo + 1}`
      );
      pageRangeMap[pdfPage] = {
        startPageNo: expectedStartPageNo,
        endPageNo: expectedStartPageNo + 1
      };
    }
  } else {
    // Single-page model: pageNo ≈ pdfPage + offset.
    const offsets = successfulPages.map((s) => s.pageNo - s.pdfPage);
    const avgOffset = Math.round(
      offsets.reduce((a, b) => a + b, 0) / offsets.length
    );
    this.log(
      "info",
      `Backfilling ${failedPages.length} pages with offset ${avgOffset}`
    );
    for (const pdfPage of failedPages) {
      const expectedPageNo = pdfPage + avgOffset;
      // Never backfill below logical page 1.
      if (expectedPageNo < 1) {
        this.log(
          "info",
          `Backfill skipped for PDF ${pdfPage} (would be ${expectedPageNo})`
        );
        continue;
      }
      this.log("info", `Backfill PDF ${pdfPage}: 0 -> ${expectedPageNo}`);
      pageRangeMap[pdfPage] = {
        startPageNo: expectedPageNo,
        endPageNo: expectedPageNo
      };
    }
  }
}
|
|
3202
|
+
/**
 * Build system prompt for Vision LLM
 *
 * Returns the static instruction prompt telling the vision model how to
 * extract printed page numbers from document page images (single vs.
 * double-sided scans, where to look, and what to ignore). The template
 * literal is runtime text and must remain byte-identical.
 */
buildSystemPrompt() {
  return `You are a page number extraction specialist for document images.
You will receive multiple document page images. For EACH image, extract the visible page number(s).

**SCAN TYPES:**
1. SINGLE PAGE: One document page per image. Return startPageNo only, endPageNo should be null.
2. DOUBLE-SIDED: Two document pages per image (spread). Return startPageNo (left) and endPageNo (right).

**WHERE TO LOOK:**
- Bottom center, bottom corners (most common)
- Top corners (less common)
- Page numbers are SMALL numbers in MARGINS, NOT in content area

**WHAT TO IGNORE - These are NOT page numbers:**
- Roman numerals (i, ii, iii, iv, v...) - return null
- Figure numbers: "Figure 5", "Fig. 5", "\uB3C4 5", "\uADF8\uB9BC 5"
- Table numbers: "Table 3", "\uD45C 3"
- Photo numbers: "Photo 8", "\uC0AC\uC9C4 8", "Plate 4", "\uB3C4\uD310 4"
- Years in content: "2015", "(1998)"
- Any numbers with text prefix or inside content area

**RESPONSE FORMAT:**
For each image (in order), provide:
- imageIndex: 0-based index of the image
- startPageNo: The page number found (null if not visible/readable)
- endPageNo: Right page number for double-sided scans (null for single pages)`;
}
|
|
3232
|
+
/**
|
|
3233
|
+
* Build user prompt for Vision LLM
|
|
3234
|
+
*/
|
|
3235
|
+
buildUserPrompt(pageNos) {
|
|
3236
|
+
return `I am providing ${pageNos.length} document page images.
|
|
3237
|
+
These are PDF pages: ${pageNos.join(", ")}.
|
|
3238
|
+
|
|
3239
|
+
For each image (in order), extract the visible page number(s).
|
|
3240
|
+
Return null for pages where no page number is visible or readable.
|
|
3241
|
+
|
|
3242
|
+
Remember: Look for SMALL numbers in MARGINS only. Ignore figure/table/photo numbers.`;
|
|
3243
|
+
}
|
|
3244
|
+
};
|
|
3245
|
+
|
|
3246
|
+
// src/validators/base-validator.ts
var BaseValidator = class extends TextLLMComponent {
  /**
   * Validator name for logging (kept for backwards compatibility)
   */
  validatorName;
  /**
   * Constructor for BaseValidator
   *
   * @param logger - Logger instance
   * @param model - Language model to use for validation
   * @param validatorName - Name of the validator for logging (e.g., "TocContentValidator")
   * @param options - Optional configuration (maxRetries, temperature)
   * @param fallbackModel - Optional fallback model for retry on failure
   * @param aggregator - Optional token usage aggregator for tracking LLM calls
   */
  constructor(logger, model, validatorName, options, fallbackModel, aggregator) {
    super(logger, model, validatorName, options, fallbackModel, aggregator);
    this.validatorName = validatorName;
  }
  /**
   * Call LLM with LLMCaller
   *
   * Backwards-compatible wrapper around LLMCaller.call that lets callers
   * supply their own usage aggregator; otherwise usage is tracked on this
   * component via trackUsage.
   *
   * @param schema - Zod schema for response validation
   * @param systemPrompt - System prompt
   * @param userPrompt - User prompt
   * @param phase - Phase name for tracking (e.g., 'validation', 'batch-validation')
   * @param aggregator - Optional token usage aggregator for tracking this call
   * @returns Parsed and validated LLM response with usage information
   */
  async callLLM(schema, systemPrompt, userPrompt, phase, aggregator) {
    // Assemble the request separately so the call site stays readable.
    const request = {
      schema,
      systemPrompt,
      userPrompt,
      primaryModel: this.model,
      fallbackModel: this.fallbackModel,
      maxRetries: this.maxRetries,
      temperature: this.temperature,
      abortSignal: this.abortSignal,
      component: this.validatorName,
      phase
    };
    const { output, usage } = await LLMCaller.call(request);
    // Route usage to the caller-provided aggregator when present,
    // otherwise fall back to this component's own tracking.
    if (aggregator) {
      aggregator.track(usage);
    } else {
      this.trackUsage(usage);
    }
    return { output, usage };
  }
};
|
|
3303
|
+
|
|
3304
|
+
// src/validators/toc-content-validator.ts
var import_zod5 = require("zod");
// Structured-output schema for a single TOC-validation verdict from the LLM.
// The .describe() strings are part of the prompt contract sent to the model.
var TocContentValidationSchema = import_zod5.z.object({
  isToc: import_zod5.z.boolean().describe("Whether the content is a table of contents"),
  confidence: import_zod5.z.number().min(0).max(1).describe("Confidence score between 0 and 1"),
  reason: import_zod5.z.string().describe("Brief explanation for the decision")
});
|
|
3311
|
+
var TocContentValidator = class extends BaseValidator {
  /** Minimum confidence required by isValid() to accept a positive verdict. */
  confidenceThreshold;
  /**
   * @param logger - Logger instance
   * @param model - Language model used for validation
   * @param options - Optional config; confidenceThreshold defaults to 0.7
   * @param fallbackModel - Optional fallback model for retry on failure
   * @param aggregator - Optional token usage aggregator
   */
  constructor(logger, model, options, fallbackModel, aggregator) {
    super(
      logger,
      model,
      "TocContentValidator",
      options,
      fallbackModel,
      aggregator
    );
    this.confidenceThreshold = options?.confidenceThreshold ?? 0.7;
  }
  /**
   * Validate if the markdown content is a table of contents
   *
   * @param markdown - Markdown content to validate
   * @returns Validation result with isToc, confidence, and reason
   */
  async validate(markdown) {
    this.logger.info(
      `[TocContentValidator] Validating content (${markdown.length} chars)`
    );
    // Blank input is trivially not a TOC; skip the LLM round-trip.
    if (!markdown.trim()) {
      this.logger.info(
        "[TocContentValidator] Empty markdown, returning invalid"
      );
      return {
        isToc: false,
        confidence: 1,
        reason: "Empty content"
      };
    }
    const systemPrompt = this.buildSystemPrompt();
    const userPrompt = this.buildUserPrompt(markdown);
    const { output: verdict } = await this.callLLM(
      TocContentValidationSchema,
      systemPrompt,
      userPrompt,
      "validation",
      this.aggregator
    );
    this.logger.info(
      `[TocContentValidator] Result: isToc=${verdict.isToc}, confidence=${verdict.confidence}`
    );
    return verdict;
  }
  /**
   * Check if validation result passes threshold
   *
   * @param result - Validation result from validate()
   * @returns true if content is valid TOC with sufficient confidence
   */
  isValid(result) {
    const meetsThreshold = result.confidence >= this.confidenceThreshold;
    return result.isToc && meetsThreshold;
  }
  /**
   * Build system prompt for TOC content validation.
   * The template literal is runtime prompt text and is kept byte-identical.
   */
  buildSystemPrompt() {
    return `You are a document structure analyst. Your task is to determine if the provided content is a Table of Contents (TOC).

## What IS a Table of Contents:
- A structured list of chapters/sections with corresponding page numbers
- Contains hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction", etc.)
- Has page number references for each entry (e.g., "..... 10", "... 5", or just a number at the end)
- Multiple entries organized by document structure
- Main document outline listing major chapters and sections

## What is NOT a Table of Contents:
- Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, Photo Index, List of Figures, List of Photos)
- Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
- Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, Drawing Index, List of Drawings)
- Appendix indices (\uBD80\uB85D \uBAA9\uCC28, Appendix Index)
- Random body text from the document
- Single entries or incomplete lists (fewer than 3 items)
- Reference lists or bibliographies
- Index pages (alphabetical keyword lists)

## Response Guidelines:
- Set isToc to true ONLY if content is clearly a main document TOC
- Set confidence between 0.0 and 1.0 based on your certainty
- Provide a brief reason explaining your decision (1-2 sentences)`;
  }
  /**
   * Build user prompt with markdown content
   */
  buildUserPrompt(markdown) {
    return `Determine if the following content is a Table of Contents:

${markdown}`;
  }
};
|
|
3402
|
+
|
|
3403
|
+
// src/validators/caption-validator.ts
var import_zod6 = require("zod");
// Structured-output schema for one caption verdict; index ties the verdict
// back to its position in the batch sent to the LLM.
var CaptionValidationItemSchema = import_zod6.z.object({
  index: import_zod6.z.number().int().describe("Index of the caption in the input array"),
  isValid: import_zod6.z.boolean().describe("Whether the parsed caption is correct"),
  reason: import_zod6.z.string().nullable().describe("Brief explanation if invalid, null if valid")
});
// Envelope for a whole batch of caption verdicts.
var CaptionValidationBatchSchema = import_zod6.z.object({
  results: import_zod6.z.array(CaptionValidationItemSchema)
});
|
|
3413
|
+
var CaptionValidator = class extends BaseValidator {
  /**
   * @param logger - Logger instance
   * @param model - Language model used for validation
   * @param options - Optional configuration forwarded to BaseValidator
   * @param fallbackModel - Optional fallback model for retry on failure
   * @param aggregator - Optional token usage aggregator
   */
  constructor(logger, model, options, fallbackModel, aggregator) {
    super(
      logger,
      model,
      "CaptionValidator",
      options,
      fallbackModel,
      aggregator
    );
  }
  /**
   * Validate batch of parsed captions against original texts
   *
   * @param captions - Array of parsed Caption objects
   * @param originalTexts - Array of original caption texts (same order as captions)
   * @param batchSize - Batch size for processing. Set to 0 to skip validation (assume all valid).
   * @returns Array of validation results (boolean) maintaining original order
   * @throws {Error} when captions and originalTexts lengths differ
   * @throws {CaptionValidationError} when LLM batch validation fails
   */
  async validateBatch(captions, originalTexts, batchSize) {
    this.logger.info(
      `[CaptionValidator] Validating ${captions.length} captions with batch size ${batchSize}...`
    );
    // The two arrays are correlated by index; a mismatch is a caller bug.
    if (captions.length !== originalTexts.length) {
      throw new Error(
        `[CaptionValidator] Captions and originalTexts length mismatch: ${captions.length} vs ${originalTexts.length}`
      );
    }
    if (captions.length === 0) {
      this.logger.info("[CaptionValidator] No captions to validate");
      return [];
    }
    // batchSize === 0 is an explicit opt-out: treat every caption as valid.
    if (batchSize === 0) {
      this.logger.info(
        "[CaptionValidator] Skipping validation (batchSize=0), assuming all captions are valid"
      );
      return new Array(captions.length).fill(true);
    }
    try {
      // Attach the original index so results can be restored to input order
      // after batched (possibly reordered) processing.
      const indexedItems = captions.map((caption, index) => ({
        index,
        caption,
        originalText: originalTexts[index]
      }));
      const batchResults = await BatchProcessor.processBatch(
        indexedItems,
        batchSize,
        async (batch) => this.validateBatchInternal(batch, this.model)
      );
      // Restore input order before projecting to booleans.
      batchResults.sort((a, b) => a.index - b.index);
      const results = batchResults.map((r) => r.isValid);
      const validCount = results.filter((r) => r).length;
      this.logger.info(
        `[CaptionValidator] Completed: ${validCount}/${results.length} captions validated as correct`
      );
      if (this.aggregator) {
        this.aggregator.logSummary(this.logger);
      }
      return results;
    } catch (error) {
      // Wrap any failure in a domain error, preserving the original cause.
      const message = error instanceof Error ? error.message : String(error);
      this.logger.error(`[CaptionValidator] Validation failed: ${message}`);
      throw new CaptionValidationError(
        `Failed to validate captions: ${message}`,
        { cause: error }
      );
    }
  }
  /**
   * Internal: Validate batch of captions using LLM
   *
   * @param items - Batch of caption items with original indices
   * @param model - Effective model to use
   * @returns Array of validation results indexed correctly
   */
  async validateBatchInternal(items, model) {
    const result = await LLMCaller.call({
      schema: CaptionValidationBatchSchema,
      systemPrompt: this.buildSystemPrompt(),
      userPrompt: this.buildUserPrompt(items),
      primaryModel: model,
      fallbackModel: this.fallbackModel,
      maxRetries: this.maxRetries,
      temperature: this.temperature,
      abortSignal: this.abortSignal,
      component: "CaptionValidator",
      phase: "validation"
    });
    if (this.aggregator) {
      this.aggregator.track(result.usage);
    }
    // Keep only the fields callers consume (index + verdict).
    return result.output.results.map((item) => ({
      index: item.index,
      isValid: item.isValid
    }));
  }
  // System prompt describing the caption-prefix extraction contract.
  // The template literal is runtime prompt text and is kept byte-identical.
  buildSystemPrompt() {
    return `You are a caption validation expert for archaeological excavation reports.

Your task is to validate whether parsed caption prefixes (num field) are correctly extracted from original caption texts.

## Caption Pattern Recognition

A valid caption follows the pattern: <prefix word(s)> <number>
- The prefix can be ANY Korean/English word(s) that label images/tables/figures
- Common examples: \uB3C4\uD310, \uC0AC\uC9C4, \uADF8\uB9BC, \uC6D0\uC0C9\uC0AC\uC9C4, \uD751\uBC31\uC0AC\uC9C4, Figure, Photo, Plate, etc.
- The key is the PATTERN (text followed by number), not a specific word list
- Leading punctuation/brackets should be IGNORED when extracting

Valid caption patterns:
- "\uC6D0\uC0C9\uC0AC\uC9C4 1. \uC870\uC0AC\uC9C0\uC5ED" \u2192 num="\uC6D0\uC0C9\uC0AC\uC9C4 1" \u2713
- "\uD751\uBC31\uC0AC\uC9C4 2 \uCD9C\uD1A0\uC720\uBB3C" \u2192 num="\uD751\uBC31\uC0AC\uC9C4 2" \u2713
- "\uB3C4\uD310 1 \uC720\uC801 \uC804\uACBD" \u2192 num="\uB3C4\uD310 1" \u2713
- "(\uC0AC\uC9C4 16> \uB290\uD2F0\uB098\uBB34" \u2192 num="\uC0AC\uC9C4 16" \u2713 (ignore leading punctuation)
- "<\uB3C4\uD310 3> \uC720\uBB3C \uC0AC\uC9C4" \u2192 num="\uB3C4\uD310 3" \u2713 (ignore angle brackets)

Invalid patterns (num MUST be null):
- "39 3\uC6D4 28\uC77C(\uBC31\uC81C \uB3C4\uB85C\uC720\uAD6C)" \u2192 null \u2713 (starts with number, no prefix)
- "1. \uC720\uC801 \uC804\uACBD" \u2192 null \u2713 (numbered list item, not a caption)
- "2024\uB144 \uC870\uC0AC \uD604\uD669" \u2192 null \u2713 (year reference, not a caption)

## Extraction Algorithm:

1. Extract prefix + number from the caption
- The prefix is the text portion before the number
- Full extraction: "\uC6D0\uC0C9\uC0AC\uC9C4 1", "\uB3C4\uD310 2-3", "\uADF8\uB9BC 3.6", "Figure 4a"

2. **Decimal point handling**: Include period/dot after number if directly following
- "\uADF8\uB9BC 3.6. \uD55C\uBC18\uB3C4" \u2192 "\uADF8\uB9BC 3.6" (period as decimal separator included)
- "\uB3C4\uD310 2. \uC720\uC801" \u2192 "\uB3C4\uD310 2" (period after space, NOT included)

3. **Stop rules** (extraction must stop at first occurrence of):
- Punctuation (except decimal point): , : ; ! ? ~ ( ) [ ] { }
- Whitespace: space, tab, newline
- Underscore: _
- Exception: Periods directly after digits are included as decimal separators
- Exception: Hyphens within numbers are included (e.g., "2-3")

## Validation Rules:

1. **Pattern requirement**: The original text MUST follow <prefix> <number> pattern
- "\uC6D0\uC0C9\uC0AC\uC9C4 1. \uC870\uC0AC\uC9C0\uC5ED" \u2192 num="\uC6D0\uC0C9\uC0AC\uC9C4 1" \u2713 (valid pattern)
- "39 3\uC6D4 28\uC77C(\uBC31\uC81C)" \u2192 num="39" \u2717 (starts with number, should be null)
- "1. \uC870\uC0AC \uAC1C\uC694" \u2192 num="1" \u2717 (numbered list, should be null)

2. **Correctness**: The parsed "num" must contain the actual prefix+number
- "\uB3C4\uD310 1 \uC720\uC801 \uC804\uACBD" \u2192 num="\uB3C4\uD310 1" \u2713
- "\uB3C4\uD310 1 \uC720\uC801 \uC804\uACBD" \u2192 num="\uB3C4\uD310" \u2717 (incomplete)

3. **Spacing**: The spacing in "num" must match the original text exactly
- "\uB3C4\uD310 1" \u2192 num="\uB3C4\uD310 1" \u2713
- "\uB3C4\uD3101" \u2192 num="\uB3C4\uD3101" \u2713
- "\uB3C4\uD310 1" \u2192 num="\uB3C4\uD3101" \u2717 (spacing mismatch)

4. **Completeness**: The number part must be fully extracted
- "Figure 2-3" \u2192 num="Figure 2-3" \u2713
- "Figure 2-3" \u2192 num="Figure 2" \u2717 (incomplete number)

5. **Null handling**: If "num" is null, verify:
- Either the original text has no number
- OR the text starts with a number (no prefix)
- "\uC720\uC801 \uC804\uACBD \uC0AC\uC9C4" \u2192 num=null \u2713 (no number in caption position)
- "\uC6D0\uC0C9\uC0AC\uC9C4 1 \uC870\uC0AC" \u2192 num=null \u2717 (should extract "\uC6D0\uC0C9\uC0AC\uC9C4 1")

## Response:
For each caption, return:
- index: original position
- isValid: true if parsing is correct, false otherwise
- reason: null if valid, brief explanation if invalid`;
  }
  // Renders each caption as "[index] Original | Parsed num" for the LLM.
  // NOTE(review): the ternary distinguishes only undefined (void 0); a num of
  // null would render as the quoted string "null" — confirm Caption.num uses
  // undefined (not null) for the missing case.
  buildUserPrompt(items) {
    const captionList = items.map(
      (item) => `[${item.index}] Original: "${item.originalText}" | Parsed num: ${item.caption.num !== void 0 ? `"${item.caption.num}"` : "null"}`
    ).join("\n");
    return `Validate the following caption parsing results:

${captionList}

Return the results as JSON array with "index", "isValid", and "reason" (null if valid, explanation if invalid).

Example format:
{
  "results": [
    { "index": 0, "isValid": true, "reason": null },
    { "index": 1, "isValid": false, "reason": "Number incomplete: expected '1-2' but got '1'" },
    { "index": 2, "isValid": true, "reason": null }
  ]
}`;
  }
};
|
|
3603
|
+
// Domain error thrown when LLM-based caption validation fails; callers can
// filter on err.name and inspect err.cause for the underlying failure.
var CaptionValidationError = class extends Error {
  /** Distinguishing name so callers can match on err.name. */
  name = "CaptionValidationError";
  /**
   * @param message - Human-readable failure description
   * @param options - Standard Error options; `cause` carries the original error
   */
  constructor(message, options) {
    super(message, options);
  }
};
|
|
3609
|
+
|
|
3610
|
+
// src/document-processor.ts
var DocumentProcessor = class {
  // Logger used across all processing phases.
  logger;
  // Default model; also the retry fallback when enableFallbackRetry is on.
  fallbackModel;
  // Per-step models; each defaults to fallbackModel in the constructor.
  pageRangeParserModel;
  tocExtractorModel;
  validatorModel;
  visionTocExtractorModel;
  captionParserModel;
  // Batch sizes forwarded to the corresponding batch-processing components.
  textCleanerBatchSize;
  captionParserBatchSize;
  captionValidatorBatchSize;
  // Max LLM retry count (defaults to 3 in the constructor).
  maxRetries;
  // When true, sub-components receive fallbackModel for retry-on-failure.
  enableFallbackRetry;
  // Optional AbortSignal checked between pipeline phases (see checkAborted).
  abortSignal;
  // Generates ids for produced document entities.
  idGenerator = new IdGenerator();
  // Sub-processors; created lazily in initializeProcessors() — presumably all
  // of them, though only some assignments are visible here. TODO confirm.
  refResolver;
  pageRangeParser;
  tocFinder;
  tocExtractor;
  tocContentValidator;
  captionValidator;
  visionTocExtractor;
  captionParser;
  chapterConverter;
  // Static text-cleaning utility (class reference, not an instance).
  textCleaner = TextCleaner;
  // Aggregates token usage across all LLM calls in one process() run.
  usageAggregator = new LLMTokenUsageAggregator();
|
|
3637
|
+
constructor(options) {
|
|
3638
|
+
this.logger = options.logger;
|
|
3639
|
+
this.fallbackModel = options.fallbackModel;
|
|
3640
|
+
this.pageRangeParserModel = options.pageRangeParserModel ?? options.fallbackModel;
|
|
3641
|
+
this.tocExtractorModel = options.tocExtractorModel ?? options.fallbackModel;
|
|
3642
|
+
this.validatorModel = options.validatorModel ?? options.fallbackModel;
|
|
3643
|
+
this.visionTocExtractorModel = options.visionTocExtractorModel ?? options.fallbackModel;
|
|
3644
|
+
this.captionParserModel = options.captionParserModel ?? options.fallbackModel;
|
|
3645
|
+
this.textCleanerBatchSize = options.textCleanerBatchSize;
|
|
3646
|
+
this.captionParserBatchSize = options.captionParserBatchSize;
|
|
3647
|
+
this.captionValidatorBatchSize = options.captionValidatorBatchSize;
|
|
3648
|
+
this.maxRetries = options.maxRetries ?? 3;
|
|
3649
|
+
this.enableFallbackRetry = options.enableFallbackRetry ?? false;
|
|
3650
|
+
this.abortSignal = options.abortSignal;
|
|
3651
|
+
}
|
|
3652
|
+
/**
|
|
3653
|
+
* Check if abort has been requested and throw error if so
|
|
3654
|
+
*
|
|
3655
|
+
* @throws {Error} with name 'AbortError' if aborted
|
|
3656
|
+
*/
|
|
3657
|
+
checkAborted() {
|
|
3658
|
+
if (this.abortSignal?.aborted) {
|
|
3659
|
+
const error = new Error("Document processing was aborted");
|
|
3660
|
+
error.name = "AbortError";
|
|
3661
|
+
throw error;
|
|
3662
|
+
}
|
|
3663
|
+
}
|
|
3664
|
+
/**
 * Converts DoclingDocument to ProcessedDocument with token usage tracking.
 *
 * Conversion process:
 * 1. Initialize processors and resolvers
 * 2. Normalize and filter texts
 * 3. Clean texts and parse page ranges (parallel)
 * 4. Extract table of contents
 * 5. Convert images and tables (parallel)
 * 6. Convert chapters and link resources
 * 7. Assemble final ProcessedDocument
 * 8. Collect and report token usage
 *
 * Abort is checked between phases via checkAborted(); each phase's wall
 * time is logged individually.
 *
 * @param doclingDoc - Original document extracted from Docling SDK
 * @param reportId - Report unique identifier
 * @param outputPath - Path containing images and pages subdirectories (images/image_0.png, pages/page_0.png, etc.)
 * @returns Document processing result with ProcessedDocument and token usage report
 *
 * @throws {TocExtractError} When TOC extraction fails
 * @throws {PageRangeParseError} When page range parsing fails
 * @throws {ConversionError} When error occurs during conversion
 */
async process(doclingDoc, reportId, outputPath) {
  this.logger.info("[DocumentProcessor] Starting document processing...");
  this.logger.info("[DocumentProcessor] Report ID:", reportId);
  // Fresh usage report per run; the aggregator is shared with sub-processors.
  this.usageAggregator.reset();
  this.checkAborted();
  this.initializeProcessors(doclingDoc, outputPath);
  // Phase: text normalization/filtering (synchronous).
  const startTimeFilter = Date.now();
  const filtered = this.normalizeAndFilterTexts(doclingDoc);
  const filteringTime = Date.now() - startTimeFilter;
  this.logger.info(
    `[DocumentProcessor] Text filtering took ${filteringTime}ms`
  );
  this.checkAborted();
  // Phase: map PDF pages to printed page-number ranges.
  const startTimePageRange = Date.now();
  const pageRangeMap = await this.parsePageRanges(doclingDoc);
  const pageRangeTime = Date.now() - startTimePageRange;
  this.logger.info(
    `[DocumentProcessor] Page range parsing took ${pageRangeTime}ms`
  );
  this.checkAborted();
  // Phase: table-of-contents extraction (uses the filtered texts).
  const startTimeToc = Date.now();
  const tocEntries = await this.extractTableOfContents(doclingDoc, filtered);
  const tocTime = Date.now() - startTimeToc;
  this.logger.info(`[DocumentProcessor] TOC extraction took ${tocTime}ms`);
  this.checkAborted();
  // Phase: convert images, tables, and footnotes.
  const startTimeResources = Date.now();
  const { images, tables, footnotes } = await this.convertResources(
    doclingDoc,
    outputPath
  );
  const resourcesTime = Date.now() - startTimeResources;
  this.logger.info(
    `[DocumentProcessor] Resource conversion took ${resourcesTime}ms`
  );
  this.checkAborted();
  // Phase: build chapters, linking TOC entries, page ranges, and resources.
  const startTimeChapters = Date.now();
  const chapters = await this.convertChapters(
    doclingDoc,
    tocEntries,
    pageRangeMap,
    images,
    tables,
    footnotes
  );
  const chaptersTime = Date.now() - startTimeChapters;
  this.logger.info(
    `[DocumentProcessor] Chapter conversion took ${chaptersTime}ms`
  );
  // Phase: assemble the final document (no abort check after this point).
  const startTimeAssemble = Date.now();
  const processedDoc = this.assembleProcessedDocument(
    reportId,
    pageRangeMap,
    chapters,
    images,
    tables,
    footnotes
  );
  const assembleTime = Date.now() - startTimeAssemble;
  this.logger.info(
    `[DocumentProcessor] Document assembly took ${assembleTime}ms`
  );
  this.logger.info("[DocumentProcessor] Document processing completed");
  return {
    document: processedDoc,
    usage: this.usageAggregator.getReport()
  };
}
|
|
3753
|
+
  /**
   * Initialize all processors and resolvers
   *
   * Sets up RefResolver, PageRangeParser, TocFinder, TocExtractor,
   * TocContentValidator, CaptionValidator, VisionTocExtractor,
   * CaptionParser, and ChapterConverter as instance fields.
   *
   * @param doclingDoc - Parsed DoclingDocument handed to RefResolver
   * @param outputPath - Output directory passed to components that read page images
   */
  initializeProcessors(doclingDoc, outputPath) {
    this.logger.info("[DocumentProcessor] Initializing processors...");
    this.logger.info("[DocumentProcessor] - RefResolver");
    this.refResolver = new RefResolver(this.logger, doclingDoc);
    this.logger.info("[DocumentProcessor] - PageRangeParser");
    this.pageRangeParser = new PageRangeParser(
      this.logger,
      this.pageRangeParserModel,
      outputPath,
      this.maxRetries,
      // fallback model is only wired in when fallback retry is enabled
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.usageAggregator,
      this.abortSignal
    );
    this.logger.info("[DocumentProcessor] - TocFinder");
    this.tocFinder = new TocFinder(this.logger, this.refResolver);
    this.logger.info("[DocumentProcessor] - TocExtractor");
    this.tocExtractor = new TocExtractor(
      this.logger,
      this.tocExtractorModel,
      {
        maxRetries: this.maxRetries
      },
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.abortSignal
    );
    this.logger.info("[DocumentProcessor] - TocContentValidator");
    this.tocContentValidator = new TocContentValidator(
      this.logger,
      this.validatorModel,
      { maxRetries: this.maxRetries, abortSignal: this.abortSignal },
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.usageAggregator
    );
    this.logger.info("[DocumentProcessor] - CaptionValidator");
    this.captionValidator = new CaptionValidator(
      this.logger,
      this.validatorModel,
      { maxRetries: this.maxRetries, abortSignal: this.abortSignal },
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.usageAggregator
    );
    this.logger.info("[DocumentProcessor] - VisionTocExtractor");
    this.visionTocExtractor = new VisionTocExtractor(
      this.logger,
      this.visionTocExtractorModel,
      outputPath,
      { maxRetries: this.maxRetries, abortSignal: this.abortSignal },
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.usageAggregator
    );
    this.logger.info("[DocumentProcessor] - CaptionParser");
    this.captionParser = new CaptionParser(
      this.logger,
      this.captionParserModel,
      { maxRetries: this.maxRetries, abortSignal: this.abortSignal },
      this.enableFallbackRetry ? this.fallbackModel : void 0,
      this.usageAggregator
    );
    this.logger.info("[DocumentProcessor] - ChapterConverter");
    this.chapterConverter = new ChapterConverter(this.logger, this.idGenerator);
    this.logger.info("[DocumentProcessor] All processors initialized");
  }
|
|
3821
|
+
/**
|
|
3822
|
+
* Normalize and filter texts using TextCleaner
|
|
3823
|
+
*
|
|
3824
|
+
* Performs basic text normalization (unicode, whitespace, punctuation)
|
|
3825
|
+
* and filters out invalid texts (empty, numbers-only, etc.)
|
|
3826
|
+
*/
|
|
3827
|
+
normalizeAndFilterTexts(doclingDoc) {
|
|
3828
|
+
this.logger.info("[DocumentProcessor] Normalizing and filtering texts...");
|
|
3829
|
+
const texts = doclingDoc.texts.map((text) => text.text);
|
|
3830
|
+
const filtered = this.textCleaner.normalizeAndFilterBatch(
|
|
3831
|
+
texts,
|
|
3832
|
+
this.textCleanerBatchSize
|
|
3833
|
+
);
|
|
3834
|
+
this.logger.info(
|
|
3835
|
+
`[DocumentProcessor] Filtered ${filtered.length} texts from ${texts.length} original texts`
|
|
3836
|
+
);
|
|
3837
|
+
return filtered;
|
|
3838
|
+
}
|
|
3839
|
+
/**
|
|
3840
|
+
* Parse page ranges using Vision LLM
|
|
3841
|
+
*
|
|
3842
|
+
* Extracts actual page numbers from page images and creates mapping.
|
|
3843
|
+
* Token usage is automatically tracked by PageRangeParser into the shared aggregator.
|
|
3844
|
+
*/
|
|
3845
|
+
async parsePageRanges(doclingDoc) {
|
|
3846
|
+
this.logger.info("[DocumentProcessor] Starting page range parsing...");
|
|
3847
|
+
const result = await this.pageRangeParser.parse(doclingDoc);
|
|
3848
|
+
const pageRangeMap = result.pageRangeMap;
|
|
3849
|
+
this.logger.info(
|
|
3850
|
+
`[DocumentProcessor] Page range map entries: ${Object.keys(pageRangeMap).length}`
|
|
3851
|
+
);
|
|
3852
|
+
return pageRangeMap;
|
|
3853
|
+
}
|
|
3854
|
+
/**
|
|
3855
|
+
* Convert images, tables, and footnotes
|
|
3856
|
+
*
|
|
3857
|
+
* Runs conversions:
|
|
3858
|
+
* - Images conversion (with caption extraction)
|
|
3859
|
+
* - Tables conversion (with caption extraction, excluding TOC tables)
|
|
3860
|
+
* - Footnotes conversion (synchronous, from text items with label='footnote')
|
|
3861
|
+
*/
|
|
3862
|
+
async convertResources(doclingDoc, outputPath) {
|
|
3863
|
+
this.logger.info(
|
|
3864
|
+
"[DocumentProcessor] Converting images, tables, and footnotes..."
|
|
3865
|
+
);
|
|
3866
|
+
const [images, tables] = await Promise.all([
|
|
3867
|
+
this.convertImages(doclingDoc, outputPath),
|
|
3868
|
+
this.convertTables(doclingDoc)
|
|
3869
|
+
]);
|
|
3870
|
+
const footnotes = this.convertFootnotes(doclingDoc);
|
|
3871
|
+
this.logger.info(
|
|
3872
|
+
`[DocumentProcessor] Converted ${images.length} images, ${tables.length} tables, and ${footnotes.length} footnotes`
|
|
3873
|
+
);
|
|
3874
|
+
return { images, tables, footnotes };
|
|
3875
|
+
}
|
|
3876
|
+
/**
|
|
3877
|
+
* Convert footnotes
|
|
3878
|
+
*
|
|
3879
|
+
* Extracts footnotes from DoclingDocument text items with label='footnote'
|
|
3880
|
+
*/
|
|
3881
|
+
convertFootnotes(doclingDoc) {
|
|
3882
|
+
const footnoteItems = doclingDoc.texts.filter(
|
|
3883
|
+
(item) => item.label === "footnote"
|
|
3884
|
+
);
|
|
3885
|
+
this.logger.info(
|
|
3886
|
+
`[DocumentProcessor] Converting ${footnoteItems.length} footnotes...`
|
|
3887
|
+
);
|
|
3888
|
+
const footnotes = [];
|
|
3889
|
+
for (const item of footnoteItems) {
|
|
3890
|
+
if (!this.textCleaner.isValidText(item.text)) {
|
|
3891
|
+
continue;
|
|
3892
|
+
}
|
|
3893
|
+
const pdfPageNo = item.prov?.[0]?.page_no ?? 1;
|
|
3894
|
+
const footnoteId = this.idGenerator.generateFootnoteId();
|
|
3895
|
+
footnotes.push({
|
|
3896
|
+
id: footnoteId,
|
|
3897
|
+
text: this.textCleaner.normalize(item.text),
|
|
3898
|
+
pdfPageNo
|
|
3899
|
+
});
|
|
3900
|
+
}
|
|
3901
|
+
this.logger.info(
|
|
3902
|
+
`[DocumentProcessor] Converted ${footnotes.length} valid footnotes`
|
|
3903
|
+
);
|
|
3904
|
+
return footnotes;
|
|
3905
|
+
}
|
|
3906
|
+
/**
|
|
3907
|
+
* Assemble the final ProcessedDocument
|
|
3908
|
+
*
|
|
3909
|
+
* Creates the ProcessedDocument structure with all converted components
|
|
3910
|
+
*/
|
|
3911
|
+
assembleProcessedDocument(reportId, pageRangeMap, chapters, images, tables, footnotes) {
|
|
3912
|
+
this.logger.info("[DocumentProcessor] Assembling ProcessedDocument...");
|
|
3913
|
+
const processedDoc = {
|
|
3914
|
+
reportId,
|
|
3915
|
+
pageRangeMap,
|
|
3916
|
+
chapters,
|
|
3917
|
+
images,
|
|
3918
|
+
tables,
|
|
3919
|
+
footnotes
|
|
3920
|
+
};
|
|
3921
|
+
this.logger.info(
|
|
3922
|
+
`[DocumentProcessor] Assembled document with ${chapters.length} chapters, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
|
|
3923
|
+
);
|
|
3924
|
+
return processedDoc;
|
|
3925
|
+
}
|
|
3926
|
+
  /**
   * Extract table of contents (TOC)
   *
   * Uses rule-based extraction with LLM validation and vision fallback:
   * 1. TocFinder - find TOC area in document (rule-based)
   * 2. MarkdownConverter - convert TOC items to Markdown
   * 3. TocContentValidator - validate if content is actually a TOC (LLM)
   * 4. If invalid: VisionTocExtractor - extract from page images (vision LLM fallback)
   * 5. TocExtractor - LLM-based structured extraction
   *
   * @param doclingDoc - Parsed DoclingDocument to search
   * @param _filteredTexts - Unused; kept for call-site compatibility
   * @returns Top-level TOC entries, or an empty array when no TOC was found
   */
  async extractTableOfContents(doclingDoc, _filteredTexts) {
    this.logger.info("[DocumentProcessor] Extracting TOC...");
    // Holds the TOC as Markdown; stays null until a candidate passes validation.
    let markdown = null;
    try {
      // Step 1: rule-based search for the TOC area.
      const tocArea = this.tocFinder.find(doclingDoc);
      this.logger.info(
        `[DocumentProcessor] Found TOC area: pages ${tocArea.startPage}-${tocArea.endPage}`
      );
      // Step 2: render the found items to Markdown for the LLM validator.
      markdown = MarkdownConverter.convert(tocArea.itemRefs, this.refResolver);
      this.logger.info(
        `[DocumentProcessor] Converted TOC to Markdown (${markdown.length} chars)`
      );
      // Step 3: LLM check that the candidate content really is a TOC.
      const validation = await this.tocContentValidator.validate(markdown);
      if (!this.tocContentValidator.isValid(validation)) {
        this.logger.warn(
          `[DocumentProcessor] TOC validation failed: ${validation.reason}`
        );
        // Reset so the vision fallback below kicks in.
        markdown = null;
      } else {
        this.logger.info(
          `[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
        );
      }
    } catch (error) {
      if (error instanceof TocNotFoundError) {
        // Expected when the document has no rule-detectable TOC; fall through
        // to the vision fallback with markdown still null.
        this.logger.info(
          "[DocumentProcessor] Rule-based TOC not found, will try vision fallback"
        );
      } else {
        // Anything else is a real failure — propagate.
        throw error;
      }
    }
    if (!markdown) {
      // Step 4: vision fallback — extract TOC markdown from page images.
      this.logger.info("[DocumentProcessor] Using vision fallback for TOC");
      const totalPages = Object.keys(doclingDoc.pages).length;
      markdown = await this.visionTocExtractor.extract(totalPages);
      if (!markdown) {
        // No method found a TOC; caller falls back to a single chapter.
        this.logger.warn(
          "[DocumentProcessor] TOC not found in any method, returning empty"
        );
        return [];
      }
      this.logger.info(
        `[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
      );
    }
    // Step 5: structured extraction of entries from the Markdown TOC.
    const tocResult = await this.tocExtractor.extract(markdown);
    this.usageAggregator.track(tocResult.usage);
    this.logger.info(
      `[DocumentProcessor] Extracted ${tocResult.entries.length} top-level TOC entries`
    );
    return tocResult.entries;
  }
|
|
3989
|
+
  /**
   * Process resource captions (for images and tables)
   *
   * Common caption processing pipeline:
   * 1. Parse captions in batch
   * 2. Validate parsed captions
   * 3. Reparse failed captions with fallback model
   *
   * @param captionTexts - Array of caption texts to process; entries may be
   *   undefined for resources that have no caption (sparse by resource index)
   * @param resourceType - Type of resource for logging (e.g., 'image', 'table')
   * @returns Map from resource index to its parsed caption
   * @throws Error when a parser length mismatch cannot be recovered
   */
  async processResourceCaptions(captionTexts, resourceType) {
    // Final result: resource index -> parsed caption.
    const captionsByIndex = /* @__PURE__ */ new Map();
    // Compact the sparse input, remembering each text's original resource index.
    const validCaptionData = [];
    for (let i = 0; i < captionTexts.length; i++) {
      const text = captionTexts[i];
      if (text !== void 0) {
        validCaptionData.push({
          resourceIndex: i,
          filteredIndex: validCaptionData.length,
          text
        });
      }
    }
    const validCaptionTexts = validCaptionData.map((item) => item.text);
    // Step 1: batch-parse every present caption.
    const parsedCaptions = validCaptionTexts.length > 0 ? await this.captionParser.parseBatch(
      validCaptionTexts,
      this.captionParserBatchSize
    ) : [];
    let finalValidCaptionData = validCaptionData;
    let finalParsedCaptions = parsedCaptions;
    if (parsedCaptions.length !== validCaptionData.length) {
      // The parser returned a different number of results than inputs; try to
      // realign outputs to inputs by matching on the caption's fullText.
      this.logger.warn(
        `[DocumentProcessor] Caption parsing length mismatch for ${resourceType}: expected ${validCaptionData.length}, got ${parsedCaptions.length}. Attempting recovery by matching fullText...`
      );
      const parsedMap = /* @__PURE__ */ new Map();
      for (const parsed of parsedCaptions) {
        parsedMap.set(parsed.fullText, parsed);
      }
      // Keep only inputs whose text reappears verbatim in the parsed results.
      const recoveredData = [];
      for (const item of validCaptionData) {
        if (parsedMap.has(item.text)) {
          recoveredData.push(item);
        } else {
          this.logger.warn(
            `[DocumentProcessor] Skipping ${resourceType} caption at index ${item.resourceIndex}: "${item.text}" (not found in parsed results)`
          );
        }
      }
      const recoveredCaptions = [];
      for (const item of recoveredData) {
        const caption = parsedMap.get(item.text);
        if (caption) {
          recoveredCaptions.push(caption);
        }
      }
      if (recoveredCaptions.length !== recoveredData.length) {
        // Recovery itself is inconsistent — fail rather than misattribute captions.
        throw new Error(
          `[DocumentProcessor] Failed to recover from length mismatch: recovered ${recoveredCaptions.length} captions for ${recoveredData.length} valid items`
        );
      }
      finalValidCaptionData = recoveredData;
      finalParsedCaptions = recoveredCaptions;
      this.logger.info(
        `[DocumentProcessor] Successfully recovered ${finalParsedCaptions.length} ${resourceType} captions after length mismatch`
      );
    }
    // Record first-pass results keyed by original resource index.
    for (let i = 0; i < finalParsedCaptions.length; i++) {
      const resourceIndex = finalValidCaptionData[i].resourceIndex;
      captionsByIndex.set(resourceIndex, finalParsedCaptions[i]);
    }
    if (finalParsedCaptions.length > 0) {
      const finalValidCaptionTexts = finalValidCaptionData.map(
        (item) => item.text
      );
      // Step 2: validate each parsed caption against its original text.
      const validationResults = await this.captionValidator.validateBatch(
        finalParsedCaptions,
        finalValidCaptionTexts,
        this.captionValidatorBatchSize
      );
      // Indices (into the filtered arrays) of captions that failed validation.
      const failedIndices = validationResults.map((isValid, index) => isValid ? -1 : index).filter((index) => index !== -1);
      if (failedIndices.length > 0) {
        for (const filteredIndex of failedIndices) {
          const captionData = finalValidCaptionData[filteredIndex];
          const originalText = captionData.text;
          const parsedNum = finalParsedCaptions[filteredIndex].num;
          const resourceIndex = captionData.resourceIndex;
          this.logger.warn(
            `[DocumentProcessor] Invalid ${resourceType} caption [${resourceIndex}]: "${originalText}" | parsed num="${parsedNum}"`
          );
        }
        if (this.enableFallbackRetry) {
          // Step 3: reparse failures with the fallback model; its results
          // overwrite the first-pass entries in captionsByIndex.
          this.logger.info(
            `[DocumentProcessor] Reparsing ${failedIndices.length} failed ${resourceType} captions with fallback model...`
          );
          const failedCaptionTexts = failedIndices.map(
            (filteredIndex) => finalValidCaptionData[filteredIndex].text
          );
          const fallbackCaptionParser = new CaptionParser(
            this.logger,
            this.fallbackModel,
            {
              maxRetries: this.maxRetries,
              componentName: "CaptionParser-fallback",
              abortSignal: this.abortSignal
            },
            void 0,
            // no fallback for the fallback
            this.usageAggregator
          );
          const reparsedCaptions = await fallbackCaptionParser.parseBatch(
            failedCaptionTexts,
            0
            // sequential processing
          );
          for (let i = 0; i < failedIndices.length; i++) {
            const filteredIndex = failedIndices[i];
            const resourceIndex = finalValidCaptionData[filteredIndex].resourceIndex;
            captionsByIndex.set(resourceIndex, reparsedCaptions[i]);
          }
          this.logger.info(
            `[DocumentProcessor] Reparsed ${reparsedCaptions.length} ${resourceType} captions`
          );
        } else {
          // Best-effort mode: keep the first-pass parse even though it
          // failed validation.
          this.logger.warn(
            `[DocumentProcessor] ${failedIndices.length} ${resourceType} captions failed validation (kept as-is, fallback retry disabled)`
          );
        }
      }
    }
    return captionsByIndex;
  }
|
|
4122
|
+
/**
|
|
4123
|
+
* Extract caption text from resource
|
|
4124
|
+
*
|
|
4125
|
+
* Handles both string references and $ref resolution
|
|
4126
|
+
*/
|
|
4127
|
+
extractCaptionText(captions) {
|
|
4128
|
+
if (!captions?.[0]) {
|
|
4129
|
+
return void 0;
|
|
4130
|
+
}
|
|
4131
|
+
const captionRef = captions[0];
|
|
4132
|
+
if (typeof captionRef === "string") {
|
|
4133
|
+
return captionRef;
|
|
4134
|
+
}
|
|
4135
|
+
if (this.refResolver && "$ref" in captionRef) {
|
|
4136
|
+
const resolved = this.refResolver.resolveText(captionRef.$ref);
|
|
4137
|
+
return resolved?.text;
|
|
4138
|
+
}
|
|
4139
|
+
return void 0;
|
|
4140
|
+
}
|
|
4141
|
+
/**
|
|
4142
|
+
* Convert images
|
|
4143
|
+
*
|
|
4144
|
+
* Converts pictures from DoclingDocument to ProcessedImage
|
|
4145
|
+
*/
|
|
4146
|
+
async convertImages(doclingDoc, outputPath) {
|
|
4147
|
+
this.logger.info(
|
|
4148
|
+
`[DocumentProcessor] Converting ${doclingDoc.pictures.length} images...`
|
|
4149
|
+
);
|
|
4150
|
+
const images = [];
|
|
4151
|
+
const captionTexts = [];
|
|
4152
|
+
for (const picture of doclingDoc.pictures) {
|
|
4153
|
+
const pdfPageNo = picture.prov?.[0]?.page_no ?? 0;
|
|
4154
|
+
const imageId = this.idGenerator?.generateImageId() ?? `img-${images.length + 1}`;
|
|
4155
|
+
const captionText = this.extractCaptionText(picture.captions);
|
|
4156
|
+
captionTexts.push(captionText);
|
|
4157
|
+
images.push({
|
|
4158
|
+
id: imageId,
|
|
4159
|
+
path: `${outputPath}/images/image_${images.length}.png`,
|
|
4160
|
+
pdfPageNo
|
|
4161
|
+
// caption will be assigned later
|
|
4162
|
+
});
|
|
4163
|
+
}
|
|
4164
|
+
const captionsByIndex = await this.processResourceCaptions(
|
|
4165
|
+
captionTexts,
|
|
4166
|
+
"image"
|
|
4167
|
+
);
|
|
4168
|
+
for (let i = 0; i < images.length; i++) {
|
|
4169
|
+
if (captionsByIndex.has(i)) {
|
|
4170
|
+
images[i].caption = captionsByIndex.get(i);
|
|
4171
|
+
}
|
|
4172
|
+
}
|
|
4173
|
+
return images;
|
|
4174
|
+
}
|
|
4175
|
+
  /**
   * Convert tables
   *
   * Converts tables from DoclingDocument to ProcessedTable, normalizing each
   * cell grid and batch-processing captions afterwards.
   *
   * @param doclingDoc - Parsed DoclingDocument containing `tables`
   * @returns ProcessedTable entries with captions attached where available
   */
  async convertTables(doclingDoc) {
    this.logger.info(
      `[DocumentProcessor] Converting ${doclingDoc.tables.length} tables...`
    );
    const tables = [];
    const captionTexts = [];
    for (const table of doclingDoc.tables) {
      // First provenance entry carries the source PDF page; 0 when missing.
      const pdfPageNo = table.prov?.[0]?.page_no ?? 0;
      const tableId = this.idGenerator?.generateTableId() ?? `tbl-${tables.length + 1}`;
      // Normalize docling cells: spans default to 1, and the separate
      // column/row header flags collapse into a single isHeader boolean.
      const grid = table.data.grid.map(
        (row) => row.map((cell) => ({
          text: cell.text,
          rowSpan: cell.row_span ?? 1,
          colSpan: cell.col_span ?? 1,
          isHeader: cell.column_header || cell.row_header || false
        }))
      );
      const captionText = this.extractCaptionText(table.captions);
      captionTexts.push(captionText);
      tables.push({
        id: tableId,
        pdfPageNo,
        numRows: grid.length,
        // column count taken from the first row; 0 for an empty grid
        numCols: grid[0]?.length ?? 0,
        grid
        // caption will be assigned later
      });
    }
    // Parse/validate all captions in one batch, keyed by table index.
    const captionsByIndex = await this.processResourceCaptions(
      captionTexts,
      "table"
    );
    for (let i = 0; i < tables.length; i++) {
      if (captionsByIndex.has(i)) {
        tables[i].caption = captionsByIndex.get(i);
      }
    }
    return tables;
  }
|
|
4219
|
+
/**
|
|
4220
|
+
* Convert chapters and link resources
|
|
4221
|
+
*
|
|
4222
|
+
* Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
|
|
4223
|
+
* Falls back to single "Document" chapter when TOC is empty.
|
|
4224
|
+
*/
|
|
4225
|
+
async convertChapters(doclingDoc, tocEntries, pageRangeMap, images, tables, footnotes) {
|
|
4226
|
+
this.logger.info("[DocumentProcessor] Converting chapters...");
|
|
4227
|
+
if (tocEntries.length === 0) {
|
|
4228
|
+
this.logger.info(
|
|
4229
|
+
"[DocumentProcessor] No TOC entries, creating fallback chapter"
|
|
4230
|
+
);
|
|
4231
|
+
return this.createFallbackChapter(
|
|
4232
|
+
doclingDoc,
|
|
4233
|
+
pageRangeMap,
|
|
4234
|
+
images,
|
|
4235
|
+
tables,
|
|
4236
|
+
footnotes
|
|
4237
|
+
);
|
|
4238
|
+
}
|
|
4239
|
+
const chapters = this.chapterConverter.convert(
|
|
4240
|
+
tocEntries,
|
|
4241
|
+
doclingDoc.texts,
|
|
4242
|
+
pageRangeMap,
|
|
4243
|
+
images,
|
|
4244
|
+
tables,
|
|
4245
|
+
footnotes
|
|
4246
|
+
);
|
|
4247
|
+
this.logger.info(
|
|
4248
|
+
`[DocumentProcessor] Converted ${chapters.length} top-level chapters`
|
|
4249
|
+
);
|
|
4250
|
+
return chapters;
|
|
4251
|
+
}
|
|
4252
|
+
  /**
   * Create a fallback chapter when TOC is not available
   *
   * Creates a single "Document" chapter containing all text blocks,
   * images, tables, and footnotes from the document.
   *
   * @returns A one-element chapter array, or [] when the document has no content
   */
  createFallbackChapter(doclingDoc, pageRangeMap, images, tables, footnotes) {
    // All valid non-footnote texts become the chapter body.
    const textBlocks = doclingDoc.texts.filter(
      (item) => item.label !== "footnote" && this.textCleaner.isValidText(item.text)
    ).map((item) => ({
      text: this.textCleaner.normalize(item.text),
      pdfPageNo: item.prov?.[0]?.page_no ?? 1
    }));
    if (textBlocks.length === 0 && images.length === 0 && tables.length === 0 && footnotes.length === 0) {
      this.logger.info(
        "[DocumentProcessor] No content found for fallback chapter"
      );
      return [];
    }
    // Lowest mapped PDF page. The extra `1` keeps Math.min finite when
    // pageRangeMap is empty (or has no numeric keys) — and also caps the
    // result at 1, so the map lookup below may miss and fall back to pageNo 1.
    const firstPdfPage = Math.min(
      ...Object.keys(pageRangeMap).map(Number).filter((n) => !isNaN(n)),
      1
    );
    const firstPageRange = pageRangeMap[firstPdfPage];
    const pageNo = firstPageRange?.startPageNo ?? 1;
    const fallbackChapter = {
      id: this.idGenerator.generateChapterId(),
      originTitle: "Document",
      title: "Document",
      pageNo,
      level: 1,
      textBlocks,
      // Every resource in the document is linked to this single chapter.
      imageIds: images.map((img) => img.id),
      tableIds: tables.map((tbl) => tbl.id),
      footnoteIds: footnotes.map((ftn) => ftn.id),
      children: []
    };
    this.logger.info(
      `[DocumentProcessor] Created fallback chapter with ${textBlocks.length} text blocks, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
    );
    return [fallbackChapter];
  }
|
|
4294
|
+
};
|
|
4295
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
4296
|
+
0 && (module.exports = {
|
|
4297
|
+
BaseLLMComponent,
|
|
4298
|
+
BaseValidator,
|
|
4299
|
+
CONTINUATION_MARKERS,
|
|
4300
|
+
CaptionParseError,
|
|
4301
|
+
CaptionParser,
|
|
4302
|
+
CaptionValidationError,
|
|
4303
|
+
CaptionValidator,
|
|
4304
|
+
ChapterConverter,
|
|
4305
|
+
DocumentProcessor,
|
|
4306
|
+
PAGE_NUMBER_PATTERN,
|
|
4307
|
+
PagePattern,
|
|
4308
|
+
PageRangeParseError,
|
|
4309
|
+
PageRangeParser,
|
|
4310
|
+
TOC_KEYWORDS,
|
|
4311
|
+
TextLLMComponent,
|
|
4312
|
+
TocContentValidationSchema,
|
|
4313
|
+
TocContentValidator,
|
|
4314
|
+
TocEntrySchema,
|
|
4315
|
+
TocExtractError,
|
|
4316
|
+
TocExtractor,
|
|
4317
|
+
TocFinder,
|
|
4318
|
+
TocNotFoundError,
|
|
4319
|
+
TocParseError,
|
|
4320
|
+
TocResponseSchema,
|
|
4321
|
+
VisionLLMComponent,
|
|
4322
|
+
VisionTocExtractionSchema,
|
|
4323
|
+
VisionTocExtractor
|
|
4324
|
+
});
|
|
4325
|
+
//# sourceMappingURL=index.cjs.map
|