@heripo/document-processor 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +123 -84
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +35 -13
- package/dist/index.d.ts +35 -13
- package/dist/index.js +123 -84
- package/dist/index.js.map +1 -1
- package/package.json +6 -6
package/dist/index.d.cts
CHANGED
|
@@ -267,16 +267,9 @@ declare class DocumentProcessor {
|
|
|
267
267
|
* Convert chapters and link resources
|
|
268
268
|
*
|
|
269
269
|
* Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
|
|
270
|
-
*
|
|
270
|
+
* Throws TocNotFoundError if TOC entries are empty (defensive assertion).
|
|
271
271
|
*/
|
|
272
272
|
private convertChapters;
|
|
273
|
-
/**
|
|
274
|
-
* Create a fallback chapter when TOC is not available
|
|
275
|
-
*
|
|
276
|
-
* Creates a single "Document" chapter containing all text blocks,
|
|
277
|
-
* images, tables, and footnotes from the document.
|
|
278
|
-
*/
|
|
279
|
-
private createFallbackChapter;
|
|
280
273
|
}
|
|
281
274
|
|
|
282
275
|
/**
|
|
@@ -1187,15 +1180,36 @@ declare abstract class BaseValidator<TSchema extends z.ZodType, TResult = z.infe
|
|
|
1187
1180
|
}>;
|
|
1188
1181
|
}
|
|
1189
1182
|
|
|
1183
|
+
/**
|
|
1184
|
+
* Content type for TOC validation
|
|
1185
|
+
*/
|
|
1186
|
+
type TocContentType = 'pure_toc' | 'mixed' | 'resource_only' | 'invalid';
|
|
1190
1187
|
/**
|
|
1191
1188
|
* Schema for TOC content validation response
|
|
1192
1189
|
*/
|
|
1193
1190
|
declare const TocContentValidationSchema: z.ZodObject<{
|
|
1194
|
-
|
|
1191
|
+
isValid: z.ZodBoolean;
|
|
1195
1192
|
confidence: z.ZodNumber;
|
|
1193
|
+
contentType: z.ZodEnum<{
|
|
1194
|
+
pure_toc: "pure_toc";
|
|
1195
|
+
mixed: "mixed";
|
|
1196
|
+
resource_only: "resource_only";
|
|
1197
|
+
invalid: "invalid";
|
|
1198
|
+
}>;
|
|
1199
|
+
extractedTocMarkdown: z.ZodNullable<z.ZodString>;
|
|
1196
1200
|
reason: z.ZodString;
|
|
1197
1201
|
}, z.core.$strip>;
|
|
1198
1202
|
type TocContentValidationResult = z.infer<typeof TocContentValidationSchema>;
|
|
1203
|
+
/**
|
|
1204
|
+
* Output type for TOC validation with resolved markdown
|
|
1205
|
+
*/
|
|
1206
|
+
interface TocValidationOutput {
|
|
1207
|
+
isValid: boolean;
|
|
1208
|
+
confidence: number;
|
|
1209
|
+
contentType: TocContentType;
|
|
1210
|
+
validTocMarkdown: string | null;
|
|
1211
|
+
reason: string;
|
|
1212
|
+
}
|
|
1199
1213
|
/**
|
|
1200
1214
|
* Options for TocContentValidator
|
|
1201
1215
|
*/
|
|
@@ -1210,6 +1224,7 @@ interface TocContentValidatorOptions extends BaseValidatorOptions {
|
|
|
1210
1224
|
*
|
|
1211
1225
|
* Uses LLM to validate whether extracted markdown content is actually a TOC.
|
|
1212
1226
|
* This is a semantic validation, not structural validation.
|
|
1227
|
+
* Supports mixed content extraction where main TOC is combined with resource indices.
|
|
1213
1228
|
*/
|
|
1214
1229
|
declare class TocContentValidator extends BaseValidator<typeof TocContentValidationSchema, TocContentValidationResult> {
|
|
1215
1230
|
private readonly confidenceThreshold;
|
|
@@ -1218,16 +1233,23 @@ declare class TocContentValidator extends BaseValidator<typeof TocContentValidat
|
|
|
1218
1233
|
* Validate if the markdown content is a table of contents
|
|
1219
1234
|
*
|
|
1220
1235
|
* @param markdown - Markdown content to validate
|
|
1221
|
-
* @returns Validation
|
|
1236
|
+
* @returns Validation output with resolved markdown for valid TOC
|
|
1222
1237
|
*/
|
|
1223
|
-
validate(markdown: string): Promise<
|
|
1238
|
+
validate(markdown: string): Promise<TocValidationOutput>;
|
|
1224
1239
|
/**
|
|
1225
1240
|
* Check if validation result passes threshold
|
|
1226
1241
|
*
|
|
1227
|
-
* @param result - Validation
|
|
1242
|
+
* @param result - Validation output from validate()
|
|
1228
1243
|
* @returns true if content is valid TOC with sufficient confidence
|
|
1229
1244
|
*/
|
|
1230
|
-
isValid(result:
|
|
1245
|
+
isValid(result: TocValidationOutput): boolean;
|
|
1246
|
+
/**
|
|
1247
|
+
* Get the valid TOC markdown from validation result
|
|
1248
|
+
*
|
|
1249
|
+
* @param result - Validation output from validate()
|
|
1250
|
+
* @returns Valid TOC markdown or null if invalid
|
|
1251
|
+
*/
|
|
1252
|
+
getValidMarkdown(result: TocValidationOutput): string | null;
|
|
1231
1253
|
/**
|
|
1232
1254
|
* Build system prompt for TOC content validation
|
|
1233
1255
|
*/
|
package/dist/index.d.ts
CHANGED
|
@@ -267,16 +267,9 @@ declare class DocumentProcessor {
|
|
|
267
267
|
* Convert chapters and link resources
|
|
268
268
|
*
|
|
269
269
|
* Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
|
|
270
|
-
*
|
|
270
|
+
* Throws TocNotFoundError if TOC entries are empty (defensive assertion).
|
|
271
271
|
*/
|
|
272
272
|
private convertChapters;
|
|
273
|
-
/**
|
|
274
|
-
* Create a fallback chapter when TOC is not available
|
|
275
|
-
*
|
|
276
|
-
* Creates a single "Document" chapter containing all text blocks,
|
|
277
|
-
* images, tables, and footnotes from the document.
|
|
278
|
-
*/
|
|
279
|
-
private createFallbackChapter;
|
|
280
273
|
}
|
|
281
274
|
|
|
282
275
|
/**
|
|
@@ -1187,15 +1180,36 @@ declare abstract class BaseValidator<TSchema extends z.ZodType, TResult = z.infe
|
|
|
1187
1180
|
}>;
|
|
1188
1181
|
}
|
|
1189
1182
|
|
|
1183
|
+
/**
|
|
1184
|
+
* Content type for TOC validation
|
|
1185
|
+
*/
|
|
1186
|
+
type TocContentType = 'pure_toc' | 'mixed' | 'resource_only' | 'invalid';
|
|
1190
1187
|
/**
|
|
1191
1188
|
* Schema for TOC content validation response
|
|
1192
1189
|
*/
|
|
1193
1190
|
declare const TocContentValidationSchema: z.ZodObject<{
|
|
1194
|
-
|
|
1191
|
+
isValid: z.ZodBoolean;
|
|
1195
1192
|
confidence: z.ZodNumber;
|
|
1193
|
+
contentType: z.ZodEnum<{
|
|
1194
|
+
pure_toc: "pure_toc";
|
|
1195
|
+
mixed: "mixed";
|
|
1196
|
+
resource_only: "resource_only";
|
|
1197
|
+
invalid: "invalid";
|
|
1198
|
+
}>;
|
|
1199
|
+
extractedTocMarkdown: z.ZodNullable<z.ZodString>;
|
|
1196
1200
|
reason: z.ZodString;
|
|
1197
1201
|
}, z.core.$strip>;
|
|
1198
1202
|
type TocContentValidationResult = z.infer<typeof TocContentValidationSchema>;
|
|
1203
|
+
/**
|
|
1204
|
+
* Output type for TOC validation with resolved markdown
|
|
1205
|
+
*/
|
|
1206
|
+
interface TocValidationOutput {
|
|
1207
|
+
isValid: boolean;
|
|
1208
|
+
confidence: number;
|
|
1209
|
+
contentType: TocContentType;
|
|
1210
|
+
validTocMarkdown: string | null;
|
|
1211
|
+
reason: string;
|
|
1212
|
+
}
|
|
1199
1213
|
/**
|
|
1200
1214
|
* Options for TocContentValidator
|
|
1201
1215
|
*/
|
|
@@ -1210,6 +1224,7 @@ interface TocContentValidatorOptions extends BaseValidatorOptions {
|
|
|
1210
1224
|
*
|
|
1211
1225
|
* Uses LLM to validate whether extracted markdown content is actually a TOC.
|
|
1212
1226
|
* This is a semantic validation, not structural validation.
|
|
1227
|
+
* Supports mixed content extraction where main TOC is combined with resource indices.
|
|
1213
1228
|
*/
|
|
1214
1229
|
declare class TocContentValidator extends BaseValidator<typeof TocContentValidationSchema, TocContentValidationResult> {
|
|
1215
1230
|
private readonly confidenceThreshold;
|
|
@@ -1218,16 +1233,23 @@ declare class TocContentValidator extends BaseValidator<typeof TocContentValidat
|
|
|
1218
1233
|
* Validate if the markdown content is a table of contents
|
|
1219
1234
|
*
|
|
1220
1235
|
* @param markdown - Markdown content to validate
|
|
1221
|
-
* @returns Validation
|
|
1236
|
+
* @returns Validation output with resolved markdown for valid TOC
|
|
1222
1237
|
*/
|
|
1223
|
-
validate(markdown: string): Promise<
|
|
1238
|
+
validate(markdown: string): Promise<TocValidationOutput>;
|
|
1224
1239
|
/**
|
|
1225
1240
|
* Check if validation result passes threshold
|
|
1226
1241
|
*
|
|
1227
|
-
* @param result - Validation
|
|
1242
|
+
* @param result - Validation output from validate()
|
|
1228
1243
|
* @returns true if content is valid TOC with sufficient confidence
|
|
1229
1244
|
*/
|
|
1230
|
-
isValid(result:
|
|
1245
|
+
isValid(result: TocValidationOutput): boolean;
|
|
1246
|
+
/**
|
|
1247
|
+
* Get the valid TOC markdown from validation result
|
|
1248
|
+
*
|
|
1249
|
+
* @param result - Validation output from validate()
|
|
1250
|
+
* @returns Valid TOC markdown or null if invalid
|
|
1251
|
+
*/
|
|
1252
|
+
getValidMarkdown(result: TocValidationOutput): string | null;
|
|
1231
1253
|
/**
|
|
1232
1254
|
* Build system prompt for TOC content validation
|
|
1233
1255
|
*/
|
package/dist/index.js
CHANGED
|
@@ -1846,11 +1846,10 @@ var TocExtractor = class extends TextLLMComponent {
|
|
|
1846
1846
|
async extract(markdown) {
|
|
1847
1847
|
this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
|
|
1848
1848
|
if (!markdown.trim()) {
|
|
1849
|
-
this.log("
|
|
1850
|
-
|
|
1851
|
-
|
|
1852
|
-
|
|
1853
|
-
};
|
|
1849
|
+
this.log("error", "Cannot extract TOC from empty markdown content");
|
|
1850
|
+
throw new TocParseError(
|
|
1851
|
+
"TOC extraction failed: provided markdown content is empty"
|
|
1852
|
+
);
|
|
1854
1853
|
}
|
|
1855
1854
|
try {
|
|
1856
1855
|
const result = await this.callTextLLM(
|
|
@@ -2157,7 +2156,15 @@ var VisionTocExtractor = class extends VisionLLMComponent {
|
|
|
2157
2156
|
*/
|
|
2158
2157
|
async extractFromBatch(startPage, endPage) {
|
|
2159
2158
|
this.log("info", `Extracting from pages ${startPage}-${endPage}`);
|
|
2159
|
+
this.log(
|
|
2160
|
+
"info",
|
|
2161
|
+
`Preparing images for vision analysis. This can be very slow (10+ minutes, sometimes 30+ minutes) depending on batch size and image resolution.`
|
|
2162
|
+
);
|
|
2160
2163
|
const imageContents = this.loadPageImages(startPage, endPage);
|
|
2164
|
+
this.log(
|
|
2165
|
+
"info",
|
|
2166
|
+
`Calling vision LLM for TOC extraction (pages ${startPage}-${endPage})`
|
|
2167
|
+
);
|
|
2161
2168
|
const result = await LLMCaller.callVision({
|
|
2162
2169
|
schema: VisionTocExtractionSchema,
|
|
2163
2170
|
messages: [
|
|
@@ -2180,6 +2187,10 @@ var VisionTocExtractor = class extends VisionLLMComponent {
|
|
|
2180
2187
|
component: "VisionTocExtractor",
|
|
2181
2188
|
phase: "extraction"
|
|
2182
2189
|
});
|
|
2190
|
+
this.log(
|
|
2191
|
+
"info",
|
|
2192
|
+
`Vision LLM call completed (pages ${startPage}-${endPage})`
|
|
2193
|
+
);
|
|
2183
2194
|
this.trackUsage(result.usage);
|
|
2184
2195
|
return result.output;
|
|
2185
2196
|
}
|
|
@@ -3242,9 +3253,11 @@ var BaseValidator = class extends TextLLMComponent {
|
|
|
3242
3253
|
// src/validators/toc-content-validator.ts
|
|
3243
3254
|
import { z as z5 } from "zod";
|
|
3244
3255
|
var TocContentValidationSchema = z5.object({
|
|
3245
|
-
|
|
3256
|
+
isValid: z5.boolean().describe("Whether valid main document TOC was found"),
|
|
3246
3257
|
confidence: z5.number().min(0).max(1).describe("Confidence score between 0 and 1"),
|
|
3247
|
-
|
|
3258
|
+
contentType: z5.enum(["pure_toc", "mixed", "resource_only", "invalid"]).describe("Type of content detected"),
|
|
3259
|
+
extractedTocMarkdown: z5.string().nullable().describe("Extracted main TOC markdown when mixed; null otherwise"),
|
|
3260
|
+
reason: z5.string().describe("Brief explanation in English")
|
|
3248
3261
|
});
|
|
3249
3262
|
var TocContentValidator = class extends BaseValidator {
|
|
3250
3263
|
confidenceThreshold;
|
|
@@ -3263,7 +3276,7 @@ var TocContentValidator = class extends BaseValidator {
|
|
|
3263
3276
|
* Validate if the markdown content is a table of contents
|
|
3264
3277
|
*
|
|
3265
3278
|
* @param markdown - Markdown content to validate
|
|
3266
|
-
* @returns Validation
|
|
3279
|
+
* @returns Validation output with resolved markdown for valid TOC
|
|
3267
3280
|
*/
|
|
3268
3281
|
async validate(markdown) {
|
|
3269
3282
|
this.logger.info(
|
|
@@ -3274,8 +3287,10 @@ var TocContentValidator = class extends BaseValidator {
|
|
|
3274
3287
|
"[TocContentValidator] Empty markdown, returning invalid"
|
|
3275
3288
|
);
|
|
3276
3289
|
return {
|
|
3277
|
-
|
|
3290
|
+
isValid: false,
|
|
3278
3291
|
confidence: 1,
|
|
3292
|
+
contentType: "invalid",
|
|
3293
|
+
validTocMarkdown: null,
|
|
3279
3294
|
reason: "Empty content"
|
|
3280
3295
|
};
|
|
3281
3296
|
}
|
|
@@ -3287,52 +3302,106 @@ var TocContentValidator = class extends BaseValidator {
|
|
|
3287
3302
|
this.aggregator
|
|
3288
3303
|
);
|
|
3289
3304
|
this.logger.info(
|
|
3290
|
-
`[TocContentValidator] Result:
|
|
3305
|
+
`[TocContentValidator] Result: isValid=${result.isValid}, contentType=${result.contentType}, confidence=${result.confidence}`
|
|
3291
3306
|
);
|
|
3292
|
-
|
|
3307
|
+
let validTocMarkdown = null;
|
|
3308
|
+
if (result.isValid && result.confidence >= this.confidenceThreshold) {
|
|
3309
|
+
if (result.contentType === "pure_toc") {
|
|
3310
|
+
validTocMarkdown = markdown;
|
|
3311
|
+
} else if (result.contentType === "mixed" && result.extractedTocMarkdown) {
|
|
3312
|
+
validTocMarkdown = result.extractedTocMarkdown;
|
|
3313
|
+
}
|
|
3314
|
+
}
|
|
3315
|
+
return {
|
|
3316
|
+
isValid: result.isValid,
|
|
3317
|
+
confidence: result.confidence,
|
|
3318
|
+
contentType: result.contentType,
|
|
3319
|
+
validTocMarkdown,
|
|
3320
|
+
reason: result.reason
|
|
3321
|
+
};
|
|
3293
3322
|
}
|
|
3294
3323
|
/**
|
|
3295
3324
|
* Check if validation result passes threshold
|
|
3296
3325
|
*
|
|
3297
|
-
* @param result - Validation
|
|
3326
|
+
* @param result - Validation output from validate()
|
|
3298
3327
|
* @returns true if content is valid TOC with sufficient confidence
|
|
3299
3328
|
*/
|
|
3300
3329
|
isValid(result) {
|
|
3301
|
-
return result.
|
|
3330
|
+
return result.isValid && result.confidence >= this.confidenceThreshold;
|
|
3331
|
+
}
|
|
3332
|
+
/**
|
|
3333
|
+
* Get the valid TOC markdown from validation result
|
|
3334
|
+
*
|
|
3335
|
+
* @param result - Validation output from validate()
|
|
3336
|
+
* @returns Valid TOC markdown or null if invalid
|
|
3337
|
+
*/
|
|
3338
|
+
getValidMarkdown(result) {
|
|
3339
|
+
return result.validTocMarkdown;
|
|
3302
3340
|
}
|
|
3303
3341
|
/**
|
|
3304
3342
|
* Build system prompt for TOC content validation
|
|
3305
3343
|
*/
|
|
3306
3344
|
buildSystemPrompt() {
|
|
3307
|
-
return `You are a document structure analyst. Your task is to
|
|
3345
|
+
return `You are a document structure analyst. Your task is to analyze the provided content and classify it into one of four categories.
|
|
3308
3346
|
|
|
3309
|
-
##
|
|
3310
|
-
- A structured list of chapters/sections with corresponding page numbers
|
|
3311
|
-
- Contains hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction", etc.)
|
|
3312
|
-
- Has page number references for each entry (e.g., "..... 10", "... 5", or just a number at the end)
|
|
3313
|
-
- Multiple entries organized by document structure
|
|
3314
|
-
- Main document outline listing major chapters and sections
|
|
3347
|
+
## Content Type Classification:
|
|
3315
3348
|
|
|
3316
|
-
|
|
3349
|
+
### 1. pure_toc
|
|
3350
|
+
The content is ONLY a main document Table of Contents with:
|
|
3351
|
+
- Structured list of chapters/sections with page numbers
|
|
3352
|
+
- Hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction")
|
|
3353
|
+
- Multiple entries (3 or more) organized by document structure
|
|
3354
|
+
- NO resource indices mixed in
|
|
3355
|
+
|
|
3356
|
+
### 2. mixed
|
|
3357
|
+
The content contains BOTH:
|
|
3358
|
+
- A valid main document TOC (chapters/sections with page numbers)
|
|
3359
|
+
- AND resource indices (photo/table/drawing indices)
|
|
3360
|
+
|
|
3361
|
+
When classifying as "mixed", you MUST extract ONLY the main TOC portion and return it in extractedTocMarkdown.
|
|
3362
|
+
|
|
3363
|
+
### 3. resource_only
|
|
3364
|
+
The content contains ONLY resource indices such as:
|
|
3317
3365
|
- Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, Photo Index, List of Figures, List of Photos)
|
|
3318
3366
|
- Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
|
|
3319
3367
|
- Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, Drawing Index, List of Drawings)
|
|
3320
3368
|
- Appendix indices (\uBD80\uB85D \uBAA9\uCC28, Appendix Index)
|
|
3321
|
-
|
|
3369
|
+
|
|
3370
|
+
### 4. invalid
|
|
3371
|
+
The content is none of the above:
|
|
3372
|
+
- Random body text
|
|
3322
3373
|
- Single entries or incomplete lists (fewer than 3 items)
|
|
3323
3374
|
- Reference lists or bibliographies
|
|
3324
3375
|
- Index pages (alphabetical keyword lists)
|
|
3376
|
+
- Unstructured content
|
|
3325
3377
|
|
|
3326
3378
|
## Response Guidelines:
|
|
3327
|
-
- Set
|
|
3379
|
+
- Set isValid to true for "pure_toc" and "mixed" types
|
|
3380
|
+
- Set isValid to false for "resource_only" and "invalid" types
|
|
3328
3381
|
- Set confidence between 0.0 and 1.0 based on your certainty
|
|
3329
|
-
-
|
|
3382
|
+
- For "mixed" type: extractedTocMarkdown MUST contain only the main TOC entries (preserve original formatting)
|
|
3383
|
+
- For other types: extractedTocMarkdown should be null
|
|
3384
|
+
- IMPORTANT: reason MUST be written in English
|
|
3385
|
+
|
|
3386
|
+
## Example Scenarios:
|
|
3387
|
+
|
|
3388
|
+
### Scenario 1: pure_toc
|
|
3389
|
+
Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\uC81C3\uC7A5 \uC870\uC0AC\uACB0\uACFC ..... 15"
|
|
3390
|
+
Output: { isValid: true, contentType: "pure_toc", extractedTocMarkdown: null }
|
|
3391
|
+
|
|
3392
|
+
### Scenario 2: mixed
|
|
3393
|
+
Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\\n\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
|
|
3394
|
+
Output: { isValid: true, contentType: "mixed", extractedTocMarkdown: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5" }
|
|
3395
|
+
|
|
3396
|
+
### Scenario 3: resource_only
|
|
3397
|
+
Input: "\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
|
|
3398
|
+
Output: { isValid: false, contentType: "resource_only", extractedTocMarkdown: null }`;
|
|
3330
3399
|
}
|
|
3331
3400
|
/**
|
|
3332
3401
|
* Build user prompt with markdown content
|
|
3333
3402
|
*/
|
|
3334
3403
|
buildUserPrompt(markdown) {
|
|
3335
|
-
return `
|
|
3404
|
+
return `Analyze the following content and classify it:
|
|
3336
3405
|
|
|
3337
3406
|
${markdown}`;
|
|
3338
3407
|
}
|
|
@@ -3890,9 +3959,20 @@ var DocumentProcessor = class {
|
|
|
3890
3959
|
);
|
|
3891
3960
|
markdown = null;
|
|
3892
3961
|
} else {
|
|
3893
|
-
this.
|
|
3894
|
-
|
|
3895
|
-
|
|
3962
|
+
const validMarkdown = this.tocContentValidator.getValidMarkdown(validation);
|
|
3963
|
+
if (validMarkdown) {
|
|
3964
|
+
if (validation.contentType === "mixed") {
|
|
3965
|
+
this.logger.info(
|
|
3966
|
+
`[DocumentProcessor] Mixed TOC detected, using extracted main TOC (${validMarkdown.length} chars)`
|
|
3967
|
+
);
|
|
3968
|
+
}
|
|
3969
|
+
markdown = validMarkdown;
|
|
3970
|
+
this.logger.info(
|
|
3971
|
+
`[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
|
|
3972
|
+
);
|
|
3973
|
+
} else {
|
|
3974
|
+
markdown = null;
|
|
3975
|
+
}
|
|
3896
3976
|
}
|
|
3897
3977
|
} catch (error) {
|
|
3898
3978
|
if (error instanceof TocNotFoundError) {
|
|
@@ -3908,10 +3988,13 @@ var DocumentProcessor = class {
|
|
|
3908
3988
|
const totalPages = Object.keys(doclingDoc.pages).length;
|
|
3909
3989
|
markdown = await this.visionTocExtractor.extract(totalPages);
|
|
3910
3990
|
if (!markdown) {
|
|
3911
|
-
|
|
3912
|
-
|
|
3991
|
+
const reason = "Both rule-based search and vision fallback failed to locate TOC";
|
|
3992
|
+
this.logger.error(
|
|
3993
|
+
`[DocumentProcessor] TOC extraction failed: ${reason}`
|
|
3994
|
+
);
|
|
3995
|
+
throw new TocNotFoundError(
|
|
3996
|
+
`Table of contents not found in the document. ${reason}.`
|
|
3913
3997
|
);
|
|
3914
|
-
return [];
|
|
3915
3998
|
}
|
|
3916
3999
|
this.logger.info(
|
|
3917
4000
|
`[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
|
|
@@ -3919,6 +4002,11 @@ var DocumentProcessor = class {
|
|
|
3919
4002
|
}
|
|
3920
4003
|
const tocResult = await this.tocExtractor.extract(markdown);
|
|
3921
4004
|
this.usageAggregator.track(tocResult.usage);
|
|
4005
|
+
if (tocResult.entries.length === 0) {
|
|
4006
|
+
const reason = "TOC area was detected but LLM could not extract any structured entries";
|
|
4007
|
+
this.logger.error(`[DocumentProcessor] TOC extraction failed: ${reason}`);
|
|
4008
|
+
throw new TocNotFoundError(`${reason}.`);
|
|
4009
|
+
}
|
|
3922
4010
|
this.logger.info(
|
|
3923
4011
|
`[DocumentProcessor] Extracted ${tocResult.entries.length} top-level TOC entries`
|
|
3924
4012
|
);
|
|
@@ -4158,21 +4246,14 @@ var DocumentProcessor = class {
|
|
|
4158
4246
|
* Convert chapters and link resources
|
|
4159
4247
|
*
|
|
4160
4248
|
* Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
|
|
4161
|
-
*
|
|
4249
|
+
* Throws TocNotFoundError if TOC entries are empty (defensive assertion).
|
|
4162
4250
|
*/
|
|
4163
4251
|
async convertChapters(doclingDoc, tocEntries, pageRangeMap, images, tables, footnotes) {
|
|
4164
4252
|
this.logger.info("[DocumentProcessor] Converting chapters...");
|
|
4165
4253
|
if (tocEntries.length === 0) {
|
|
4166
|
-
|
|
4167
|
-
|
|
4168
|
-
);
|
|
4169
|
-
return this.createFallbackChapter(
|
|
4170
|
-
doclingDoc,
|
|
4171
|
-
pageRangeMap,
|
|
4172
|
-
images,
|
|
4173
|
-
tables,
|
|
4174
|
-
footnotes
|
|
4175
|
-
);
|
|
4254
|
+
const reason = "Cannot convert chapters without TOC entries";
|
|
4255
|
+
this.logger.error(`[DocumentProcessor] ${reason}`);
|
|
4256
|
+
throw new TocNotFoundError(reason);
|
|
4176
4257
|
}
|
|
4177
4258
|
const chapters = this.chapterConverter.convert(
|
|
4178
4259
|
tocEntries,
|
|
@@ -4187,48 +4268,6 @@ var DocumentProcessor = class {
|
|
|
4187
4268
|
);
|
|
4188
4269
|
return chapters;
|
|
4189
4270
|
}
|
|
4190
|
-
/**
|
|
4191
|
-
* Create a fallback chapter when TOC is not available
|
|
4192
|
-
*
|
|
4193
|
-
* Creates a single "Document" chapter containing all text blocks,
|
|
4194
|
-
* images, tables, and footnotes from the document.
|
|
4195
|
-
*/
|
|
4196
|
-
createFallbackChapter(doclingDoc, pageRangeMap, images, tables, footnotes) {
|
|
4197
|
-
const textBlocks = doclingDoc.texts.filter(
|
|
4198
|
-
(item) => item.label !== "footnote" && this.textCleaner.isValidText(item.text)
|
|
4199
|
-
).map((item) => ({
|
|
4200
|
-
text: this.textCleaner.normalize(item.text),
|
|
4201
|
-
pdfPageNo: item.prov?.[0]?.page_no ?? 1
|
|
4202
|
-
}));
|
|
4203
|
-
if (textBlocks.length === 0 && images.length === 0 && tables.length === 0 && footnotes.length === 0) {
|
|
4204
|
-
this.logger.info(
|
|
4205
|
-
"[DocumentProcessor] No content found for fallback chapter"
|
|
4206
|
-
);
|
|
4207
|
-
return [];
|
|
4208
|
-
}
|
|
4209
|
-
const firstPdfPage = Math.min(
|
|
4210
|
-
...Object.keys(pageRangeMap).map(Number).filter((n) => !isNaN(n)),
|
|
4211
|
-
1
|
|
4212
|
-
);
|
|
4213
|
-
const firstPageRange = pageRangeMap[firstPdfPage];
|
|
4214
|
-
const pageNo = firstPageRange?.startPageNo ?? 1;
|
|
4215
|
-
const fallbackChapter = {
|
|
4216
|
-
id: this.idGenerator.generateChapterId(),
|
|
4217
|
-
originTitle: "Document",
|
|
4218
|
-
title: "Document",
|
|
4219
|
-
pageNo,
|
|
4220
|
-
level: 1,
|
|
4221
|
-
textBlocks,
|
|
4222
|
-
imageIds: images.map((img) => img.id),
|
|
4223
|
-
tableIds: tables.map((tbl) => tbl.id),
|
|
4224
|
-
footnoteIds: footnotes.map((ftn) => ftn.id),
|
|
4225
|
-
children: []
|
|
4226
|
-
};
|
|
4227
|
-
this.logger.info(
|
|
4228
|
-
`[DocumentProcessor] Created fallback chapter with ${textBlocks.length} text blocks, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
|
|
4229
|
-
);
|
|
4230
|
-
return [fallbackChapter];
|
|
4231
|
-
}
|
|
4232
4271
|
};
|
|
4233
4272
|
export {
|
|
4234
4273
|
BaseLLMComponent,
|