@heripo/document-processor 0.1.2 → 0.1.4
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/dist/index.cjs +123 -84
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +35 -13
- package/dist/index.d.ts +35 -13
- package/dist/index.js +123 -84
- package/dist/index.js.map +1 -1
- package/package.json +6 -6
package/dist/index.cjs
CHANGED
|
@@ -1908,11 +1908,10 @@ var TocExtractor = class extends TextLLMComponent {
|
|
|
1908
1908
|
async extract(markdown) {
|
|
1909
1909
|
this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
|
|
1910
1910
|
if (!markdown.trim()) {
|
|
1911
|
-
this.log("
|
|
1912
|
-
|
|
1913
|
-
|
|
1914
|
-
|
|
1915
|
-
};
|
|
1911
|
+
this.log("error", "Cannot extract TOC from empty markdown content");
|
|
1912
|
+
throw new TocParseError(
|
|
1913
|
+
"TOC extraction failed: provided markdown content is empty"
|
|
1914
|
+
);
|
|
1916
1915
|
}
|
|
1917
1916
|
try {
|
|
1918
1917
|
const result = await this.callTextLLM(
|
|
@@ -2219,7 +2218,15 @@ var VisionTocExtractor = class extends VisionLLMComponent {
|
|
|
2219
2218
|
*/
|
|
2220
2219
|
async extractFromBatch(startPage, endPage) {
|
|
2221
2220
|
this.log("info", `Extracting from pages ${startPage}-${endPage}`);
|
|
2221
|
+
this.log(
|
|
2222
|
+
"info",
|
|
2223
|
+
`Preparing images for vision analysis. This can be very slow (10+ minutes, sometimes 30+ minutes) depending on batch size and image resolution.`
|
|
2224
|
+
);
|
|
2222
2225
|
const imageContents = this.loadPageImages(startPage, endPage);
|
|
2226
|
+
this.log(
|
|
2227
|
+
"info",
|
|
2228
|
+
`Calling vision LLM for TOC extraction (pages ${startPage}-${endPage})`
|
|
2229
|
+
);
|
|
2223
2230
|
const result = await LLMCaller.callVision({
|
|
2224
2231
|
schema: VisionTocExtractionSchema,
|
|
2225
2232
|
messages: [
|
|
@@ -2242,6 +2249,10 @@ var VisionTocExtractor = class extends VisionLLMComponent {
|
|
|
2242
2249
|
component: "VisionTocExtractor",
|
|
2243
2250
|
phase: "extraction"
|
|
2244
2251
|
});
|
|
2252
|
+
this.log(
|
|
2253
|
+
"info",
|
|
2254
|
+
`Vision LLM call completed (pages ${startPage}-${endPage})`
|
|
2255
|
+
);
|
|
2245
2256
|
this.trackUsage(result.usage);
|
|
2246
2257
|
return result.output;
|
|
2247
2258
|
}
|
|
@@ -3304,9 +3315,11 @@ var BaseValidator = class extends TextLLMComponent {
|
|
|
3304
3315
|
// src/validators/toc-content-validator.ts
|
|
3305
3316
|
var import_zod5 = require("zod");
|
|
3306
3317
|
var TocContentValidationSchema = import_zod5.z.object({
|
|
3307
|
-
|
|
3318
|
+
isValid: import_zod5.z.boolean().describe("Whether valid main document TOC was found"),
|
|
3308
3319
|
confidence: import_zod5.z.number().min(0).max(1).describe("Confidence score between 0 and 1"),
|
|
3309
|
-
|
|
3320
|
+
contentType: import_zod5.z.enum(["pure_toc", "mixed", "resource_only", "invalid"]).describe("Type of content detected"),
|
|
3321
|
+
extractedTocMarkdown: import_zod5.z.string().nullable().describe("Extracted main TOC markdown when mixed; null otherwise"),
|
|
3322
|
+
reason: import_zod5.z.string().describe("Brief explanation in English")
|
|
3310
3323
|
});
|
|
3311
3324
|
var TocContentValidator = class extends BaseValidator {
|
|
3312
3325
|
confidenceThreshold;
|
|
@@ -3325,7 +3338,7 @@ var TocContentValidator = class extends BaseValidator {
|
|
|
3325
3338
|
* Validate if the markdown content is a table of contents
|
|
3326
3339
|
*
|
|
3327
3340
|
* @param markdown - Markdown content to validate
|
|
3328
|
-
* @returns Validation
|
|
3341
|
+
* @returns Validation output with resolved markdown for valid TOC
|
|
3329
3342
|
*/
|
|
3330
3343
|
async validate(markdown) {
|
|
3331
3344
|
this.logger.info(
|
|
@@ -3336,8 +3349,10 @@ var TocContentValidator = class extends BaseValidator {
|
|
|
3336
3349
|
"[TocContentValidator] Empty markdown, returning invalid"
|
|
3337
3350
|
);
|
|
3338
3351
|
return {
|
|
3339
|
-
|
|
3352
|
+
isValid: false,
|
|
3340
3353
|
confidence: 1,
|
|
3354
|
+
contentType: "invalid",
|
|
3355
|
+
validTocMarkdown: null,
|
|
3341
3356
|
reason: "Empty content"
|
|
3342
3357
|
};
|
|
3343
3358
|
}
|
|
@@ -3349,52 +3364,106 @@ var TocContentValidator = class extends BaseValidator {
|
|
|
3349
3364
|
this.aggregator
|
|
3350
3365
|
);
|
|
3351
3366
|
this.logger.info(
|
|
3352
|
-
`[TocContentValidator] Result:
|
|
3367
|
+
`[TocContentValidator] Result: isValid=${result.isValid}, contentType=${result.contentType}, confidence=${result.confidence}`
|
|
3353
3368
|
);
|
|
3354
|
-
|
|
3369
|
+
let validTocMarkdown = null;
|
|
3370
|
+
if (result.isValid && result.confidence >= this.confidenceThreshold) {
|
|
3371
|
+
if (result.contentType === "pure_toc") {
|
|
3372
|
+
validTocMarkdown = markdown;
|
|
3373
|
+
} else if (result.contentType === "mixed" && result.extractedTocMarkdown) {
|
|
3374
|
+
validTocMarkdown = result.extractedTocMarkdown;
|
|
3375
|
+
}
|
|
3376
|
+
}
|
|
3377
|
+
return {
|
|
3378
|
+
isValid: result.isValid,
|
|
3379
|
+
confidence: result.confidence,
|
|
3380
|
+
contentType: result.contentType,
|
|
3381
|
+
validTocMarkdown,
|
|
3382
|
+
reason: result.reason
|
|
3383
|
+
};
|
|
3355
3384
|
}
|
|
3356
3385
|
/**
|
|
3357
3386
|
* Check if validation result passes threshold
|
|
3358
3387
|
*
|
|
3359
|
-
* @param result - Validation
|
|
3388
|
+
* @param result - Validation output from validate()
|
|
3360
3389
|
* @returns true if content is valid TOC with sufficient confidence
|
|
3361
3390
|
*/
|
|
3362
3391
|
isValid(result) {
|
|
3363
|
-
return result.
|
|
3392
|
+
return result.isValid && result.confidence >= this.confidenceThreshold;
|
|
3393
|
+
}
|
|
3394
|
+
/**
|
|
3395
|
+
* Get the valid TOC markdown from validation result
|
|
3396
|
+
*
|
|
3397
|
+
* @param result - Validation output from validate()
|
|
3398
|
+
* @returns Valid TOC markdown or null if invalid
|
|
3399
|
+
*/
|
|
3400
|
+
getValidMarkdown(result) {
|
|
3401
|
+
return result.validTocMarkdown;
|
|
3364
3402
|
}
|
|
3365
3403
|
/**
|
|
3366
3404
|
* Build system prompt for TOC content validation
|
|
3367
3405
|
*/
|
|
3368
3406
|
buildSystemPrompt() {
|
|
3369
|
-
return `You are a document structure analyst. Your task is to
|
|
3407
|
+
return `You are a document structure analyst. Your task is to analyze the provided content and classify it into one of four categories.
|
|
3370
3408
|
|
|
3371
|
-
##
|
|
3372
|
-
- A structured list of chapters/sections with corresponding page numbers
|
|
3373
|
-
- Contains hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction", etc.)
|
|
3374
|
-
- Has page number references for each entry (e.g., "..... 10", "... 5", or just a number at the end)
|
|
3375
|
-
- Multiple entries organized by document structure
|
|
3376
|
-
- Main document outline listing major chapters and sections
|
|
3409
|
+
## Content Type Classification:
|
|
3377
3410
|
|
|
3378
|
-
|
|
3411
|
+
### 1. pure_toc
|
|
3412
|
+
The content is ONLY a main document Table of Contents with:
|
|
3413
|
+
- Structured list of chapters/sections with page numbers
|
|
3414
|
+
- Hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction")
|
|
3415
|
+
- Multiple entries (3 or more) organized by document structure
|
|
3416
|
+
- NO resource indices mixed in
|
|
3417
|
+
|
|
3418
|
+
### 2. mixed
|
|
3419
|
+
The content contains BOTH:
|
|
3420
|
+
- A valid main document TOC (chapters/sections with page numbers)
|
|
3421
|
+
- AND resource indices (photo/table/drawing indices)
|
|
3422
|
+
|
|
3423
|
+
When classifying as "mixed", you MUST extract ONLY the main TOC portion and return it in extractedTocMarkdown.
|
|
3424
|
+
|
|
3425
|
+
### 3. resource_only
|
|
3426
|
+
The content contains ONLY resource indices such as:
|
|
3379
3427
|
- Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, Photo Index, List of Figures, List of Photos)
|
|
3380
3428
|
- Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
|
|
3381
3429
|
- Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, Drawing Index, List of Drawings)
|
|
3382
3430
|
- Appendix indices (\uBD80\uB85D \uBAA9\uCC28, Appendix Index)
|
|
3383
|
-
|
|
3431
|
+
|
|
3432
|
+
### 4. invalid
|
|
3433
|
+
The content is none of the above:
|
|
3434
|
+
- Random body text
|
|
3384
3435
|
- Single entries or incomplete lists (fewer than 3 items)
|
|
3385
3436
|
- Reference lists or bibliographies
|
|
3386
3437
|
- Index pages (alphabetical keyword lists)
|
|
3438
|
+
- Unstructured content
|
|
3387
3439
|
|
|
3388
3440
|
## Response Guidelines:
|
|
3389
|
-
- Set
|
|
3441
|
+
- Set isValid to true for "pure_toc" and "mixed" types
|
|
3442
|
+
- Set isValid to false for "resource_only" and "invalid" types
|
|
3390
3443
|
- Set confidence between 0.0 and 1.0 based on your certainty
|
|
3391
|
-
-
|
|
3444
|
+
- For "mixed" type: extractedTocMarkdown MUST contain only the main TOC entries (preserve original formatting)
|
|
3445
|
+
- For other types: extractedTocMarkdown should be null
|
|
3446
|
+
- IMPORTANT: reason MUST be written in English
|
|
3447
|
+
|
|
3448
|
+
## Example Scenarios:
|
|
3449
|
+
|
|
3450
|
+
### Scenario 1: pure_toc
|
|
3451
|
+
Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\uC81C3\uC7A5 \uC870\uC0AC\uACB0\uACFC ..... 15"
|
|
3452
|
+
Output: { isValid: true, contentType: "pure_toc", extractedTocMarkdown: null }
|
|
3453
|
+
|
|
3454
|
+
### Scenario 2: mixed
|
|
3455
|
+
Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\\n\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
|
|
3456
|
+
Output: { isValid: true, contentType: "mixed", extractedTocMarkdown: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5" }
|
|
3457
|
+
|
|
3458
|
+
### Scenario 3: resource_only
|
|
3459
|
+
Input: "\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
|
|
3460
|
+
Output: { isValid: false, contentType: "resource_only", extractedTocMarkdown: null }`;
|
|
3392
3461
|
}
|
|
3393
3462
|
/**
|
|
3394
3463
|
* Build user prompt with markdown content
|
|
3395
3464
|
*/
|
|
3396
3465
|
buildUserPrompt(markdown) {
|
|
3397
|
-
return `
|
|
3466
|
+
return `Analyze the following content and classify it:
|
|
3398
3467
|
|
|
3399
3468
|
${markdown}`;
|
|
3400
3469
|
}
|
|
@@ -3952,9 +4021,20 @@ var DocumentProcessor = class {
|
|
|
3952
4021
|
);
|
|
3953
4022
|
markdown = null;
|
|
3954
4023
|
} else {
|
|
3955
|
-
this.
|
|
3956
|
-
|
|
3957
|
-
|
|
4024
|
+
const validMarkdown = this.tocContentValidator.getValidMarkdown(validation);
|
|
4025
|
+
if (validMarkdown) {
|
|
4026
|
+
if (validation.contentType === "mixed") {
|
|
4027
|
+
this.logger.info(
|
|
4028
|
+
`[DocumentProcessor] Mixed TOC detected, using extracted main TOC (${validMarkdown.length} chars)`
|
|
4029
|
+
);
|
|
4030
|
+
}
|
|
4031
|
+
markdown = validMarkdown;
|
|
4032
|
+
this.logger.info(
|
|
4033
|
+
`[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
|
|
4034
|
+
);
|
|
4035
|
+
} else {
|
|
4036
|
+
markdown = null;
|
|
4037
|
+
}
|
|
3958
4038
|
}
|
|
3959
4039
|
} catch (error) {
|
|
3960
4040
|
if (error instanceof TocNotFoundError) {
|
|
@@ -3970,10 +4050,13 @@ var DocumentProcessor = class {
|
|
|
3970
4050
|
const totalPages = Object.keys(doclingDoc.pages).length;
|
|
3971
4051
|
markdown = await this.visionTocExtractor.extract(totalPages);
|
|
3972
4052
|
if (!markdown) {
|
|
3973
|
-
|
|
3974
|
-
|
|
4053
|
+
const reason = "Both rule-based search and vision fallback failed to locate TOC";
|
|
4054
|
+
this.logger.error(
|
|
4055
|
+
`[DocumentProcessor] TOC extraction failed: ${reason}`
|
|
4056
|
+
);
|
|
4057
|
+
throw new TocNotFoundError(
|
|
4058
|
+
`Table of contents not found in the document. ${reason}.`
|
|
3975
4059
|
);
|
|
3976
|
-
return [];
|
|
3977
4060
|
}
|
|
3978
4061
|
this.logger.info(
|
|
3979
4062
|
`[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
|
|
@@ -3981,6 +4064,11 @@ var DocumentProcessor = class {
|
|
|
3981
4064
|
}
|
|
3982
4065
|
const tocResult = await this.tocExtractor.extract(markdown);
|
|
3983
4066
|
this.usageAggregator.track(tocResult.usage);
|
|
4067
|
+
if (tocResult.entries.length === 0) {
|
|
4068
|
+
const reason = "TOC area was detected but LLM could not extract any structured entries";
|
|
4069
|
+
this.logger.error(`[DocumentProcessor] TOC extraction failed: ${reason}`);
|
|
4070
|
+
throw new TocNotFoundError(`${reason}.`);
|
|
4071
|
+
}
|
|
3984
4072
|
this.logger.info(
|
|
3985
4073
|
`[DocumentProcessor] Extracted ${tocResult.entries.length} top-level TOC entries`
|
|
3986
4074
|
);
|
|
@@ -4220,21 +4308,14 @@ var DocumentProcessor = class {
|
|
|
4220
4308
|
* Convert chapters and link resources
|
|
4221
4309
|
*
|
|
4222
4310
|
* Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
|
|
4223
|
-
*
|
|
4311
|
+
* Throws TocNotFoundError if TOC entries are empty (defensive assertion).
|
|
4224
4312
|
*/
|
|
4225
4313
|
async convertChapters(doclingDoc, tocEntries, pageRangeMap, images, tables, footnotes) {
|
|
4226
4314
|
this.logger.info("[DocumentProcessor] Converting chapters...");
|
|
4227
4315
|
if (tocEntries.length === 0) {
|
|
4228
|
-
|
|
4229
|
-
|
|
4230
|
-
);
|
|
4231
|
-
return this.createFallbackChapter(
|
|
4232
|
-
doclingDoc,
|
|
4233
|
-
pageRangeMap,
|
|
4234
|
-
images,
|
|
4235
|
-
tables,
|
|
4236
|
-
footnotes
|
|
4237
|
-
);
|
|
4316
|
+
const reason = "Cannot convert chapters without TOC entries";
|
|
4317
|
+
this.logger.error(`[DocumentProcessor] ${reason}`);
|
|
4318
|
+
throw new TocNotFoundError(reason);
|
|
4238
4319
|
}
|
|
4239
4320
|
const chapters = this.chapterConverter.convert(
|
|
4240
4321
|
tocEntries,
|
|
@@ -4249,48 +4330,6 @@ var DocumentProcessor = class {
|
|
|
4249
4330
|
);
|
|
4250
4331
|
return chapters;
|
|
4251
4332
|
}
|
|
4252
|
-
/**
|
|
4253
|
-
* Create a fallback chapter when TOC is not available
|
|
4254
|
-
*
|
|
4255
|
-
* Creates a single "Document" chapter containing all text blocks,
|
|
4256
|
-
* images, tables, and footnotes from the document.
|
|
4257
|
-
*/
|
|
4258
|
-
createFallbackChapter(doclingDoc, pageRangeMap, images, tables, footnotes) {
|
|
4259
|
-
const textBlocks = doclingDoc.texts.filter(
|
|
4260
|
-
(item) => item.label !== "footnote" && this.textCleaner.isValidText(item.text)
|
|
4261
|
-
).map((item) => ({
|
|
4262
|
-
text: this.textCleaner.normalize(item.text),
|
|
4263
|
-
pdfPageNo: item.prov?.[0]?.page_no ?? 1
|
|
4264
|
-
}));
|
|
4265
|
-
if (textBlocks.length === 0 && images.length === 0 && tables.length === 0 && footnotes.length === 0) {
|
|
4266
|
-
this.logger.info(
|
|
4267
|
-
"[DocumentProcessor] No content found for fallback chapter"
|
|
4268
|
-
);
|
|
4269
|
-
return [];
|
|
4270
|
-
}
|
|
4271
|
-
const firstPdfPage = Math.min(
|
|
4272
|
-
...Object.keys(pageRangeMap).map(Number).filter((n) => !isNaN(n)),
|
|
4273
|
-
1
|
|
4274
|
-
);
|
|
4275
|
-
const firstPageRange = pageRangeMap[firstPdfPage];
|
|
4276
|
-
const pageNo = firstPageRange?.startPageNo ?? 1;
|
|
4277
|
-
const fallbackChapter = {
|
|
4278
|
-
id: this.idGenerator.generateChapterId(),
|
|
4279
|
-
originTitle: "Document",
|
|
4280
|
-
title: "Document",
|
|
4281
|
-
pageNo,
|
|
4282
|
-
level: 1,
|
|
4283
|
-
textBlocks,
|
|
4284
|
-
imageIds: images.map((img) => img.id),
|
|
4285
|
-
tableIds: tables.map((tbl) => tbl.id),
|
|
4286
|
-
footnoteIds: footnotes.map((ftn) => ftn.id),
|
|
4287
|
-
children: []
|
|
4288
|
-
};
|
|
4289
|
-
this.logger.info(
|
|
4290
|
-
`[DocumentProcessor] Created fallback chapter with ${textBlocks.length} text blocks, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
|
|
4291
|
-
);
|
|
4292
|
-
return [fallbackChapter];
|
|
4293
|
-
}
|
|
4294
4333
|
};
|
|
4295
4334
|
// Annotate the CommonJS export names for ESM import in node:
|
|
4296
4335
|
0 && (module.exports = {
|