@heripo/document-processor 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +104 -23
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +34 -5
- package/dist/index.d.ts +34 -5
- package/dist/index.js +104 -23
- package/dist/index.js.map +1 -1
- package/package.json +7 -7
package/dist/index.cjs
CHANGED
|
@@ -2219,7 +2219,15 @@ var VisionTocExtractor = class extends VisionLLMComponent {
|
|
|
2219
2219
|
*/
|
|
2220
2220
|
async extractFromBatch(startPage, endPage) {
|
|
2221
2221
|
this.log("info", `Extracting from pages ${startPage}-${endPage}`);
|
|
2222
|
+
this.log(
|
|
2223
|
+
"info",
|
|
2224
|
+
`Preparing images for vision analysis. This can be very slow (10+ minutes, sometimes 30+ minutes) depending on batch size and image resolution.`
|
|
2225
|
+
);
|
|
2222
2226
|
const imageContents = this.loadPageImages(startPage, endPage);
|
|
2227
|
+
this.log(
|
|
2228
|
+
"info",
|
|
2229
|
+
`Calling vision LLM for TOC extraction (pages ${startPage}-${endPage})`
|
|
2230
|
+
);
|
|
2223
2231
|
const result = await LLMCaller.callVision({
|
|
2224
2232
|
schema: VisionTocExtractionSchema,
|
|
2225
2233
|
messages: [
|
|
@@ -2242,6 +2250,10 @@ var VisionTocExtractor = class extends VisionLLMComponent {
|
|
|
2242
2250
|
component: "VisionTocExtractor",
|
|
2243
2251
|
phase: "extraction"
|
|
2244
2252
|
});
|
|
2253
|
+
this.log(
|
|
2254
|
+
"info",
|
|
2255
|
+
`Vision LLM call completed (pages ${startPage}-${endPage})`
|
|
2256
|
+
);
|
|
2245
2257
|
this.trackUsage(result.usage);
|
|
2246
2258
|
return result.output;
|
|
2247
2259
|
}
|
|
@@ -3304,9 +3316,11 @@ var BaseValidator = class extends TextLLMComponent {
|
|
|
3304
3316
|
// src/validators/toc-content-validator.ts
|
|
3305
3317
|
var import_zod5 = require("zod");
|
|
3306
3318
|
var TocContentValidationSchema = import_zod5.z.object({
|
|
3307
|
-
|
|
3319
|
+
isValid: import_zod5.z.boolean().describe("Whether valid main document TOC was found"),
|
|
3308
3320
|
confidence: import_zod5.z.number().min(0).max(1).describe("Confidence score between 0 and 1"),
|
|
3309
|
-
|
|
3321
|
+
contentType: import_zod5.z.enum(["pure_toc", "mixed", "resource_only", "invalid"]).describe("Type of content detected"),
|
|
3322
|
+
extractedTocMarkdown: import_zod5.z.string().nullable().describe("Extracted main TOC markdown when mixed; null otherwise"),
|
|
3323
|
+
reason: import_zod5.z.string().describe("Brief explanation in English")
|
|
3310
3324
|
});
|
|
3311
3325
|
var TocContentValidator = class extends BaseValidator {
|
|
3312
3326
|
confidenceThreshold;
|
|
@@ -3325,7 +3339,7 @@ var TocContentValidator = class extends BaseValidator {
|
|
|
3325
3339
|
* Validate if the markdown content is a table of contents
|
|
3326
3340
|
*
|
|
3327
3341
|
* @param markdown - Markdown content to validate
|
|
3328
|
-
* @returns Validation
|
|
3342
|
+
* @returns Validation output with resolved markdown for valid TOC
|
|
3329
3343
|
*/
|
|
3330
3344
|
async validate(markdown) {
|
|
3331
3345
|
this.logger.info(
|
|
@@ -3336,8 +3350,10 @@ var TocContentValidator = class extends BaseValidator {
|
|
|
3336
3350
|
"[TocContentValidator] Empty markdown, returning invalid"
|
|
3337
3351
|
);
|
|
3338
3352
|
return {
|
|
3339
|
-
|
|
3353
|
+
isValid: false,
|
|
3340
3354
|
confidence: 1,
|
|
3355
|
+
contentType: "invalid",
|
|
3356
|
+
validTocMarkdown: null,
|
|
3341
3357
|
reason: "Empty content"
|
|
3342
3358
|
};
|
|
3343
3359
|
}
|
|
@@ -3349,52 +3365,106 @@ var TocContentValidator = class extends BaseValidator {
|
|
|
3349
3365
|
this.aggregator
|
|
3350
3366
|
);
|
|
3351
3367
|
this.logger.info(
|
|
3352
|
-
`[TocContentValidator] Result:
|
|
3368
|
+
`[TocContentValidator] Result: isValid=${result.isValid}, contentType=${result.contentType}, confidence=${result.confidence}`
|
|
3353
3369
|
);
|
|
3354
|
-
|
|
3370
|
+
let validTocMarkdown = null;
|
|
3371
|
+
if (result.isValid && result.confidence >= this.confidenceThreshold) {
|
|
3372
|
+
if (result.contentType === "pure_toc") {
|
|
3373
|
+
validTocMarkdown = markdown;
|
|
3374
|
+
} else if (result.contentType === "mixed" && result.extractedTocMarkdown) {
|
|
3375
|
+
validTocMarkdown = result.extractedTocMarkdown;
|
|
3376
|
+
}
|
|
3377
|
+
}
|
|
3378
|
+
return {
|
|
3379
|
+
isValid: result.isValid,
|
|
3380
|
+
confidence: result.confidence,
|
|
3381
|
+
contentType: result.contentType,
|
|
3382
|
+
validTocMarkdown,
|
|
3383
|
+
reason: result.reason
|
|
3384
|
+
};
|
|
3355
3385
|
}
|
|
3356
3386
|
/**
|
|
3357
3387
|
* Check if validation result passes threshold
|
|
3358
3388
|
*
|
|
3359
|
-
* @param result - Validation
|
|
3389
|
+
* @param result - Validation output from validate()
|
|
3360
3390
|
* @returns true if content is valid TOC with sufficient confidence
|
|
3361
3391
|
*/
|
|
3362
3392
|
isValid(result) {
|
|
3363
|
-
return result.
|
|
3393
|
+
return result.isValid && result.confidence >= this.confidenceThreshold;
|
|
3394
|
+
}
|
|
3395
|
+
/**
|
|
3396
|
+
* Get the valid TOC markdown from validation result
|
|
3397
|
+
*
|
|
3398
|
+
* @param result - Validation output from validate()
|
|
3399
|
+
* @returns Valid TOC markdown or null if invalid
|
|
3400
|
+
*/
|
|
3401
|
+
getValidMarkdown(result) {
|
|
3402
|
+
return result.validTocMarkdown;
|
|
3364
3403
|
}
|
|
3365
3404
|
/**
|
|
3366
3405
|
* Build system prompt for TOC content validation
|
|
3367
3406
|
*/
|
|
3368
3407
|
buildSystemPrompt() {
|
|
3369
|
-
return `You are a document structure analyst. Your task is to
|
|
3408
|
+
return `You are a document structure analyst. Your task is to analyze the provided content and classify it into one of four categories.
|
|
3409
|
+
|
|
3410
|
+
## Content Type Classification:
|
|
3411
|
+
|
|
3412
|
+
### 1. pure_toc
|
|
3413
|
+
The content is ONLY a main document Table of Contents with:
|
|
3414
|
+
- Structured list of chapters/sections with page numbers
|
|
3415
|
+
- Hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction")
|
|
3416
|
+
- Multiple entries (3 or more) organized by document structure
|
|
3417
|
+
- NO resource indices mixed in
|
|
3418
|
+
|
|
3419
|
+
### 2. mixed
|
|
3420
|
+
The content contains BOTH:
|
|
3421
|
+
- A valid main document TOC (chapters/sections with page numbers)
|
|
3422
|
+
- AND resource indices (photo/table/drawing indices)
|
|
3370
3423
|
|
|
3371
|
-
|
|
3372
|
-
- A structured list of chapters/sections with corresponding page numbers
|
|
3373
|
-
- Contains hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction", etc.)
|
|
3374
|
-
- Has page number references for each entry (e.g., "..... 10", "... 5", or just a number at the end)
|
|
3375
|
-
- Multiple entries organized by document structure
|
|
3376
|
-
- Main document outline listing major chapters and sections
|
|
3424
|
+
When classifying as "mixed", you MUST extract ONLY the main TOC portion and return it in extractedTocMarkdown.
|
|
3377
3425
|
|
|
3378
|
-
|
|
3426
|
+
### 3. resource_only
|
|
3427
|
+
The content contains ONLY resource indices such as:
|
|
3379
3428
|
- Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, Photo Index, List of Figures, List of Photos)
|
|
3380
3429
|
- Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
|
|
3381
3430
|
- Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, Drawing Index, List of Drawings)
|
|
3382
3431
|
- Appendix indices (\uBD80\uB85D \uBAA9\uCC28, Appendix Index)
|
|
3383
|
-
|
|
3432
|
+
|
|
3433
|
+
### 4. invalid
|
|
3434
|
+
The content is none of the above:
|
|
3435
|
+
- Random body text
|
|
3384
3436
|
- Single entries or incomplete lists (fewer than 3 items)
|
|
3385
3437
|
- Reference lists or bibliographies
|
|
3386
3438
|
- Index pages (alphabetical keyword lists)
|
|
3439
|
+
- Unstructured content
|
|
3387
3440
|
|
|
3388
3441
|
## Response Guidelines:
|
|
3389
|
-
- Set
|
|
3442
|
+
- Set isValid to true for "pure_toc" and "mixed" types
|
|
3443
|
+
- Set isValid to false for "resource_only" and "invalid" types
|
|
3390
3444
|
- Set confidence between 0.0 and 1.0 based on your certainty
|
|
3391
|
-
-
|
|
3445
|
+
- For "mixed" type: extractedTocMarkdown MUST contain only the main TOC entries (preserve original formatting)
|
|
3446
|
+
- For other types: extractedTocMarkdown should be null
|
|
3447
|
+
- IMPORTANT: reason MUST be written in English
|
|
3448
|
+
|
|
3449
|
+
## Example Scenarios:
|
|
3450
|
+
|
|
3451
|
+
### Scenario 1: pure_toc
|
|
3452
|
+
Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\uC81C3\uC7A5 \uC870\uC0AC\uACB0\uACFC ..... 15"
|
|
3453
|
+
Output: { isValid: true, contentType: "pure_toc", extractedTocMarkdown: null }
|
|
3454
|
+
|
|
3455
|
+
### Scenario 2: mixed
|
|
3456
|
+
Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\\n\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
|
|
3457
|
+
Output: { isValid: true, contentType: "mixed", extractedTocMarkdown: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5" }
|
|
3458
|
+
|
|
3459
|
+
### Scenario 3: resource_only
|
|
3460
|
+
Input: "\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
|
|
3461
|
+
Output: { isValid: false, contentType: "resource_only", extractedTocMarkdown: null }`;
|
|
3392
3462
|
}
|
|
3393
3463
|
/**
|
|
3394
3464
|
* Build user prompt with markdown content
|
|
3395
3465
|
*/
|
|
3396
3466
|
buildUserPrompt(markdown) {
|
|
3397
|
-
return `
|
|
3467
|
+
return `Analyze the following content and classify it:
|
|
3398
3468
|
|
|
3399
3469
|
${markdown}`;
|
|
3400
3470
|
}
|
|
@@ -3952,9 +4022,20 @@ var DocumentProcessor = class {
|
|
|
3952
4022
|
);
|
|
3953
4023
|
markdown = null;
|
|
3954
4024
|
} else {
|
|
3955
|
-
this.
|
|
3956
|
-
|
|
3957
|
-
|
|
4025
|
+
const validMarkdown = this.tocContentValidator.getValidMarkdown(validation);
|
|
4026
|
+
if (validMarkdown) {
|
|
4027
|
+
if (validation.contentType === "mixed") {
|
|
4028
|
+
this.logger.info(
|
|
4029
|
+
`[DocumentProcessor] Mixed TOC detected, using extracted main TOC (${validMarkdown.length} chars)`
|
|
4030
|
+
);
|
|
4031
|
+
}
|
|
4032
|
+
markdown = validMarkdown;
|
|
4033
|
+
this.logger.info(
|
|
4034
|
+
`[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
|
|
4035
|
+
);
|
|
4036
|
+
} else {
|
|
4037
|
+
markdown = null;
|
|
4038
|
+
}
|
|
3958
4039
|
}
|
|
3959
4040
|
} catch (error) {
|
|
3960
4041
|
if (error instanceof TocNotFoundError) {
|