@heripo/document-processor 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +104 -23
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +34 -5
- package/dist/index.d.ts +34 -5
- package/dist/index.js +104 -23
- package/dist/index.js.map +1 -1
- package/package.json +7 -7
package/dist/index.d.cts
CHANGED
|
@@ -1187,15 +1187,36 @@ declare abstract class BaseValidator<TSchema extends z.ZodType, TResult = z.infe
|
|
|
1187
1187
|
}>;
|
|
1188
1188
|
}
|
|
1189
1189
|
|
|
1190
|
+
/**
|
|
1191
|
+
* Content type for TOC validation
|
|
1192
|
+
*/
|
|
1193
|
+
type TocContentType = 'pure_toc' | 'mixed' | 'resource_only' | 'invalid';
|
|
1190
1194
|
/**
|
|
1191
1195
|
* Schema for TOC content validation response
|
|
1192
1196
|
*/
|
|
1193
1197
|
declare const TocContentValidationSchema: z.ZodObject<{
|
|
1194
|
-
|
|
1198
|
+
isValid: z.ZodBoolean;
|
|
1195
1199
|
confidence: z.ZodNumber;
|
|
1200
|
+
contentType: z.ZodEnum<{
|
|
1201
|
+
pure_toc: "pure_toc";
|
|
1202
|
+
mixed: "mixed";
|
|
1203
|
+
resource_only: "resource_only";
|
|
1204
|
+
invalid: "invalid";
|
|
1205
|
+
}>;
|
|
1206
|
+
extractedTocMarkdown: z.ZodNullable<z.ZodString>;
|
|
1196
1207
|
reason: z.ZodString;
|
|
1197
1208
|
}, z.core.$strip>;
|
|
1198
1209
|
type TocContentValidationResult = z.infer<typeof TocContentValidationSchema>;
|
|
1210
|
+
/**
|
|
1211
|
+
* Output type for TOC validation with resolved markdown
|
|
1212
|
+
*/
|
|
1213
|
+
interface TocValidationOutput {
|
|
1214
|
+
isValid: boolean;
|
|
1215
|
+
confidence: number;
|
|
1216
|
+
contentType: TocContentType;
|
|
1217
|
+
validTocMarkdown: string | null;
|
|
1218
|
+
reason: string;
|
|
1219
|
+
}
|
|
1199
1220
|
/**
|
|
1200
1221
|
* Options for TocContentValidator
|
|
1201
1222
|
*/
|
|
@@ -1210,6 +1231,7 @@ interface TocContentValidatorOptions extends BaseValidatorOptions {
|
|
|
1210
1231
|
*
|
|
1211
1232
|
* Uses LLM to validate whether extracted markdown content is actually a TOC.
|
|
1212
1233
|
* This is a semantic validation, not structural validation.
|
|
1234
|
+
* Supports mixed content extraction where main TOC is combined with resource indices.
|
|
1213
1235
|
*/
|
|
1214
1236
|
declare class TocContentValidator extends BaseValidator<typeof TocContentValidationSchema, TocContentValidationResult> {
|
|
1215
1237
|
private readonly confidenceThreshold;
|
|
@@ -1218,16 +1240,23 @@ declare class TocContentValidator extends BaseValidator<typeof TocContentValidat
|
|
|
1218
1240
|
* Validate if the markdown content is a table of contents
|
|
1219
1241
|
*
|
|
1220
1242
|
* @param markdown - Markdown content to validate
|
|
1221
|
-
* @returns Validation
|
|
1243
|
+
* @returns Validation output with resolved markdown for valid TOC
|
|
1222
1244
|
*/
|
|
1223
|
-
validate(markdown: string): Promise<
|
|
1245
|
+
validate(markdown: string): Promise<TocValidationOutput>;
|
|
1224
1246
|
/**
|
|
1225
1247
|
* Check if validation result passes threshold
|
|
1226
1248
|
*
|
|
1227
|
-
* @param result - Validation
|
|
1249
|
+
* @param result - Validation output from validate()
|
|
1228
1250
|
* @returns true if content is valid TOC with sufficient confidence
|
|
1229
1251
|
*/
|
|
1230
|
-
isValid(result:
|
|
1252
|
+
isValid(result: TocValidationOutput): boolean;
|
|
1253
|
+
/**
|
|
1254
|
+
* Get the valid TOC markdown from validation result
|
|
1255
|
+
*
|
|
1256
|
+
* @param result - Validation output from validate()
|
|
1257
|
+
* @returns Valid TOC markdown or null if invalid
|
|
1258
|
+
*/
|
|
1259
|
+
getValidMarkdown(result: TocValidationOutput): string | null;
|
|
1231
1260
|
/**
|
|
1232
1261
|
* Build system prompt for TOC content validation
|
|
1233
1262
|
*/
|
package/dist/index.d.ts
CHANGED
|
@@ -1187,15 +1187,36 @@ declare abstract class BaseValidator<TSchema extends z.ZodType, TResult = z.infe
|
|
|
1187
1187
|
}>;
|
|
1188
1188
|
}
|
|
1189
1189
|
|
|
1190
|
+
/**
|
|
1191
|
+
* Content type for TOC validation
|
|
1192
|
+
*/
|
|
1193
|
+
type TocContentType = 'pure_toc' | 'mixed' | 'resource_only' | 'invalid';
|
|
1190
1194
|
/**
|
|
1191
1195
|
* Schema for TOC content validation response
|
|
1192
1196
|
*/
|
|
1193
1197
|
declare const TocContentValidationSchema: z.ZodObject<{
|
|
1194
|
-
|
|
1198
|
+
isValid: z.ZodBoolean;
|
|
1195
1199
|
confidence: z.ZodNumber;
|
|
1200
|
+
contentType: z.ZodEnum<{
|
|
1201
|
+
pure_toc: "pure_toc";
|
|
1202
|
+
mixed: "mixed";
|
|
1203
|
+
resource_only: "resource_only";
|
|
1204
|
+
invalid: "invalid";
|
|
1205
|
+
}>;
|
|
1206
|
+
extractedTocMarkdown: z.ZodNullable<z.ZodString>;
|
|
1196
1207
|
reason: z.ZodString;
|
|
1197
1208
|
}, z.core.$strip>;
|
|
1198
1209
|
type TocContentValidationResult = z.infer<typeof TocContentValidationSchema>;
|
|
1210
|
+
/**
|
|
1211
|
+
* Output type for TOC validation with resolved markdown
|
|
1212
|
+
*/
|
|
1213
|
+
interface TocValidationOutput {
|
|
1214
|
+
isValid: boolean;
|
|
1215
|
+
confidence: number;
|
|
1216
|
+
contentType: TocContentType;
|
|
1217
|
+
validTocMarkdown: string | null;
|
|
1218
|
+
reason: string;
|
|
1219
|
+
}
|
|
1199
1220
|
/**
|
|
1200
1221
|
* Options for TocContentValidator
|
|
1201
1222
|
*/
|
|
@@ -1210,6 +1231,7 @@ interface TocContentValidatorOptions extends BaseValidatorOptions {
|
|
|
1210
1231
|
*
|
|
1211
1232
|
* Uses LLM to validate whether extracted markdown content is actually a TOC.
|
|
1212
1233
|
* This is a semantic validation, not structural validation.
|
|
1234
|
+
* Supports mixed content extraction where main TOC is combined with resource indices.
|
|
1213
1235
|
*/
|
|
1214
1236
|
declare class TocContentValidator extends BaseValidator<typeof TocContentValidationSchema, TocContentValidationResult> {
|
|
1215
1237
|
private readonly confidenceThreshold;
|
|
@@ -1218,16 +1240,23 @@ declare class TocContentValidator extends BaseValidator<typeof TocContentValidat
|
|
|
1218
1240
|
* Validate if the markdown content is a table of contents
|
|
1219
1241
|
*
|
|
1220
1242
|
* @param markdown - Markdown content to validate
|
|
1221
|
-
* @returns Validation
|
|
1243
|
+
* @returns Validation output with resolved markdown for valid TOC
|
|
1222
1244
|
*/
|
|
1223
|
-
validate(markdown: string): Promise<
|
|
1245
|
+
validate(markdown: string): Promise<TocValidationOutput>;
|
|
1224
1246
|
/**
|
|
1225
1247
|
* Check if validation result passes threshold
|
|
1226
1248
|
*
|
|
1227
|
-
* @param result - Validation
|
|
1249
|
+
* @param result - Validation output from validate()
|
|
1228
1250
|
* @returns true if content is valid TOC with sufficient confidence
|
|
1229
1251
|
*/
|
|
1230
|
-
isValid(result:
|
|
1252
|
+
isValid(result: TocValidationOutput): boolean;
|
|
1253
|
+
/**
|
|
1254
|
+
* Get the valid TOC markdown from validation result
|
|
1255
|
+
*
|
|
1256
|
+
* @param result - Validation output from validate()
|
|
1257
|
+
* @returns Valid TOC markdown or null if invalid
|
|
1258
|
+
*/
|
|
1259
|
+
getValidMarkdown(result: TocValidationOutput): string | null;
|
|
1231
1260
|
/**
|
|
1232
1261
|
* Build system prompt for TOC content validation
|
|
1233
1262
|
*/
|
package/dist/index.js
CHANGED
|
@@ -2157,7 +2157,15 @@ var VisionTocExtractor = class extends VisionLLMComponent {
|
|
|
2157
2157
|
*/
|
|
2158
2158
|
async extractFromBatch(startPage, endPage) {
|
|
2159
2159
|
this.log("info", `Extracting from pages ${startPage}-${endPage}`);
|
|
2160
|
+
this.log(
|
|
2161
|
+
"info",
|
|
2162
|
+
`Preparing images for vision analysis. This can be very slow (10+ minutes, sometimes 30+ minutes) depending on batch size and image resolution.`
|
|
2163
|
+
);
|
|
2160
2164
|
const imageContents = this.loadPageImages(startPage, endPage);
|
|
2165
|
+
this.log(
|
|
2166
|
+
"info",
|
|
2167
|
+
`Calling vision LLM for TOC extraction (pages ${startPage}-${endPage})`
|
|
2168
|
+
);
|
|
2161
2169
|
const result = await LLMCaller.callVision({
|
|
2162
2170
|
schema: VisionTocExtractionSchema,
|
|
2163
2171
|
messages: [
|
|
@@ -2180,6 +2188,10 @@ var VisionTocExtractor = class extends VisionLLMComponent {
|
|
|
2180
2188
|
component: "VisionTocExtractor",
|
|
2181
2189
|
phase: "extraction"
|
|
2182
2190
|
});
|
|
2191
|
+
this.log(
|
|
2192
|
+
"info",
|
|
2193
|
+
`Vision LLM call completed (pages ${startPage}-${endPage})`
|
|
2194
|
+
);
|
|
2183
2195
|
this.trackUsage(result.usage);
|
|
2184
2196
|
return result.output;
|
|
2185
2197
|
}
|
|
@@ -3242,9 +3254,11 @@ var BaseValidator = class extends TextLLMComponent {
|
|
|
3242
3254
|
// src/validators/toc-content-validator.ts
|
|
3243
3255
|
import { z as z5 } from "zod";
|
|
3244
3256
|
var TocContentValidationSchema = z5.object({
|
|
3245
|
-
|
|
3257
|
+
isValid: z5.boolean().describe("Whether valid main document TOC was found"),
|
|
3246
3258
|
confidence: z5.number().min(0).max(1).describe("Confidence score between 0 and 1"),
|
|
3247
|
-
|
|
3259
|
+
contentType: z5.enum(["pure_toc", "mixed", "resource_only", "invalid"]).describe("Type of content detected"),
|
|
3260
|
+
extractedTocMarkdown: z5.string().nullable().describe("Extracted main TOC markdown when mixed; null otherwise"),
|
|
3261
|
+
reason: z5.string().describe("Brief explanation in English")
|
|
3248
3262
|
});
|
|
3249
3263
|
var TocContentValidator = class extends BaseValidator {
|
|
3250
3264
|
confidenceThreshold;
|
|
@@ -3263,7 +3277,7 @@ var TocContentValidator = class extends BaseValidator {
|
|
|
3263
3277
|
* Validate if the markdown content is a table of contents
|
|
3264
3278
|
*
|
|
3265
3279
|
* @param markdown - Markdown content to validate
|
|
3266
|
-
* @returns Validation
|
|
3280
|
+
* @returns Validation output with resolved markdown for valid TOC
|
|
3267
3281
|
*/
|
|
3268
3282
|
async validate(markdown) {
|
|
3269
3283
|
this.logger.info(
|
|
@@ -3274,8 +3288,10 @@ var TocContentValidator = class extends BaseValidator {
|
|
|
3274
3288
|
"[TocContentValidator] Empty markdown, returning invalid"
|
|
3275
3289
|
);
|
|
3276
3290
|
return {
|
|
3277
|
-
|
|
3291
|
+
isValid: false,
|
|
3278
3292
|
confidence: 1,
|
|
3293
|
+
contentType: "invalid",
|
|
3294
|
+
validTocMarkdown: null,
|
|
3279
3295
|
reason: "Empty content"
|
|
3280
3296
|
};
|
|
3281
3297
|
}
|
|
@@ -3287,52 +3303,106 @@ var TocContentValidator = class extends BaseValidator {
|
|
|
3287
3303
|
this.aggregator
|
|
3288
3304
|
);
|
|
3289
3305
|
this.logger.info(
|
|
3290
|
-
`[TocContentValidator] Result:
|
|
3306
|
+
`[TocContentValidator] Result: isValid=${result.isValid}, contentType=${result.contentType}, confidence=${result.confidence}`
|
|
3291
3307
|
);
|
|
3292
|
-
|
|
3308
|
+
let validTocMarkdown = null;
|
|
3309
|
+
if (result.isValid && result.confidence >= this.confidenceThreshold) {
|
|
3310
|
+
if (result.contentType === "pure_toc") {
|
|
3311
|
+
validTocMarkdown = markdown;
|
|
3312
|
+
} else if (result.contentType === "mixed" && result.extractedTocMarkdown) {
|
|
3313
|
+
validTocMarkdown = result.extractedTocMarkdown;
|
|
3314
|
+
}
|
|
3315
|
+
}
|
|
3316
|
+
return {
|
|
3317
|
+
isValid: result.isValid,
|
|
3318
|
+
confidence: result.confidence,
|
|
3319
|
+
contentType: result.contentType,
|
|
3320
|
+
validTocMarkdown,
|
|
3321
|
+
reason: result.reason
|
|
3322
|
+
};
|
|
3293
3323
|
}
|
|
3294
3324
|
/**
|
|
3295
3325
|
* Check if validation result passes threshold
|
|
3296
3326
|
*
|
|
3297
|
-
* @param result - Validation
|
|
3327
|
+
* @param result - Validation output from validate()
|
|
3298
3328
|
* @returns true if content is valid TOC with sufficient confidence
|
|
3299
3329
|
*/
|
|
3300
3330
|
isValid(result) {
|
|
3301
|
-
return result.
|
|
3331
|
+
return result.isValid && result.confidence >= this.confidenceThreshold;
|
|
3332
|
+
}
|
|
3333
|
+
/**
|
|
3334
|
+
* Get the valid TOC markdown from validation result
|
|
3335
|
+
*
|
|
3336
|
+
* @param result - Validation output from validate()
|
|
3337
|
+
* @returns Valid TOC markdown or null if invalid
|
|
3338
|
+
*/
|
|
3339
|
+
getValidMarkdown(result) {
|
|
3340
|
+
return result.validTocMarkdown;
|
|
3302
3341
|
}
|
|
3303
3342
|
/**
|
|
3304
3343
|
* Build system prompt for TOC content validation
|
|
3305
3344
|
*/
|
|
3306
3345
|
buildSystemPrompt() {
|
|
3307
|
-
return `You are a document structure analyst. Your task is to
|
|
3346
|
+
return `You are a document structure analyst. Your task is to analyze the provided content and classify it into one of four categories.
|
|
3347
|
+
|
|
3348
|
+
## Content Type Classification:
|
|
3349
|
+
|
|
3350
|
+
### 1. pure_toc
|
|
3351
|
+
The content is ONLY a main document Table of Contents with:
|
|
3352
|
+
- Structured list of chapters/sections with page numbers
|
|
3353
|
+
- Hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction")
|
|
3354
|
+
- Multiple entries (3 or more) organized by document structure
|
|
3355
|
+
- NO resource indices mixed in
|
|
3356
|
+
|
|
3357
|
+
### 2. mixed
|
|
3358
|
+
The content contains BOTH:
|
|
3359
|
+
- A valid main document TOC (chapters/sections with page numbers)
|
|
3360
|
+
- AND resource indices (photo/table/drawing indices)
|
|
3308
3361
|
|
|
3309
|
-
|
|
3310
|
-
- A structured list of chapters/sections with corresponding page numbers
|
|
3311
|
-
- Contains hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction", etc.)
|
|
3312
|
-
- Has page number references for each entry (e.g., "..... 10", "... 5", or just a number at the end)
|
|
3313
|
-
- Multiple entries organized by document structure
|
|
3314
|
-
- Main document outline listing major chapters and sections
|
|
3362
|
+
When classifying as "mixed", you MUST extract ONLY the main TOC portion and return it in extractedTocMarkdown.
|
|
3315
3363
|
|
|
3316
|
-
|
|
3364
|
+
### 3. resource_only
|
|
3365
|
+
The content contains ONLY resource indices such as:
|
|
3317
3366
|
- Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, Photo Index, List of Figures, List of Photos)
|
|
3318
3367
|
- Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
|
|
3319
3368
|
- Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, Drawing Index, List of Drawings)
|
|
3320
3369
|
- Appendix indices (\uBD80\uB85D \uBAA9\uCC28, Appendix Index)
|
|
3321
|
-
|
|
3370
|
+
|
|
3371
|
+
### 4. invalid
|
|
3372
|
+
The content is none of the above:
|
|
3373
|
+
- Random body text
|
|
3322
3374
|
- Single entries or incomplete lists (fewer than 3 items)
|
|
3323
3375
|
- Reference lists or bibliographies
|
|
3324
3376
|
- Index pages (alphabetical keyword lists)
|
|
3377
|
+
- Unstructured content
|
|
3325
3378
|
|
|
3326
3379
|
## Response Guidelines:
|
|
3327
|
-
- Set
|
|
3380
|
+
- Set isValid to true for "pure_toc" and "mixed" types
|
|
3381
|
+
- Set isValid to false for "resource_only" and "invalid" types
|
|
3328
3382
|
- Set confidence between 0.0 and 1.0 based on your certainty
|
|
3329
|
-
-
|
|
3383
|
+
- For "mixed" type: extractedTocMarkdown MUST contain only the main TOC entries (preserve original formatting)
|
|
3384
|
+
- For other types: extractedTocMarkdown should be null
|
|
3385
|
+
- IMPORTANT: reason MUST be written in English
|
|
3386
|
+
|
|
3387
|
+
## Example Scenarios:
|
|
3388
|
+
|
|
3389
|
+
### Scenario 1: pure_toc
|
|
3390
|
+
Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\uC81C3\uC7A5 \uC870\uC0AC\uACB0\uACFC ..... 15"
|
|
3391
|
+
Output: { isValid: true, contentType: "pure_toc", extractedTocMarkdown: null }
|
|
3392
|
+
|
|
3393
|
+
### Scenario 2: mixed
|
|
3394
|
+
Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\\n\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
|
|
3395
|
+
Output: { isValid: true, contentType: "mixed", extractedTocMarkdown: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5" }
|
|
3396
|
+
|
|
3397
|
+
### Scenario 3: resource_only
|
|
3398
|
+
Input: "\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
|
|
3399
|
+
Output: { isValid: false, contentType: "resource_only", extractedTocMarkdown: null }`;
|
|
3330
3400
|
}
|
|
3331
3401
|
/**
|
|
3332
3402
|
* Build user prompt with markdown content
|
|
3333
3403
|
*/
|
|
3334
3404
|
buildUserPrompt(markdown) {
|
|
3335
|
-
return `
|
|
3405
|
+
return `Analyze the following content and classify it:
|
|
3336
3406
|
|
|
3337
3407
|
${markdown}`;
|
|
3338
3408
|
}
|
|
@@ -3890,9 +3960,20 @@ var DocumentProcessor = class {
|
|
|
3890
3960
|
);
|
|
3891
3961
|
markdown = null;
|
|
3892
3962
|
} else {
|
|
3893
|
-
this.
|
|
3894
|
-
|
|
3895
|
-
|
|
3963
|
+
const validMarkdown = this.tocContentValidator.getValidMarkdown(validation);
|
|
3964
|
+
if (validMarkdown) {
|
|
3965
|
+
if (validation.contentType === "mixed") {
|
|
3966
|
+
this.logger.info(
|
|
3967
|
+
`[DocumentProcessor] Mixed TOC detected, using extracted main TOC (${validMarkdown.length} chars)`
|
|
3968
|
+
);
|
|
3969
|
+
}
|
|
3970
|
+
markdown = validMarkdown;
|
|
3971
|
+
this.logger.info(
|
|
3972
|
+
`[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
|
|
3973
|
+
);
|
|
3974
|
+
} else {
|
|
3975
|
+
markdown = null;
|
|
3976
|
+
}
|
|
3896
3977
|
}
|
|
3897
3978
|
} catch (error) {
|
|
3898
3979
|
if (error instanceof TocNotFoundError) {
|