@heripo/document-processor 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -2219,7 +2219,15 @@ var VisionTocExtractor = class extends VisionLLMComponent {
2219
2219
  */
2220
2220
  async extractFromBatch(startPage, endPage) {
2221
2221
  this.log("info", `Extracting from pages ${startPage}-${endPage}`);
2222
+ this.log(
2223
+ "info",
2224
+ `Preparing images for vision analysis. This can be very slow (10+ minutes, sometimes 30+ minutes) depending on batch size and image resolution.`
2225
+ );
2222
2226
  const imageContents = this.loadPageImages(startPage, endPage);
2227
+ this.log(
2228
+ "info",
2229
+ `Calling vision LLM for TOC extraction (pages ${startPage}-${endPage})`
2230
+ );
2223
2231
  const result = await LLMCaller.callVision({
2224
2232
  schema: VisionTocExtractionSchema,
2225
2233
  messages: [
@@ -2242,6 +2250,10 @@ var VisionTocExtractor = class extends VisionLLMComponent {
2242
2250
  component: "VisionTocExtractor",
2243
2251
  phase: "extraction"
2244
2252
  });
2253
+ this.log(
2254
+ "info",
2255
+ `Vision LLM call completed (pages ${startPage}-${endPage})`
2256
+ );
2245
2257
  this.trackUsage(result.usage);
2246
2258
  return result.output;
2247
2259
  }
@@ -3304,9 +3316,11 @@ var BaseValidator = class extends TextLLMComponent {
3304
3316
  // src/validators/toc-content-validator.ts
3305
3317
  var import_zod5 = require("zod");
3306
3318
  var TocContentValidationSchema = import_zod5.z.object({
3307
- isToc: import_zod5.z.boolean().describe("Whether the content is a table of contents"),
3319
+ isValid: import_zod5.z.boolean().describe("Whether valid main document TOC was found"),
3308
3320
  confidence: import_zod5.z.number().min(0).max(1).describe("Confidence score between 0 and 1"),
3309
- reason: import_zod5.z.string().describe("Brief explanation for the decision")
3321
+ contentType: import_zod5.z.enum(["pure_toc", "mixed", "resource_only", "invalid"]).describe("Type of content detected"),
3322
+ extractedTocMarkdown: import_zod5.z.string().nullable().describe("Extracted main TOC markdown when mixed; null otherwise"),
3323
+ reason: import_zod5.z.string().describe("Brief explanation in English")
3310
3324
  });
3311
3325
  var TocContentValidator = class extends BaseValidator {
3312
3326
  confidenceThreshold;
@@ -3325,7 +3339,7 @@ var TocContentValidator = class extends BaseValidator {
3325
3339
  * Validate if the markdown content is a table of contents
3326
3340
  *
3327
3341
  * @param markdown - Markdown content to validate
3328
- * @returns Validation result with isToc, confidence, and reason
3342
+ * @returns Validation output with resolved markdown for valid TOC
3329
3343
  */
3330
3344
  async validate(markdown) {
3331
3345
  this.logger.info(
@@ -3336,8 +3350,10 @@ var TocContentValidator = class extends BaseValidator {
3336
3350
  "[TocContentValidator] Empty markdown, returning invalid"
3337
3351
  );
3338
3352
  return {
3339
- isToc: false,
3353
+ isValid: false,
3340
3354
  confidence: 1,
3355
+ contentType: "invalid",
3356
+ validTocMarkdown: null,
3341
3357
  reason: "Empty content"
3342
3358
  };
3343
3359
  }
@@ -3349,52 +3365,106 @@ var TocContentValidator = class extends BaseValidator {
3349
3365
  this.aggregator
3350
3366
  );
3351
3367
  this.logger.info(
3352
- `[TocContentValidator] Result: isToc=${result.isToc}, confidence=${result.confidence}`
3368
+ `[TocContentValidator] Result: isValid=${result.isValid}, contentType=${result.contentType}, confidence=${result.confidence}`
3353
3369
  );
3354
- return result;
3370
+ let validTocMarkdown = null;
3371
+ if (result.isValid && result.confidence >= this.confidenceThreshold) {
3372
+ if (result.contentType === "pure_toc") {
3373
+ validTocMarkdown = markdown;
3374
+ } else if (result.contentType === "mixed" && result.extractedTocMarkdown) {
3375
+ validTocMarkdown = result.extractedTocMarkdown;
3376
+ }
3377
+ }
3378
+ return {
3379
+ isValid: result.isValid,
3380
+ confidence: result.confidence,
3381
+ contentType: result.contentType,
3382
+ validTocMarkdown,
3383
+ reason: result.reason
3384
+ };
3355
3385
  }
3356
3386
  /**
3357
3387
  * Check if validation result passes threshold
3358
3388
  *
3359
- * @param result - Validation result from validate()
3389
+ * @param result - Validation output from validate()
3360
3390
  * @returns true if content is valid TOC with sufficient confidence
3361
3391
  */
3362
3392
  isValid(result) {
3363
- return result.isToc && result.confidence >= this.confidenceThreshold;
3393
+ return result.isValid && result.confidence >= this.confidenceThreshold;
3394
+ }
3395
+ /**
3396
+ * Get the valid TOC markdown from validation result
3397
+ *
3398
+ * @param result - Validation output from validate()
3399
+ * @returns Valid TOC markdown or null if invalid
3400
+ */
3401
+ getValidMarkdown(result) {
3402
+ return result.validTocMarkdown;
3364
3403
  }
3365
3404
  /**
3366
3405
  * Build system prompt for TOC content validation
3367
3406
  */
3368
3407
  buildSystemPrompt() {
3369
- return `You are a document structure analyst. Your task is to determine if the provided content is a Table of Contents (TOC).
3408
+ return `You are a document structure analyst. Your task is to analyze the provided content and classify it into one of four categories.
3409
+
3410
+ ## Content Type Classification:
3411
+
3412
+ ### 1. pure_toc
3413
+ The content is ONLY a main document Table of Contents with:
3414
+ - Structured list of chapters/sections with page numbers
3415
+ - Hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction")
3416
+ - Multiple entries (3 or more) organized by document structure
3417
+ - NO resource indices mixed in
3418
+
3419
+ ### 2. mixed
3420
+ The content contains BOTH:
3421
+ - A valid main document TOC (chapters/sections with page numbers)
3422
+ - AND resource indices (photo/table/drawing indices)
3370
3423
 
3371
- ## What IS a Table of Contents:
3372
- - A structured list of chapters/sections with corresponding page numbers
3373
- - Contains hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction", etc.)
3374
- - Has page number references for each entry (e.g., "..... 10", "... 5", or just a number at the end)
3375
- - Multiple entries organized by document structure
3376
- - Main document outline listing major chapters and sections
3424
+ When classifying as "mixed", you MUST extract ONLY the main TOC portion and return it in extractedTocMarkdown.
3377
3425
 
3378
- ## What is NOT a Table of Contents:
3426
+ ### 3. resource_only
3427
+ The content contains ONLY resource indices such as:
3379
3428
  - Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, Photo Index, List of Figures, List of Photos)
3380
3429
  - Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
3381
3430
  - Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, Drawing Index, List of Drawings)
3382
3431
  - Appendix indices (\uBD80\uB85D \uBAA9\uCC28, Appendix Index)
3383
- - Random body text from the document
3432
+
3433
+ ### 4. invalid
3434
+ The content is none of the above:
3435
+ - Random body text
3384
3436
  - Single entries or incomplete lists (fewer than 3 items)
3385
3437
  - Reference lists or bibliographies
3386
3438
  - Index pages (alphabetical keyword lists)
3439
+ - Unstructured content
3387
3440
 
3388
3441
  ## Response Guidelines:
3389
- - Set isToc to true ONLY if content is clearly a main document TOC
3442
+ - Set isValid to true for "pure_toc" and "mixed" types
3443
+ - Set isValid to false for "resource_only" and "invalid" types
3390
3444
  - Set confidence between 0.0 and 1.0 based on your certainty
3391
- - Provide a brief reason explaining your decision (1-2 sentences)`;
3445
+ - For "mixed" type: extractedTocMarkdown MUST contain only the main TOC entries (preserve original formatting)
3446
+ - For other types: extractedTocMarkdown should be null
3447
+ - IMPORTANT: reason MUST be written in English
3448
+
3449
+ ## Example Scenarios:
3450
+
3451
+ ### Scenario 1: pure_toc
3452
+ Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\uC81C3\uC7A5 \uC870\uC0AC\uACB0\uACFC ..... 15"
3453
+ Output: { isValid: true, contentType: "pure_toc", extractedTocMarkdown: null }
3454
+
3455
+ ### Scenario 2: mixed
3456
+ Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\\n\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
3457
+ Output: { isValid: true, contentType: "mixed", extractedTocMarkdown: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5" }
3458
+
3459
+ ### Scenario 3: resource_only
3460
+ Input: "\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
3461
+ Output: { isValid: false, contentType: "resource_only", extractedTocMarkdown: null }`;
3392
3462
  }
3393
3463
  /**
3394
3464
  * Build user prompt with markdown content
3395
3465
  */
3396
3466
  buildUserPrompt(markdown) {
3397
- return `Determine if the following content is a Table of Contents:
3467
+ return `Analyze the following content and classify it:
3398
3468
 
3399
3469
  ${markdown}`;
3400
3470
  }
@@ -3952,9 +4022,20 @@ var DocumentProcessor = class {
3952
4022
  );
3953
4023
  markdown = null;
3954
4024
  } else {
3955
- this.logger.info(
3956
- `[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
3957
- );
4025
+ const validMarkdown = this.tocContentValidator.getValidMarkdown(validation);
4026
+ if (validMarkdown) {
4027
+ if (validation.contentType === "mixed") {
4028
+ this.logger.info(
4029
+ `[DocumentProcessor] Mixed TOC detected, using extracted main TOC (${validMarkdown.length} chars)`
4030
+ );
4031
+ }
4032
+ markdown = validMarkdown;
4033
+ this.logger.info(
4034
+ `[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
4035
+ );
4036
+ } else {
4037
+ markdown = null;
4038
+ }
3958
4039
  }
3959
4040
  } catch (error) {
3960
4041
  if (error instanceof TocNotFoundError) {