npm - @heripo/document-processor - Versions diffs - 0.1.2 → 0.1.3 - Mend

@heripo/document-processor 0.1.2 → 0.1.3

This diff shows the changes between two publicly available versions of the package, each of which has been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.cjs CHANGED Viewed

@@ -2219,7 +2219,15 @@ var VisionTocExtractor = class extends VisionLLMComponent {
    */
   async extractFromBatch(startPage, endPage) {
     this.log("info", `Extracting from pages ${startPage}-${endPage}`);
+    this.log(
+      "info",
+      `Preparing images for vision analysis. This can be very slow (10+ minutes, sometimes 30+ minutes) depending on batch size and image resolution.`
+    );
     const imageContents = this.loadPageImages(startPage, endPage);
+    this.log(
+      "info",
+      `Calling vision LLM for TOC extraction (pages ${startPage}-${endPage})`
+    );
     const result = await LLMCaller.callVision({
       schema: VisionTocExtractionSchema,
       messages: [
@@ -2242,6 +2250,10 @@ var VisionTocExtractor = class extends VisionLLMComponent {
       component: "VisionTocExtractor",
       phase: "extraction"
     });
+    this.log(
+      "info",
+      `Vision LLM call completed (pages ${startPage}-${endPage})`
+    );
     this.trackUsage(result.usage);
     return result.output;
   }
@@ -3304,9 +3316,11 @@ var BaseValidator = class extends TextLLMComponent {
 // src/validators/toc-content-validator.ts
 var import_zod5 = require("zod");
 var TocContentValidationSchema = import_zod5.z.object({
-  isToc: import_zod5.z.boolean().describe("Whether the content is a table of contents"),
+  isValid: import_zod5.z.boolean().describe("Whether valid main document TOC was found"),
   confidence: import_zod5.z.number().min(0).max(1).describe("Confidence score between 0 and 1"),
-  reason: import_zod5.z.string().describe("Brief explanation for the decision")
+  contentType: import_zod5.z.enum(["pure_toc", "mixed", "resource_only", "invalid"]).describe("Type of content detected"),
+  extractedTocMarkdown: import_zod5.z.string().nullable().describe("Extracted main TOC markdown when mixed; null otherwise"),
+  reason: import_zod5.z.string().describe("Brief explanation in English")
 });
 var TocContentValidator = class extends BaseValidator {
   confidenceThreshold;
@@ -3325,7 +3339,7 @@ var TocContentValidator = class extends BaseValidator {
    * Validate if the markdown content is a table of contents
    *
    * @param markdown - Markdown content to validate
-   * @returns Validation result with isToc, confidence, and reason
+   * @returns Validation output with resolved markdown for valid TOC
    */
   async validate(markdown) {
     this.logger.info(
@@ -3336,8 +3350,10 @@ var TocContentValidator = class extends BaseValidator {
         "[TocContentValidator] Empty markdown, returning invalid"
       );
       return {
-        isToc: false,
+        isValid: false,
         confidence: 1,
+        contentType: "invalid",
+        validTocMarkdown: null,
         reason: "Empty content"
       };
     }
@@ -3349,52 +3365,106 @@ var TocContentValidator = class extends BaseValidator {
       this.aggregator
     );
     this.logger.info(
-      `[TocContentValidator] Result: isToc=${result.isToc}, confidence=${result.confidence}`
+      `[TocContentValidator] Result: isValid=${result.isValid}, contentType=${result.contentType}, confidence=${result.confidence}`
     );
-    return result;
+    let validTocMarkdown = null;
+    if (result.isValid && result.confidence >= this.confidenceThreshold) {
+      if (result.contentType === "pure_toc") {
+        validTocMarkdown = markdown;
+      } else if (result.contentType === "mixed" && result.extractedTocMarkdown) {
+        validTocMarkdown = result.extractedTocMarkdown;
+      }
+    }
+    return {
+      isValid: result.isValid,
+      confidence: result.confidence,
+      contentType: result.contentType,
+      validTocMarkdown,
+      reason: result.reason
+    };
   }
   /**
    * Check if validation result passes threshold
    *
-   * @param result - Validation result from validate()
+   * @param result - Validation output from validate()
    * @returns true if content is valid TOC with sufficient confidence
    */
   isValid(result) {
-    return result.isToc && result.confidence >= this.confidenceThreshold;
+    return result.isValid && result.confidence >= this.confidenceThreshold;
+  }
+  /**
+   * Get the valid TOC markdown from validation result
+   *
+   * @param result - Validation output from validate()
+   * @returns Valid TOC markdown or null if invalid
+   */
+  getValidMarkdown(result) {
+    return result.validTocMarkdown;
   }
   /**
    * Build system prompt for TOC content validation
    */
   buildSystemPrompt() {
-    return `You are a document structure analyst. Your task is to determine if the provided content is a Table of Contents (TOC).
+    return `You are a document structure analyst. Your task is to analyze the provided content and classify it into one of four categories.
+## Content Type Classification:
+### 1. pure_toc
+The content is ONLY a main document Table of Contents with:
+- Structured list of chapters/sections with page numbers
+- Hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction")
+- Multiple entries (3 or more) organized by document structure
+- NO resource indices mixed in
+### 2. mixed
+The content contains BOTH:
+- A valid main document TOC (chapters/sections with page numbers)
+- AND resource indices (photo/table/drawing indices)
-## What IS a Table of Contents:
-- A structured list of chapters/sections with corresponding page numbers
-- Contains hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction", etc.)
-- Has page number references for each entry (e.g., "..... 10", "... 5", or just a number at the end)
-- Multiple entries organized by document structure
-- Main document outline listing major chapters and sections
+When classifying as "mixed", you MUST extract ONLY the main TOC portion and return it in extractedTocMarkdown.
-## What is NOT a Table of Contents:
+### 3. resource_only
+The content contains ONLY resource indices such as:
 - Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, Photo Index, List of Figures, List of Photos)
 - Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
 - Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, Drawing Index, List of Drawings)
 - Appendix indices (\uBD80\uB85D \uBAA9\uCC28, Appendix Index)
-- Random body text from the document
+### 4. invalid
+The content is none of the above:
+- Random body text
 - Single entries or incomplete lists (fewer than 3 items)
 - Reference lists or bibliographies
 - Index pages (alphabetical keyword lists)
+- Unstructured content
 ## Response Guidelines:
-- Set isToc to true ONLY if content is clearly a main document TOC
+- Set isValid to true for "pure_toc" and "mixed" types
+- Set isValid to false for "resource_only" and "invalid" types
 - Set confidence between 0.0 and 1.0 based on your certainty
-- Provide a brief reason explaining your decision (1-2 sentences)`;
+- For "mixed" type: extractedTocMarkdown MUST contain only the main TOC entries (preserve original formatting)
+- For other types: extractedTocMarkdown should be null
+- IMPORTANT: reason MUST be written in English
+## Example Scenarios:
+### Scenario 1: pure_toc
+Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\uC81C3\uC7A5 \uC870\uC0AC\uACB0\uACFC ..... 15"
+Output: { isValid: true, contentType: "pure_toc", extractedTocMarkdown: null }
+### Scenario 2: mixed
+Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\\n\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
+Output: { isValid: true, contentType: "mixed", extractedTocMarkdown: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5" }
+### Scenario 3: resource_only
+Input: "\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
+Output: { isValid: false, contentType: "resource_only", extractedTocMarkdown: null }`;
   }
   /**
    * Build user prompt with markdown content
    */
   buildUserPrompt(markdown) {
-    return `Determine if the following content is a Table of Contents:
+    return `Analyze the following content and classify it:
 ${markdown}`;
   }
@@ -3952,9 +4022,20 @@ var DocumentProcessor = class {
         );
         markdown = null;
       } else {
-        this.logger.info(
-          `[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
-        );
+        const validMarkdown = this.tocContentValidator.getValidMarkdown(validation);
+        if (validMarkdown) {
+          if (validation.contentType === "mixed") {
+            this.logger.info(
+              `[DocumentProcessor] Mixed TOC detected, using extracted main TOC (${validMarkdown.length} chars)`
+            );
+          }
+          markdown = validMarkdown;
+          this.logger.info(
+            `[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
+          );
+        } else {
+          markdown = null;
+        }
       }
     } catch (error) {
       if (error instanceof TocNotFoundError) {