npm - @heripo/document-processor - Versions diffs - 0.1.2 → 0.1.4 - Mend

@heripo/document-processor 0.1.2 → 0.1.4

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.cjs CHANGED Viewed

@@ -1908,11 +1908,10 @@ var TocExtractor = class extends TextLLMComponent {
   async extract(markdown) {
     this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
     if (!markdown.trim()) {
-      this.log("info", "Empty markdown, returning empty array");
-      return {
-        entries: [],
-        usage: this.createEmptyUsage("extraction")
-      };
+      this.log("error", "Cannot extract TOC from empty markdown content");
+      throw new TocParseError(
+        "TOC extraction failed: provided markdown content is empty"
+      );
     }
     try {
       const result = await this.callTextLLM(
@@ -2219,7 +2218,15 @@ var VisionTocExtractor = class extends VisionLLMComponent {
    */
   async extractFromBatch(startPage, endPage) {
     this.log("info", `Extracting from pages ${startPage}-${endPage}`);
+    this.log(
+      "info",
+      `Preparing images for vision analysis. This can be very slow (10+ minutes, sometimes 30+ minutes) depending on batch size and image resolution.`
+    );
     const imageContents = this.loadPageImages(startPage, endPage);
+    this.log(
+      "info",
+      `Calling vision LLM for TOC extraction (pages ${startPage}-${endPage})`
+    );
     const result = await LLMCaller.callVision({
       schema: VisionTocExtractionSchema,
       messages: [
@@ -2242,6 +2249,10 @@ var VisionTocExtractor = class extends VisionLLMComponent {
       component: "VisionTocExtractor",
       phase: "extraction"
     });
+    this.log(
+      "info",
+      `Vision LLM call completed (pages ${startPage}-${endPage})`
+    );
     this.trackUsage(result.usage);
     return result.output;
   }
@@ -3304,9 +3315,11 @@ var BaseValidator = class extends TextLLMComponent {
 // src/validators/toc-content-validator.ts
 var import_zod5 = require("zod");
 var TocContentValidationSchema = import_zod5.z.object({
-  isToc: import_zod5.z.boolean().describe("Whether the content is a table of contents"),
+  isValid: import_zod5.z.boolean().describe("Whether valid main document TOC was found"),
   confidence: import_zod5.z.number().min(0).max(1).describe("Confidence score between 0 and 1"),
-  reason: import_zod5.z.string().describe("Brief explanation for the decision")
+  contentType: import_zod5.z.enum(["pure_toc", "mixed", "resource_only", "invalid"]).describe("Type of content detected"),
+  extractedTocMarkdown: import_zod5.z.string().nullable().describe("Extracted main TOC markdown when mixed; null otherwise"),
+  reason: import_zod5.z.string().describe("Brief explanation in English")
 });
 var TocContentValidator = class extends BaseValidator {
   confidenceThreshold;
@@ -3325,7 +3338,7 @@ var TocContentValidator = class extends BaseValidator {
    * Validate if the markdown content is a table of contents
    *
    * @param markdown - Markdown content to validate
-   * @returns Validation result with isToc, confidence, and reason
+   * @returns Validation output with resolved markdown for valid TOC
    */
   async validate(markdown) {
     this.logger.info(
@@ -3336,8 +3349,10 @@ var TocContentValidator = class extends BaseValidator {
         "[TocContentValidator] Empty markdown, returning invalid"
       );
       return {
-        isToc: false,
+        isValid: false,
         confidence: 1,
+        contentType: "invalid",
+        validTocMarkdown: null,
         reason: "Empty content"
       };
     }
@@ -3349,52 +3364,106 @@ var TocContentValidator = class extends BaseValidator {
       this.aggregator
     );
     this.logger.info(
-      `[TocContentValidator] Result: isToc=${result.isToc}, confidence=${result.confidence}`
+      `[TocContentValidator] Result: isValid=${result.isValid}, contentType=${result.contentType}, confidence=${result.confidence}`
     );
-    return result;
+    let validTocMarkdown = null;
+    if (result.isValid && result.confidence >= this.confidenceThreshold) {
+      if (result.contentType === "pure_toc") {
+        validTocMarkdown = markdown;
+      } else if (result.contentType === "mixed" && result.extractedTocMarkdown) {
+        validTocMarkdown = result.extractedTocMarkdown;
+      }
+    }
+    return {
+      isValid: result.isValid,
+      confidence: result.confidence,
+      contentType: result.contentType,
+      validTocMarkdown,
+      reason: result.reason
+    };
   }
   /**
    * Check if validation result passes threshold
    *
-   * @param result - Validation result from validate()
+   * @param result - Validation output from validate()
    * @returns true if content is valid TOC with sufficient confidence
    */
   isValid(result) {
-    return result.isToc && result.confidence >= this.confidenceThreshold;
+    return result.isValid && result.confidence >= this.confidenceThreshold;
+  }
+  /**
+   * Get the valid TOC markdown from validation result
+   *
+   * @param result - Validation output from validate()
+   * @returns Valid TOC markdown or null if invalid
+   */
+  getValidMarkdown(result) {
+    return result.validTocMarkdown;
   }
   /**
    * Build system prompt for TOC content validation
    */
   buildSystemPrompt() {
-    return `You are a document structure analyst. Your task is to determine if the provided content is a Table of Contents (TOC).
+    return `You are a document structure analyst. Your task is to analyze the provided content and classify it into one of four categories.
-## What IS a Table of Contents:
-- A structured list of chapters/sections with corresponding page numbers
-- Contains hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction", etc.)
-- Has page number references for each entry (e.g., "..... 10", "... 5", or just a number at the end)
-- Multiple entries organized by document structure
-- Main document outline listing major chapters and sections
+## Content Type Classification:
-## What is NOT a Table of Contents:
+### 1. pure_toc
+The content is ONLY a main document Table of Contents with:
+- Structured list of chapters/sections with page numbers
+- Hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction")
+- Multiple entries (3 or more) organized by document structure
+- NO resource indices mixed in
+### 2. mixed
+The content contains BOTH:
+- A valid main document TOC (chapters/sections with page numbers)
+- AND resource indices (photo/table/drawing indices)
+When classifying as "mixed", you MUST extract ONLY the main TOC portion and return it in extractedTocMarkdown.
+### 3. resource_only
+The content contains ONLY resource indices such as:
 - Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, Photo Index, List of Figures, List of Photos)
 - Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
 - Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, Drawing Index, List of Drawings)
 - Appendix indices (\uBD80\uB85D \uBAA9\uCC28, Appendix Index)
-- Random body text from the document
+### 4. invalid
+The content is none of the above:
+- Random body text
 - Single entries or incomplete lists (fewer than 3 items)
 - Reference lists or bibliographies
 - Index pages (alphabetical keyword lists)
+- Unstructured content
 ## Response Guidelines:
-- Set isToc to true ONLY if content is clearly a main document TOC
+- Set isValid to true for "pure_toc" and "mixed" types
+- Set isValid to false for "resource_only" and "invalid" types
 - Set confidence between 0.0 and 1.0 based on your certainty
-- Provide a brief reason explaining your decision (1-2 sentences)`;
+- For "mixed" type: extractedTocMarkdown MUST contain only the main TOC entries (preserve original formatting)
+- For other types: extractedTocMarkdown should be null
+- IMPORTANT: reason MUST be written in English
+## Example Scenarios:
+### Scenario 1: pure_toc
+Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\uC81C3\uC7A5 \uC870\uC0AC\uACB0\uACFC ..... 15"
+Output: { isValid: true, contentType: "pure_toc", extractedTocMarkdown: null }
+### Scenario 2: mixed
+Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\\n\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
+Output: { isValid: true, contentType: "mixed", extractedTocMarkdown: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5" }
+### Scenario 3: resource_only
+Input: "\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
+Output: { isValid: false, contentType: "resource_only", extractedTocMarkdown: null }`;
   }
   /**
    * Build user prompt with markdown content
    */
   buildUserPrompt(markdown) {
-    return `Determine if the following content is a Table of Contents:
+    return `Analyze the following content and classify it:
 ${markdown}`;
   }
@@ -3952,9 +4021,20 @@ var DocumentProcessor = class {
         );
         markdown = null;
       } else {
-        this.logger.info(
-          `[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
-        );
+        const validMarkdown = this.tocContentValidator.getValidMarkdown(validation);
+        if (validMarkdown) {
+          if (validation.contentType === "mixed") {
+            this.logger.info(
+              `[DocumentProcessor] Mixed TOC detected, using extracted main TOC (${validMarkdown.length} chars)`
+            );
+          }
+          markdown = validMarkdown;
+          this.logger.info(
+            `[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
+          );
+        } else {
+          markdown = null;
+        }
       }
     } catch (error) {
       if (error instanceof TocNotFoundError) {
@@ -3970,10 +4050,13 @@ var DocumentProcessor = class {
       const totalPages = Object.keys(doclingDoc.pages).length;
       markdown = await this.visionTocExtractor.extract(totalPages);
       if (!markdown) {
-        this.logger.warn(
-          "[DocumentProcessor] TOC not found in any method, returning empty"
+        const reason = "Both rule-based search and vision fallback failed to locate TOC";
+        this.logger.error(
+          `[DocumentProcessor] TOC extraction failed: ${reason}`
+        );
+        throw new TocNotFoundError(
+          `Table of contents not found in the document. ${reason}.`
         );
-        return [];
       }
       this.logger.info(
         `[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
@@ -3981,6 +4064,11 @@ var DocumentProcessor = class {
     }
     const tocResult = await this.tocExtractor.extract(markdown);
     this.usageAggregator.track(tocResult.usage);
+    if (tocResult.entries.length === 0) {
+      const reason = "TOC area was detected but LLM could not extract any structured entries";
+      this.logger.error(`[DocumentProcessor] TOC extraction failed: ${reason}`);
+      throw new TocNotFoundError(`${reason}.`);
+    }
     this.logger.info(
       `[DocumentProcessor] Extracted ${tocResult.entries.length} top-level TOC entries`
     );
@@ -4220,21 +4308,14 @@ var DocumentProcessor = class {
    * Convert chapters and link resources
    *
    * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
-   * Falls back to single "Document" chapter when TOC is empty.
+   * Throws TocNotFoundError if TOC entries are empty (defensive assertion).
    */
   async convertChapters(doclingDoc, tocEntries, pageRangeMap, images, tables, footnotes) {
     this.logger.info("[DocumentProcessor] Converting chapters...");
     if (tocEntries.length === 0) {
-      this.logger.info(
-        "[DocumentProcessor] No TOC entries, creating fallback chapter"
-      );
-      return this.createFallbackChapter(
-        doclingDoc,
-        pageRangeMap,
-        images,
-        tables,
-        footnotes
-      );
+      const reason = "Cannot convert chapters without TOC entries";
+      this.logger.error(`[DocumentProcessor] ${reason}`);
+      throw new TocNotFoundError(reason);
     }
     const chapters = this.chapterConverter.convert(
       tocEntries,
@@ -4249,48 +4330,6 @@ var DocumentProcessor = class {
     );
     return chapters;
   }
-  /**
-   * Create a fallback chapter when TOC is not available
-   *
-   * Creates a single "Document" chapter containing all text blocks,
-   * images, tables, and footnotes from the document.
-   */
-  createFallbackChapter(doclingDoc, pageRangeMap, images, tables, footnotes) {
-    const textBlocks = doclingDoc.texts.filter(
-      (item) => item.label !== "footnote" && this.textCleaner.isValidText(item.text)
-    ).map((item) => ({
-      text: this.textCleaner.normalize(item.text),
-      pdfPageNo: item.prov?.[0]?.page_no ?? 1
-    }));
-    if (textBlocks.length === 0 && images.length === 0 && tables.length === 0 && footnotes.length === 0) {
-      this.logger.info(
-        "[DocumentProcessor] No content found for fallback chapter"
-      );
-      return [];
-    }
-    const firstPdfPage = Math.min(
-      ...Object.keys(pageRangeMap).map(Number).filter((n) => !isNaN(n)),
-      1
-    );
-    const firstPageRange = pageRangeMap[firstPdfPage];
-    const pageNo = firstPageRange?.startPageNo ?? 1;
-    const fallbackChapter = {
-      id: this.idGenerator.generateChapterId(),
-      originTitle: "Document",
-      title: "Document",
-      pageNo,
-      level: 1,
-      textBlocks,
-      imageIds: images.map((img) => img.id),
-      tableIds: tables.map((tbl) => tbl.id),
-      footnoteIds: footnotes.map((ftn) => ftn.id),
-      children: []
-    };
-    this.logger.info(
-      `[DocumentProcessor] Created fallback chapter with ${textBlocks.length} text blocks, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
-    );
-    return [fallbackChapter];
-  }
 };
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {