npm - @heripo/document-processor - Version diff - 0.1.3 → 0.1.5 - Mend

@heripo/document-processor 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/index.cjs CHANGED Viewed

@@ -1268,7 +1268,8 @@ var TocValidationError = class extends TocExtractError {
 // src/extractors/toc-validator.ts
 var DEFAULT_OPTIONS = {
   totalPages: Infinity,
-  maxTitleLength: 200
+  maxTitleLength: 200,
+  maxFirstEntryPageRatio: 0.3
 };
 var TocValidator = class {
   options;
@@ -1289,6 +1290,7 @@ var TocValidator = class {
   validate(entries) {
     this.issues = [];
     this.validateEntries(entries, "", null, /* @__PURE__ */ new Set());
+    this.validateFirstEntryPagePosition(entries);
     const errorCount = this.issues.length;
     return {
       valid: errorCount === 0,
@@ -1305,8 +1307,12 @@ var TocValidator = class {
   validateOrThrow(entries) {
     const result = this.validate(entries);
     if (!result.valid) {
+      const details = result.issues.map(
+        (issue) => `  [${issue.code}] ${issue.message} (path: ${issue.path}, entry: "${issue.entry.title}" page ${issue.entry.pageNo})`
+      ).join("\n");
       throw new TocValidationError(
-        `TOC validation failed with ${result.errorCount} error(s)`,
+        `TOC validation failed with ${result.errorCount} error(s):
+${details}`,
         result
       );
     }
@@ -1421,6 +1427,33 @@ var TocValidator = class {
       });
     }
   }
+  /**
+   * V007: Validate first entry page position (completeness check)
+   *
+   * If the first level-1 entry starts too late in the document,
+   * earlier entries might be missing from the TOC.
+   */
+  validateFirstEntryPagePosition(entries) {
+    if (entries.length === 0) {
+      return;
+    }
+    if (!isFinite(this.options.totalPages)) {
+      return;
+    }
+    const firstEntry = entries[0];
+    const threshold = Math.max(
+      50,
+      Math.floor(this.options.totalPages * this.options.maxFirstEntryPageRatio)
+    );
+    if (firstEntry.pageNo > threshold) {
+      this.addIssue({
+        code: "V007",
+        message: `TOC may be incomplete - first entry starts at page ${firstEntry.pageNo}, expected within first ${threshold} pages. Earlier entries might be missing.`,
+        path: "[0]",
+        entry: firstEntry
+      });
+    }
+  }
   /**
    * Add issue to the list
    */
@@ -1639,22 +1672,42 @@ var TocFinder = class {
     return numberCount > 0 && numberCount / (num_rows - 1) > 0.5;
   }
   /**
-   * Expand TOC area to consecutive pages
+   * Expand TOC area to consecutive pages (both backward and forward)
    */
   expandToConsecutivePages(initial, doc) {
     const itemRefs = [...initial.itemRefs];
+    const seenRefs = new Set(itemRefs);
+    let startPage = initial.startPage;
     let endPage = initial.endPage;
+    for (let pageNo = initial.startPage - 1; pageNo >= 1; pageNo--) {
+      const continuationItems = this.findContinuationOnPage(doc, pageNo);
+      if (continuationItems.length === 0) {
+        break;
+      }
+      const newItems = continuationItems.filter((ref) => !seenRefs.has(ref));
+      for (const ref of newItems) {
+        seenRefs.add(ref);
+      }
+      itemRefs.unshift(...newItems);
+      startPage = pageNo;
+      this.logger.info(`[TocFinder] Expanded TOC backward to page ${pageNo}`);
+    }
     for (let pageNo = initial.endPage + 1; pageNo <= this.maxSearchPages; pageNo++) {
       const continuationItems = this.findContinuationOnPage(doc, pageNo);
       if (continuationItems.length === 0) {
         break;
       }
-      itemRefs.push(...continuationItems);
+      const newItems = continuationItems.filter((ref) => !seenRefs.has(ref));
+      for (const ref of newItems) {
+        seenRefs.add(ref);
+      }
+      itemRefs.push(...newItems);
       endPage = pageNo;
+      this.logger.info(`[TocFinder] Expanded TOC forward to page ${pageNo}`);
     }
     return {
       itemRefs,
-      startPage: initial.startPage,
+      startPage,
       endPage
     };
   }
@@ -1872,12 +1925,22 @@ var TextLLMComponent = class extends BaseLLMComponent {
 };
 // src/extractors/toc-extractor.ts
+var MAX_VALIDATION_RETRIES = 3;
+var VALIDATION_CODE_DESCRIPTIONS = {
+  V001: "Page numbers must be in non-decreasing order within the same level. A decrease usually means a hierarchy or page number error.",
+  V002: "Page number is out of valid range (must be >= 1 and <= total pages).",
+  V003: "Title is empty or contains only whitespace.",
+  V004: "Title exceeds the maximum allowed length.",
+  V005: "Child page number is before parent page number. Children must start on or after the parent page.",
+  V006: "Duplicate entry detected (same title and page number).",
+  V007: "First TOC entry starts too late in the document. Earlier entries may be missing."
+};
 var TocEntrySchema = import_zod.z.lazy(
   () => import_zod.z.object({
     title: import_zod.z.string().describe("Chapter or section title"),
     level: import_zod.z.number().int().min(1).describe("Hierarchy depth (1 = top level)"),
     pageNo: import_zod.z.number().int().min(1).describe("Starting page number"),
-    children: import_zod.z.array(TocEntrySchema).optional().describe("Child sections")
+    children: import_zod.z.array(TocEntrySchema).describe("Child sections (use empty array [] if none)")
   })
 );
 var TocResponseSchema = import_zod.z.object({
@@ -1900,19 +1963,21 @@ var TocExtractor = class extends TextLLMComponent {
   /**
    * Extract TOC structure from Markdown
    *
+   * When validation fails, retries with correction feedback up to MAX_VALIDATION_RETRIES times.
+   *
    * @param markdown - Markdown representation of TOC area
-   * @returns Object with entries array and token usage information
+   * @param validationOverrides - Optional overrides for validation options (merged with constructor options)
+   * @returns Object with entries array and token usage array (initial extraction + any corrections)
    * @throws {TocParseError} When LLM fails to parse structure
-   * @throws {TocValidationError} When validation fails
+   * @throws {TocValidationError} When validation fails after all retries
    */
-  async extract(markdown) {
+  async extract(markdown, validationOverrides) {
     this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
     if (!markdown.trim()) {
-      this.log("info", "Empty markdown, returning empty array");
-      return {
-        entries: [],
-        usage: this.createEmptyUsage("extraction")
-      };
+      this.log("error", "Cannot extract TOC from empty markdown content");
+      throw new TocParseError(
+        "TOC extraction failed: provided markdown content is empty"
+      );
     }
     try {
       const result = await this.callTextLLM(
@@ -1921,18 +1986,52 @@ var TocExtractor = class extends TextLLMComponent {
         this.buildUserPrompt(markdown),
         "extraction"
       );
-      const entries = this.normalizeEntries(result.output.entries);
+      const usages = [result.usage];
+      let entries = this.normalizeEntries(result.output.entries);
       if (!this.skipValidation) {
-        this.validateEntries(entries);
+        let validationError = this.tryValidateEntries(
+          entries,
+          validationOverrides
+        );
+        for (let attempt = 1; attempt <= MAX_VALIDATION_RETRIES && validationError !== null; attempt++) {
+          this.log(
+            "warn",
+            `Validation failed (attempt ${attempt}/${MAX_VALIDATION_RETRIES}), retrying with correction feedback`
+          );
+          const correctionPrompt = this.buildCorrectionPrompt(
+            markdown,
+            entries,
+            validationError.validationResult.issues
+          );
+          const correctionResult = await this.callTextLLM(
+            TocResponseSchema,
+            this.buildSystemPrompt(),
+            correctionPrompt,
+            `correction-${attempt}`
+          );
+          usages.push(correctionResult.usage);
+          entries = this.normalizeEntries(correctionResult.output.entries);
+          validationError = this.tryValidateEntries(
+            entries,
+            validationOverrides
+          );
+        }
+        if (validationError !== null) {
+          this.log(
+            "error",
+            `Validation failed after ${MAX_VALIDATION_RETRIES} retries:
+${validationError.getSummary()}`
+          );
+          throw validationError;
+        }
       }
       this.log(
         "info",
-        `Extraction completed: ${entries.length} top-level entries`
+        `Extraction completed: ${entries.length} top-level entries (${usages.length} LLM call(s))`
       );
-      return { entries, usage: result.usage };
+      return { entries, usages };
     } catch (error) {
       if (error instanceof TocValidationError) {
-        this.log("error", `Validation failed: ${error.message}`);
         throw error;
       }
       const message = error instanceof Error ? error.message : String(error);
@@ -1943,16 +2042,69 @@ var TocExtractor = class extends TextLLMComponent {
     }
   }
   /**
-   * Validate extracted entries
+   * Validate extracted entries and return error or null
    *
-   * @throws {TocValidationError} When validation fails
+   * Unlike validateOrThrow, this returns the error instead of throwing,
+   * allowing the retry loop to handle it.
+   *
+   * @returns TocValidationError if validation fails, null if valid
    */
-  validateEntries(entries) {
+  tryValidateEntries(entries, overrides) {
     if (entries.length === 0) {
-      return;
+      return null;
     }
-    const validator = new TocValidator(this.validationOptions);
-    validator.validateOrThrow(entries);
+    const options = { ...this.validationOptions, ...overrides };
+    const validator = new TocValidator(options);
+    const result = validator.validate(entries);
+    if (!result.valid) {
+      const details = result.issues.map(
+        (issue) => `  [${issue.code}] ${issue.message} (path: ${issue.path}, entry: "${issue.entry.title}" page ${issue.entry.pageNo})`
+      ).join("\n");
+      return new TocValidationError(
+        `TOC validation failed with ${result.errorCount} error(s):
+${details}`,
+        result
+      );
+    }
+    return null;
+  }
+  /**
+   * Build correction prompt with validation error feedback
+   *
+   * Includes the original markdown, previous extraction result,
+   * validation errors, and guidance for fixing common mistakes.
+   */
+  buildCorrectionPrompt(markdown, previousEntries, issues) {
+    const errorLines = issues.map((issue) => {
+      const desc = VALIDATION_CODE_DESCRIPTIONS[issue.code] ?? "Unknown validation error.";
+      return `- [${issue.code}] ${issue.message}
+  Path: ${issue.path}
+  Entry: "${issue.entry.title}" (page ${issue.entry.pageNo})
+  Rule: ${desc}`;
+    });
+    return `Your previous TOC extraction had validation errors. Please fix them and re-extract.
+## Validation Errors
+${errorLines.join("\n\n")}
+## Common Mistakes to Avoid
+1. **Hierarchy confusion**: Entries with the same numbering prefix (e.g., "4)") can belong to different hierarchy levels depending on context. Use indentation and surrounding entries to determine the correct parent-child relationship.
+2. **Page number misread**: Carefully distinguish Roman numerals (VI=6) from Arabic numerals. "VI. \uACE0\uCC30" at page 277 is NOT "V. \uACE0\uCC30" at page 27.
+3. **Page order**: Within the same parent, sibling entries must have non-decreasing page numbers. If a page number decreases, the entry likely belongs to a different hierarchy level.
+## Original Markdown
+${markdown}
+## Your Previous Extraction (with errors)
+${JSON.stringify(previousEntries, null, 2)}
+## Instructions
+Re-extract the TOC structure from the original markdown above. Fix all validation errors listed above. Return the corrected entries.`;
   }
   /**
    * Build system prompt for TOC extraction
@@ -1970,11 +2122,12 @@ var TocExtractor = class extends TextLLMComponent {
    - Level 3: Subsections (e.g., "1.1.1", "a.", "(1)")
    - Use indentation and numbering patterns to infer level
-3. **Page Number**: Extract the page number from each entry. Convert Roman numerals to Arabic numerals if present (e.g., "iv" \u2192 4).
+3. **Page Number**: Extract the page number from each entry. Use only Arabic numerals for page numbers.
 4. **Children**: Nest child entries under parent entries based on their hierarchy level.
-5. **IMPORTANT - Extract Main TOC Only**: Only extract the main document table of contents. EXCLUDE the following supplementary indices:
+5. **IMPORTANT - Extract Main TOC Only**: Only extract the main document table of contents. EXCLUDE the following:
+   - **Front matter with Roman numeral pages**: Entries whose page numbers are Roman numerals (i, ii, xxi, etc.) such as \uC77C\uB7EC\uB450\uAE30, \uBC1C\uAC04\uC0AC, \uC11C\uBB38, \uBC94\uB840, Preface, Foreword, Editorial Notes. These use a separate page numbering system and are not part of the main content.
    - Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, \uD654\uBCF4 \uBAA9\uCC28, Photo Index, List of Photos, List of Figures)
    - Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, \uC0BD\uB3C4 \uBAA9\uCC28, Drawing Index, List of Drawings)
    - Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
@@ -2001,11 +2154,11 @@ Output:
       "level": 1,
       "pageNo": 1,
       "children": [
-        { "title": "1. \uC5F0\uAD6C \uBC30\uACBD", "level": 2, "pageNo": 3 },
-        { "title": "2. \uC5F0\uAD6C \uBAA9\uC801", "level": 2, "pageNo": 5 }
+        { "title": "1. \uC5F0\uAD6C \uBC30\uACBD", "level": 2, "pageNo": 3, "children": [] },
+        { "title": "2. \uC5F0\uAD6C \uBAA9\uC801", "level": 2, "pageNo": 5, "children": [] }
       ]
     },
-    { "title": "\uC81C2\uC7A5 \uBC29\uBC95\uB860", "level": 1, "pageNo": 10 }
+    { "title": "\uC81C2\uC7A5 \uBC29\uBC95\uB860", "level": 1, "pageNo": 10, "children": [] }
   ]
 }`;
   }
@@ -2648,7 +2801,7 @@ var PagePattern = /* @__PURE__ */ ((PagePattern2) => {
 var PageRangeParser = class extends VisionLLMComponent {
   // Configuration constants
   SAMPLE_SIZE = 3;
-  MAX_PATTERN_RETRIES = 6;
+  MAX_PATTERN_RETRIES = 19;
   SIZE_TOLERANCE = 5;
   constructor(logger, model, outputPath, maxRetries = 3, fallbackModel, aggregator, abortSignal) {
     super(
@@ -4048,20 +4201,33 @@ var DocumentProcessor = class {
     }
     if (!markdown) {
       this.logger.info("[DocumentProcessor] Using vision fallback for TOC");
-      const totalPages = Object.keys(doclingDoc.pages).length;
-      markdown = await this.visionTocExtractor.extract(totalPages);
+      const totalPages2 = Object.keys(doclingDoc.pages).length;
+      markdown = await this.visionTocExtractor.extract(totalPages2);
       if (!markdown) {
-        this.logger.warn(
-          "[DocumentProcessor] TOC not found in any method, returning empty"
+        const reason = "Both rule-based search and vision fallback failed to locate TOC";
+        this.logger.error(
+          `[DocumentProcessor] TOC extraction failed: ${reason}`
+        );
+        throw new TocNotFoundError(
+          `Table of contents not found in the document. ${reason}.`
         );
-        return [];
       }
       this.logger.info(
         `[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
       );
     }
-    const tocResult = await this.tocExtractor.extract(markdown);
-    this.usageAggregator.track(tocResult.usage);
+    const totalPages = Object.keys(doclingDoc.pages).length;
+    const tocResult = await this.tocExtractor.extract(markdown, {
+      totalPages
+    });
+    for (const usage of tocResult.usages) {
+      this.usageAggregator.track(usage);
+    }
+    if (tocResult.entries.length === 0) {
+      const reason = "TOC area was detected but LLM could not extract any structured entries";
+      this.logger.error(`[DocumentProcessor] TOC extraction failed: ${reason}`);
+      throw new TocNotFoundError(`${reason}.`);
+    }
     this.logger.info(
       `[DocumentProcessor] Extracted ${tocResult.entries.length} top-level TOC entries`
     );
@@ -4301,21 +4467,14 @@ var DocumentProcessor = class {
    * Convert chapters and link resources
    *
    * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
-   * Falls back to single "Document" chapter when TOC is empty.
+   * Throws TocNotFoundError if TOC entries are empty (defensive assertion).
    */
   async convertChapters(doclingDoc, tocEntries, pageRangeMap, images, tables, footnotes) {
     this.logger.info("[DocumentProcessor] Converting chapters...");
     if (tocEntries.length === 0) {
-      this.logger.info(
-        "[DocumentProcessor] No TOC entries, creating fallback chapter"
-      );
-      return this.createFallbackChapter(
-        doclingDoc,
-        pageRangeMap,
-        images,
-        tables,
-        footnotes
-      );
+      const reason = "Cannot convert chapters without TOC entries";
+      this.logger.error(`[DocumentProcessor] ${reason}`);
+      throw new TocNotFoundError(reason);
     }
     const chapters = this.chapterConverter.convert(
       tocEntries,
@@ -4330,48 +4489,6 @@ var DocumentProcessor = class {
     );
     return chapters;
   }
-  /**
-   * Create a fallback chapter when TOC is not available
-   *
-   * Creates a single "Document" chapter containing all text blocks,
-   * images, tables, and footnotes from the document.
-   */
-  createFallbackChapter(doclingDoc, pageRangeMap, images, tables, footnotes) {
-    const textBlocks = doclingDoc.texts.filter(
-      (item) => item.label !== "footnote" && this.textCleaner.isValidText(item.text)
-    ).map((item) => ({
-      text: this.textCleaner.normalize(item.text),
-      pdfPageNo: item.prov?.[0]?.page_no ?? 1
-    }));
-    if (textBlocks.length === 0 && images.length === 0 && tables.length === 0 && footnotes.length === 0) {
-      this.logger.info(
-        "[DocumentProcessor] No content found for fallback chapter"
-      );
-      return [];
-    }
-    const firstPdfPage = Math.min(
-      ...Object.keys(pageRangeMap).map(Number).filter((n) => !isNaN(n)),
-      1
-    );
-    const firstPageRange = pageRangeMap[firstPdfPage];
-    const pageNo = firstPageRange?.startPageNo ?? 1;
-    const fallbackChapter = {
-      id: this.idGenerator.generateChapterId(),
-      originTitle: "Document",
-      title: "Document",
-      pageNo,
-      level: 1,
-      textBlocks,
-      imageIds: images.map((img) => img.id),
-      tableIds: tables.map((tbl) => tbl.id),
-      footnoteIds: footnotes.map((ftn) => ftn.id),
-      children: []
-    };
-    this.logger.info(
-      `[DocumentProcessor] Created fallback chapter with ${textBlocks.length} text blocks, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
-    );
-    return [fallbackChapter];
-  }
 };
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {