npm - @heripo/document-processor - Versions diffs - 0.1.3 → 0.1.4 - Mend

@heripo/document-processor 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.cjs CHANGED Viewed

@@ -1908,11 +1908,10 @@ var TocExtractor = class extends TextLLMComponent {
   async extract(markdown) {
     this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
     if (!markdown.trim()) {
-      this.log("info", "Empty markdown, returning empty array");
-      return {
-        entries: [],
-        usage: this.createEmptyUsage("extraction")
-      };
+      this.log("error", "Cannot extract TOC from empty markdown content");
+      throw new TocParseError(
+        "TOC extraction failed: provided markdown content is empty"
+      );
     }
     try {
       const result = await this.callTextLLM(
@@ -4051,10 +4050,13 @@ var DocumentProcessor = class {
       const totalPages = Object.keys(doclingDoc.pages).length;
       markdown = await this.visionTocExtractor.extract(totalPages);
       if (!markdown) {
-        this.logger.warn(
-          "[DocumentProcessor] TOC not found in any method, returning empty"
+        const reason = "Both rule-based search and vision fallback failed to locate TOC";
+        this.logger.error(
+          `[DocumentProcessor] TOC extraction failed: ${reason}`
+        );
+        throw new TocNotFoundError(
+          `Table of contents not found in the document. ${reason}.`
         );
-        return [];
       }
       this.logger.info(
         `[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
@@ -4062,6 +4064,11 @@ var DocumentProcessor = class {
     }
     const tocResult = await this.tocExtractor.extract(markdown);
     this.usageAggregator.track(tocResult.usage);
+    if (tocResult.entries.length === 0) {
+      const reason = "TOC area was detected but LLM could not extract any structured entries";
+      this.logger.error(`[DocumentProcessor] TOC extraction failed: ${reason}`);
+      throw new TocNotFoundError(`${reason}.`);
+    }
     this.logger.info(
       `[DocumentProcessor] Extracted ${tocResult.entries.length} top-level TOC entries`
     );
@@ -4301,21 +4308,14 @@ var DocumentProcessor = class {
    * Convert chapters and link resources
    *
    * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
-   * Falls back to single "Document" chapter when TOC is empty.
+   * Throws TocNotFoundError if TOC entries are empty (defensive assertion).
    */
   async convertChapters(doclingDoc, tocEntries, pageRangeMap, images, tables, footnotes) {
     this.logger.info("[DocumentProcessor] Converting chapters...");
     if (tocEntries.length === 0) {
-      this.logger.info(
-        "[DocumentProcessor] No TOC entries, creating fallback chapter"
-      );
-      return this.createFallbackChapter(
-        doclingDoc,
-        pageRangeMap,
-        images,
-        tables,
-        footnotes
-      );
+      const reason = "Cannot convert chapters without TOC entries";
+      this.logger.error(`[DocumentProcessor] ${reason}`);
+      throw new TocNotFoundError(reason);
     }
     const chapters = this.chapterConverter.convert(
       tocEntries,
@@ -4330,48 +4330,6 @@ var DocumentProcessor = class {
     );
     return chapters;
   }
-  /**
-   * Create a fallback chapter when TOC is not available
-   *
-   * Creates a single "Document" chapter containing all text blocks,
-   * images, tables, and footnotes from the document.
-   */
-  createFallbackChapter(doclingDoc, pageRangeMap, images, tables, footnotes) {
-    const textBlocks = doclingDoc.texts.filter(
-      (item) => item.label !== "footnote" && this.textCleaner.isValidText(item.text)
-    ).map((item) => ({
-      text: this.textCleaner.normalize(item.text),
-      pdfPageNo: item.prov?.[0]?.page_no ?? 1
-    }));
-    if (textBlocks.length === 0 && images.length === 0 && tables.length === 0 && footnotes.length === 0) {
-      this.logger.info(
-        "[DocumentProcessor] No content found for fallback chapter"
-      );
-      return [];
-    }
-    const firstPdfPage = Math.min(
-      ...Object.keys(pageRangeMap).map(Number).filter((n) => !isNaN(n)),
-      1
-    );
-    const firstPageRange = pageRangeMap[firstPdfPage];
-    const pageNo = firstPageRange?.startPageNo ?? 1;
-    const fallbackChapter = {
-      id: this.idGenerator.generateChapterId(),
-      originTitle: "Document",
-      title: "Document",
-      pageNo,
-      level: 1,
-      textBlocks,
-      imageIds: images.map((img) => img.id),
-      tableIds: tables.map((tbl) => tbl.id),
-      footnoteIds: footnotes.map((ftn) => ftn.id),
-      children: []
-    };
-    this.logger.info(
-      `[DocumentProcessor] Created fallback chapter with ${textBlocks.length} text blocks, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
-    );
-    return [fallbackChapter];
-  }
 };
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {