@heripo/document-processor 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1908,11 +1908,10 @@ var TocExtractor = class extends TextLLMComponent {
1908
1908
  async extract(markdown) {
1909
1909
  this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
1910
1910
  if (!markdown.trim()) {
1911
- this.log("info", "Empty markdown, returning empty array");
1912
- return {
1913
- entries: [],
1914
- usage: this.createEmptyUsage("extraction")
1915
- };
1911
+ this.log("error", "Cannot extract TOC from empty markdown content");
1912
+ throw new TocParseError(
1913
+ "TOC extraction failed: provided markdown content is empty"
1914
+ );
1916
1915
  }
1917
1916
  try {
1918
1917
  const result = await this.callTextLLM(
@@ -4051,10 +4050,13 @@ var DocumentProcessor = class {
4051
4050
  const totalPages = Object.keys(doclingDoc.pages).length;
4052
4051
  markdown = await this.visionTocExtractor.extract(totalPages);
4053
4052
  if (!markdown) {
4054
- this.logger.warn(
4055
- "[DocumentProcessor] TOC not found in any method, returning empty"
4053
+ const reason = "Both rule-based search and vision fallback failed to locate TOC";
4054
+ this.logger.error(
4055
+ `[DocumentProcessor] TOC extraction failed: ${reason}`
4056
+ );
4057
+ throw new TocNotFoundError(
4058
+ `Table of contents not found in the document. ${reason}.`
4056
4059
  );
4057
- return [];
4058
4060
  }
4059
4061
  this.logger.info(
4060
4062
  `[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
@@ -4062,6 +4064,11 @@ var DocumentProcessor = class {
4062
4064
  }
4063
4065
  const tocResult = await this.tocExtractor.extract(markdown);
4064
4066
  this.usageAggregator.track(tocResult.usage);
4067
+ if (tocResult.entries.length === 0) {
4068
+ const reason = "TOC area was detected but LLM could not extract any structured entries";
4069
+ this.logger.error(`[DocumentProcessor] TOC extraction failed: ${reason}`);
4070
+ throw new TocNotFoundError(`${reason}.`);
4071
+ }
4065
4072
  this.logger.info(
4066
4073
  `[DocumentProcessor] Extracted ${tocResult.entries.length} top-level TOC entries`
4067
4074
  );
@@ -4301,21 +4308,14 @@ var DocumentProcessor = class {
4301
4308
  * Convert chapters and link resources
4302
4309
  *
4303
4310
  * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
4304
- * Falls back to single "Document" chapter when TOC is empty.
4311
+ * Throws TocNotFoundError if TOC entries are empty (defensive assertion).
4305
4312
  */
4306
4313
  async convertChapters(doclingDoc, tocEntries, pageRangeMap, images, tables, footnotes) {
4307
4314
  this.logger.info("[DocumentProcessor] Converting chapters...");
4308
4315
  if (tocEntries.length === 0) {
4309
- this.logger.info(
4310
- "[DocumentProcessor] No TOC entries, creating fallback chapter"
4311
- );
4312
- return this.createFallbackChapter(
4313
- doclingDoc,
4314
- pageRangeMap,
4315
- images,
4316
- tables,
4317
- footnotes
4318
- );
4316
+ const reason = "Cannot convert chapters without TOC entries";
4317
+ this.logger.error(`[DocumentProcessor] ${reason}`);
4318
+ throw new TocNotFoundError(reason);
4319
4319
  }
4320
4320
  const chapters = this.chapterConverter.convert(
4321
4321
  tocEntries,
@@ -4330,48 +4330,6 @@ var DocumentProcessor = class {
4330
4330
  );
4331
4331
  return chapters;
4332
4332
  }
4333
- /**
4334
- * Create a fallback chapter when TOC is not available
4335
- *
4336
- * Creates a single "Document" chapter containing all text blocks,
4337
- * images, tables, and footnotes from the document.
4338
- */
4339
- createFallbackChapter(doclingDoc, pageRangeMap, images, tables, footnotes) {
4340
- const textBlocks = doclingDoc.texts.filter(
4341
- (item) => item.label !== "footnote" && this.textCleaner.isValidText(item.text)
4342
- ).map((item) => ({
4343
- text: this.textCleaner.normalize(item.text),
4344
- pdfPageNo: item.prov?.[0]?.page_no ?? 1
4345
- }));
4346
- if (textBlocks.length === 0 && images.length === 0 && tables.length === 0 && footnotes.length === 0) {
4347
- this.logger.info(
4348
- "[DocumentProcessor] No content found for fallback chapter"
4349
- );
4350
- return [];
4351
- }
4352
- const firstPdfPage = Math.min(
4353
- ...Object.keys(pageRangeMap).map(Number).filter((n) => !isNaN(n)),
4354
- 1
4355
- );
4356
- const firstPageRange = pageRangeMap[firstPdfPage];
4357
- const pageNo = firstPageRange?.startPageNo ?? 1;
4358
- const fallbackChapter = {
4359
- id: this.idGenerator.generateChapterId(),
4360
- originTitle: "Document",
4361
- title: "Document",
4362
- pageNo,
4363
- level: 1,
4364
- textBlocks,
4365
- imageIds: images.map((img) => img.id),
4366
- tableIds: tables.map((tbl) => tbl.id),
4367
- footnoteIds: footnotes.map((ftn) => ftn.id),
4368
- children: []
4369
- };
4370
- this.logger.info(
4371
- `[DocumentProcessor] Created fallback chapter with ${textBlocks.length} text blocks, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
4372
- );
4373
- return [fallbackChapter];
4374
- }
4375
4333
  };
4376
4334
  // Annotate the CommonJS export names for ESM import in node:
4377
4335
  0 && (module.exports = {