@heripo/document-processor 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -267,16 +267,9 @@ declare class DocumentProcessor {
267
267
  * Convert chapters and link resources
268
268
  *
269
269
  * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
270
- * Falls back to single "Document" chapter when TOC is empty.
270
+ * Throws TocNotFoundError if TOC entries are empty (defensive assertion).
271
271
  */
272
272
  private convertChapters;
273
- /**
274
- * Create a fallback chapter when TOC is not available
275
- *
276
- * Creates a single "Document" chapter containing all text blocks,
277
- * images, tables, and footnotes from the document.
278
- */
279
- private createFallbackChapter;
280
273
  }
281
274
 
282
275
  /**
package/dist/index.d.ts CHANGED
@@ -267,16 +267,9 @@ declare class DocumentProcessor {
267
267
  * Convert chapters and link resources
268
268
  *
269
269
  * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
270
- * Falls back to single "Document" chapter when TOC is empty.
270
+ * Throws TocNotFoundError if TOC entries are empty (defensive assertion).
271
271
  */
272
272
  private convertChapters;
273
- /**
274
- * Create a fallback chapter when TOC is not available
275
- *
276
- * Creates a single "Document" chapter containing all text blocks,
277
- * images, tables, and footnotes from the document.
278
- */
279
- private createFallbackChapter;
280
273
  }
281
274
 
282
275
  /**
package/dist/index.js CHANGED
@@ -1846,11 +1846,10 @@ var TocExtractor = class extends TextLLMComponent {
1846
1846
  async extract(markdown) {
1847
1847
  this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
1848
1848
  if (!markdown.trim()) {
1849
- this.log("info", "Empty markdown, returning empty array");
1850
- return {
1851
- entries: [],
1852
- usage: this.createEmptyUsage("extraction")
1853
- };
1849
+ this.log("error", "Cannot extract TOC from empty markdown content");
1850
+ throw new TocParseError(
1851
+ "TOC extraction failed: provided markdown content is empty"
1852
+ );
1854
1853
  }
1855
1854
  try {
1856
1855
  const result = await this.callTextLLM(
@@ -3989,10 +3988,13 @@ var DocumentProcessor = class {
3989
3988
  const totalPages = Object.keys(doclingDoc.pages).length;
3990
3989
  markdown = await this.visionTocExtractor.extract(totalPages);
3991
3990
  if (!markdown) {
3992
- this.logger.warn(
3993
- "[DocumentProcessor] TOC not found in any method, returning empty"
3991
+ const reason = "Both rule-based search and vision fallback failed to locate TOC";
3992
+ this.logger.error(
3993
+ `[DocumentProcessor] TOC extraction failed: ${reason}`
3994
+ );
3995
+ throw new TocNotFoundError(
3996
+ `Table of contents not found in the document. ${reason}.`
3994
3997
  );
3995
- return [];
3996
3998
  }
3997
3999
  this.logger.info(
3998
4000
  `[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
@@ -4000,6 +4002,11 @@ var DocumentProcessor = class {
4000
4002
  }
4001
4003
  const tocResult = await this.tocExtractor.extract(markdown);
4002
4004
  this.usageAggregator.track(tocResult.usage);
4005
+ if (tocResult.entries.length === 0) {
4006
+ const reason = "TOC area was detected but LLM could not extract any structured entries";
4007
+ this.logger.error(`[DocumentProcessor] TOC extraction failed: ${reason}`);
4008
+ throw new TocNotFoundError(`${reason}.`);
4009
+ }
4003
4010
  this.logger.info(
4004
4011
  `[DocumentProcessor] Extracted ${tocResult.entries.length} top-level TOC entries`
4005
4012
  );
@@ -4239,21 +4246,14 @@ var DocumentProcessor = class {
4239
4246
  * Convert chapters and link resources
4240
4247
  *
4241
4248
  * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
4242
- * Falls back to single "Document" chapter when TOC is empty.
4249
+ * Throws TocNotFoundError if TOC entries are empty (defensive assertion).
4243
4250
  */
4244
4251
  async convertChapters(doclingDoc, tocEntries, pageRangeMap, images, tables, footnotes) {
4245
4252
  this.logger.info("[DocumentProcessor] Converting chapters...");
4246
4253
  if (tocEntries.length === 0) {
4247
- this.logger.info(
4248
- "[DocumentProcessor] No TOC entries, creating fallback chapter"
4249
- );
4250
- return this.createFallbackChapter(
4251
- doclingDoc,
4252
- pageRangeMap,
4253
- images,
4254
- tables,
4255
- footnotes
4256
- );
4254
+ const reason = "Cannot convert chapters without TOC entries";
4255
+ this.logger.error(`[DocumentProcessor] ${reason}`);
4256
+ throw new TocNotFoundError(reason);
4257
4257
  }
4258
4258
  const chapters = this.chapterConverter.convert(
4259
4259
  tocEntries,
@@ -4268,48 +4268,6 @@ var DocumentProcessor = class {
4268
4268
  );
4269
4269
  return chapters;
4270
4270
  }
4271
- /**
4272
- * Create a fallback chapter when TOC is not available
4273
- *
4274
- * Creates a single "Document" chapter containing all text blocks,
4275
- * images, tables, and footnotes from the document.
4276
- */
4277
- createFallbackChapter(doclingDoc, pageRangeMap, images, tables, footnotes) {
4278
- const textBlocks = doclingDoc.texts.filter(
4279
- (item) => item.label !== "footnote" && this.textCleaner.isValidText(item.text)
4280
- ).map((item) => ({
4281
- text: this.textCleaner.normalize(item.text),
4282
- pdfPageNo: item.prov?.[0]?.page_no ?? 1
4283
- }));
4284
- if (textBlocks.length === 0 && images.length === 0 && tables.length === 0 && footnotes.length === 0) {
4285
- this.logger.info(
4286
- "[DocumentProcessor] No content found for fallback chapter"
4287
- );
4288
- return [];
4289
- }
4290
- const firstPdfPage = Math.min(
4291
- ...Object.keys(pageRangeMap).map(Number).filter((n) => !isNaN(n)),
4292
- 1
4293
- );
4294
- const firstPageRange = pageRangeMap[firstPdfPage];
4295
- const pageNo = firstPageRange?.startPageNo ?? 1;
4296
- const fallbackChapter = {
4297
- id: this.idGenerator.generateChapterId(),
4298
- originTitle: "Document",
4299
- title: "Document",
4300
- pageNo,
4301
- level: 1,
4302
- textBlocks,
4303
- imageIds: images.map((img) => img.id),
4304
- tableIds: tables.map((tbl) => tbl.id),
4305
- footnoteIds: footnotes.map((ftn) => ftn.id),
4306
- children: []
4307
- };
4308
- this.logger.info(
4309
- `[DocumentProcessor] Created fallback chapter with ${textBlocks.length} text blocks, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
4310
- );
4311
- return [fallbackChapter];
4312
- }
4313
4271
  };
4314
4272
  export {
4315
4273
  BaseLLMComponent,