@heripo/document-processor 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +19 -61
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -8
- package/dist/index.d.ts +1 -8
- package/dist/index.js +19 -61
- package/dist/index.js.map +1 -1
- package/package.json +5 -5
package/dist/index.cjs
CHANGED
|
@@ -1908,11 +1908,10 @@ var TocExtractor = class extends TextLLMComponent {
|
|
|
1908
1908
|
async extract(markdown) {
|
|
1909
1909
|
this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
|
|
1910
1910
|
if (!markdown.trim()) {
|
|
1911
|
-
this.log("
|
|
1912
|
-
|
|
1913
|
-
|
|
1914
|
-
|
|
1915
|
-
};
|
|
1911
|
+
this.log("error", "Cannot extract TOC from empty markdown content");
|
|
1912
|
+
throw new TocParseError(
|
|
1913
|
+
"TOC extraction failed: provided markdown content is empty"
|
|
1914
|
+
);
|
|
1916
1915
|
}
|
|
1917
1916
|
try {
|
|
1918
1917
|
const result = await this.callTextLLM(
|
|
@@ -4051,10 +4050,13 @@ var DocumentProcessor = class {
|
|
|
4051
4050
|
const totalPages = Object.keys(doclingDoc.pages).length;
|
|
4052
4051
|
markdown = await this.visionTocExtractor.extract(totalPages);
|
|
4053
4052
|
if (!markdown) {
|
|
4054
|
-
|
|
4055
|
-
|
|
4053
|
+
const reason = "Both rule-based search and vision fallback failed to locate TOC";
|
|
4054
|
+
this.logger.error(
|
|
4055
|
+
`[DocumentProcessor] TOC extraction failed: ${reason}`
|
|
4056
|
+
);
|
|
4057
|
+
throw new TocNotFoundError(
|
|
4058
|
+
`Table of contents not found in the document. ${reason}.`
|
|
4056
4059
|
);
|
|
4057
|
-
return [];
|
|
4058
4060
|
}
|
|
4059
4061
|
this.logger.info(
|
|
4060
4062
|
`[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
|
|
@@ -4062,6 +4064,11 @@ var DocumentProcessor = class {
|
|
|
4062
4064
|
}
|
|
4063
4065
|
const tocResult = await this.tocExtractor.extract(markdown);
|
|
4064
4066
|
this.usageAggregator.track(tocResult.usage);
|
|
4067
|
+
if (tocResult.entries.length === 0) {
|
|
4068
|
+
const reason = "TOC area was detected but LLM could not extract any structured entries";
|
|
4069
|
+
this.logger.error(`[DocumentProcessor] TOC extraction failed: ${reason}`);
|
|
4070
|
+
throw new TocNotFoundError(`${reason}.`);
|
|
4071
|
+
}
|
|
4065
4072
|
this.logger.info(
|
|
4066
4073
|
`[DocumentProcessor] Extracted ${tocResult.entries.length} top-level TOC entries`
|
|
4067
4074
|
);
|
|
@@ -4301,21 +4308,14 @@ var DocumentProcessor = class {
|
|
|
4301
4308
|
* Convert chapters and link resources
|
|
4302
4309
|
*
|
|
4303
4310
|
* Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
|
|
4304
|
-
*
|
|
4311
|
+
* Throws TocNotFoundError if TOC entries are empty (defensive assertion).
|
|
4305
4312
|
*/
|
|
4306
4313
|
async convertChapters(doclingDoc, tocEntries, pageRangeMap, images, tables, footnotes) {
|
|
4307
4314
|
this.logger.info("[DocumentProcessor] Converting chapters...");
|
|
4308
4315
|
if (tocEntries.length === 0) {
|
|
4309
|
-
|
|
4310
|
-
|
|
4311
|
-
);
|
|
4312
|
-
return this.createFallbackChapter(
|
|
4313
|
-
doclingDoc,
|
|
4314
|
-
pageRangeMap,
|
|
4315
|
-
images,
|
|
4316
|
-
tables,
|
|
4317
|
-
footnotes
|
|
4318
|
-
);
|
|
4316
|
+
const reason = "Cannot convert chapters without TOC entries";
|
|
4317
|
+
this.logger.error(`[DocumentProcessor] ${reason}`);
|
|
4318
|
+
throw new TocNotFoundError(reason);
|
|
4319
4319
|
}
|
|
4320
4320
|
const chapters = this.chapterConverter.convert(
|
|
4321
4321
|
tocEntries,
|
|
@@ -4330,48 +4330,6 @@ var DocumentProcessor = class {
|
|
|
4330
4330
|
);
|
|
4331
4331
|
return chapters;
|
|
4332
4332
|
}
|
|
4333
|
-
/**
|
|
4334
|
-
* Create a fallback chapter when TOC is not available
|
|
4335
|
-
*
|
|
4336
|
-
* Creates a single "Document" chapter containing all text blocks,
|
|
4337
|
-
* images, tables, and footnotes from the document.
|
|
4338
|
-
*/
|
|
4339
|
-
createFallbackChapter(doclingDoc, pageRangeMap, images, tables, footnotes) {
|
|
4340
|
-
const textBlocks = doclingDoc.texts.filter(
|
|
4341
|
-
(item) => item.label !== "footnote" && this.textCleaner.isValidText(item.text)
|
|
4342
|
-
).map((item) => ({
|
|
4343
|
-
text: this.textCleaner.normalize(item.text),
|
|
4344
|
-
pdfPageNo: item.prov?.[0]?.page_no ?? 1
|
|
4345
|
-
}));
|
|
4346
|
-
if (textBlocks.length === 0 && images.length === 0 && tables.length === 0 && footnotes.length === 0) {
|
|
4347
|
-
this.logger.info(
|
|
4348
|
-
"[DocumentProcessor] No content found for fallback chapter"
|
|
4349
|
-
);
|
|
4350
|
-
return [];
|
|
4351
|
-
}
|
|
4352
|
-
const firstPdfPage = Math.min(
|
|
4353
|
-
...Object.keys(pageRangeMap).map(Number).filter((n) => !isNaN(n)),
|
|
4354
|
-
1
|
|
4355
|
-
);
|
|
4356
|
-
const firstPageRange = pageRangeMap[firstPdfPage];
|
|
4357
|
-
const pageNo = firstPageRange?.startPageNo ?? 1;
|
|
4358
|
-
const fallbackChapter = {
|
|
4359
|
-
id: this.idGenerator.generateChapterId(),
|
|
4360
|
-
originTitle: "Document",
|
|
4361
|
-
title: "Document",
|
|
4362
|
-
pageNo,
|
|
4363
|
-
level: 1,
|
|
4364
|
-
textBlocks,
|
|
4365
|
-
imageIds: images.map((img) => img.id),
|
|
4366
|
-
tableIds: tables.map((tbl) => tbl.id),
|
|
4367
|
-
footnoteIds: footnotes.map((ftn) => ftn.id),
|
|
4368
|
-
children: []
|
|
4369
|
-
};
|
|
4370
|
-
this.logger.info(
|
|
4371
|
-
`[DocumentProcessor] Created fallback chapter with ${textBlocks.length} text blocks, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
|
|
4372
|
-
);
|
|
4373
|
-
return [fallbackChapter];
|
|
4374
|
-
}
|
|
4375
4333
|
};
|
|
4376
4334
|
// Annotate the CommonJS export names for ESM import in node:
|
|
4377
4335
|
0 && (module.exports = {
|