@heripo/document-processor 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +19 -61
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -8
- package/dist/index.d.ts +1 -8
- package/dist/index.js +19 -61
- package/dist/index.js.map +1 -1
- package/package.json +5 -5
package/dist/index.d.cts
CHANGED
|
@@ -267,16 +267,9 @@ declare class DocumentProcessor {
|
|
|
267
267
|
* Convert chapters and link resources
|
|
268
268
|
*
|
|
269
269
|
* Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
|
|
270
|
-
*
|
|
270
|
+
* Throws TocNotFoundError if TOC entries are empty (defensive assertion).
|
|
271
271
|
*/
|
|
272
272
|
private convertChapters;
|
|
273
|
-
/**
|
|
274
|
-
* Create a fallback chapter when TOC is not available
|
|
275
|
-
*
|
|
276
|
-
* Creates a single "Document" chapter containing all text blocks,
|
|
277
|
-
* images, tables, and footnotes from the document.
|
|
278
|
-
*/
|
|
279
|
-
private createFallbackChapter;
|
|
280
273
|
}
|
|
281
274
|
|
|
282
275
|
/**
|
package/dist/index.d.ts
CHANGED
|
@@ -267,16 +267,9 @@ declare class DocumentProcessor {
|
|
|
267
267
|
* Convert chapters and link resources
|
|
268
268
|
*
|
|
269
269
|
* Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
|
|
270
|
-
*
|
|
270
|
+
* Throws TocNotFoundError if TOC entries are empty (defensive assertion).
|
|
271
271
|
*/
|
|
272
272
|
private convertChapters;
|
|
273
|
-
/**
|
|
274
|
-
* Create a fallback chapter when TOC is not available
|
|
275
|
-
*
|
|
276
|
-
* Creates a single "Document" chapter containing all text blocks,
|
|
277
|
-
* images, tables, and footnotes from the document.
|
|
278
|
-
*/
|
|
279
|
-
private createFallbackChapter;
|
|
280
273
|
}
|
|
281
274
|
|
|
282
275
|
/**
|
package/dist/index.js
CHANGED
|
@@ -1846,11 +1846,10 @@ var TocExtractor = class extends TextLLMComponent {
|
|
|
1846
1846
|
async extract(markdown) {
|
|
1847
1847
|
this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
|
|
1848
1848
|
if (!markdown.trim()) {
|
|
1849
|
-
this.log("
|
|
1850
|
-
|
|
1851
|
-
|
|
1852
|
-
|
|
1853
|
-
};
|
|
1849
|
+
this.log("error", "Cannot extract TOC from empty markdown content");
|
|
1850
|
+
throw new TocParseError(
|
|
1851
|
+
"TOC extraction failed: provided markdown content is empty"
|
|
1852
|
+
);
|
|
1854
1853
|
}
|
|
1855
1854
|
try {
|
|
1856
1855
|
const result = await this.callTextLLM(
|
|
@@ -3989,10 +3988,13 @@ var DocumentProcessor = class {
|
|
|
3989
3988
|
const totalPages = Object.keys(doclingDoc.pages).length;
|
|
3990
3989
|
markdown = await this.visionTocExtractor.extract(totalPages);
|
|
3991
3990
|
if (!markdown) {
|
|
3992
|
-
|
|
3993
|
-
|
|
3991
|
+
const reason = "Both rule-based search and vision fallback failed to locate TOC";
|
|
3992
|
+
this.logger.error(
|
|
3993
|
+
`[DocumentProcessor] TOC extraction failed: ${reason}`
|
|
3994
|
+
);
|
|
3995
|
+
throw new TocNotFoundError(
|
|
3996
|
+
`Table of contents not found in the document. ${reason}.`
|
|
3994
3997
|
);
|
|
3995
|
-
return [];
|
|
3996
3998
|
}
|
|
3997
3999
|
this.logger.info(
|
|
3998
4000
|
`[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
|
|
@@ -4000,6 +4002,11 @@ var DocumentProcessor = class {
|
|
|
4000
4002
|
}
|
|
4001
4003
|
const tocResult = await this.tocExtractor.extract(markdown);
|
|
4002
4004
|
this.usageAggregator.track(tocResult.usage);
|
|
4005
|
+
if (tocResult.entries.length === 0) {
|
|
4006
|
+
const reason = "TOC area was detected but LLM could not extract any structured entries";
|
|
4007
|
+
this.logger.error(`[DocumentProcessor] TOC extraction failed: ${reason}`);
|
|
4008
|
+
throw new TocNotFoundError(`${reason}.`);
|
|
4009
|
+
}
|
|
4003
4010
|
this.logger.info(
|
|
4004
4011
|
`[DocumentProcessor] Extracted ${tocResult.entries.length} top-level TOC entries`
|
|
4005
4012
|
);
|
|
@@ -4239,21 +4246,14 @@ var DocumentProcessor = class {
|
|
|
4239
4246
|
* Convert chapters and link resources
|
|
4240
4247
|
*
|
|
4241
4248
|
* Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
|
|
4242
|
-
*
|
|
4249
|
+
* Throws TocNotFoundError if TOC entries are empty (defensive assertion).
|
|
4243
4250
|
*/
|
|
4244
4251
|
async convertChapters(doclingDoc, tocEntries, pageRangeMap, images, tables, footnotes) {
|
|
4245
4252
|
this.logger.info("[DocumentProcessor] Converting chapters...");
|
|
4246
4253
|
if (tocEntries.length === 0) {
|
|
4247
|
-
|
|
4248
|
-
|
|
4249
|
-
);
|
|
4250
|
-
return this.createFallbackChapter(
|
|
4251
|
-
doclingDoc,
|
|
4252
|
-
pageRangeMap,
|
|
4253
|
-
images,
|
|
4254
|
-
tables,
|
|
4255
|
-
footnotes
|
|
4256
|
-
);
|
|
4254
|
+
const reason = "Cannot convert chapters without TOC entries";
|
|
4255
|
+
this.logger.error(`[DocumentProcessor] ${reason}`);
|
|
4256
|
+
throw new TocNotFoundError(reason);
|
|
4257
4257
|
}
|
|
4258
4258
|
const chapters = this.chapterConverter.convert(
|
|
4259
4259
|
tocEntries,
|
|
@@ -4268,48 +4268,6 @@ var DocumentProcessor = class {
|
|
|
4268
4268
|
);
|
|
4269
4269
|
return chapters;
|
|
4270
4270
|
}
|
|
4271
|
-
/**
|
|
4272
|
-
* Create a fallback chapter when TOC is not available
|
|
4273
|
-
*
|
|
4274
|
-
* Creates a single "Document" chapter containing all text blocks,
|
|
4275
|
-
* images, tables, and footnotes from the document.
|
|
4276
|
-
*/
|
|
4277
|
-
createFallbackChapter(doclingDoc, pageRangeMap, images, tables, footnotes) {
|
|
4278
|
-
const textBlocks = doclingDoc.texts.filter(
|
|
4279
|
-
(item) => item.label !== "footnote" && this.textCleaner.isValidText(item.text)
|
|
4280
|
-
).map((item) => ({
|
|
4281
|
-
text: this.textCleaner.normalize(item.text),
|
|
4282
|
-
pdfPageNo: item.prov?.[0]?.page_no ?? 1
|
|
4283
|
-
}));
|
|
4284
|
-
if (textBlocks.length === 0 && images.length === 0 && tables.length === 0 && footnotes.length === 0) {
|
|
4285
|
-
this.logger.info(
|
|
4286
|
-
"[DocumentProcessor] No content found for fallback chapter"
|
|
4287
|
-
);
|
|
4288
|
-
return [];
|
|
4289
|
-
}
|
|
4290
|
-
const firstPdfPage = Math.min(
|
|
4291
|
-
...Object.keys(pageRangeMap).map(Number).filter((n) => !isNaN(n)),
|
|
4292
|
-
1
|
|
4293
|
-
);
|
|
4294
|
-
const firstPageRange = pageRangeMap[firstPdfPage];
|
|
4295
|
-
const pageNo = firstPageRange?.startPageNo ?? 1;
|
|
4296
|
-
const fallbackChapter = {
|
|
4297
|
-
id: this.idGenerator.generateChapterId(),
|
|
4298
|
-
originTitle: "Document",
|
|
4299
|
-
title: "Document",
|
|
4300
|
-
pageNo,
|
|
4301
|
-
level: 1,
|
|
4302
|
-
textBlocks,
|
|
4303
|
-
imageIds: images.map((img) => img.id),
|
|
4304
|
-
tableIds: tables.map((tbl) => tbl.id),
|
|
4305
|
-
footnoteIds: footnotes.map((ftn) => ftn.id),
|
|
4306
|
-
children: []
|
|
4307
|
-
};
|
|
4308
|
-
this.logger.info(
|
|
4309
|
-
`[DocumentProcessor] Created fallback chapter with ${textBlocks.length} text blocks, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
|
|
4310
|
-
);
|
|
4311
|
-
return [fallbackChapter];
|
|
4312
|
-
}
|
|
4313
4271
|
};
|
|
4314
4272
|
export {
|
|
4315
4273
|
BaseLLMComponent,
|