@heripo/document-processor 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +189 -30
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +51 -8
- package/dist/index.d.ts +51 -8
- package/dist/index.js +189 -30
- package/dist/index.js.map +1 -1
- package/package.json +9 -9
package/dist/index.d.cts
CHANGED
|
@@ -722,6 +722,27 @@ declare class PageRangeParser extends VisionLLMComponent {
|
|
|
722
722
|
protected buildUserPrompt(pageNos: number[]): string;
|
|
723
723
|
}
|
|
724
724
|
|
|
725
|
+
/**
|
|
726
|
+
* Single validation issue detected during TOC validation
|
|
727
|
+
*/
|
|
728
|
+
interface TocValidationIssue {
|
|
729
|
+
/**
|
|
730
|
+
* Issue code (V001, V002, etc.)
|
|
731
|
+
*/
|
|
732
|
+
code: string;
|
|
733
|
+
/**
|
|
734
|
+
* Human-readable error message
|
|
735
|
+
*/
|
|
736
|
+
message: string;
|
|
737
|
+
/**
|
|
738
|
+
* Path to the problematic entry (e.g., "[0].children[2]")
|
|
739
|
+
*/
|
|
740
|
+
path: string;
|
|
741
|
+
/**
|
|
742
|
+
* The problematic entry
|
|
743
|
+
*/
|
|
744
|
+
entry: TocEntry;
|
|
745
|
+
}
|
|
725
746
|
/**
|
|
726
747
|
* TocExtractError
|
|
727
748
|
*
|
|
@@ -768,6 +789,12 @@ interface TocValidationOptions {
|
|
|
768
789
|
* Maximum allowed title length (default: 200)
|
|
769
790
|
*/
|
|
770
791
|
maxTitleLength?: number;
|
|
792
|
+
/**
|
|
793
|
+
* Maximum ratio of the first entry's page to total pages (default: 0.3)
|
|
794
|
+
* Used for V007 completeness check - if the first level-1 entry
|
|
795
|
+
* starts beyond max(50, totalPages * ratio), the TOC may be incomplete.
|
|
796
|
+
*/
|
|
797
|
+
maxFirstEntryPageRatio?: number;
|
|
771
798
|
}
|
|
772
799
|
|
|
773
800
|
/**
|
|
@@ -956,7 +983,7 @@ declare class TocFinder {
|
|
|
956
983
|
*/
|
|
957
984
|
private isTableTocLike;
|
|
958
985
|
/**
|
|
959
|
-
* Expand TOC area to consecutive pages
|
|
986
|
+
* Expand TOC area to consecutive pages (both backward and forward)
|
|
960
987
|
*/
|
|
961
988
|
private expandToConsecutivePages;
|
|
962
989
|
/**
|
|
@@ -1016,6 +1043,9 @@ interface TocExtractorOptions extends BaseLLMComponentOptions {
|
|
|
1016
1043
|
*
|
|
1017
1044
|
* Uses high-performance LLM to extract structured TOC from Markdown representation.
|
|
1018
1045
|
* Extends TextLLMComponent for standardized LLM call handling.
|
|
1046
|
+
*
|
|
1047
|
+
* When validation fails, automatically retries with correction feedback
|
|
1048
|
+
* up to MAX_VALIDATION_RETRIES times before throwing.
|
|
1019
1049
|
*/
|
|
1020
1050
|
declare class TocExtractor extends TextLLMComponent {
|
|
1021
1051
|
private readonly validationOptions?;
|
|
@@ -1024,21 +1054,34 @@ declare class TocExtractor extends TextLLMComponent {
|
|
|
1024
1054
|
/**
|
|
1025
1055
|
* Extract TOC structure from Markdown
|
|
1026
1056
|
*
|
|
1057
|
+
* When validation fails, retries with correction feedback up to MAX_VALIDATION_RETRIES times.
|
|
1058
|
+
*
|
|
1027
1059
|
* @param markdown - Markdown representation of TOC area
|
|
1028
|
-
* @
|
|
1060
|
+
* @param validationOverrides - Optional overrides for validation options (merged with constructor options)
|
|
1061
|
+
* @returns Object with entries array and token usage array (initial extraction + any corrections)
|
|
1029
1062
|
* @throws {TocParseError} When LLM fails to parse structure
|
|
1030
|
-
* @throws {TocValidationError} When validation fails
|
|
1063
|
+
* @throws {TocValidationError} When validation fails after all retries
|
|
1031
1064
|
*/
|
|
1032
|
-
extract(markdown: string): Promise<{
|
|
1065
|
+
extract(markdown: string, validationOverrides?: Partial<TocValidationOptions>): Promise<{
|
|
1033
1066
|
entries: TocEntry[];
|
|
1034
|
-
|
|
1067
|
+
usages: ExtendedTokenUsage[];
|
|
1035
1068
|
}>;
|
|
1036
1069
|
/**
|
|
1037
|
-
* Validate extracted entries
|
|
1070
|
+
* Validate extracted entries and return error or null
|
|
1071
|
+
*
|
|
1072
|
+
* Unlike validateOrThrow, this returns the error instead of throwing,
|
|
1073
|
+
* allowing the retry loop to handle it.
|
|
1074
|
+
*
|
|
1075
|
+
* @returns TocValidationError if validation fails, null if valid
|
|
1076
|
+
*/
|
|
1077
|
+
private tryValidateEntries;
|
|
1078
|
+
/**
|
|
1079
|
+
* Build correction prompt with validation error feedback
|
|
1038
1080
|
*
|
|
1039
|
-
*
|
|
1081
|
+
* Includes the original markdown, previous extraction result,
|
|
1082
|
+
* validation errors, and guidance for fixing common mistakes.
|
|
1040
1083
|
*/
|
|
1041
|
-
|
|
1084
|
+
protected buildCorrectionPrompt(markdown: string, previousEntries: TocEntry[], issues: TocValidationIssue[]): string;
|
|
1042
1085
|
/**
|
|
1043
1086
|
* Build system prompt for TOC extraction
|
|
1044
1087
|
*/
|
package/dist/index.d.ts
CHANGED
|
@@ -722,6 +722,27 @@ declare class PageRangeParser extends VisionLLMComponent {
|
|
|
722
722
|
protected buildUserPrompt(pageNos: number[]): string;
|
|
723
723
|
}
|
|
724
724
|
|
|
725
|
+
/**
|
|
726
|
+
* Single validation issue detected during TOC validation
|
|
727
|
+
*/
|
|
728
|
+
interface TocValidationIssue {
|
|
729
|
+
/**
|
|
730
|
+
* Issue code (V001, V002, etc.)
|
|
731
|
+
*/
|
|
732
|
+
code: string;
|
|
733
|
+
/**
|
|
734
|
+
* Human-readable error message
|
|
735
|
+
*/
|
|
736
|
+
message: string;
|
|
737
|
+
/**
|
|
738
|
+
* Path to the problematic entry (e.g., "[0].children[2]")
|
|
739
|
+
*/
|
|
740
|
+
path: string;
|
|
741
|
+
/**
|
|
742
|
+
* The problematic entry
|
|
743
|
+
*/
|
|
744
|
+
entry: TocEntry;
|
|
745
|
+
}
|
|
725
746
|
/**
|
|
726
747
|
* TocExtractError
|
|
727
748
|
*
|
|
@@ -768,6 +789,12 @@ interface TocValidationOptions {
|
|
|
768
789
|
* Maximum allowed title length (default: 200)
|
|
769
790
|
*/
|
|
770
791
|
maxTitleLength?: number;
|
|
792
|
+
/**
|
|
793
|
+
* Maximum ratio of the first entry's page to total pages (default: 0.3)
|
|
794
|
+
* Used for V007 completeness check - if the first level-1 entry
|
|
795
|
+
* starts beyond max(50, totalPages * ratio), the TOC may be incomplete.
|
|
796
|
+
*/
|
|
797
|
+
maxFirstEntryPageRatio?: number;
|
|
771
798
|
}
|
|
772
799
|
|
|
773
800
|
/**
|
|
@@ -956,7 +983,7 @@ declare class TocFinder {
|
|
|
956
983
|
*/
|
|
957
984
|
private isTableTocLike;
|
|
958
985
|
/**
|
|
959
|
-
* Expand TOC area to consecutive pages
|
|
986
|
+
* Expand TOC area to consecutive pages (both backward and forward)
|
|
960
987
|
*/
|
|
961
988
|
private expandToConsecutivePages;
|
|
962
989
|
/**
|
|
@@ -1016,6 +1043,9 @@ interface TocExtractorOptions extends BaseLLMComponentOptions {
|
|
|
1016
1043
|
*
|
|
1017
1044
|
* Uses high-performance LLM to extract structured TOC from Markdown representation.
|
|
1018
1045
|
* Extends TextLLMComponent for standardized LLM call handling.
|
|
1046
|
+
*
|
|
1047
|
+
* When validation fails, automatically retries with correction feedback
|
|
1048
|
+
* up to MAX_VALIDATION_RETRIES times before throwing.
|
|
1019
1049
|
*/
|
|
1020
1050
|
declare class TocExtractor extends TextLLMComponent {
|
|
1021
1051
|
private readonly validationOptions?;
|
|
@@ -1024,21 +1054,34 @@ declare class TocExtractor extends TextLLMComponent {
|
|
|
1024
1054
|
/**
|
|
1025
1055
|
* Extract TOC structure from Markdown
|
|
1026
1056
|
*
|
|
1057
|
+
* When validation fails, retries with correction feedback up to MAX_VALIDATION_RETRIES times.
|
|
1058
|
+
*
|
|
1027
1059
|
* @param markdown - Markdown representation of TOC area
|
|
1028
|
-
* @
|
|
1060
|
+
* @param validationOverrides - Optional overrides for validation options (merged with constructor options)
|
|
1061
|
+
* @returns Object with entries array and token usage array (initial extraction + any corrections)
|
|
1029
1062
|
* @throws {TocParseError} When LLM fails to parse structure
|
|
1030
|
-
* @throws {TocValidationError} When validation fails
|
|
1063
|
+
* @throws {TocValidationError} When validation fails after all retries
|
|
1031
1064
|
*/
|
|
1032
|
-
extract(markdown: string): Promise<{
|
|
1065
|
+
extract(markdown: string, validationOverrides?: Partial<TocValidationOptions>): Promise<{
|
|
1033
1066
|
entries: TocEntry[];
|
|
1034
|
-
|
|
1067
|
+
usages: ExtendedTokenUsage[];
|
|
1035
1068
|
}>;
|
|
1036
1069
|
/**
|
|
1037
|
-
* Validate extracted entries
|
|
1070
|
+
* Validate extracted entries and return error or null
|
|
1071
|
+
*
|
|
1072
|
+
* Unlike validateOrThrow, this returns the error instead of throwing,
|
|
1073
|
+
* allowing the retry loop to handle it.
|
|
1074
|
+
*
|
|
1075
|
+
* @returns TocValidationError if validation fails, null if valid
|
|
1076
|
+
*/
|
|
1077
|
+
private tryValidateEntries;
|
|
1078
|
+
/**
|
|
1079
|
+
* Build correction prompt with validation error feedback
|
|
1038
1080
|
*
|
|
1039
|
-
*
|
|
1081
|
+
* Includes the original markdown, previous extraction result,
|
|
1082
|
+
* validation errors, and guidance for fixing common mistakes.
|
|
1040
1083
|
*/
|
|
1041
|
-
|
|
1084
|
+
protected buildCorrectionPrompt(markdown: string, previousEntries: TocEntry[], issues: TocValidationIssue[]): string;
|
|
1042
1085
|
/**
|
|
1043
1086
|
* Build system prompt for TOC extraction
|
|
1044
1087
|
*/
|
package/dist/index.js
CHANGED
|
@@ -1206,7 +1206,8 @@ var TocValidationError = class extends TocExtractError {
|
|
|
1206
1206
|
// src/extractors/toc-validator.ts
|
|
1207
1207
|
var DEFAULT_OPTIONS = {
|
|
1208
1208
|
totalPages: Infinity,
|
|
1209
|
-
maxTitleLength: 200
|
|
1209
|
+
maxTitleLength: 200,
|
|
1210
|
+
maxFirstEntryPageRatio: 0.3
|
|
1210
1211
|
};
|
|
1211
1212
|
var TocValidator = class {
|
|
1212
1213
|
options;
|
|
@@ -1227,6 +1228,7 @@ var TocValidator = class {
|
|
|
1227
1228
|
validate(entries) {
|
|
1228
1229
|
this.issues = [];
|
|
1229
1230
|
this.validateEntries(entries, "", null, /* @__PURE__ */ new Set());
|
|
1231
|
+
this.validateFirstEntryPagePosition(entries);
|
|
1230
1232
|
const errorCount = this.issues.length;
|
|
1231
1233
|
return {
|
|
1232
1234
|
valid: errorCount === 0,
|
|
@@ -1243,8 +1245,12 @@ var TocValidator = class {
|
|
|
1243
1245
|
validateOrThrow(entries) {
|
|
1244
1246
|
const result = this.validate(entries);
|
|
1245
1247
|
if (!result.valid) {
|
|
1248
|
+
const details = result.issues.map(
|
|
1249
|
+
(issue) => ` [${issue.code}] ${issue.message} (path: ${issue.path}, entry: "${issue.entry.title}" page ${issue.entry.pageNo})`
|
|
1250
|
+
).join("\n");
|
|
1246
1251
|
throw new TocValidationError(
|
|
1247
|
-
`TOC validation failed with ${result.errorCount} error(s)
|
|
1252
|
+
`TOC validation failed with ${result.errorCount} error(s):
|
|
1253
|
+
${details}`,
|
|
1248
1254
|
result
|
|
1249
1255
|
);
|
|
1250
1256
|
}
|
|
@@ -1359,6 +1365,33 @@ var TocValidator = class {
|
|
|
1359
1365
|
});
|
|
1360
1366
|
}
|
|
1361
1367
|
}
|
|
1368
|
+
/**
|
|
1369
|
+
* V007: Validate first entry page position (completeness check)
|
|
1370
|
+
*
|
|
1371
|
+
* If the first level-1 entry starts too late in the document,
|
|
1372
|
+
* earlier entries might be missing from the TOC.
|
|
1373
|
+
*/
|
|
1374
|
+
validateFirstEntryPagePosition(entries) {
|
|
1375
|
+
if (entries.length === 0) {
|
|
1376
|
+
return;
|
|
1377
|
+
}
|
|
1378
|
+
if (!isFinite(this.options.totalPages)) {
|
|
1379
|
+
return;
|
|
1380
|
+
}
|
|
1381
|
+
const firstEntry = entries[0];
|
|
1382
|
+
const threshold = Math.max(
|
|
1383
|
+
50,
|
|
1384
|
+
Math.floor(this.options.totalPages * this.options.maxFirstEntryPageRatio)
|
|
1385
|
+
);
|
|
1386
|
+
if (firstEntry.pageNo > threshold) {
|
|
1387
|
+
this.addIssue({
|
|
1388
|
+
code: "V007",
|
|
1389
|
+
message: `TOC may be incomplete - first entry starts at page ${firstEntry.pageNo}, expected within first ${threshold} pages. Earlier entries might be missing.`,
|
|
1390
|
+
path: "[0]",
|
|
1391
|
+
entry: firstEntry
|
|
1392
|
+
});
|
|
1393
|
+
}
|
|
1394
|
+
}
|
|
1362
1395
|
/**
|
|
1363
1396
|
* Add issue to the list
|
|
1364
1397
|
*/
|
|
@@ -1577,22 +1610,42 @@ var TocFinder = class {
|
|
|
1577
1610
|
return numberCount > 0 && numberCount / (num_rows - 1) > 0.5;
|
|
1578
1611
|
}
|
|
1579
1612
|
/**
|
|
1580
|
-
* Expand TOC area to consecutive pages
|
|
1613
|
+
* Expand TOC area to consecutive pages (both backward and forward)
|
|
1581
1614
|
*/
|
|
1582
1615
|
expandToConsecutivePages(initial, doc) {
|
|
1583
1616
|
const itemRefs = [...initial.itemRefs];
|
|
1617
|
+
const seenRefs = new Set(itemRefs);
|
|
1618
|
+
let startPage = initial.startPage;
|
|
1584
1619
|
let endPage = initial.endPage;
|
|
1620
|
+
for (let pageNo = initial.startPage - 1; pageNo >= 1; pageNo--) {
|
|
1621
|
+
const continuationItems = this.findContinuationOnPage(doc, pageNo);
|
|
1622
|
+
if (continuationItems.length === 0) {
|
|
1623
|
+
break;
|
|
1624
|
+
}
|
|
1625
|
+
const newItems = continuationItems.filter((ref) => !seenRefs.has(ref));
|
|
1626
|
+
for (const ref of newItems) {
|
|
1627
|
+
seenRefs.add(ref);
|
|
1628
|
+
}
|
|
1629
|
+
itemRefs.unshift(...newItems);
|
|
1630
|
+
startPage = pageNo;
|
|
1631
|
+
this.logger.info(`[TocFinder] Expanded TOC backward to page ${pageNo}`);
|
|
1632
|
+
}
|
|
1585
1633
|
for (let pageNo = initial.endPage + 1; pageNo <= this.maxSearchPages; pageNo++) {
|
|
1586
1634
|
const continuationItems = this.findContinuationOnPage(doc, pageNo);
|
|
1587
1635
|
if (continuationItems.length === 0) {
|
|
1588
1636
|
break;
|
|
1589
1637
|
}
|
|
1590
|
-
|
|
1638
|
+
const newItems = continuationItems.filter((ref) => !seenRefs.has(ref));
|
|
1639
|
+
for (const ref of newItems) {
|
|
1640
|
+
seenRefs.add(ref);
|
|
1641
|
+
}
|
|
1642
|
+
itemRefs.push(...newItems);
|
|
1591
1643
|
endPage = pageNo;
|
|
1644
|
+
this.logger.info(`[TocFinder] Expanded TOC forward to page ${pageNo}`);
|
|
1592
1645
|
}
|
|
1593
1646
|
return {
|
|
1594
1647
|
itemRefs,
|
|
1595
|
-
startPage
|
|
1648
|
+
startPage,
|
|
1596
1649
|
endPage
|
|
1597
1650
|
};
|
|
1598
1651
|
}
|
|
@@ -1810,12 +1863,22 @@ var TextLLMComponent = class extends BaseLLMComponent {
|
|
|
1810
1863
|
};
|
|
1811
1864
|
|
|
1812
1865
|
// src/extractors/toc-extractor.ts
|
|
1866
|
+
var MAX_VALIDATION_RETRIES = 3;
|
|
1867
|
+
var VALIDATION_CODE_DESCRIPTIONS = {
|
|
1868
|
+
V001: "Page numbers must be in non-decreasing order within the same level. A decrease usually means a hierarchy or page number error.",
|
|
1869
|
+
V002: "Page number is out of valid range (must be >= 1 and <= total pages).",
|
|
1870
|
+
V003: "Title is empty or contains only whitespace.",
|
|
1871
|
+
V004: "Title exceeds the maximum allowed length.",
|
|
1872
|
+
V005: "Child page number is before parent page number. Children must start on or after the parent page.",
|
|
1873
|
+
V006: "Duplicate entry detected (same title and page number).",
|
|
1874
|
+
V007: "First TOC entry starts too late in the document. Earlier entries may be missing."
|
|
1875
|
+
};
|
|
1813
1876
|
var TocEntrySchema = z.lazy(
|
|
1814
1877
|
() => z.object({
|
|
1815
1878
|
title: z.string().describe("Chapter or section title"),
|
|
1816
1879
|
level: z.number().int().min(1).describe("Hierarchy depth (1 = top level)"),
|
|
1817
1880
|
pageNo: z.number().int().min(1).describe("Starting page number"),
|
|
1818
|
-
children: z.array(TocEntrySchema).
|
|
1881
|
+
children: z.array(TocEntrySchema).describe("Child sections (use empty array [] if none)")
|
|
1819
1882
|
})
|
|
1820
1883
|
);
|
|
1821
1884
|
var TocResponseSchema = z.object({
|
|
@@ -1838,12 +1901,15 @@ var TocExtractor = class extends TextLLMComponent {
|
|
|
1838
1901
|
/**
|
|
1839
1902
|
* Extract TOC structure from Markdown
|
|
1840
1903
|
*
|
|
1904
|
+
* When validation fails, retries with correction feedback up to MAX_VALIDATION_RETRIES times.
|
|
1905
|
+
*
|
|
1841
1906
|
* @param markdown - Markdown representation of TOC area
|
|
1842
|
-
* @
|
|
1907
|
+
* @param validationOverrides - Optional overrides for validation options (merged with constructor options)
|
|
1908
|
+
* @returns Object with entries array and token usage array (initial extraction + any corrections)
|
|
1843
1909
|
* @throws {TocParseError} When LLM fails to parse structure
|
|
1844
|
-
* @throws {TocValidationError} When validation fails
|
|
1910
|
+
* @throws {TocValidationError} When validation fails after all retries
|
|
1845
1911
|
*/
|
|
1846
|
-
async extract(markdown) {
|
|
1912
|
+
async extract(markdown, validationOverrides) {
|
|
1847
1913
|
this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
|
|
1848
1914
|
if (!markdown.trim()) {
|
|
1849
1915
|
this.log("error", "Cannot extract TOC from empty markdown content");
|
|
@@ -1858,18 +1924,52 @@ var TocExtractor = class extends TextLLMComponent {
|
|
|
1858
1924
|
this.buildUserPrompt(markdown),
|
|
1859
1925
|
"extraction"
|
|
1860
1926
|
);
|
|
1861
|
-
const
|
|
1927
|
+
const usages = [result.usage];
|
|
1928
|
+
let entries = this.normalizeEntries(result.output.entries);
|
|
1862
1929
|
if (!this.skipValidation) {
|
|
1863
|
-
this.
|
|
1930
|
+
let validationError = this.tryValidateEntries(
|
|
1931
|
+
entries,
|
|
1932
|
+
validationOverrides
|
|
1933
|
+
);
|
|
1934
|
+
for (let attempt = 1; attempt <= MAX_VALIDATION_RETRIES && validationError !== null; attempt++) {
|
|
1935
|
+
this.log(
|
|
1936
|
+
"warn",
|
|
1937
|
+
`Validation failed (attempt ${attempt}/${MAX_VALIDATION_RETRIES}), retrying with correction feedback`
|
|
1938
|
+
);
|
|
1939
|
+
const correctionPrompt = this.buildCorrectionPrompt(
|
|
1940
|
+
markdown,
|
|
1941
|
+
entries,
|
|
1942
|
+
validationError.validationResult.issues
|
|
1943
|
+
);
|
|
1944
|
+
const correctionResult = await this.callTextLLM(
|
|
1945
|
+
TocResponseSchema,
|
|
1946
|
+
this.buildSystemPrompt(),
|
|
1947
|
+
correctionPrompt,
|
|
1948
|
+
`correction-${attempt}`
|
|
1949
|
+
);
|
|
1950
|
+
usages.push(correctionResult.usage);
|
|
1951
|
+
entries = this.normalizeEntries(correctionResult.output.entries);
|
|
1952
|
+
validationError = this.tryValidateEntries(
|
|
1953
|
+
entries,
|
|
1954
|
+
validationOverrides
|
|
1955
|
+
);
|
|
1956
|
+
}
|
|
1957
|
+
if (validationError !== null) {
|
|
1958
|
+
this.log(
|
|
1959
|
+
"error",
|
|
1960
|
+
`Validation failed after ${MAX_VALIDATION_RETRIES} retries:
|
|
1961
|
+
${validationError.getSummary()}`
|
|
1962
|
+
);
|
|
1963
|
+
throw validationError;
|
|
1964
|
+
}
|
|
1864
1965
|
}
|
|
1865
1966
|
this.log(
|
|
1866
1967
|
"info",
|
|
1867
|
-
`Extraction completed: ${entries.length} top-level entries`
|
|
1968
|
+
`Extraction completed: ${entries.length} top-level entries (${usages.length} LLM call(s))`
|
|
1868
1969
|
);
|
|
1869
|
-
return { entries,
|
|
1970
|
+
return { entries, usages };
|
|
1870
1971
|
} catch (error) {
|
|
1871
1972
|
if (error instanceof TocValidationError) {
|
|
1872
|
-
this.log("error", `Validation failed: ${error.message}`);
|
|
1873
1973
|
throw error;
|
|
1874
1974
|
}
|
|
1875
1975
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -1880,16 +1980,69 @@ var TocExtractor = class extends TextLLMComponent {
|
|
|
1880
1980
|
}
|
|
1881
1981
|
}
|
|
1882
1982
|
/**
|
|
1883
|
-
* Validate extracted entries
|
|
1983
|
+
* Validate extracted entries and return error or null
|
|
1884
1984
|
*
|
|
1885
|
-
*
|
|
1985
|
+
* Unlike validateOrThrow, this returns the error instead of throwing,
|
|
1986
|
+
* allowing the retry loop to handle it.
|
|
1987
|
+
*
|
|
1988
|
+
* @returns TocValidationError if validation fails, null if valid
|
|
1886
1989
|
*/
|
|
1887
|
-
|
|
1990
|
+
tryValidateEntries(entries, overrides) {
|
|
1888
1991
|
if (entries.length === 0) {
|
|
1889
|
-
return;
|
|
1992
|
+
return null;
|
|
1890
1993
|
}
|
|
1891
|
-
const
|
|
1892
|
-
validator
|
|
1994
|
+
const options = { ...this.validationOptions, ...overrides };
|
|
1995
|
+
const validator = new TocValidator(options);
|
|
1996
|
+
const result = validator.validate(entries);
|
|
1997
|
+
if (!result.valid) {
|
|
1998
|
+
const details = result.issues.map(
|
|
1999
|
+
(issue) => ` [${issue.code}] ${issue.message} (path: ${issue.path}, entry: "${issue.entry.title}" page ${issue.entry.pageNo})`
|
|
2000
|
+
).join("\n");
|
|
2001
|
+
return new TocValidationError(
|
|
2002
|
+
`TOC validation failed with ${result.errorCount} error(s):
|
|
2003
|
+
${details}`,
|
|
2004
|
+
result
|
|
2005
|
+
);
|
|
2006
|
+
}
|
|
2007
|
+
return null;
|
|
2008
|
+
}
|
|
2009
|
+
/**
|
|
2010
|
+
* Build correction prompt with validation error feedback
|
|
2011
|
+
*
|
|
2012
|
+
* Includes the original markdown, previous extraction result,
|
|
2013
|
+
* validation errors, and guidance for fixing common mistakes.
|
|
2014
|
+
*/
|
|
2015
|
+
buildCorrectionPrompt(markdown, previousEntries, issues) {
|
|
2016
|
+
const errorLines = issues.map((issue) => {
|
|
2017
|
+
const desc = VALIDATION_CODE_DESCRIPTIONS[issue.code] ?? "Unknown validation error.";
|
|
2018
|
+
return `- [${issue.code}] ${issue.message}
|
|
2019
|
+
Path: ${issue.path}
|
|
2020
|
+
Entry: "${issue.entry.title}" (page ${issue.entry.pageNo})
|
|
2021
|
+
Rule: ${desc}`;
|
|
2022
|
+
});
|
|
2023
|
+
return `Your previous TOC extraction had validation errors. Please fix them and re-extract.
|
|
2024
|
+
|
|
2025
|
+
## Validation Errors
|
|
2026
|
+
|
|
2027
|
+
${errorLines.join("\n\n")}
|
|
2028
|
+
|
|
2029
|
+
## Common Mistakes to Avoid
|
|
2030
|
+
|
|
2031
|
+
1. **Hierarchy confusion**: Entries with the same numbering prefix (e.g., "4)") can belong to different hierarchy levels depending on context. Use indentation and surrounding entries to determine the correct parent-child relationship.
|
|
2032
|
+
2. **Page number misread**: Carefully distinguish Roman numerals (VI=6) from Arabic numerals. "VI. \uACE0\uCC30" at page 277 is NOT "V. \uACE0\uCC30" at page 27.
|
|
2033
|
+
3. **Page order**: Within the same parent, sibling entries must have non-decreasing page numbers. If a page number decreases, the entry likely belongs to a different hierarchy level.
|
|
2034
|
+
|
|
2035
|
+
## Original Markdown
|
|
2036
|
+
|
|
2037
|
+
${markdown}
|
|
2038
|
+
|
|
2039
|
+
## Your Previous Extraction (with errors)
|
|
2040
|
+
|
|
2041
|
+
${JSON.stringify(previousEntries, null, 2)}
|
|
2042
|
+
|
|
2043
|
+
## Instructions
|
|
2044
|
+
|
|
2045
|
+
Re-extract the TOC structure from the original markdown above. Fix all validation errors listed above. Return the corrected entries.`;
|
|
1893
2046
|
}
|
|
1894
2047
|
/**
|
|
1895
2048
|
* Build system prompt for TOC extraction
|
|
@@ -1907,11 +2060,12 @@ var TocExtractor = class extends TextLLMComponent {
|
|
|
1907
2060
|
- Level 3: Subsections (e.g., "1.1.1", "a.", "(1)")
|
|
1908
2061
|
- Use indentation and numbering patterns to infer level
|
|
1909
2062
|
|
|
1910
|
-
3. **Page Number**: Extract the page number from each entry.
|
|
2063
|
+
3. **Page Number**: Extract the page number from each entry. Use only Arabic numerals for page numbers.
|
|
1911
2064
|
|
|
1912
2065
|
4. **Children**: Nest child entries under parent entries based on their hierarchy level.
|
|
1913
2066
|
|
|
1914
|
-
5. **IMPORTANT - Extract Main TOC Only**: Only extract the main document table of contents. EXCLUDE the following
|
|
2067
|
+
5. **IMPORTANT - Extract Main TOC Only**: Only extract the main document table of contents. EXCLUDE the following:
|
|
2068
|
+
- **Front matter with Roman numeral pages**: Entries whose page numbers are Roman numerals (i, ii, xxi, etc.) such as \uC77C\uB7EC\uB450\uAE30, \uBC1C\uAC04\uC0AC, \uC11C\uBB38, \uBC94\uB840, Preface, Foreword, Editorial Notes. These use a separate page numbering system and are not part of the main content.
|
|
1915
2069
|
- Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, \uD654\uBCF4 \uBAA9\uCC28, Photo Index, List of Photos, List of Figures)
|
|
1916
2070
|
- Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, \uC0BD\uB3C4 \uBAA9\uCC28, Drawing Index, List of Drawings)
|
|
1917
2071
|
- Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
|
|
@@ -1938,11 +2092,11 @@ Output:
|
|
|
1938
2092
|
"level": 1,
|
|
1939
2093
|
"pageNo": 1,
|
|
1940
2094
|
"children": [
|
|
1941
|
-
{ "title": "1. \uC5F0\uAD6C \uBC30\uACBD", "level": 2, "pageNo": 3 },
|
|
1942
|
-
{ "title": "2. \uC5F0\uAD6C \uBAA9\uC801", "level": 2, "pageNo": 5 }
|
|
2095
|
+
{ "title": "1. \uC5F0\uAD6C \uBC30\uACBD", "level": 2, "pageNo": 3, "children": [] },
|
|
2096
|
+
{ "title": "2. \uC5F0\uAD6C \uBAA9\uC801", "level": 2, "pageNo": 5, "children": [] }
|
|
1943
2097
|
]
|
|
1944
2098
|
},
|
|
1945
|
-
{ "title": "\uC81C2\uC7A5 \uBC29\uBC95\uB860", "level": 1, "pageNo": 10 }
|
|
2099
|
+
{ "title": "\uC81C2\uC7A5 \uBC29\uBC95\uB860", "level": 1, "pageNo": 10, "children": [] }
|
|
1946
2100
|
]
|
|
1947
2101
|
}`;
|
|
1948
2102
|
}
|
|
@@ -2585,7 +2739,7 @@ var PagePattern = /* @__PURE__ */ ((PagePattern2) => {
|
|
|
2585
2739
|
var PageRangeParser = class extends VisionLLMComponent {
|
|
2586
2740
|
// Configuration constants
|
|
2587
2741
|
SAMPLE_SIZE = 3;
|
|
2588
|
-
MAX_PATTERN_RETRIES =
|
|
2742
|
+
MAX_PATTERN_RETRIES = 19;
|
|
2589
2743
|
SIZE_TOLERANCE = 5;
|
|
2590
2744
|
constructor(logger, model, outputPath, maxRetries = 3, fallbackModel, aggregator, abortSignal) {
|
|
2591
2745
|
super(
|
|
@@ -3985,8 +4139,8 @@ var DocumentProcessor = class {
|
|
|
3985
4139
|
}
|
|
3986
4140
|
if (!markdown) {
|
|
3987
4141
|
this.logger.info("[DocumentProcessor] Using vision fallback for TOC");
|
|
3988
|
-
const
|
|
3989
|
-
markdown = await this.visionTocExtractor.extract(
|
|
4142
|
+
const totalPages2 = Object.keys(doclingDoc.pages).length;
|
|
4143
|
+
markdown = await this.visionTocExtractor.extract(totalPages2);
|
|
3990
4144
|
if (!markdown) {
|
|
3991
4145
|
const reason = "Both rule-based search and vision fallback failed to locate TOC";
|
|
3992
4146
|
this.logger.error(
|
|
@@ -4000,8 +4154,13 @@ var DocumentProcessor = class {
|
|
|
4000
4154
|
`[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
|
|
4001
4155
|
);
|
|
4002
4156
|
}
|
|
4003
|
-
const
|
|
4004
|
-
this.
|
|
4157
|
+
const totalPages = Object.keys(doclingDoc.pages).length;
|
|
4158
|
+
const tocResult = await this.tocExtractor.extract(markdown, {
|
|
4159
|
+
totalPages
|
|
4160
|
+
});
|
|
4161
|
+
for (const usage of tocResult.usages) {
|
|
4162
|
+
this.usageAggregator.track(usage);
|
|
4163
|
+
}
|
|
4005
4164
|
if (tocResult.entries.length === 0) {
|
|
4006
4165
|
const reason = "TOC area was detected but LLM could not extract any structured entries";
|
|
4007
4166
|
this.logger.error(`[DocumentProcessor] TOC extraction failed: ${reason}`);
|