@heripo/document-processor 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +189 -30
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +51 -8
- package/dist/index.d.ts +51 -8
- package/dist/index.js +189 -30
- package/dist/index.js.map +1 -1
- package/package.json +9 -9
package/dist/index.cjs
CHANGED
|
@@ -1268,7 +1268,8 @@ var TocValidationError = class extends TocExtractError {
|
|
|
1268
1268
|
// src/extractors/toc-validator.ts
|
|
1269
1269
|
var DEFAULT_OPTIONS = {
|
|
1270
1270
|
totalPages: Infinity,
|
|
1271
|
-
maxTitleLength: 200
|
|
1271
|
+
maxTitleLength: 200,
|
|
1272
|
+
maxFirstEntryPageRatio: 0.3
|
|
1272
1273
|
};
|
|
1273
1274
|
var TocValidator = class {
|
|
1274
1275
|
options;
|
|
@@ -1289,6 +1290,7 @@ var TocValidator = class {
|
|
|
1289
1290
|
validate(entries) {
|
|
1290
1291
|
this.issues = [];
|
|
1291
1292
|
this.validateEntries(entries, "", null, /* @__PURE__ */ new Set());
|
|
1293
|
+
this.validateFirstEntryPagePosition(entries);
|
|
1292
1294
|
const errorCount = this.issues.length;
|
|
1293
1295
|
return {
|
|
1294
1296
|
valid: errorCount === 0,
|
|
@@ -1305,8 +1307,12 @@ var TocValidator = class {
|
|
|
1305
1307
|
validateOrThrow(entries) {
|
|
1306
1308
|
const result = this.validate(entries);
|
|
1307
1309
|
if (!result.valid) {
|
|
1310
|
+
const details = result.issues.map(
|
|
1311
|
+
(issue) => ` [${issue.code}] ${issue.message} (path: ${issue.path}, entry: "${issue.entry.title}" page ${issue.entry.pageNo})`
|
|
1312
|
+
).join("\n");
|
|
1308
1313
|
throw new TocValidationError(
|
|
1309
|
-
`TOC validation failed with ${result.errorCount} error(s)
|
|
1314
|
+
`TOC validation failed with ${result.errorCount} error(s):
|
|
1315
|
+
${details}`,
|
|
1310
1316
|
result
|
|
1311
1317
|
);
|
|
1312
1318
|
}
|
|
@@ -1421,6 +1427,33 @@ var TocValidator = class {
|
|
|
1421
1427
|
});
|
|
1422
1428
|
}
|
|
1423
1429
|
}
|
|
1430
|
+
/**
|
|
1431
|
+
* V007: Validate first entry page position (completeness check)
|
|
1432
|
+
*
|
|
1433
|
+
* If the first level-1 entry starts too late in the document,
|
|
1434
|
+
* earlier entries might be missing from the TOC.
|
|
1435
|
+
*/
|
|
1436
|
+
validateFirstEntryPagePosition(entries) {
|
|
1437
|
+
if (entries.length === 0) {
|
|
1438
|
+
return;
|
|
1439
|
+
}
|
|
1440
|
+
if (!isFinite(this.options.totalPages)) {
|
|
1441
|
+
return;
|
|
1442
|
+
}
|
|
1443
|
+
const firstEntry = entries[0];
|
|
1444
|
+
const threshold = Math.max(
|
|
1445
|
+
50,
|
|
1446
|
+
Math.floor(this.options.totalPages * this.options.maxFirstEntryPageRatio)
|
|
1447
|
+
);
|
|
1448
|
+
if (firstEntry.pageNo > threshold) {
|
|
1449
|
+
this.addIssue({
|
|
1450
|
+
code: "V007",
|
|
1451
|
+
message: `TOC may be incomplete - first entry starts at page ${firstEntry.pageNo}, expected within first ${threshold} pages. Earlier entries might be missing.`,
|
|
1452
|
+
path: "[0]",
|
|
1453
|
+
entry: firstEntry
|
|
1454
|
+
});
|
|
1455
|
+
}
|
|
1456
|
+
}
|
|
1424
1457
|
/**
|
|
1425
1458
|
* Add issue to the list
|
|
1426
1459
|
*/
|
|
@@ -1639,22 +1672,42 @@ var TocFinder = class {
|
|
|
1639
1672
|
return numberCount > 0 && numberCount / (num_rows - 1) > 0.5;
|
|
1640
1673
|
}
|
|
1641
1674
|
/**
|
|
1642
|
-
* Expand TOC area to consecutive pages
|
|
1675
|
+
* Expand TOC area to consecutive pages (both backward and forward)
|
|
1643
1676
|
*/
|
|
1644
1677
|
expandToConsecutivePages(initial, doc) {
|
|
1645
1678
|
const itemRefs = [...initial.itemRefs];
|
|
1679
|
+
const seenRefs = new Set(itemRefs);
|
|
1680
|
+
let startPage = initial.startPage;
|
|
1646
1681
|
let endPage = initial.endPage;
|
|
1682
|
+
for (let pageNo = initial.startPage - 1; pageNo >= 1; pageNo--) {
|
|
1683
|
+
const continuationItems = this.findContinuationOnPage(doc, pageNo);
|
|
1684
|
+
if (continuationItems.length === 0) {
|
|
1685
|
+
break;
|
|
1686
|
+
}
|
|
1687
|
+
const newItems = continuationItems.filter((ref) => !seenRefs.has(ref));
|
|
1688
|
+
for (const ref of newItems) {
|
|
1689
|
+
seenRefs.add(ref);
|
|
1690
|
+
}
|
|
1691
|
+
itemRefs.unshift(...newItems);
|
|
1692
|
+
startPage = pageNo;
|
|
1693
|
+
this.logger.info(`[TocFinder] Expanded TOC backward to page ${pageNo}`);
|
|
1694
|
+
}
|
|
1647
1695
|
for (let pageNo = initial.endPage + 1; pageNo <= this.maxSearchPages; pageNo++) {
|
|
1648
1696
|
const continuationItems = this.findContinuationOnPage(doc, pageNo);
|
|
1649
1697
|
if (continuationItems.length === 0) {
|
|
1650
1698
|
break;
|
|
1651
1699
|
}
|
|
1652
|
-
|
|
1700
|
+
const newItems = continuationItems.filter((ref) => !seenRefs.has(ref));
|
|
1701
|
+
for (const ref of newItems) {
|
|
1702
|
+
seenRefs.add(ref);
|
|
1703
|
+
}
|
|
1704
|
+
itemRefs.push(...newItems);
|
|
1653
1705
|
endPage = pageNo;
|
|
1706
|
+
this.logger.info(`[TocFinder] Expanded TOC forward to page ${pageNo}`);
|
|
1654
1707
|
}
|
|
1655
1708
|
return {
|
|
1656
1709
|
itemRefs,
|
|
1657
|
-
startPage
|
|
1710
|
+
startPage,
|
|
1658
1711
|
endPage
|
|
1659
1712
|
};
|
|
1660
1713
|
}
|
|
@@ -1872,12 +1925,22 @@ var TextLLMComponent = class extends BaseLLMComponent {
|
|
|
1872
1925
|
};
|
|
1873
1926
|
|
|
1874
1927
|
// src/extractors/toc-extractor.ts
|
|
1928
|
+
var MAX_VALIDATION_RETRIES = 3;
|
|
1929
|
+
var VALIDATION_CODE_DESCRIPTIONS = {
|
|
1930
|
+
V001: "Page numbers must be in non-decreasing order within the same level. A decrease usually means a hierarchy or page number error.",
|
|
1931
|
+
V002: "Page number is out of valid range (must be >= 1 and <= total pages).",
|
|
1932
|
+
V003: "Title is empty or contains only whitespace.",
|
|
1933
|
+
V004: "Title exceeds the maximum allowed length.",
|
|
1934
|
+
V005: "Child page number is before parent page number. Children must start on or after the parent page.",
|
|
1935
|
+
V006: "Duplicate entry detected (same title and page number).",
|
|
1936
|
+
V007: "First TOC entry starts too late in the document. Earlier entries may be missing."
|
|
1937
|
+
};
|
|
1875
1938
|
var TocEntrySchema = import_zod.z.lazy(
|
|
1876
1939
|
() => import_zod.z.object({
|
|
1877
1940
|
title: import_zod.z.string().describe("Chapter or section title"),
|
|
1878
1941
|
level: import_zod.z.number().int().min(1).describe("Hierarchy depth (1 = top level)"),
|
|
1879
1942
|
pageNo: import_zod.z.number().int().min(1).describe("Starting page number"),
|
|
1880
|
-
children: import_zod.z.array(TocEntrySchema).
|
|
1943
|
+
children: import_zod.z.array(TocEntrySchema).describe("Child sections (use empty array [] if none)")
|
|
1881
1944
|
})
|
|
1882
1945
|
);
|
|
1883
1946
|
var TocResponseSchema = import_zod.z.object({
|
|
@@ -1900,12 +1963,15 @@ var TocExtractor = class extends TextLLMComponent {
|
|
|
1900
1963
|
/**
|
|
1901
1964
|
* Extract TOC structure from Markdown
|
|
1902
1965
|
*
|
|
1966
|
+
* When validation fails, retries with correction feedback up to MAX_VALIDATION_RETRIES times.
|
|
1967
|
+
*
|
|
1903
1968
|
* @param markdown - Markdown representation of TOC area
|
|
1904
|
-
* @
|
|
1969
|
+
* @param validationOverrides - Optional overrides for validation options (merged with constructor options)
|
|
1970
|
+
* @returns Object with entries array and token usage array (initial extraction + any corrections)
|
|
1905
1971
|
* @throws {TocParseError} When LLM fails to parse structure
|
|
1906
|
-
* @throws {TocValidationError} When validation fails
|
|
1972
|
+
* @throws {TocValidationError} When validation fails after all retries
|
|
1907
1973
|
*/
|
|
1908
|
-
async extract(markdown) {
|
|
1974
|
+
async extract(markdown, validationOverrides) {
|
|
1909
1975
|
this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
|
|
1910
1976
|
if (!markdown.trim()) {
|
|
1911
1977
|
this.log("error", "Cannot extract TOC from empty markdown content");
|
|
@@ -1920,18 +1986,52 @@ var TocExtractor = class extends TextLLMComponent {
|
|
|
1920
1986
|
this.buildUserPrompt(markdown),
|
|
1921
1987
|
"extraction"
|
|
1922
1988
|
);
|
|
1923
|
-
const
|
|
1989
|
+
const usages = [result.usage];
|
|
1990
|
+
let entries = this.normalizeEntries(result.output.entries);
|
|
1924
1991
|
if (!this.skipValidation) {
|
|
1925
|
-
this.
|
|
1992
|
+
let validationError = this.tryValidateEntries(
|
|
1993
|
+
entries,
|
|
1994
|
+
validationOverrides
|
|
1995
|
+
);
|
|
1996
|
+
for (let attempt = 1; attempt <= MAX_VALIDATION_RETRIES && validationError !== null; attempt++) {
|
|
1997
|
+
this.log(
|
|
1998
|
+
"warn",
|
|
1999
|
+
`Validation failed (attempt ${attempt}/${MAX_VALIDATION_RETRIES}), retrying with correction feedback`
|
|
2000
|
+
);
|
|
2001
|
+
const correctionPrompt = this.buildCorrectionPrompt(
|
|
2002
|
+
markdown,
|
|
2003
|
+
entries,
|
|
2004
|
+
validationError.validationResult.issues
|
|
2005
|
+
);
|
|
2006
|
+
const correctionResult = await this.callTextLLM(
|
|
2007
|
+
TocResponseSchema,
|
|
2008
|
+
this.buildSystemPrompt(),
|
|
2009
|
+
correctionPrompt,
|
|
2010
|
+
`correction-${attempt}`
|
|
2011
|
+
);
|
|
2012
|
+
usages.push(correctionResult.usage);
|
|
2013
|
+
entries = this.normalizeEntries(correctionResult.output.entries);
|
|
2014
|
+
validationError = this.tryValidateEntries(
|
|
2015
|
+
entries,
|
|
2016
|
+
validationOverrides
|
|
2017
|
+
);
|
|
2018
|
+
}
|
|
2019
|
+
if (validationError !== null) {
|
|
2020
|
+
this.log(
|
|
2021
|
+
"error",
|
|
2022
|
+
`Validation failed after ${MAX_VALIDATION_RETRIES} retries:
|
|
2023
|
+
${validationError.getSummary()}`
|
|
2024
|
+
);
|
|
2025
|
+
throw validationError;
|
|
2026
|
+
}
|
|
1926
2027
|
}
|
|
1927
2028
|
this.log(
|
|
1928
2029
|
"info",
|
|
1929
|
-
`Extraction completed: ${entries.length} top-level entries`
|
|
2030
|
+
`Extraction completed: ${entries.length} top-level entries (${usages.length} LLM call(s))`
|
|
1930
2031
|
);
|
|
1931
|
-
return { entries,
|
|
2032
|
+
return { entries, usages };
|
|
1932
2033
|
} catch (error) {
|
|
1933
2034
|
if (error instanceof TocValidationError) {
|
|
1934
|
-
this.log("error", `Validation failed: ${error.message}`);
|
|
1935
2035
|
throw error;
|
|
1936
2036
|
}
|
|
1937
2037
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -1942,16 +2042,69 @@ var TocExtractor = class extends TextLLMComponent {
|
|
|
1942
2042
|
}
|
|
1943
2043
|
}
|
|
1944
2044
|
/**
|
|
1945
|
-
* Validate extracted entries
|
|
2045
|
+
* Validate extracted entries and return error or null
|
|
1946
2046
|
*
|
|
1947
|
-
*
|
|
2047
|
+
* Unlike validateOrThrow, this returns the error instead of throwing,
|
|
2048
|
+
* allowing the retry loop to handle it.
|
|
2049
|
+
*
|
|
2050
|
+
* @returns TocValidationError if validation fails, null if valid
|
|
1948
2051
|
*/
|
|
1949
|
-
|
|
2052
|
+
tryValidateEntries(entries, overrides) {
|
|
1950
2053
|
if (entries.length === 0) {
|
|
1951
|
-
return;
|
|
2054
|
+
return null;
|
|
1952
2055
|
}
|
|
1953
|
-
const
|
|
1954
|
-
validator
|
|
2056
|
+
const options = { ...this.validationOptions, ...overrides };
|
|
2057
|
+
const validator = new TocValidator(options);
|
|
2058
|
+
const result = validator.validate(entries);
|
|
2059
|
+
if (!result.valid) {
|
|
2060
|
+
const details = result.issues.map(
|
|
2061
|
+
(issue) => ` [${issue.code}] ${issue.message} (path: ${issue.path}, entry: "${issue.entry.title}" page ${issue.entry.pageNo})`
|
|
2062
|
+
).join("\n");
|
|
2063
|
+
return new TocValidationError(
|
|
2064
|
+
`TOC validation failed with ${result.errorCount} error(s):
|
|
2065
|
+
${details}`,
|
|
2066
|
+
result
|
|
2067
|
+
);
|
|
2068
|
+
}
|
|
2069
|
+
return null;
|
|
2070
|
+
}
|
|
2071
|
+
/**
|
|
2072
|
+
* Build correction prompt with validation error feedback
|
|
2073
|
+
*
|
|
2074
|
+
* Includes the original markdown, previous extraction result,
|
|
2075
|
+
* validation errors, and guidance for fixing common mistakes.
|
|
2076
|
+
*/
|
|
2077
|
+
buildCorrectionPrompt(markdown, previousEntries, issues) {
|
|
2078
|
+
const errorLines = issues.map((issue) => {
|
|
2079
|
+
const desc = VALIDATION_CODE_DESCRIPTIONS[issue.code] ?? "Unknown validation error.";
|
|
2080
|
+
return `- [${issue.code}] ${issue.message}
|
|
2081
|
+
Path: ${issue.path}
|
|
2082
|
+
Entry: "${issue.entry.title}" (page ${issue.entry.pageNo})
|
|
2083
|
+
Rule: ${desc}`;
|
|
2084
|
+
});
|
|
2085
|
+
return `Your previous TOC extraction had validation errors. Please fix them and re-extract.
|
|
2086
|
+
|
|
2087
|
+
## Validation Errors
|
|
2088
|
+
|
|
2089
|
+
${errorLines.join("\n\n")}
|
|
2090
|
+
|
|
2091
|
+
## Common Mistakes to Avoid
|
|
2092
|
+
|
|
2093
|
+
1. **Hierarchy confusion**: Entries with the same numbering prefix (e.g., "4)") can belong to different hierarchy levels depending on context. Use indentation and surrounding entries to determine the correct parent-child relationship.
|
|
2094
|
+
2. **Page number misread**: Carefully distinguish Roman numerals (VI=6) from Arabic numerals. "VI. \uACE0\uCC30" at page 277 is NOT "V. \uACE0\uCC30" at page 27.
|
|
2095
|
+
3. **Page order**: Within the same parent, sibling entries must have non-decreasing page numbers. If a page number decreases, the entry likely belongs to a different hierarchy level.
|
|
2096
|
+
|
|
2097
|
+
## Original Markdown
|
|
2098
|
+
|
|
2099
|
+
${markdown}
|
|
2100
|
+
|
|
2101
|
+
## Your Previous Extraction (with errors)
|
|
2102
|
+
|
|
2103
|
+
${JSON.stringify(previousEntries, null, 2)}
|
|
2104
|
+
|
|
2105
|
+
## Instructions
|
|
2106
|
+
|
|
2107
|
+
Re-extract the TOC structure from the original markdown above. Fix all validation errors listed above. Return the corrected entries.`;
|
|
1955
2108
|
}
|
|
1956
2109
|
/**
|
|
1957
2110
|
* Build system prompt for TOC extraction
|
|
@@ -1969,11 +2122,12 @@ var TocExtractor = class extends TextLLMComponent {
|
|
|
1969
2122
|
- Level 3: Subsections (e.g., "1.1.1", "a.", "(1)")
|
|
1970
2123
|
- Use indentation and numbering patterns to infer level
|
|
1971
2124
|
|
|
1972
|
-
3. **Page Number**: Extract the page number from each entry.
|
|
2125
|
+
3. **Page Number**: Extract the page number from each entry. Use only Arabic numerals for page numbers.
|
|
1973
2126
|
|
|
1974
2127
|
4. **Children**: Nest child entries under parent entries based on their hierarchy level.
|
|
1975
2128
|
|
|
1976
|
-
5. **IMPORTANT - Extract Main TOC Only**: Only extract the main document table of contents. EXCLUDE the following
|
|
2129
|
+
5. **IMPORTANT - Extract Main TOC Only**: Only extract the main document table of contents. EXCLUDE the following:
|
|
2130
|
+
- **Front matter with Roman numeral pages**: Entries whose page numbers are Roman numerals (i, ii, xxi, etc.) such as \uC77C\uB7EC\uB450\uAE30, \uBC1C\uAC04\uC0AC, \uC11C\uBB38, \uBC94\uB840, Preface, Foreword, Editorial Notes. These use a separate page numbering system and are not part of the main content.
|
|
1977
2131
|
- Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, \uD654\uBCF4 \uBAA9\uCC28, Photo Index, List of Photos, List of Figures)
|
|
1978
2132
|
- Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, \uC0BD\uB3C4 \uBAA9\uCC28, Drawing Index, List of Drawings)
|
|
1979
2133
|
- Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
|
|
@@ -2000,11 +2154,11 @@ Output:
|
|
|
2000
2154
|
"level": 1,
|
|
2001
2155
|
"pageNo": 1,
|
|
2002
2156
|
"children": [
|
|
2003
|
-
{ "title": "1. \uC5F0\uAD6C \uBC30\uACBD", "level": 2, "pageNo": 3 },
|
|
2004
|
-
{ "title": "2. \uC5F0\uAD6C \uBAA9\uC801", "level": 2, "pageNo": 5 }
|
|
2157
|
+
{ "title": "1. \uC5F0\uAD6C \uBC30\uACBD", "level": 2, "pageNo": 3, "children": [] },
|
|
2158
|
+
{ "title": "2. \uC5F0\uAD6C \uBAA9\uC801", "level": 2, "pageNo": 5, "children": [] }
|
|
2005
2159
|
]
|
|
2006
2160
|
},
|
|
2007
|
-
{ "title": "\uC81C2\uC7A5 \uBC29\uBC95\uB860", "level": 1, "pageNo": 10 }
|
|
2161
|
+
{ "title": "\uC81C2\uC7A5 \uBC29\uBC95\uB860", "level": 1, "pageNo": 10, "children": [] }
|
|
2008
2162
|
]
|
|
2009
2163
|
}`;
|
|
2010
2164
|
}
|
|
@@ -2647,7 +2801,7 @@ var PagePattern = /* @__PURE__ */ ((PagePattern2) => {
|
|
|
2647
2801
|
var PageRangeParser = class extends VisionLLMComponent {
|
|
2648
2802
|
// Configuration constants
|
|
2649
2803
|
SAMPLE_SIZE = 3;
|
|
2650
|
-
MAX_PATTERN_RETRIES =
|
|
2804
|
+
MAX_PATTERN_RETRIES = 19;
|
|
2651
2805
|
SIZE_TOLERANCE = 5;
|
|
2652
2806
|
constructor(logger, model, outputPath, maxRetries = 3, fallbackModel, aggregator, abortSignal) {
|
|
2653
2807
|
super(
|
|
@@ -4047,8 +4201,8 @@ var DocumentProcessor = class {
|
|
|
4047
4201
|
}
|
|
4048
4202
|
if (!markdown) {
|
|
4049
4203
|
this.logger.info("[DocumentProcessor] Using vision fallback for TOC");
|
|
4050
|
-
const
|
|
4051
|
-
markdown = await this.visionTocExtractor.extract(
|
|
4204
|
+
const totalPages2 = Object.keys(doclingDoc.pages).length;
|
|
4205
|
+
markdown = await this.visionTocExtractor.extract(totalPages2);
|
|
4052
4206
|
if (!markdown) {
|
|
4053
4207
|
const reason = "Both rule-based search and vision fallback failed to locate TOC";
|
|
4054
4208
|
this.logger.error(
|
|
@@ -4062,8 +4216,13 @@ var DocumentProcessor = class {
|
|
|
4062
4216
|
`[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
|
|
4063
4217
|
);
|
|
4064
4218
|
}
|
|
4065
|
-
const
|
|
4066
|
-
this.
|
|
4219
|
+
const totalPages = Object.keys(doclingDoc.pages).length;
|
|
4220
|
+
const tocResult = await this.tocExtractor.extract(markdown, {
|
|
4221
|
+
totalPages
|
|
4222
|
+
});
|
|
4223
|
+
for (const usage of tocResult.usages) {
|
|
4224
|
+
this.usageAggregator.track(usage);
|
|
4225
|
+
}
|
|
4067
4226
|
if (tocResult.entries.length === 0) {
|
|
4068
4227
|
const reason = "TOC area was detected but LLM could not extract any structured entries";
|
|
4069
4228
|
this.logger.error(`[DocumentProcessor] TOC extraction failed: ${reason}`);
|