@heripo/document-processor 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1268,7 +1268,8 @@ var TocValidationError = class extends TocExtractError {
1268
1268
  // src/extractors/toc-validator.ts
1269
1269
  var DEFAULT_OPTIONS = {
1270
1270
  totalPages: Infinity,
1271
- maxTitleLength: 200
1271
+ maxTitleLength: 200,
1272
+ maxFirstEntryPageRatio: 0.3
1272
1273
  };
1273
1274
  var TocValidator = class {
1274
1275
  options;
@@ -1289,6 +1290,7 @@ var TocValidator = class {
1289
1290
  validate(entries) {
1290
1291
  this.issues = [];
1291
1292
  this.validateEntries(entries, "", null, /* @__PURE__ */ new Set());
1293
+ this.validateFirstEntryPagePosition(entries);
1292
1294
  const errorCount = this.issues.length;
1293
1295
  return {
1294
1296
  valid: errorCount === 0,
@@ -1305,8 +1307,12 @@ var TocValidator = class {
1305
1307
  validateOrThrow(entries) {
1306
1308
  const result = this.validate(entries);
1307
1309
  if (!result.valid) {
1310
+ const details = result.issues.map(
1311
+ (issue) => ` [${issue.code}] ${issue.message} (path: ${issue.path}, entry: "${issue.entry.title}" page ${issue.entry.pageNo})`
1312
+ ).join("\n");
1308
1313
  throw new TocValidationError(
1309
- `TOC validation failed with ${result.errorCount} error(s)`,
1314
+ `TOC validation failed with ${result.errorCount} error(s):
1315
+ ${details}`,
1310
1316
  result
1311
1317
  );
1312
1318
  }
@@ -1421,6 +1427,33 @@ var TocValidator = class {
1421
1427
  });
1422
1428
  }
1423
1429
  }
1430
+ /**
1431
+ * V007: Validate first entry page position (completeness check)
1432
+ *
1433
+ * If the first level-1 entry starts too late in the document,
1434
+ * earlier entries might be missing from the TOC.
1435
+ */
1436
+ validateFirstEntryPagePosition(entries) {
1437
+ if (entries.length === 0) {
1438
+ return;
1439
+ }
1440
+ if (!isFinite(this.options.totalPages)) {
1441
+ return;
1442
+ }
1443
+ const firstEntry = entries[0];
1444
+ const threshold = Math.max(
1445
+ 50,
1446
+ Math.floor(this.options.totalPages * this.options.maxFirstEntryPageRatio)
1447
+ );
1448
+ if (firstEntry.pageNo > threshold) {
1449
+ this.addIssue({
1450
+ code: "V007",
1451
+ message: `TOC may be incomplete - first entry starts at page ${firstEntry.pageNo}, expected within first ${threshold} pages. Earlier entries might be missing.`,
1452
+ path: "[0]",
1453
+ entry: firstEntry
1454
+ });
1455
+ }
1456
+ }
1424
1457
  /**
1425
1458
  * Add issue to the list
1426
1459
  */
@@ -1639,22 +1672,42 @@ var TocFinder = class {
1639
1672
  return numberCount > 0 && numberCount / (num_rows - 1) > 0.5;
1640
1673
  }
1641
1674
  /**
1642
- * Expand TOC area to consecutive pages
1675
+ * Expand TOC area to consecutive pages (both backward and forward)
1643
1676
  */
1644
1677
  expandToConsecutivePages(initial, doc) {
1645
1678
  const itemRefs = [...initial.itemRefs];
1679
+ const seenRefs = new Set(itemRefs);
1680
+ let startPage = initial.startPage;
1646
1681
  let endPage = initial.endPage;
1682
+ for (let pageNo = initial.startPage - 1; pageNo >= 1; pageNo--) {
1683
+ const continuationItems = this.findContinuationOnPage(doc, pageNo);
1684
+ if (continuationItems.length === 0) {
1685
+ break;
1686
+ }
1687
+ const newItems = continuationItems.filter((ref) => !seenRefs.has(ref));
1688
+ for (const ref of newItems) {
1689
+ seenRefs.add(ref);
1690
+ }
1691
+ itemRefs.unshift(...newItems);
1692
+ startPage = pageNo;
1693
+ this.logger.info(`[TocFinder] Expanded TOC backward to page ${pageNo}`);
1694
+ }
1647
1695
  for (let pageNo = initial.endPage + 1; pageNo <= this.maxSearchPages; pageNo++) {
1648
1696
  const continuationItems = this.findContinuationOnPage(doc, pageNo);
1649
1697
  if (continuationItems.length === 0) {
1650
1698
  break;
1651
1699
  }
1652
- itemRefs.push(...continuationItems);
1700
+ const newItems = continuationItems.filter((ref) => !seenRefs.has(ref));
1701
+ for (const ref of newItems) {
1702
+ seenRefs.add(ref);
1703
+ }
1704
+ itemRefs.push(...newItems);
1653
1705
  endPage = pageNo;
1706
+ this.logger.info(`[TocFinder] Expanded TOC forward to page ${pageNo}`);
1654
1707
  }
1655
1708
  return {
1656
1709
  itemRefs,
1657
- startPage: initial.startPage,
1710
+ startPage,
1658
1711
  endPage
1659
1712
  };
1660
1713
  }
@@ -1872,12 +1925,22 @@ var TextLLMComponent = class extends BaseLLMComponent {
1872
1925
  };
1873
1926
 
1874
1927
  // src/extractors/toc-extractor.ts
1928
+ var MAX_VALIDATION_RETRIES = 3;
1929
+ var VALIDATION_CODE_DESCRIPTIONS = {
1930
+ V001: "Page numbers must be in non-decreasing order within the same level. A decrease usually means a hierarchy or page number error.",
1931
+ V002: "Page number is out of valid range (must be >= 1 and <= total pages).",
1932
+ V003: "Title is empty or contains only whitespace.",
1933
+ V004: "Title exceeds the maximum allowed length.",
1934
+ V005: "Child page number is before parent page number. Children must start on or after the parent page.",
1935
+ V006: "Duplicate entry detected (same title and page number).",
1936
+ V007: "First TOC entry starts too late in the document. Earlier entries may be missing."
1937
+ };
1875
1938
  var TocEntrySchema = import_zod.z.lazy(
1876
1939
  () => import_zod.z.object({
1877
1940
  title: import_zod.z.string().describe("Chapter or section title"),
1878
1941
  level: import_zod.z.number().int().min(1).describe("Hierarchy depth (1 = top level)"),
1879
1942
  pageNo: import_zod.z.number().int().min(1).describe("Starting page number"),
1880
- children: import_zod.z.array(TocEntrySchema).optional().describe("Child sections")
1943
+ children: import_zod.z.array(TocEntrySchema).describe("Child sections (use empty array [] if none)")
1881
1944
  })
1882
1945
  );
1883
1946
  var TocResponseSchema = import_zod.z.object({
@@ -1900,19 +1963,21 @@ var TocExtractor = class extends TextLLMComponent {
1900
1963
  /**
1901
1964
  * Extract TOC structure from Markdown
1902
1965
  *
1966
+ * When validation fails, retries with correction feedback up to MAX_VALIDATION_RETRIES times.
1967
+ *
1903
1968
  * @param markdown - Markdown representation of TOC area
1904
- * @returns Object with entries array and token usage information
1969
+ * @param validationOverrides - Optional overrides for validation options (merged with constructor options)
1970
+ * @returns Object with entries array and token usage array (initial extraction + any corrections)
1905
1971
  * @throws {TocParseError} When LLM fails to parse structure
1906
- * @throws {TocValidationError} When validation fails
1972
+ * @throws {TocValidationError} When validation fails after all retries
1907
1973
  */
1908
- async extract(markdown) {
1974
+ async extract(markdown, validationOverrides) {
1909
1975
  this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
1910
1976
  if (!markdown.trim()) {
1911
- this.log("info", "Empty markdown, returning empty array");
1912
- return {
1913
- entries: [],
1914
- usage: this.createEmptyUsage("extraction")
1915
- };
1977
+ this.log("error", "Cannot extract TOC from empty markdown content");
1978
+ throw new TocParseError(
1979
+ "TOC extraction failed: provided markdown content is empty"
1980
+ );
1916
1981
  }
1917
1982
  try {
1918
1983
  const result = await this.callTextLLM(
@@ -1921,18 +1986,52 @@ var TocExtractor = class extends TextLLMComponent {
1921
1986
  this.buildUserPrompt(markdown),
1922
1987
  "extraction"
1923
1988
  );
1924
- const entries = this.normalizeEntries(result.output.entries);
1989
+ const usages = [result.usage];
1990
+ let entries = this.normalizeEntries(result.output.entries);
1925
1991
  if (!this.skipValidation) {
1926
- this.validateEntries(entries);
1992
+ let validationError = this.tryValidateEntries(
1993
+ entries,
1994
+ validationOverrides
1995
+ );
1996
+ for (let attempt = 1; attempt <= MAX_VALIDATION_RETRIES && validationError !== null; attempt++) {
1997
+ this.log(
1998
+ "warn",
1999
+ `Validation failed (attempt ${attempt}/${MAX_VALIDATION_RETRIES}), retrying with correction feedback`
2000
+ );
2001
+ const correctionPrompt = this.buildCorrectionPrompt(
2002
+ markdown,
2003
+ entries,
2004
+ validationError.validationResult.issues
2005
+ );
2006
+ const correctionResult = await this.callTextLLM(
2007
+ TocResponseSchema,
2008
+ this.buildSystemPrompt(),
2009
+ correctionPrompt,
2010
+ `correction-${attempt}`
2011
+ );
2012
+ usages.push(correctionResult.usage);
2013
+ entries = this.normalizeEntries(correctionResult.output.entries);
2014
+ validationError = this.tryValidateEntries(
2015
+ entries,
2016
+ validationOverrides
2017
+ );
2018
+ }
2019
+ if (validationError !== null) {
2020
+ this.log(
2021
+ "error",
2022
+ `Validation failed after ${MAX_VALIDATION_RETRIES} retries:
2023
+ ${validationError.getSummary()}`
2024
+ );
2025
+ throw validationError;
2026
+ }
1927
2027
  }
1928
2028
  this.log(
1929
2029
  "info",
1930
- `Extraction completed: ${entries.length} top-level entries`
2030
+ `Extraction completed: ${entries.length} top-level entries (${usages.length} LLM call(s))`
1931
2031
  );
1932
- return { entries, usage: result.usage };
2032
+ return { entries, usages };
1933
2033
  } catch (error) {
1934
2034
  if (error instanceof TocValidationError) {
1935
- this.log("error", `Validation failed: ${error.message}`);
1936
2035
  throw error;
1937
2036
  }
1938
2037
  const message = error instanceof Error ? error.message : String(error);
@@ -1943,16 +2042,69 @@ var TocExtractor = class extends TextLLMComponent {
1943
2042
  }
1944
2043
  }
1945
2044
  /**
1946
- * Validate extracted entries
2045
+ * Validate extracted entries and return error or null
1947
2046
  *
1948
- * @throws {TocValidationError} When validation fails
2047
+ * Unlike validateOrThrow, this returns the error instead of throwing,
2048
+ * allowing the retry loop to handle it.
2049
+ *
2050
+ * @returns TocValidationError if validation fails, null if valid
1949
2051
  */
1950
- validateEntries(entries) {
2052
+ tryValidateEntries(entries, overrides) {
1951
2053
  if (entries.length === 0) {
1952
- return;
2054
+ return null;
1953
2055
  }
1954
- const validator = new TocValidator(this.validationOptions);
1955
- validator.validateOrThrow(entries);
2056
+ const options = { ...this.validationOptions, ...overrides };
2057
+ const validator = new TocValidator(options);
2058
+ const result = validator.validate(entries);
2059
+ if (!result.valid) {
2060
+ const details = result.issues.map(
2061
+ (issue) => ` [${issue.code}] ${issue.message} (path: ${issue.path}, entry: "${issue.entry.title}" page ${issue.entry.pageNo})`
2062
+ ).join("\n");
2063
+ return new TocValidationError(
2064
+ `TOC validation failed with ${result.errorCount} error(s):
2065
+ ${details}`,
2066
+ result
2067
+ );
2068
+ }
2069
+ return null;
2070
+ }
2071
+ /**
2072
+ * Build correction prompt with validation error feedback
2073
+ *
2074
+ * Includes the original markdown, previous extraction result,
2075
+ * validation errors, and guidance for fixing common mistakes.
2076
+ */
2077
+ buildCorrectionPrompt(markdown, previousEntries, issues) {
2078
+ const errorLines = issues.map((issue) => {
2079
+ const desc = VALIDATION_CODE_DESCRIPTIONS[issue.code] ?? "Unknown validation error.";
2080
+ return `- [${issue.code}] ${issue.message}
2081
+ Path: ${issue.path}
2082
+ Entry: "${issue.entry.title}" (page ${issue.entry.pageNo})
2083
+ Rule: ${desc}`;
2084
+ });
2085
+ return `Your previous TOC extraction had validation errors. Please fix them and re-extract.
2086
+
2087
+ ## Validation Errors
2088
+
2089
+ ${errorLines.join("\n\n")}
2090
+
2091
+ ## Common Mistakes to Avoid
2092
+
2093
+ 1. **Hierarchy confusion**: Entries with the same numbering prefix (e.g., "4)") can belong to different hierarchy levels depending on context. Use indentation and surrounding entries to determine the correct parent-child relationship.
2094
+ 2. **Page number misread**: Carefully distinguish Roman numerals (VI=6) from Arabic numerals. "VI. \uACE0\uCC30" at page 277 is NOT "V. \uACE0\uCC30" at page 27.
2095
+ 3. **Page order**: Within the same parent, sibling entries must have non-decreasing page numbers. If a page number decreases, the entry likely belongs to a different hierarchy level.
2096
+
2097
+ ## Original Markdown
2098
+
2099
+ ${markdown}
2100
+
2101
+ ## Your Previous Extraction (with errors)
2102
+
2103
+ ${JSON.stringify(previousEntries, null, 2)}
2104
+
2105
+ ## Instructions
2106
+
2107
+ Re-extract the TOC structure from the original markdown above. Fix all validation errors listed above. Return the corrected entries.`;
1956
2108
  }
1957
2109
  /**
1958
2110
  * Build system prompt for TOC extraction
@@ -1970,11 +2122,12 @@ var TocExtractor = class extends TextLLMComponent {
1970
2122
  - Level 3: Subsections (e.g., "1.1.1", "a.", "(1)")
1971
2123
  - Use indentation and numbering patterns to infer level
1972
2124
 
1973
- 3. **Page Number**: Extract the page number from each entry. Convert Roman numerals to Arabic numerals if present (e.g., "iv" \u2192 4).
2125
+ 3. **Page Number**: Extract the page number from each entry. Use only Arabic numerals for page numbers.
1974
2126
 
1975
2127
  4. **Children**: Nest child entries under parent entries based on their hierarchy level.
1976
2128
 
1977
- 5. **IMPORTANT - Extract Main TOC Only**: Only extract the main document table of contents. EXCLUDE the following supplementary indices:
2129
+ 5. **IMPORTANT - Extract Main TOC Only**: Only extract the main document table of contents. EXCLUDE the following:
2130
+ - **Front matter with Roman numeral pages**: Entries whose page numbers are Roman numerals (i, ii, xxi, etc.) such as \uC77C\uB7EC\uB450\uAE30, \uBC1C\uAC04\uC0AC, \uC11C\uBB38, \uBC94\uB840, Preface, Foreword, Editorial Notes. These use a separate page numbering system and are not part of the main content.
1978
2131
  - Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, \uD654\uBCF4 \uBAA9\uCC28, Photo Index, List of Photos, List of Figures)
1979
2132
  - Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, \uC0BD\uB3C4 \uBAA9\uCC28, Drawing Index, List of Drawings)
1980
2133
  - Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
@@ -2001,11 +2154,11 @@ Output:
2001
2154
  "level": 1,
2002
2155
  "pageNo": 1,
2003
2156
  "children": [
2004
- { "title": "1. \uC5F0\uAD6C \uBC30\uACBD", "level": 2, "pageNo": 3 },
2005
- { "title": "2. \uC5F0\uAD6C \uBAA9\uC801", "level": 2, "pageNo": 5 }
2157
+ { "title": "1. \uC5F0\uAD6C \uBC30\uACBD", "level": 2, "pageNo": 3, "children": [] },
2158
+ { "title": "2. \uC5F0\uAD6C \uBAA9\uC801", "level": 2, "pageNo": 5, "children": [] }
2006
2159
  ]
2007
2160
  },
2008
- { "title": "\uC81C2\uC7A5 \uBC29\uBC95\uB860", "level": 1, "pageNo": 10 }
2161
+ { "title": "\uC81C2\uC7A5 \uBC29\uBC95\uB860", "level": 1, "pageNo": 10, "children": [] }
2009
2162
  ]
2010
2163
  }`;
2011
2164
  }
@@ -2648,7 +2801,7 @@ var PagePattern = /* @__PURE__ */ ((PagePattern2) => {
2648
2801
  var PageRangeParser = class extends VisionLLMComponent {
2649
2802
  // Configuration constants
2650
2803
  SAMPLE_SIZE = 3;
2651
- MAX_PATTERN_RETRIES = 6;
2804
+ MAX_PATTERN_RETRIES = 19;
2652
2805
  SIZE_TOLERANCE = 5;
2653
2806
  constructor(logger, model, outputPath, maxRetries = 3, fallbackModel, aggregator, abortSignal) {
2654
2807
  super(
@@ -4048,20 +4201,33 @@ var DocumentProcessor = class {
4048
4201
  }
4049
4202
  if (!markdown) {
4050
4203
  this.logger.info("[DocumentProcessor] Using vision fallback for TOC");
4051
- const totalPages = Object.keys(doclingDoc.pages).length;
4052
- markdown = await this.visionTocExtractor.extract(totalPages);
4204
+ const totalPages2 = Object.keys(doclingDoc.pages).length;
4205
+ markdown = await this.visionTocExtractor.extract(totalPages2);
4053
4206
  if (!markdown) {
4054
- this.logger.warn(
4055
- "[DocumentProcessor] TOC not found in any method, returning empty"
4207
+ const reason = "Both rule-based search and vision fallback failed to locate TOC";
4208
+ this.logger.error(
4209
+ `[DocumentProcessor] TOC extraction failed: ${reason}`
4210
+ );
4211
+ throw new TocNotFoundError(
4212
+ `Table of contents not found in the document. ${reason}.`
4056
4213
  );
4057
- return [];
4058
4214
  }
4059
4215
  this.logger.info(
4060
4216
  `[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
4061
4217
  );
4062
4218
  }
4063
- const tocResult = await this.tocExtractor.extract(markdown);
4064
- this.usageAggregator.track(tocResult.usage);
4219
+ const totalPages = Object.keys(doclingDoc.pages).length;
4220
+ const tocResult = await this.tocExtractor.extract(markdown, {
4221
+ totalPages
4222
+ });
4223
+ for (const usage of tocResult.usages) {
4224
+ this.usageAggregator.track(usage);
4225
+ }
4226
+ if (tocResult.entries.length === 0) {
4227
+ const reason = "TOC area was detected but LLM could not extract any structured entries";
4228
+ this.logger.error(`[DocumentProcessor] TOC extraction failed: ${reason}`);
4229
+ throw new TocNotFoundError(`${reason}.`);
4230
+ }
4065
4231
  this.logger.info(
4066
4232
  `[DocumentProcessor] Extracted ${tocResult.entries.length} top-level TOC entries`
4067
4233
  );
@@ -4301,21 +4467,14 @@ var DocumentProcessor = class {
4301
4467
  * Convert chapters and link resources
4302
4468
  *
4303
4469
  * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
4304
- * Falls back to single "Document" chapter when TOC is empty.
4470
+ * Throws TocNotFoundError if TOC entries are empty (defensive assertion).
4305
4471
  */
4306
4472
  async convertChapters(doclingDoc, tocEntries, pageRangeMap, images, tables, footnotes) {
4307
4473
  this.logger.info("[DocumentProcessor] Converting chapters...");
4308
4474
  if (tocEntries.length === 0) {
4309
- this.logger.info(
4310
- "[DocumentProcessor] No TOC entries, creating fallback chapter"
4311
- );
4312
- return this.createFallbackChapter(
4313
- doclingDoc,
4314
- pageRangeMap,
4315
- images,
4316
- tables,
4317
- footnotes
4318
- );
4475
+ const reason = "Cannot convert chapters without TOC entries";
4476
+ this.logger.error(`[DocumentProcessor] ${reason}`);
4477
+ throw new TocNotFoundError(reason);
4319
4478
  }
4320
4479
  const chapters = this.chapterConverter.convert(
4321
4480
  tocEntries,
@@ -4330,48 +4489,6 @@ var DocumentProcessor = class {
4330
4489
  );
4331
4490
  return chapters;
4332
4491
  }
4333
- /**
4334
- * Create a fallback chapter when TOC is not available
4335
- *
4336
- * Creates a single "Document" chapter containing all text blocks,
4337
- * images, tables, and footnotes from the document.
4338
- */
4339
- createFallbackChapter(doclingDoc, pageRangeMap, images, tables, footnotes) {
4340
- const textBlocks = doclingDoc.texts.filter(
4341
- (item) => item.label !== "footnote" && this.textCleaner.isValidText(item.text)
4342
- ).map((item) => ({
4343
- text: this.textCleaner.normalize(item.text),
4344
- pdfPageNo: item.prov?.[0]?.page_no ?? 1
4345
- }));
4346
- if (textBlocks.length === 0 && images.length === 0 && tables.length === 0 && footnotes.length === 0) {
4347
- this.logger.info(
4348
- "[DocumentProcessor] No content found for fallback chapter"
4349
- );
4350
- return [];
4351
- }
4352
- const firstPdfPage = Math.min(
4353
- ...Object.keys(pageRangeMap).map(Number).filter((n) => !isNaN(n)),
4354
- 1
4355
- );
4356
- const firstPageRange = pageRangeMap[firstPdfPage];
4357
- const pageNo = firstPageRange?.startPageNo ?? 1;
4358
- const fallbackChapter = {
4359
- id: this.idGenerator.generateChapterId(),
4360
- originTitle: "Document",
4361
- title: "Document",
4362
- pageNo,
4363
- level: 1,
4364
- textBlocks,
4365
- imageIds: images.map((img) => img.id),
4366
- tableIds: tables.map((tbl) => tbl.id),
4367
- footnoteIds: footnotes.map((ftn) => ftn.id),
4368
- children: []
4369
- };
4370
- this.logger.info(
4371
- `[DocumentProcessor] Created fallback chapter with ${textBlocks.length} text blocks, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
4372
- );
4373
- return [fallbackChapter];
4374
- }
4375
4492
  };
4376
4493
  // Annotate the CommonJS export names for ESM import in node:
4377
4494
  0 && (module.exports = {