@heripo/document-processor 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1206,7 +1206,8 @@ var TocValidationError = class extends TocExtractError {
1206
1206
  // src/extractors/toc-validator.ts
1207
1207
  var DEFAULT_OPTIONS = {
1208
1208
  totalPages: Infinity,
1209
- maxTitleLength: 200
1209
+ maxTitleLength: 200,
1210
+ maxFirstEntryPageRatio: 0.3
1210
1211
  };
1211
1212
  var TocValidator = class {
1212
1213
  options;
@@ -1227,6 +1228,7 @@ var TocValidator = class {
1227
1228
  validate(entries) {
1228
1229
  this.issues = [];
1229
1230
  this.validateEntries(entries, "", null, /* @__PURE__ */ new Set());
1231
+ this.validateFirstEntryPagePosition(entries);
1230
1232
  const errorCount = this.issues.length;
1231
1233
  return {
1232
1234
  valid: errorCount === 0,
@@ -1243,8 +1245,12 @@ var TocValidator = class {
1243
1245
  validateOrThrow(entries) {
1244
1246
  const result = this.validate(entries);
1245
1247
  if (!result.valid) {
1248
+ const details = result.issues.map(
1249
+ (issue) => ` [${issue.code}] ${issue.message} (path: ${issue.path}, entry: "${issue.entry.title}" page ${issue.entry.pageNo})`
1250
+ ).join("\n");
1246
1251
  throw new TocValidationError(
1247
- `TOC validation failed with ${result.errorCount} error(s)`,
1252
+ `TOC validation failed with ${result.errorCount} error(s):
1253
+ ${details}`,
1248
1254
  result
1249
1255
  );
1250
1256
  }
@@ -1359,6 +1365,33 @@ var TocValidator = class {
1359
1365
  });
1360
1366
  }
1361
1367
  }
1368
+ /**
1369
+ * V007: Validate first entry page position (completeness check)
1370
+ *
1371
+ * If the first level-1 entry starts too late in the document,
1372
+ * earlier entries might be missing from the TOC.
1373
+ */
1374
+ validateFirstEntryPagePosition(entries) {
1375
+ if (entries.length === 0) {
1376
+ return;
1377
+ }
1378
+ if (!isFinite(this.options.totalPages)) {
1379
+ return;
1380
+ }
1381
+ const firstEntry = entries[0];
1382
+ const threshold = Math.max(
1383
+ 50,
1384
+ Math.floor(this.options.totalPages * this.options.maxFirstEntryPageRatio)
1385
+ );
1386
+ if (firstEntry.pageNo > threshold) {
1387
+ this.addIssue({
1388
+ code: "V007",
1389
+ message: `TOC may be incomplete - first entry starts at page ${firstEntry.pageNo}, expected within first ${threshold} pages. Earlier entries might be missing.`,
1390
+ path: "[0]",
1391
+ entry: firstEntry
1392
+ });
1393
+ }
1394
+ }
1362
1395
  /**
1363
1396
  * Add issue to the list
1364
1397
  */
@@ -1577,22 +1610,42 @@ var TocFinder = class {
1577
1610
  return numberCount > 0 && numberCount / (num_rows - 1) > 0.5;
1578
1611
  }
1579
1612
  /**
1580
- * Expand TOC area to consecutive pages
1613
+ * Expand TOC area to consecutive pages (both backward and forward)
1581
1614
  */
1582
1615
  expandToConsecutivePages(initial, doc) {
1583
1616
  const itemRefs = [...initial.itemRefs];
1617
+ const seenRefs = new Set(itemRefs);
1618
+ let startPage = initial.startPage;
1584
1619
  let endPage = initial.endPage;
1620
+ for (let pageNo = initial.startPage - 1; pageNo >= 1; pageNo--) {
1621
+ const continuationItems = this.findContinuationOnPage(doc, pageNo);
1622
+ if (continuationItems.length === 0) {
1623
+ break;
1624
+ }
1625
+ const newItems = continuationItems.filter((ref) => !seenRefs.has(ref));
1626
+ for (const ref of newItems) {
1627
+ seenRefs.add(ref);
1628
+ }
1629
+ itemRefs.unshift(...newItems);
1630
+ startPage = pageNo;
1631
+ this.logger.info(`[TocFinder] Expanded TOC backward to page ${pageNo}`);
1632
+ }
1585
1633
  for (let pageNo = initial.endPage + 1; pageNo <= this.maxSearchPages; pageNo++) {
1586
1634
  const continuationItems = this.findContinuationOnPage(doc, pageNo);
1587
1635
  if (continuationItems.length === 0) {
1588
1636
  break;
1589
1637
  }
1590
- itemRefs.push(...continuationItems);
1638
+ const newItems = continuationItems.filter((ref) => !seenRefs.has(ref));
1639
+ for (const ref of newItems) {
1640
+ seenRefs.add(ref);
1641
+ }
1642
+ itemRefs.push(...newItems);
1591
1643
  endPage = pageNo;
1644
+ this.logger.info(`[TocFinder] Expanded TOC forward to page ${pageNo}`);
1592
1645
  }
1593
1646
  return {
1594
1647
  itemRefs,
1595
- startPage: initial.startPage,
1648
+ startPage,
1596
1649
  endPage
1597
1650
  };
1598
1651
  }
@@ -1810,12 +1863,22 @@ var TextLLMComponent = class extends BaseLLMComponent {
1810
1863
  };
1811
1864
 
1812
1865
  // src/extractors/toc-extractor.ts
1866
+ var MAX_VALIDATION_RETRIES = 3;
1867
+ var VALIDATION_CODE_DESCRIPTIONS = {
1868
+ V001: "Page numbers must be in non-decreasing order within the same level. A decrease usually means a hierarchy or page number error.",
1869
+ V002: "Page number is out of valid range (must be >= 1 and <= total pages).",
1870
+ V003: "Title is empty or contains only whitespace.",
1871
+ V004: "Title exceeds the maximum allowed length.",
1872
+ V005: "Child page number is before parent page number. Children must start on or after the parent page.",
1873
+ V006: "Duplicate entry detected (same title and page number).",
1874
+ V007: "First TOC entry starts too late in the document. Earlier entries may be missing."
1875
+ };
1813
1876
  var TocEntrySchema = z.lazy(
1814
1877
  () => z.object({
1815
1878
  title: z.string().describe("Chapter or section title"),
1816
1879
  level: z.number().int().min(1).describe("Hierarchy depth (1 = top level)"),
1817
1880
  pageNo: z.number().int().min(1).describe("Starting page number"),
1818
- children: z.array(TocEntrySchema).optional().describe("Child sections")
1881
+ children: z.array(TocEntrySchema).describe("Child sections (use empty array [] if none)")
1819
1882
  })
1820
1883
  );
1821
1884
  var TocResponseSchema = z.object({
@@ -1838,19 +1901,21 @@ var TocExtractor = class extends TextLLMComponent {
1838
1901
  /**
1839
1902
  * Extract TOC structure from Markdown
1840
1903
  *
1904
+ * When validation fails, retries with correction feedback up to MAX_VALIDATION_RETRIES times.
1905
+ *
1841
1906
  * @param markdown - Markdown representation of TOC area
1842
- * @returns Object with entries array and token usage information
1907
+ * @param validationOverrides - Optional overrides for validation options (merged with constructor options)
1908
+ * @returns Object with entries array and token usage array (initial extraction + any corrections)
1843
1909
  * @throws {TocParseError} When LLM fails to parse structure
1844
- * @throws {TocValidationError} When validation fails
1910
+ * @throws {TocValidationError} When validation fails after all retries
1845
1911
  */
1846
- async extract(markdown) {
1912
+ async extract(markdown, validationOverrides) {
1847
1913
  this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
1848
1914
  if (!markdown.trim()) {
1849
- this.log("info", "Empty markdown, returning empty array");
1850
- return {
1851
- entries: [],
1852
- usage: this.createEmptyUsage("extraction")
1853
- };
1915
+ this.log("error", "Cannot extract TOC from empty markdown content");
1916
+ throw new TocParseError(
1917
+ "TOC extraction failed: provided markdown content is empty"
1918
+ );
1854
1919
  }
1855
1920
  try {
1856
1921
  const result = await this.callTextLLM(
@@ -1859,18 +1924,52 @@ var TocExtractor = class extends TextLLMComponent {
1859
1924
  this.buildUserPrompt(markdown),
1860
1925
  "extraction"
1861
1926
  );
1862
- const entries = this.normalizeEntries(result.output.entries);
1927
+ const usages = [result.usage];
1928
+ let entries = this.normalizeEntries(result.output.entries);
1863
1929
  if (!this.skipValidation) {
1864
- this.validateEntries(entries);
1930
+ let validationError = this.tryValidateEntries(
1931
+ entries,
1932
+ validationOverrides
1933
+ );
1934
+ for (let attempt = 1; attempt <= MAX_VALIDATION_RETRIES && validationError !== null; attempt++) {
1935
+ this.log(
1936
+ "warn",
1937
+ `Validation failed (attempt ${attempt}/${MAX_VALIDATION_RETRIES}), retrying with correction feedback`
1938
+ );
1939
+ const correctionPrompt = this.buildCorrectionPrompt(
1940
+ markdown,
1941
+ entries,
1942
+ validationError.validationResult.issues
1943
+ );
1944
+ const correctionResult = await this.callTextLLM(
1945
+ TocResponseSchema,
1946
+ this.buildSystemPrompt(),
1947
+ correctionPrompt,
1948
+ `correction-${attempt}`
1949
+ );
1950
+ usages.push(correctionResult.usage);
1951
+ entries = this.normalizeEntries(correctionResult.output.entries);
1952
+ validationError = this.tryValidateEntries(
1953
+ entries,
1954
+ validationOverrides
1955
+ );
1956
+ }
1957
+ if (validationError !== null) {
1958
+ this.log(
1959
+ "error",
1960
+ `Validation failed after ${MAX_VALIDATION_RETRIES} retries:
1961
+ ${validationError.getSummary()}`
1962
+ );
1963
+ throw validationError;
1964
+ }
1865
1965
  }
1866
1966
  this.log(
1867
1967
  "info",
1868
- `Extraction completed: ${entries.length} top-level entries`
1968
+ `Extraction completed: ${entries.length} top-level entries (${usages.length} LLM call(s))`
1869
1969
  );
1870
- return { entries, usage: result.usage };
1970
+ return { entries, usages };
1871
1971
  } catch (error) {
1872
1972
  if (error instanceof TocValidationError) {
1873
- this.log("error", `Validation failed: ${error.message}`);
1874
1973
  throw error;
1875
1974
  }
1876
1975
  const message = error instanceof Error ? error.message : String(error);
@@ -1881,16 +1980,69 @@ var TocExtractor = class extends TextLLMComponent {
1881
1980
  }
1882
1981
  }
1883
1982
  /**
1884
- * Validate extracted entries
1983
+ * Validate extracted entries and return error or null
1885
1984
  *
1886
- * @throws {TocValidationError} When validation fails
1985
+ * Unlike validateOrThrow, this returns the error instead of throwing,
1986
+ * allowing the retry loop to handle it.
1987
+ *
1988
+ * @returns TocValidationError if validation fails, null if valid
1887
1989
  */
1888
- validateEntries(entries) {
1990
+ tryValidateEntries(entries, overrides) {
1889
1991
  if (entries.length === 0) {
1890
- return;
1992
+ return null;
1891
1993
  }
1892
- const validator = new TocValidator(this.validationOptions);
1893
- validator.validateOrThrow(entries);
1994
+ const options = { ...this.validationOptions, ...overrides };
1995
+ const validator = new TocValidator(options);
1996
+ const result = validator.validate(entries);
1997
+ if (!result.valid) {
1998
+ const details = result.issues.map(
1999
+ (issue) => ` [${issue.code}] ${issue.message} (path: ${issue.path}, entry: "${issue.entry.title}" page ${issue.entry.pageNo})`
2000
+ ).join("\n");
2001
+ return new TocValidationError(
2002
+ `TOC validation failed with ${result.errorCount} error(s):
2003
+ ${details}`,
2004
+ result
2005
+ );
2006
+ }
2007
+ return null;
2008
+ }
2009
+ /**
2010
+ * Build correction prompt with validation error feedback
2011
+ *
2012
+ * Includes the original markdown, previous extraction result,
2013
+ * validation errors, and guidance for fixing common mistakes.
2014
+ */
2015
+ buildCorrectionPrompt(markdown, previousEntries, issues) {
2016
+ const errorLines = issues.map((issue) => {
2017
+ const desc = VALIDATION_CODE_DESCRIPTIONS[issue.code] ?? "Unknown validation error.";
2018
+ return `- [${issue.code}] ${issue.message}
2019
+ Path: ${issue.path}
2020
+ Entry: "${issue.entry.title}" (page ${issue.entry.pageNo})
2021
+ Rule: ${desc}`;
2022
+ });
2023
+ return `Your previous TOC extraction had validation errors. Please fix them and re-extract.
2024
+
2025
+ ## Validation Errors
2026
+
2027
+ ${errorLines.join("\n\n")}
2028
+
2029
+ ## Common Mistakes to Avoid
2030
+
2031
+ 1. **Hierarchy confusion**: Entries with the same numbering prefix (e.g., "4)") can belong to different hierarchy levels depending on context. Use indentation and surrounding entries to determine the correct parent-child relationship.
2032
+ 2. **Page number misread**: Carefully distinguish Roman numerals (VI=6) from Arabic numerals. "VI. \uACE0\uCC30" at page 277 is NOT "V. \uACE0\uCC30" at page 27.
2033
+ 3. **Page order**: Within the same parent, sibling entries must have non-decreasing page numbers. If a page number decreases, the entry likely belongs to a different hierarchy level.
2034
+
2035
+ ## Original Markdown
2036
+
2037
+ ${markdown}
2038
+
2039
+ ## Your Previous Extraction (with errors)
2040
+
2041
+ ${JSON.stringify(previousEntries, null, 2)}
2042
+
2043
+ ## Instructions
2044
+
2045
+ Re-extract the TOC structure from the original markdown above. Fix all validation errors listed above. Return the corrected entries.`;
1894
2046
  }
1895
2047
  /**
1896
2048
  * Build system prompt for TOC extraction
@@ -1908,11 +2060,12 @@ var TocExtractor = class extends TextLLMComponent {
1908
2060
  - Level 3: Subsections (e.g., "1.1.1", "a.", "(1)")
1909
2061
  - Use indentation and numbering patterns to infer level
1910
2062
 
1911
- 3. **Page Number**: Extract the page number from each entry. Convert Roman numerals to Arabic numerals if present (e.g., "iv" \u2192 4).
2063
+ 3. **Page Number**: Extract the page number from each entry. Use only Arabic numerals for page numbers.
1912
2064
 
1913
2065
  4. **Children**: Nest child entries under parent entries based on their hierarchy level.
1914
2066
 
1915
- 5. **IMPORTANT - Extract Main TOC Only**: Only extract the main document table of contents. EXCLUDE the following supplementary indices:
2067
+ 5. **IMPORTANT - Extract Main TOC Only**: Only extract the main document table of contents. EXCLUDE the following:
2068
+ - **Front matter with Roman numeral pages**: Entries whose page numbers are Roman numerals (i, ii, xxi, etc.) such as \uC77C\uB7EC\uB450\uAE30, \uBC1C\uAC04\uC0AC, \uC11C\uBB38, \uBC94\uB840, Preface, Foreword, Editorial Notes. These use a separate page numbering system and are not part of the main content.
1916
2069
  - Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, \uD654\uBCF4 \uBAA9\uCC28, Photo Index, List of Photos, List of Figures)
1917
2070
  - Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, \uC0BD\uB3C4 \uBAA9\uCC28, Drawing Index, List of Drawings)
1918
2071
  - Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
@@ -1939,11 +2092,11 @@ Output:
1939
2092
  "level": 1,
1940
2093
  "pageNo": 1,
1941
2094
  "children": [
1942
- { "title": "1. \uC5F0\uAD6C \uBC30\uACBD", "level": 2, "pageNo": 3 },
1943
- { "title": "2. \uC5F0\uAD6C \uBAA9\uC801", "level": 2, "pageNo": 5 }
2095
+ { "title": "1. \uC5F0\uAD6C \uBC30\uACBD", "level": 2, "pageNo": 3, "children": [] },
2096
+ { "title": "2. \uC5F0\uAD6C \uBAA9\uC801", "level": 2, "pageNo": 5, "children": [] }
1944
2097
  ]
1945
2098
  },
1946
- { "title": "\uC81C2\uC7A5 \uBC29\uBC95\uB860", "level": 1, "pageNo": 10 }
2099
+ { "title": "\uC81C2\uC7A5 \uBC29\uBC95\uB860", "level": 1, "pageNo": 10, "children": [] }
1947
2100
  ]
1948
2101
  }`;
1949
2102
  }
@@ -2586,7 +2739,7 @@ var PagePattern = /* @__PURE__ */ ((PagePattern2) => {
2586
2739
  var PageRangeParser = class extends VisionLLMComponent {
2587
2740
  // Configuration constants
2588
2741
  SAMPLE_SIZE = 3;
2589
- MAX_PATTERN_RETRIES = 6;
2742
+ MAX_PATTERN_RETRIES = 19;
2590
2743
  SIZE_TOLERANCE = 5;
2591
2744
  constructor(logger, model, outputPath, maxRetries = 3, fallbackModel, aggregator, abortSignal) {
2592
2745
  super(
@@ -3986,20 +4139,33 @@ var DocumentProcessor = class {
3986
4139
  }
3987
4140
  if (!markdown) {
3988
4141
  this.logger.info("[DocumentProcessor] Using vision fallback for TOC");
3989
- const totalPages = Object.keys(doclingDoc.pages).length;
3990
- markdown = await this.visionTocExtractor.extract(totalPages);
4142
+ const totalPages2 = Object.keys(doclingDoc.pages).length;
4143
+ markdown = await this.visionTocExtractor.extract(totalPages2);
3991
4144
  if (!markdown) {
3992
- this.logger.warn(
3993
- "[DocumentProcessor] TOC not found in any method, returning empty"
4145
+ const reason = "Both rule-based search and vision fallback failed to locate TOC";
4146
+ this.logger.error(
4147
+ `[DocumentProcessor] TOC extraction failed: ${reason}`
4148
+ );
4149
+ throw new TocNotFoundError(
4150
+ `Table of contents not found in the document. ${reason}.`
3994
4151
  );
3995
- return [];
3996
4152
  }
3997
4153
  this.logger.info(
3998
4154
  `[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
3999
4155
  );
4000
4156
  }
4001
- const tocResult = await this.tocExtractor.extract(markdown);
4002
- this.usageAggregator.track(tocResult.usage);
4157
+ const totalPages = Object.keys(doclingDoc.pages).length;
4158
+ const tocResult = await this.tocExtractor.extract(markdown, {
4159
+ totalPages
4160
+ });
4161
+ for (const usage of tocResult.usages) {
4162
+ this.usageAggregator.track(usage);
4163
+ }
4164
+ if (tocResult.entries.length === 0) {
4165
+ const reason = "TOC area was detected but LLM could not extract any structured entries";
4166
+ this.logger.error(`[DocumentProcessor] TOC extraction failed: ${reason}`);
4167
+ throw new TocNotFoundError(`${reason}.`);
4168
+ }
4003
4169
  this.logger.info(
4004
4170
  `[DocumentProcessor] Extracted ${tocResult.entries.length} top-level TOC entries`
4005
4171
  );
@@ -4239,21 +4405,14 @@ var DocumentProcessor = class {
4239
4405
  * Convert chapters and link resources
4240
4406
  *
4241
4407
  * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
4242
- * Falls back to single "Document" chapter when TOC is empty.
4408
+ * Throws TocNotFoundError if TOC entries are empty (defensive assertion).
4243
4409
  */
4244
4410
  async convertChapters(doclingDoc, tocEntries, pageRangeMap, images, tables, footnotes) {
4245
4411
  this.logger.info("[DocumentProcessor] Converting chapters...");
4246
4412
  if (tocEntries.length === 0) {
4247
- this.logger.info(
4248
- "[DocumentProcessor] No TOC entries, creating fallback chapter"
4249
- );
4250
- return this.createFallbackChapter(
4251
- doclingDoc,
4252
- pageRangeMap,
4253
- images,
4254
- tables,
4255
- footnotes
4256
- );
4413
+ const reason = "Cannot convert chapters without TOC entries";
4414
+ this.logger.error(`[DocumentProcessor] ${reason}`);
4415
+ throw new TocNotFoundError(reason);
4257
4416
  }
4258
4417
  const chapters = this.chapterConverter.convert(
4259
4418
  tocEntries,
@@ -4268,48 +4427,6 @@ var DocumentProcessor = class {
4268
4427
  );
4269
4428
  return chapters;
4270
4429
  }
4271
- /**
4272
- * Create a fallback chapter when TOC is not available
4273
- *
4274
- * Creates a single "Document" chapter containing all text blocks,
4275
- * images, tables, and footnotes from the document.
4276
- */
4277
- createFallbackChapter(doclingDoc, pageRangeMap, images, tables, footnotes) {
4278
- const textBlocks = doclingDoc.texts.filter(
4279
- (item) => item.label !== "footnote" && this.textCleaner.isValidText(item.text)
4280
- ).map((item) => ({
4281
- text: this.textCleaner.normalize(item.text),
4282
- pdfPageNo: item.prov?.[0]?.page_no ?? 1
4283
- }));
4284
- if (textBlocks.length === 0 && images.length === 0 && tables.length === 0 && footnotes.length === 0) {
4285
- this.logger.info(
4286
- "[DocumentProcessor] No content found for fallback chapter"
4287
- );
4288
- return [];
4289
- }
4290
- const firstPdfPage = Math.min(
4291
- ...Object.keys(pageRangeMap).map(Number).filter((n) => !isNaN(n)),
4292
- 1
4293
- );
4294
- const firstPageRange = pageRangeMap[firstPdfPage];
4295
- const pageNo = firstPageRange?.startPageNo ?? 1;
4296
- const fallbackChapter = {
4297
- id: this.idGenerator.generateChapterId(),
4298
- originTitle: "Document",
4299
- title: "Document",
4300
- pageNo,
4301
- level: 1,
4302
- textBlocks,
4303
- imageIds: images.map((img) => img.id),
4304
- tableIds: tables.map((tbl) => tbl.id),
4305
- footnoteIds: footnotes.map((ftn) => ftn.id),
4306
- children: []
4307
- };
4308
- this.logger.info(
4309
- `[DocumentProcessor] Created fallback chapter with ${textBlocks.length} text blocks, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
4310
- );
4311
- return [fallbackChapter];
4312
- }
4313
4430
  };
4314
4431
  export {
4315
4432
  BaseLLMComponent,