@heripo/document-processor 0.1.2 → 0.1.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1908,11 +1908,10 @@ var TocExtractor = class extends TextLLMComponent {
1908
1908
  async extract(markdown) {
1909
1909
  this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
1910
1910
  if (!markdown.trim()) {
1911
- this.log("info", "Empty markdown, returning empty array");
1912
- return {
1913
- entries: [],
1914
- usage: this.createEmptyUsage("extraction")
1915
- };
1911
+ this.log("error", "Cannot extract TOC from empty markdown content");
1912
+ throw new TocParseError(
1913
+ "TOC extraction failed: provided markdown content is empty"
1914
+ );
1916
1915
  }
1917
1916
  try {
1918
1917
  const result = await this.callTextLLM(
@@ -2219,7 +2218,15 @@ var VisionTocExtractor = class extends VisionLLMComponent {
2219
2218
  */
2220
2219
  async extractFromBatch(startPage, endPage) {
2221
2220
  this.log("info", `Extracting from pages ${startPage}-${endPage}`);
2221
+ this.log(
2222
+ "info",
2223
+ `Preparing images for vision analysis. This can be very slow (10+ minutes, sometimes 30+ minutes) depending on batch size and image resolution.`
2224
+ );
2222
2225
  const imageContents = this.loadPageImages(startPage, endPage);
2226
+ this.log(
2227
+ "info",
2228
+ `Calling vision LLM for TOC extraction (pages ${startPage}-${endPage})`
2229
+ );
2223
2230
  const result = await LLMCaller.callVision({
2224
2231
  schema: VisionTocExtractionSchema,
2225
2232
  messages: [
@@ -2242,6 +2249,10 @@ var VisionTocExtractor = class extends VisionLLMComponent {
2242
2249
  component: "VisionTocExtractor",
2243
2250
  phase: "extraction"
2244
2251
  });
2252
+ this.log(
2253
+ "info",
2254
+ `Vision LLM call completed (pages ${startPage}-${endPage})`
2255
+ );
2245
2256
  this.trackUsage(result.usage);
2246
2257
  return result.output;
2247
2258
  }
@@ -3304,9 +3315,11 @@ var BaseValidator = class extends TextLLMComponent {
3304
3315
  // src/validators/toc-content-validator.ts
3305
3316
  var import_zod5 = require("zod");
3306
3317
  var TocContentValidationSchema = import_zod5.z.object({
3307
- isToc: import_zod5.z.boolean().describe("Whether the content is a table of contents"),
3318
+ isValid: import_zod5.z.boolean().describe("Whether valid main document TOC was found"),
3308
3319
  confidence: import_zod5.z.number().min(0).max(1).describe("Confidence score between 0 and 1"),
3309
- reason: import_zod5.z.string().describe("Brief explanation for the decision")
3320
+ contentType: import_zod5.z.enum(["pure_toc", "mixed", "resource_only", "invalid"]).describe("Type of content detected"),
3321
+ extractedTocMarkdown: import_zod5.z.string().nullable().describe("Extracted main TOC markdown when mixed; null otherwise"),
3322
+ reason: import_zod5.z.string().describe("Brief explanation in English")
3310
3323
  });
3311
3324
  var TocContentValidator = class extends BaseValidator {
3312
3325
  confidenceThreshold;
@@ -3325,7 +3338,7 @@ var TocContentValidator = class extends BaseValidator {
3325
3338
  * Validate if the markdown content is a table of contents
3326
3339
  *
3327
3340
  * @param markdown - Markdown content to validate
3328
- * @returns Validation result with isToc, confidence, and reason
3341
+ * @returns Validation output with resolved markdown for valid TOC
3329
3342
  */
3330
3343
  async validate(markdown) {
3331
3344
  this.logger.info(
@@ -3336,8 +3349,10 @@ var TocContentValidator = class extends BaseValidator {
3336
3349
  "[TocContentValidator] Empty markdown, returning invalid"
3337
3350
  );
3338
3351
  return {
3339
- isToc: false,
3352
+ isValid: false,
3340
3353
  confidence: 1,
3354
+ contentType: "invalid",
3355
+ validTocMarkdown: null,
3341
3356
  reason: "Empty content"
3342
3357
  };
3343
3358
  }
@@ -3349,52 +3364,106 @@ var TocContentValidator = class extends BaseValidator {
3349
3364
  this.aggregator
3350
3365
  );
3351
3366
  this.logger.info(
3352
- `[TocContentValidator] Result: isToc=${result.isToc}, confidence=${result.confidence}`
3367
+ `[TocContentValidator] Result: isValid=${result.isValid}, contentType=${result.contentType}, confidence=${result.confidence}`
3353
3368
  );
3354
- return result;
3369
+ let validTocMarkdown = null;
3370
+ if (result.isValid && result.confidence >= this.confidenceThreshold) {
3371
+ if (result.contentType === "pure_toc") {
3372
+ validTocMarkdown = markdown;
3373
+ } else if (result.contentType === "mixed" && result.extractedTocMarkdown) {
3374
+ validTocMarkdown = result.extractedTocMarkdown;
3375
+ }
3376
+ }
3377
+ return {
3378
+ isValid: result.isValid,
3379
+ confidence: result.confidence,
3380
+ contentType: result.contentType,
3381
+ validTocMarkdown,
3382
+ reason: result.reason
3383
+ };
3355
3384
  }
3356
3385
  /**
3357
3386
  * Check if validation result passes threshold
3358
3387
  *
3359
- * @param result - Validation result from validate()
3388
+ * @param result - Validation output from validate()
3360
3389
  * @returns true if content is valid TOC with sufficient confidence
3361
3390
  */
3362
3391
  isValid(result) {
3363
- return result.isToc && result.confidence >= this.confidenceThreshold;
3392
+ return result.isValid && result.confidence >= this.confidenceThreshold;
3393
+ }
3394
+ /**
3395
+ * Get the valid TOC markdown from validation result
3396
+ *
3397
+ * @param result - Validation output from validate()
3398
+ * @returns Valid TOC markdown or null if invalid
3399
+ */
3400
+ getValidMarkdown(result) {
3401
+ return result.validTocMarkdown;
3364
3402
  }
3365
3403
  /**
3366
3404
  * Build system prompt for TOC content validation
3367
3405
  */
3368
3406
  buildSystemPrompt() {
3369
- return `You are a document structure analyst. Your task is to determine if the provided content is a Table of Contents (TOC).
3407
+ return `You are a document structure analyst. Your task is to analyze the provided content and classify it into one of four categories.
3370
3408
 
3371
- ## What IS a Table of Contents:
3372
- - A structured list of chapters/sections with corresponding page numbers
3373
- - Contains hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction", etc.)
3374
- - Has page number references for each entry (e.g., "..... 10", "... 5", or just a number at the end)
3375
- - Multiple entries organized by document structure
3376
- - Main document outline listing major chapters and sections
3409
+ ## Content Type Classification:
3377
3410
 
3378
- ## What is NOT a Table of Contents:
3411
+ ### 1. pure_toc
3412
+ The content is ONLY a main document Table of Contents with:
3413
+ - Structured list of chapters/sections with page numbers
3414
+ - Hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction")
3415
+ - Multiple entries (3 or more) organized by document structure
3416
+ - NO resource indices mixed in
3417
+
3418
+ ### 2. mixed
3419
+ The content contains BOTH:
3420
+ - A valid main document TOC (chapters/sections with page numbers)
3421
+ - AND resource indices (photo/table/drawing indices)
3422
+
3423
+ When classifying as "mixed", you MUST extract ONLY the main TOC portion and return it in extractedTocMarkdown.
3424
+
3425
+ ### 3. resource_only
3426
+ The content contains ONLY resource indices such as:
3379
3427
  - Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, Photo Index, List of Figures, List of Photos)
3380
3428
  - Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
3381
3429
  - Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, Drawing Index, List of Drawings)
3382
3430
  - Appendix indices (\uBD80\uB85D \uBAA9\uCC28, Appendix Index)
3383
- - Random body text from the document
3431
+
3432
+ ### 4. invalid
3433
+ The content is none of the above:
3434
+ - Random body text
3384
3435
  - Single entries or incomplete lists (fewer than 3 items)
3385
3436
  - Reference lists or bibliographies
3386
3437
  - Index pages (alphabetical keyword lists)
3438
+ - Unstructured content
3387
3439
 
3388
3440
  ## Response Guidelines:
3389
- - Set isToc to true ONLY if content is clearly a main document TOC
3441
+ - Set isValid to true for "pure_toc" and "mixed" types
3442
+ - Set isValid to false for "resource_only" and "invalid" types
3390
3443
  - Set confidence between 0.0 and 1.0 based on your certainty
3391
- - Provide a brief reason explaining your decision (1-2 sentences)`;
3444
+ - For "mixed" type: extractedTocMarkdown MUST contain only the main TOC entries (preserve original formatting)
3445
+ - For other types: extractedTocMarkdown should be null
3446
+ - IMPORTANT: reason MUST be written in English
3447
+
3448
+ ## Example Scenarios:
3449
+
3450
+ ### Scenario 1: pure_toc
3451
+ Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\uC81C3\uC7A5 \uC870\uC0AC\uACB0\uACFC ..... 15"
3452
+ Output: { isValid: true, contentType: "pure_toc", extractedTocMarkdown: null }
3453
+
3454
+ ### Scenario 2: mixed
3455
+ Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\\n\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
3456
+ Output: { isValid: true, contentType: "mixed", extractedTocMarkdown: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5" }
3457
+
3458
+ ### Scenario 3: resource_only
3459
+ Input: "\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
3460
+ Output: { isValid: false, contentType: "resource_only", extractedTocMarkdown: null }`;
3392
3461
  }
3393
3462
  /**
3394
3463
  * Build user prompt with markdown content
3395
3464
  */
3396
3465
  buildUserPrompt(markdown) {
3397
- return `Determine if the following content is a Table of Contents:
3466
+ return `Analyze the following content and classify it:
3398
3467
 
3399
3468
  ${markdown}`;
3400
3469
  }
@@ -3952,9 +4021,20 @@ var DocumentProcessor = class {
3952
4021
  );
3953
4022
  markdown = null;
3954
4023
  } else {
3955
- this.logger.info(
3956
- `[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
3957
- );
4024
+ const validMarkdown = this.tocContentValidator.getValidMarkdown(validation);
4025
+ if (validMarkdown) {
4026
+ if (validation.contentType === "mixed") {
4027
+ this.logger.info(
4028
+ `[DocumentProcessor] Mixed TOC detected, using extracted main TOC (${validMarkdown.length} chars)`
4029
+ );
4030
+ }
4031
+ markdown = validMarkdown;
4032
+ this.logger.info(
4033
+ `[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
4034
+ );
4035
+ } else {
4036
+ markdown = null;
4037
+ }
3958
4038
  }
3959
4039
  } catch (error) {
3960
4040
  if (error instanceof TocNotFoundError) {
@@ -3970,10 +4050,13 @@ var DocumentProcessor = class {
3970
4050
  const totalPages = Object.keys(doclingDoc.pages).length;
3971
4051
  markdown = await this.visionTocExtractor.extract(totalPages);
3972
4052
  if (!markdown) {
3973
- this.logger.warn(
3974
- "[DocumentProcessor] TOC not found in any method, returning empty"
4053
+ const reason = "Both rule-based search and vision fallback failed to locate TOC";
4054
+ this.logger.error(
4055
+ `[DocumentProcessor] TOC extraction failed: ${reason}`
4056
+ );
4057
+ throw new TocNotFoundError(
4058
+ `Table of contents not found in the document. ${reason}.`
3975
4059
  );
3976
- return [];
3977
4060
  }
3978
4061
  this.logger.info(
3979
4062
  `[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
@@ -3981,6 +4064,11 @@ var DocumentProcessor = class {
3981
4064
  }
3982
4065
  const tocResult = await this.tocExtractor.extract(markdown);
3983
4066
  this.usageAggregator.track(tocResult.usage);
4067
+ if (tocResult.entries.length === 0) {
4068
+ const reason = "TOC area was detected but LLM could not extract any structured entries";
4069
+ this.logger.error(`[DocumentProcessor] TOC extraction failed: ${reason}`);
4070
+ throw new TocNotFoundError(`${reason}.`);
4071
+ }
3984
4072
  this.logger.info(
3985
4073
  `[DocumentProcessor] Extracted ${tocResult.entries.length} top-level TOC entries`
3986
4074
  );
@@ -4220,21 +4308,14 @@ var DocumentProcessor = class {
4220
4308
  * Convert chapters and link resources
4221
4309
  *
4222
4310
  * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
4223
- * Falls back to single "Document" chapter when TOC is empty.
4311
+ * Throws TocNotFoundError if TOC entries are empty (defensive assertion).
4224
4312
  */
4225
4313
  async convertChapters(doclingDoc, tocEntries, pageRangeMap, images, tables, footnotes) {
4226
4314
  this.logger.info("[DocumentProcessor] Converting chapters...");
4227
4315
  if (tocEntries.length === 0) {
4228
- this.logger.info(
4229
- "[DocumentProcessor] No TOC entries, creating fallback chapter"
4230
- );
4231
- return this.createFallbackChapter(
4232
- doclingDoc,
4233
- pageRangeMap,
4234
- images,
4235
- tables,
4236
- footnotes
4237
- );
4316
+ const reason = "Cannot convert chapters without TOC entries";
4317
+ this.logger.error(`[DocumentProcessor] ${reason}`);
4318
+ throw new TocNotFoundError(reason);
4238
4319
  }
4239
4320
  const chapters = this.chapterConverter.convert(
4240
4321
  tocEntries,
@@ -4249,48 +4330,6 @@ var DocumentProcessor = class {
4249
4330
  );
4250
4331
  return chapters;
4251
4332
  }
4252
- /**
4253
- * Create a fallback chapter when TOC is not available
4254
- *
4255
- * Creates a single "Document" chapter containing all text blocks,
4256
- * images, tables, and footnotes from the document.
4257
- */
4258
- createFallbackChapter(doclingDoc, pageRangeMap, images, tables, footnotes) {
4259
- const textBlocks = doclingDoc.texts.filter(
4260
- (item) => item.label !== "footnote" && this.textCleaner.isValidText(item.text)
4261
- ).map((item) => ({
4262
- text: this.textCleaner.normalize(item.text),
4263
- pdfPageNo: item.prov?.[0]?.page_no ?? 1
4264
- }));
4265
- if (textBlocks.length === 0 && images.length === 0 && tables.length === 0 && footnotes.length === 0) {
4266
- this.logger.info(
4267
- "[DocumentProcessor] No content found for fallback chapter"
4268
- );
4269
- return [];
4270
- }
4271
- const firstPdfPage = Math.min(
4272
- ...Object.keys(pageRangeMap).map(Number).filter((n) => !isNaN(n)),
4273
- 1
4274
- );
4275
- const firstPageRange = pageRangeMap[firstPdfPage];
4276
- const pageNo = firstPageRange?.startPageNo ?? 1;
4277
- const fallbackChapter = {
4278
- id: this.idGenerator.generateChapterId(),
4279
- originTitle: "Document",
4280
- title: "Document",
4281
- pageNo,
4282
- level: 1,
4283
- textBlocks,
4284
- imageIds: images.map((img) => img.id),
4285
- tableIds: tables.map((tbl) => tbl.id),
4286
- footnoteIds: footnotes.map((ftn) => ftn.id),
4287
- children: []
4288
- };
4289
- this.logger.info(
4290
- `[DocumentProcessor] Created fallback chapter with ${textBlocks.length} text blocks, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
4291
- );
4292
- return [fallbackChapter];
4293
- }
4294
4333
  };
4295
4334
  // Annotate the CommonJS export names for ESM import in node:
4296
4335
  0 && (module.exports = {