@heripo/document-processor 0.1.2 → 0.1.4

This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -267,16 +267,9 @@ declare class DocumentProcessor {
267
267
  * Convert chapters and link resources
268
268
  *
269
269
  * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
270
- * Falls back to single "Document" chapter when TOC is empty.
270
+ * Throws TocNotFoundError if TOC entries are empty (defensive assertion).
271
271
  */
272
272
  private convertChapters;
273
- /**
274
- * Create a fallback chapter when TOC is not available
275
- *
276
- * Creates a single "Document" chapter containing all text blocks,
277
- * images, tables, and footnotes from the document.
278
- */
279
- private createFallbackChapter;
280
273
  }
281
274
 
282
275
  /**
@@ -1187,15 +1180,36 @@ declare abstract class BaseValidator<TSchema extends z.ZodType, TResult = z.infe
1187
1180
  }>;
1188
1181
  }
1189
1182
 
1183
+ /**
1184
+ * Content type for TOC validation
1185
+ */
1186
+ type TocContentType = 'pure_toc' | 'mixed' | 'resource_only' | 'invalid';
1190
1187
  /**
1191
1188
  * Schema for TOC content validation response
1192
1189
  */
1193
1190
  declare const TocContentValidationSchema: z.ZodObject<{
1194
- isToc: z.ZodBoolean;
1191
+ isValid: z.ZodBoolean;
1195
1192
  confidence: z.ZodNumber;
1193
+ contentType: z.ZodEnum<{
1194
+ pure_toc: "pure_toc";
1195
+ mixed: "mixed";
1196
+ resource_only: "resource_only";
1197
+ invalid: "invalid";
1198
+ }>;
1199
+ extractedTocMarkdown: z.ZodNullable<z.ZodString>;
1196
1200
  reason: z.ZodString;
1197
1201
  }, z.core.$strip>;
1198
1202
  type TocContentValidationResult = z.infer<typeof TocContentValidationSchema>;
1203
+ /**
1204
+ * Output type for TOC validation with resolved markdown
1205
+ */
1206
+ interface TocValidationOutput {
1207
+ isValid: boolean;
1208
+ confidence: number;
1209
+ contentType: TocContentType;
1210
+ validTocMarkdown: string | null;
1211
+ reason: string;
1212
+ }
1199
1213
  /**
1200
1214
  * Options for TocContentValidator
1201
1215
  */
@@ -1210,6 +1224,7 @@ interface TocContentValidatorOptions extends BaseValidatorOptions {
1210
1224
  *
1211
1225
  * Uses LLM to validate whether extracted markdown content is actually a TOC.
1212
1226
  * This is a semantic validation, not structural validation.
1227
+ * Supports mixed content extraction where main TOC is combined with resource indices.
1213
1228
  */
1214
1229
  declare class TocContentValidator extends BaseValidator<typeof TocContentValidationSchema, TocContentValidationResult> {
1215
1230
  private readonly confidenceThreshold;
@@ -1218,16 +1233,23 @@ declare class TocContentValidator extends BaseValidator<typeof TocContentValidat
1218
1233
  * Validate if the markdown content is a table of contents
1219
1234
  *
1220
1235
  * @param markdown - Markdown content to validate
1221
- * @returns Validation result with isToc, confidence, and reason
1236
+ * @returns Validation output with resolved markdown for valid TOC
1222
1237
  */
1223
- validate(markdown: string): Promise<TocContentValidationResult>;
1238
+ validate(markdown: string): Promise<TocValidationOutput>;
1224
1239
  /**
1225
1240
  * Check if validation result passes threshold
1226
1241
  *
1227
- * @param result - Validation result from validate()
1242
+ * @param result - Validation output from validate()
1228
1243
  * @returns true if content is valid TOC with sufficient confidence
1229
1244
  */
1230
- isValid(result: TocContentValidationResult): boolean;
1245
+ isValid(result: TocValidationOutput): boolean;
1246
+ /**
1247
+ * Get the valid TOC markdown from validation result
1248
+ *
1249
+ * @param result - Validation output from validate()
1250
+ * @returns Valid TOC markdown or null if invalid
1251
+ */
1252
+ getValidMarkdown(result: TocValidationOutput): string | null;
1231
1253
  /**
1232
1254
  * Build system prompt for TOC content validation
1233
1255
  */
package/dist/index.d.ts CHANGED
@@ -267,16 +267,9 @@ declare class DocumentProcessor {
267
267
  * Convert chapters and link resources
268
268
  *
269
269
  * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
270
- * Falls back to single "Document" chapter when TOC is empty.
270
+ * Throws TocNotFoundError if TOC entries are empty (defensive assertion).
271
271
  */
272
272
  private convertChapters;
273
- /**
274
- * Create a fallback chapter when TOC is not available
275
- *
276
- * Creates a single "Document" chapter containing all text blocks,
277
- * images, tables, and footnotes from the document.
278
- */
279
- private createFallbackChapter;
280
273
  }
281
274
 
282
275
  /**
@@ -1187,15 +1180,36 @@ declare abstract class BaseValidator<TSchema extends z.ZodType, TResult = z.infe
1187
1180
  }>;
1188
1181
  }
1189
1182
 
1183
+ /**
1184
+ * Content type for TOC validation
1185
+ */
1186
+ type TocContentType = 'pure_toc' | 'mixed' | 'resource_only' | 'invalid';
1190
1187
  /**
1191
1188
  * Schema for TOC content validation response
1192
1189
  */
1193
1190
  declare const TocContentValidationSchema: z.ZodObject<{
1194
- isToc: z.ZodBoolean;
1191
+ isValid: z.ZodBoolean;
1195
1192
  confidence: z.ZodNumber;
1193
+ contentType: z.ZodEnum<{
1194
+ pure_toc: "pure_toc";
1195
+ mixed: "mixed";
1196
+ resource_only: "resource_only";
1197
+ invalid: "invalid";
1198
+ }>;
1199
+ extractedTocMarkdown: z.ZodNullable<z.ZodString>;
1196
1200
  reason: z.ZodString;
1197
1201
  }, z.core.$strip>;
1198
1202
  type TocContentValidationResult = z.infer<typeof TocContentValidationSchema>;
1203
+ /**
1204
+ * Output type for TOC validation with resolved markdown
1205
+ */
1206
+ interface TocValidationOutput {
1207
+ isValid: boolean;
1208
+ confidence: number;
1209
+ contentType: TocContentType;
1210
+ validTocMarkdown: string | null;
1211
+ reason: string;
1212
+ }
1199
1213
  /**
1200
1214
  * Options for TocContentValidator
1201
1215
  */
@@ -1210,6 +1224,7 @@ interface TocContentValidatorOptions extends BaseValidatorOptions {
1210
1224
  *
1211
1225
  * Uses LLM to validate whether extracted markdown content is actually a TOC.
1212
1226
  * This is a semantic validation, not structural validation.
1227
+ * Supports mixed content extraction where main TOC is combined with resource indices.
1213
1228
  */
1214
1229
  declare class TocContentValidator extends BaseValidator<typeof TocContentValidationSchema, TocContentValidationResult> {
1215
1230
  private readonly confidenceThreshold;
@@ -1218,16 +1233,23 @@ declare class TocContentValidator extends BaseValidator<typeof TocContentValidat
1218
1233
  * Validate if the markdown content is a table of contents
1219
1234
  *
1220
1235
  * @param markdown - Markdown content to validate
1221
- * @returns Validation result with isToc, confidence, and reason
1236
+ * @returns Validation output with resolved markdown for valid TOC
1222
1237
  */
1223
- validate(markdown: string): Promise<TocContentValidationResult>;
1238
+ validate(markdown: string): Promise<TocValidationOutput>;
1224
1239
  /**
1225
1240
  * Check if validation result passes threshold
1226
1241
  *
1227
- * @param result - Validation result from validate()
1242
+ * @param result - Validation output from validate()
1228
1243
  * @returns true if content is valid TOC with sufficient confidence
1229
1244
  */
1230
- isValid(result: TocContentValidationResult): boolean;
1245
+ isValid(result: TocValidationOutput): boolean;
1246
+ /**
1247
+ * Get the valid TOC markdown from validation result
1248
+ *
1249
+ * @param result - Validation output from validate()
1250
+ * @returns Valid TOC markdown or null if invalid
1251
+ */
1252
+ getValidMarkdown(result: TocValidationOutput): string | null;
1231
1253
  /**
1232
1254
  * Build system prompt for TOC content validation
1233
1255
  */
package/dist/index.js CHANGED
@@ -1846,11 +1846,10 @@ var TocExtractor = class extends TextLLMComponent {
1846
1846
  async extract(markdown) {
1847
1847
  this.log("info", `Starting TOC extraction (${markdown.length} chars)`);
1848
1848
  if (!markdown.trim()) {
1849
- this.log("info", "Empty markdown, returning empty array");
1850
- return {
1851
- entries: [],
1852
- usage: this.createEmptyUsage("extraction")
1853
- };
1849
+ this.log("error", "Cannot extract TOC from empty markdown content");
1850
+ throw new TocParseError(
1851
+ "TOC extraction failed: provided markdown content is empty"
1852
+ );
1854
1853
  }
1855
1854
  try {
1856
1855
  const result = await this.callTextLLM(
@@ -2157,7 +2156,15 @@ var VisionTocExtractor = class extends VisionLLMComponent {
2157
2156
  */
2158
2157
  async extractFromBatch(startPage, endPage) {
2159
2158
  this.log("info", `Extracting from pages ${startPage}-${endPage}`);
2159
+ this.log(
2160
+ "info",
2161
+ `Preparing images for vision analysis. This can be very slow (10+ minutes, sometimes 30+ minutes) depending on batch size and image resolution.`
2162
+ );
2160
2163
  const imageContents = this.loadPageImages(startPage, endPage);
2164
+ this.log(
2165
+ "info",
2166
+ `Calling vision LLM for TOC extraction (pages ${startPage}-${endPage})`
2167
+ );
2161
2168
  const result = await LLMCaller.callVision({
2162
2169
  schema: VisionTocExtractionSchema,
2163
2170
  messages: [
@@ -2180,6 +2187,10 @@ var VisionTocExtractor = class extends VisionLLMComponent {
2180
2187
  component: "VisionTocExtractor",
2181
2188
  phase: "extraction"
2182
2189
  });
2190
+ this.log(
2191
+ "info",
2192
+ `Vision LLM call completed (pages ${startPage}-${endPage})`
2193
+ );
2183
2194
  this.trackUsage(result.usage);
2184
2195
  return result.output;
2185
2196
  }
@@ -3242,9 +3253,11 @@ var BaseValidator = class extends TextLLMComponent {
3242
3253
  // src/validators/toc-content-validator.ts
3243
3254
  import { z as z5 } from "zod";
3244
3255
  var TocContentValidationSchema = z5.object({
3245
- isToc: z5.boolean().describe("Whether the content is a table of contents"),
3256
+ isValid: z5.boolean().describe("Whether valid main document TOC was found"),
3246
3257
  confidence: z5.number().min(0).max(1).describe("Confidence score between 0 and 1"),
3247
- reason: z5.string().describe("Brief explanation for the decision")
3258
+ contentType: z5.enum(["pure_toc", "mixed", "resource_only", "invalid"]).describe("Type of content detected"),
3259
+ extractedTocMarkdown: z5.string().nullable().describe("Extracted main TOC markdown when mixed; null otherwise"),
3260
+ reason: z5.string().describe("Brief explanation in English")
3248
3261
  });
3249
3262
  var TocContentValidator = class extends BaseValidator {
3250
3263
  confidenceThreshold;
@@ -3263,7 +3276,7 @@ var TocContentValidator = class extends BaseValidator {
3263
3276
  * Validate if the markdown content is a table of contents
3264
3277
  *
3265
3278
  * @param markdown - Markdown content to validate
3266
- * @returns Validation result with isToc, confidence, and reason
3279
+ * @returns Validation output with resolved markdown for valid TOC
3267
3280
  */
3268
3281
  async validate(markdown) {
3269
3282
  this.logger.info(
@@ -3274,8 +3287,10 @@ var TocContentValidator = class extends BaseValidator {
3274
3287
  "[TocContentValidator] Empty markdown, returning invalid"
3275
3288
  );
3276
3289
  return {
3277
- isToc: false,
3290
+ isValid: false,
3278
3291
  confidence: 1,
3292
+ contentType: "invalid",
3293
+ validTocMarkdown: null,
3279
3294
  reason: "Empty content"
3280
3295
  };
3281
3296
  }
@@ -3287,52 +3302,106 @@ var TocContentValidator = class extends BaseValidator {
3287
3302
  this.aggregator
3288
3303
  );
3289
3304
  this.logger.info(
3290
- `[TocContentValidator] Result: isToc=${result.isToc}, confidence=${result.confidence}`
3305
+ `[TocContentValidator] Result: isValid=${result.isValid}, contentType=${result.contentType}, confidence=${result.confidence}`
3291
3306
  );
3292
- return result;
3307
+ let validTocMarkdown = null;
3308
+ if (result.isValid && result.confidence >= this.confidenceThreshold) {
3309
+ if (result.contentType === "pure_toc") {
3310
+ validTocMarkdown = markdown;
3311
+ } else if (result.contentType === "mixed" && result.extractedTocMarkdown) {
3312
+ validTocMarkdown = result.extractedTocMarkdown;
3313
+ }
3314
+ }
3315
+ return {
3316
+ isValid: result.isValid,
3317
+ confidence: result.confidence,
3318
+ contentType: result.contentType,
3319
+ validTocMarkdown,
3320
+ reason: result.reason
3321
+ };
3293
3322
  }
3294
3323
  /**
3295
3324
  * Check if validation result passes threshold
3296
3325
  *
3297
- * @param result - Validation result from validate()
3326
+ * @param result - Validation output from validate()
3298
3327
  * @returns true if content is valid TOC with sufficient confidence
3299
3328
  */
3300
3329
  isValid(result) {
3301
- return result.isToc && result.confidence >= this.confidenceThreshold;
3330
+ return result.isValid && result.confidence >= this.confidenceThreshold;
3331
+ }
3332
+ /**
3333
+ * Get the valid TOC markdown from validation result
3334
+ *
3335
+ * @param result - Validation output from validate()
3336
+ * @returns Valid TOC markdown or null if invalid
3337
+ */
3338
+ getValidMarkdown(result) {
3339
+ return result.validTocMarkdown;
3302
3340
  }
3303
3341
  /**
3304
3342
  * Build system prompt for TOC content validation
3305
3343
  */
3306
3344
  buildSystemPrompt() {
3307
- return `You are a document structure analyst. Your task is to determine if the provided content is a Table of Contents (TOC).
3345
+ return `You are a document structure analyst. Your task is to analyze the provided content and classify it into one of four categories.
3308
3346
 
3309
- ## What IS a Table of Contents:
3310
- - A structured list of chapters/sections with corresponding page numbers
3311
- - Contains hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction", etc.)
3312
- - Has page number references for each entry (e.g., "..... 10", "... 5", or just a number at the end)
3313
- - Multiple entries organized by document structure
3314
- - Main document outline listing major chapters and sections
3347
+ ## Content Type Classification:
3315
3348
 
3316
- ## What is NOT a Table of Contents:
3349
+ ### 1. pure_toc
3350
+ The content is ONLY a main document Table of Contents with:
3351
+ - Structured list of chapters/sections with page numbers
3352
+ - Hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction")
3353
+ - Multiple entries (3 or more) organized by document structure
3354
+ - NO resource indices mixed in
3355
+
3356
+ ### 2. mixed
3357
+ The content contains BOTH:
3358
+ - A valid main document TOC (chapters/sections with page numbers)
3359
+ - AND resource indices (photo/table/drawing indices)
3360
+
3361
+ When classifying as "mixed", you MUST extract ONLY the main TOC portion and return it in extractedTocMarkdown.
3362
+
3363
+ ### 3. resource_only
3364
+ The content contains ONLY resource indices such as:
3317
3365
  - Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, Photo Index, List of Figures, List of Photos)
3318
3366
  - Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
3319
3367
  - Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, Drawing Index, List of Drawings)
3320
3368
  - Appendix indices (\uBD80\uB85D \uBAA9\uCC28, Appendix Index)
3321
- - Random body text from the document
3369
+
3370
+ ### 4. invalid
3371
+ The content is none of the above:
3372
+ - Random body text
3322
3373
  - Single entries or incomplete lists (fewer than 3 items)
3323
3374
  - Reference lists or bibliographies
3324
3375
  - Index pages (alphabetical keyword lists)
3376
+ - Unstructured content
3325
3377
 
3326
3378
  ## Response Guidelines:
3327
- - Set isToc to true ONLY if content is clearly a main document TOC
3379
+ - Set isValid to true for "pure_toc" and "mixed" types
3380
+ - Set isValid to false for "resource_only" and "invalid" types
3328
3381
  - Set confidence between 0.0 and 1.0 based on your certainty
3329
- - Provide a brief reason explaining your decision (1-2 sentences)`;
3382
+ - For "mixed" type: extractedTocMarkdown MUST contain only the main TOC entries (preserve original formatting)
3383
+ - For other types: extractedTocMarkdown should be null
3384
+ - IMPORTANT: reason MUST be written in English
3385
+
3386
+ ## Example Scenarios:
3387
+
3388
+ ### Scenario 1: pure_toc
3389
+ Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\uC81C3\uC7A5 \uC870\uC0AC\uACB0\uACFC ..... 15"
3390
+ Output: { isValid: true, contentType: "pure_toc", extractedTocMarkdown: null }
3391
+
3392
+ ### Scenario 2: mixed
3393
+ Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\\n\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
3394
+ Output: { isValid: true, contentType: "mixed", extractedTocMarkdown: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5" }
3395
+
3396
+ ### Scenario 3: resource_only
3397
+ Input: "\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
3398
+ Output: { isValid: false, contentType: "resource_only", extractedTocMarkdown: null }`;
3330
3399
  }
3331
3400
  /**
3332
3401
  * Build user prompt with markdown content
3333
3402
  */
3334
3403
  buildUserPrompt(markdown) {
3335
- return `Determine if the following content is a Table of Contents:
3404
+ return `Analyze the following content and classify it:
3336
3405
 
3337
3406
  ${markdown}`;
3338
3407
  }
@@ -3890,9 +3959,20 @@ var DocumentProcessor = class {
3890
3959
  );
3891
3960
  markdown = null;
3892
3961
  } else {
3893
- this.logger.info(
3894
- `[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
3895
- );
3962
+ const validMarkdown = this.tocContentValidator.getValidMarkdown(validation);
3963
+ if (validMarkdown) {
3964
+ if (validation.contentType === "mixed") {
3965
+ this.logger.info(
3966
+ `[DocumentProcessor] Mixed TOC detected, using extracted main TOC (${validMarkdown.length} chars)`
3967
+ );
3968
+ }
3969
+ markdown = validMarkdown;
3970
+ this.logger.info(
3971
+ `[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
3972
+ );
3973
+ } else {
3974
+ markdown = null;
3975
+ }
3896
3976
  }
3897
3977
  } catch (error) {
3898
3978
  if (error instanceof TocNotFoundError) {
@@ -3908,10 +3988,13 @@ var DocumentProcessor = class {
3908
3988
  const totalPages = Object.keys(doclingDoc.pages).length;
3909
3989
  markdown = await this.visionTocExtractor.extract(totalPages);
3910
3990
  if (!markdown) {
3911
- this.logger.warn(
3912
- "[DocumentProcessor] TOC not found in any method, returning empty"
3991
+ const reason = "Both rule-based search and vision fallback failed to locate TOC";
3992
+ this.logger.error(
3993
+ `[DocumentProcessor] TOC extraction failed: ${reason}`
3994
+ );
3995
+ throw new TocNotFoundError(
3996
+ `Table of contents not found in the document. ${reason}.`
3913
3997
  );
3914
- return [];
3915
3998
  }
3916
3999
  this.logger.info(
3917
4000
  `[DocumentProcessor] Vision extracted TOC markdown (${markdown.length} chars)`
@@ -3919,6 +4002,11 @@ var DocumentProcessor = class {
3919
4002
  }
3920
4003
  const tocResult = await this.tocExtractor.extract(markdown);
3921
4004
  this.usageAggregator.track(tocResult.usage);
4005
+ if (tocResult.entries.length === 0) {
4006
+ const reason = "TOC area was detected but LLM could not extract any structured entries";
4007
+ this.logger.error(`[DocumentProcessor] TOC extraction failed: ${reason}`);
4008
+ throw new TocNotFoundError(`${reason}.`);
4009
+ }
3922
4010
  this.logger.info(
3923
4011
  `[DocumentProcessor] Extracted ${tocResult.entries.length} top-level TOC entries`
3924
4012
  );
@@ -4158,21 +4246,14 @@ var DocumentProcessor = class {
4158
4246
  * Convert chapters and link resources
4159
4247
  *
4160
4248
  * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
4161
- * Falls back to single "Document" chapter when TOC is empty.
4249
+ * Throws TocNotFoundError if TOC entries are empty (defensive assertion).
4162
4250
  */
4163
4251
  async convertChapters(doclingDoc, tocEntries, pageRangeMap, images, tables, footnotes) {
4164
4252
  this.logger.info("[DocumentProcessor] Converting chapters...");
4165
4253
  if (tocEntries.length === 0) {
4166
- this.logger.info(
4167
- "[DocumentProcessor] No TOC entries, creating fallback chapter"
4168
- );
4169
- return this.createFallbackChapter(
4170
- doclingDoc,
4171
- pageRangeMap,
4172
- images,
4173
- tables,
4174
- footnotes
4175
- );
4254
+ const reason = "Cannot convert chapters without TOC entries";
4255
+ this.logger.error(`[DocumentProcessor] ${reason}`);
4256
+ throw new TocNotFoundError(reason);
4176
4257
  }
4177
4258
  const chapters = this.chapterConverter.convert(
4178
4259
  tocEntries,
@@ -4187,48 +4268,6 @@ var DocumentProcessor = class {
4187
4268
  );
4188
4269
  return chapters;
4189
4270
  }
4190
- /**
4191
- * Create a fallback chapter when TOC is not available
4192
- *
4193
- * Creates a single "Document" chapter containing all text blocks,
4194
- * images, tables, and footnotes from the document.
4195
- */
4196
- createFallbackChapter(doclingDoc, pageRangeMap, images, tables, footnotes) {
4197
- const textBlocks = doclingDoc.texts.filter(
4198
- (item) => item.label !== "footnote" && this.textCleaner.isValidText(item.text)
4199
- ).map((item) => ({
4200
- text: this.textCleaner.normalize(item.text),
4201
- pdfPageNo: item.prov?.[0]?.page_no ?? 1
4202
- }));
4203
- if (textBlocks.length === 0 && images.length === 0 && tables.length === 0 && footnotes.length === 0) {
4204
- this.logger.info(
4205
- "[DocumentProcessor] No content found for fallback chapter"
4206
- );
4207
- return [];
4208
- }
4209
- const firstPdfPage = Math.min(
4210
- ...Object.keys(pageRangeMap).map(Number).filter((n) => !isNaN(n)),
4211
- 1
4212
- );
4213
- const firstPageRange = pageRangeMap[firstPdfPage];
4214
- const pageNo = firstPageRange?.startPageNo ?? 1;
4215
- const fallbackChapter = {
4216
- id: this.idGenerator.generateChapterId(),
4217
- originTitle: "Document",
4218
- title: "Document",
4219
- pageNo,
4220
- level: 1,
4221
- textBlocks,
4222
- imageIds: images.map((img) => img.id),
4223
- tableIds: tables.map((tbl) => tbl.id),
4224
- footnoteIds: footnotes.map((ftn) => ftn.id),
4225
- children: []
4226
- };
4227
- this.logger.info(
4228
- `[DocumentProcessor] Created fallback chapter with ${textBlocks.length} text blocks, ${images.length} images, ${tables.length} tables, ${footnotes.length} footnotes`
4229
- );
4230
- return [fallbackChapter];
4231
- }
4232
4271
  };
4233
4272
  export {
4234
4273
  BaseLLMComponent,