@heripo/document-processor 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1187,15 +1187,36 @@ declare abstract class BaseValidator<TSchema extends z.ZodType, TResult = z.infe
1187
1187
  }>;
1188
1188
  }
1189
1189
 
1190
+ /**
1191
+ * Content type for TOC validation
1192
+ */
1193
+ type TocContentType = 'pure_toc' | 'mixed' | 'resource_only' | 'invalid';
1190
1194
  /**
1191
1195
  * Schema for TOC content validation response
1192
1196
  */
1193
1197
  declare const TocContentValidationSchema: z.ZodObject<{
1194
- isToc: z.ZodBoolean;
1198
+ isValid: z.ZodBoolean;
1195
1199
  confidence: z.ZodNumber;
1200
+ contentType: z.ZodEnum<{
1201
+ pure_toc: "pure_toc";
1202
+ mixed: "mixed";
1203
+ resource_only: "resource_only";
1204
+ invalid: "invalid";
1205
+ }>;
1206
+ extractedTocMarkdown: z.ZodNullable<z.ZodString>;
1196
1207
  reason: z.ZodString;
1197
1208
  }, z.core.$strip>;
1198
1209
  type TocContentValidationResult = z.infer<typeof TocContentValidationSchema>;
1210
+ /**
1211
+ * Output type for TOC validation with resolved markdown
1212
+ */
1213
+ interface TocValidationOutput {
1214
+ isValid: boolean;
1215
+ confidence: number;
1216
+ contentType: TocContentType;
1217
+ validTocMarkdown: string | null;
1218
+ reason: string;
1219
+ }
1199
1220
  /**
1200
1221
  * Options for TocContentValidator
1201
1222
  */
@@ -1210,6 +1231,7 @@ interface TocContentValidatorOptions extends BaseValidatorOptions {
1210
1231
  *
1211
1232
  * Uses LLM to validate whether extracted markdown content is actually a TOC.
1212
1233
  * This is a semantic validation, not structural validation.
1234
+ * Supports mixed content extraction where main TOC is combined with resource indices.
1213
1235
  */
1214
1236
  declare class TocContentValidator extends BaseValidator<typeof TocContentValidationSchema, TocContentValidationResult> {
1215
1237
  private readonly confidenceThreshold;
@@ -1218,16 +1240,23 @@ declare class TocContentValidator extends BaseValidator<typeof TocContentValidat
1218
1240
  * Validate if the markdown content is a table of contents
1219
1241
  *
1220
1242
  * @param markdown - Markdown content to validate
1221
- * @returns Validation result with isToc, confidence, and reason
1243
+ * @returns Validation output with resolved markdown for valid TOC
1222
1244
  */
1223
- validate(markdown: string): Promise<TocContentValidationResult>;
1245
+ validate(markdown: string): Promise<TocValidationOutput>;
1224
1246
  /**
1225
1247
  * Check if validation result passes threshold
1226
1248
  *
1227
- * @param result - Validation result from validate()
1249
+ * @param result - Validation output from validate()
1228
1250
  * @returns true if content is valid TOC with sufficient confidence
1229
1251
  */
1230
- isValid(result: TocContentValidationResult): boolean;
1252
+ isValid(result: TocValidationOutput): boolean;
1253
+ /**
1254
+ * Get the valid TOC markdown from validation result
1255
+ *
1256
+ * @param result - Validation output from validate()
1257
+ * @returns Valid TOC markdown or null if invalid
1258
+ */
1259
+ getValidMarkdown(result: TocValidationOutput): string | null;
1231
1260
  /**
1232
1261
  * Build system prompt for TOC content validation
1233
1262
  */
package/dist/index.d.ts CHANGED
@@ -1187,15 +1187,36 @@ declare abstract class BaseValidator<TSchema extends z.ZodType, TResult = z.infe
1187
1187
  }>;
1188
1188
  }
1189
1189
 
1190
+ /**
1191
+ * Content type for TOC validation
1192
+ */
1193
+ type TocContentType = 'pure_toc' | 'mixed' | 'resource_only' | 'invalid';
1190
1194
  /**
1191
1195
  * Schema for TOC content validation response
1192
1196
  */
1193
1197
  declare const TocContentValidationSchema: z.ZodObject<{
1194
- isToc: z.ZodBoolean;
1198
+ isValid: z.ZodBoolean;
1195
1199
  confidence: z.ZodNumber;
1200
+ contentType: z.ZodEnum<{
1201
+ pure_toc: "pure_toc";
1202
+ mixed: "mixed";
1203
+ resource_only: "resource_only";
1204
+ invalid: "invalid";
1205
+ }>;
1206
+ extractedTocMarkdown: z.ZodNullable<z.ZodString>;
1196
1207
  reason: z.ZodString;
1197
1208
  }, z.core.$strip>;
1198
1209
  type TocContentValidationResult = z.infer<typeof TocContentValidationSchema>;
1210
+ /**
1211
+ * Output type for TOC validation with resolved markdown
1212
+ */
1213
+ interface TocValidationOutput {
1214
+ isValid: boolean;
1215
+ confidence: number;
1216
+ contentType: TocContentType;
1217
+ validTocMarkdown: string | null;
1218
+ reason: string;
1219
+ }
1199
1220
  /**
1200
1221
  * Options for TocContentValidator
1201
1222
  */
@@ -1210,6 +1231,7 @@ interface TocContentValidatorOptions extends BaseValidatorOptions {
1210
1231
  *
1211
1232
  * Uses LLM to validate whether extracted markdown content is actually a TOC.
1212
1233
  * This is a semantic validation, not structural validation.
1234
+ * Supports mixed content extraction where main TOC is combined with resource indices.
1213
1235
  */
1214
1236
  declare class TocContentValidator extends BaseValidator<typeof TocContentValidationSchema, TocContentValidationResult> {
1215
1237
  private readonly confidenceThreshold;
@@ -1218,16 +1240,23 @@ declare class TocContentValidator extends BaseValidator<typeof TocContentValidat
1218
1240
  * Validate if the markdown content is a table of contents
1219
1241
  *
1220
1242
  * @param markdown - Markdown content to validate
1221
- * @returns Validation result with isToc, confidence, and reason
1243
+ * @returns Validation output with resolved markdown for valid TOC
1222
1244
  */
1223
- validate(markdown: string): Promise<TocContentValidationResult>;
1245
+ validate(markdown: string): Promise<TocValidationOutput>;
1224
1246
  /**
1225
1247
  * Check if validation result passes threshold
1226
1248
  *
1227
- * @param result - Validation result from validate()
1249
+ * @param result - Validation output from validate()
1228
1250
  * @returns true if content is valid TOC with sufficient confidence
1229
1251
  */
1230
- isValid(result: TocContentValidationResult): boolean;
1252
+ isValid(result: TocValidationOutput): boolean;
1253
+ /**
1254
+ * Get the valid TOC markdown from validation result
1255
+ *
1256
+ * @param result - Validation output from validate()
1257
+ * @returns Valid TOC markdown or null if invalid
1258
+ */
1259
+ getValidMarkdown(result: TocValidationOutput): string | null;
1231
1260
  /**
1232
1261
  * Build system prompt for TOC content validation
1233
1262
  */
package/dist/index.js CHANGED
@@ -2157,7 +2157,15 @@ var VisionTocExtractor = class extends VisionLLMComponent {
2157
2157
  */
2158
2158
  async extractFromBatch(startPage, endPage) {
2159
2159
  this.log("info", `Extracting from pages ${startPage}-${endPage}`);
2160
+ this.log(
2161
+ "info",
2162
+ `Preparing images for vision analysis. This can be very slow (10+ minutes, sometimes 30+ minutes) depending on batch size and image resolution.`
2163
+ );
2160
2164
  const imageContents = this.loadPageImages(startPage, endPage);
2165
+ this.log(
2166
+ "info",
2167
+ `Calling vision LLM for TOC extraction (pages ${startPage}-${endPage})`
2168
+ );
2161
2169
  const result = await LLMCaller.callVision({
2162
2170
  schema: VisionTocExtractionSchema,
2163
2171
  messages: [
@@ -2180,6 +2188,10 @@ var VisionTocExtractor = class extends VisionLLMComponent {
2180
2188
  component: "VisionTocExtractor",
2181
2189
  phase: "extraction"
2182
2190
  });
2191
+ this.log(
2192
+ "info",
2193
+ `Vision LLM call completed (pages ${startPage}-${endPage})`
2194
+ );
2183
2195
  this.trackUsage(result.usage);
2184
2196
  return result.output;
2185
2197
  }
@@ -3242,9 +3254,11 @@ var BaseValidator = class extends TextLLMComponent {
3242
3254
  // src/validators/toc-content-validator.ts
3243
3255
  import { z as z5 } from "zod";
3244
3256
  var TocContentValidationSchema = z5.object({
3245
- isToc: z5.boolean().describe("Whether the content is a table of contents"),
3257
+ isValid: z5.boolean().describe("Whether valid main document TOC was found"),
3246
3258
  confidence: z5.number().min(0).max(1).describe("Confidence score between 0 and 1"),
3247
- reason: z5.string().describe("Brief explanation for the decision")
3259
+ contentType: z5.enum(["pure_toc", "mixed", "resource_only", "invalid"]).describe("Type of content detected"),
3260
+ extractedTocMarkdown: z5.string().nullable().describe("Extracted main TOC markdown when mixed; null otherwise"),
3261
+ reason: z5.string().describe("Brief explanation in English")
3248
3262
  });
3249
3263
  var TocContentValidator = class extends BaseValidator {
3250
3264
  confidenceThreshold;
@@ -3263,7 +3277,7 @@ var TocContentValidator = class extends BaseValidator {
3263
3277
  * Validate if the markdown content is a table of contents
3264
3278
  *
3265
3279
  * @param markdown - Markdown content to validate
3266
- * @returns Validation result with isToc, confidence, and reason
3280
+ * @returns Validation output with resolved markdown for valid TOC
3267
3281
  */
3268
3282
  async validate(markdown) {
3269
3283
  this.logger.info(
@@ -3274,8 +3288,10 @@ var TocContentValidator = class extends BaseValidator {
3274
3288
  "[TocContentValidator] Empty markdown, returning invalid"
3275
3289
  );
3276
3290
  return {
3277
- isToc: false,
3291
+ isValid: false,
3278
3292
  confidence: 1,
3293
+ contentType: "invalid",
3294
+ validTocMarkdown: null,
3279
3295
  reason: "Empty content"
3280
3296
  };
3281
3297
  }
@@ -3287,52 +3303,106 @@ var TocContentValidator = class extends BaseValidator {
3287
3303
  this.aggregator
3288
3304
  );
3289
3305
  this.logger.info(
3290
- `[TocContentValidator] Result: isToc=${result.isToc}, confidence=${result.confidence}`
3306
+ `[TocContentValidator] Result: isValid=${result.isValid}, contentType=${result.contentType}, confidence=${result.confidence}`
3291
3307
  );
3292
- return result;
3308
+ let validTocMarkdown = null;
3309
+ if (result.isValid && result.confidence >= this.confidenceThreshold) {
3310
+ if (result.contentType === "pure_toc") {
3311
+ validTocMarkdown = markdown;
3312
+ } else if (result.contentType === "mixed" && result.extractedTocMarkdown) {
3313
+ validTocMarkdown = result.extractedTocMarkdown;
3314
+ }
3315
+ }
3316
+ return {
3317
+ isValid: result.isValid,
3318
+ confidence: result.confidence,
3319
+ contentType: result.contentType,
3320
+ validTocMarkdown,
3321
+ reason: result.reason
3322
+ };
3293
3323
  }
3294
3324
  /**
3295
3325
  * Check if validation result passes threshold
3296
3326
  *
3297
- * @param result - Validation result from validate()
3327
+ * @param result - Validation output from validate()
3298
3328
  * @returns true if content is valid TOC with sufficient confidence
3299
3329
  */
3300
3330
  isValid(result) {
3301
- return result.isToc && result.confidence >= this.confidenceThreshold;
3331
+ return result.isValid && result.confidence >= this.confidenceThreshold;
3332
+ }
3333
+ /**
3334
+ * Get the valid TOC markdown from validation result
3335
+ *
3336
+ * @param result - Validation output from validate()
3337
+ * @returns Valid TOC markdown or null if invalid
3338
+ */
3339
+ getValidMarkdown(result) {
3340
+ return result.validTocMarkdown;
3302
3341
  }
3303
3342
  /**
3304
3343
  * Build system prompt for TOC content validation
3305
3344
  */
3306
3345
  buildSystemPrompt() {
3307
- return `You are a document structure analyst. Your task is to determine if the provided content is a Table of Contents (TOC).
3346
+ return `You are a document structure analyst. Your task is to analyze the provided content and classify it into one of four categories.
3347
+
3348
+ ## Content Type Classification:
3349
+
3350
+ ### 1. pure_toc
3351
+ The content is ONLY a main document Table of Contents with:
3352
+ - Structured list of chapters/sections with page numbers
3353
+ - Hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction")
3354
+ - Multiple entries (3 or more) organized by document structure
3355
+ - NO resource indices mixed in
3356
+
3357
+ ### 2. mixed
3358
+ The content contains BOTH:
3359
+ - A valid main document TOC (chapters/sections with page numbers)
3360
+ - AND resource indices (photo/table/drawing indices)
3308
3361
 
3309
- ## What IS a Table of Contents:
3310
- - A structured list of chapters/sections with corresponding page numbers
3311
- - Contains hierarchical section titles (e.g., "Chapter 1", "\uC81C1\uC7A5", "1.1 Introduction", etc.)
3312
- - Has page number references for each entry (e.g., "..... 10", "... 5", or just a number at the end)
3313
- - Multiple entries organized by document structure
3314
- - Main document outline listing major chapters and sections
3362
+ When classifying as "mixed", you MUST extract ONLY the main TOC portion and return it in extractedTocMarkdown.
3315
3363
 
3316
- ## What is NOT a Table of Contents:
3364
+ ### 3. resource_only
3365
+ The content contains ONLY resource indices such as:
3317
3366
  - Photo/image indices (\uC0AC\uC9C4 \uBAA9\uCC28, \uC0AC\uC9C4\uBAA9\uCC28, Photo Index, List of Figures, List of Photos)
3318
3367
  - Table indices (\uD45C \uBAA9\uCC28, \uD45C\uBAA9\uCC28, Table Index, List of Tables)
3319
3368
  - Drawing/diagram indices (\uB3C4\uBA74 \uBAA9\uCC28, \uB3C4\uBA74\uBAA9\uCC28, Drawing Index, List of Drawings)
3320
3369
  - Appendix indices (\uBD80\uB85D \uBAA9\uCC28, Appendix Index)
3321
- - Random body text from the document
3370
+
3371
+ ### 4. invalid
3372
+ The content is none of the above:
3373
+ - Random body text
3322
3374
  - Single entries or incomplete lists (fewer than 3 items)
3323
3375
  - Reference lists or bibliographies
3324
3376
  - Index pages (alphabetical keyword lists)
3377
+ - Unstructured content
3325
3378
 
3326
3379
  ## Response Guidelines:
3327
- - Set isToc to true ONLY if content is clearly a main document TOC
3380
+ - Set isValid to true for "pure_toc" and "mixed" types
3381
+ - Set isValid to false for "resource_only" and "invalid" types
3328
3382
  - Set confidence between 0.0 and 1.0 based on your certainty
3329
- - Provide a brief reason explaining your decision (1-2 sentences)`;
3383
+ - For "mixed" type: extractedTocMarkdown MUST contain only the main TOC entries (preserve original formatting)
3384
+ - For other types: extractedTocMarkdown should be null
3385
+ - IMPORTANT: reason MUST be written in English
3386
+
3387
+ ## Example Scenarios:
3388
+
3389
+ ### Scenario 1: pure_toc
3390
+ Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\uC81C3\uC7A5 \uC870\uC0AC\uACB0\uACFC ..... 15"
3391
+ Output: { isValid: true, contentType: "pure_toc", extractedTocMarkdown: null }
3392
+
3393
+ ### Scenario 2: mixed
3394
+ Input: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5\\n\\n\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
3395
+ Output: { isValid: true, contentType: "mixed", extractedTocMarkdown: "\uC81C1\uC7A5 \uC11C\uB860 ..... 1\\n\uC81C2\uC7A5 \uC870\uC0AC\uAC1C\uC694 ..... 5" }
3396
+
3397
+ ### Scenario 3: resource_only
3398
+ Input: "\uC0AC\uC9C4\uBAA9\uCC28\\n\uC0AC\uC9C4 1 \uC804\uACBD ..... 50\\n\uC0AC\uC9C4 2 \uC720\uBB3C ..... 51"
3399
+ Output: { isValid: false, contentType: "resource_only", extractedTocMarkdown: null }`;
3330
3400
  }
3331
3401
  /**
3332
3402
  * Build user prompt with markdown content
3333
3403
  */
3334
3404
  buildUserPrompt(markdown) {
3335
- return `Determine if the following content is a Table of Contents:
3405
+ return `Analyze the following content and classify it:
3336
3406
 
3337
3407
  ${markdown}`;
3338
3408
  }
@@ -3890,9 +3960,20 @@ var DocumentProcessor = class {
3890
3960
  );
3891
3961
  markdown = null;
3892
3962
  } else {
3893
- this.logger.info(
3894
- `[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
3895
- );
3963
+ const validMarkdown = this.tocContentValidator.getValidMarkdown(validation);
3964
+ if (validMarkdown) {
3965
+ if (validation.contentType === "mixed") {
3966
+ this.logger.info(
3967
+ `[DocumentProcessor] Mixed TOC detected, using extracted main TOC (${validMarkdown.length} chars)`
3968
+ );
3969
+ }
3970
+ markdown = validMarkdown;
3971
+ this.logger.info(
3972
+ `[DocumentProcessor] TOC validation passed (confidence: ${validation.confidence})`
3973
+ );
3974
+ } else {
3975
+ markdown = null;
3976
+ }
3896
3977
  }
3897
3978
  } catch (error) {
3898
3979
  if (error instanceof TocNotFoundError) {