@heripo/document-processor 0.1.3 → 0.1.5

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -267,16 +267,9 @@ declare class DocumentProcessor {
267
267
  * Convert chapters and link resources
268
268
  *
269
269
  * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
270
- * Falls back to single "Document" chapter when TOC is empty.
270
+ * Throws TocNotFoundError if TOC entries are empty (defensive assertion).
271
271
  */
272
272
  private convertChapters;
273
- /**
274
- * Create a fallback chapter when TOC is not available
275
- *
276
- * Creates a single "Document" chapter containing all text blocks,
277
- * images, tables, and footnotes from the document.
278
- */
279
- private createFallbackChapter;
280
273
  }
281
274
 
282
275
  /**
@@ -729,6 +722,27 @@ declare class PageRangeParser extends VisionLLMComponent {
729
722
  protected buildUserPrompt(pageNos: number[]): string;
730
723
  }
731
724
 
725
+ /**
726
+ * Single validation issue detected during TOC validation
727
+ */
728
+ interface TocValidationIssue {
729
+ /**
730
+ * Issue code (V001, V002, etc.)
731
+ */
732
+ code: string;
733
+ /**
734
+ * Human-readable error message
735
+ */
736
+ message: string;
737
+ /**
738
+ * Path to the problematic entry (e.g., "[0].children[2]")
739
+ */
740
+ path: string;
741
+ /**
742
+ * The problematic entry
743
+ */
744
+ entry: TocEntry;
745
+ }
732
746
  /**
733
747
  * TocExtractError
734
748
  *
@@ -775,6 +789,12 @@ interface TocValidationOptions {
775
789
  * Maximum allowed title length (default: 200)
776
790
  */
777
791
  maxTitleLength?: number;
792
+ /**
793
+ * Maximum ratio of the first entry's page to total pages (default: 0.3)
794
+ * Used for V007 completeness check - if the first level-1 entry
795
+ * starts beyond max(50, totalPages * ratio), the TOC may be incomplete.
796
+ */
797
+ maxFirstEntryPageRatio?: number;
778
798
  }
779
799
 
780
800
  /**
@@ -963,7 +983,7 @@ declare class TocFinder {
963
983
  */
964
984
  private isTableTocLike;
965
985
  /**
966
- * Expand TOC area to consecutive pages
986
+ * Expand TOC area to consecutive pages (both backward and forward)
967
987
  */
968
988
  private expandToConsecutivePages;
969
989
  /**
@@ -1023,6 +1043,9 @@ interface TocExtractorOptions extends BaseLLMComponentOptions {
1023
1043
  *
1024
1044
  * Uses high-performance LLM to extract structured TOC from Markdown representation.
1025
1045
  * Extends TextLLMComponent for standardized LLM call handling.
1046
+ *
1047
+ * When validation fails, automatically retries with correction feedback
1048
+ * up to MAX_VALIDATION_RETRIES times before throwing.
1026
1049
  */
1027
1050
  declare class TocExtractor extends TextLLMComponent {
1028
1051
  private readonly validationOptions?;
@@ -1031,21 +1054,34 @@ declare class TocExtractor extends TextLLMComponent {
1031
1054
  /**
1032
1055
  * Extract TOC structure from Markdown
1033
1056
  *
1057
+ * When validation fails, retries with correction feedback up to MAX_VALIDATION_RETRIES times.
1058
+ *
1034
1059
  * @param markdown - Markdown representation of TOC area
1035
- * @returns Object with entries array and token usage information
1060
+ * @param validationOverrides - Optional overrides for validation options (merged with constructor options)
1061
+ * @returns Object with entries array and token usage array (initial extraction + any corrections)
1036
1062
  * @throws {TocParseError} When LLM fails to parse structure
1037
- * @throws {TocValidationError} When validation fails
1063
+ * @throws {TocValidationError} When validation fails after all retries
1038
1064
  */
1039
- extract(markdown: string): Promise<{
1065
+ extract(markdown: string, validationOverrides?: Partial<TocValidationOptions>): Promise<{
1040
1066
  entries: TocEntry[];
1041
- usage: ExtendedTokenUsage;
1067
+ usages: ExtendedTokenUsage[];
1042
1068
  }>;
1043
1069
  /**
1044
- * Validate extracted entries
1070
+ * Validate extracted entries and return error or null
1071
+ *
1072
+ * Unlike validateOrThrow, this returns the error instead of throwing,
1073
+ * allowing the retry loop to handle it.
1074
+ *
1075
+ * @returns TocValidationError if validation fails, null if valid
1076
+ */
1077
+ private tryValidateEntries;
1078
+ /**
1079
+ * Build correction prompt with validation error feedback
1045
1080
  *
1046
- * @throws {TocValidationError} When validation fails
1081
+ * Includes the original markdown, previous extraction result,
1082
+ * validation errors, and guidance for fixing common mistakes.
1047
1083
  */
1048
- private validateEntries;
1084
+ protected buildCorrectionPrompt(markdown: string, previousEntries: TocEntry[], issues: TocValidationIssue[]): string;
1049
1085
  /**
1050
1086
  * Build system prompt for TOC extraction
1051
1087
  */
package/dist/index.d.ts CHANGED
@@ -267,16 +267,9 @@ declare class DocumentProcessor {
267
267
  * Convert chapters and link resources
268
268
  *
269
269
  * Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
270
- * Falls back to single "Document" chapter when TOC is empty.
270
+ * Throws TocNotFoundError if TOC entries are empty (defensive assertion).
271
271
  */
272
272
  private convertChapters;
273
- /**
274
- * Create a fallback chapter when TOC is not available
275
- *
276
- * Creates a single "Document" chapter containing all text blocks,
277
- * images, tables, and footnotes from the document.
278
- */
279
- private createFallbackChapter;
280
273
  }
281
274
 
282
275
  /**
@@ -729,6 +722,27 @@ declare class PageRangeParser extends VisionLLMComponent {
729
722
  protected buildUserPrompt(pageNos: number[]): string;
730
723
  }
731
724
 
725
+ /**
726
+ * Single validation issue detected during TOC validation
727
+ */
728
+ interface TocValidationIssue {
729
+ /**
730
+ * Issue code (V001, V002, etc.)
731
+ */
732
+ code: string;
733
+ /**
734
+ * Human-readable error message
735
+ */
736
+ message: string;
737
+ /**
738
+ * Path to the problematic entry (e.g., "[0].children[2]")
739
+ */
740
+ path: string;
741
+ /**
742
+ * The problematic entry
743
+ */
744
+ entry: TocEntry;
745
+ }
732
746
  /**
733
747
  * TocExtractError
734
748
  *
@@ -775,6 +789,12 @@ interface TocValidationOptions {
775
789
  * Maximum allowed title length (default: 200)
776
790
  */
777
791
  maxTitleLength?: number;
792
+ /**
793
+ * Maximum ratio of the first entry's page to total pages (default: 0.3)
794
+ * Used for V007 completeness check - if the first level-1 entry
795
+ * starts beyond max(50, totalPages * ratio), the TOC may be incomplete.
796
+ */
797
+ maxFirstEntryPageRatio?: number;
778
798
  }
779
799
 
780
800
  /**
@@ -963,7 +983,7 @@ declare class TocFinder {
963
983
  */
964
984
  private isTableTocLike;
965
985
  /**
966
- * Expand TOC area to consecutive pages
986
+ * Expand TOC area to consecutive pages (both backward and forward)
967
987
  */
968
988
  private expandToConsecutivePages;
969
989
  /**
@@ -1023,6 +1043,9 @@ interface TocExtractorOptions extends BaseLLMComponentOptions {
1023
1043
  *
1024
1044
  * Uses high-performance LLM to extract structured TOC from Markdown representation.
1025
1045
  * Extends TextLLMComponent for standardized LLM call handling.
1046
+ *
1047
+ * When validation fails, automatically retries with correction feedback
1048
+ * up to MAX_VALIDATION_RETRIES times before throwing.
1026
1049
  */
1027
1050
  declare class TocExtractor extends TextLLMComponent {
1028
1051
  private readonly validationOptions?;
@@ -1031,21 +1054,34 @@ declare class TocExtractor extends TextLLMComponent {
1031
1054
  /**
1032
1055
  * Extract TOC structure from Markdown
1033
1056
  *
1057
+ * When validation fails, retries with correction feedback up to MAX_VALIDATION_RETRIES times.
1058
+ *
1034
1059
  * @param markdown - Markdown representation of TOC area
1035
- * @returns Object with entries array and token usage information
1060
+ * @param validationOverrides - Optional overrides for validation options (merged with constructor options)
1061
+ * @returns Object with entries array and token usage array (initial extraction + any corrections)
1036
1062
  * @throws {TocParseError} When LLM fails to parse structure
1037
- * @throws {TocValidationError} When validation fails
1063
+ * @throws {TocValidationError} When validation fails after all retries
1038
1064
  */
1039
- extract(markdown: string): Promise<{
1065
+ extract(markdown: string, validationOverrides?: Partial<TocValidationOptions>): Promise<{
1040
1066
  entries: TocEntry[];
1041
- usage: ExtendedTokenUsage;
1067
+ usages: ExtendedTokenUsage[];
1042
1068
  }>;
1043
1069
  /**
1044
- * Validate extracted entries
1070
+ * Validate extracted entries and return error or null
1071
+ *
1072
+ * Unlike validateOrThrow, this returns the error instead of throwing,
1073
+ * allowing the retry loop to handle it.
1074
+ *
1075
+ * @returns TocValidationError if validation fails, null if valid
1076
+ */
1077
+ private tryValidateEntries;
1078
+ /**
1079
+ * Build correction prompt with validation error feedback
1045
1080
  *
1046
- * @throws {TocValidationError} When validation fails
1081
+ * Includes the original markdown, previous extraction result,
1082
+ * validation errors, and guidance for fixing common mistakes.
1047
1083
  */
1048
- private validateEntries;
1084
+ protected buildCorrectionPrompt(markdown: string, previousEntries: TocEntry[], issues: TocValidationIssue[]): string;
1049
1085
  /**
1050
1086
  * Build system prompt for TOC extraction
1051
1087
  */