@heripo/document-processor 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +208 -91
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +52 -16
- package/dist/index.d.ts +52 -16
- package/dist/index.js +208 -91
- package/dist/index.js.map +1 -1
- package/package.json +9 -9
package/dist/index.d.cts
CHANGED
|
@@ -267,16 +267,9 @@ declare class DocumentProcessor {
|
|
|
267
267
|
* Convert chapters and link resources
|
|
268
268
|
*
|
|
269
269
|
* Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
|
|
270
|
-
*
|
|
270
|
+
* Throws TocNotFoundError if TOC entries are empty (defensive assertion).
|
|
271
271
|
*/
|
|
272
272
|
private convertChapters;
|
|
273
|
-
/**
|
|
274
|
-
* Create a fallback chapter when TOC is not available
|
|
275
|
-
*
|
|
276
|
-
* Creates a single "Document" chapter containing all text blocks,
|
|
277
|
-
* images, tables, and footnotes from the document.
|
|
278
|
-
*/
|
|
279
|
-
private createFallbackChapter;
|
|
280
273
|
}
|
|
281
274
|
|
|
282
275
|
/**
|
|
@@ -729,6 +722,27 @@ declare class PageRangeParser extends VisionLLMComponent {
|
|
|
729
722
|
protected buildUserPrompt(pageNos: number[]): string;
|
|
730
723
|
}
|
|
731
724
|
|
|
725
|
+
/**
|
|
726
|
+
* Single validation issue detected during TOC validation
|
|
727
|
+
*/
|
|
728
|
+
interface TocValidationIssue {
|
|
729
|
+
/**
|
|
730
|
+
* Issue code (V001, V002, etc.)
|
|
731
|
+
*/
|
|
732
|
+
code: string;
|
|
733
|
+
/**
|
|
734
|
+
* Human-readable error message
|
|
735
|
+
*/
|
|
736
|
+
message: string;
|
|
737
|
+
/**
|
|
738
|
+
* Path to the problematic entry (e.g., "[0].children[2]")
|
|
739
|
+
*/
|
|
740
|
+
path: string;
|
|
741
|
+
/**
|
|
742
|
+
* The problematic entry
|
|
743
|
+
*/
|
|
744
|
+
entry: TocEntry;
|
|
745
|
+
}
|
|
732
746
|
/**
|
|
733
747
|
* TocExtractError
|
|
734
748
|
*
|
|
@@ -775,6 +789,12 @@ interface TocValidationOptions {
|
|
|
775
789
|
* Maximum allowed title length (default: 200)
|
|
776
790
|
*/
|
|
777
791
|
maxTitleLength?: number;
|
|
792
|
+
/**
|
|
793
|
+
* Maximum ratio of the first entry's page to total pages (default: 0.3)
|
|
794
|
+
* Used for V007 completeness check - if the first level-1 entry
|
|
795
|
+
* starts beyond max(50, totalPages * ratio), the TOC may be incomplete.
|
|
796
|
+
*/
|
|
797
|
+
maxFirstEntryPageRatio?: number;
|
|
778
798
|
}
|
|
779
799
|
|
|
780
800
|
/**
|
|
@@ -963,7 +983,7 @@ declare class TocFinder {
|
|
|
963
983
|
*/
|
|
964
984
|
private isTableTocLike;
|
|
965
985
|
/**
|
|
966
|
-
* Expand TOC area to consecutive pages
|
|
986
|
+
* Expand TOC area to consecutive pages (both backward and forward)
|
|
967
987
|
*/
|
|
968
988
|
private expandToConsecutivePages;
|
|
969
989
|
/**
|
|
@@ -1023,6 +1043,9 @@ interface TocExtractorOptions extends BaseLLMComponentOptions {
|
|
|
1023
1043
|
*
|
|
1024
1044
|
* Uses high-performance LLM to extract structured TOC from Markdown representation.
|
|
1025
1045
|
* Extends TextLLMComponent for standardized LLM call handling.
|
|
1046
|
+
*
|
|
1047
|
+
* When validation fails, automatically retries with correction feedback
|
|
1048
|
+
* up to MAX_VALIDATION_RETRIES times before throwing.
|
|
1026
1049
|
*/
|
|
1027
1050
|
declare class TocExtractor extends TextLLMComponent {
|
|
1028
1051
|
private readonly validationOptions?;
|
|
@@ -1031,21 +1054,34 @@ declare class TocExtractor extends TextLLMComponent {
|
|
|
1031
1054
|
/**
|
|
1032
1055
|
* Extract TOC structure from Markdown
|
|
1033
1056
|
*
|
|
1057
|
+
* When validation fails, retries with correction feedback up to MAX_VALIDATION_RETRIES times.
|
|
1058
|
+
*
|
|
1034
1059
|
* @param markdown - Markdown representation of TOC area
|
|
1035
|
-
* @
|
|
1060
|
+
* @param validationOverrides - Optional overrides for validation options (merged with constructor options)
|
|
1061
|
+
* @returns Object with entries array and token usage array (initial extraction + any corrections)
|
|
1036
1062
|
* @throws {TocParseError} When LLM fails to parse structure
|
|
1037
|
-
* @throws {TocValidationError} When validation fails
|
|
1063
|
+
* @throws {TocValidationError} When validation fails after all retries
|
|
1038
1064
|
*/
|
|
1039
|
-
extract(markdown: string): Promise<{
|
|
1065
|
+
extract(markdown: string, validationOverrides?: Partial<TocValidationOptions>): Promise<{
|
|
1040
1066
|
entries: TocEntry[];
|
|
1041
|
-
|
|
1067
|
+
usages: ExtendedTokenUsage[];
|
|
1042
1068
|
}>;
|
|
1043
1069
|
/**
|
|
1044
|
-
* Validate extracted entries
|
|
1070
|
+
* Validate extracted entries and return error or null
|
|
1071
|
+
*
|
|
1072
|
+
* Unlike validateOrThrow, this returns the error instead of throwing,
|
|
1073
|
+
* allowing the retry loop to handle it.
|
|
1074
|
+
*
|
|
1075
|
+
* @returns TocValidationError if validation fails, null if valid
|
|
1076
|
+
*/
|
|
1077
|
+
private tryValidateEntries;
|
|
1078
|
+
/**
|
|
1079
|
+
* Build correction prompt with validation error feedback
|
|
1045
1080
|
*
|
|
1046
|
-
*
|
|
1081
|
+
* Includes the original markdown, previous extraction result,
|
|
1082
|
+
* validation errors, and guidance for fixing common mistakes.
|
|
1047
1083
|
*/
|
|
1048
|
-
|
|
1084
|
+
protected buildCorrectionPrompt(markdown: string, previousEntries: TocEntry[], issues: TocValidationIssue[]): string;
|
|
1049
1085
|
/**
|
|
1050
1086
|
* Build system prompt for TOC extraction
|
|
1051
1087
|
*/
|
package/dist/index.d.ts
CHANGED
|
@@ -267,16 +267,9 @@ declare class DocumentProcessor {
|
|
|
267
267
|
* Convert chapters and link resources
|
|
268
268
|
*
|
|
269
269
|
* Generates chapters based on TOC and links images/tables/footnotes using ChapterConverter.
|
|
270
|
-
*
|
|
270
|
+
* Throws TocNotFoundError if TOC entries are empty (defensive assertion).
|
|
271
271
|
*/
|
|
272
272
|
private convertChapters;
|
|
273
|
-
/**
|
|
274
|
-
* Create a fallback chapter when TOC is not available
|
|
275
|
-
*
|
|
276
|
-
* Creates a single "Document" chapter containing all text blocks,
|
|
277
|
-
* images, tables, and footnotes from the document.
|
|
278
|
-
*/
|
|
279
|
-
private createFallbackChapter;
|
|
280
273
|
}
|
|
281
274
|
|
|
282
275
|
/**
|
|
@@ -729,6 +722,27 @@ declare class PageRangeParser extends VisionLLMComponent {
|
|
|
729
722
|
protected buildUserPrompt(pageNos: number[]): string;
|
|
730
723
|
}
|
|
731
724
|
|
|
725
|
+
/**
|
|
726
|
+
* Single validation issue detected during TOC validation
|
|
727
|
+
*/
|
|
728
|
+
interface TocValidationIssue {
|
|
729
|
+
/**
|
|
730
|
+
* Issue code (V001, V002, etc.)
|
|
731
|
+
*/
|
|
732
|
+
code: string;
|
|
733
|
+
/**
|
|
734
|
+
* Human-readable error message
|
|
735
|
+
*/
|
|
736
|
+
message: string;
|
|
737
|
+
/**
|
|
738
|
+
* Path to the problematic entry (e.g., "[0].children[2]")
|
|
739
|
+
*/
|
|
740
|
+
path: string;
|
|
741
|
+
/**
|
|
742
|
+
* The problematic entry
|
|
743
|
+
*/
|
|
744
|
+
entry: TocEntry;
|
|
745
|
+
}
|
|
732
746
|
/**
|
|
733
747
|
* TocExtractError
|
|
734
748
|
*
|
|
@@ -775,6 +789,12 @@ interface TocValidationOptions {
|
|
|
775
789
|
* Maximum allowed title length (default: 200)
|
|
776
790
|
*/
|
|
777
791
|
maxTitleLength?: number;
|
|
792
|
+
/**
|
|
793
|
+
* Maximum ratio of the first entry's page to total pages (default: 0.3)
|
|
794
|
+
* Used for V007 completeness check - if the first level-1 entry
|
|
795
|
+
* starts beyond max(50, totalPages * ratio), the TOC may be incomplete.
|
|
796
|
+
*/
|
|
797
|
+
maxFirstEntryPageRatio?: number;
|
|
778
798
|
}
|
|
779
799
|
|
|
780
800
|
/**
|
|
@@ -963,7 +983,7 @@ declare class TocFinder {
|
|
|
963
983
|
*/
|
|
964
984
|
private isTableTocLike;
|
|
965
985
|
/**
|
|
966
|
-
* Expand TOC area to consecutive pages
|
|
986
|
+
* Expand TOC area to consecutive pages (both backward and forward)
|
|
967
987
|
*/
|
|
968
988
|
private expandToConsecutivePages;
|
|
969
989
|
/**
|
|
@@ -1023,6 +1043,9 @@ interface TocExtractorOptions extends BaseLLMComponentOptions {
|
|
|
1023
1043
|
*
|
|
1024
1044
|
* Uses high-performance LLM to extract structured TOC from Markdown representation.
|
|
1025
1045
|
* Extends TextLLMComponent for standardized LLM call handling.
|
|
1046
|
+
*
|
|
1047
|
+
* When validation fails, automatically retries with correction feedback
|
|
1048
|
+
* up to MAX_VALIDATION_RETRIES times before throwing.
|
|
1026
1049
|
*/
|
|
1027
1050
|
declare class TocExtractor extends TextLLMComponent {
|
|
1028
1051
|
private readonly validationOptions?;
|
|
@@ -1031,21 +1054,34 @@ declare class TocExtractor extends TextLLMComponent {
|
|
|
1031
1054
|
/**
|
|
1032
1055
|
* Extract TOC structure from Markdown
|
|
1033
1056
|
*
|
|
1057
|
+
* When validation fails, retries with correction feedback up to MAX_VALIDATION_RETRIES times.
|
|
1058
|
+
*
|
|
1034
1059
|
* @param markdown - Markdown representation of TOC area
|
|
1035
|
-
* @
|
|
1060
|
+
* @param validationOverrides - Optional overrides for validation options (merged with constructor options)
|
|
1061
|
+
* @returns Object with entries array and token usage array (initial extraction + any corrections)
|
|
1036
1062
|
* @throws {TocParseError} When LLM fails to parse structure
|
|
1037
|
-
* @throws {TocValidationError} When validation fails
|
|
1063
|
+
* @throws {TocValidationError} When validation fails after all retries
|
|
1038
1064
|
*/
|
|
1039
|
-
extract(markdown: string): Promise<{
|
|
1065
|
+
extract(markdown: string, validationOverrides?: Partial<TocValidationOptions>): Promise<{
|
|
1040
1066
|
entries: TocEntry[];
|
|
1041
|
-
|
|
1067
|
+
usages: ExtendedTokenUsage[];
|
|
1042
1068
|
}>;
|
|
1043
1069
|
/**
|
|
1044
|
-
* Validate extracted entries
|
|
1070
|
+
* Validate extracted entries and return error or null
|
|
1071
|
+
*
|
|
1072
|
+
* Unlike validateOrThrow, this returns the error instead of throwing,
|
|
1073
|
+
* allowing the retry loop to handle it.
|
|
1074
|
+
*
|
|
1075
|
+
* @returns TocValidationError if validation fails, null if valid
|
|
1076
|
+
*/
|
|
1077
|
+
private tryValidateEntries;
|
|
1078
|
+
/**
|
|
1079
|
+
* Build correction prompt with validation error feedback
|
|
1045
1080
|
*
|
|
1046
|
-
*
|
|
1081
|
+
* Includes the original markdown, previous extraction result,
|
|
1082
|
+
* validation errors, and guidance for fixing common mistakes.
|
|
1047
1083
|
*/
|
|
1048
|
-
|
|
1084
|
+
protected buildCorrectionPrompt(markdown: string, previousEntries: TocEntry[], issues: TocValidationIssue[]): string;
|
|
1049
1085
|
/**
|
|
1050
1086
|
* Build system prompt for TOC extraction
|
|
1051
1087
|
*/
|