@heripo/model 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/index.ts"],"sourcesContent":["export type * from './docling-document';\nexport type * from './hanja-assessment';\nexport type * from './processed-document';\nexport type * from './token-usage-report';\nexport type * from './document-process-result';\n"],"mappings":";;;;;;;;;;;;;;;;AAAA;AAAA;","names":[]}
1
+ {"version":3,"sources":["../src/index.ts"],"sourcesContent":["export type * from './docling-document';\nexport type * from './processed-document';\nexport type * from './token-usage-report';\nexport type * from './document-process-result';\nexport type * from './ocr-strategy';\n"],"mappings":";;;;;;;;;;;;;;;;AAAA;AAAA;","names":[]}
package/dist/index.d.cts CHANGED
@@ -109,43 +109,6 @@ interface DoclingDocument {
109
109
  pages: Record<string, DoclingPage>;
110
110
  }
111
111
 
112
- /**
113
- * Result of Hanja (KCJ) quality assessment
114
- *
115
- * Evaluates OCR quality of Korean-Chinese-Japanese (KCJ/KCJ) characters
116
- * in the document by sampling pages and comparing with Vision LLM.
117
- */
118
- interface HanjaAssessment {
119
- /**
120
- * Whether the document should be re-parsed using VLM pipeline
121
- * due to significant KCJ character corruption
122
- */
123
- needsVlmReparse: boolean;
124
- /**
125
- * Severity of KCJ character corruption
126
- * - 'none': No KCJ characters found or no corruption detected
127
- * - 'minor': Some corruption but still usable
128
- * - 'severe': Significant corruption requiring VLM re-parse
129
- */
130
- severity: 'none' | 'minor' | 'severe';
131
- /**
132
- * Total number of text pages considered as candidates for assessment
133
- */
134
- kcjPageCount: number;
135
- /**
136
- * Number of pages actually sampled for quality assessment
137
- */
138
- sampledPageCount: number;
139
- /**
140
- * Ratio of corrupted characters (0.0 ~ 1.0)
141
- */
142
- corruptedRatio: number;
143
- /**
144
- * Human-readable reason for the assessment result
145
- */
146
- reason: string;
147
- }
148
-
149
112
  /**
150
113
  * Caption information
151
114
  *
@@ -734,4 +697,26 @@ interface DocumentProcessResult {
734
697
  usage: TokenUsageReport;
735
698
  }
736
699
 
737
- export type { Caption, Chapter, ComponentUsageReport, DoclingBBox, DoclingBaseNode, DoclingBody, DoclingDocument, DoclingGroupItem, DoclingOrigin, DoclingPage, DoclingPageImage, DoclingPictureItem, DoclingProv, DoclingReference, DoclingTableCell, DoclingTableData, DoclingTableItem, DoclingTextItem, DocumentProcessResult, HanjaAssessment, ModelUsageDetail, PageRange, PhaseUsageReport, ProcessedDocument, ProcessedFootnote, ProcessedImage, ProcessedTable, ProcessedTableCell, TextBlock, TokenUsageReport, TokenUsageSummary };
700
+ /**
701
+ * Result of the OCR strategy sampling phase.
702
+ * Determines whether to use ocrmac (standard Docling pipeline)
703
+ * or VLM (direct vision language model processing) for a given document.
704
+ */
705
+ interface OcrStrategy {
706
+ /** Selected OCR method */
707
+ method: 'ocrmac' | 'vlm';
708
+ /** OCR language weights for ocrmac (e.g., ['ko-KR', 'en-US'] or ['zh-Hant', 'ko-KR']) */
709
+ ocrLanguages?: string[];
710
+ /** BCP 47 language tags detected during sampling (e.g., ['ko-KR', 'en-US']) */
711
+ detectedLanguages?: string[];
712
+ /** Human-readable explanation of the decision */
713
+ reason: string;
714
+ /** Number of pages that were sampled for the decision */
715
+ sampledPages: number;
716
+ /** Total number of pages in the document */
717
+ totalPages: number;
718
+ /** 1-based page numbers where Korean-Hanja mixed script was detected in text layer */
719
+ koreanHanjaMixPages?: number[];
720
+ }
721
+
722
+ export type { Caption, Chapter, ComponentUsageReport, DoclingBBox, DoclingBaseNode, DoclingBody, DoclingDocument, DoclingGroupItem, DoclingOrigin, DoclingPage, DoclingPageImage, DoclingPictureItem, DoclingProv, DoclingReference, DoclingTableCell, DoclingTableData, DoclingTableItem, DoclingTextItem, DocumentProcessResult, ModelUsageDetail, OcrStrategy, PageRange, PhaseUsageReport, ProcessedDocument, ProcessedFootnote, ProcessedImage, ProcessedTable, ProcessedTableCell, TextBlock, TokenUsageReport, TokenUsageSummary };
package/dist/index.d.ts CHANGED
@@ -109,43 +109,6 @@ interface DoclingDocument {
109
109
  pages: Record<string, DoclingPage>;
110
110
  }
111
111
 
112
- /**
113
- * Result of Hanja (KCJ) quality assessment
114
- *
115
- * Evaluates OCR quality of Korean-Chinese-Japanese (KCJ/KCJ) characters
116
- * in the document by sampling pages and comparing with Vision LLM.
117
- */
118
- interface HanjaAssessment {
119
- /**
120
- * Whether the document should be re-parsed using VLM pipeline
121
- * due to significant KCJ character corruption
122
- */
123
- needsVlmReparse: boolean;
124
- /**
125
- * Severity of KCJ character corruption
126
- * - 'none': No KCJ characters found or no corruption detected
127
- * - 'minor': Some corruption but still usable
128
- * - 'severe': Significant corruption requiring VLM re-parse
129
- */
130
- severity: 'none' | 'minor' | 'severe';
131
- /**
132
- * Total number of text pages considered as candidates for assessment
133
- */
134
- kcjPageCount: number;
135
- /**
136
- * Number of pages actually sampled for quality assessment
137
- */
138
- sampledPageCount: number;
139
- /**
140
- * Ratio of corrupted characters (0.0 ~ 1.0)
141
- */
142
- corruptedRatio: number;
143
- /**
144
- * Human-readable reason for the assessment result
145
- */
146
- reason: string;
147
- }
148
-
149
112
  /**
150
113
  * Caption information
151
114
  *
@@ -734,4 +697,26 @@ interface DocumentProcessResult {
734
697
  usage: TokenUsageReport;
735
698
  }
736
699
 
737
- export type { Caption, Chapter, ComponentUsageReport, DoclingBBox, DoclingBaseNode, DoclingBody, DoclingDocument, DoclingGroupItem, DoclingOrigin, DoclingPage, DoclingPageImage, DoclingPictureItem, DoclingProv, DoclingReference, DoclingTableCell, DoclingTableData, DoclingTableItem, DoclingTextItem, DocumentProcessResult, HanjaAssessment, ModelUsageDetail, PageRange, PhaseUsageReport, ProcessedDocument, ProcessedFootnote, ProcessedImage, ProcessedTable, ProcessedTableCell, TextBlock, TokenUsageReport, TokenUsageSummary };
700
+ /**
701
+ * Result of the OCR strategy sampling phase.
702
+ * Determines whether to use ocrmac (standard Docling pipeline)
703
+ * or VLM (direct vision language model processing) for a given document.
704
+ */
705
+ interface OcrStrategy {
706
+ /** Selected OCR method */
707
+ method: 'ocrmac' | 'vlm';
708
+ /** OCR language weights for ocrmac (e.g., ['ko-KR', 'en-US'] or ['zh-Hant', 'ko-KR']) */
709
+ ocrLanguages?: string[];
710
+ /** BCP 47 language tags detected during sampling (e.g., ['ko-KR', 'en-US']) */
711
+ detectedLanguages?: string[];
712
+ /** Human-readable explanation of the decision */
713
+ reason: string;
714
+ /** Number of pages that were sampled for the decision */
715
+ sampledPages: number;
716
+ /** Total number of pages in the document */
717
+ totalPages: number;
718
+ /** 1-based page numbers where Korean-Hanja mixed script was detected in text layer */
719
+ koreanHanjaMixPages?: number[];
720
+ }
721
+
722
+ export type { Caption, Chapter, ComponentUsageReport, DoclingBBox, DoclingBaseNode, DoclingBody, DoclingDocument, DoclingGroupItem, DoclingOrigin, DoclingPage, DoclingPageImage, DoclingPictureItem, DoclingProv, DoclingReference, DoclingTableCell, DoclingTableData, DoclingTableItem, DoclingTextItem, DocumentProcessResult, ModelUsageDetail, OcrStrategy, PageRange, PhaseUsageReport, ProcessedDocument, ProcessedFootnote, ProcessedImage, ProcessedTable, ProcessedTableCell, TextBlock, TokenUsageReport, TokenUsageSummary };
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "@heripo/model",
3
3
  "private": false,
4
4
  "type": "module",
5
- "version": "0.1.7",
5
+ "version": "0.1.8",
6
6
  "description": "Document models and type definitions for heripo engine",
7
7
  "main": "dist/index.cjs",
8
8
  "module": "dist/index.js",