@heripo/model 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +23 -38
- package/dist/index.d.ts +23 -38
- package/package.json +1 -1
package/dist/index.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/index.ts"],"sourcesContent":["export type * from './docling-document';\nexport type * from './
|
|
1
|
+
{"version":3,"sources":["../src/index.ts"],"sourcesContent":["export type * from './docling-document';\nexport type * from './processed-document';\nexport type * from './token-usage-report';\nexport type * from './document-process-result';\nexport type * from './ocr-strategy';\n"],"mappings":";;;;;;;;;;;;;;;;AAAA;AAAA;","names":[]}
|
package/dist/index.d.cts
CHANGED
|
@@ -109,43 +109,6 @@ interface DoclingDocument {
|
|
|
109
109
|
pages: Record<string, DoclingPage>;
|
|
110
110
|
}
|
|
111
111
|
|
|
112
|
-
/**
|
|
113
|
-
* Result of Hanja (KCJ) quality assessment
|
|
114
|
-
*
|
|
115
|
-
* Evaluates OCR quality of Korean-Chinese-Japanese (KCJ/KCJ) characters
|
|
116
|
-
* in the document by sampling pages and comparing with Vision LLM.
|
|
117
|
-
*/
|
|
118
|
-
interface HanjaAssessment {
|
|
119
|
-
/**
|
|
120
|
-
* Whether the document should be re-parsed using VLM pipeline
|
|
121
|
-
* due to significant KCJ character corruption
|
|
122
|
-
*/
|
|
123
|
-
needsVlmReparse: boolean;
|
|
124
|
-
/**
|
|
125
|
-
* Severity of KCJ character corruption
|
|
126
|
-
* - 'none': No KCJ characters found or no corruption detected
|
|
127
|
-
* - 'minor': Some corruption but still usable
|
|
128
|
-
* - 'severe': Significant corruption requiring VLM re-parse
|
|
129
|
-
*/
|
|
130
|
-
severity: 'none' | 'minor' | 'severe';
|
|
131
|
-
/**
|
|
132
|
-
* Total number of text pages considered as candidates for assessment
|
|
133
|
-
*/
|
|
134
|
-
kcjPageCount: number;
|
|
135
|
-
/**
|
|
136
|
-
* Number of pages actually sampled for quality assessment
|
|
137
|
-
*/
|
|
138
|
-
sampledPageCount: number;
|
|
139
|
-
/**
|
|
140
|
-
* Ratio of corrupted characters (0.0 ~ 1.0)
|
|
141
|
-
*/
|
|
142
|
-
corruptedRatio: number;
|
|
143
|
-
/**
|
|
144
|
-
* Human-readable reason for the assessment result
|
|
145
|
-
*/
|
|
146
|
-
reason: string;
|
|
147
|
-
}
|
|
148
|
-
|
|
149
112
|
/**
|
|
150
113
|
* Caption information
|
|
151
114
|
*
|
|
@@ -734,4 +697,26 @@ interface DocumentProcessResult {
|
|
|
734
697
|
usage: TokenUsageReport;
|
|
735
698
|
}
|
|
736
699
|
|
|
737
|
-
|
|
700
|
+
/**
|
|
701
|
+
* Result of the OCR strategy sampling phase.
|
|
702
|
+
* Determines whether to use ocrmac (standard Docling pipeline)
|
|
703
|
+
* or VLM (direct vision language model processing) for a given document.
|
|
704
|
+
*/
|
|
705
|
+
interface OcrStrategy {
|
|
706
|
+
/** Selected OCR method */
|
|
707
|
+
method: 'ocrmac' | 'vlm';
|
|
708
|
+
/** OCR language weights for ocrmac (e.g., ['ko-KR', 'en-US'] or ['zh-Hant', 'ko-KR']) */
|
|
709
|
+
ocrLanguages?: string[];
|
|
710
|
+
/** BCP 47 language tags detected during sampling (e.g., ['ko-KR', 'en-US']) */
|
|
711
|
+
detectedLanguages?: string[];
|
|
712
|
+
/** Human-readable explanation of the decision */
|
|
713
|
+
reason: string;
|
|
714
|
+
/** Number of pages that were sampled for the decision */
|
|
715
|
+
sampledPages: number;
|
|
716
|
+
/** Total number of pages in the document */
|
|
717
|
+
totalPages: number;
|
|
718
|
+
/** 1-based page numbers where Korean-Hanja mixed script was detected in text layer */
|
|
719
|
+
koreanHanjaMixPages?: number[];
|
|
720
|
+
}
|
|
721
|
+
|
|
722
|
+
export type { Caption, Chapter, ComponentUsageReport, DoclingBBox, DoclingBaseNode, DoclingBody, DoclingDocument, DoclingGroupItem, DoclingOrigin, DoclingPage, DoclingPageImage, DoclingPictureItem, DoclingProv, DoclingReference, DoclingTableCell, DoclingTableData, DoclingTableItem, DoclingTextItem, DocumentProcessResult, ModelUsageDetail, OcrStrategy, PageRange, PhaseUsageReport, ProcessedDocument, ProcessedFootnote, ProcessedImage, ProcessedTable, ProcessedTableCell, TextBlock, TokenUsageReport, TokenUsageSummary };
|
package/dist/index.d.ts
CHANGED
|
@@ -109,43 +109,6 @@ interface DoclingDocument {
|
|
|
109
109
|
pages: Record<string, DoclingPage>;
|
|
110
110
|
}
|
|
111
111
|
|
|
112
|
-
/**
|
|
113
|
-
* Result of Hanja (KCJ) quality assessment
|
|
114
|
-
*
|
|
115
|
-
* Evaluates OCR quality of Korean-Chinese-Japanese (KCJ/KCJ) characters
|
|
116
|
-
* in the document by sampling pages and comparing with Vision LLM.
|
|
117
|
-
*/
|
|
118
|
-
interface HanjaAssessment {
|
|
119
|
-
/**
|
|
120
|
-
* Whether the document should be re-parsed using VLM pipeline
|
|
121
|
-
* due to significant KCJ character corruption
|
|
122
|
-
*/
|
|
123
|
-
needsVlmReparse: boolean;
|
|
124
|
-
/**
|
|
125
|
-
* Severity of KCJ character corruption
|
|
126
|
-
* - 'none': No KCJ characters found or no corruption detected
|
|
127
|
-
* - 'minor': Some corruption but still usable
|
|
128
|
-
* - 'severe': Significant corruption requiring VLM re-parse
|
|
129
|
-
*/
|
|
130
|
-
severity: 'none' | 'minor' | 'severe';
|
|
131
|
-
/**
|
|
132
|
-
* Total number of text pages considered as candidates for assessment
|
|
133
|
-
*/
|
|
134
|
-
kcjPageCount: number;
|
|
135
|
-
/**
|
|
136
|
-
* Number of pages actually sampled for quality assessment
|
|
137
|
-
*/
|
|
138
|
-
sampledPageCount: number;
|
|
139
|
-
/**
|
|
140
|
-
* Ratio of corrupted characters (0.0 ~ 1.0)
|
|
141
|
-
*/
|
|
142
|
-
corruptedRatio: number;
|
|
143
|
-
/**
|
|
144
|
-
* Human-readable reason for the assessment result
|
|
145
|
-
*/
|
|
146
|
-
reason: string;
|
|
147
|
-
}
|
|
148
|
-
|
|
149
112
|
/**
|
|
150
113
|
* Caption information
|
|
151
114
|
*
|
|
@@ -734,4 +697,26 @@ interface DocumentProcessResult {
|
|
|
734
697
|
usage: TokenUsageReport;
|
|
735
698
|
}
|
|
736
699
|
|
|
737
|
-
|
|
700
|
+
/**
|
|
701
|
+
* Result of the OCR strategy sampling phase.
|
|
702
|
+
* Determines whether to use ocrmac (standard Docling pipeline)
|
|
703
|
+
* or VLM (direct vision language model processing) for a given document.
|
|
704
|
+
*/
|
|
705
|
+
interface OcrStrategy {
|
|
706
|
+
/** Selected OCR method */
|
|
707
|
+
method: 'ocrmac' | 'vlm';
|
|
708
|
+
/** OCR language weights for ocrmac (e.g., ['ko-KR', 'en-US'] or ['zh-Hant', 'ko-KR']) */
|
|
709
|
+
ocrLanguages?: string[];
|
|
710
|
+
/** BCP 47 language tags detected during sampling (e.g., ['ko-KR', 'en-US']) */
|
|
711
|
+
detectedLanguages?: string[];
|
|
712
|
+
/** Human-readable explanation of the decision */
|
|
713
|
+
reason: string;
|
|
714
|
+
/** Number of pages that were sampled for the decision */
|
|
715
|
+
sampledPages: number;
|
|
716
|
+
/** Total number of pages in the document */
|
|
717
|
+
totalPages: number;
|
|
718
|
+
/** 1-based page numbers where Korean-Hanja mixed script was detected in text layer */
|
|
719
|
+
koreanHanjaMixPages?: number[];
|
|
720
|
+
}
|
|
721
|
+
|
|
722
|
+
export type { Caption, Chapter, ComponentUsageReport, DoclingBBox, DoclingBaseNode, DoclingBody, DoclingDocument, DoclingGroupItem, DoclingOrigin, DoclingPage, DoclingPageImage, DoclingPictureItem, DoclingProv, DoclingReference, DoclingTableCell, DoclingTableData, DoclingTableItem, DoclingTextItem, DocumentProcessResult, ModelUsageDetail, OcrStrategy, PageRange, PhaseUsageReport, ProcessedDocument, ProcessedFootnote, ProcessedImage, ProcessedTable, ProcessedTableCell, TextBlock, TokenUsageReport, TokenUsageSummary };
|
package/package.json
CHANGED