npm - @heripo/model - Versions diffs - 0.1.6 → 0.1.8 - Mend

@heripo/model 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/dist/index.cjs.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"sources":["../src/index.ts"],"sourcesContent":["export type * from './docling-document';\nexport type * from './hanja-assessment';\nexport type * from './processed-document';\nexport type * from './token-usage-report';\nexport type * from './document-process-result';\n"],"mappings":";;;;;;;;;;;;;;;;AAAA;AAAA;","names":[]}
1	+ {"version":3,"sources":["../src/index.ts"],"sourcesContent":["export type * from './docling-document';\nexport type * from './processed-document';\nexport type * from './token-usage-report';\nexport type * from './document-process-result';\nexport type * from './ocr-strategy';\n"],"mappings":";;;;;;;;;;;;;;;;AAAA;AAAA;","names":[]}

package/dist/index.d.cts CHANGED Viewed

@@ -109,43 +109,6 @@ interface DoclingDocument {
     pages: Record<string, DoclingPage>;
 }
-/**
- * Result of Hanja (KCJ) quality assessment
- *
- * Evaluates OCR quality of Korean-Chinese-Japanese (KCJ/KCJ) characters
- * in the document by sampling pages and comparing with Vision LLM.
- */
-interface HanjaAssessment {
-    /**
-     * Whether the document should be re-parsed using VLM pipeline
-     * due to significant KCJ character corruption
-     */
-    needsVlmReparse: boolean;
-    /**
-     * Severity of KCJ character corruption
-     * - 'none': No KCJ characters found or no corruption detected
-     * - 'minor': Some corruption but still usable
-     * - 'severe': Significant corruption requiring VLM re-parse
-     */
-    severity: 'none' | 'minor' | 'severe';
-    /**
-     * Total number of text pages considered as candidates for assessment
-     */
-    kcjPageCount: number;
-    /**
-     * Number of pages actually sampled for quality assessment
-     */
-    sampledPageCount: number;
-    /**
-     * Ratio of corrupted characters (0.0 ~ 1.0)
-     */
-    corruptedRatio: number;
-    /**
-     * Human-readable reason for the assessment result
-     */
-    reason: string;
-}
 /**
  * Caption information
  *
@@ -734,4 +697,26 @@ interface DocumentProcessResult {
     usage: TokenUsageReport;
 }
-export type { Caption, Chapter, ComponentUsageReport, DoclingBBox, DoclingBaseNode, DoclingBody, DoclingDocument, DoclingGroupItem, DoclingOrigin, DoclingPage, DoclingPageImage, DoclingPictureItem, DoclingProv, DoclingReference, DoclingTableCell, DoclingTableData, DoclingTableItem, DoclingTextItem, DocumentProcessResult, HanjaAssessment, ModelUsageDetail, PageRange, PhaseUsageReport, ProcessedDocument, ProcessedFootnote, ProcessedImage, ProcessedTable, ProcessedTableCell, TextBlock, TokenUsageReport, TokenUsageSummary };
+/**
+ * Result of the OCR strategy sampling phase.
+ * Determines whether to use ocrmac (standard Docling pipeline)
+ * or VLM (direct vision language model processing) for a given document.
+ */
+interface OcrStrategy {
+    /** Selected OCR method */
+    method: 'ocrmac' | 'vlm';
+    /** OCR language weights for ocrmac (e.g., ['ko-KR', 'en-US'] or ['zh-Hant', 'ko-KR']) */
+    ocrLanguages?: string[];
+    /** BCP 47 language tags detected during sampling (e.g., ['ko-KR', 'en-US']) */
+    detectedLanguages?: string[];
+    /** Human-readable explanation of the decision */
+    reason: string;
+    /** Number of pages that were sampled for the decision */
+    sampledPages: number;
+    /** Total number of pages in the document */
+    totalPages: number;
+    /** 1-based page numbers where Korean-Hanja mixed script was detected in text layer */
+    koreanHanjaMixPages?: number[];
+}
+export type { Caption, Chapter, ComponentUsageReport, DoclingBBox, DoclingBaseNode, DoclingBody, DoclingDocument, DoclingGroupItem, DoclingOrigin, DoclingPage, DoclingPageImage, DoclingPictureItem, DoclingProv, DoclingReference, DoclingTableCell, DoclingTableData, DoclingTableItem, DoclingTextItem, DocumentProcessResult, ModelUsageDetail, OcrStrategy, PageRange, PhaseUsageReport, ProcessedDocument, ProcessedFootnote, ProcessedImage, ProcessedTable, ProcessedTableCell, TextBlock, TokenUsageReport, TokenUsageSummary };

package/dist/index.d.ts CHANGED Viewed

@@ -109,43 +109,6 @@ interface DoclingDocument {
     pages: Record<string, DoclingPage>;
 }
-/**
- * Result of Hanja (KCJ) quality assessment
- *
- * Evaluates OCR quality of Korean-Chinese-Japanese (KCJ/KCJ) characters
- * in the document by sampling pages and comparing with Vision LLM.
- */
-interface HanjaAssessment {
-    /**
-     * Whether the document should be re-parsed using VLM pipeline
-     * due to significant KCJ character corruption
-     */
-    needsVlmReparse: boolean;
-    /**
-     * Severity of KCJ character corruption
-     * - 'none': No KCJ characters found or no corruption detected
-     * - 'minor': Some corruption but still usable
-     * - 'severe': Significant corruption requiring VLM re-parse
-     */
-    severity: 'none' | 'minor' | 'severe';
-    /**
-     * Total number of text pages considered as candidates for assessment
-     */
-    kcjPageCount: number;
-    /**
-     * Number of pages actually sampled for quality assessment
-     */
-    sampledPageCount: number;
-    /**
-     * Ratio of corrupted characters (0.0 ~ 1.0)
-     */
-    corruptedRatio: number;
-    /**
-     * Human-readable reason for the assessment result
-     */
-    reason: string;
-}
 /**
  * Caption information
  *
@@ -734,4 +697,26 @@ interface DocumentProcessResult {
     usage: TokenUsageReport;
 }
-export type { Caption, Chapter, ComponentUsageReport, DoclingBBox, DoclingBaseNode, DoclingBody, DoclingDocument, DoclingGroupItem, DoclingOrigin, DoclingPage, DoclingPageImage, DoclingPictureItem, DoclingProv, DoclingReference, DoclingTableCell, DoclingTableData, DoclingTableItem, DoclingTextItem, DocumentProcessResult, HanjaAssessment, ModelUsageDetail, PageRange, PhaseUsageReport, ProcessedDocument, ProcessedFootnote, ProcessedImage, ProcessedTable, ProcessedTableCell, TextBlock, TokenUsageReport, TokenUsageSummary };
+/**
+ * Result of the OCR strategy sampling phase.
+ * Determines whether to use ocrmac (standard Docling pipeline)
+ * or VLM (direct vision language model processing) for a given document.
+ */
+interface OcrStrategy {
+    /** Selected OCR method */
+    method: 'ocrmac' | 'vlm';
+    /** OCR language weights for ocrmac (e.g., ['ko-KR', 'en-US'] or ['zh-Hant', 'ko-KR']) */
+    ocrLanguages?: string[];
+    /** BCP 47 language tags detected during sampling (e.g., ['ko-KR', 'en-US']) */
+    detectedLanguages?: string[];
+    /** Human-readable explanation of the decision */
+    reason: string;
+    /** Number of pages that were sampled for the decision */
+    sampledPages: number;
+    /** Total number of pages in the document */
+    totalPages: number;
+    /** 1-based page numbers where Korean-Hanja mixed script was detected in text layer */
+    koreanHanjaMixPages?: number[];
+}
+export type { Caption, Chapter, ComponentUsageReport, DoclingBBox, DoclingBaseNode, DoclingBody, DoclingDocument, DoclingGroupItem, DoclingOrigin, DoclingPage, DoclingPageImage, DoclingPictureItem, DoclingProv, DoclingReference, DoclingTableCell, DoclingTableData, DoclingTableItem, DoclingTextItem, DocumentProcessResult, ModelUsageDetail, OcrStrategy, PageRange, PhaseUsageReport, ProcessedDocument, ProcessedFootnote, ProcessedImage, ProcessedTable, ProcessedTableCell, TextBlock, TokenUsageReport, TokenUsageSummary };

package/package.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "name": "@heripo/model",
   "private": false,
   "type": "module",
-  "version": "0.1.6",
+  "version": "0.1.8",
   "description": "Document models and type definitions for heripo engine",
   "main": "dist/index.cjs",
   "module": "dist/index.js",