npm - @heripo/model - Versions diffs - 0.1.16 → 0.1.17 - Mend

@heripo/model 0.1.16 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/README.ko.md CHANGED Viewed

@@ -72,10 +72,11 @@ import type { ProcessedDocument } from '@heripo/model';
 interface ProcessedDocument {
   reportId: string; // 리포트 ID
-  pageRangeMap: PageRange[]; // PDF 페이지 → 문서 페이지 매핑
+  pageRangeMap: Record<number, PageRange>; // PDF 페이지 → 문서 페이지 매핑
   chapters: Chapter[]; // 계층적 챕터 구조
   images: ProcessedImage[]; // 추출된 이미지 메타데이터
   tables: ProcessedTable[]; // 추출된 테이블 데이터
+  footnotes: ProcessedFootnote[]; // 추출된 각주
 }
 ```
@@ -89,12 +90,14 @@ import type { Chapter } from '@heripo/model';
 interface Chapter {
   id: string; // 챕터 ID
   title: string; // 챕터 제목
+  originTitle: string; // 원본 제목
   level: number; // 계층 레벨 (1, 2, 3, ...)
-  pageNo?: number; // 시작 페이지 번호
+  pageNo: number; // 시작 페이지 번호
   textBlocks: TextBlock[]; // 텍스트 블록
   imageIds: string[]; // 이미지 ID 참조
   tableIds: string[]; // 테이블 ID 참조
-  children: Chapter[]; // 하위 챕터
+  footnoteIds: string[]; // 각주 ID 참조
+  children?: Chapter[]; // 하위 챕터 (선택)
 }
 ```
@@ -107,7 +110,7 @@ import type { TextBlock } from '@heripo/model';
 interface TextBlock {
   text: string; // 텍스트 내용
-  pageNo?: number; // 페이지 번호
+  pdfPageNo: number; // PDF 페이지 번호
 }
 ```
@@ -121,8 +124,8 @@ import type { ProcessedImage } from '@heripo/model';
 interface ProcessedImage {
   id: string; // 이미지 ID
   caption?: Caption; // 캡션 (선택)
-  pdfPageNo?: number; // PDF 페이지 번호
-  filePath: string; // 이미지 파일 경로
+  pdfPageNo: number; // PDF 페이지 번호
+  path: string; // 이미지 파일 경로
 }
 ```
@@ -136,8 +139,8 @@ import type { ProcessedTable } from '@heripo/model';
 interface ProcessedTable {
   id: string; // 테이블 ID
   caption?: Caption; // 캡션 (선택)
-  pdfPageNo?: number; // PDF 페이지 번호
-  data: ProcessedTableCell[][]; // 2D 그리드 데이터
+  pdfPageNo: number; // PDF 페이지 번호
+  grid: ProcessedTableCell[][]; // 2D 그리드 데이터
   numRows: number; // 행 개수
   numCols: number; // 열 개수
 }
@@ -152,8 +155,8 @@ import type { ProcessedTableCell } from '@heripo/model';
 interface ProcessedTableCell {
   text: string; // 셀 텍스트
-  rowspan: number; // 행 병합
-  colspan: number; // 열 병합
+  rowSpan: number; // 행 병합
+  colSpan: number; // 열 병합
   isHeader: boolean; // 헤더 셀 여부
 }
 ```
@@ -166,7 +169,7 @@ interface ProcessedTableCell {
 import type { Caption } from '@heripo/model';
 interface Caption {
-  num?: number; // 캡션 번호 (예: "그림 1"의 1)
+  num?: string; // 캡션 번호 (예: "그림 1"의 "1")
   fullText: string; // 전체 캡션 텍스트
 }
 ```
@@ -179,11 +182,130 @@ PDF 페이지와 문서 페이지 매핑입니다.
 import type { PageRange } from '@heripo/model';
 interface PageRange {
+  startPageNo: number; // 시작 페이지 번호
+  endPageNo: number; // 끝 페이지 번호
+}
+```
+### ProcessedFootnote
+문서에서 추출된 각주입니다.
+```typescript
+import type { ProcessedFootnote } from '@heripo/model';
+interface ProcessedFootnote {
+  id: string; // 각주 ID
+  text: string; // 각주 텍스트
   pdfPageNo: number; // PDF 페이지 번호
-  pageNo: number; // 문서 논리적 페이지 번호
 }
 ```
+### DocumentProcessResult
+문서 처리 결과로, 처리된 문서와 토큰 사용량 리포트를 포함합니다.
+```typescript
+import type { DocumentProcessResult } from '@heripo/model';
+interface DocumentProcessResult {
+  document: ProcessedDocument; // 처리된 문서
+  usage: TokenUsageReport; // 토큰 사용량 리포트
+}
+```
+### OcrStrategy
+OCR 전략 선택 결과입니다.
+```typescript
+import type { OcrStrategy } from '@heripo/model';
+interface OcrStrategy {
+  method: 'ocrmac' | 'vlm'; // OCR 방법
+  ocrLanguages?: string[]; // OCR 언어
+  detectedLanguages?: Bcp47LanguageTag[]; // 감지된 BCP-47 언어 태그
+  reason: string; // 전략 선택 이유
+  sampledPages: number; // 샘플링된 페이지 수
+  totalPages: number; // 문서 전체 페이지 수
+  koreanHanjaMixPages?: number[]; // 한국어-한자 혼용 페이지
+}
+```
+### 토큰 사용량 타입
+처리 단계별 LLM 토큰 사용량을 추적하기 위한 타입입니다.
+```typescript
+import type {
+  ComponentUsageReport,
+  ModelUsageDetail,
+  PhaseUsageReport,
+  TokenUsageReport,
+  TokenUsageSummary,
+} from '@heripo/model';
+interface TokenUsageReport {
+  components: ComponentUsageReport[]; // 컴포넌트별 사용량
+  total: TokenUsageSummary; // 전체 사용량 요약
+}
+interface ComponentUsageReport {
+  component: string; // 컴포넌트 이름
+  phases: PhaseUsageReport[]; // 단계별 사용량
+  total: TokenUsageSummary; // 컴포넌트 합계
+}
+interface PhaseUsageReport {
+  phase: string; // 단계 이름
+  primary?: ModelUsageDetail; // 기본 모델 사용량
+  fallback?: ModelUsageDetail; // 폴백 모델 사용량
+  total: TokenUsageSummary; // 단계 합계
+}
+interface ModelUsageDetail {
+  modelName: string; // 모델 이름
+  inputTokens: number; // 입력 토큰 수
+  outputTokens: number; // 출력 토큰 수
+  totalTokens: number; // 전체 토큰 수
+}
+interface TokenUsageSummary {
+  inputTokens: number; // 입력 토큰 수
+  outputTokens: number; // 출력 토큰 수
+  totalTokens: number; // 전체 토큰 수
+}
+```
+### BCP-47 언어 태그 유틸리티
+BCP-47 언어 태그를 다루기 위한 유틸리티입니다.
+```typescript
+import {
+  type Bcp47LanguageTag,
+  BCP47_LANGUAGE_TAGS,
+  BCP47_LANGUAGE_TAG_SET,
+  isValidBcp47Tag,
+  normalizeToBcp47,
+} from '@heripo/model';
+// Bcp47LanguageTag - 지원되는 BCP-47 언어 태그의 유니온 타입
+type Bcp47LanguageTag = 'ko-KR' | 'en-US' | 'ja-JP' | 'zh-Hans' /* | ... */;
+// BCP47_LANGUAGE_TAGS - 30개 지원 태그의 상수 배열
+const BCP47_LANGUAGE_TAGS: readonly Bcp47LanguageTag[];
+// BCP47_LANGUAGE_TAG_SET - O(1) 조회를 위한 ReadonlySet
+const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string>;
+// isValidBcp47Tag - 문자열이 유효한 BCP-47 태그인지 확인
+function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag;
+// normalizeToBcp47 - 언어 문자열을 BCP-47 형식으로 정규화
+function normalizeToBcp47(tag: string): Bcp47LanguageTag | null;
+```
 ## 사용법
 ### ProcessedDocument 읽기
@@ -200,7 +322,7 @@ function analyzeDocument(doc: ProcessedDocument) {
     console.log(`  텍스트 블록: ${chapter.textBlocks.length}개`);
     console.log(`  이미지: ${chapter.imageIds.length}개`);
     console.log(`  테이블: ${chapter.tableIds.length}개`);
-    console.log(`  하위 챕터: ${chapter.children.length}개`);
+    console.log(`  하위 챕터: ${chapter.children?.length ?? 0}개`);
   });
   // 이미지 확인
@@ -209,7 +331,7 @@ function analyzeDocument(doc: ProcessedDocument) {
     if (image.caption) {
       console.log(`  캡션: ${image.caption.fullText}`);
     }
-    console.log(`  경로: ${image.filePath}`);
+    console.log(`  경로: ${image.path}`);
   });
   // 테이블 확인
@@ -233,7 +355,7 @@ function traverseChapters(chapter: Chapter, depth: number = 0) {
   console.log(`${indent}- ${chapter.title}`);
   // 재귀적으로 하위 챕터 순회
-  chapter.children.forEach((child) => {
+  chapter.children?.forEach((child) => {
     traverseChapters(child, depth + 1);
   });
 }

package/README.md CHANGED Viewed

@@ -72,10 +72,11 @@ import type { ProcessedDocument } from '@heripo/model';
 interface ProcessedDocument {
   reportId: string; // Report ID
-  pageRangeMap: PageRange[]; // PDF page → document page mapping
+  pageRangeMap: Record<number, PageRange>; // PDF page → document page mapping
   chapters: Chapter[]; // Hierarchical chapter structure
   images: ProcessedImage[]; // Extracted image metadata
   tables: ProcessedTable[]; // Extracted table data
+  footnotes: ProcessedFootnote[]; // Extracted footnotes
 }
 ```
@@ -89,12 +90,14 @@ import type { Chapter } from '@heripo/model';
 interface Chapter {
   id: string; // Chapter ID
   title: string; // Chapter title
+  originTitle: string; // Original title from source
   level: number; // Hierarchy level (1, 2, 3, ...)
-  pageNo?: number; // Start page number
+  pageNo: number; // Start page number
   textBlocks: TextBlock[]; // Text blocks
   imageIds: string[]; // Image ID references
   tableIds: string[]; // Table ID references
-  children: Chapter[]; // Sub-chapters
+  footnoteIds: string[]; // Footnote ID references
+  children?: Chapter[]; // Sub-chapters (optional)
 }
 ```
@@ -107,7 +110,7 @@ import type { TextBlock } from '@heripo/model';
 interface TextBlock {
   text: string; // Text content
-  pageNo?: number; // Page number
+  pdfPageNo: number; // PDF page number
 }
 ```
@@ -121,8 +124,8 @@ import type { ProcessedImage } from '@heripo/model';
 interface ProcessedImage {
   id: string; // Image ID
   caption?: Caption; // Caption (optional)
-  pdfPageNo?: number; // PDF page number
-  filePath: string; // Image file path
+  pdfPageNo: number; // PDF page number
+  path: string; // Image file path
 }
 ```
@@ -136,8 +139,8 @@ import type { ProcessedTable } from '@heripo/model';
 interface ProcessedTable {
   id: string; // Table ID
   caption?: Caption; // Caption (optional)
-  pdfPageNo?: number; // PDF page number
-  data: ProcessedTableCell[][]; // 2D grid data
+  pdfPageNo: number; // PDF page number
+  grid: ProcessedTableCell[][]; // 2D grid data
   numRows: number; // Row count
   numCols: number; // Column count
 }
@@ -152,8 +155,8 @@ import type { ProcessedTableCell } from '@heripo/model';
 interface ProcessedTableCell {
   text: string; // Cell text
-  rowspan: number; // Row span
-  colspan: number; // Column span
+  rowSpan: number; // Row span
+  colSpan: number; // Column span
   isHeader: boolean; // Is header cell
 }
 ```
@@ -166,7 +169,7 @@ Image and table captions.
 import type { Caption } from '@heripo/model';
 interface Caption {
-  num?: number; // Caption number (e.g., 1 in "Figure 1")
+  num?: string; // Caption number (e.g., "1" in "Figure 1")
   fullText: string; // Full caption text
 }
 ```
@@ -179,11 +182,130 @@ PDF page to document page mapping.
 import type { PageRange } from '@heripo/model';
 interface PageRange {
+  startPageNo: number; // Start page number
+  endPageNo: number; // End page number
+}
+```
+### ProcessedFootnote
+Footnote extracted from the document.
+```typescript
+import type { ProcessedFootnote } from '@heripo/model';
+interface ProcessedFootnote {
+  id: string; // Footnote ID
+  text: string; // Footnote text
   pdfPageNo: number; // PDF page number
-  pageNo: number; // Document logical page number
 }
 ```
+### DocumentProcessResult
+Result of document processing, including the processed document and token usage report.
+```typescript
+import type { DocumentProcessResult } from '@heripo/model';
+interface DocumentProcessResult {
+  document: ProcessedDocument; // Processed document
+  usage: TokenUsageReport; // Token usage report
+}
+```
+### OcrStrategy
+OCR strategy selection result.
+```typescript
+import type { OcrStrategy } from '@heripo/model';
+interface OcrStrategy {
+  method: 'ocrmac' | 'vlm'; // OCR method
+  ocrLanguages?: string[]; // OCR languages
+  detectedLanguages?: Bcp47LanguageTag[]; // Detected BCP-47 language tags
+  reason: string; // Reason for strategy selection
+  sampledPages: number; // Number of sampled pages
+  totalPages: number; // Total pages in document
+  koreanHanjaMixPages?: number[]; // Pages with Korean-Hanja mixed script
+}
+```
+### Token Usage Types
+Types for tracking LLM token usage across processing phases.
+```typescript
+import type {
+  ComponentUsageReport,
+  ModelUsageDetail,
+  PhaseUsageReport,
+  TokenUsageReport,
+  TokenUsageSummary,
+} from '@heripo/model';
+interface TokenUsageReport {
+  components: ComponentUsageReport[]; // Usage per component
+  total: TokenUsageSummary; // Total usage summary
+}
+interface ComponentUsageReport {
+  component: string; // Component name
+  phases: PhaseUsageReport[]; // Usage per phase
+  total: TokenUsageSummary; // Component total
+}
+interface PhaseUsageReport {
+  phase: string; // Phase name
+  primary?: ModelUsageDetail; // Primary model usage
+  fallback?: ModelUsageDetail; // Fallback model usage
+  total: TokenUsageSummary; // Phase total
+}
+interface ModelUsageDetail {
+  modelName: string; // Model name
+  inputTokens: number; // Input token count
+  outputTokens: number; // Output token count
+  totalTokens: number; // Total token count
+}
+interface TokenUsageSummary {
+  inputTokens: number; // Input token count
+  outputTokens: number; // Output token count
+  totalTokens: number; // Total token count
+}
+```
+### BCP-47 Language Tag Utilities
+Utilities for working with BCP-47 language tags.
+```typescript
+import {
+  type Bcp47LanguageTag,
+  BCP47_LANGUAGE_TAGS,
+  BCP47_LANGUAGE_TAG_SET,
+  isValidBcp47Tag,
+  normalizeToBcp47,
+} from '@heripo/model';
+// Bcp47LanguageTag - Union type of supported BCP-47 language tags
+type Bcp47LanguageTag = 'ko-KR' | 'en-US' | 'ja-JP' | 'zh-Hans' /* | ... */;
+// BCP47_LANGUAGE_TAGS - Const array of 30 supported tags
+const BCP47_LANGUAGE_TAGS: readonly Bcp47LanguageTag[];
+// BCP47_LANGUAGE_TAG_SET - ReadonlySet for O(1) lookup
+const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string>;
+// isValidBcp47Tag - Check if a string is a valid BCP-47 tag
+function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag;
+// normalizeToBcp47 - Normalize a language string to BCP-47 format
+function normalizeToBcp47(tag: string): Bcp47LanguageTag | null;
+```
 ## Usage
 ### Reading ProcessedDocument
@@ -200,7 +322,7 @@ function analyzeDocument(doc: ProcessedDocument) {
     console.log(`  Text blocks: ${chapter.textBlocks.length}`);
     console.log(`  Images: ${chapter.imageIds.length}`);
     console.log(`  Tables: ${chapter.tableIds.length}`);
-    console.log(`  Sub-chapters: ${chapter.children.length}`);
+    console.log(`  Sub-chapters: ${chapter.children?.length ?? 0}`);
   });
   // Check images
@@ -209,7 +331,7 @@ function analyzeDocument(doc: ProcessedDocument) {
     if (image.caption) {
       console.log(`  Caption: ${image.caption.fullText}`);
     }
-    console.log(`  Path: ${image.filePath}`);
+    console.log(`  Path: ${image.path}`);
   });
   // Check tables
@@ -233,7 +355,7 @@ function traverseChapters(chapter: Chapter, depth: number = 0) {
   console.log(`${indent}- ${chapter.title}`);
   // Recursively traverse sub-chapters
-  chapter.children.forEach((child) => {
+  chapter.children?.forEach((child) => {
     traverseChapters(child, depth + 1);
   });
 }

package/dist/index.cjs CHANGED Viewed

@@ -22,12 +22,15 @@ var index_exports = {};
 __export(index_exports, {
   BCP47_LANGUAGE_TAGS: () => BCP47_LANGUAGE_TAGS,
   BCP47_LANGUAGE_TAG_SET: () => BCP47_LANGUAGE_TAG_SET,
+  LANGUAGE_DISPLAY_NAMES: () => LANGUAGE_DISPLAY_NAMES,
+  buildLanguageDescription: () => buildLanguageDescription,
+  getLanguageDisplayName: () => getLanguageDisplayName,
   isValidBcp47Tag: () => isValidBcp47Tag,
   normalizeToBcp47: () => normalizeToBcp47
 });
 module.exports = __toCommonJS(index_exports);
-// src/bcp47-language-tag.ts
+// src/language/bcp47-language-tag.ts
 var BCP47_LANGUAGE_TAGS = [
   "ar-SA",
   "ars-SA",
@@ -103,10 +106,38 @@ function normalizeToBcp47(tag) {
   }
   return null;
 }
+// src/language/language-display.ts
+var LANGUAGE_DISPLAY_NAMES = {
+  ko: "Korean (\uD55C\uAD6D\uC5B4)",
+  ja: "Japanese (\u65E5\u672C\u8A9E)",
+  zh: "Chinese (\u4E2D\u6587)",
+  en: "English",
+  fr: "French (Fran\xE7ais)",
+  de: "German (Deutsch)",
+  es: "Spanish (Espa\xF1ol)",
+  pt: "Portuguese (Portugu\xEAs)",
+  ru: "Russian (\u0420\u0443\u0441\u0441\u043A\u0438\u0439)",
+  uk: "Ukrainian (\u0423\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u0430)",
+  it: "Italian (Italiano)"
+};
+function getLanguageDisplayName(code) {
+  if (!code) return "unknown";
+  const baseCode = code.split("-")[0];
+  return LANGUAGE_DISPLAY_NAMES[baseCode] ?? code;
+}
+function buildLanguageDescription(documentLanguages) {
+  const primaryName = getLanguageDisplayName(documentLanguages[0]);
+  const otherNames = documentLanguages.slice(1).map((code) => getLanguageDisplayName(code));
+  return otherNames.length > 0 ? `primarily written in ${primaryName}, with ${otherNames.join(", ")} also present` : `written in ${primaryName}`;
+}
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {
   BCP47_LANGUAGE_TAGS,
   BCP47_LANGUAGE_TAG_SET,
+  LANGUAGE_DISPLAY_NAMES,
+  buildLanguageDescription,
+  getLanguageDisplayName,
   isValidBcp47Tag,
   normalizeToBcp47
 });

package/dist/index.cjs.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"sources":["../src/index.ts","../src/bcp47-language-tag.ts"],"sourcesContent":["~~export~~ type * from './bcp47-language-tag';\nexport {\n BCP47_LANGUAGE_TAGS,\n BCP47_LANGUAGE_TAG_SET,\n isValidBcp47Tag,\n normalizeToBcp47,\n} from './bcp47-language-tag';\nexport type * from './docling-document';\nexport type * from './~~processed~~-~~document~~';\nexport type * from './~~token~~-~~usage-report~~';\nexport type * from './document~~-process-result~~';\nexport type * from './~~ocr~~-~~strategy~~';\n","/*\n Language tags supported by the ocrmac OCR engine (via docling-serve).\n * These are the only tags that can be passed to the Docling pipeline without\n * triggering \"Invalid language preference\" errors.\n /\nexport const BCP47_LANGUAGE_TAGS = [\n 'ar-SA',\n 'ars-SA',\n 'cs-CZ',\n 'da-DK',\n 'de-DE',\n 'en-US',\n 'es-ES',\n 'fr-FR',\n 'id-ID',\n 'it-IT',\n 'ja-JP',\n 'ko-KR',\n 'ms-MY',\n 'nb-NO',\n 'nl-NL',\n 'nn-NO',\n 'no-NO',\n 'pl-PL',\n 'pt-BR',\n 'ro-RO',\n 'ru-RU',\n 'sv-SE',\n 'th-TH',\n 'tr-TR',\n 'uk-UA',\n 'vi-VT',\n 'yue-Hans',\n 'yue-Hant',\n 'zh-Hans',\n 'zh-Hant',\n] as const;\n\n/* Union type of all supported BCP 47 language tags /\nexport type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];\n\n/* Set for O(1) lookup of valid BCP 47 tags /\nexport const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string> = new Set(\n BCP47_LANGUAGE_TAGS,\n);\n\n/* Check whether a string is a valid BCP 47 language tag /\nexport function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag {\n return BCP47_LANGUAGE_TAG_SET.has(tag);\n}\n\n/\n Maps bare language codes to their default BCP 47 tag.\n * Used when VLM returns only a language code without a region subtag.\n /\nconst DEFAULT_REGION_MAP: Record<string, Bcp47LanguageTag> = {\n ar: 'ar-SA',\n cs: 'cs-CZ',\n da: 'da-DK',\n de: 'de-DE',\n en: 'en-US',\n es: 'es-ES',\n fr: 'fr-FR',\n id: 'id-ID',\n it: 'it-IT',\n ja: 'ja-JP',\n ko: 'ko-KR',\n ms: 'ms-MY',\n nl: 'nl-NL',\n no: 'no-NO',\n pl: 
'pl-PL',\n pt: 'pt-BR',\n ro: 'ro-RO',\n ru: 'ru-RU',\n sv: 'sv-SE',\n th: 'th-TH',\n tr: 'tr-TR',\n uk: 'uk-UA',\n vi: 'vi-VT',\n zh: 'zh-Hans',\n};\n\n/\n Normalize a language string to a valid BCP 47 tag.\n \n - If the input is already a valid full tag (e.g. \"en-US\"), return it as-is.\n * - If it is a bare language code (e.g. \"en\", \"ko\"), map it to the default region.\n * - Otherwise return null (e.g. \"und\", \"unknown\", empty string).\n */\nexport function normalizeToBcp47(tag: string): Bcp47LanguageTag \| null {\n if (isValidBcp47Tag(tag)) {\n return tag;\n }\n\n const lower = tag.toLowerCase();\n const mapped = DEFAULT_REGION_MAP[lower];\n if (mapped) {\n return mapped;\n }\n\n return null;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACKO,IAAM,sBAAsB;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAMO,IAAM,yBAA8C,IAAI;AAAA,EAC7D;AACF;AAGO,SAAS,gBAAgB,KAAsC;AACpE,SAAO,uBAAuB,IAAI,GAAG;AACvC;AAMA,IAAM,qBAAuD;AAAA,EAC3D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AASO,SAAS,iBAAiB,KAAsC;AACrE,MAAI,gBAAgB,GAAG,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,IAAI,YAAY;AAC9B,QAAM,SAAS,mBAAmB,KAAK;AACvC,MAAI,QAAQ;AACV,WAAO;AAAA,EACT;AAEA,SAAO;AACT;","names":[]}
1	+ {"version":3,"sources":["../src/index.ts","../src/language/bcp47-language-tag.ts","../src/language/language-display.ts"],"sourcesContent":["// Language utilities\nexport type * from './language/bcp47-language-tag';\nexport {\n BCP47_LANGUAGE_TAGS,\n BCP47_LANGUAGE_TAG_SET,\n isValidBcp47Tag,\n normalizeToBcp47,\n} from './language/bcp47-language-tag';\nexport {\n LANGUAGE_DISPLAY_NAMES,\n buildLanguageDescription,\n getLanguageDisplayName,\n} from './language/language-display';\n\n// Type definitions\nexport type * from './types/docling-document';\nexport type * from './types/document-process-result';\nexport type * from './types/ocr-strategy';\nexport type * from './types/processed-document';\nexport type * from './types/token-usage-report';\n","/*\n Language tags supported by the ocrmac OCR engine (via docling-serve).\n * These are the only tags that can be passed to the Docling pipeline without\n * triggering \"Invalid language preference\" errors.\n /\nexport const BCP47_LANGUAGE_TAGS = [\n 'ar-SA',\n 'ars-SA',\n 'cs-CZ',\n 'da-DK',\n 'de-DE',\n 'en-US',\n 'es-ES',\n 'fr-FR',\n 'id-ID',\n 'it-IT',\n 'ja-JP',\n 'ko-KR',\n 'ms-MY',\n 'nb-NO',\n 'nl-NL',\n 'nn-NO',\n 'no-NO',\n 'pl-PL',\n 'pt-BR',\n 'ro-RO',\n 'ru-RU',\n 'sv-SE',\n 'th-TH',\n 'tr-TR',\n 'uk-UA',\n 'vi-VT',\n 'yue-Hans',\n 'yue-Hant',\n 'zh-Hans',\n 'zh-Hant',\n] as const;\n\n/* Union type of all supported BCP 47 language tags /\nexport type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];\n\n/* Set for O(1) lookup of valid BCP 47 tags /\nexport const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string> = new Set(\n BCP47_LANGUAGE_TAGS,\n);\n\n/* Check whether a string is a valid BCP 47 language tag /\nexport function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag {\n return BCP47_LANGUAGE_TAG_SET.has(tag);\n}\n\n/\n Maps bare language codes to their default BCP 47 tag.\n * Used when VLM returns only a language code without a region subtag.\n /\nconst DEFAULT_REGION_MAP: Record<string, 
Bcp47LanguageTag> = {\n ar: 'ar-SA',\n cs: 'cs-CZ',\n da: 'da-DK',\n de: 'de-DE',\n en: 'en-US',\n es: 'es-ES',\n fr: 'fr-FR',\n id: 'id-ID',\n it: 'it-IT',\n ja: 'ja-JP',\n ko: 'ko-KR',\n ms: 'ms-MY',\n nl: 'nl-NL',\n no: 'no-NO',\n pl: 'pl-PL',\n pt: 'pt-BR',\n ro: 'ro-RO',\n ru: 'ru-RU',\n sv: 'sv-SE',\n th: 'th-TH',\n tr: 'tr-TR',\n uk: 'uk-UA',\n vi: 'vi-VT',\n zh: 'zh-Hans',\n};\n\n/\n Normalize a language string to a valid BCP 47 tag.\n \n - If the input is already a valid full tag (e.g. \"en-US\"), return it as-is.\n * - If it is a bare language code (e.g. \"en\", \"ko\"), map it to the default region.\n * - Otherwise return null (e.g. \"und\", \"unknown\", empty string).\n /\nexport function normalizeToBcp47(tag: string): Bcp47LanguageTag \| null {\n if (isValidBcp47Tag(tag)) {\n return tag;\n }\n\n const lower = tag.toLowerCase();\n const mapped = DEFAULT_REGION_MAP[lower];\n if (mapped) {\n return mapped;\n }\n\n return null;\n}\n","/* Language display names for prompt context (keyed by ISO 639-1 base language code) /\nexport const LANGUAGE_DISPLAY_NAMES: Record<string, string> = {\n ko: 'Korean (한국어)',\n ja: 'Japanese (日本語)',\n zh: 'Chinese (中文)',\n en: 'English',\n fr: 'French (Français)',\n de: 'German (Deutsch)',\n es: 'Spanish (Español)',\n pt: 'Portuguese (Português)',\n ru: 'Russian (Русский)',\n uk: 'Ukrainian (Українська)',\n it: 'Italian (Italiano)',\n};\n\n/\n Get human-readable display name for a BCP 47 or ISO 639-1 language code.\n /\nexport function getLanguageDisplayName(code?: string): string {\n if (!code) return 'unknown';\n const baseCode = code.split('-')[0];\n return LANGUAGE_DISPLAY_NAMES[baseCode] ?? code;\n}\n\n/\n Build language description string from document languages.\n * @returns e.g. 
\"primarily written in Korean (한국어), with English also present\"\n */\nexport function buildLanguageDescription(documentLanguages: string[]): string {\n const primaryName = getLanguageDisplayName(documentLanguages[0]);\n const otherNames = documentLanguages\n .slice(1)\n .map((code) => getLanguageDisplayName(code));\n return otherNames.length > 0\n ? `primarily written in ${primaryName}, with ${otherNames.join(', ')} also present`\n : `written in ${primaryName}`;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACKO,IAAM,sBAAsB;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAMO,IAAM,yBAA8C,IAAI;AAAA,EAC7D;AACF;AAGO,SAAS,gBAAgB,KAAsC;AACpE,SAAO,uBAAuB,IAAI,GAAG;AACvC;AAMA,IAAM,qBAAuD;AAAA,EAC3D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AASO,SAAS,iBAAiB,KAAsC;AACrE,MAAI,gBAAgB,GAAG,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,IAAI,YAAY;AAC9B,QAAM,SAAS,mBAAmB,KAAK;AACvC,MAAI,QAAQ;AACV,WAAO;AAAA,EACT;AAEA,SAAO;AACT;;;ACpGO,IAAM,yBAAiD;AAAA,EAC5D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AAKO,SAAS,uBAAuB,MAAuB;AAC5D,MAAI,CAAC,KAAM,QAAO;AAClB,QAAM,WAAW,KAAK,MAAM,GAAG,EAAE,CAAC;AAClC,SAAO,uBAAuB,QAAQ,KAAK;AAC7C;AAMO,SAAS,yBAAyB,mBAAqC;AAC5E,QAAM,cAAc,uBAAuB,kBAAkB,CAAC,CAAC;AAC/D,QAAM,aAAa,kBAChB,MAAM,CAAC,EACP,IAAI,CAAC,SAAS,uBAAuB,IAAI,CAAC;AAC7C,SAAO
,WAAW,SAAS,IACvB,wBAAwB,WAAW,UAAU,WAAW,KAAK,IAAI,CAAC,kBAClE,cAAc,WAAW;AAC/B;","names":[]}

package/dist/index.d.cts CHANGED Viewed

@@ -19,6 +19,18 @@ declare function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag;
  */
 declare function normalizeToBcp47(tag: string): Bcp47LanguageTag | null;
+/** Language display names for prompt context (keyed by ISO 639-1 base language code) */
+declare const LANGUAGE_DISPLAY_NAMES: Record<string, string>;
+/**
+ * Get human-readable display name for a BCP 47 or ISO 639-1 language code.
+ */
+declare function getLanguageDisplayName(code?: string): string;
+/**
+ * Build language description string from document languages.
+ * @returns e.g. "primarily written in Korean (한국어), with English also present"
+ */
+declare function buildLanguageDescription(documentLanguages: string[]): string;
 interface DoclingReference {
     $ref: string;
 }
@@ -740,4 +752,4 @@ interface OcrStrategy {
     koreanHanjaMixPages?: number[];
 }
-export { BCP47_LANGUAGE_TAGS, BCP47_LANGUAGE_TAG_SET, type Bcp47LanguageTag, type Caption, type Chapter, type ComponentUsageReport, type DoclingBBox, type DoclingBaseNode, type DoclingBody, type DoclingDocument, type DoclingGroupItem, type DoclingOrigin, type DoclingPage, type DoclingPageImage, type DoclingPictureItem, type DoclingProv, type DoclingReference, type DoclingTableCell, type DoclingTableData, type DoclingTableItem, type DoclingTextItem, type DocumentProcessResult, type ModelUsageDetail, type OcrStrategy, type PageRange, type PhaseUsageReport, type ProcessedDocument, type ProcessedFootnote, type ProcessedImage, type ProcessedTable, type ProcessedTableCell, type TextBlock, type TokenUsageReport, type TokenUsageSummary, isValidBcp47Tag, normalizeToBcp47 };
+export { BCP47_LANGUAGE_TAGS, BCP47_LANGUAGE_TAG_SET, type Bcp47LanguageTag, type Caption, type Chapter, type ComponentUsageReport, type DoclingBBox, type DoclingBaseNode, type DoclingBody, type DoclingDocument, type DoclingGroupItem, type DoclingOrigin, type DoclingPage, type DoclingPageImage, type DoclingPictureItem, type DoclingProv, type DoclingReference, type DoclingTableCell, type DoclingTableData, type DoclingTableItem, type DoclingTextItem, type DocumentProcessResult, LANGUAGE_DISPLAY_NAMES, type ModelUsageDetail, type OcrStrategy, type PageRange, type PhaseUsageReport, type ProcessedDocument, type ProcessedFootnote, type ProcessedImage, type ProcessedTable, type ProcessedTableCell, type TextBlock, type TokenUsageReport, type TokenUsageSummary, buildLanguageDescription, getLanguageDisplayName, isValidBcp47Tag, normalizeToBcp47 };

package/dist/index.d.ts CHANGED Viewed

@@ -19,6 +19,18 @@ declare function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag;
  */
 declare function normalizeToBcp47(tag: string): Bcp47LanguageTag | null;
+/** Language display names for prompt context (keyed by ISO 639-1 base language code) */
+declare const LANGUAGE_DISPLAY_NAMES: Record<string, string>;
+/**
+ * Get human-readable display name for a BCP 47 or ISO 639-1 language code.
+ */
+declare function getLanguageDisplayName(code?: string): string;
+/**
+ * Build language description string from document languages.
+ * @returns e.g. "primarily written in Korean (한국어), with English also present"
+ */
+declare function buildLanguageDescription(documentLanguages: string[]): string;
 interface DoclingReference {
     $ref: string;
 }
@@ -740,4 +752,4 @@ interface OcrStrategy {
     koreanHanjaMixPages?: number[];
 }
-export { BCP47_LANGUAGE_TAGS, BCP47_LANGUAGE_TAG_SET, type Bcp47LanguageTag, type Caption, type Chapter, type ComponentUsageReport, type DoclingBBox, type DoclingBaseNode, type DoclingBody, type DoclingDocument, type DoclingGroupItem, type DoclingOrigin, type DoclingPage, type DoclingPageImage, type DoclingPictureItem, type DoclingProv, type DoclingReference, type DoclingTableCell, type DoclingTableData, type DoclingTableItem, type DoclingTextItem, type DocumentProcessResult, type ModelUsageDetail, type OcrStrategy, type PageRange, type PhaseUsageReport, type ProcessedDocument, type ProcessedFootnote, type ProcessedImage, type ProcessedTable, type ProcessedTableCell, type TextBlock, type TokenUsageReport, type TokenUsageSummary, isValidBcp47Tag, normalizeToBcp47 };
+export { BCP47_LANGUAGE_TAGS, BCP47_LANGUAGE_TAG_SET, type Bcp47LanguageTag, type Caption, type Chapter, type ComponentUsageReport, type DoclingBBox, type DoclingBaseNode, type DoclingBody, type DoclingDocument, type DoclingGroupItem, type DoclingOrigin, type DoclingPage, type DoclingPageImage, type DoclingPictureItem, type DoclingProv, type DoclingReference, type DoclingTableCell, type DoclingTableData, type DoclingTableItem, type DoclingTextItem, type DocumentProcessResult, LANGUAGE_DISPLAY_NAMES, type ModelUsageDetail, type OcrStrategy, type PageRange, type PhaseUsageReport, type ProcessedDocument, type ProcessedFootnote, type ProcessedImage, type ProcessedTable, type ProcessedTableCell, type TextBlock, type TokenUsageReport, type TokenUsageSummary, buildLanguageDescription, getLanguageDisplayName, isValidBcp47Tag, normalizeToBcp47 };

package/dist/index.js CHANGED Viewed

@@ -1,4 +1,4 @@
-// src/bcp47-language-tag.ts
+// src/language/bcp47-language-tag.ts
 var BCP47_LANGUAGE_TAGS = [
   "ar-SA",
   "ars-SA",
@@ -74,9 +74,37 @@ function normalizeToBcp47(tag) {
   }
   return null;
 }
+// src/language/language-display.ts
+var LANGUAGE_DISPLAY_NAMES = {
+  ko: "Korean (\uD55C\uAD6D\uC5B4)",
+  ja: "Japanese (\u65E5\u672C\u8A9E)",
+  zh: "Chinese (\u4E2D\u6587)",
+  en: "English",
+  fr: "French (Fran\xE7ais)",
+  de: "German (Deutsch)",
+  es: "Spanish (Espa\xF1ol)",
+  pt: "Portuguese (Portugu\xEAs)",
+  ru: "Russian (\u0420\u0443\u0441\u0441\u043A\u0438\u0439)",
+  uk: "Ukrainian (\u0423\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u0430)",
+  it: "Italian (Italiano)"
+};
+function getLanguageDisplayName(code) {
+  if (!code) return "unknown";
+  const baseCode = code.split("-")[0];
+  return LANGUAGE_DISPLAY_NAMES[baseCode] ?? code;
+}
+function buildLanguageDescription(documentLanguages) {
+  const primaryName = getLanguageDisplayName(documentLanguages[0]);
+  const otherNames = documentLanguages.slice(1).map((code) => getLanguageDisplayName(code));
+  return otherNames.length > 0 ? `primarily written in ${primaryName}, with ${otherNames.join(", ")} also present` : `written in ${primaryName}`;
+}
 export {
   BCP47_LANGUAGE_TAGS,
   BCP47_LANGUAGE_TAG_SET,
+  LANGUAGE_DISPLAY_NAMES,
+  buildLanguageDescription,
+  getLanguageDisplayName,
   isValidBcp47Tag,
   normalizeToBcp47
 };

package/dist/index.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"sources":["../src/bcp47-language-tag.ts"],"sourcesContent":["/*\n Language tags supported by the ocrmac OCR engine (via docling-serve).\n * These are the only tags that can be passed to the Docling pipeline without\n * triggering \"Invalid language preference\" errors.\n /\nexport const BCP47_LANGUAGE_TAGS = [\n 'ar-SA',\n 'ars-SA',\n 'cs-CZ',\n 'da-DK',\n 'de-DE',\n 'en-US',\n 'es-ES',\n 'fr-FR',\n 'id-ID',\n 'it-IT',\n 'ja-JP',\n 'ko-KR',\n 'ms-MY',\n 'nb-NO',\n 'nl-NL',\n 'nn-NO',\n 'no-NO',\n 'pl-PL',\n 'pt-BR',\n 'ro-RO',\n 'ru-RU',\n 'sv-SE',\n 'th-TH',\n 'tr-TR',\n 'uk-UA',\n 'vi-VT',\n 'yue-Hans',\n 'yue-Hant',\n 'zh-Hans',\n 'zh-Hant',\n] as const;\n\n/* Union type of all supported BCP 47 language tags /\nexport type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];\n\n/* Set for O(1) lookup of valid BCP 47 tags /\nexport const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string> = new Set(\n BCP47_LANGUAGE_TAGS,\n);\n\n/* Check whether a string is a valid BCP 47 language tag /\nexport function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag {\n return BCP47_LANGUAGE_TAG_SET.has(tag);\n}\n\n/\n Maps bare language codes to their default BCP 47 tag.\n * Used when VLM returns only a language code without a region subtag.\n /\nconst DEFAULT_REGION_MAP: Record<string, Bcp47LanguageTag> = {\n ar: 'ar-SA',\n cs: 'cs-CZ',\n da: 'da-DK',\n de: 'de-DE',\n en: 'en-US',\n es: 'es-ES',\n fr: 'fr-FR',\n id: 'id-ID',\n it: 'it-IT',\n ja: 'ja-JP',\n ko: 'ko-KR',\n ms: 'ms-MY',\n nl: 'nl-NL',\n no: 'no-NO',\n pl: 'pl-PL',\n pt: 'pt-BR',\n ro: 'ro-RO',\n ru: 'ru-RU',\n sv: 'sv-SE',\n th: 'th-TH',\n tr: 'tr-TR',\n uk: 'uk-UA',\n vi: 'vi-VT',\n zh: 'zh-Hans',\n};\n\n/\n Normalize a language string to a valid BCP 47 tag.\n \n - If the input is already a valid full tag (e.g. \"en-US\"), return it as-is.\n * - If it is a bare language code (e.g. \"en\", \"ko\"), map it to the default region.\n * - Otherwise return null (e.g. 
\"und\", \"unknown\", empty string).\n */\nexport function normalizeToBcp47(tag: string): Bcp47LanguageTag \| null {\n if (isValidBcp47Tag(tag)) {\n return tag;\n }\n\n const lower = tag.toLowerCase();\n const mapped = DEFAULT_REGION_MAP[lower];\n if (mapped) {\n return mapped;\n }\n\n return null;\n}\n"],"mappings":";AAKO,IAAM,sBAAsB;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAMO,IAAM,yBAA8C,IAAI;AAAA,EAC7D;AACF;AAGO,SAAS,gBAAgB,KAAsC;AACpE,SAAO,uBAAuB,IAAI,GAAG;AACvC;AAMA,IAAM,qBAAuD;AAAA,EAC3D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AASO,SAAS,iBAAiB,KAAsC;AACrE,MAAI,gBAAgB,GAAG,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,IAAI,YAAY;AAC9B,QAAM,SAAS,mBAAmB,KAAK;AACvC,MAAI,QAAQ;AACV,WAAO;AAAA,EACT;AAEA,SAAO;AACT;","names":[]}
1	+ {"version":3,"sources":["../src/language/bcp47-language-tag.ts","../src/language/language-display.ts"],"sourcesContent":["/*\n Language tags supported by the ocrmac OCR engine (via docling-serve).\n * These are the only tags that can be passed to the Docling pipeline without\n * triggering \"Invalid language preference\" errors.\n /\nexport const BCP47_LANGUAGE_TAGS = [\n 'ar-SA',\n 'ars-SA',\n 'cs-CZ',\n 'da-DK',\n 'de-DE',\n 'en-US',\n 'es-ES',\n 'fr-FR',\n 'id-ID',\n 'it-IT',\n 'ja-JP',\n 'ko-KR',\n 'ms-MY',\n 'nb-NO',\n 'nl-NL',\n 'nn-NO',\n 'no-NO',\n 'pl-PL',\n 'pt-BR',\n 'ro-RO',\n 'ru-RU',\n 'sv-SE',\n 'th-TH',\n 'tr-TR',\n 'uk-UA',\n 'vi-VT',\n 'yue-Hans',\n 'yue-Hant',\n 'zh-Hans',\n 'zh-Hant',\n] as const;\n\n/* Union type of all supported BCP 47 language tags /\nexport type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];\n\n/* Set for O(1) lookup of valid BCP 47 tags /\nexport const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string> = new Set(\n BCP47_LANGUAGE_TAGS,\n);\n\n/* Check whether a string is a valid BCP 47 language tag /\nexport function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag {\n return BCP47_LANGUAGE_TAG_SET.has(tag);\n}\n\n/\n Maps bare language codes to their default BCP 47 tag.\n * Used when VLM returns only a language code without a region subtag.\n /\nconst DEFAULT_REGION_MAP: Record<string, Bcp47LanguageTag> = {\n ar: 'ar-SA',\n cs: 'cs-CZ',\n da: 'da-DK',\n de: 'de-DE',\n en: 'en-US',\n es: 'es-ES',\n fr: 'fr-FR',\n id: 'id-ID',\n it: 'it-IT',\n ja: 'ja-JP',\n ko: 'ko-KR',\n ms: 'ms-MY',\n nl: 'nl-NL',\n no: 'no-NO',\n pl: 'pl-PL',\n pt: 'pt-BR',\n ro: 'ro-RO',\n ru: 'ru-RU',\n sv: 'sv-SE',\n th: 'th-TH',\n tr: 'tr-TR',\n uk: 'uk-UA',\n vi: 'vi-VT',\n zh: 'zh-Hans',\n};\n\n/\n Normalize a language string to a valid BCP 47 tag.\n \n - If the input is already a valid full tag (e.g. \"en-US\"), return it as-is.\n * - If it is a bare language code (e.g. 
\"en\", \"ko\"), map it to the default region.\n * - Otherwise return null (e.g. \"und\", \"unknown\", empty string).\n /\nexport function normalizeToBcp47(tag: string): Bcp47LanguageTag \| null {\n if (isValidBcp47Tag(tag)) {\n return tag;\n }\n\n const lower = tag.toLowerCase();\n const mapped = DEFAULT_REGION_MAP[lower];\n if (mapped) {\n return mapped;\n }\n\n return null;\n}\n","/* Language display names for prompt context (keyed by ISO 639-1 base language code) /\nexport const LANGUAGE_DISPLAY_NAMES: Record<string, string> = {\n ko: 'Korean (한국어)',\n ja: 'Japanese (日本語)',\n zh: 'Chinese (中文)',\n en: 'English',\n fr: 'French (Français)',\n de: 'German (Deutsch)',\n es: 'Spanish (Español)',\n pt: 'Portuguese (Português)',\n ru: 'Russian (Русский)',\n uk: 'Ukrainian (Українська)',\n it: 'Italian (Italiano)',\n};\n\n/\n Get human-readable display name for a BCP 47 or ISO 639-1 language code.\n /\nexport function getLanguageDisplayName(code?: string): string {\n if (!code) return 'unknown';\n const baseCode = code.split('-')[0];\n return LANGUAGE_DISPLAY_NAMES[baseCode] ?? code;\n}\n\n/\n Build language description string from document languages.\n * @returns e.g. \"primarily written in Korean (한국어), with English also present\"\n */\nexport function buildLanguageDescription(documentLanguages: string[]): string {\n const primaryName = getLanguageDisplayName(documentLanguages[0]);\n const otherNames = documentLanguages\n .slice(1)\n .map((code) => getLanguageDisplayName(code));\n return otherNames.length > 0\n ? 
`primarily written in ${primaryName}, with ${otherNames.join(', ')} also present`\n : `written in ${primaryName}`;\n}\n"],"mappings":";AAKO,IAAM,sBAAsB;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAMO,IAAM,yBAA8C,IAAI;AAAA,EAC7D;AACF;AAGO,SAAS,gBAAgB,KAAsC;AACpE,SAAO,uBAAuB,IAAI,GAAG;AACvC;AAMA,IAAM,qBAAuD;AAAA,EAC3D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AASO,SAAS,iBAAiB,KAAsC;AACrE,MAAI,gBAAgB,GAAG,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,IAAI,YAAY;AAC9B,QAAM,SAAS,mBAAmB,KAAK;AACvC,MAAI,QAAQ;AACV,WAAO;AAAA,EACT;AAEA,SAAO;AACT;;;ACpGO,IAAM,yBAAiD;AAAA,EAC5D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AAKO,SAAS,uBAAuB,MAAuB;AAC5D,MAAI,CAAC,KAAM,QAAO;AAClB,QAAM,WAAW,KAAK,MAAM,GAAG,EAAE,CAAC;AAClC,SAAO,uBAAuB,QAAQ,KAAK;AAC7C;AAMO,SAAS,yBAAyB,mBAAqC;AAC5E,QAAM,cAAc,uBAAuB,kBAAkB,CAAC,CAAC;AAC/D,QAAM,aAAa,kBAChB,MAAM,CAAC,EACP,IAAI,CAAC,SAAS,uBAAuB,IAAI,CAAC;AAC7C,SAAO,WAAW,SAAS,IACvB,wBAAwB,WAAW,UAAU,WAAW,KAAK,IAAI,CAAC,kBAClE,cAAc,WAAW;AAC/B;","names":[]}

package/package.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "name": "@heripo/model",
   "private": false,
   "type": "module",
-  "version": "0.1.16",
+  "version": "0.1.17",
   "description": "Document models and type definitions for heripo engine",
   "main": "dist/index.cjs",
   "module": "dist/index.js",
@@ -51,8 +51,11 @@
     "access": "public"
   },
   "devDependencies": {
+    "@vitest/coverage-v8": "^4.1.0",
     "tsup": "^8.5.1",
+    "vitest": "^4.1.0",
     "@heripo/tsconfig": "0.0.0",
+    "@heripo/vitest-config": "0.0.0",
     "@heripo/tsup-config": "0.0.0"
   },
   "scripts": {
@@ -60,6 +63,11 @@
     "build": "pnpm clean && tsup",
     "dev": "tsup --watch",
     "typecheck": "tsc --noEmit",
+    "test": "vitest run",
+    "test:watch": "vitest",
+    "test:ci": "TEST_MODE=ci vitest run --coverage",
+    "test:coverage": "vitest run --coverage",
+    "test:coverage:watch": "vitest --coverage",
     "lint": "eslint src/**/*.ts",
     "lint:fix": "eslint src/**/*.ts --fix"
   }