@heripo/model 0.1.16 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.ko.md CHANGED
@@ -72,10 +72,11 @@ import type { ProcessedDocument } from '@heripo/model';
72
72
 
73
73
  interface ProcessedDocument {
74
74
  reportId: string; // 리포트 ID
75
- pageRangeMap: PageRange[]; // PDF 페이지 → 문서 페이지 매핑
75
+ pageRangeMap: Record<number, PageRange>; // PDF 페이지 → 문서 페이지 매핑
76
76
  chapters: Chapter[]; // 계층적 챕터 구조
77
77
  images: ProcessedImage[]; // 추출된 이미지 메타데이터
78
78
  tables: ProcessedTable[]; // 추출된 테이블 데이터
79
+ footnotes: ProcessedFootnote[]; // 추출된 각주
79
80
  }
80
81
  ```
81
82
 
@@ -89,12 +90,14 @@ import type { Chapter } from '@heripo/model';
89
90
  interface Chapter {
90
91
  id: string; // 챕터 ID
91
92
  title: string; // 챕터 제목
93
+ originTitle: string; // 원본 제목
92
94
  level: number; // 계층 레벨 (1, 2, 3, ...)
93
- pageNo?: number; // 시작 페이지 번호
95
+ pageNo: number; // 시작 페이지 번호
94
96
  textBlocks: TextBlock[]; // 텍스트 블록
95
97
  imageIds: string[]; // 이미지 ID 참조
96
98
  tableIds: string[]; // 테이블 ID 참조
97
- children: Chapter[]; // 하위 챕터
99
+ footnoteIds: string[]; // 각주 ID 참조
100
+ children?: Chapter[]; // 하위 챕터 (선택)
98
101
  }
99
102
  ```
100
103
 
@@ -107,7 +110,7 @@ import type { TextBlock } from '@heripo/model';
107
110
 
108
111
  interface TextBlock {
109
112
  text: string; // 텍스트 내용
110
- pageNo?: number; // 페이지 번호
113
+ pdfPageNo: number; // PDF 페이지 번호
111
114
  }
112
115
  ```
113
116
 
@@ -121,8 +124,8 @@ import type { ProcessedImage } from '@heripo/model';
121
124
  interface ProcessedImage {
122
125
  id: string; // 이미지 ID
123
126
  caption?: Caption; // 캡션 (선택)
124
- pdfPageNo?: number; // PDF 페이지 번호
125
- filePath: string; // 이미지 파일 경로
127
+ pdfPageNo: number; // PDF 페이지 번호
128
+ path: string; // 이미지 파일 경로
126
129
  }
127
130
  ```
128
131
 
@@ -136,8 +139,8 @@ import type { ProcessedTable } from '@heripo/model';
136
139
  interface ProcessedTable {
137
140
  id: string; // 테이블 ID
138
141
  caption?: Caption; // 캡션 (선택)
139
- pdfPageNo?: number; // PDF 페이지 번호
140
- data: ProcessedTableCell[][]; // 2D 그리드 데이터
142
+ pdfPageNo: number; // PDF 페이지 번호
143
+ grid: ProcessedTableCell[][]; // 2D 그리드 데이터
141
144
  numRows: number; // 행 개수
142
145
  numCols: number; // 열 개수
143
146
  }
@@ -152,8 +155,8 @@ import type { ProcessedTableCell } from '@heripo/model';
152
155
 
153
156
  interface ProcessedTableCell {
154
157
  text: string; // 셀 텍스트
155
- rowspan: number; // 행 병합
156
- colspan: number; // 열 병합
158
+ rowSpan: number; // 행 병합
159
+ colSpan: number; // 열 병합
157
160
  isHeader: boolean; // 헤더 셀 여부
158
161
  }
159
162
  ```
@@ -166,7 +169,7 @@ interface ProcessedTableCell {
166
169
  import type { Caption } from '@heripo/model';
167
170
 
168
171
  interface Caption {
169
- num?: number; // 캡션 번호 (예: "그림 1"의 1)
172
+ num?: string; // 캡션 번호 (예: "그림 1"의 "1")
170
173
  fullText: string; // 전체 캡션 텍스트
171
174
  }
172
175
  ```
@@ -179,11 +182,130 @@ PDF 페이지와 문서 페이지 매핑입니다.
179
182
  import type { PageRange } from '@heripo/model';
180
183
 
181
184
  interface PageRange {
185
+ startPageNo: number; // 시작 페이지 번호
186
+ endPageNo: number; // 끝 페이지 번호
187
+ }
188
+ ```
189
+
190
+ ### ProcessedFootnote
191
+
192
+ 문서에서 추출된 각주입니다.
193
+
194
+ ```typescript
195
+ import type { ProcessedFootnote } from '@heripo/model';
196
+
197
+ interface ProcessedFootnote {
198
+ id: string; // 각주 ID
199
+ text: string; // 각주 텍스트
182
200
  pdfPageNo: number; // PDF 페이지 번호
183
- pageNo: number; // 문서 논리적 페이지 번호
184
201
  }
185
202
  ```
186
203
 
204
+ ### DocumentProcessResult
205
+
206
+ 문서 처리 결과로, 처리된 문서와 토큰 사용량 리포트를 포함합니다.
207
+
208
+ ```typescript
209
+ import type { DocumentProcessResult } from '@heripo/model';
210
+
211
+ interface DocumentProcessResult {
212
+ document: ProcessedDocument; // 처리된 문서
213
+ usage: TokenUsageReport; // 토큰 사용량 리포트
214
+ }
215
+ ```
216
+
217
+ ### OcrStrategy
218
+
219
+ OCR 전략 선택 결과입니다.
220
+
221
+ ```typescript
222
+ import type { OcrStrategy } from '@heripo/model';
223
+
224
+ interface OcrStrategy {
225
+ method: 'ocrmac' | 'vlm'; // OCR 방법
226
+ ocrLanguages?: string[]; // OCR 언어
227
+ detectedLanguages?: Bcp47LanguageTag[]; // 감지된 BCP-47 언어 태그
228
+ reason: string; // 전략 선택 이유
229
+ sampledPages: number; // 샘플링된 페이지 수
230
+ totalPages: number; // 문서 전체 페이지 수
231
+ koreanHanjaMixPages?: number[]; // 한국어-한자 혼용 페이지
232
+ }
233
+ ```
234
+
235
+ ### 토큰 사용량 타입
236
+
237
+ 처리 단계별 LLM 토큰 사용량을 추적하기 위한 타입입니다.
238
+
239
+ ```typescript
240
+ import type {
241
+ ComponentUsageReport,
242
+ ModelUsageDetail,
243
+ PhaseUsageReport,
244
+ TokenUsageReport,
245
+ TokenUsageSummary,
246
+ } from '@heripo/model';
247
+
248
+ interface TokenUsageReport {
249
+ components: ComponentUsageReport[]; // 컴포넌트별 사용량
250
+ total: TokenUsageSummary; // 전체 사용량 요약
251
+ }
252
+
253
+ interface ComponentUsageReport {
254
+ component: string; // 컴포넌트 이름
255
+ phases: PhaseUsageReport[]; // 단계별 사용량
256
+ total: TokenUsageSummary; // 컴포넌트 합계
257
+ }
258
+
259
+ interface PhaseUsageReport {
260
+ phase: string; // 단계 이름
261
+ primary?: ModelUsageDetail; // 기본 모델 사용량
262
+ fallback?: ModelUsageDetail; // 폴백 모델 사용량
263
+ total: TokenUsageSummary; // 단계 합계
264
+ }
265
+
266
+ interface ModelUsageDetail {
267
+ modelName: string; // 모델 이름
268
+ inputTokens: number; // 입력 토큰 수
269
+ outputTokens: number; // 출력 토큰 수
270
+ totalTokens: number; // 전체 토큰 수
271
+ }
272
+
273
+ interface TokenUsageSummary {
274
+ inputTokens: number; // 입력 토큰 수
275
+ outputTokens: number; // 출력 토큰 수
276
+ totalTokens: number; // 전체 토큰 수
277
+ }
278
+ ```
279
+
280
+ ### BCP-47 언어 태그 유틸리티
281
+
282
+ BCP-47 언어 태그를 다루기 위한 유틸리티입니다.
283
+
284
+ ```typescript
285
+ import {
286
+ type Bcp47LanguageTag,
287
+ BCP47_LANGUAGE_TAGS,
288
+ BCP47_LANGUAGE_TAG_SET,
289
+ isValidBcp47Tag,
290
+ normalizeToBcp47,
291
+ } from '@heripo/model';
292
+
293
+ // Bcp47LanguageTag - 지원되는 BCP-47 언어 태그의 유니온 타입
294
+ type Bcp47LanguageTag = 'ko-KR' | 'en-US' | 'ja-JP' | 'zh-Hans' /* | ... */;
295
+
296
+ // BCP47_LANGUAGE_TAGS - 30개 지원 태그의 상수 배열
297
+ const BCP47_LANGUAGE_TAGS: readonly Bcp47LanguageTag[];
298
+
299
+ // BCP47_LANGUAGE_TAG_SET - O(1) 조회를 위한 ReadonlySet
300
+ const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string>;
301
+
302
+ // isValidBcp47Tag - 문자열이 지원되는 BCP-47 태그(BCP47_LANGUAGE_TAGS) 중 하나인지 확인
303
+ function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag;
304
+
305
+ // normalizeToBcp47 - 언어 문자열을 지원되는 BCP-47 태그로 정규화 (매핑할 수 없으면 null 반환)
306
+ function normalizeToBcp47(tag: string): Bcp47LanguageTag | null;
307
+ ```
308
+
187
309
  ## 사용법
188
310
 
189
311
  ### ProcessedDocument 읽기
@@ -200,7 +322,7 @@ function analyzeDocument(doc: ProcessedDocument) {
200
322
  console.log(` 텍스트 블록: ${chapter.textBlocks.length}개`);
201
323
  console.log(` 이미지: ${chapter.imageIds.length}개`);
202
324
  console.log(` 테이블: ${chapter.tableIds.length}개`);
203
- console.log(` 하위 챕터: ${chapter.children.length}개`);
325
+ console.log(` 하위 챕터: ${chapter.children?.length ?? 0}개`);
204
326
  });
205
327
 
206
328
  // 이미지 확인
@@ -209,7 +331,7 @@ function analyzeDocument(doc: ProcessedDocument) {
209
331
  if (image.caption) {
210
332
  console.log(` 캡션: ${image.caption.fullText}`);
211
333
  }
212
- console.log(` 경로: ${image.filePath}`);
334
+ console.log(` 경로: ${image.path}`);
213
335
  });
214
336
 
215
337
  // 테이블 확인
@@ -233,7 +355,7 @@ function traverseChapters(chapter: Chapter, depth: number = 0) {
233
355
  console.log(`${indent}- ${chapter.title}`);
234
356
 
235
357
  // 재귀적으로 하위 챕터 순회
236
- chapter.children.forEach((child) => {
358
+ chapter.children?.forEach((child) => {
237
359
  traverseChapters(child, depth + 1);
238
360
  });
239
361
  }
package/README.md CHANGED
@@ -72,10 +72,11 @@ import type { ProcessedDocument } from '@heripo/model';
72
72
 
73
73
  interface ProcessedDocument {
74
74
  reportId: string; // Report ID
75
- pageRangeMap: PageRange[]; // PDF page → document page mapping
75
+ pageRangeMap: Record<number, PageRange>; // PDF page → document page mapping
76
76
  chapters: Chapter[]; // Hierarchical chapter structure
77
77
  images: ProcessedImage[]; // Extracted image metadata
78
78
  tables: ProcessedTable[]; // Extracted table data
79
+ footnotes: ProcessedFootnote[]; // Extracted footnotes
79
80
  }
80
81
  ```
81
82
 
@@ -89,12 +90,14 @@ import type { Chapter } from '@heripo/model';
89
90
  interface Chapter {
90
91
  id: string; // Chapter ID
91
92
  title: string; // Chapter title
93
+ originTitle: string; // Original title from source
92
94
  level: number; // Hierarchy level (1, 2, 3, ...)
93
- pageNo?: number; // Start page number
95
+ pageNo: number; // Start page number
94
96
  textBlocks: TextBlock[]; // Text blocks
95
97
  imageIds: string[]; // Image ID references
96
98
  tableIds: string[]; // Table ID references
97
- children: Chapter[]; // Sub-chapters
99
+ footnoteIds: string[]; // Footnote ID references
100
+ children?: Chapter[]; // Sub-chapters (optional)
98
101
  }
99
102
  ```
100
103
 
@@ -107,7 +110,7 @@ import type { TextBlock } from '@heripo/model';
107
110
 
108
111
  interface TextBlock {
109
112
  text: string; // Text content
110
- pageNo?: number; // Page number
113
+ pdfPageNo: number; // PDF page number
111
114
  }
112
115
  ```
113
116
 
@@ -121,8 +124,8 @@ import type { ProcessedImage } from '@heripo/model';
121
124
  interface ProcessedImage {
122
125
  id: string; // Image ID
123
126
  caption?: Caption; // Caption (optional)
124
- pdfPageNo?: number; // PDF page number
125
- filePath: string; // Image file path
127
+ pdfPageNo: number; // PDF page number
128
+ path: string; // Image file path
126
129
  }
127
130
  ```
128
131
 
@@ -136,8 +139,8 @@ import type { ProcessedTable } from '@heripo/model';
136
139
  interface ProcessedTable {
137
140
  id: string; // Table ID
138
141
  caption?: Caption; // Caption (optional)
139
- pdfPageNo?: number; // PDF page number
140
- data: ProcessedTableCell[][]; // 2D grid data
142
+ pdfPageNo: number; // PDF page number
143
+ grid: ProcessedTableCell[][]; // 2D grid data
141
144
  numRows: number; // Row count
142
145
  numCols: number; // Column count
143
146
  }
@@ -152,8 +155,8 @@ import type { ProcessedTableCell } from '@heripo/model';
152
155
 
153
156
  interface ProcessedTableCell {
154
157
  text: string; // Cell text
155
- rowspan: number; // Row span
156
- colspan: number; // Column span
158
+ rowSpan: number; // Row span
159
+ colSpan: number; // Column span
157
160
  isHeader: boolean; // Is header cell
158
161
  }
159
162
  ```
@@ -166,7 +169,7 @@ Image and table captions.
166
169
  import type { Caption } from '@heripo/model';
167
170
 
168
171
  interface Caption {
169
- num?: number; // Caption number (e.g., 1 in "Figure 1")
172
+ num?: string; // Caption number (e.g., "1" in "Figure 1")
170
173
  fullText: string; // Full caption text
171
174
  }
172
175
  ```
@@ -179,11 +182,130 @@ PDF page to document page mapping.
179
182
  import type { PageRange } from '@heripo/model';
180
183
 
181
184
  interface PageRange {
185
+ startPageNo: number; // Start page number
186
+ endPageNo: number; // End page number
187
+ }
188
+ ```
189
+
190
+ ### ProcessedFootnote
191
+
192
+ Footnote extracted from the document.
193
+
194
+ ```typescript
195
+ import type { ProcessedFootnote } from '@heripo/model';
196
+
197
+ interface ProcessedFootnote {
198
+ id: string; // Footnote ID
199
+ text: string; // Footnote text
182
200
  pdfPageNo: number; // PDF page number
183
- pageNo: number; // Document logical page number
184
201
  }
185
202
  ```
186
203
 
204
+ ### DocumentProcessResult
205
+
206
+ Result of document processing, including the processed document and token usage report.
207
+
208
+ ```typescript
209
+ import type { DocumentProcessResult } from '@heripo/model';
210
+
211
+ interface DocumentProcessResult {
212
+ document: ProcessedDocument; // Processed document
213
+ usage: TokenUsageReport; // Token usage report
214
+ }
215
+ ```
216
+
217
+ ### OcrStrategy
218
+
219
+ OCR strategy selection result.
220
+
221
+ ```typescript
222
+ import type { OcrStrategy } from '@heripo/model';
223
+
224
+ interface OcrStrategy {
225
+ method: 'ocrmac' | 'vlm'; // OCR method
226
+ ocrLanguages?: string[]; // OCR languages
227
+ detectedLanguages?: Bcp47LanguageTag[]; // Detected BCP-47 language tags
228
+ reason: string; // Reason for strategy selection
229
+ sampledPages: number; // Number of sampled pages
230
+ totalPages: number; // Total pages in document
231
+ koreanHanjaMixPages?: number[]; // Pages with Korean-Hanja mixed script
232
+ }
233
+ ```
234
+
235
+ ### Token Usage Types
236
+
237
+ Types for tracking LLM token usage across processing phases.
238
+
239
+ ```typescript
240
+ import type {
241
+ ComponentUsageReport,
242
+ ModelUsageDetail,
243
+ PhaseUsageReport,
244
+ TokenUsageReport,
245
+ TokenUsageSummary,
246
+ } from '@heripo/model';
247
+
248
+ interface TokenUsageReport {
249
+ components: ComponentUsageReport[]; // Usage per component
250
+ total: TokenUsageSummary; // Total usage summary
251
+ }
252
+
253
+ interface ComponentUsageReport {
254
+ component: string; // Component name
255
+ phases: PhaseUsageReport[]; // Usage per phase
256
+ total: TokenUsageSummary; // Component total
257
+ }
258
+
259
+ interface PhaseUsageReport {
260
+ phase: string; // Phase name
261
+ primary?: ModelUsageDetail; // Primary model usage
262
+ fallback?: ModelUsageDetail; // Fallback model usage
263
+ total: TokenUsageSummary; // Phase total
264
+ }
265
+
266
+ interface ModelUsageDetail {
267
+ modelName: string; // Model name
268
+ inputTokens: number; // Input token count
269
+ outputTokens: number; // Output token count
270
+ totalTokens: number; // Total token count
271
+ }
272
+
273
+ interface TokenUsageSummary {
274
+ inputTokens: number; // Input token count
275
+ outputTokens: number; // Output token count
276
+ totalTokens: number; // Total token count
277
+ }
278
+ ```
279
+
280
+ ### BCP-47 Language Tag Utilities
281
+
282
+ Utilities for working with BCP-47 language tags.
283
+
284
+ ```typescript
285
+ import {
286
+ type Bcp47LanguageTag,
287
+ BCP47_LANGUAGE_TAGS,
288
+ BCP47_LANGUAGE_TAG_SET,
289
+ isValidBcp47Tag,
290
+ normalizeToBcp47,
291
+ } from '@heripo/model';
292
+
293
+ // Bcp47LanguageTag - Union type of supported BCP-47 language tags
294
+ type Bcp47LanguageTag = 'ko-KR' | 'en-US' | 'ja-JP' | 'zh-Hans' /* | ... */;
295
+
296
+ // BCP47_LANGUAGE_TAGS - Const array of 30 supported tags
297
+ const BCP47_LANGUAGE_TAGS: readonly Bcp47LanguageTag[];
298
+
299
+ // BCP47_LANGUAGE_TAG_SET - ReadonlySet for O(1) lookup
300
+ const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string>;
301
+
302
+ // isValidBcp47Tag - Check if a string is one of the supported BCP-47 tags (BCP47_LANGUAGE_TAGS)
303
+ function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag;
304
+
305
+ // normalizeToBcp47 - Normalize a language string to a supported BCP-47 tag (returns null if it cannot be mapped)
306
+ function normalizeToBcp47(tag: string): Bcp47LanguageTag | null;
307
+ ```
308
+
187
309
  ## Usage
188
310
 
189
311
  ### Reading ProcessedDocument
@@ -200,7 +322,7 @@ function analyzeDocument(doc: ProcessedDocument) {
200
322
  console.log(` Text blocks: ${chapter.textBlocks.length}`);
201
323
  console.log(` Images: ${chapter.imageIds.length}`);
202
324
  console.log(` Tables: ${chapter.tableIds.length}`);
203
- console.log(` Sub-chapters: ${chapter.children.length}`);
325
+ console.log(` Sub-chapters: ${chapter.children?.length ?? 0}`);
204
326
  });
205
327
 
206
328
  // Check images
@@ -209,7 +331,7 @@ function analyzeDocument(doc: ProcessedDocument) {
209
331
  if (image.caption) {
210
332
  console.log(` Caption: ${image.caption.fullText}`);
211
333
  }
212
- console.log(` Path: ${image.filePath}`);
334
+ console.log(` Path: ${image.path}`);
213
335
  });
214
336
 
215
337
  // Check tables
@@ -233,7 +355,7 @@ function traverseChapters(chapter: Chapter, depth: number = 0) {
233
355
  console.log(`${indent}- ${chapter.title}`);
234
356
 
235
357
  // Recursively traverse sub-chapters
236
- chapter.children.forEach((child) => {
358
+ chapter.children?.forEach((child) => {
237
359
  traverseChapters(child, depth + 1);
238
360
  });
239
361
  }
package/dist/index.cjs CHANGED
@@ -22,12 +22,15 @@ var index_exports = {};
22
22
  __export(index_exports, {
23
23
  BCP47_LANGUAGE_TAGS: () => BCP47_LANGUAGE_TAGS,
24
24
  BCP47_LANGUAGE_TAG_SET: () => BCP47_LANGUAGE_TAG_SET,
25
+ LANGUAGE_DISPLAY_NAMES: () => LANGUAGE_DISPLAY_NAMES,
26
+ buildLanguageDescription: () => buildLanguageDescription,
27
+ getLanguageDisplayName: () => getLanguageDisplayName,
25
28
  isValidBcp47Tag: () => isValidBcp47Tag,
26
29
  normalizeToBcp47: () => normalizeToBcp47
27
30
  });
28
31
  module.exports = __toCommonJS(index_exports);
29
32
 
30
- // src/bcp47-language-tag.ts
33
+ // src/language/bcp47-language-tag.ts
31
34
  var BCP47_LANGUAGE_TAGS = [
32
35
  "ar-SA",
33
36
  "ars-SA",
@@ -103,10 +106,38 @@ function normalizeToBcp47(tag) {
103
106
  }
104
107
  return null;
105
108
  }
109
+
110
+ // src/language/language-display.ts
111
+ var LANGUAGE_DISPLAY_NAMES = {
112
+ ko: "Korean (\uD55C\uAD6D\uC5B4)",
113
+ ja: "Japanese (\u65E5\u672C\u8A9E)",
114
+ zh: "Chinese (\u4E2D\u6587)",
115
+ en: "English",
116
+ fr: "French (Fran\xE7ais)",
117
+ de: "German (Deutsch)",
118
+ es: "Spanish (Espa\xF1ol)",
119
+ pt: "Portuguese (Portugu\xEAs)",
120
+ ru: "Russian (\u0420\u0443\u0441\u0441\u043A\u0438\u0439)",
121
+ uk: "Ukrainian (\u0423\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u0430)",
122
+ it: "Italian (Italiano)"
123
+ };
124
+ function getLanguageDisplayName(code) {
125
+ if (!code) return "unknown";
126
+ const baseCode = code.split("-")[0];
127
+ return LANGUAGE_DISPLAY_NAMES[baseCode] ?? code;
128
+ }
129
+ function buildLanguageDescription(documentLanguages) {
130
+ const primaryName = getLanguageDisplayName(documentLanguages[0]);
131
+ const otherNames = documentLanguages.slice(1).map((code) => getLanguageDisplayName(code));
132
+ return otherNames.length > 0 ? `primarily written in ${primaryName}, with ${otherNames.join(", ")} also present` : `written in ${primaryName}`;
133
+ }
106
134
  // Annotate the CommonJS export names for ESM import in node:
107
135
  0 && (module.exports = {
108
136
  BCP47_LANGUAGE_TAGS,
109
137
  BCP47_LANGUAGE_TAG_SET,
138
+ LANGUAGE_DISPLAY_NAMES,
139
+ buildLanguageDescription,
140
+ getLanguageDisplayName,
110
141
  isValidBcp47Tag,
111
142
  normalizeToBcp47
112
143
  });
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/index.ts","../src/bcp47-language-tag.ts"],"sourcesContent":["export type * from './bcp47-language-tag';\nexport {\n BCP47_LANGUAGE_TAGS,\n BCP47_LANGUAGE_TAG_SET,\n isValidBcp47Tag,\n normalizeToBcp47,\n} from './bcp47-language-tag';\nexport type * from './docling-document';\nexport type * from './processed-document';\nexport type * from './token-usage-report';\nexport type * from './document-process-result';\nexport type * from './ocr-strategy';\n","/**\n * Language tags supported by the ocrmac OCR engine (via docling-serve).\n * These are the only tags that can be passed to the Docling pipeline without\n * triggering \"Invalid language preference\" errors.\n */\nexport const BCP47_LANGUAGE_TAGS = [\n 'ar-SA',\n 'ars-SA',\n 'cs-CZ',\n 'da-DK',\n 'de-DE',\n 'en-US',\n 'es-ES',\n 'fr-FR',\n 'id-ID',\n 'it-IT',\n 'ja-JP',\n 'ko-KR',\n 'ms-MY',\n 'nb-NO',\n 'nl-NL',\n 'nn-NO',\n 'no-NO',\n 'pl-PL',\n 'pt-BR',\n 'ro-RO',\n 'ru-RU',\n 'sv-SE',\n 'th-TH',\n 'tr-TR',\n 'uk-UA',\n 'vi-VT',\n 'yue-Hans',\n 'yue-Hant',\n 'zh-Hans',\n 'zh-Hant',\n] as const;\n\n/** Union type of all supported BCP 47 language tags */\nexport type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];\n\n/** Set for O(1) lookup of valid BCP 47 tags */\nexport const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string> = new Set(\n BCP47_LANGUAGE_TAGS,\n);\n\n/** Check whether a string is a valid BCP 47 language tag */\nexport function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag {\n return BCP47_LANGUAGE_TAG_SET.has(tag);\n}\n\n/**\n * Maps bare language codes to their default BCP 47 tag.\n * Used when VLM returns only a language code without a region subtag.\n */\nconst DEFAULT_REGION_MAP: Record<string, Bcp47LanguageTag> = {\n ar: 'ar-SA',\n cs: 'cs-CZ',\n da: 'da-DK',\n de: 'de-DE',\n en: 'en-US',\n es: 'es-ES',\n fr: 'fr-FR',\n id: 'id-ID',\n it: 'it-IT',\n ja: 'ja-JP',\n ko: 'ko-KR',\n ms: 'ms-MY',\n nl: 'nl-NL',\n no: 'no-NO',\n pl: 'pl-PL',\n pt: 
'pt-BR',\n ro: 'ro-RO',\n ru: 'ru-RU',\n sv: 'sv-SE',\n th: 'th-TH',\n tr: 'tr-TR',\n uk: 'uk-UA',\n vi: 'vi-VT',\n zh: 'zh-Hans',\n};\n\n/**\n * Normalize a language string to a valid BCP 47 tag.\n *\n * - If the input is already a valid full tag (e.g. \"en-US\"), return it as-is.\n * - If it is a bare language code (e.g. \"en\", \"ko\"), map it to the default region.\n * - Otherwise return null (e.g. \"und\", \"unknown\", empty string).\n */\nexport function normalizeToBcp47(tag: string): Bcp47LanguageTag | null {\n if (isValidBcp47Tag(tag)) {\n return tag;\n }\n\n const lower = tag.toLowerCase();\n const mapped = DEFAULT_REGION_MAP[lower];\n if (mapped) {\n return mapped;\n }\n\n return null;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACKO,IAAM,sBAAsB;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAMO,IAAM,yBAA8C,IAAI;AAAA,EAC7D;AACF;AAGO,SAAS,gBAAgB,KAAsC;AACpE,SAAO,uBAAuB,IAAI,GAAG;AACvC;AAMA,IAAM,qBAAuD;AAAA,EAC3D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AASO,SAAS,iBAAiB,KAAsC;AACrE,MAAI,gBAAgB,GAAG,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,IAAI,YAAY;AAC9B,QAAM,SAAS,mBAAmB,KAAK;AACvC,MAAI,QAAQ;AACV,WAAO;AAAA,EACT;AAEA,SAAO;AACT;","names":[]}
1
+ {"version":3,"sources":["../src/index.ts","../src/language/bcp47-language-tag.ts","../src/language/language-display.ts"],"sourcesContent":["// Language utilities\nexport type * from './language/bcp47-language-tag';\nexport {\n BCP47_LANGUAGE_TAGS,\n BCP47_LANGUAGE_TAG_SET,\n isValidBcp47Tag,\n normalizeToBcp47,\n} from './language/bcp47-language-tag';\nexport {\n LANGUAGE_DISPLAY_NAMES,\n buildLanguageDescription,\n getLanguageDisplayName,\n} from './language/language-display';\n\n// Type definitions\nexport type * from './types/docling-document';\nexport type * from './types/document-process-result';\nexport type * from './types/ocr-strategy';\nexport type * from './types/processed-document';\nexport type * from './types/token-usage-report';\n","/**\n * Language tags supported by the ocrmac OCR engine (via docling-serve).\n * These are the only tags that can be passed to the Docling pipeline without\n * triggering \"Invalid language preference\" errors.\n */\nexport const BCP47_LANGUAGE_TAGS = [\n 'ar-SA',\n 'ars-SA',\n 'cs-CZ',\n 'da-DK',\n 'de-DE',\n 'en-US',\n 'es-ES',\n 'fr-FR',\n 'id-ID',\n 'it-IT',\n 'ja-JP',\n 'ko-KR',\n 'ms-MY',\n 'nb-NO',\n 'nl-NL',\n 'nn-NO',\n 'no-NO',\n 'pl-PL',\n 'pt-BR',\n 'ro-RO',\n 'ru-RU',\n 'sv-SE',\n 'th-TH',\n 'tr-TR',\n 'uk-UA',\n 'vi-VT',\n 'yue-Hans',\n 'yue-Hant',\n 'zh-Hans',\n 'zh-Hant',\n] as const;\n\n/** Union type of all supported BCP 47 language tags */\nexport type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];\n\n/** Set for O(1) lookup of valid BCP 47 tags */\nexport const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string> = new Set(\n BCP47_LANGUAGE_TAGS,\n);\n\n/** Check whether a string is a valid BCP 47 language tag */\nexport function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag {\n return BCP47_LANGUAGE_TAG_SET.has(tag);\n}\n\n/**\n * Maps bare language codes to their default BCP 47 tag.\n * Used when VLM returns only a language code without a region subtag.\n */\nconst DEFAULT_REGION_MAP: 
Record<string, Bcp47LanguageTag> = {\n ar: 'ar-SA',\n cs: 'cs-CZ',\n da: 'da-DK',\n de: 'de-DE',\n en: 'en-US',\n es: 'es-ES',\n fr: 'fr-FR',\n id: 'id-ID',\n it: 'it-IT',\n ja: 'ja-JP',\n ko: 'ko-KR',\n ms: 'ms-MY',\n nl: 'nl-NL',\n no: 'no-NO',\n pl: 'pl-PL',\n pt: 'pt-BR',\n ro: 'ro-RO',\n ru: 'ru-RU',\n sv: 'sv-SE',\n th: 'th-TH',\n tr: 'tr-TR',\n uk: 'uk-UA',\n vi: 'vi-VT',\n zh: 'zh-Hans',\n};\n\n/**\n * Normalize a language string to a valid BCP 47 tag.\n *\n * - If the input is already a valid full tag (e.g. \"en-US\"), return it as-is.\n * - If it is a bare language code (e.g. \"en\", \"ko\"), map it to the default region.\n * - Otherwise return null (e.g. \"und\", \"unknown\", empty string).\n */\nexport function normalizeToBcp47(tag: string): Bcp47LanguageTag | null {\n if (isValidBcp47Tag(tag)) {\n return tag;\n }\n\n const lower = tag.toLowerCase();\n const mapped = DEFAULT_REGION_MAP[lower];\n if (mapped) {\n return mapped;\n }\n\n return null;\n}\n","/** Language display names for prompt context (keyed by ISO 639-1 base language code) */\nexport const LANGUAGE_DISPLAY_NAMES: Record<string, string> = {\n ko: 'Korean (한국어)',\n ja: 'Japanese (日本語)',\n zh: 'Chinese (中文)',\n en: 'English',\n fr: 'French (Français)',\n de: 'German (Deutsch)',\n es: 'Spanish (Español)',\n pt: 'Portuguese (Português)',\n ru: 'Russian (Русский)',\n uk: 'Ukrainian (Українська)',\n it: 'Italian (Italiano)',\n};\n\n/**\n * Get human-readable display name for a BCP 47 or ISO 639-1 language code.\n */\nexport function getLanguageDisplayName(code?: string): string {\n if (!code) return 'unknown';\n const baseCode = code.split('-')[0];\n return LANGUAGE_DISPLAY_NAMES[baseCode] ?? code;\n}\n\n/**\n * Build language description string from document languages.\n * @returns e.g. 
\"primarily written in Korean (한국어), with English also present\"\n */\nexport function buildLanguageDescription(documentLanguages: string[]): string {\n const primaryName = getLanguageDisplayName(documentLanguages[0]);\n const otherNames = documentLanguages\n .slice(1)\n .map((code) => getLanguageDisplayName(code));\n return otherNames.length > 0\n ? `primarily written in ${primaryName}, with ${otherNames.join(', ')} also present`\n : `written in ${primaryName}`;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACKO,IAAM,sBAAsB;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAMO,IAAM,yBAA8C,IAAI;AAAA,EAC7D;AACF;AAGO,SAAS,gBAAgB,KAAsC;AACpE,SAAO,uBAAuB,IAAI,GAAG;AACvC;AAMA,IAAM,qBAAuD;AAAA,EAC3D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AASO,SAAS,iBAAiB,KAAsC;AACrE,MAAI,gBAAgB,GAAG,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,IAAI,YAAY;AAC9B,QAAM,SAAS,mBAAmB,KAAK;AACvC,MAAI,QAAQ;AACV,WAAO;AAAA,EACT;AAEA,SAAO;AACT;;;ACpGO,IAAM,yBAAiD;AAAA,EAC5D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AAKO,SAAS,uBAAuB,MAAuB;AAC5D,MAAI,CAAC,KAAM,QAAO;AAClB,QAAM,WAAW,KAAK,MAAM,GAAG,EAAE,CAAC;AAClC,SAAO,uBAAuB,QAAQ,KAAK;AAC7C;AAMO,SAAS,yBAAyB,mBAAqC;AAC5E,QAAM,cAAc,uBAAuB,kBAAkB,CAAC,CAAC;AAC/D,QAAM,aAAa,kBAChB,MAAM,CAAC,EACP,IAAI,CAAC,SAAS,uBAAuB,IAAI,CAAC;AAC7C,SAAO
,WAAW,SAAS,IACvB,wBAAwB,WAAW,UAAU,WAAW,KAAK,IAAI,CAAC,kBAClE,cAAc,WAAW;AAC/B;","names":[]}
package/dist/index.d.cts CHANGED
@@ -19,6 +19,18 @@ declare function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag;
19
19
  */
20
20
  declare function normalizeToBcp47(tag: string): Bcp47LanguageTag | null;
21
21
 
22
+ /** Language display names for prompt context (keyed by ISO 639-1 base language code) */
23
+ declare const LANGUAGE_DISPLAY_NAMES: Record<string, string>;
24
+ /**
25
+ * Get human-readable display name for a BCP 47 or ISO 639-1 language code.
26
+ */
27
+ declare function getLanguageDisplayName(code?: string): string;
28
+ /**
29
+ * Build language description string from document languages.
30
+ * @returns e.g. "primarily written in Korean (한국어), with English also present"
31
+ */
32
+ declare function buildLanguageDescription(documentLanguages: string[]): string;
33
+
22
34
  interface DoclingReference {
23
35
  $ref: string;
24
36
  }
@@ -740,4 +752,4 @@ interface OcrStrategy {
740
752
  koreanHanjaMixPages?: number[];
741
753
  }
742
754
 
743
- export { BCP47_LANGUAGE_TAGS, BCP47_LANGUAGE_TAG_SET, type Bcp47LanguageTag, type Caption, type Chapter, type ComponentUsageReport, type DoclingBBox, type DoclingBaseNode, type DoclingBody, type DoclingDocument, type DoclingGroupItem, type DoclingOrigin, type DoclingPage, type DoclingPageImage, type DoclingPictureItem, type DoclingProv, type DoclingReference, type DoclingTableCell, type DoclingTableData, type DoclingTableItem, type DoclingTextItem, type DocumentProcessResult, type ModelUsageDetail, type OcrStrategy, type PageRange, type PhaseUsageReport, type ProcessedDocument, type ProcessedFootnote, type ProcessedImage, type ProcessedTable, type ProcessedTableCell, type TextBlock, type TokenUsageReport, type TokenUsageSummary, isValidBcp47Tag, normalizeToBcp47 };
755
+ export { BCP47_LANGUAGE_TAGS, BCP47_LANGUAGE_TAG_SET, type Bcp47LanguageTag, type Caption, type Chapter, type ComponentUsageReport, type DoclingBBox, type DoclingBaseNode, type DoclingBody, type DoclingDocument, type DoclingGroupItem, type DoclingOrigin, type DoclingPage, type DoclingPageImage, type DoclingPictureItem, type DoclingProv, type DoclingReference, type DoclingTableCell, type DoclingTableData, type DoclingTableItem, type DoclingTextItem, type DocumentProcessResult, LANGUAGE_DISPLAY_NAMES, type ModelUsageDetail, type OcrStrategy, type PageRange, type PhaseUsageReport, type ProcessedDocument, type ProcessedFootnote, type ProcessedImage, type ProcessedTable, type ProcessedTableCell, type TextBlock, type TokenUsageReport, type TokenUsageSummary, buildLanguageDescription, getLanguageDisplayName, isValidBcp47Tag, normalizeToBcp47 };
package/dist/index.d.ts CHANGED
@@ -19,6 +19,18 @@ declare function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag;
19
19
  */
20
20
  declare function normalizeToBcp47(tag: string): Bcp47LanguageTag | null;
21
21
 
22
+ /** Language display names for prompt context (keyed by ISO 639-1 base language code) */
23
+ declare const LANGUAGE_DISPLAY_NAMES: Record<string, string>;
24
+ /**
25
+ * Get human-readable display name for a BCP 47 or ISO 639-1 language code.
26
+ */
27
+ declare function getLanguageDisplayName(code?: string): string;
28
+ /**
29
+ * Build language description string from document languages.
30
+ * @returns e.g. "primarily written in Korean (한국어), with English also present"
31
+ */
32
+ declare function buildLanguageDescription(documentLanguages: string[]): string;
33
+
22
34
  interface DoclingReference {
23
35
  $ref: string;
24
36
  }
@@ -740,4 +752,4 @@ interface OcrStrategy {
740
752
  koreanHanjaMixPages?: number[];
741
753
  }
742
754
 
743
- export { BCP47_LANGUAGE_TAGS, BCP47_LANGUAGE_TAG_SET, type Bcp47LanguageTag, type Caption, type Chapter, type ComponentUsageReport, type DoclingBBox, type DoclingBaseNode, type DoclingBody, type DoclingDocument, type DoclingGroupItem, type DoclingOrigin, type DoclingPage, type DoclingPageImage, type DoclingPictureItem, type DoclingProv, type DoclingReference, type DoclingTableCell, type DoclingTableData, type DoclingTableItem, type DoclingTextItem, type DocumentProcessResult, type ModelUsageDetail, type OcrStrategy, type PageRange, type PhaseUsageReport, type ProcessedDocument, type ProcessedFootnote, type ProcessedImage, type ProcessedTable, type ProcessedTableCell, type TextBlock, type TokenUsageReport, type TokenUsageSummary, isValidBcp47Tag, normalizeToBcp47 };
755
+ export { BCP47_LANGUAGE_TAGS, BCP47_LANGUAGE_TAG_SET, type Bcp47LanguageTag, type Caption, type Chapter, type ComponentUsageReport, type DoclingBBox, type DoclingBaseNode, type DoclingBody, type DoclingDocument, type DoclingGroupItem, type DoclingOrigin, type DoclingPage, type DoclingPageImage, type DoclingPictureItem, type DoclingProv, type DoclingReference, type DoclingTableCell, type DoclingTableData, type DoclingTableItem, type DoclingTextItem, type DocumentProcessResult, LANGUAGE_DISPLAY_NAMES, type ModelUsageDetail, type OcrStrategy, type PageRange, type PhaseUsageReport, type ProcessedDocument, type ProcessedFootnote, type ProcessedImage, type ProcessedTable, type ProcessedTableCell, type TextBlock, type TokenUsageReport, type TokenUsageSummary, buildLanguageDescription, getLanguageDisplayName, isValidBcp47Tag, normalizeToBcp47 };
package/dist/index.js CHANGED
@@ -1,4 +1,4 @@
1
- // src/bcp47-language-tag.ts
1
+ // src/language/bcp47-language-tag.ts
2
2
  var BCP47_LANGUAGE_TAGS = [
3
3
  "ar-SA",
4
4
  "ars-SA",
@@ -74,9 +74,37 @@ function normalizeToBcp47(tag) {
74
74
  }
75
75
  return null;
76
76
  }
77
+
78
+ // src/language/language-display.ts
79
+ var LANGUAGE_DISPLAY_NAMES = {
80
+ ko: "Korean (\uD55C\uAD6D\uC5B4)",
81
+ ja: "Japanese (\u65E5\u672C\u8A9E)",
82
+ zh: "Chinese (\u4E2D\u6587)",
83
+ en: "English",
84
+ fr: "French (Fran\xE7ais)",
85
+ de: "German (Deutsch)",
86
+ es: "Spanish (Espa\xF1ol)",
87
+ pt: "Portuguese (Portugu\xEAs)",
88
+ ru: "Russian (\u0420\u0443\u0441\u0441\u043A\u0438\u0439)",
89
+ uk: "Ukrainian (\u0423\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u0430)",
90
+ it: "Italian (Italiano)"
91
+ };
92
+ function getLanguageDisplayName(code) {
93
+ if (!code) return "unknown";
94
+ const baseCode = code.split("-")[0];
95
+ return LANGUAGE_DISPLAY_NAMES[baseCode] ?? code;
96
+ }
97
+ function buildLanguageDescription(documentLanguages) {
98
+ const primaryName = getLanguageDisplayName(documentLanguages[0]);
99
+ const otherNames = documentLanguages.slice(1).map((code) => getLanguageDisplayName(code));
100
+ return otherNames.length > 0 ? `primarily written in ${primaryName}, with ${otherNames.join(", ")} also present` : `written in ${primaryName}`;
101
+ }
77
102
  export {
78
103
  BCP47_LANGUAGE_TAGS,
79
104
  BCP47_LANGUAGE_TAG_SET,
105
+ LANGUAGE_DISPLAY_NAMES,
106
+ buildLanguageDescription,
107
+ getLanguageDisplayName,
80
108
  isValidBcp47Tag,
81
109
  normalizeToBcp47
82
110
  };
package/dist/index.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/bcp47-language-tag.ts"],"sourcesContent":["/**\n * Language tags supported by the ocrmac OCR engine (via docling-serve).\n * These are the only tags that can be passed to the Docling pipeline without\n * triggering \"Invalid language preference\" errors.\n */\nexport const BCP47_LANGUAGE_TAGS = [\n 'ar-SA',\n 'ars-SA',\n 'cs-CZ',\n 'da-DK',\n 'de-DE',\n 'en-US',\n 'es-ES',\n 'fr-FR',\n 'id-ID',\n 'it-IT',\n 'ja-JP',\n 'ko-KR',\n 'ms-MY',\n 'nb-NO',\n 'nl-NL',\n 'nn-NO',\n 'no-NO',\n 'pl-PL',\n 'pt-BR',\n 'ro-RO',\n 'ru-RU',\n 'sv-SE',\n 'th-TH',\n 'tr-TR',\n 'uk-UA',\n 'vi-VT',\n 'yue-Hans',\n 'yue-Hant',\n 'zh-Hans',\n 'zh-Hant',\n] as const;\n\n/** Union type of all supported BCP 47 language tags */\nexport type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];\n\n/** Set for O(1) lookup of valid BCP 47 tags */\nexport const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string> = new Set(\n BCP47_LANGUAGE_TAGS,\n);\n\n/** Check whether a string is a valid BCP 47 language tag */\nexport function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag {\n return BCP47_LANGUAGE_TAG_SET.has(tag);\n}\n\n/**\n * Maps bare language codes to their default BCP 47 tag.\n * Used when VLM returns only a language code without a region subtag.\n */\nconst DEFAULT_REGION_MAP: Record<string, Bcp47LanguageTag> = {\n ar: 'ar-SA',\n cs: 'cs-CZ',\n da: 'da-DK',\n de: 'de-DE',\n en: 'en-US',\n es: 'es-ES',\n fr: 'fr-FR',\n id: 'id-ID',\n it: 'it-IT',\n ja: 'ja-JP',\n ko: 'ko-KR',\n ms: 'ms-MY',\n nl: 'nl-NL',\n no: 'no-NO',\n pl: 'pl-PL',\n pt: 'pt-BR',\n ro: 'ro-RO',\n ru: 'ru-RU',\n sv: 'sv-SE',\n th: 'th-TH',\n tr: 'tr-TR',\n uk: 'uk-UA',\n vi: 'vi-VT',\n zh: 'zh-Hans',\n};\n\n/**\n * Normalize a language string to a valid BCP 47 tag.\n *\n * - If the input is already a valid full tag (e.g. \"en-US\"), return it as-is.\n * - If it is a bare language code (e.g. \"en\", \"ko\"), map it to the default region.\n * - Otherwise return null (e.g. 
\"und\", \"unknown\", empty string).\n */\nexport function normalizeToBcp47(tag: string): Bcp47LanguageTag | null {\n if (isValidBcp47Tag(tag)) {\n return tag;\n }\n\n const lower = tag.toLowerCase();\n const mapped = DEFAULT_REGION_MAP[lower];\n if (mapped) {\n return mapped;\n }\n\n return null;\n}\n"],"mappings":";AAKO,IAAM,sBAAsB;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAMO,IAAM,yBAA8C,IAAI;AAAA,EAC7D;AACF;AAGO,SAAS,gBAAgB,KAAsC;AACpE,SAAO,uBAAuB,IAAI,GAAG;AACvC;AAMA,IAAM,qBAAuD;AAAA,EAC3D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AASO,SAAS,iBAAiB,KAAsC;AACrE,MAAI,gBAAgB,GAAG,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,IAAI,YAAY;AAC9B,QAAM,SAAS,mBAAmB,KAAK;AACvC,MAAI,QAAQ;AACV,WAAO;AAAA,EACT;AAEA,SAAO;AACT;","names":[]}
1
+ {"version":3,"sources":["../src/language/bcp47-language-tag.ts","../src/language/language-display.ts"],"sourcesContent":["/**\n * Language tags supported by the ocrmac OCR engine (via docling-serve).\n * These are the only tags that can be passed to the Docling pipeline without\n * triggering \"Invalid language preference\" errors.\n */\nexport const BCP47_LANGUAGE_TAGS = [\n 'ar-SA',\n 'ars-SA',\n 'cs-CZ',\n 'da-DK',\n 'de-DE',\n 'en-US',\n 'es-ES',\n 'fr-FR',\n 'id-ID',\n 'it-IT',\n 'ja-JP',\n 'ko-KR',\n 'ms-MY',\n 'nb-NO',\n 'nl-NL',\n 'nn-NO',\n 'no-NO',\n 'pl-PL',\n 'pt-BR',\n 'ro-RO',\n 'ru-RU',\n 'sv-SE',\n 'th-TH',\n 'tr-TR',\n 'uk-UA',\n 'vi-VT',\n 'yue-Hans',\n 'yue-Hant',\n 'zh-Hans',\n 'zh-Hant',\n] as const;\n\n/** Union type of all supported BCP 47 language tags */\nexport type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];\n\n/** Set for O(1) lookup of valid BCP 47 tags */\nexport const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string> = new Set(\n BCP47_LANGUAGE_TAGS,\n);\n\n/** Check whether a string is a valid BCP 47 language tag */\nexport function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag {\n return BCP47_LANGUAGE_TAG_SET.has(tag);\n}\n\n/**\n * Maps bare language codes to their default BCP 47 tag.\n * Used when VLM returns only a language code without a region subtag.\n */\nconst DEFAULT_REGION_MAP: Record<string, Bcp47LanguageTag> = {\n ar: 'ar-SA',\n cs: 'cs-CZ',\n da: 'da-DK',\n de: 'de-DE',\n en: 'en-US',\n es: 'es-ES',\n fr: 'fr-FR',\n id: 'id-ID',\n it: 'it-IT',\n ja: 'ja-JP',\n ko: 'ko-KR',\n ms: 'ms-MY',\n nl: 'nl-NL',\n no: 'no-NO',\n pl: 'pl-PL',\n pt: 'pt-BR',\n ro: 'ro-RO',\n ru: 'ru-RU',\n sv: 'sv-SE',\n th: 'th-TH',\n tr: 'tr-TR',\n uk: 'uk-UA',\n vi: 'vi-VT',\n zh: 'zh-Hans',\n};\n\n/**\n * Normalize a language string to a valid BCP 47 tag.\n *\n * - If the input is already a valid full tag (e.g. \"en-US\"), return it as-is.\n * - If it is a bare language code (e.g. 
\"en\", \"ko\"), map it to the default region.\n * - Otherwise return null (e.g. \"und\", \"unknown\", empty string).\n */\nexport function normalizeToBcp47(tag: string): Bcp47LanguageTag | null {\n if (isValidBcp47Tag(tag)) {\n return tag;\n }\n\n const lower = tag.toLowerCase();\n const mapped = DEFAULT_REGION_MAP[lower];\n if (mapped) {\n return mapped;\n }\n\n return null;\n}\n","/** Language display names for prompt context (keyed by ISO 639-1 base language code) */\nexport const LANGUAGE_DISPLAY_NAMES: Record<string, string> = {\n ko: 'Korean (한국어)',\n ja: 'Japanese (日本語)',\n zh: 'Chinese (中文)',\n en: 'English',\n fr: 'French (Français)',\n de: 'German (Deutsch)',\n es: 'Spanish (Español)',\n pt: 'Portuguese (Português)',\n ru: 'Russian (Русский)',\n uk: 'Ukrainian (Українська)',\n it: 'Italian (Italiano)',\n};\n\n/**\n * Get human-readable display name for a BCP 47 or ISO 639-1 language code.\n */\nexport function getLanguageDisplayName(code?: string): string {\n if (!code) return 'unknown';\n const baseCode = code.split('-')[0];\n return LANGUAGE_DISPLAY_NAMES[baseCode] ?? code;\n}\n\n/**\n * Build language description string from document languages.\n * @returns e.g. \"primarily written in Korean (한국어), with English also present\"\n */\nexport function buildLanguageDescription(documentLanguages: string[]): string {\n const primaryName = getLanguageDisplayName(documentLanguages[0]);\n const otherNames = documentLanguages\n .slice(1)\n .map((code) => getLanguageDisplayName(code));\n return otherNames.length > 0\n ? 
`primarily written in ${primaryName}, with ${otherNames.join(', ')} also present`\n : `written in ${primaryName}`;\n}\n"],"mappings":";AAKO,IAAM,sBAAsB;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAMO,IAAM,yBAA8C,IAAI;AAAA,EAC7D;AACF;AAGO,SAAS,gBAAgB,KAAsC;AACpE,SAAO,uBAAuB,IAAI,GAAG;AACvC;AAMA,IAAM,qBAAuD;AAAA,EAC3D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AASO,SAAS,iBAAiB,KAAsC;AACrE,MAAI,gBAAgB,GAAG,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,IAAI,YAAY;AAC9B,QAAM,SAAS,mBAAmB,KAAK;AACvC,MAAI,QAAQ;AACV,WAAO;AAAA,EACT;AAEA,SAAO;AACT;;;ACpGO,IAAM,yBAAiD;AAAA,EAC5D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AAKO,SAAS,uBAAuB,MAAuB;AAC5D,MAAI,CAAC,KAAM,QAAO;AAClB,QAAM,WAAW,KAAK,MAAM,GAAG,EAAE,CAAC;AAClC,SAAO,uBAAuB,QAAQ,KAAK;AAC7C;AAMO,SAAS,yBAAyB,mBAAqC;AAC5E,QAAM,cAAc,uBAAuB,kBAAkB,CAAC,CAAC;AAC/D,QAAM,aAAa,kBAChB,MAAM,CAAC,EACP,IAAI,CAAC,SAAS,uBAAuB,IAAI,CAAC;AAC7C,SAAO,WAAW,SAAS,IACvB,wBAAwB,WAAW,UAAU,WAAW,KAAK,IAAI,CAAC,kBAClE,cAAc,WAAW;AAC/B;","names":[]}
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "@heripo/model",
3
3
  "private": false,
4
4
  "type": "module",
5
- "version": "0.1.16",
5
+ "version": "0.1.17",
6
6
  "description": "Document models and type definitions for heripo engine",
7
7
  "main": "dist/index.cjs",
8
8
  "module": "dist/index.js",
@@ -51,8 +51,11 @@
51
51
  "access": "public"
52
52
  },
53
53
  "devDependencies": {
54
+ "@vitest/coverage-v8": "^4.1.0",
54
55
  "tsup": "^8.5.1",
56
+ "vitest": "^4.1.0",
55
57
  "@heripo/tsconfig": "0.0.0",
58
+ "@heripo/vitest-config": "0.0.0",
56
59
  "@heripo/tsup-config": "0.0.0"
57
60
  },
58
61
  "scripts": {
@@ -60,6 +63,11 @@
60
63
  "build": "pnpm clean && tsup",
61
64
  "dev": "tsup --watch",
62
65
  "typecheck": "tsc --noEmit",
66
+ "test": "vitest run",
67
+ "test:watch": "vitest",
68
+ "test:ci": "TEST_MODE=ci vitest run --coverage",
69
+ "test:coverage": "vitest run --coverage",
70
+ "test:coverage:watch": "vitest --coverage",
63
71
  "lint": "eslint src/**/*.ts",
64
72
  "lint:fix": "eslint src/**/*.ts --fix"
65
73
  }