@heripo/model 0.1.15 → 0.1.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ko.md +137 -15
- package/README.md +137 -15
- package/dist/index.cjs +32 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +13 -1
- package/dist/index.d.ts +13 -1
- package/dist/index.js +29 -1
- package/dist/index.js.map +1 -1
- package/package.json +9 -1
package/README.ko.md
CHANGED
|
@@ -72,10 +72,11 @@ import type { ProcessedDocument } from '@heripo/model';
|
|
|
72
72
|
|
|
73
73
|
interface ProcessedDocument {
|
|
74
74
|
reportId: string; // 리포트 ID
|
|
75
|
-
pageRangeMap: PageRange
|
|
75
|
+
pageRangeMap: Record<number, PageRange>; // PDF 페이지 → 문서 페이지 매핑
|
|
76
76
|
chapters: Chapter[]; // 계층적 챕터 구조
|
|
77
77
|
images: ProcessedImage[]; // 추출된 이미지 메타데이터
|
|
78
78
|
tables: ProcessedTable[]; // 추출된 테이블 데이터
|
|
79
|
+
footnotes: ProcessedFootnote[]; // 추출된 각주
|
|
79
80
|
}
|
|
80
81
|
```
|
|
81
82
|
|
|
@@ -89,12 +90,14 @@ import type { Chapter } from '@heripo/model';
|
|
|
89
90
|
interface Chapter {
|
|
90
91
|
id: string; // 챕터 ID
|
|
91
92
|
title: string; // 챕터 제목
|
|
93
|
+
originTitle: string; // 원본 제목
|
|
92
94
|
level: number; // 계층 레벨 (1, 2, 3, ...)
|
|
93
|
-
pageNo
|
|
95
|
+
pageNo: number; // 시작 페이지 번호
|
|
94
96
|
textBlocks: TextBlock[]; // 텍스트 블록
|
|
95
97
|
imageIds: string[]; // 이미지 ID 참조
|
|
96
98
|
tableIds: string[]; // 테이블 ID 참조
|
|
97
|
-
|
|
99
|
+
footnoteIds: string[]; // 각주 ID 참조
|
|
100
|
+
children?: Chapter[]; // 하위 챕터 (선택)
|
|
98
101
|
}
|
|
99
102
|
```
|
|
100
103
|
|
|
@@ -107,7 +110,7 @@ import type { TextBlock } from '@heripo/model';
|
|
|
107
110
|
|
|
108
111
|
interface TextBlock {
|
|
109
112
|
text: string; // 텍스트 내용
|
|
110
|
-
|
|
113
|
+
pdfPageNo: number; // PDF 페이지 번호
|
|
111
114
|
}
|
|
112
115
|
```
|
|
113
116
|
|
|
@@ -121,8 +124,8 @@ import type { ProcessedImage } from '@heripo/model';
|
|
|
121
124
|
interface ProcessedImage {
|
|
122
125
|
id: string; // 이미지 ID
|
|
123
126
|
caption?: Caption; // 캡션 (선택)
|
|
124
|
-
pdfPageNo
|
|
125
|
-
|
|
127
|
+
pdfPageNo: number; // PDF 페이지 번호
|
|
128
|
+
path: string; // 이미지 파일 경로
|
|
126
129
|
}
|
|
127
130
|
```
|
|
128
131
|
|
|
@@ -136,8 +139,8 @@ import type { ProcessedTable } from '@heripo/model';
|
|
|
136
139
|
interface ProcessedTable {
|
|
137
140
|
id: string; // 테이블 ID
|
|
138
141
|
caption?: Caption; // 캡션 (선택)
|
|
139
|
-
pdfPageNo
|
|
140
|
-
|
|
142
|
+
pdfPageNo: number; // PDF 페이지 번호
|
|
143
|
+
grid: ProcessedTableCell[][]; // 2D 그리드 데이터
|
|
141
144
|
numRows: number; // 행 개수
|
|
142
145
|
numCols: number; // 열 개수
|
|
143
146
|
}
|
|
@@ -152,8 +155,8 @@ import type { ProcessedTableCell } from '@heripo/model';
|
|
|
152
155
|
|
|
153
156
|
interface ProcessedTableCell {
|
|
154
157
|
text: string; // 셀 텍스트
|
|
155
|
-
|
|
156
|
-
|
|
158
|
+
rowSpan: number; // 행 병합
|
|
159
|
+
colSpan: number; // 열 병합
|
|
157
160
|
isHeader: boolean; // 헤더 셀 여부
|
|
158
161
|
}
|
|
159
162
|
```
|
|
@@ -166,7 +169,7 @@ interface ProcessedTableCell {
|
|
|
166
169
|
import type { Caption } from '@heripo/model';
|
|
167
170
|
|
|
168
171
|
interface Caption {
|
|
169
|
-
num?:
|
|
172
|
+
num?: string; // 캡션 번호 (예: "그림 1"의 "1")
|
|
170
173
|
fullText: string; // 전체 캡션 텍스트
|
|
171
174
|
}
|
|
172
175
|
```
|
|
@@ -179,11 +182,130 @@ PDF 페이지와 문서 페이지 매핑입니다.
|
|
|
179
182
|
import type { PageRange } from '@heripo/model';
|
|
180
183
|
|
|
181
184
|
interface PageRange {
|
|
185
|
+
startPageNo: number; // 시작 페이지 번호
|
|
186
|
+
endPageNo: number; // 끝 페이지 번호
|
|
187
|
+
}
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### ProcessedFootnote
|
|
191
|
+
|
|
192
|
+
문서에서 추출된 각주입니다.
|
|
193
|
+
|
|
194
|
+
```typescript
|
|
195
|
+
import type { ProcessedFootnote } from '@heripo/model';
|
|
196
|
+
|
|
197
|
+
interface ProcessedFootnote {
|
|
198
|
+
id: string; // 각주 ID
|
|
199
|
+
text: string; // 각주 텍스트
|
|
182
200
|
pdfPageNo: number; // PDF 페이지 번호
|
|
183
|
-
pageNo: number; // 문서 논리적 페이지 번호
|
|
184
201
|
}
|
|
185
202
|
```
|
|
186
203
|
|
|
204
|
+
### DocumentProcessResult
|
|
205
|
+
|
|
206
|
+
문서 처리 결과로, 처리된 문서와 토큰 사용량 리포트를 포함합니다.
|
|
207
|
+
|
|
208
|
+
```typescript
|
|
209
|
+
import type { DocumentProcessResult } from '@heripo/model';
|
|
210
|
+
|
|
211
|
+
interface DocumentProcessResult {
|
|
212
|
+
document: ProcessedDocument; // 처리된 문서
|
|
213
|
+
usage: TokenUsageReport; // 토큰 사용량 리포트
|
|
214
|
+
}
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### OcrStrategy
|
|
218
|
+
|
|
219
|
+
OCR 전략 선택 결과입니다.
|
|
220
|
+
|
|
221
|
+
```typescript
|
|
222
|
+
import type { OcrStrategy } from '@heripo/model';
|
|
223
|
+
|
|
224
|
+
interface OcrStrategy {
|
|
225
|
+
method: 'ocrmac' | 'vlm'; // OCR 방법
|
|
226
|
+
ocrLanguages?: string[]; // OCR 언어
|
|
227
|
+
detectedLanguages?: Bcp47LanguageTag[]; // 감지된 BCP-47 언어 태그
|
|
228
|
+
reason: string; // 전략 선택 이유
|
|
229
|
+
sampledPages: number; // 샘플링된 페이지 수
|
|
230
|
+
totalPages: number; // 문서 전체 페이지 수
|
|
231
|
+
koreanHanjaMixPages?: number[]; // 한국어-한자 혼용 페이지
|
|
232
|
+
}
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
### 토큰 사용량 타입
|
|
236
|
+
|
|
237
|
+
처리 단계별 LLM 토큰 사용량을 추적하기 위한 타입입니다.
|
|
238
|
+
|
|
239
|
+
```typescript
|
|
240
|
+
import type {
|
|
241
|
+
ComponentUsageReport,
|
|
242
|
+
ModelUsageDetail,
|
|
243
|
+
PhaseUsageReport,
|
|
244
|
+
TokenUsageReport,
|
|
245
|
+
TokenUsageSummary,
|
|
246
|
+
} from '@heripo/model';
|
|
247
|
+
|
|
248
|
+
interface TokenUsageReport {
|
|
249
|
+
components: ComponentUsageReport[]; // 컴포넌트별 사용량
|
|
250
|
+
total: TokenUsageSummary; // 전체 사용량 요약
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
interface ComponentUsageReport {
|
|
254
|
+
component: string; // 컴포넌트 이름
|
|
255
|
+
phases: PhaseUsageReport[]; // 단계별 사용량
|
|
256
|
+
total: TokenUsageSummary; // 컴포넌트 합계
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
interface PhaseUsageReport {
|
|
260
|
+
phase: string; // 단계 이름
|
|
261
|
+
primary?: ModelUsageDetail; // 기본 모델 사용량
|
|
262
|
+
fallback?: ModelUsageDetail; // 폴백 모델 사용량
|
|
263
|
+
total: TokenUsageSummary; // 단계 합계
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
interface ModelUsageDetail {
|
|
267
|
+
modelName: string; // 모델 이름
|
|
268
|
+
inputTokens: number; // 입력 토큰 수
|
|
269
|
+
outputTokens: number; // 출력 토큰 수
|
|
270
|
+
totalTokens: number; // 전체 토큰 수
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
interface TokenUsageSummary {
|
|
274
|
+
inputTokens: number; // 입력 토큰 수
|
|
275
|
+
outputTokens: number; // 출력 토큰 수
|
|
276
|
+
totalTokens: number; // 전체 토큰 수
|
|
277
|
+
}
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
### BCP-47 언어 태그 유틸리티
|
|
281
|
+
|
|
282
|
+
BCP-47 언어 태그를 다루기 위한 유틸리티입니다.
|
|
283
|
+
|
|
284
|
+
```typescript
|
|
285
|
+
import {
|
|
286
|
+
type Bcp47LanguageTag,
|
|
287
|
+
BCP47_LANGUAGE_TAGS,
|
|
288
|
+
BCP47_LANGUAGE_TAG_SET,
|
|
289
|
+
isValidBcp47Tag,
|
|
290
|
+
normalizeToBcp47,
|
|
291
|
+
} from '@heripo/model';
|
|
292
|
+
|
|
293
|
+
// Bcp47LanguageTag - 지원되는 BCP-47 언어 태그의 유니온 타입
|
|
294
|
+
type Bcp47LanguageTag = 'ko-KR' | 'en-US' | 'ja-JP' | 'zh-Hans' /* | ... */;
|
|
295
|
+
|
|
296
|
+
// BCP47_LANGUAGE_TAGS - 30개 지원 태그의 상수 배열
|
|
297
|
+
const BCP47_LANGUAGE_TAGS: readonly Bcp47LanguageTag[];
|
|
298
|
+
|
|
299
|
+
// BCP47_LANGUAGE_TAG_SET - O(1) 조회를 위한 ReadonlySet
|
|
300
|
+
const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string>;
|
|
301
|
+
|
|
302
|
+
// isValidBcp47Tag - 문자열이 유효한 BCP-47 태그인지 확인
|
|
303
|
+
function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag;
|
|
304
|
+
|
|
305
|
+
// normalizeToBcp47 - 언어 문자열을 BCP-47 형식으로 정규화
|
|
306
|
+
function normalizeToBcp47(tag: string): Bcp47LanguageTag | null;
|
|
307
|
+
```
|
|
308
|
+
|
|
187
309
|
## 사용법
|
|
188
310
|
|
|
189
311
|
### ProcessedDocument 읽기
|
|
@@ -200,7 +322,7 @@ function analyzeDocument(doc: ProcessedDocument) {
|
|
|
200
322
|
console.log(` 텍스트 블록: ${chapter.textBlocks.length}개`);
|
|
201
323
|
console.log(` 이미지: ${chapter.imageIds.length}개`);
|
|
202
324
|
console.log(` 테이블: ${chapter.tableIds.length}개`);
|
|
203
|
-
console.log(` 하위 챕터: ${chapter.children
|
|
325
|
+
console.log(` 하위 챕터: ${chapter.children?.length ?? 0}개`);
|
|
204
326
|
});
|
|
205
327
|
|
|
206
328
|
// 이미지 확인
|
|
@@ -209,7 +331,7 @@ function analyzeDocument(doc: ProcessedDocument) {
|
|
|
209
331
|
if (image.caption) {
|
|
210
332
|
console.log(` 캡션: ${image.caption.fullText}`);
|
|
211
333
|
}
|
|
212
|
-
console.log(` 경로: ${image.
|
|
334
|
+
console.log(` 경로: ${image.path}`);
|
|
213
335
|
});
|
|
214
336
|
|
|
215
337
|
// 테이블 확인
|
|
@@ -233,7 +355,7 @@ function traverseChapters(chapter: Chapter, depth: number = 0) {
|
|
|
233
355
|
console.log(`${indent}- ${chapter.title}`);
|
|
234
356
|
|
|
235
357
|
// 재귀적으로 하위 챕터 순회
|
|
236
|
-
chapter.children
|
|
358
|
+
chapter.children?.forEach((child) => {
|
|
237
359
|
traverseChapters(child, depth + 1);
|
|
238
360
|
});
|
|
239
361
|
}
|
package/README.md
CHANGED
|
@@ -72,10 +72,11 @@ import type { ProcessedDocument } from '@heripo/model';
|
|
|
72
72
|
|
|
73
73
|
interface ProcessedDocument {
|
|
74
74
|
reportId: string; // Report ID
|
|
75
|
-
pageRangeMap: PageRange
|
|
75
|
+
pageRangeMap: Record<number, PageRange>; // PDF page → document page mapping
|
|
76
76
|
chapters: Chapter[]; // Hierarchical chapter structure
|
|
77
77
|
images: ProcessedImage[]; // Extracted image metadata
|
|
78
78
|
tables: ProcessedTable[]; // Extracted table data
|
|
79
|
+
footnotes: ProcessedFootnote[]; // Extracted footnotes
|
|
79
80
|
}
|
|
80
81
|
```
|
|
81
82
|
|
|
@@ -89,12 +90,14 @@ import type { Chapter } from '@heripo/model';
|
|
|
89
90
|
interface Chapter {
|
|
90
91
|
id: string; // Chapter ID
|
|
91
92
|
title: string; // Chapter title
|
|
93
|
+
originTitle: string; // Original title from source
|
|
92
94
|
level: number; // Hierarchy level (1, 2, 3, ...)
|
|
93
|
-
pageNo
|
|
95
|
+
pageNo: number; // Start page number
|
|
94
96
|
textBlocks: TextBlock[]; // Text blocks
|
|
95
97
|
imageIds: string[]; // Image ID references
|
|
96
98
|
tableIds: string[]; // Table ID references
|
|
97
|
-
|
|
99
|
+
footnoteIds: string[]; // Footnote ID references
|
|
100
|
+
children?: Chapter[]; // Sub-chapters (optional)
|
|
98
101
|
}
|
|
99
102
|
```
|
|
100
103
|
|
|
@@ -107,7 +110,7 @@ import type { TextBlock } from '@heripo/model';
|
|
|
107
110
|
|
|
108
111
|
interface TextBlock {
|
|
109
112
|
text: string; // Text content
|
|
110
|
-
|
|
113
|
+
pdfPageNo: number; // PDF page number
|
|
111
114
|
}
|
|
112
115
|
```
|
|
113
116
|
|
|
@@ -121,8 +124,8 @@ import type { ProcessedImage } from '@heripo/model';
|
|
|
121
124
|
interface ProcessedImage {
|
|
122
125
|
id: string; // Image ID
|
|
123
126
|
caption?: Caption; // Caption (optional)
|
|
124
|
-
pdfPageNo
|
|
125
|
-
|
|
127
|
+
pdfPageNo: number; // PDF page number
|
|
128
|
+
path: string; // Image file path
|
|
126
129
|
}
|
|
127
130
|
```
|
|
128
131
|
|
|
@@ -136,8 +139,8 @@ import type { ProcessedTable } from '@heripo/model';
|
|
|
136
139
|
interface ProcessedTable {
|
|
137
140
|
id: string; // Table ID
|
|
138
141
|
caption?: Caption; // Caption (optional)
|
|
139
|
-
pdfPageNo
|
|
140
|
-
|
|
142
|
+
pdfPageNo: number; // PDF page number
|
|
143
|
+
grid: ProcessedTableCell[][]; // 2D grid data
|
|
141
144
|
numRows: number; // Row count
|
|
142
145
|
numCols: number; // Column count
|
|
143
146
|
}
|
|
@@ -152,8 +155,8 @@ import type { ProcessedTableCell } from '@heripo/model';
|
|
|
152
155
|
|
|
153
156
|
interface ProcessedTableCell {
|
|
154
157
|
text: string; // Cell text
|
|
155
|
-
|
|
156
|
-
|
|
158
|
+
rowSpan: number; // Row span
|
|
159
|
+
colSpan: number; // Column span
|
|
157
160
|
isHeader: boolean; // Is header cell
|
|
158
161
|
}
|
|
159
162
|
```
|
|
@@ -166,7 +169,7 @@ Image and table captions.
|
|
|
166
169
|
import type { Caption } from '@heripo/model';
|
|
167
170
|
|
|
168
171
|
interface Caption {
|
|
169
|
-
num?:
|
|
172
|
+
num?: string; // Caption number (e.g., "1" in "Figure 1")
|
|
170
173
|
fullText: string; // Full caption text
|
|
171
174
|
}
|
|
172
175
|
```
|
|
@@ -179,11 +182,130 @@ PDF page to document page mapping.
|
|
|
179
182
|
import type { PageRange } from '@heripo/model';
|
|
180
183
|
|
|
181
184
|
interface PageRange {
|
|
185
|
+
startPageNo: number; // Start page number
|
|
186
|
+
endPageNo: number; // End page number
|
|
187
|
+
}
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### ProcessedFootnote
|
|
191
|
+
|
|
192
|
+
Footnote extracted from the document.
|
|
193
|
+
|
|
194
|
+
```typescript
|
|
195
|
+
import type { ProcessedFootnote } from '@heripo/model';
|
|
196
|
+
|
|
197
|
+
interface ProcessedFootnote {
|
|
198
|
+
id: string; // Footnote ID
|
|
199
|
+
text: string; // Footnote text
|
|
182
200
|
pdfPageNo: number; // PDF page number
|
|
183
|
-
pageNo: number; // Document logical page number
|
|
184
201
|
}
|
|
185
202
|
```
|
|
186
203
|
|
|
204
|
+
### DocumentProcessResult
|
|
205
|
+
|
|
206
|
+
Result of document processing, including the processed document and token usage report.
|
|
207
|
+
|
|
208
|
+
```typescript
|
|
209
|
+
import type { DocumentProcessResult } from '@heripo/model';
|
|
210
|
+
|
|
211
|
+
interface DocumentProcessResult {
|
|
212
|
+
document: ProcessedDocument; // Processed document
|
|
213
|
+
usage: TokenUsageReport; // Token usage report
|
|
214
|
+
}
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### OcrStrategy
|
|
218
|
+
|
|
219
|
+
OCR strategy selection result.
|
|
220
|
+
|
|
221
|
+
```typescript
|
|
222
|
+
import type { OcrStrategy } from '@heripo/model';
|
|
223
|
+
|
|
224
|
+
interface OcrStrategy {
|
|
225
|
+
method: 'ocrmac' | 'vlm'; // OCR method
|
|
226
|
+
ocrLanguages?: string[]; // OCR languages
|
|
227
|
+
detectedLanguages?: Bcp47LanguageTag[]; // Detected BCP-47 language tags
|
|
228
|
+
reason: string; // Reason for strategy selection
|
|
229
|
+
sampledPages: number; // Number of sampled pages
|
|
230
|
+
totalPages: number; // Total pages in document
|
|
231
|
+
koreanHanjaMixPages?: number[]; // Pages with Korean-Hanja mixed script
|
|
232
|
+
}
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
### Token Usage Types
|
|
236
|
+
|
|
237
|
+
Types for tracking LLM token usage across processing phases.
|
|
238
|
+
|
|
239
|
+
```typescript
|
|
240
|
+
import type {
|
|
241
|
+
ComponentUsageReport,
|
|
242
|
+
ModelUsageDetail,
|
|
243
|
+
PhaseUsageReport,
|
|
244
|
+
TokenUsageReport,
|
|
245
|
+
TokenUsageSummary,
|
|
246
|
+
} from '@heripo/model';
|
|
247
|
+
|
|
248
|
+
interface TokenUsageReport {
|
|
249
|
+
components: ComponentUsageReport[]; // Usage per component
|
|
250
|
+
total: TokenUsageSummary; // Total usage summary
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
interface ComponentUsageReport {
|
|
254
|
+
component: string; // Component name
|
|
255
|
+
phases: PhaseUsageReport[]; // Usage per phase
|
|
256
|
+
total: TokenUsageSummary; // Component total
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
interface PhaseUsageReport {
|
|
260
|
+
phase: string; // Phase name
|
|
261
|
+
primary?: ModelUsageDetail; // Primary model usage
|
|
262
|
+
fallback?: ModelUsageDetail; // Fallback model usage
|
|
263
|
+
total: TokenUsageSummary; // Phase total
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
interface ModelUsageDetail {
|
|
267
|
+
modelName: string; // Model name
|
|
268
|
+
inputTokens: number; // Input token count
|
|
269
|
+
outputTokens: number; // Output token count
|
|
270
|
+
totalTokens: number; // Total token count
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
interface TokenUsageSummary {
|
|
274
|
+
inputTokens: number; // Input token count
|
|
275
|
+
outputTokens: number; // Output token count
|
|
276
|
+
totalTokens: number; // Total token count
|
|
277
|
+
}
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
### BCP-47 Language Tag Utilities
|
|
281
|
+
|
|
282
|
+
Utilities for working with BCP-47 language tags.
|
|
283
|
+
|
|
284
|
+
```typescript
|
|
285
|
+
import {
|
|
286
|
+
type Bcp47LanguageTag,
|
|
287
|
+
BCP47_LANGUAGE_TAGS,
|
|
288
|
+
BCP47_LANGUAGE_TAG_SET,
|
|
289
|
+
isValidBcp47Tag,
|
|
290
|
+
normalizeToBcp47,
|
|
291
|
+
} from '@heripo/model';
|
|
292
|
+
|
|
293
|
+
// Bcp47LanguageTag - Union type of supported BCP-47 language tags
|
|
294
|
+
type Bcp47LanguageTag = 'ko-KR' | 'en-US' | 'ja-JP' | 'zh-Hans' /* | ... */;
|
|
295
|
+
|
|
296
|
+
// BCP47_LANGUAGE_TAGS - Const array of 30 supported tags
|
|
297
|
+
const BCP47_LANGUAGE_TAGS: readonly Bcp47LanguageTag[];
|
|
298
|
+
|
|
299
|
+
// BCP47_LANGUAGE_TAG_SET - ReadonlySet for O(1) lookup
|
|
300
|
+
const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string>;
|
|
301
|
+
|
|
302
|
+
// isValidBcp47Tag - Check if a string is a valid BCP-47 tag
|
|
303
|
+
function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag;
|
|
304
|
+
|
|
305
|
+
// normalizeToBcp47 - Normalize a language string to BCP-47 format
|
|
306
|
+
function normalizeToBcp47(tag: string): Bcp47LanguageTag | null;
|
|
307
|
+
```
|
|
308
|
+
|
|
187
309
|
## Usage
|
|
188
310
|
|
|
189
311
|
### Reading ProcessedDocument
|
|
@@ -200,7 +322,7 @@ function analyzeDocument(doc: ProcessedDocument) {
|
|
|
200
322
|
console.log(` Text blocks: ${chapter.textBlocks.length}`);
|
|
201
323
|
console.log(` Images: ${chapter.imageIds.length}`);
|
|
202
324
|
console.log(` Tables: ${chapter.tableIds.length}`);
|
|
203
|
-
console.log(` Sub-chapters: ${chapter.children
|
|
325
|
+
console.log(` Sub-chapters: ${chapter.children?.length ?? 0}`);
|
|
204
326
|
});
|
|
205
327
|
|
|
206
328
|
// Check images
|
|
@@ -209,7 +331,7 @@ function analyzeDocument(doc: ProcessedDocument) {
|
|
|
209
331
|
if (image.caption) {
|
|
210
332
|
console.log(` Caption: ${image.caption.fullText}`);
|
|
211
333
|
}
|
|
212
|
-
console.log(` Path: ${image.
|
|
334
|
+
console.log(` Path: ${image.path}`);
|
|
213
335
|
});
|
|
214
336
|
|
|
215
337
|
// Check tables
|
|
@@ -233,7 +355,7 @@ function traverseChapters(chapter: Chapter, depth: number = 0) {
|
|
|
233
355
|
console.log(`${indent}- ${chapter.title}`);
|
|
234
356
|
|
|
235
357
|
// Recursively traverse sub-chapters
|
|
236
|
-
chapter.children
|
|
358
|
+
chapter.children?.forEach((child) => {
|
|
237
359
|
traverseChapters(child, depth + 1);
|
|
238
360
|
});
|
|
239
361
|
}
|
package/dist/index.cjs
CHANGED
|
@@ -22,12 +22,15 @@ var index_exports = {};
|
|
|
22
22
|
__export(index_exports, {
|
|
23
23
|
BCP47_LANGUAGE_TAGS: () => BCP47_LANGUAGE_TAGS,
|
|
24
24
|
BCP47_LANGUAGE_TAG_SET: () => BCP47_LANGUAGE_TAG_SET,
|
|
25
|
+
LANGUAGE_DISPLAY_NAMES: () => LANGUAGE_DISPLAY_NAMES,
|
|
26
|
+
buildLanguageDescription: () => buildLanguageDescription,
|
|
27
|
+
getLanguageDisplayName: () => getLanguageDisplayName,
|
|
25
28
|
isValidBcp47Tag: () => isValidBcp47Tag,
|
|
26
29
|
normalizeToBcp47: () => normalizeToBcp47
|
|
27
30
|
});
|
|
28
31
|
module.exports = __toCommonJS(index_exports);
|
|
29
32
|
|
|
30
|
-
// src/bcp47-language-tag.ts
|
|
33
|
+
// src/language/bcp47-language-tag.ts
|
|
31
34
|
var BCP47_LANGUAGE_TAGS = [
|
|
32
35
|
"ar-SA",
|
|
33
36
|
"ars-SA",
|
|
@@ -103,10 +106,38 @@ function normalizeToBcp47(tag) {
|
|
|
103
106
|
}
|
|
104
107
|
return null;
|
|
105
108
|
}
|
|
109
|
+
|
|
110
|
+
// src/language/language-display.ts
|
|
111
|
+
var LANGUAGE_DISPLAY_NAMES = {
|
|
112
|
+
ko: "Korean (\uD55C\uAD6D\uC5B4)",
|
|
113
|
+
ja: "Japanese (\u65E5\u672C\u8A9E)",
|
|
114
|
+
zh: "Chinese (\u4E2D\u6587)",
|
|
115
|
+
en: "English",
|
|
116
|
+
fr: "French (Fran\xE7ais)",
|
|
117
|
+
de: "German (Deutsch)",
|
|
118
|
+
es: "Spanish (Espa\xF1ol)",
|
|
119
|
+
pt: "Portuguese (Portugu\xEAs)",
|
|
120
|
+
ru: "Russian (\u0420\u0443\u0441\u0441\u043A\u0438\u0439)",
|
|
121
|
+
uk: "Ukrainian (\u0423\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u0430)",
|
|
122
|
+
it: "Italian (Italiano)"
|
|
123
|
+
};
|
|
124
|
+
function getLanguageDisplayName(code) {
|
|
125
|
+
if (!code) return "unknown";
|
|
126
|
+
const baseCode = code.split("-")[0];
|
|
127
|
+
return LANGUAGE_DISPLAY_NAMES[baseCode] ?? code;
|
|
128
|
+
}
|
|
129
|
+
function buildLanguageDescription(documentLanguages) {
|
|
130
|
+
const primaryName = getLanguageDisplayName(documentLanguages[0]);
|
|
131
|
+
const otherNames = documentLanguages.slice(1).map((code) => getLanguageDisplayName(code));
|
|
132
|
+
return otherNames.length > 0 ? `primarily written in ${primaryName}, with ${otherNames.join(", ")} also present` : `written in ${primaryName}`;
|
|
133
|
+
}
|
|
106
134
|
// Annotate the CommonJS export names for ESM import in node:
|
|
107
135
|
0 && (module.exports = {
|
|
108
136
|
BCP47_LANGUAGE_TAGS,
|
|
109
137
|
BCP47_LANGUAGE_TAG_SET,
|
|
138
|
+
LANGUAGE_DISPLAY_NAMES,
|
|
139
|
+
buildLanguageDescription,
|
|
140
|
+
getLanguageDisplayName,
|
|
110
141
|
isValidBcp47Tag,
|
|
111
142
|
normalizeToBcp47
|
|
112
143
|
});
|
package/dist/index.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/index.ts","../src/bcp47-language-tag.ts"],"sourcesContent":["
|
|
1
|
+
{"version":3,"sources":["../src/index.ts","../src/language/bcp47-language-tag.ts","../src/language/language-display.ts"],"sourcesContent":["// Language utilities\nexport type * from './language/bcp47-language-tag';\nexport {\n BCP47_LANGUAGE_TAGS,\n BCP47_LANGUAGE_TAG_SET,\n isValidBcp47Tag,\n normalizeToBcp47,\n} from './language/bcp47-language-tag';\nexport {\n LANGUAGE_DISPLAY_NAMES,\n buildLanguageDescription,\n getLanguageDisplayName,\n} from './language/language-display';\n\n// Type definitions\nexport type * from './types/docling-document';\nexport type * from './types/document-process-result';\nexport type * from './types/ocr-strategy';\nexport type * from './types/processed-document';\nexport type * from './types/token-usage-report';\n","/**\n * Language tags supported by the ocrmac OCR engine (via docling-serve).\n * These are the only tags that can be passed to the Docling pipeline without\n * triggering \"Invalid language preference\" errors.\n */\nexport const BCP47_LANGUAGE_TAGS = [\n 'ar-SA',\n 'ars-SA',\n 'cs-CZ',\n 'da-DK',\n 'de-DE',\n 'en-US',\n 'es-ES',\n 'fr-FR',\n 'id-ID',\n 'it-IT',\n 'ja-JP',\n 'ko-KR',\n 'ms-MY',\n 'nb-NO',\n 'nl-NL',\n 'nn-NO',\n 'no-NO',\n 'pl-PL',\n 'pt-BR',\n 'ro-RO',\n 'ru-RU',\n 'sv-SE',\n 'th-TH',\n 'tr-TR',\n 'uk-UA',\n 'vi-VT',\n 'yue-Hans',\n 'yue-Hant',\n 'zh-Hans',\n 'zh-Hant',\n] as const;\n\n/** Union type of all supported BCP 47 language tags */\nexport type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];\n\n/** Set for O(1) lookup of valid BCP 47 tags */\nexport const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string> = new Set(\n BCP47_LANGUAGE_TAGS,\n);\n\n/** Check whether a string is a valid BCP 47 language tag */\nexport function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag {\n return BCP47_LANGUAGE_TAG_SET.has(tag);\n}\n\n/**\n * Maps bare language codes to their default BCP 47 tag.\n * Used when VLM returns only a language code without a region subtag.\n */\nconst DEFAULT_REGION_MAP: 
Record<string, Bcp47LanguageTag> = {\n ar: 'ar-SA',\n cs: 'cs-CZ',\n da: 'da-DK',\n de: 'de-DE',\n en: 'en-US',\n es: 'es-ES',\n fr: 'fr-FR',\n id: 'id-ID',\n it: 'it-IT',\n ja: 'ja-JP',\n ko: 'ko-KR',\n ms: 'ms-MY',\n nl: 'nl-NL',\n no: 'no-NO',\n pl: 'pl-PL',\n pt: 'pt-BR',\n ro: 'ro-RO',\n ru: 'ru-RU',\n sv: 'sv-SE',\n th: 'th-TH',\n tr: 'tr-TR',\n uk: 'uk-UA',\n vi: 'vi-VT',\n zh: 'zh-Hans',\n};\n\n/**\n * Normalize a language string to a valid BCP 47 tag.\n *\n * - If the input is already a valid full tag (e.g. \"en-US\"), return it as-is.\n * - If it is a bare language code (e.g. \"en\", \"ko\"), map it to the default region.\n * - Otherwise return null (e.g. \"und\", \"unknown\", empty string).\n */\nexport function normalizeToBcp47(tag: string): Bcp47LanguageTag | null {\n if (isValidBcp47Tag(tag)) {\n return tag;\n }\n\n const lower = tag.toLowerCase();\n const mapped = DEFAULT_REGION_MAP[lower];\n if (mapped) {\n return mapped;\n }\n\n return null;\n}\n","/** Language display names for prompt context (keyed by ISO 639-1 base language code) */\nexport const LANGUAGE_DISPLAY_NAMES: Record<string, string> = {\n ko: 'Korean (한국어)',\n ja: 'Japanese (日本語)',\n zh: 'Chinese (中文)',\n en: 'English',\n fr: 'French (Français)',\n de: 'German (Deutsch)',\n es: 'Spanish (Español)',\n pt: 'Portuguese (Português)',\n ru: 'Russian (Русский)',\n uk: 'Ukrainian (Українська)',\n it: 'Italian (Italiano)',\n};\n\n/**\n * Get human-readable display name for a BCP 47 or ISO 639-1 language code.\n */\nexport function getLanguageDisplayName(code?: string): string {\n if (!code) return 'unknown';\n const baseCode = code.split('-')[0];\n return LANGUAGE_DISPLAY_NAMES[baseCode] ?? code;\n}\n\n/**\n * Build language description string from document languages.\n * @returns e.g. 
\"primarily written in Korean (한국어), with English also present\"\n */\nexport function buildLanguageDescription(documentLanguages: string[]): string {\n const primaryName = getLanguageDisplayName(documentLanguages[0]);\n const otherNames = documentLanguages\n .slice(1)\n .map((code) => getLanguageDisplayName(code));\n return otherNames.length > 0\n ? `primarily written in ${primaryName}, with ${otherNames.join(', ')} also present`\n : `written in ${primaryName}`;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACKO,IAAM,sBAAsB;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAMO,IAAM,yBAA8C,IAAI;AAAA,EAC7D;AACF;AAGO,SAAS,gBAAgB,KAAsC;AACpE,SAAO,uBAAuB,IAAI,GAAG;AACvC;AAMA,IAAM,qBAAuD;AAAA,EAC3D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AASO,SAAS,iBAAiB,KAAsC;AACrE,MAAI,gBAAgB,GAAG,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,IAAI,YAAY;AAC9B,QAAM,SAAS,mBAAmB,KAAK;AACvC,MAAI,QAAQ;AACV,WAAO;AAAA,EACT;AAEA,SAAO;AACT;;;ACpGO,IAAM,yBAAiD;AAAA,EAC5D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AAKO,SAAS,uBAAuB,MAAuB;AAC5D,MAAI,CAAC,KAAM,QAAO;AAClB,QAAM,WAAW,KAAK,MAAM,GAAG,EAAE,CAAC;AAClC,SAAO,uBAAuB,QAAQ,KAAK;AAC7C;AAMO,SAAS,yBAAyB,mBAAqC;AAC5E,QAAM,cAAc,uBAAuB,kBAAkB,CAAC,CAAC;AAC/D,QAAM,aAAa,kBAChB,MAAM,CAAC,EACP,IAAI,CAAC,SAAS,uBAAuB,IAAI,CAAC;AAC7C,SAAO
,WAAW,SAAS,IACvB,wBAAwB,WAAW,UAAU,WAAW,KAAK,IAAI,CAAC,kBAClE,cAAc,WAAW;AAC/B;","names":[]}
|
package/dist/index.d.cts
CHANGED
|
@@ -19,6 +19,18 @@ declare function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag;
|
|
|
19
19
|
*/
|
|
20
20
|
declare function normalizeToBcp47(tag: string): Bcp47LanguageTag | null;
|
|
21
21
|
|
|
22
|
+
/** Language display names for prompt context (keyed by ISO 639-1 base language code) */
|
|
23
|
+
declare const LANGUAGE_DISPLAY_NAMES: Record<string, string>;
|
|
24
|
+
/**
|
|
25
|
+
* Get human-readable display name for a BCP 47 or ISO 639-1 language code.
|
|
26
|
+
*/
|
|
27
|
+
declare function getLanguageDisplayName(code?: string): string;
|
|
28
|
+
/**
|
|
29
|
+
* Build language description string from document languages.
|
|
30
|
+
* @returns e.g. "primarily written in Korean (한국어), with English also present"
|
|
31
|
+
*/
|
|
32
|
+
declare function buildLanguageDescription(documentLanguages: string[]): string;
|
|
33
|
+
|
|
22
34
|
interface DoclingReference {
|
|
23
35
|
$ref: string;
|
|
24
36
|
}
|
|
@@ -740,4 +752,4 @@ interface OcrStrategy {
|
|
|
740
752
|
koreanHanjaMixPages?: number[];
|
|
741
753
|
}
|
|
742
754
|
|
|
743
|
-
export { BCP47_LANGUAGE_TAGS, BCP47_LANGUAGE_TAG_SET, type Bcp47LanguageTag, type Caption, type Chapter, type ComponentUsageReport, type DoclingBBox, type DoclingBaseNode, type DoclingBody, type DoclingDocument, type DoclingGroupItem, type DoclingOrigin, type DoclingPage, type DoclingPageImage, type DoclingPictureItem, type DoclingProv, type DoclingReference, type DoclingTableCell, type DoclingTableData, type DoclingTableItem, type DoclingTextItem, type DocumentProcessResult, type ModelUsageDetail, type OcrStrategy, type PageRange, type PhaseUsageReport, type ProcessedDocument, type ProcessedFootnote, type ProcessedImage, type ProcessedTable, type ProcessedTableCell, type TextBlock, type TokenUsageReport, type TokenUsageSummary, isValidBcp47Tag, normalizeToBcp47 };
|
|
755
|
+
export { BCP47_LANGUAGE_TAGS, BCP47_LANGUAGE_TAG_SET, type Bcp47LanguageTag, type Caption, type Chapter, type ComponentUsageReport, type DoclingBBox, type DoclingBaseNode, type DoclingBody, type DoclingDocument, type DoclingGroupItem, type DoclingOrigin, type DoclingPage, type DoclingPageImage, type DoclingPictureItem, type DoclingProv, type DoclingReference, type DoclingTableCell, type DoclingTableData, type DoclingTableItem, type DoclingTextItem, type DocumentProcessResult, LANGUAGE_DISPLAY_NAMES, type ModelUsageDetail, type OcrStrategy, type PageRange, type PhaseUsageReport, type ProcessedDocument, type ProcessedFootnote, type ProcessedImage, type ProcessedTable, type ProcessedTableCell, type TextBlock, type TokenUsageReport, type TokenUsageSummary, buildLanguageDescription, getLanguageDisplayName, isValidBcp47Tag, normalizeToBcp47 };
|
package/dist/index.d.ts
CHANGED
|
@@ -19,6 +19,18 @@ declare function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag;
|
|
|
19
19
|
*/
|
|
20
20
|
declare function normalizeToBcp47(tag: string): Bcp47LanguageTag | null;
|
|
21
21
|
|
|
22
|
+
/** Language display names for prompt context (keyed by ISO 639-1 base language code) */
|
|
23
|
+
declare const LANGUAGE_DISPLAY_NAMES: Record<string, string>;
|
|
24
|
+
/**
|
|
25
|
+
* Get human-readable display name for a BCP 47 or ISO 639-1 language code.
|
|
26
|
+
*/
|
|
27
|
+
declare function getLanguageDisplayName(code?: string): string;
|
|
28
|
+
/**
|
|
29
|
+
* Build language description string from document languages.
|
|
30
|
+
* @returns e.g. "primarily written in Korean (한국어), with English also present"
|
|
31
|
+
*/
|
|
32
|
+
declare function buildLanguageDescription(documentLanguages: string[]): string;
|
|
33
|
+
|
|
22
34
|
interface DoclingReference {
|
|
23
35
|
$ref: string;
|
|
24
36
|
}
|
|
@@ -740,4 +752,4 @@ interface OcrStrategy {
|
|
|
740
752
|
koreanHanjaMixPages?: number[];
|
|
741
753
|
}
|
|
742
754
|
|
|
743
|
-
export { BCP47_LANGUAGE_TAGS, BCP47_LANGUAGE_TAG_SET, type Bcp47LanguageTag, type Caption, type Chapter, type ComponentUsageReport, type DoclingBBox, type DoclingBaseNode, type DoclingBody, type DoclingDocument, type DoclingGroupItem, type DoclingOrigin, type DoclingPage, type DoclingPageImage, type DoclingPictureItem, type DoclingProv, type DoclingReference, type DoclingTableCell, type DoclingTableData, type DoclingTableItem, type DoclingTextItem, type DocumentProcessResult, type ModelUsageDetail, type OcrStrategy, type PageRange, type PhaseUsageReport, type ProcessedDocument, type ProcessedFootnote, type ProcessedImage, type ProcessedTable, type ProcessedTableCell, type TextBlock, type TokenUsageReport, type TokenUsageSummary, isValidBcp47Tag, normalizeToBcp47 };
|
|
755
|
+
export { BCP47_LANGUAGE_TAGS, BCP47_LANGUAGE_TAG_SET, type Bcp47LanguageTag, type Caption, type Chapter, type ComponentUsageReport, type DoclingBBox, type DoclingBaseNode, type DoclingBody, type DoclingDocument, type DoclingGroupItem, type DoclingOrigin, type DoclingPage, type DoclingPageImage, type DoclingPictureItem, type DoclingProv, type DoclingReference, type DoclingTableCell, type DoclingTableData, type DoclingTableItem, type DoclingTextItem, type DocumentProcessResult, LANGUAGE_DISPLAY_NAMES, type ModelUsageDetail, type OcrStrategy, type PageRange, type PhaseUsageReport, type ProcessedDocument, type ProcessedFootnote, type ProcessedImage, type ProcessedTable, type ProcessedTableCell, type TextBlock, type TokenUsageReport, type TokenUsageSummary, buildLanguageDescription, getLanguageDisplayName, isValidBcp47Tag, normalizeToBcp47 };
|
package/dist/index.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
// src/bcp47-language-tag.ts
|
|
1
|
+
// src/language/bcp47-language-tag.ts
|
|
2
2
|
var BCP47_LANGUAGE_TAGS = [
|
|
3
3
|
"ar-SA",
|
|
4
4
|
"ars-SA",
|
|
@@ -74,9 +74,37 @@ function normalizeToBcp47(tag) {
|
|
|
74
74
|
}
|
|
75
75
|
return null;
|
|
76
76
|
}
|
|
77
|
+
|
|
78
|
+
// src/language/language-display.ts
|
|
79
|
+
var LANGUAGE_DISPLAY_NAMES = {
|
|
80
|
+
ko: "Korean (\uD55C\uAD6D\uC5B4)",
|
|
81
|
+
ja: "Japanese (\u65E5\u672C\u8A9E)",
|
|
82
|
+
zh: "Chinese (\u4E2D\u6587)",
|
|
83
|
+
en: "English",
|
|
84
|
+
fr: "French (Fran\xE7ais)",
|
|
85
|
+
de: "German (Deutsch)",
|
|
86
|
+
es: "Spanish (Espa\xF1ol)",
|
|
87
|
+
pt: "Portuguese (Portugu\xEAs)",
|
|
88
|
+
ru: "Russian (\u0420\u0443\u0441\u0441\u043A\u0438\u0439)",
|
|
89
|
+
uk: "Ukrainian (\u0423\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u0430)",
|
|
90
|
+
it: "Italian (Italiano)"
|
|
91
|
+
};
|
|
92
|
+
function getLanguageDisplayName(code) {
|
|
93
|
+
if (!code) return "unknown";
|
|
94
|
+
const baseCode = code.split("-")[0];
|
|
95
|
+
return LANGUAGE_DISPLAY_NAMES[baseCode] ?? code;
|
|
96
|
+
}
|
|
97
|
+
function buildLanguageDescription(documentLanguages) {
|
|
98
|
+
const primaryName = getLanguageDisplayName(documentLanguages[0]);
|
|
99
|
+
const otherNames = documentLanguages.slice(1).map((code) => getLanguageDisplayName(code));
|
|
100
|
+
return otherNames.length > 0 ? `primarily written in ${primaryName}, with ${otherNames.join(", ")} also present` : `written in ${primaryName}`;
|
|
101
|
+
}
|
|
77
102
|
export {
|
|
78
103
|
BCP47_LANGUAGE_TAGS,
|
|
79
104
|
BCP47_LANGUAGE_TAG_SET,
|
|
105
|
+
LANGUAGE_DISPLAY_NAMES,
|
|
106
|
+
buildLanguageDescription,
|
|
107
|
+
getLanguageDisplayName,
|
|
80
108
|
isValidBcp47Tag,
|
|
81
109
|
normalizeToBcp47
|
|
82
110
|
};
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/bcp47-language-tag.ts"],"sourcesContent":["/**\n * Language tags supported by the ocrmac OCR engine (via docling-serve).\n * These are the only tags that can be passed to the Docling pipeline without\n * triggering \"Invalid language preference\" errors.\n */\nexport const BCP47_LANGUAGE_TAGS = [\n 'ar-SA',\n 'ars-SA',\n 'cs-CZ',\n 'da-DK',\n 'de-DE',\n 'en-US',\n 'es-ES',\n 'fr-FR',\n 'id-ID',\n 'it-IT',\n 'ja-JP',\n 'ko-KR',\n 'ms-MY',\n 'nb-NO',\n 'nl-NL',\n 'nn-NO',\n 'no-NO',\n 'pl-PL',\n 'pt-BR',\n 'ro-RO',\n 'ru-RU',\n 'sv-SE',\n 'th-TH',\n 'tr-TR',\n 'uk-UA',\n 'vi-VT',\n 'yue-Hans',\n 'yue-Hant',\n 'zh-Hans',\n 'zh-Hant',\n] as const;\n\n/** Union type of all supported BCP 47 language tags */\nexport type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];\n\n/** Set for O(1) lookup of valid BCP 47 tags */\nexport const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string> = new Set(\n BCP47_LANGUAGE_TAGS,\n);\n\n/** Check whether a string is a valid BCP 47 language tag */\nexport function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag {\n return BCP47_LANGUAGE_TAG_SET.has(tag);\n}\n\n/**\n * Maps bare language codes to their default BCP 47 tag.\n * Used when VLM returns only a language code without a region subtag.\n */\nconst DEFAULT_REGION_MAP: Record<string, Bcp47LanguageTag> = {\n ar: 'ar-SA',\n cs: 'cs-CZ',\n da: 'da-DK',\n de: 'de-DE',\n en: 'en-US',\n es: 'es-ES',\n fr: 'fr-FR',\n id: 'id-ID',\n it: 'it-IT',\n ja: 'ja-JP',\n ko: 'ko-KR',\n ms: 'ms-MY',\n nl: 'nl-NL',\n no: 'no-NO',\n pl: 'pl-PL',\n pt: 'pt-BR',\n ro: 'ro-RO',\n ru: 'ru-RU',\n sv: 'sv-SE',\n th: 'th-TH',\n tr: 'tr-TR',\n uk: 'uk-UA',\n vi: 'vi-VT',\n zh: 'zh-Hans',\n};\n\n/**\n * Normalize a language string to a valid BCP 47 tag.\n *\n * - If the input is already a valid full tag (e.g. \"en-US\"), return it as-is.\n * - If it is a bare language code (e.g. \"en\", \"ko\"), map it to the default region.\n * - Otherwise return null (e.g. 
\"und\", \"unknown\", empty string).\n */\nexport function normalizeToBcp47(tag: string): Bcp47LanguageTag | null {\n if (isValidBcp47Tag(tag)) {\n return tag;\n }\n\n const lower = tag.toLowerCase();\n const mapped = DEFAULT_REGION_MAP[lower];\n if (mapped) {\n return mapped;\n }\n\n return null;\n}\n"],"mappings":";AAKO,IAAM,sBAAsB;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAMO,IAAM,yBAA8C,IAAI;AAAA,EAC7D;AACF;AAGO,SAAS,gBAAgB,KAAsC;AACpE,SAAO,uBAAuB,IAAI,GAAG;AACvC;AAMA,IAAM,qBAAuD;AAAA,EAC3D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AASO,SAAS,iBAAiB,KAAsC;AACrE,MAAI,gBAAgB,GAAG,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,IAAI,YAAY;AAC9B,QAAM,SAAS,mBAAmB,KAAK;AACvC,MAAI,QAAQ;AACV,WAAO;AAAA,EACT;AAEA,SAAO;AACT;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/language/bcp47-language-tag.ts","../src/language/language-display.ts"],"sourcesContent":["/**\n * Language tags supported by the ocrmac OCR engine (via docling-serve).\n * These are the only tags that can be passed to the Docling pipeline without\n * triggering \"Invalid language preference\" errors.\n */\nexport const BCP47_LANGUAGE_TAGS = [\n 'ar-SA',\n 'ars-SA',\n 'cs-CZ',\n 'da-DK',\n 'de-DE',\n 'en-US',\n 'es-ES',\n 'fr-FR',\n 'id-ID',\n 'it-IT',\n 'ja-JP',\n 'ko-KR',\n 'ms-MY',\n 'nb-NO',\n 'nl-NL',\n 'nn-NO',\n 'no-NO',\n 'pl-PL',\n 'pt-BR',\n 'ro-RO',\n 'ru-RU',\n 'sv-SE',\n 'th-TH',\n 'tr-TR',\n 'uk-UA',\n 'vi-VT',\n 'yue-Hans',\n 'yue-Hant',\n 'zh-Hans',\n 'zh-Hant',\n] as const;\n\n/** Union type of all supported BCP 47 language tags */\nexport type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];\n\n/** Set for O(1) lookup of valid BCP 47 tags */\nexport const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string> = new Set(\n BCP47_LANGUAGE_TAGS,\n);\n\n/** Check whether a string is a valid BCP 47 language tag */\nexport function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag {\n return BCP47_LANGUAGE_TAG_SET.has(tag);\n}\n\n/**\n * Maps bare language codes to their default BCP 47 tag.\n * Used when VLM returns only a language code without a region subtag.\n */\nconst DEFAULT_REGION_MAP: Record<string, Bcp47LanguageTag> = {\n ar: 'ar-SA',\n cs: 'cs-CZ',\n da: 'da-DK',\n de: 'de-DE',\n en: 'en-US',\n es: 'es-ES',\n fr: 'fr-FR',\n id: 'id-ID',\n it: 'it-IT',\n ja: 'ja-JP',\n ko: 'ko-KR',\n ms: 'ms-MY',\n nl: 'nl-NL',\n no: 'no-NO',\n pl: 'pl-PL',\n pt: 'pt-BR',\n ro: 'ro-RO',\n ru: 'ru-RU',\n sv: 'sv-SE',\n th: 'th-TH',\n tr: 'tr-TR',\n uk: 'uk-UA',\n vi: 'vi-VT',\n zh: 'zh-Hans',\n};\n\n/**\n * Normalize a language string to a valid BCP 47 tag.\n *\n * - If the input is already a valid full tag (e.g. \"en-US\"), return it as-is.\n * - If it is a bare language code (e.g. 
\"en\", \"ko\"), map it to the default region.\n * - Otherwise return null (e.g. \"und\", \"unknown\", empty string).\n */\nexport function normalizeToBcp47(tag: string): Bcp47LanguageTag | null {\n if (isValidBcp47Tag(tag)) {\n return tag;\n }\n\n const lower = tag.toLowerCase();\n const mapped = DEFAULT_REGION_MAP[lower];\n if (mapped) {\n return mapped;\n }\n\n return null;\n}\n","/** Language display names for prompt context (keyed by ISO 639-1 base language code) */\nexport const LANGUAGE_DISPLAY_NAMES: Record<string, string> = {\n ko: 'Korean (한국어)',\n ja: 'Japanese (日本語)',\n zh: 'Chinese (中文)',\n en: 'English',\n fr: 'French (Français)',\n de: 'German (Deutsch)',\n es: 'Spanish (Español)',\n pt: 'Portuguese (Português)',\n ru: 'Russian (Русский)',\n uk: 'Ukrainian (Українська)',\n it: 'Italian (Italiano)',\n};\n\n/**\n * Get human-readable display name for a BCP 47 or ISO 639-1 language code.\n */\nexport function getLanguageDisplayName(code?: string): string {\n if (!code) return 'unknown';\n const baseCode = code.split('-')[0];\n return LANGUAGE_DISPLAY_NAMES[baseCode] ?? code;\n}\n\n/**\n * Build language description string from document languages.\n * @returns e.g. \"primarily written in Korean (한국어), with English also present\"\n */\nexport function buildLanguageDescription(documentLanguages: string[]): string {\n const primaryName = getLanguageDisplayName(documentLanguages[0]);\n const otherNames = documentLanguages\n .slice(1)\n .map((code) => getLanguageDisplayName(code));\n return otherNames.length > 0\n ? 
`primarily written in ${primaryName}, with ${otherNames.join(', ')} also present`\n : `written in ${primaryName}`;\n}\n"],"mappings":";AAKO,IAAM,sBAAsB;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAMO,IAAM,yBAA8C,IAAI;AAAA,EAC7D;AACF;AAGO,SAAS,gBAAgB,KAAsC;AACpE,SAAO,uBAAuB,IAAI,GAAG;AACvC;AAMA,IAAM,qBAAuD;AAAA,EAC3D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AASO,SAAS,iBAAiB,KAAsC;AACrE,MAAI,gBAAgB,GAAG,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,IAAI,YAAY;AAC9B,QAAM,SAAS,mBAAmB,KAAK;AACvC,MAAI,QAAQ;AACV,WAAO;AAAA,EACT;AAEA,SAAO;AACT;;;ACpGO,IAAM,yBAAiD;AAAA,EAC5D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AAKO,SAAS,uBAAuB,MAAuB;AAC5D,MAAI,CAAC,KAAM,QAAO;AAClB,QAAM,WAAW,KAAK,MAAM,GAAG,EAAE,CAAC;AAClC,SAAO,uBAAuB,QAAQ,KAAK;AAC7C;AAMO,SAAS,yBAAyB,mBAAqC;AAC5E,QAAM,cAAc,uBAAuB,kBAAkB,CAAC,CAAC;AAC/D,QAAM,aAAa,kBAChB,MAAM,CAAC,EACP,IAAI,CAAC,SAAS,uBAAuB,IAAI,CAAC;AAC7C,SAAO,WAAW,SAAS,IACvB,wBAAwB,WAAW,UAAU,WAAW,KAAK,IAAI,CAAC,kBAClE,cAAc,WAAW;AAC/B;","names":[]}
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "@heripo/model",
|
|
3
3
|
"private": false,
|
|
4
4
|
"type": "module",
|
|
5
|
-
"version": "0.1.15",
|
|
5
|
+
"version": "0.1.17",
|
|
6
6
|
"description": "Document models and type definitions for heripo engine",
|
|
7
7
|
"main": "dist/index.cjs",
|
|
8
8
|
"module": "dist/index.js",
|
|
@@ -51,8 +51,11 @@
|
|
|
51
51
|
"access": "public"
|
|
52
52
|
},
|
|
53
53
|
"devDependencies": {
|
|
54
|
+
"@vitest/coverage-v8": "^4.1.0",
|
|
54
55
|
"tsup": "^8.5.1",
|
|
56
|
+
"vitest": "^4.1.0",
|
|
55
57
|
"@heripo/tsconfig": "0.0.0",
|
|
58
|
+
"@heripo/vitest-config": "0.0.0",
|
|
56
59
|
"@heripo/tsup-config": "0.0.0"
|
|
57
60
|
},
|
|
58
61
|
"scripts": {
|
|
@@ -60,6 +63,11 @@
|
|
|
60
63
|
"build": "pnpm clean && tsup",
|
|
61
64
|
"dev": "tsup --watch",
|
|
62
65
|
"typecheck": "tsc --noEmit",
|
|
66
|
+
"test": "vitest run",
|
|
67
|
+
"test:watch": "vitest",
|
|
68
|
+
"test:ci": "TEST_MODE=ci vitest run --coverage",
|
|
69
|
+
"test:coverage": "vitest run --coverage",
|
|
70
|
+
"test:coverage:watch": "vitest --coverage",
|
|
63
71
|
"lint": "eslint src/**/*.ts",
|
|
64
72
|
"lint:fix": "eslint src/**/*.ts --fix"
|
|
65
73
|
}
|