@heripo/model 0.1.12 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +10 -105
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -3
- package/dist/index.d.ts +4 -3
- package/dist/index.js +10 -105
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -29,82 +29,36 @@ module.exports = __toCommonJS(index_exports);
|
|
|
29
29
|
|
|
30
30
|
// src/bcp47-language-tag.ts
|
|
31
31
|
var BCP47_LANGUAGE_TAGS = [
|
|
32
|
-
"af-ZA",
|
|
33
|
-
"am-ET",
|
|
34
32
|
"ar-SA",
|
|
35
|
-
"as-IN",
|
|
36
|
-
"az-AZ",
|
|
37
|
-
"be-BY",
|
|
38
|
-
"bg-BG",
|
|
39
|
-
"bn-IN",
|
|
40
|
-
"bs-BA",
|
|
41
|
-
"ca-ES",
|
|
33
|
+
"ars-SA",
|
|
42
34
|
"cs-CZ",
|
|
43
|
-
"cy-GB",
|
|
44
35
|
"da-DK",
|
|
45
36
|
"de-DE",
|
|
46
|
-
"el-GR",
|
|
47
37
|
"en-US",
|
|
48
38
|
"es-ES",
|
|
49
|
-
"et-EE",
|
|
50
|
-
"eu-ES",
|
|
51
|
-
"fa-IR",
|
|
52
|
-
"fi-FI",
|
|
53
39
|
"fr-FR",
|
|
54
|
-
"ga-IE",
|
|
55
|
-
"gl-ES",
|
|
56
|
-
"gu-IN",
|
|
57
|
-
"he-IL",
|
|
58
|
-
"hi-IN",
|
|
59
|
-
"hr-HR",
|
|
60
|
-
"hu-HU",
|
|
61
|
-
"hy-AM",
|
|
62
40
|
"id-ID",
|
|
63
|
-
"is-IS",
|
|
64
41
|
"it-IT",
|
|
65
42
|
"ja-JP",
|
|
66
|
-
"ka-GE",
|
|
67
|
-
"kk-KZ",
|
|
68
|
-
"km-KH",
|
|
69
|
-
"kn-IN",
|
|
70
43
|
"ko-KR",
|
|
71
|
-
"lo-LA",
|
|
72
|
-
"lt-LT",
|
|
73
|
-
"lv-LV",
|
|
74
|
-
"mk-MK",
|
|
75
|
-
"ml-IN",
|
|
76
|
-
"mn-MN",
|
|
77
|
-
"mr-IN",
|
|
78
44
|
"ms-MY",
|
|
79
|
-
"my-MM",
|
|
80
|
-
"ne-NP",
|
|
45
|
+
"nb-NO",
|
|
81
46
|
"nl-NL",
|
|
47
|
+
"nn-NO",
|
|
82
48
|
"no-NO",
|
|
83
|
-
"or-IN",
|
|
84
|
-
"pa-IN",
|
|
85
49
|
"pl-PL",
|
|
86
50
|
"pt-BR",
|
|
87
|
-
"pt-PT",
|
|
88
51
|
"ro-RO",
|
|
89
52
|
"ru-RU",
|
|
90
|
-
"si-LK",
|
|
91
|
-
"sk-SK",
|
|
92
|
-
"sl-SI",
|
|
93
|
-
"sq-AL",
|
|
94
|
-
"sr-RS",
|
|
95
53
|
"sv-SE",
|
|
96
|
-
"sw-KE",
|
|
97
|
-
"ta-IN",
|
|
98
|
-
"te-IN",
|
|
99
54
|
"th-TH",
|
|
100
55
|
"tr-TR",
|
|
101
56
|
"uk-UA",
|
|
102
|
-
"
|
|
103
|
-
"
|
|
104
|
-
"vi-VN",
|
|
105
|
-
"zh-CN",
|
|
106
|
-
"zh-Hant",
|
|
107
|
-
"zh-TW"
|
|
57
|
+
"vi-VT",
|
|
58
|
+
"yue-Hans",
|
|
59
|
+
"yue-Hant",
|
|
60
|
+
"zh-Hans",
|
|
61
|
+
"zh-Hant"
|
|
108
62
|
];
|
|
109
63
|
var BCP47_LANGUAGE_TAG_SET = new Set(
|
|
110
64
|
BCP47_LANGUAGE_TAGS
|
|
@@ -113,79 +67,30 @@ function isValidBcp47Tag(tag) {
|
|
|
113
67
|
return BCP47_LANGUAGE_TAG_SET.has(tag);
|
|
114
68
|
}
|
|
115
69
|
var DEFAULT_REGION_MAP = {
|
|
116
|
-
af: "af-ZA",
|
|
117
|
-
am: "am-ET",
|
|
118
70
|
ar: "ar-SA",
|
|
119
|
-
as: "as-IN",
|
|
120
|
-
az: "az-AZ",
|
|
121
|
-
be: "be-BY",
|
|
122
|
-
bg: "bg-BG",
|
|
123
|
-
bn: "bn-IN",
|
|
124
|
-
bs: "bs-BA",
|
|
125
|
-
ca: "ca-ES",
|
|
126
71
|
cs: "cs-CZ",
|
|
127
|
-
cy: "cy-GB",
|
|
128
72
|
da: "da-DK",
|
|
129
73
|
de: "de-DE",
|
|
130
|
-
el: "el-GR",
|
|
131
74
|
en: "en-US",
|
|
132
75
|
es: "es-ES",
|
|
133
|
-
et: "et-EE",
|
|
134
|
-
eu: "eu-ES",
|
|
135
|
-
fa: "fa-IR",
|
|
136
|
-
fi: "fi-FI",
|
|
137
76
|
fr: "fr-FR",
|
|
138
|
-
ga: "ga-IE",
|
|
139
|
-
gl: "gl-ES",
|
|
140
|
-
gu: "gu-IN",
|
|
141
|
-
he: "he-IL",
|
|
142
|
-
hi: "hi-IN",
|
|
143
|
-
hr: "hr-HR",
|
|
144
|
-
hu: "hu-HU",
|
|
145
|
-
hy: "hy-AM",
|
|
146
77
|
id: "id-ID",
|
|
147
|
-
is: "is-IS",
|
|
148
78
|
it: "it-IT",
|
|
149
79
|
ja: "ja-JP",
|
|
150
|
-
ka: "ka-GE",
|
|
151
|
-
kk: "kk-KZ",
|
|
152
|
-
km: "km-KH",
|
|
153
|
-
kn: "kn-IN",
|
|
154
80
|
ko: "ko-KR",
|
|
155
|
-
lo: "lo-LA",
|
|
156
|
-
lt: "lt-LT",
|
|
157
|
-
lv: "lv-LV",
|
|
158
|
-
mk: "mk-MK",
|
|
159
|
-
ml: "ml-IN",
|
|
160
|
-
mn: "mn-MN",
|
|
161
|
-
mr: "mr-IN",
|
|
162
81
|
ms: "ms-MY",
|
|
163
|
-
my: "my-MM",
|
|
164
|
-
ne: "ne-NP",
|
|
165
82
|
nl: "nl-NL",
|
|
166
83
|
no: "no-NO",
|
|
167
|
-
or: "or-IN",
|
|
168
|
-
pa: "pa-IN",
|
|
169
84
|
pl: "pl-PL",
|
|
170
85
|
pt: "pt-BR",
|
|
171
86
|
ro: "ro-RO",
|
|
172
87
|
ru: "ru-RU",
|
|
173
|
-
si: "si-LK",
|
|
174
|
-
sk: "sk-SK",
|
|
175
|
-
sl: "sl-SI",
|
|
176
|
-
sq: "sq-AL",
|
|
177
|
-
sr: "sr-RS",
|
|
178
88
|
sv: "sv-SE",
|
|
179
|
-
sw: "sw-KE",
|
|
180
|
-
ta: "ta-IN",
|
|
181
|
-
te: "te-IN",
|
|
182
89
|
th: "th-TH",
|
|
183
90
|
tr: "tr-TR",
|
|
184
91
|
uk: "uk-UA",
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
vi: "vi-VN",
|
|
188
|
-
zh: "zh-CN"
|
|
92
|
+
vi: "vi-VT",
|
|
93
|
+
zh: "zh-Hans"
|
|
189
94
|
};
|
|
190
95
|
function normalizeToBcp47(tag) {
|
|
191
96
|
if (isValidBcp47Tag(tag)) {
|
package/dist/index.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/index.ts","../src/bcp47-language-tag.ts"],"sourcesContent":["export type * from './bcp47-language-tag';\nexport {\n BCP47_LANGUAGE_TAGS,\n BCP47_LANGUAGE_TAG_SET,\n isValidBcp47Tag,\n normalizeToBcp47,\n} from './bcp47-language-tag';\nexport type * from './docling-document';\nexport type * from './processed-document';\nexport type * from './token-usage-report';\nexport type * from './document-process-result';\nexport type * from './ocr-strategy';\n","/**\n *
|
|
1
|
+
{"version":3,"sources":["../src/index.ts","../src/bcp47-language-tag.ts"],"sourcesContent":["export type * from './bcp47-language-tag';\nexport {\n BCP47_LANGUAGE_TAGS,\n BCP47_LANGUAGE_TAG_SET,\n isValidBcp47Tag,\n normalizeToBcp47,\n} from './bcp47-language-tag';\nexport type * from './docling-document';\nexport type * from './processed-document';\nexport type * from './token-usage-report';\nexport type * from './document-process-result';\nexport type * from './ocr-strategy';\n","/**\n * Language tags supported by the ocrmac OCR engine (via docling-serve).\n * These are the only tags that can be passed to the Docling pipeline without\n * triggering \"Invalid language preference\" errors.\n */\nexport const BCP47_LANGUAGE_TAGS = [\n 'ar-SA',\n 'ars-SA',\n 'cs-CZ',\n 'da-DK',\n 'de-DE',\n 'en-US',\n 'es-ES',\n 'fr-FR',\n 'id-ID',\n 'it-IT',\n 'ja-JP',\n 'ko-KR',\n 'ms-MY',\n 'nb-NO',\n 'nl-NL',\n 'nn-NO',\n 'no-NO',\n 'pl-PL',\n 'pt-BR',\n 'ro-RO',\n 'ru-RU',\n 'sv-SE',\n 'th-TH',\n 'tr-TR',\n 'uk-UA',\n 'vi-VT',\n 'yue-Hans',\n 'yue-Hant',\n 'zh-Hans',\n 'zh-Hant',\n] as const;\n\n/** Union type of all supported BCP 47 language tags */\nexport type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];\n\n/** Set for O(1) lookup of valid BCP 47 tags */\nexport const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string> = new Set(\n BCP47_LANGUAGE_TAGS,\n);\n\n/** Check whether a string is a valid BCP 47 language tag */\nexport function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag {\n return BCP47_LANGUAGE_TAG_SET.has(tag);\n}\n\n/**\n * Maps bare language codes to their default BCP 47 tag.\n * Used when VLM returns only a language code without a region subtag.\n */\nconst DEFAULT_REGION_MAP: Record<string, Bcp47LanguageTag> = {\n ar: 'ar-SA',\n cs: 'cs-CZ',\n da: 'da-DK',\n de: 'de-DE',\n en: 'en-US',\n es: 'es-ES',\n fr: 'fr-FR',\n id: 'id-ID',\n it: 'it-IT',\n ja: 'ja-JP',\n ko: 'ko-KR',\n ms: 'ms-MY',\n nl: 'nl-NL',\n no: 'no-NO',\n pl: 'pl-PL',\n pt: 
'pt-BR',\n ro: 'ro-RO',\n ru: 'ru-RU',\n sv: 'sv-SE',\n th: 'th-TH',\n tr: 'tr-TR',\n uk: 'uk-UA',\n vi: 'vi-VT',\n zh: 'zh-Hans',\n};\n\n/**\n * Normalize a language string to a valid BCP 47 tag.\n *\n * - If the input is already a valid full tag (e.g. \"en-US\"), return it as-is.\n * - If it is a bare language code (e.g. \"en\", \"ko\"), map it to the default region.\n * - Otherwise return null (e.g. \"und\", \"unknown\", empty string).\n */\nexport function normalizeToBcp47(tag: string): Bcp47LanguageTag | null {\n if (isValidBcp47Tag(tag)) {\n return tag;\n }\n\n const lower = tag.toLowerCase();\n const mapped = DEFAULT_REGION_MAP[lower];\n if (mapped) {\n return mapped;\n }\n\n return null;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACKO,IAAM,sBAAsB;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAMO,IAAM,yBAA8C,IAAI;AAAA,EAC7D;AACF;AAGO,SAAS,gBAAgB,KAAsC;AACpE,SAAO,uBAAuB,IAAI,GAAG;AACvC;AAMA,IAAM,qBAAuD;AAAA,EAC3D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AASO,SAAS,iBAAiB,KAAsC;AACrE,MAAI,gBAAgB,GAAG,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,IAAI,YAAY;AAC9B,QAAM,SAAS,mBAAmB,KAAK;AACvC,MAAI,QAAQ;AACV,WAAO;AAAA,EACT;AAEA,SAAO;AACT;","names":[]}
|
package/dist/index.d.cts
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
3
|
-
*
|
|
2
|
+
* Language tags supported by the ocrmac OCR engine (via docling-serve).
|
|
3
|
+
* These are the only tags that can be passed to the Docling pipeline without
|
|
4
|
+
* triggering "Invalid language preference" errors.
|
|
4
5
|
*/
|
|
5
|
-
declare const BCP47_LANGUAGE_TAGS: readonly ["
|
|
6
|
+
declare const BCP47_LANGUAGE_TAGS: readonly ["ar-SA", "ars-SA", "cs-CZ", "da-DK", "de-DE", "en-US", "es-ES", "fr-FR", "id-ID", "it-IT", "ja-JP", "ko-KR", "ms-MY", "nb-NO", "nl-NL", "nn-NO", "no-NO", "pl-PL", "pt-BR", "ro-RO", "ru-RU", "sv-SE", "th-TH", "tr-TR", "uk-UA", "vi-VT", "yue-Hans", "yue-Hant", "zh-Hans", "zh-Hant"];
|
|
6
7
|
/** Union type of all supported BCP 47 language tags */
|
|
7
8
|
type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];
|
|
8
9
|
/** Set for O(1) lookup of valid BCP 47 tags */
|
package/dist/index.d.ts
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
3
|
-
*
|
|
2
|
+
* Language tags supported by the ocrmac OCR engine (via docling-serve).
|
|
3
|
+
* These are the only tags that can be passed to the Docling pipeline without
|
|
4
|
+
* triggering "Invalid language preference" errors.
|
|
4
5
|
*/
|
|
5
|
-
declare const BCP47_LANGUAGE_TAGS: readonly ["
|
|
6
|
+
declare const BCP47_LANGUAGE_TAGS: readonly ["ar-SA", "ars-SA", "cs-CZ", "da-DK", "de-DE", "en-US", "es-ES", "fr-FR", "id-ID", "it-IT", "ja-JP", "ko-KR", "ms-MY", "nb-NO", "nl-NL", "nn-NO", "no-NO", "pl-PL", "pt-BR", "ro-RO", "ru-RU", "sv-SE", "th-TH", "tr-TR", "uk-UA", "vi-VT", "yue-Hans", "yue-Hant", "zh-Hans", "zh-Hant"];
|
|
6
7
|
/** Union type of all supported BCP 47 language tags */
|
|
7
8
|
type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];
|
|
8
9
|
/** Set for O(1) lookup of valid BCP 47 tags */
|
package/dist/index.js
CHANGED
|
@@ -1,81 +1,35 @@
|
|
|
1
1
|
// src/bcp47-language-tag.ts
|
|
2
2
|
var BCP47_LANGUAGE_TAGS = [
|
|
3
|
-
"af-ZA",
|
|
4
|
-
"am-ET",
|
|
5
3
|
"ar-SA",
|
|
6
|
-
"as-IN",
|
|
7
|
-
"az-AZ",
|
|
8
|
-
"be-BY",
|
|
9
|
-
"bg-BG",
|
|
10
|
-
"bn-IN",
|
|
11
|
-
"bs-BA",
|
|
12
|
-
"ca-ES",
|
|
4
|
+
"ars-SA",
|
|
13
5
|
"cs-CZ",
|
|
14
|
-
"cy-GB",
|
|
15
6
|
"da-DK",
|
|
16
7
|
"de-DE",
|
|
17
|
-
"el-GR",
|
|
18
8
|
"en-US",
|
|
19
9
|
"es-ES",
|
|
20
|
-
"et-EE",
|
|
21
|
-
"eu-ES",
|
|
22
|
-
"fa-IR",
|
|
23
|
-
"fi-FI",
|
|
24
10
|
"fr-FR",
|
|
25
|
-
"ga-IE",
|
|
26
|
-
"gl-ES",
|
|
27
|
-
"gu-IN",
|
|
28
|
-
"he-IL",
|
|
29
|
-
"hi-IN",
|
|
30
|
-
"hr-HR",
|
|
31
|
-
"hu-HU",
|
|
32
|
-
"hy-AM",
|
|
33
11
|
"id-ID",
|
|
34
|
-
"is-IS",
|
|
35
12
|
"it-IT",
|
|
36
13
|
"ja-JP",
|
|
37
|
-
"ka-GE",
|
|
38
|
-
"kk-KZ",
|
|
39
|
-
"km-KH",
|
|
40
|
-
"kn-IN",
|
|
41
14
|
"ko-KR",
|
|
42
|
-
"lo-LA",
|
|
43
|
-
"lt-LT",
|
|
44
|
-
"lv-LV",
|
|
45
|
-
"mk-MK",
|
|
46
|
-
"ml-IN",
|
|
47
|
-
"mn-MN",
|
|
48
|
-
"mr-IN",
|
|
49
15
|
"ms-MY",
|
|
50
|
-
"my-MM",
|
|
51
|
-
"ne-NP",
|
|
16
|
+
"nb-NO",
|
|
52
17
|
"nl-NL",
|
|
18
|
+
"nn-NO",
|
|
53
19
|
"no-NO",
|
|
54
|
-
"or-IN",
|
|
55
|
-
"pa-IN",
|
|
56
20
|
"pl-PL",
|
|
57
21
|
"pt-BR",
|
|
58
|
-
"pt-PT",
|
|
59
22
|
"ro-RO",
|
|
60
23
|
"ru-RU",
|
|
61
|
-
"si-LK",
|
|
62
|
-
"sk-SK",
|
|
63
|
-
"sl-SI",
|
|
64
|
-
"sq-AL",
|
|
65
|
-
"sr-RS",
|
|
66
24
|
"sv-SE",
|
|
67
|
-
"sw-KE",
|
|
68
|
-
"ta-IN",
|
|
69
|
-
"te-IN",
|
|
70
25
|
"th-TH",
|
|
71
26
|
"tr-TR",
|
|
72
27
|
"uk-UA",
|
|
73
|
-
"
|
|
74
|
-
"
|
|
75
|
-
"vi-VN",
|
|
76
|
-
"zh-CN",
|
|
77
|
-
"zh-Hant",
|
|
78
|
-
"zh-TW"
|
|
28
|
+
"vi-VT",
|
|
29
|
+
"yue-Hans",
|
|
30
|
+
"yue-Hant",
|
|
31
|
+
"zh-Hans",
|
|
32
|
+
"zh-Hant"
|
|
79
33
|
];
|
|
80
34
|
var BCP47_LANGUAGE_TAG_SET = new Set(
|
|
81
35
|
BCP47_LANGUAGE_TAGS
|
|
@@ -84,79 +38,30 @@ function isValidBcp47Tag(tag) {
|
|
|
84
38
|
return BCP47_LANGUAGE_TAG_SET.has(tag);
|
|
85
39
|
}
|
|
86
40
|
var DEFAULT_REGION_MAP = {
|
|
87
|
-
af: "af-ZA",
|
|
88
|
-
am: "am-ET",
|
|
89
41
|
ar: "ar-SA",
|
|
90
|
-
as: "as-IN",
|
|
91
|
-
az: "az-AZ",
|
|
92
|
-
be: "be-BY",
|
|
93
|
-
bg: "bg-BG",
|
|
94
|
-
bn: "bn-IN",
|
|
95
|
-
bs: "bs-BA",
|
|
96
|
-
ca: "ca-ES",
|
|
97
42
|
cs: "cs-CZ",
|
|
98
|
-
cy: "cy-GB",
|
|
99
43
|
da: "da-DK",
|
|
100
44
|
de: "de-DE",
|
|
101
|
-
el: "el-GR",
|
|
102
45
|
en: "en-US",
|
|
103
46
|
es: "es-ES",
|
|
104
|
-
et: "et-EE",
|
|
105
|
-
eu: "eu-ES",
|
|
106
|
-
fa: "fa-IR",
|
|
107
|
-
fi: "fi-FI",
|
|
108
47
|
fr: "fr-FR",
|
|
109
|
-
ga: "ga-IE",
|
|
110
|
-
gl: "gl-ES",
|
|
111
|
-
gu: "gu-IN",
|
|
112
|
-
he: "he-IL",
|
|
113
|
-
hi: "hi-IN",
|
|
114
|
-
hr: "hr-HR",
|
|
115
|
-
hu: "hu-HU",
|
|
116
|
-
hy: "hy-AM",
|
|
117
48
|
id: "id-ID",
|
|
118
|
-
is: "is-IS",
|
|
119
49
|
it: "it-IT",
|
|
120
50
|
ja: "ja-JP",
|
|
121
|
-
ka: "ka-GE",
|
|
122
|
-
kk: "kk-KZ",
|
|
123
|
-
km: "km-KH",
|
|
124
|
-
kn: "kn-IN",
|
|
125
51
|
ko: "ko-KR",
|
|
126
|
-
lo: "lo-LA",
|
|
127
|
-
lt: "lt-LT",
|
|
128
|
-
lv: "lv-LV",
|
|
129
|
-
mk: "mk-MK",
|
|
130
|
-
ml: "ml-IN",
|
|
131
|
-
mn: "mn-MN",
|
|
132
|
-
mr: "mr-IN",
|
|
133
52
|
ms: "ms-MY",
|
|
134
|
-
my: "my-MM",
|
|
135
|
-
ne: "ne-NP",
|
|
136
53
|
nl: "nl-NL",
|
|
137
54
|
no: "no-NO",
|
|
138
|
-
or: "or-IN",
|
|
139
|
-
pa: "pa-IN",
|
|
140
55
|
pl: "pl-PL",
|
|
141
56
|
pt: "pt-BR",
|
|
142
57
|
ro: "ro-RO",
|
|
143
58
|
ru: "ru-RU",
|
|
144
|
-
si: "si-LK",
|
|
145
|
-
sk: "sk-SK",
|
|
146
|
-
sl: "sl-SI",
|
|
147
|
-
sq: "sq-AL",
|
|
148
|
-
sr: "sr-RS",
|
|
149
59
|
sv: "sv-SE",
|
|
150
|
-
sw: "sw-KE",
|
|
151
|
-
ta: "ta-IN",
|
|
152
|
-
te: "te-IN",
|
|
153
60
|
th: "th-TH",
|
|
154
61
|
tr: "tr-TR",
|
|
155
62
|
uk: "uk-UA",
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
vi: "vi-VN",
|
|
159
|
-
zh: "zh-CN"
|
|
63
|
+
vi: "vi-VT",
|
|
64
|
+
zh: "zh-Hans"
|
|
160
65
|
};
|
|
161
66
|
function normalizeToBcp47(tag) {
|
|
162
67
|
if (isValidBcp47Tag(tag)) {
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/bcp47-language-tag.ts"],"sourcesContent":["/**\n *
|
|
1
|
+
{"version":3,"sources":["../src/bcp47-language-tag.ts"],"sourcesContent":["/**\n * Language tags supported by the ocrmac OCR engine (via docling-serve).\n * These are the only tags that can be passed to the Docling pipeline without\n * triggering \"Invalid language preference\" errors.\n */\nexport const BCP47_LANGUAGE_TAGS = [\n 'ar-SA',\n 'ars-SA',\n 'cs-CZ',\n 'da-DK',\n 'de-DE',\n 'en-US',\n 'es-ES',\n 'fr-FR',\n 'id-ID',\n 'it-IT',\n 'ja-JP',\n 'ko-KR',\n 'ms-MY',\n 'nb-NO',\n 'nl-NL',\n 'nn-NO',\n 'no-NO',\n 'pl-PL',\n 'pt-BR',\n 'ro-RO',\n 'ru-RU',\n 'sv-SE',\n 'th-TH',\n 'tr-TR',\n 'uk-UA',\n 'vi-VT',\n 'yue-Hans',\n 'yue-Hant',\n 'zh-Hans',\n 'zh-Hant',\n] as const;\n\n/** Union type of all supported BCP 47 language tags */\nexport type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];\n\n/** Set for O(1) lookup of valid BCP 47 tags */\nexport const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string> = new Set(\n BCP47_LANGUAGE_TAGS,\n);\n\n/** Check whether a string is a valid BCP 47 language tag */\nexport function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag {\n return BCP47_LANGUAGE_TAG_SET.has(tag);\n}\n\n/**\n * Maps bare language codes to their default BCP 47 tag.\n * Used when VLM returns only a language code without a region subtag.\n */\nconst DEFAULT_REGION_MAP: Record<string, Bcp47LanguageTag> = {\n ar: 'ar-SA',\n cs: 'cs-CZ',\n da: 'da-DK',\n de: 'de-DE',\n en: 'en-US',\n es: 'es-ES',\n fr: 'fr-FR',\n id: 'id-ID',\n it: 'it-IT',\n ja: 'ja-JP',\n ko: 'ko-KR',\n ms: 'ms-MY',\n nl: 'nl-NL',\n no: 'no-NO',\n pl: 'pl-PL',\n pt: 'pt-BR',\n ro: 'ro-RO',\n ru: 'ru-RU',\n sv: 'sv-SE',\n th: 'th-TH',\n tr: 'tr-TR',\n uk: 'uk-UA',\n vi: 'vi-VT',\n zh: 'zh-Hans',\n};\n\n/**\n * Normalize a language string to a valid BCP 47 tag.\n *\n * - If the input is already a valid full tag (e.g. \"en-US\"), return it as-is.\n * - If it is a bare language code (e.g. \"en\", \"ko\"), map it to the default region.\n * - Otherwise return null (e.g. 
\"und\", \"unknown\", empty string).\n */\nexport function normalizeToBcp47(tag: string): Bcp47LanguageTag | null {\n if (isValidBcp47Tag(tag)) {\n return tag;\n }\n\n const lower = tag.toLowerCase();\n const mapped = DEFAULT_REGION_MAP[lower];\n if (mapped) {\n return mapped;\n }\n\n return null;\n}\n"],"mappings":";AAKO,IAAM,sBAAsB;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAMO,IAAM,yBAA8C,IAAI;AAAA,EAC7D;AACF;AAGO,SAAS,gBAAgB,KAAsC;AACpE,SAAO,uBAAuB,IAAI,GAAG;AACvC;AAMA,IAAM,qBAAuD;AAAA,EAC3D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AASO,SAAS,iBAAiB,KAAsC;AACrE,MAAI,gBAAgB,GAAG,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,IAAI,YAAY;AAC9B,QAAM,SAAS,mBAAmB,KAAK;AACvC,MAAI,QAAQ;AACV,WAAO;AAAA,EACT;AAEA,SAAO;AACT;","names":[]}
|
package/package.json
CHANGED