@heripo/model 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.ko.md CHANGED
@@ -3,7 +3,7 @@
3
3
  > 문서 모델 및 타입 정의
4
4
 
5
5
  [![npm version](https://img.shields.io/npm/v/@heripo/model.svg)](https://www.npmjs.com/package/@heripo/model)
6
- [![Node.js](https://img.shields.io/badge/Node.js-%3E%3D22-339933?logo=node.js&logoColor=white)](https://nodejs.org/)
6
+ [![Node.js](https://img.shields.io/badge/Node.js-%3E%3D24-339933?logo=node.js&logoColor=white)](https://nodejs.org/)
7
7
  [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](../../LICENSE)
8
8
 
9
9
  [English](./README.md) | **한국어**
package/README.md CHANGED
@@ -3,7 +3,7 @@
3
3
  > Document models and type definitions
4
4
 
5
5
  [![npm version](https://img.shields.io/npm/v/@heripo/model.svg)](https://www.npmjs.com/package/@heripo/model)
6
- [![Node.js](https://img.shields.io/badge/Node.js-%3E%3D22-339933?logo=node.js&logoColor=white)](https://nodejs.org/)
6
+ [![Node.js](https://img.shields.io/badge/Node.js-%3E%3D24-339933?logo=node.js&logoColor=white)](https://nodejs.org/)
7
7
  [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](../../LICENSE)
8
8
 
9
9
  **English** | [한국어](./README.ko.md)
package/dist/index.cjs CHANGED
@@ -3,6 +3,10 @@ var __defProp = Object.defineProperty;
3
3
  var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
4
  var __getOwnPropNames = Object.getOwnPropertyNames;
5
5
  var __hasOwnProp = Object.prototype.hasOwnProperty;
6
/**
 * Install one enumerable accessor on `target` for every enumerable key of `all`.
 * The function stored under each key of `all` is used as that property's getter,
 * so the exported value is read at access time rather than copied up front.
 */
var __export = (target, all) => {
  for (var name in all) {
    var getter = all[name];
    __defProp(target, name, { get: getter, enumerable: true });
  }
};
6
10
  var __copyProps = (to, from, except, desc) => {
7
11
  if (from && typeof from === "object" || typeof from === "function") {
8
12
  for (let key of __getOwnPropNames(from))
@@ -15,5 +19,190 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
15
19
 
16
20
  // src/index.ts
17
21
  var index_exports = {};
22
+ __export(index_exports, {
23
+ BCP47_LANGUAGE_TAGS: () => BCP47_LANGUAGE_TAGS,
24
+ BCP47_LANGUAGE_TAG_SET: () => BCP47_LANGUAGE_TAG_SET,
25
+ isValidBcp47Tag: () => isValidBcp47Tag,
26
+ normalizeToBcp47: () => normalizeToBcp47
27
+ });
18
28
  module.exports = __toCommonJS(index_exports);
29
+
30
// src/bcp47-language-tag.ts

/**
 * Supported language tags in their canonical casing.
 * Every membership check in this module is an exact, case-sensitive match
 * against this list.
 */
var BCP47_LANGUAGE_TAGS = [
  "af-ZA", "am-ET", "ar-SA", "as-IN", "az-AZ", "be-BY", "bg-BG", "bn-IN",
  "bs-BA", "ca-ES", "cs-CZ", "cy-GB", "da-DK", "de-DE", "el-GR", "en-US",
  "es-ES", "et-EE", "eu-ES", "fa-IR", "fi-FI", "fr-FR", "ga-IE", "gl-ES",
  "gu-IN", "he-IL", "hi-IN", "hr-HR", "hu-HU", "hy-AM", "id-ID", "is-IS",
  "it-IT", "ja-JP", "ka-GE", "kk-KZ", "km-KH", "kn-IN", "ko-KR", "lo-LA",
  "lt-LT", "lv-LV", "mk-MK", "ml-IN", "mn-MN", "mr-IN", "ms-MY", "my-MM",
  "ne-NP", "nl-NL", "no-NO", "or-IN", "pa-IN", "pl-PL", "pt-BR", "pt-PT",
  "ro-RO", "ru-RU", "si-LK", "sk-SK", "sl-SI", "sq-AL", "sr-RS", "sv-SE",
  "sw-KE", "ta-IN", "te-IN", "th-TH", "tr-TR", "uk-UA", "ur-PK", "uz-UZ",
  "vi-VN", "zh-CN", "zh-Hant", "zh-TW"
];

/** Set built from BCP47_LANGUAGE_TAGS, used for membership lookups. */
var BCP47_LANGUAGE_TAG_SET = new Set(BCP47_LANGUAGE_TAGS);

/**
 * Report whether `tag` is exactly one of the entries of BCP47_LANGUAGE_TAGS.
 *
 * @param {string} tag - Candidate language tag.
 * @returns {boolean} `true` only for an exact (case-sensitive) match.
 */
function isValidBcp47Tag(tag) {
  return BCP47_LANGUAGE_TAG_SET.has(tag);
}

/**
 * Default full tag for each bare, lower-case language code.
 * Languages with several supported tags resolve to a single default here
 * (`pt` -> "pt-BR", `zh` -> "zh-CN").
 */
var DEFAULT_REGION_MAP = {
  af: "af-ZA", am: "am-ET", ar: "ar-SA", as: "as-IN", az: "az-AZ", be: "be-BY",
  bg: "bg-BG", bn: "bn-IN", bs: "bs-BA", ca: "ca-ES", cs: "cs-CZ", cy: "cy-GB",
  da: "da-DK", de: "de-DE", el: "el-GR", en: "en-US", es: "es-ES", et: "et-EE",
  eu: "eu-ES", fa: "fa-IR", fi: "fi-FI", fr: "fr-FR", ga: "ga-IE", gl: "gl-ES",
  gu: "gu-IN", he: "he-IL", hi: "hi-IN", hr: "hr-HR", hu: "hu-HU", hy: "hy-AM",
  id: "id-ID", is: "is-IS", it: "it-IT", ja: "ja-JP", ka: "ka-GE", kk: "kk-KZ",
  km: "km-KH", kn: "kn-IN", ko: "ko-KR", lo: "lo-LA", lt: "lt-LT", lv: "lv-LV",
  mk: "mk-MK", ml: "ml-IN", mn: "mn-MN", mr: "mr-IN", ms: "ms-MY", my: "my-MM",
  ne: "ne-NP", nl: "nl-NL", no: "no-NO", or: "or-IN", pa: "pa-IN", pl: "pl-PL",
  pt: "pt-BR", ro: "ro-RO", ru: "ru-RU", si: "si-LK", sk: "sk-SK", sl: "sl-SI",
  sq: "sq-AL", sr: "sr-RS", sv: "sv-SE", sw: "sw-KE", ta: "ta-IN", te: "te-IN",
  th: "th-TH", tr: "tr-TR", uk: "uk-UA", ur: "ur-PK", uz: "uz-UZ", vi: "vi-VN",
  zh: "zh-CN"
};

/**
 * Normalize a language string to one of the supported tags.
 *
 * - An exact supported tag (e.g. "en-US") is returned unchanged.
 * - A bare language code in any casing (e.g. "en", "KO") is mapped through
 *   DEFAULT_REGION_MAP.
 * - Anything else, including non-string input, yields `null`.
 *
 * @param {string} tag - Language string to normalize.
 * @returns {string | null} A member of BCP47_LANGUAGE_TAGS, or `null`.
 */
function normalizeToBcp47(tag) {
  // Guard first: calling toLowerCase() on a non-string would throw.
  if (typeof tag !== "string") {
    return null;
  }
  if (isValidBcp47Tag(tag)) {
    return tag;
  }
  const lower = tag.toLowerCase();
  // Own-property check: a bare index would also find inherited members such as
  // "constructor" or "__proto__" and return them instead of null.
  return Object.hasOwn(DEFAULT_REGION_MAP, lower) ? DEFAULT_REGION_MAP[lower] : null;
}
// Annotate the CommonJS export names for ESM import in node:
0 && (module.exports = {
  BCP47_LANGUAGE_TAGS,
  BCP47_LANGUAGE_TAG_SET,
  isValidBcp47Tag,
  normalizeToBcp47
});
19
208
  //# sourceMappingURL=index.cjs.map
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/index.ts"],"sourcesContent":["export type * from './docling-document';\nexport type * from './processed-document';\nexport type * from './token-usage-report';\nexport type * from './document-process-result';\nexport type * from './ocr-strategy';\n"],"mappings":";;;;;;;;;;;;;;;;AAAA;AAAA;","names":[]}
1
+ {"version":3,"sources":["../src/index.ts","../src/bcp47-language-tag.ts"],"sourcesContent":["export type * from './bcp47-language-tag';\nexport {\n BCP47_LANGUAGE_TAGS,\n BCP47_LANGUAGE_TAG_SET,\n isValidBcp47Tag,\n normalizeToBcp47,\n} from './bcp47-language-tag';\nexport type * from './docling-document';\nexport type * from './processed-document';\nexport type * from './token-usage-report';\nexport type * from './document-process-result';\nexport type * from './ocr-strategy';\n","/**\n * BCP 47 language tags supported by Docling OCR engines.\n * Covers major languages encountered in archaeological report processing.\n */\nexport const BCP47_LANGUAGE_TAGS = [\n 'af-ZA',\n 'am-ET',\n 'ar-SA',\n 'as-IN',\n 'az-AZ',\n 'be-BY',\n 'bg-BG',\n 'bn-IN',\n 'bs-BA',\n 'ca-ES',\n 'cs-CZ',\n 'cy-GB',\n 'da-DK',\n 'de-DE',\n 'el-GR',\n 'en-US',\n 'es-ES',\n 'et-EE',\n 'eu-ES',\n 'fa-IR',\n 'fi-FI',\n 'fr-FR',\n 'ga-IE',\n 'gl-ES',\n 'gu-IN',\n 'he-IL',\n 'hi-IN',\n 'hr-HR',\n 'hu-HU',\n 'hy-AM',\n 'id-ID',\n 'is-IS',\n 'it-IT',\n 'ja-JP',\n 'ka-GE',\n 'kk-KZ',\n 'km-KH',\n 'kn-IN',\n 'ko-KR',\n 'lo-LA',\n 'lt-LT',\n 'lv-LV',\n 'mk-MK',\n 'ml-IN',\n 'mn-MN',\n 'mr-IN',\n 'ms-MY',\n 'my-MM',\n 'ne-NP',\n 'nl-NL',\n 'no-NO',\n 'or-IN',\n 'pa-IN',\n 'pl-PL',\n 'pt-BR',\n 'pt-PT',\n 'ro-RO',\n 'ru-RU',\n 'si-LK',\n 'sk-SK',\n 'sl-SI',\n 'sq-AL',\n 'sr-RS',\n 'sv-SE',\n 'sw-KE',\n 'ta-IN',\n 'te-IN',\n 'th-TH',\n 'tr-TR',\n 'uk-UA',\n 'ur-PK',\n 'uz-UZ',\n 'vi-VN',\n 'zh-CN',\n 'zh-Hant',\n 'zh-TW',\n] as const;\n\n/** Union type of all supported BCP 47 language tags */\nexport type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];\n\n/** Set for O(1) lookup of valid BCP 47 tags */\nexport const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string> = new Set(\n BCP47_LANGUAGE_TAGS,\n);\n\n/** Check whether a string is a valid BCP 47 language tag */\nexport function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag {\n return BCP47_LANGUAGE_TAG_SET.has(tag);\n}\n\n/**\n * Maps 
bare language codes to their default BCP 47 tag.\n * Used when VLM returns only a language code without a region subtag.\n */\nconst DEFAULT_REGION_MAP: Record<string, Bcp47LanguageTag> = {\n af: 'af-ZA',\n am: 'am-ET',\n ar: 'ar-SA',\n as: 'as-IN',\n az: 'az-AZ',\n be: 'be-BY',\n bg: 'bg-BG',\n bn: 'bn-IN',\n bs: 'bs-BA',\n ca: 'ca-ES',\n cs: 'cs-CZ',\n cy: 'cy-GB',\n da: 'da-DK',\n de: 'de-DE',\n el: 'el-GR',\n en: 'en-US',\n es: 'es-ES',\n et: 'et-EE',\n eu: 'eu-ES',\n fa: 'fa-IR',\n fi: 'fi-FI',\n fr: 'fr-FR',\n ga: 'ga-IE',\n gl: 'gl-ES',\n gu: 'gu-IN',\n he: 'he-IL',\n hi: 'hi-IN',\n hr: 'hr-HR',\n hu: 'hu-HU',\n hy: 'hy-AM',\n id: 'id-ID',\n is: 'is-IS',\n it: 'it-IT',\n ja: 'ja-JP',\n ka: 'ka-GE',\n kk: 'kk-KZ',\n km: 'km-KH',\n kn: 'kn-IN',\n ko: 'ko-KR',\n lo: 'lo-LA',\n lt: 'lt-LT',\n lv: 'lv-LV',\n mk: 'mk-MK',\n ml: 'ml-IN',\n mn: 'mn-MN',\n mr: 'mr-IN',\n ms: 'ms-MY',\n my: 'my-MM',\n ne: 'ne-NP',\n nl: 'nl-NL',\n no: 'no-NO',\n or: 'or-IN',\n pa: 'pa-IN',\n pl: 'pl-PL',\n pt: 'pt-BR',\n ro: 'ro-RO',\n ru: 'ru-RU',\n si: 'si-LK',\n sk: 'sk-SK',\n sl: 'sl-SI',\n sq: 'sq-AL',\n sr: 'sr-RS',\n sv: 'sv-SE',\n sw: 'sw-KE',\n ta: 'ta-IN',\n te: 'te-IN',\n th: 'th-TH',\n tr: 'tr-TR',\n uk: 'uk-UA',\n ur: 'ur-PK',\n uz: 'uz-UZ',\n vi: 'vi-VN',\n zh: 'zh-CN',\n};\n\n/**\n * Normalize a language string to a valid BCP 47 tag.\n *\n * - If the input is already a valid full tag (e.g. \"en-US\"), return it as-is.\n * - If it is a bare language code (e.g. \"en\", \"ko\"), map it to the default region.\n * - Otherwise return null (e.g. 
\"und\", \"unknown\", empty string).\n */\nexport function normalizeToBcp47(tag: string): Bcp47LanguageTag | null {\n if (isValidBcp47Tag(tag)) {\n return tag;\n }\n\n const lower = tag.toLowerCase();\n const mapped = DEFAULT_REGION_MAP[lower];\n if (mapped) {\n return mapped;\n }\n\n return null;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACIO,IAAM,sBAAsB;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAMO,IAAM,yBAA8C,IAAI;AAAA,EAC7D;AACF;AAGO,SAAS,gBAAgB,KAAsC;AACpE,SAAO,uBAAuB,IAAI,GAAG;AACvC;AAMA,IAAM,qBAAuD;AAAA,EAC3D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA
,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AASO,SAAS,iBAAiB,KAAsC;AACrE,MAAI,gBAAgB,GAAG,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,IAAI,YAAY;AAC9B,QAAM,SAAS,mBAAmB,KAAK;AACvC,MAAI,QAAQ;AACV,WAAO;AAAA,EACT;AAEA,SAAO;AACT;","names":[]}
package/dist/index.d.cts CHANGED
@@ -1,3 +1,23 @@
1
+ /**
2
+ * BCP 47 language tags supported by Docling OCR engines.
3
+ * Covers major languages encountered in archaeological report processing.
4
+ */
5
+ declare const BCP47_LANGUAGE_TAGS: readonly ["af-ZA", "am-ET", "ar-SA", "as-IN", "az-AZ", "be-BY", "bg-BG", "bn-IN", "bs-BA", "ca-ES", "cs-CZ", "cy-GB", "da-DK", "de-DE", "el-GR", "en-US", "es-ES", "et-EE", "eu-ES", "fa-IR", "fi-FI", "fr-FR", "ga-IE", "gl-ES", "gu-IN", "he-IL", "hi-IN", "hr-HR", "hu-HU", "hy-AM", "id-ID", "is-IS", "it-IT", "ja-JP", "ka-GE", "kk-KZ", "km-KH", "kn-IN", "ko-KR", "lo-LA", "lt-LT", "lv-LV", "mk-MK", "ml-IN", "mn-MN", "mr-IN", "ms-MY", "my-MM", "ne-NP", "nl-NL", "no-NO", "or-IN", "pa-IN", "pl-PL", "pt-BR", "pt-PT", "ro-RO", "ru-RU", "si-LK", "sk-SK", "sl-SI", "sq-AL", "sr-RS", "sv-SE", "sw-KE", "ta-IN", "te-IN", "th-TH", "tr-TR", "uk-UA", "ur-PK", "uz-UZ", "vi-VN", "zh-CN", "zh-Hant", "zh-TW"];
6
+ /** Union type of all supported BCP 47 language tags */
7
+ type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];
8
+ /** Set for O(1) lookup of valid BCP 47 tags */
9
+ declare const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string>;
10
+ /** Check whether a string exactly matches (case-sensitively) one of the supported tags in BCP47_LANGUAGE_TAGS */
11
+ declare function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag;
12
+ /**
13
+ * Normalize a language string to a valid BCP 47 tag.
14
+ *
15
+ * - If the input is already a valid full tag (e.g. "en-US"), return it as-is.
16
+ * - If it is a bare language code (e.g. "en", "ko"), map it to the default region.
17
+ * - Otherwise return null (e.g. "und", "unknown", empty string).
18
+ */
19
+ declare function normalizeToBcp47(tag: string): Bcp47LanguageTag | null;
20
+
1
21
  interface DoclingReference {
2
22
  $ref: string;
3
23
  }
@@ -707,8 +727,8 @@ interface OcrStrategy {
707
727
  method: 'ocrmac' | 'vlm';
708
728
  /** OCR language weights for ocrmac (e.g., ['ko-KR', 'en-US'] or ['zh-Hant', 'ko-KR']) */
709
729
  ocrLanguages?: string[];
710
- /** BCP 47 language tags detected during sampling (e.g., ['ko-KR', 'en-US']) */
711
- detectedLanguages?: string[];
730
+ /** BCP 47 language tags detected during sampling, ordered by frequency (e.g., ['ko-KR', 'en-US']) */
731
+ detectedLanguages?: Bcp47LanguageTag[];
712
732
  /** Human-readable explanation of the decision */
713
733
  reason: string;
714
734
  /** Number of pages that were sampled for the decision */
@@ -719,4 +739,4 @@ interface OcrStrategy {
719
739
  koreanHanjaMixPages?: number[];
720
740
  }
721
741
 
722
- export type { Caption, Chapter, ComponentUsageReport, DoclingBBox, DoclingBaseNode, DoclingBody, DoclingDocument, DoclingGroupItem, DoclingOrigin, DoclingPage, DoclingPageImage, DoclingPictureItem, DoclingProv, DoclingReference, DoclingTableCell, DoclingTableData, DoclingTableItem, DoclingTextItem, DocumentProcessResult, ModelUsageDetail, OcrStrategy, PageRange, PhaseUsageReport, ProcessedDocument, ProcessedFootnote, ProcessedImage, ProcessedTable, ProcessedTableCell, TextBlock, TokenUsageReport, TokenUsageSummary };
742
+ export { BCP47_LANGUAGE_TAGS, BCP47_LANGUAGE_TAG_SET, type Bcp47LanguageTag, type Caption, type Chapter, type ComponentUsageReport, type DoclingBBox, type DoclingBaseNode, type DoclingBody, type DoclingDocument, type DoclingGroupItem, type DoclingOrigin, type DoclingPage, type DoclingPageImage, type DoclingPictureItem, type DoclingProv, type DoclingReference, type DoclingTableCell, type DoclingTableData, type DoclingTableItem, type DoclingTextItem, type DocumentProcessResult, type ModelUsageDetail, type OcrStrategy, type PageRange, type PhaseUsageReport, type ProcessedDocument, type ProcessedFootnote, type ProcessedImage, type ProcessedTable, type ProcessedTableCell, type TextBlock, type TokenUsageReport, type TokenUsageSummary, isValidBcp47Tag, normalizeToBcp47 };
package/dist/index.d.ts CHANGED
@@ -1,3 +1,23 @@
1
+ /**
2
+ * BCP 47 language tags supported by Docling OCR engines.
3
+ * Covers major languages encountered in archaeological report processing.
4
+ */
5
+ declare const BCP47_LANGUAGE_TAGS: readonly ["af-ZA", "am-ET", "ar-SA", "as-IN", "az-AZ", "be-BY", "bg-BG", "bn-IN", "bs-BA", "ca-ES", "cs-CZ", "cy-GB", "da-DK", "de-DE", "el-GR", "en-US", "es-ES", "et-EE", "eu-ES", "fa-IR", "fi-FI", "fr-FR", "ga-IE", "gl-ES", "gu-IN", "he-IL", "hi-IN", "hr-HR", "hu-HU", "hy-AM", "id-ID", "is-IS", "it-IT", "ja-JP", "ka-GE", "kk-KZ", "km-KH", "kn-IN", "ko-KR", "lo-LA", "lt-LT", "lv-LV", "mk-MK", "ml-IN", "mn-MN", "mr-IN", "ms-MY", "my-MM", "ne-NP", "nl-NL", "no-NO", "or-IN", "pa-IN", "pl-PL", "pt-BR", "pt-PT", "ro-RO", "ru-RU", "si-LK", "sk-SK", "sl-SI", "sq-AL", "sr-RS", "sv-SE", "sw-KE", "ta-IN", "te-IN", "th-TH", "tr-TR", "uk-UA", "ur-PK", "uz-UZ", "vi-VN", "zh-CN", "zh-Hant", "zh-TW"];
6
+ /** Union type of all supported BCP 47 language tags */
7
+ type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];
8
+ /** Set for O(1) lookup of valid BCP 47 tags */
9
+ declare const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string>;
10
+ /** Check whether a string exactly matches (case-sensitively) one of the supported tags in BCP47_LANGUAGE_TAGS */
11
+ declare function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag;
12
+ /**
13
+ * Normalize a language string to a valid BCP 47 tag.
14
+ *
15
+ * - If the input is already a valid full tag (e.g. "en-US"), return it as-is.
16
+ * - If it is a bare language code (e.g. "en", "ko"), map it to the default region.
17
+ * - Otherwise return null (e.g. "und", "unknown", empty string).
18
+ */
19
+ declare function normalizeToBcp47(tag: string): Bcp47LanguageTag | null;
20
+
1
21
  interface DoclingReference {
2
22
  $ref: string;
3
23
  }
@@ -707,8 +727,8 @@ interface OcrStrategy {
707
727
  method: 'ocrmac' | 'vlm';
708
728
  /** OCR language weights for ocrmac (e.g., ['ko-KR', 'en-US'] or ['zh-Hant', 'ko-KR']) */
709
729
  ocrLanguages?: string[];
710
- /** BCP 47 language tags detected during sampling (e.g., ['ko-KR', 'en-US']) */
711
- detectedLanguages?: string[];
730
+ /** BCP 47 language tags detected during sampling, ordered by frequency (e.g., ['ko-KR', 'en-US']) */
731
+ detectedLanguages?: Bcp47LanguageTag[];
712
732
  /** Human-readable explanation of the decision */
713
733
  reason: string;
714
734
  /** Number of pages that were sampled for the decision */
@@ -719,4 +739,4 @@ interface OcrStrategy {
719
739
  koreanHanjaMixPages?: number[];
720
740
  }
721
741
 
722
- export type { Caption, Chapter, ComponentUsageReport, DoclingBBox, DoclingBaseNode, DoclingBody, DoclingDocument, DoclingGroupItem, DoclingOrigin, DoclingPage, DoclingPageImage, DoclingPictureItem, DoclingProv, DoclingReference, DoclingTableCell, DoclingTableData, DoclingTableItem, DoclingTextItem, DocumentProcessResult, ModelUsageDetail, OcrStrategy, PageRange, PhaseUsageReport, ProcessedDocument, ProcessedFootnote, ProcessedImage, ProcessedTable, ProcessedTableCell, TextBlock, TokenUsageReport, TokenUsageSummary };
742
+ export { BCP47_LANGUAGE_TAGS, BCP47_LANGUAGE_TAG_SET, type Bcp47LanguageTag, type Caption, type Chapter, type ComponentUsageReport, type DoclingBBox, type DoclingBaseNode, type DoclingBody, type DoclingDocument, type DoclingGroupItem, type DoclingOrigin, type DoclingPage, type DoclingPageImage, type DoclingPictureItem, type DoclingProv, type DoclingReference, type DoclingTableCell, type DoclingTableData, type DoclingTableItem, type DoclingTextItem, type DocumentProcessResult, type ModelUsageDetail, type OcrStrategy, type PageRange, type PhaseUsageReport, type ProcessedDocument, type ProcessedFootnote, type ProcessedImage, type ProcessedTable, type ProcessedTableCell, type TextBlock, type TokenUsageReport, type TokenUsageSummary, isValidBcp47Tag, normalizeToBcp47 };
package/dist/index.js CHANGED
@@ -1 +1,178 @@
1
// src/bcp47-language-tag.ts

/**
 * Supported language tags in their canonical casing.
 * Every membership check in this module is an exact, case-sensitive match
 * against this list.
 */
var BCP47_LANGUAGE_TAGS = [
  "af-ZA", "am-ET", "ar-SA", "as-IN", "az-AZ", "be-BY", "bg-BG", "bn-IN",
  "bs-BA", "ca-ES", "cs-CZ", "cy-GB", "da-DK", "de-DE", "el-GR", "en-US",
  "es-ES", "et-EE", "eu-ES", "fa-IR", "fi-FI", "fr-FR", "ga-IE", "gl-ES",
  "gu-IN", "he-IL", "hi-IN", "hr-HR", "hu-HU", "hy-AM", "id-ID", "is-IS",
  "it-IT", "ja-JP", "ka-GE", "kk-KZ", "km-KH", "kn-IN", "ko-KR", "lo-LA",
  "lt-LT", "lv-LV", "mk-MK", "ml-IN", "mn-MN", "mr-IN", "ms-MY", "my-MM",
  "ne-NP", "nl-NL", "no-NO", "or-IN", "pa-IN", "pl-PL", "pt-BR", "pt-PT",
  "ro-RO", "ru-RU", "si-LK", "sk-SK", "sl-SI", "sq-AL", "sr-RS", "sv-SE",
  "sw-KE", "ta-IN", "te-IN", "th-TH", "tr-TR", "uk-UA", "ur-PK", "uz-UZ",
  "vi-VN", "zh-CN", "zh-Hant", "zh-TW"
];

/** Set built from BCP47_LANGUAGE_TAGS, used for membership lookups. */
var BCP47_LANGUAGE_TAG_SET = new Set(BCP47_LANGUAGE_TAGS);

/**
 * Report whether `tag` is exactly one of the entries of BCP47_LANGUAGE_TAGS.
 *
 * @param {string} tag - Candidate language tag.
 * @returns {boolean} `true` only for an exact (case-sensitive) match.
 */
function isValidBcp47Tag(tag) {
  return BCP47_LANGUAGE_TAG_SET.has(tag);
}

/**
 * Default full tag for each bare, lower-case language code.
 * Languages with several supported tags resolve to a single default here
 * (`pt` -> "pt-BR", `zh` -> "zh-CN").
 */
var DEFAULT_REGION_MAP = {
  af: "af-ZA", am: "am-ET", ar: "ar-SA", as: "as-IN", az: "az-AZ", be: "be-BY",
  bg: "bg-BG", bn: "bn-IN", bs: "bs-BA", ca: "ca-ES", cs: "cs-CZ", cy: "cy-GB",
  da: "da-DK", de: "de-DE", el: "el-GR", en: "en-US", es: "es-ES", et: "et-EE",
  eu: "eu-ES", fa: "fa-IR", fi: "fi-FI", fr: "fr-FR", ga: "ga-IE", gl: "gl-ES",
  gu: "gu-IN", he: "he-IL", hi: "hi-IN", hr: "hr-HR", hu: "hu-HU", hy: "hy-AM",
  id: "id-ID", is: "is-IS", it: "it-IT", ja: "ja-JP", ka: "ka-GE", kk: "kk-KZ",
  km: "km-KH", kn: "kn-IN", ko: "ko-KR", lo: "lo-LA", lt: "lt-LT", lv: "lv-LV",
  mk: "mk-MK", ml: "ml-IN", mn: "mn-MN", mr: "mr-IN", ms: "ms-MY", my: "my-MM",
  ne: "ne-NP", nl: "nl-NL", no: "no-NO", or: "or-IN", pa: "pa-IN", pl: "pl-PL",
  pt: "pt-BR", ro: "ro-RO", ru: "ru-RU", si: "si-LK", sk: "sk-SK", sl: "sl-SI",
  sq: "sq-AL", sr: "sr-RS", sv: "sv-SE", sw: "sw-KE", ta: "ta-IN", te: "te-IN",
  th: "th-TH", tr: "tr-TR", uk: "uk-UA", ur: "ur-PK", uz: "uz-UZ", vi: "vi-VN",
  zh: "zh-CN"
};

/**
 * Normalize a language string to one of the supported tags.
 *
 * - An exact supported tag (e.g. "en-US") is returned unchanged.
 * - A bare language code in any casing (e.g. "en", "KO") is mapped through
 *   DEFAULT_REGION_MAP.
 * - Anything else, including non-string input, yields `null`.
 *
 * @param {string} tag - Language string to normalize.
 * @returns {string | null} A member of BCP47_LANGUAGE_TAGS, or `null`.
 */
function normalizeToBcp47(tag) {
  // Guard first: calling toLowerCase() on a non-string would throw.
  if (typeof tag !== "string") {
    return null;
  }
  if (isValidBcp47Tag(tag)) {
    return tag;
  }
  const lower = tag.toLowerCase();
  // Own-property check: a bare index would also find inherited members such as
  // "constructor" or "__proto__" and return them instead of null.
  return Object.hasOwn(DEFAULT_REGION_MAP, lower) ? DEFAULT_REGION_MAP[lower] : null;
}
export {
  BCP47_LANGUAGE_TAGS,
  BCP47_LANGUAGE_TAG_SET,
  isValidBcp47Tag,
  normalizeToBcp47
};
1
178
  //# sourceMappingURL=index.js.map
package/dist/index.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
1
+ {"version":3,"sources":["../src/bcp47-language-tag.ts"],"sourcesContent":["/**\n * BCP 47 language tags supported by Docling OCR engines.\n * Covers major languages encountered in archaeological report processing.\n */\nexport const BCP47_LANGUAGE_TAGS = [\n 'af-ZA',\n 'am-ET',\n 'ar-SA',\n 'as-IN',\n 'az-AZ',\n 'be-BY',\n 'bg-BG',\n 'bn-IN',\n 'bs-BA',\n 'ca-ES',\n 'cs-CZ',\n 'cy-GB',\n 'da-DK',\n 'de-DE',\n 'el-GR',\n 'en-US',\n 'es-ES',\n 'et-EE',\n 'eu-ES',\n 'fa-IR',\n 'fi-FI',\n 'fr-FR',\n 'ga-IE',\n 'gl-ES',\n 'gu-IN',\n 'he-IL',\n 'hi-IN',\n 'hr-HR',\n 'hu-HU',\n 'hy-AM',\n 'id-ID',\n 'is-IS',\n 'it-IT',\n 'ja-JP',\n 'ka-GE',\n 'kk-KZ',\n 'km-KH',\n 'kn-IN',\n 'ko-KR',\n 'lo-LA',\n 'lt-LT',\n 'lv-LV',\n 'mk-MK',\n 'ml-IN',\n 'mn-MN',\n 'mr-IN',\n 'ms-MY',\n 'my-MM',\n 'ne-NP',\n 'nl-NL',\n 'no-NO',\n 'or-IN',\n 'pa-IN',\n 'pl-PL',\n 'pt-BR',\n 'pt-PT',\n 'ro-RO',\n 'ru-RU',\n 'si-LK',\n 'sk-SK',\n 'sl-SI',\n 'sq-AL',\n 'sr-RS',\n 'sv-SE',\n 'sw-KE',\n 'ta-IN',\n 'te-IN',\n 'th-TH',\n 'tr-TR',\n 'uk-UA',\n 'ur-PK',\n 'uz-UZ',\n 'vi-VN',\n 'zh-CN',\n 'zh-Hant',\n 'zh-TW',\n] as const;\n\n/** Union type of all supported BCP 47 language tags */\nexport type Bcp47LanguageTag = (typeof BCP47_LANGUAGE_TAGS)[number];\n\n/** Set for O(1) lookup of valid BCP 47 tags */\nexport const BCP47_LANGUAGE_TAG_SET: ReadonlySet<string> = new Set(\n BCP47_LANGUAGE_TAGS,\n);\n\n/** Check whether a string is a valid BCP 47 language tag */\nexport function isValidBcp47Tag(tag: string): tag is Bcp47LanguageTag {\n return BCP47_LANGUAGE_TAG_SET.has(tag);\n}\n\n/**\n * Maps bare language codes to their default BCP 47 tag.\n * Used when VLM returns only a language code without a region subtag.\n */\nconst DEFAULT_REGION_MAP: Record<string, Bcp47LanguageTag> = {\n af: 'af-ZA',\n am: 'am-ET',\n ar: 'ar-SA',\n as: 'as-IN',\n az: 'az-AZ',\n be: 'be-BY',\n bg: 'bg-BG',\n bn: 'bn-IN',\n bs: 'bs-BA',\n ca: 'ca-ES',\n cs: 'cs-CZ',\n cy: 'cy-GB',\n da: 'da-DK',\n de: 'de-DE',\n el: 
'el-GR',\n en: 'en-US',\n es: 'es-ES',\n et: 'et-EE',\n eu: 'eu-ES',\n fa: 'fa-IR',\n fi: 'fi-FI',\n fr: 'fr-FR',\n ga: 'ga-IE',\n gl: 'gl-ES',\n gu: 'gu-IN',\n he: 'he-IL',\n hi: 'hi-IN',\n hr: 'hr-HR',\n hu: 'hu-HU',\n hy: 'hy-AM',\n id: 'id-ID',\n is: 'is-IS',\n it: 'it-IT',\n ja: 'ja-JP',\n ka: 'ka-GE',\n kk: 'kk-KZ',\n km: 'km-KH',\n kn: 'kn-IN',\n ko: 'ko-KR',\n lo: 'lo-LA',\n lt: 'lt-LT',\n lv: 'lv-LV',\n mk: 'mk-MK',\n ml: 'ml-IN',\n mn: 'mn-MN',\n mr: 'mr-IN',\n ms: 'ms-MY',\n my: 'my-MM',\n ne: 'ne-NP',\n nl: 'nl-NL',\n no: 'no-NO',\n or: 'or-IN',\n pa: 'pa-IN',\n pl: 'pl-PL',\n pt: 'pt-BR',\n ro: 'ro-RO',\n ru: 'ru-RU',\n si: 'si-LK',\n sk: 'sk-SK',\n sl: 'sl-SI',\n sq: 'sq-AL',\n sr: 'sr-RS',\n sv: 'sv-SE',\n sw: 'sw-KE',\n ta: 'ta-IN',\n te: 'te-IN',\n th: 'th-TH',\n tr: 'tr-TR',\n uk: 'uk-UA',\n ur: 'ur-PK',\n uz: 'uz-UZ',\n vi: 'vi-VN',\n zh: 'zh-CN',\n};\n\n/**\n * Normalize a language string to a valid BCP 47 tag.\n *\n * - If the input is already a valid full tag (e.g. \"en-US\"), return it as-is.\n * - If it is a bare language code (e.g. \"en\", \"ko\"), map it to the default region.\n * - Otherwise return null (e.g. 
\"und\", \"unknown\", empty string).\n */\nexport function normalizeToBcp47(tag: string): Bcp47LanguageTag | null {\n if (isValidBcp47Tag(tag)) {\n return tag;\n }\n\n const lower = tag.toLowerCase();\n const mapped = DEFAULT_REGION_MAP[lower];\n if (mapped) {\n return mapped;\n }\n\n return null;\n}\n"],"mappings":";AAIO,IAAM,sBAAsB;AAAA,EACjC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAMO,IAAM,yBAA8C,IAAI;AAAA,EAC7D;AACF;AAGO,SAAS,gBAAgB,KAAsC;AACpE,SAAO,uBAAuB,IAAI,GAAG;AACvC;AAMA,IAAM,qBAAuD;AAAA,EAC3D,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,
EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AAAA,EACJ,IAAI;AACN;AASO,SAAS,iBAAiB,KAAsC;AACrE,MAAI,gBAAgB,GAAG,GAAG;AACxB,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,IAAI,YAAY;AAC9B,QAAM,SAAS,mBAAmB,KAAK;AACvC,MAAI,QAAQ;AACV,WAAO;AAAA,EACT;AAEA,SAAO;AACT;","names":[]}
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "@heripo/model",
3
3
  "private": false,
4
4
  "type": "module",
5
- "version": "0.1.8",
5
+ "version": "0.1.9",
6
6
  "description": "Document models and type definitions for heripo engine",
7
7
  "main": "dist/index.cjs",
8
8
  "module": "dist/index.js",