@fisharmy100/auto-i18n-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,34 @@
1
+ import { LanguageCode } from './langs.js';
2
/** Progress notification reported while translations are being produced. */
export type ProgressUpdate = {
    /** Number of segment translations finished so far. */
    current: number;
    /** Number of segment translations expected in total. */
    total: number;
    /** Language code the update refers to. */
    lang: string;
};

/** Options shared by every translation backend. */
type TranslateArgsBase = {
    /** Source texts keyed by identifier; each value is one string or a list of strings. */
    segments: Record<string, string | string[]>;
    /** Languages to translate into. */
    langs: LanguageCode[];
    /** Language the source segments are written in. */
    src_lang: LanguageCode;
    /** Length limit forwarded to the translator. */
    max_tokens: number;
};

/** Arguments for the NLLB backend. */
export type NllbArgs = TranslateArgsBase & {
    backend: "nllb";
    /** Optional model identifier. */
    nllb_model?: string;
};

/** Arguments for the Azure backend. */
export type AzureArgs = TranslateArgsBase & {
    backend: "azure";
    /** Azure subscription key (required). */
    azure_key: string;
    azure_region?: string;
    azure_endpoint?: string;
};

/** Arguments accepted by {@link runPython}, discriminated by `backend`. */
export type TranslateArgs = NllbArgs | AzureArgs;

/** Translated values, indexed first by language code and then by segment key. */
export type TranslateResult = {
    values: Record<string, Record<string, string | string[]>>;
};

/**
 * Runs a translation job.
 *
 * @param args - Backend selection plus the segments and languages to process.
 * @param onProgress - Optional callback invoked with each progress update.
 * @returns A promise for the translated values.
 */
export declare function runPython(
    args: TranslateArgs,
    onProgress?: (update: ProgressUpdate) => void,
): Promise<TranslateResult>;
export {};
@@ -0,0 +1,50 @@
1
+ import path from 'node:path';
2
+ import { fileURLToPath } from 'node:url';
3
+ import { PythonShell } from 'python-shell';
4
+ import { logMessage, logWarning } from './utils.js';
5
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Absolute path of the Python translator script, resolved relative to this module.
const python_path = path.resolve(__dirname, "./translator/translate.py");

/**
 * Runs the Python translator script and resolves with the values it reports.
 *
 * The arguments are serialised to JSON and passed as the script's single
 * command-line argument. The script answers with one JSON object per output
 * line, distinguished by its `type` field:
 *   - "progress": forwarded to `onProgress`
 *   - "result":   resolves the promise with `{ values }`
 *   - "error":    rejects the promise with the reported message
 *
 * @param args - Translation job description; `args.backend` selects the backend.
 * @param onProgress - Optional callback invoked for every progress message.
 * @returns Promise resolving to `{ values }` from the script's result message.
 *   It rejects if the script reports an error, the shell fails, or the
 *   process ends without ever producing a result.
 */
export async function runPython(args, onProgress) {
    const converted = JSON.stringify(args);
    return new Promise((resolve, reject) => {
        // Several events can race to finish the promise; only the first one wins.
        let settled = false;
        const safeResolve = (val) => {
            if (!settled) {
                settled = true;
                resolve(val);
            }
        };
        const safeReject = (err) => {
            if (!settled) {
                settled = true;
                reject(err);
            }
        };
        logMessage("Loading Python shell...");
        const shell = new PythonShell(python_path, {
            args: [converted],
            encoding: "utf-8",
        });
        logMessage(`Running ${python_path} with backend: ${args.backend}...`);
        shell.on("message", (raw) => {
            let msg;
            try {
                msg = JSON.parse(raw);
            }
            catch {
                logWarning(`Unparsable message from python`);
                return;
            }
            // Valid JSON that is not an object (null, number, string) carries no `type`.
            if (msg === null || typeof msg !== "object") {
                logWarning(`Unknown message: ${raw}`);
                return;
            }
            switch (msg.type) {
                case "progress":
                    // A failing callback must not be reported as a parse failure.
                    try {
                        onProgress?.(msg);
                    }
                    catch (err) {
                        logWarning(`onProgress callback failed: ${err instanceof Error ? err.message : String(err)}`);
                    }
                    break;
                case "result":
                    safeResolve({ values: msg.values });
                    break;
                case "error":
                    safeReject(new Error(msg.message));
                    break;
                default:
                    // Log the raw line: interpolating the parsed object prints "[object Object]".
                    logWarning(`Unknown message: ${raw}`);
            }
        });
        shell.on("error", safeReject);
        shell.end((err) => {
            if (err) {
                safeReject(err);
                return;
            }
            // The process ended without an error. If no "result" message arrived,
            // the promise would otherwise stay pending forever. Defer one turn in
            // case a final message event is still queued behind this callback.
            setImmediate(() => safeReject(new Error("Python process exited without returning a result")));
        });
    });
}
@@ -0,0 +1,5 @@
1
+ pydantic
2
+ transformers
3
+ torch
4
+ sentencepiece
5
+ requests
@@ -0,0 +1,388 @@
1
+ import sys
2
+ from pydantic import BaseModel
3
+ import json
4
+ import io
5
+ import re
6
+
7
+ NLLB_TO_AZURE: dict[str, str | None] = {
8
+ "ace_Arab": None,
9
+ "ace_Latn": None,
10
+ "acm_Arab": None,
11
+ "acq_Arab": None,
12
+ "aeb_Arab": None,
13
+ "afr_Latn": "af",
14
+ "als_Latn": "sq",
15
+ "amh_Ethi": "am",
16
+ "apc_Arab": None,
17
+ "arb_Arab": "ar",
18
+ "arb_Latn": "ar",
19
+ "arg_Latn": None,
20
+ "ars_Arab": "ar",
21
+ "ary_Arab": None,
22
+ "arz_Arab": None,
23
+ "asm_Beng": "as",
24
+ "ast_Latn": None,
25
+ "awa_Deva": None,
26
+ "ayr_Latn": None,
27
+ "azb_Arab": None,
28
+ "azj_Latn": "az",
29
+ "bak_Cyrl": "ba",
30
+ "bam_Latn": None,
31
+ "ban_Latn": None,
32
+ "bel_Cyrl": None,
33
+ "bem_Latn": None,
34
+ "ben_Beng": "bn",
35
+ "bho_Deva": "bho",
36
+ "bjn_Arab": None,
37
+ "bjn_Latn": None,
38
+ "bod_Tibt": "bo",
39
+ "bos_Latn": "bs",
40
+ "brx_Deva": None,
41
+ "bug_Latn": None,
42
+ "bul_Cyrl": "bg",
43
+ "cat_Latn": "ca",
44
+ "ceb_Latn": "ceb",
45
+ "ces_Latn": "cs",
46
+ "chv_Cyrl": None,
47
+ "cjk_Latn": None,
48
+ "ckb_Arab": "ku",
49
+ "cmn_Hans": "zh-Hans",
50
+ "cmn_Hant": "zh-Hant",
51
+ "crh_Latn": "crh",
52
+ "cym_Latn": "cy",
53
+ "dan_Latn": "da",
54
+ "dar_Cyrl": None,
55
+ "deu_Latn": "de",
56
+ "dgo_Deva": None,
57
+ "dik_Latn": None,
58
+ "dyu_Latn": None,
59
+ "dzo_Tibt": None,
60
+ "ekk_Latn": "et",
61
+ "ell_Grek": "el",
62
+ "eng_Latn": "en",
63
+ "epo_Latn": None,
64
+ "eus_Latn": "eu",
65
+ "ewe_Latn": None,
66
+ "fao_Latn": "fo",
67
+ "fij_Latn": "fj",
68
+ "fil_Latn": "fil",
69
+ "fin_Latn": "fi",
70
+ "fon_Latn": None,
71
+ "fra_Latn": "fr",
72
+ "fur_Latn": None,
73
+ "fuv_Latn": None,
74
+ "gaz_Latn": "om",
75
+ "gla_Latn": "ga", # Scottish Gaelic; Azure 'gd' not consistently available
76
+ "gle_Latn": "ga",
77
+ "glg_Latn": "gl",
78
+ "gom_Deva": None,
79
+ "gug_Latn": None,
80
+ "guj_Gujr": "gu",
81
+ "hat_Latn": "ht",
82
+ "hau_Latn": "ha",
83
+ "heb_Hebr": "he",
84
+ "hin_Deva": "hi",
85
+ "hne_Deva": None,
86
+ "hrv_Latn": "hr",
87
+ "hun_Latn": "hu",
88
+ "hye_Armn": "hy",
89
+ "ibo_Latn": "ig",
90
+ "ilo_Latn": None,
91
+ "ind_Latn": "id",
92
+ "isl_Latn": "is",
93
+ "ita_Latn": "it",
94
+ "jav_Latn": "jv",
95
+ "jpn_Jpan": "ja",
96
+ "kaa_Latn": None,
97
+ "kab_Latn": None,
98
+ "kac_Latn": None,
99
+ "kam_Latn": None,
100
+ "kan_Knda": "kn",
101
+ "kas_Arab": None,
102
+ "kas_Deva": None,
103
+ "kat_Geor": "ka",
104
+ "kaz_Cyrl": "kk",
105
+ "kbp_Latn": None,
106
+ "kea_Latn": None,
107
+ "khk_Cyrl": "mn-Cyrl",
108
+ "khm_Khmr": "km",
109
+ "kik_Latn": None,
110
+ "kin_Latn": "rw",
111
+ "kir_Cyrl": "ky",
112
+ "kmb_Latn": None,
113
+ "kmr_Latn": "kmr",
114
+ "knc_Arab": None,
115
+ "knc_Latn": None,
116
+ "kor_Hang": "ko",
117
+ "ktu_Latn": None,
118
+ "lao_Laoo": "lo",
119
+ "lij_Latn": None,
120
+ "lim_Latn": None,
121
+ "lin_Latn": None,
122
+ "lit_Latn": "lt",
123
+ "lld_Latn": None,
124
+ "lmo_Latn": None,
125
+ "ltg_Latn": None,
126
+ "ltz_Latn": "lb",
127
+ "lua_Latn": None,
128
+ "lug_Latn": None,
129
+ "luo_Latn": None,
130
+ "lus_Latn": None,
131
+ "lvs_Latn": "lv",
132
+ "mag_Deva": None,
133
+ "mai_Deva": "mai",
134
+ "mal_Mlym": "ml",
135
+ "mar_Deva": "mr",
136
+ "mfe_Latn": None,
137
+ "mhr_Cyrl": None,
138
+ "min_Arab": None,
139
+ "min_Latn": None,
140
+ "mkd_Cyrl": "mk",
141
+ "mlt_Latn": "mt",
142
+ "mni_Beng": "mni",
143
+ "mni_Mtei": None,
144
+ "mos_Latn": None,
145
+ "mri_Latn": "mi",
146
+ "mya_Mymr": "my",
147
+ "myv_Cyrl": None,
148
+ "nld_Latn": "nl",
149
+ "nno_Latn": "nb", # Azure merges Nynorsk/Bokmål under 'nb'
150
+ "nob_Latn": "nb",
151
+ "npi_Deva": "ne",
152
+ "nqo_Nkoo": None,
153
+ "nso_Latn": None,
154
+ "nus_Latn": None,
155
+ "nya_Latn": "ny",
156
+ "oci_Latn": None,
157
+ "ory_Orya": "or",
158
+ "pag_Latn": None,
159
+ "pan_Guru": "pa",
160
+ "pap_Latn": None,
161
+ "pbt_Arab": "ps",
162
+ "pes_Arab": "fa",
163
+ "plt_Latn": "mg",
164
+ "pol_Latn": "pl",
165
+ "por_Latn": "pt",
166
+ "prs_Arab": "prs",
167
+ "quy_Latn": None,
168
+ "ron_Latn": "ro",
169
+ "run_Latn": None,
170
+ "rus_Cyrl": "ru",
171
+ "sag_Latn": None,
172
+ "san_Deva": None,
173
+ "sat_Olck": None,
174
+ "scn_Latn": None,
175
+ "shn_Mymr": None,
176
+ "sin_Sinh": "si",
177
+ "slk_Latn": "sk",
178
+ "slv_Latn": "sl",
179
+ "smo_Latn": "sm",
180
+ "sna_Latn": None,
181
+ "snd_Arab": "sd",
182
+ "snd_Deva": "sd",
183
+ "som_Latn": "so",
184
+ "sot_Latn": "st",
185
+ "spa_Latn": "es",
186
+ "srd_Latn": None,
187
+ "srp_Cyrl": "sr-Cyrl",
188
+ "ssw_Latn": None,
189
+ "sun_Latn": "su",
190
+ "swe_Latn": "sv",
191
+ "swh_Latn": "sw",
192
+ "szl_Latn": None,
193
+ "tam_Taml": "ta",
194
+ "taq_Latn": None,
195
+ "taq_Tfng": None,
196
+ "tat_Cyrl": "tt",
197
+ "tel_Telu": "te",
198
+ "tgk_Cyrl": "tg",
199
+ "tha_Thai": "th",
200
+ "tir_Ethi": "ti",
201
+ "tpi_Latn": "to",
202
+ "tsn_Latn": "tn",
203
+ "tso_Latn": None,
204
+ "tuk_Latn": "tk",
205
+ "tum_Latn": None,
206
+ "tur_Latn": "tr",
207
+ "twi_Latn": None,
208
+ "tyv_Cyrl": None,
209
+ "uig_Arab": "ug",
210
+ "ukr_Cyrl": "uk",
211
+ "umb_Latn": None,
212
+ "urd_Arab": "ur",
213
+ "uzn_Latn": "uz",
214
+ "uzs_Arab": None,
215
+ "vec_Latn": None,
216
+ "vie_Latn": "vi",
217
+ "vmw_Latn": None,
218
+ "war_Latn": None,
219
+ "wol_Latn": None,
220
+ "wuu_Hans": "wuu",
221
+ "xho_Latn": "xh",
222
+ "ydd_Hebr": None, # Yiddish not reliably supported in Azure
223
+ "yor_Latn": "yo",
224
+ "yue_Hant": "yue",
225
+ "zgh_Tfng": None,
226
+ "zsm_Latn": "ms",
227
+ "zul_Latn": "zu",
228
+ }
229
+
230
+
231
def nllb_to_azure(code: str) -> str | None:
    """
    Look up the Azure Translator language tag for an NLLB language code.

    Returns the tag stored in NLLB_TO_AZURE, or None when the code is not in
    the table or the table records no Azure equivalent for it.
    """
    azure_tag = NLLB_TO_AZURE.get(code)
    return azure_tag
237
+
238
# Re-wrap the raw stdout byte stream so that everything this script prints is
# encoded as UTF-8.
sys.stdout = io.TextIOWrapper(
    sys.stdout.buffer,
    encoding="utf-8",
)
239
+
240
+
241
class Args(BaseModel):
    """Translation job description, validated from the JSON command-line argument."""

    # Backend selector: "nllb" or "azure".
    backend: str
    # NLLB code of the language the segments are written in.
    src_lang: str
    # Source texts by key; a value is a single string or a list of strings.
    segments: dict[str, str | list[str]]
    # Target language codes.
    langs: list[str]
    # Passed to the NLLB model as the generation max_length.
    max_tokens: int = 512

    # --- NLLB-specific ---
    nllb_model: str = "facebook/nllb-200-distilled-1.3B"

    # --- Azure-specific ---
    azure_key: str = ""
    azure_region: str = "eastus"
    azure_endpoint: str = "https://api.cognitive.microsofttranslator.com"
253
+
254
+
255
class Ret(BaseModel):
    """Result payload: translated values keyed by language, then by segment key."""

    # values[lang][key] is a string or a list of strings, mirroring the input segment.
    values: dict[str, dict[str, str | list[str]]]
257
+
258
+
259
def emit_progress(current: int, total: int, lang: str):
    """
    Write one progress message to stdout as a single JSON line and flush it.

    The message has the shape
    {"type": "progress", "current": ..., "total": ..., "lang": ...}.
    """
    payload = json.dumps(
        {"type": "progress", "current": current, "total": total, "lang": lang},
        ensure_ascii=False,
    )
    print(payload, flush=True)
262
+
263
+
264
def translate_nllb(segment: str, lang: str, src_lang: str, model, tokenizer, max_tokens: int) -> str:
    """
    Translate one segment with an NLLB model, keeping ``{{...}}`` placeholders intact.

    Every ``{{...}}`` span is swapped for an opaque token before translation and
    put back afterwards:
      * ``{{$name}}`` (variable) is restored verbatim, braces included;
      * ``{{text}}`` (protected literal) is restored as ``text`` without braces.

    Args:
        segment: Source text to translate.
        lang: Target language code, looked up in the tokenizer vocabulary.
        src_lang: Source language code assigned to ``tokenizer.src_lang``.
        model: Object exposing ``generate(**inputs, forced_bos_token_id=..., max_length=...)``.
        tokenizer: Callable tokenizer exposing ``convert_tokens_to_ids`` and ``batch_decode``.
        max_tokens: Passed to ``generate`` as ``max_length``.

    Returns:
        The translated text with placeholders restored. Empty or
        whitespace-only input is returned unchanged without calling the model.

    Raises:
        ValueError: If the tokenizer does not know ``lang``.
    """
    # Nothing to translate: do not ask the model to generate from empty input.
    if not segment.strip():
        return segment

    # An unknown code would otherwise resolve to the unknown-token id and be
    # used as the forced first token, producing output in no defined language.
    lang_id = tokenizer.convert_tokens_to_ids(lang)
    unk_id = getattr(tokenizer, "unk_token_id", None)
    if lang_id is None or (unk_id is not None and lang_id == unk_id):
        raise ValueError(f"Language {lang!r} is not known to the NLLB tokenizer")

    # token -> text to put back after translation
    replacements: dict[str, str] = {}
    for i, match in enumerate(re.findall(r'\{\{.*?\}\}', segment)):
        token = f"XXPLACEHOLDER{i}XX"
        # A "$" right after the opening braces marks a variable: keep it verbatim.
        replacements[token] = match if match[2] == "$" else match[2:-2]
        segment = segment.replace(match, token, 1)

    tokenizer.src_lang = src_lang
    inputs = tokenizer(segment, return_tensors="pt")

    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=lang_id,
        max_length=max_tokens
    )

    translation: str = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    for token, restored in replacements.items():
        translation = translation.replace(token, restored)

    return translation
294
+
295
+
296
def translate_azure(segment: str, lang: str, args: Args) -> str:
    """
    Translate one segment through the Azure Translator REST API.

    ``{{...}}`` spans are replaced by opaque tokens before the request and put
    back afterwards: ``{{$name}}`` is restored verbatim, ``{{text}}`` is
    restored as ``text`` without braces.

    Args:
        segment: Source text to translate.
        lang: Target language as an NLLB code; mapped through NLLB_TO_AZURE.
        args: Job settings; reads ``azure_endpoint``, ``azure_key``,
            ``azure_region`` and ``src_lang``.

    Returns:
        The translated text with placeholders restored. Empty or
        whitespace-only input is returned unchanged without calling the service.

    Raises:
        ValueError: If ``lang`` is unknown or has no Azure equivalent.
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status.
    """
    import requests

    # .get() so that a code missing from the table is reported with the same
    # ValueError as an unsupported one, not a bare KeyError.
    azure_lang = NLLB_TO_AZURE.get(lang)
    if azure_lang is None:
        raise ValueError(f"Language {lang} does not have a Azure compatible version")

    # Nothing to translate: skip the network round trip.
    if not segment.strip():
        return segment

    # token -> text to put back after translation
    replacements: dict[str, str] = {}
    for i, match in enumerate(re.findall(r'\{\{.*?\}\}', segment)):
        token = f"XXPLACEHOLDER{i}XX"
        # A "$" right after the opening braces marks a variable: keep it verbatim.
        replacements[token] = match if match[2] == "$" else match[2:-2]
        segment = segment.replace(match, token, 1)

    # Let requests build and encode the query string.
    params = {"api-version": "3.0", "to": azure_lang}
    # The source language is known, so send it when it has an Azure tag;
    # without "from" the service has to guess the language.
    source_lang = NLLB_TO_AZURE.get(args.src_lang)
    if source_lang is not None:
        params["from"] = source_lang

    # Tolerate a configured endpoint that ends with "/".
    url = f"{args.azure_endpoint.rstrip('/')}/translate"
    headers = {
        "Ocp-Apim-Subscription-Key": args.azure_key,
        "Ocp-Apim-Subscription-Region": args.azure_region,
        "Content-type": "application/json"
    }
    # requests never times out by default; bound the wait so a stalled
    # connection cannot hang the whole job.
    response = requests.post(
        url,
        params=params,
        headers=headers,
        json=[{"text": segment}],
        timeout=30,
    )
    response.raise_for_status()
    translation: str = response.json()[0]["translations"][0]["text"]

    for token, restored in replacements.items():
        translation = translation.replace(token, restored)

    return translation
330
+
331
+
332
def translate(segment: str, lang: str, args: Args, model=None, tokenizer=None) -> str:
    """
    Translate one segment with the backend named in ``args.backend``.

    ``model`` and ``tokenizer`` are only forwarded to the NLLB backend.

    Raises:
        ValueError: If ``args.backend`` is neither "nllb" nor "azure".
    """
    backend = args.backend

    if backend == "azure":
        return translate_azure(segment, lang, args)

    if backend == "nllb":
        return translate_nllb(
            segment, lang, args.src_lang, model, tokenizer, args.max_tokens
        )

    raise ValueError(f"Unknown backend: {args.backend!r}. Use 'nllb' or 'azure'.")
339
+
340
+
341
def main():
    """
    Entry point: run the translation job given as JSON in ``sys.argv[1]``.

    Protocol on stdout (one JSON object per line):
      * a "progress" message after each translated segment,
      * a final "result" message carrying the translated values,
      * or, on any failure, an "error" message followed by exit status 1.
    """
    try:
        args = Args.model_validate_json(sys.argv[1])

        model, tokenizer = None, None
        if args.backend == "nllb":
            # Imported here so the Azure backend never has to load transformers.
            from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
            tokenizer = NllbTokenizer.from_pretrained(args.nllb_model)
            model = AutoModelForSeq2SeqLM.from_pretrained(args.nllb_model)
        elif args.backend == "azure":
            if not args.azure_key:
                raise ValueError("azure_key is required when using the Azure backend.")

        # The source language entry is always written by the pass-through below,
        # so translating into it would be discarded work. Duplicate targets are
        # likewise translated only once (first occurrence wins, order kept).
        targets = [lang for lang in dict.fromkeys(args.langs) if lang != args.src_lang]

        total = len(targets) * len(args.segments)
        current = 0

        ret = Ret(values={})
        for lang in targets:
            translated: dict[str, str | list[str]] = {}
            for k, s in args.segments.items():
                if isinstance(s, str):
                    translated[k] = translate(s, lang, args, model, tokenizer)
                else:
                    translated[k] = [
                        translate(se, lang, args, model, tokenizer) for se in s
                    ]
                current += 1
                emit_progress(current, total, lang)
            ret.values[lang] = translated

        # Pass-through source lang with non-variable placeholders stripped
        literal_placeholder = re.compile(r"\{\{(?!\$)([^}]+)\}\}")
        source_values: dict[str, str | list[str]] = {}
        for k, s in args.segments.items():
            if isinstance(s, str):
                source_values[k] = literal_placeholder.sub(r"\1", s)
            else:
                source_values[k] = [literal_placeholder.sub(r"\1", se) for se in s]
        ret.values[args.src_lang] = source_values

        # Flush explicitly, like every other protocol message.
        print(json.dumps({"type": "result", **ret.model_dump()}, ensure_ascii=False), flush=True)

    except Exception as e:
        # Some exceptions stringify to ""; fall back to repr so the caller
        # always receives a non-empty message.
        print(json.dumps({"type": "error", "message": str(e) or repr(e)}), flush=True)
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,3 @@
1
/** Logs `msg` as an error. */
export declare const logError: (msg: string) => void;

/** Logs `msg` as a warning. */
export declare const logWarning: (msg: string) => void;

/** Logs `msg` as an informational message. */
export declare const logMessage: (msg: string) => void;
package/dist/utils.js ADDED
@@ -0,0 +1,4 @@
1
+ import chalk from "chalk";
2
/** Writes `msg` to stderr via console.error, in red, behind a bold "[ERROR]: " tag. */
export const logError = (msg) => {
    console.error(`${chalk.red.bold("[ERROR]: ")}${chalk.red(msg)}`);
};

/** Writes `msg` via console.warn, in yellow, behind a bold "[WARNING]: " tag. */
export const logWarning = (msg) => {
    console.warn(`${chalk.yellow.bold("[WARNING]: ")}${chalk.yellow(msg)}`);
};

/** Writes `msg` via console.log, in green, behind a bold "[LOG]: " tag. */
export const logMessage = (msg) => {
    console.log(`${chalk.green.bold("[LOG]: ")}${chalk.green(msg)}`);
};
package/package.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "name": "@fisharmy100/auto-i18n-cli",
3
+ "version": "1.0.0",
4
+ "description": "A translation generation library",
5
+ "type": "module",
6
+ "main": "dist/index.js",
7
+ "scripts": {
8
+ "build": "tsc",
9
+ "postbuild": "tsx postbuild.ts",
10
+ "prepare": "npm run build",
11
+ "dev": "tsx src/index.ts",
12
+ "test": "vitest",
13
+ "prepublishOnly": "npm run build",
14
+ "postinstall": "node postinstall.cjs"
15
+ },
16
+ "keywords": [
17
+ "cli",
18
+ "typescript",
19
+ "translation"
20
+ ],
21
+ "author": "Nate Craver",
22
+ "license": "MIT",
23
+ "files": [
24
+ "dist"
25
+ ],
26
+ "bin": {
27
+ "auto-i18n-cli": "./dist/index.js"
28
+ },
29
+ "devDependencies": {
30
+ "@types/cli-progress": "^3.11.6",
31
+ "@types/node": "^25.2.3",
32
+ "@types/yargs": "^17.0.35",
33
+ "tsx": "^4.0.0",
34
+ "typescript": "^5.0.0",
35
+ "vitest": "^4.0.18"
36
+ },
37
+ "dependencies": {
38
+ "chalk": "^5",
39
+ "cli-progress": "^3.12.0",
40
+ "glob": "^13.0.3",
41
+ "python-shell": "^5.0.0",
42
+ "ts-command-line-args": "^2.5.1"
43
+ }
44
+ }