jtcg_locale_detector 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +37 -0
  3. data/PACKAGING_SUMMARY.md +195 -0
  4. data/README.md +226 -0
  5. data/bin/locale-detector +159 -0
  6. data/jtcg_locale_detector.gemspec +48 -0
  7. data/lib/locale_detector/client.rb +163 -0
  8. data/lib/locale_detector/detector.rb +46 -0
  9. data/lib/locale_detector/version.rb +3 -0
  10. data/lib/locale_detector.rb +25 -0
  11. data/locale_detector.gemspec +46 -0
  12. data/python/cli.py +220 -0
  13. data/python/requirements.txt +8 -0
  14. data/python/src/__init__.py +10 -0
  15. data/python/src/__pycache__/__init__.cpython-311.pyc +0 -0
  16. data/python/src/__pycache__/__init__.cpython-313.pyc +0 -0
  17. data/python/src/__pycache__/locale_data.cpython-311.pyc +0 -0
  18. data/python/src/__pycache__/locale_data.cpython-313.pyc +0 -0
  19. data/python/src/__pycache__/locale_detector.cpython-311.pyc +0 -0
  20. data/python/src/__pycache__/locale_detector.cpython-313.pyc +0 -0
  21. data/python/src/artifacts/fasttext/lid.176.bin +0 -0
  22. data/python/src/artifacts/fasttext/lid.176.ftz +0 -0
  23. data/python/src/download_fasttext.py +69 -0
  24. data/python/src/locale_data.py +178 -0
  25. data/python/src/locale_detector.py +534 -0
  26. data/python/src/locale_detector_c.c +403 -0
  27. data/python/src/locale_detector_c.h +37 -0
  28. data/python/src/locale_detector_cy.cpp +23126 -0
  29. data/python/src/locale_detector_cy.cpython-311-darwin.so +0 -0
  30. data/python/src/locale_detector_cy.cpython-313-darwin.so +0 -0
  31. data/python/src/locale_detector_cy.html +6460 -0
  32. data/python/src/locale_detector_cy.pyx +501 -0
  33. data/python/src/utils/__init__.py +1 -0
  34. data/python/src/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  35. data/python/src/utils/__pycache__/__init__.cpython-313.pyc +0 -0
  36. data/python/src/utils/__pycache__/data_utils.cpython-311.pyc +0 -0
  37. data/python/src/utils/__pycache__/data_utils.cpython-313.pyc +0 -0
  38. data/python/src/utils/data_utils.py +50 -0
  39. data/python/src/utils/data_utils_cy.cpp +10086 -0
  40. data/python/src/utils/data_utils_cy.cpython-311-darwin.so +0 -0
  41. data/python/src/utils/data_utils_cy.cpython-313-darwin.so +0 -0
  42. data/python/src/utils/data_utils_cy.html +600 -0
  43. data/python/src/utils/data_utils_cy.pyx +94 -0
  44. data/python/src/zhon/__init__.py +7 -0
  45. data/python/src/zhon/__pycache__/__init__.cpython-311.pyc +0 -0
  46. data/python/src/zhon/__pycache__/hanzi.cpython-311.pyc +0 -0
  47. data/python/src/zhon/__pycache__/pinyin.cpython-311.pyc +0 -0
  48. data/python/src/zhon/__pycache__/zhuyin.cpython-311.pyc +0 -0
  49. data/python/src/zhon/cedict/__init__.py +14 -0
  50. data/python/src/zhon/cedict/__pycache__/__init__.cpython-311.pyc +0 -0
  51. data/python/src/zhon/cedict/__pycache__/all.cpython-311.pyc +0 -0
  52. data/python/src/zhon/cedict/__pycache__/simplified.cpython-311.pyc +0 -0
  53. data/python/src/zhon/cedict/__pycache__/traditional.cpython-311.pyc +0 -0
  54. data/python/src/zhon/cedict/all.py +4 -0
  55. data/python/src/zhon/cedict/simplified.py +4 -0
  56. data/python/src/zhon/cedict/traditional.py +4 -0
  57. data/python/src/zhon/hanzi.py +81 -0
  58. data/python/src/zhon/pinyin.py +187 -0
  59. data/python/src/zhon/zhuyin.py +46 -0
  60. metadata +198 -0
@@ -0,0 +1,534 @@
+ import asyncio
+ import os
+ import re
+ import sys
+ import shutil
+ from functools import lru_cache
+ from typing import Literal
+
+ import fasttext
+ import requests
+ from opencc import OpenCC
+ from zhon import cedict
+
+ from .locale_data import LOCALE_MAP
+ from .utils.data_utils import logger
+
+
+ class LocaleDetector:
+     """
+     Multi-language locale detector with specialized Chinese variant detection.
+
+     This detector uses FastText for initial language identification of all text.
+     When text is identified as Chinese (zh), it performs additional analysis to
+     determine whether it's Traditional Chinese (zh-TW) or Simplified Chinese (zh-CN).
+
+     The detector combines multiple techniques for Chinese variant detection:
+     - Character set analysis (Traditional vs Simplified character counts)
+     - OpenCC conversion comparison
+     - Character ratio analysis
+
+     All detected language codes are mapped to i18n codes using LOCALE_MAP.
+     """
+
+     # Class-level constants for character type identification
+     UNKNOWN = "UNKNOWN"
+     TRADITIONAL = "TRADITIONAL"
+     SIMPLIFIED = "SIMPLIFIED"
+     BOTH = "BOTH"
+     MIXED = "MIXED"
+
+     # FastText model cache
+     MODELS = {"low_mem": None, "high_mem": None}
+     # Use local artifacts directory for models
+     FTLANG_CACHE = os.path.join(os.path.dirname(__file__), "artifacts", "fasttext")
+
+     # Default locale when mapping fails
+     DEFAULT_LOCALE = "en-US"
+
+     def __init__(self, low_memory: bool = False):
+         """
+         Initialize the locale detector.
+
+         Args:
+             low_memory: Whether to use the smaller FastText model (lid.176.ftz)
+                 instead of the full model (lid.176.bin)
+         """
+         # Initialize converters for Chinese detection
+         self.cc_s2t = OpenCC("s2t")
+         self.cc_t2s = OpenCC("t2s")
+
+         # Constants for character sets
+         self.TRAD = set(cedict.traditional)
+         self.SIMP = set(cedict.simplified)
+         self.SHARED = self.TRAD.intersection(self.SIMP)
+         self.ALL_HAN = cedict.all
+         self.HANZI_RE = re.compile(f"[^{self.ALL_HAN}]")
+
+         # FastText configuration
+         self.low_memory = low_memory
+
+         # Load FastText model (required)
+         self._load_fasttext_model()
+
+     def _download_model(self, name: str) -> str:
+         """
+         Get the FastText model path, downloading it if not already cached.
+
+         Args:
+             name: Model filename to get/download
+
+         Returns:
+             str: Path to the model file
+         """
+         # Model cache directory
+         cache_dir = os.path.expanduser("~/.cache/locale-detector")
+         os.makedirs(cache_dir, exist_ok=True)
+         cache_path = os.path.join(cache_dir, name)
+
+         # If a cached copy already exists, prefer it
+         if os.path.exists(cache_path):
+             logger.info({"Using cached FastText model": cache_path})
+             return cache_path
+
+         # When running under PyInstaller, copy the bundled model from sys._MEIPASS into the cache
+         if hasattr(sys, "_MEIPASS"):
+             pyinstaller_model_path = os.path.join(sys._MEIPASS, "src", "artifacts", "fasttext", name)
+             if os.path.exists(pyinstaller_model_path):
+                 shutil.copy2(pyinstaller_model_path, cache_path)
+                 logger.info({"Copied FastText model from bundle to cache": cache_path})
+                 return cache_path
+
+         target_path = os.path.join(self.FTLANG_CACHE, name)
+
+         # Check if the model exists locally
+         if os.path.exists(target_path):
+             logger.info({"Using local FastText model": target_path})
+             return target_path
+
+         # Try to create the cache directory and download
+         try:
+             logger.info({"Downloading FastText model": name})
+             url = f"https://dl.fbaipublicfiles.com/fasttext/supervised-models/{name}"
+             os.makedirs(self.FTLANG_CACHE, exist_ok=True)
+
+             response = requests.get(url, timeout=300)
+             response.raise_for_status()
+
+             with open(target_path, "wb") as fp:
+                 fp.write(response.content)
+             logger.info({"Downloaded FastText model": target_path})
+             return target_path
+
+         except Exception as e:
+             logger.error({"FastText model download failed": str(e)})
+             raise RuntimeError(f"Failed to download FastText model {name}: {e}")
+
+     def _load_fasttext_model(self):
+         """
+         Load the FastText language detection model.
+         This is required for the detector to function.
+         """
+         try:
+             if self.low_memory:
+                 if not self.MODELS.get("low_mem"):
+                     model_path = self._download_model("lid.176.ftz")
+                     self.MODELS["low_mem"] = fasttext.load_model(model_path)
+                 self.ft_model = self.MODELS["low_mem"]
+             else:
+                 if not self.MODELS.get("high_mem"):
+                     model_path = self._download_model("lid.176.bin")
+                     self.MODELS["high_mem"] = fasttext.load_model(model_path)
+                 self.ft_model = self.MODELS["high_mem"]
+             logger.info({"FastText model loaded": self.low_memory})
+         except Exception as e:
+             logger.error({"Failed to load FastText model": str(e)})
+             raise RuntimeError({"FastText model loading failed": str(e)})
+
+     def _preprocess_text(self, text: str) -> str:
+         """
+         Preprocess text to handle escape characters that might break FastText.
+
+         Args:
+             text: Input text to preprocess
+
+         Returns:
+             str: Preprocessed text safe for FastText
+         """
+         if not text:
+             return ""
+
+         # Replace problematic escape sequences
+         text = text.replace("\n", " ")
+         text = text.replace("\r", " ")
+         text = text.replace("\t", " ")
+
+         # Collapse runs of whitespace into single spaces
+         return " ".join(text.split())
+
+     def _map_to_i18n_code(self, lang_code: str) -> str:
+         """
+         Map a detected language code to an i18n code using LOCALE_MAP.
+
+         Args:
+             lang_code: The language code to map
+
+         Returns:
+             str: The mapped i18n code, or the original code if mapping fails
+         """
+         if lang_code in LOCALE_MAP:
+             i18n_code = LOCALE_MAP[lang_code]["i18n_code"]
+             if i18n_code != "not_supported":
+                 return i18n_code
+
+         # Special case for Chinese variants which aren't in the map
+         if lang_code in ["zh-TW", "zh-CN"]:
+             return lang_code
+
+         # If mapping fails, return the original code or default
+         return lang_code if lang_code else self.DEFAULT_LOCALE
+
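For context, `_map_to_i18n_code` only assumes that `LOCALE_MAP` (defined in `data/python/src/locale_data.py`, not shown in this hunk) maps FastText language codes to dicts carrying an `i18n_code` key, with `"not_supported"` as a sentinel. A minimal sketch of that assumed shape, with purely illustrative entries:

    # Hypothetical excerpt; the real LOCALE_MAP ships in locale_data.py.
    LOCALE_MAP = {
        "en": {"i18n_code": "en-US"},
        "ja": {"i18n_code": "ja"},
        "wuu": {"i18n_code": "not_supported"},  # hypothetical unsupported entry
    }

    # Under this shape: "en" maps to "en-US"; "wuu" falls through and is
    # returned unchanged; an empty code falls back to DEFAULT_LOCALE ("en-US").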
+     def detect_language(self, text: str) -> dict[str, str | float]:
+         """
+         Detect the language of the text using FastText.
+
+         Args:
+             text: Input text to analyze
+
+         Returns:
+             dict: {"lang": detected language code, "score": confidence score}
+         """
+         if not text or not text.strip():
+             return {"lang": "unknown", "score": 0.0}
+
+         # Preprocess text to handle escape characters
+         processed_text = self._preprocess_text(text)
+         if not processed_text:
+             return {"lang": "unknown", "score": 0.0}
+
+         try:
+             labels, scores = self.ft_model.predict(processed_text)
+             label = labels[0].replace("__label__", "")
+             score = min(float(scores[0]), 1.0)
+             return {"lang": label, "score": score}
+         except Exception as e:
+             logger.error({"FastText prediction error": str(e)})
+             return {"lang": "unknown", "score": 0.0}
+
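A quick usage sketch for the FastText layer (the score shown is illustrative; FastText returns `__label__xx` labels, which the method strips):

    detector = LocaleDetector(low_memory=True)

    print(detector.detect_language("Bonjour tout le monde"))
    # e.g. {"lang": "fr", "score": 0.98} -- the exact score depends on the model
    print(detector.detect_language("   "))
    # {"lang": "unknown", "score": 0.0} -- whitespace-only input short-circuits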
+     def extract_hanzi(self, s: str) -> set[str]:
+         """Extract only Chinese characters from text."""
+         return set(self.HANZI_RE.sub("", s))
+
+     def identify(self, s: str) -> str:
+         """
+         Identify what kind of Chinese characters a string contains.
+
+         Returns:
+             str: One of TRADITIONAL, SIMPLIFIED, BOTH, MIXED, or UNKNOWN
+         """
+         chinese = self.extract_hanzi(s)
+         if not chinese:
+             return self.UNKNOWN
+         if chinese.issubset(self.SHARED):
+             return self.BOTH
+         if chinese.issubset(self.TRAD):
+             return self.TRADITIONAL
+         if chinese.issubset(self.SIMP):
+             return self.SIMPLIFIED
+         return self.MIXED
+
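Continuing with the `detector` instance from the sketch above, and assuming the zhon CEDICT character sets, `identify` should classify sample strings roughly as follows (note the shared-set check runs first, so characters common to both scripts report BOTH):

    detector.identify("愛")    # "TRADITIONAL" -- 愛 appears only in the traditional set
    detector.identify("爱")    # "SIMPLIFIED"  -- its simplified counterpart
    detector.identify("你好")  # "BOTH"        -- these characters are shared by both scripts
    detector.identify("愛爱")  # "MIXED"       -- traditional and simplified mixed together
    detector.identify("abc")   # "UNKNOWN"     -- no Chinese characters at all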
+     def is_traditional(self, s: str) -> bool:
+         """Check if a string's Chinese characters are Traditional."""
+         chinese_chars = self.extract_hanzi(s)
+         if not chinese_chars:
+             return False
+         if chinese_chars.issubset(self.SHARED):
+             return True
+         return bool(chinese_chars.issubset(self.TRAD))
+
+     def is_simplified(self, s: str) -> bool:
+         """Check if a string's Chinese characters are Simplified."""
+         chinese_chars = self.extract_hanzi(s)
+         if not chinese_chars:
+             return False
+         if chinese_chars.issubset(self.SHARED):
+             return True
+         return bool(chinese_chars.issubset(self.SIMP))
+
+     def _sync_count_trad(self, text: str) -> int:
+         """Synchronously count traditional characters."""
+         return sum(1 for ch in text if ch in self.TRAD)
+
+     def _sync_count_simp(self, text: str) -> int:
+         """Synchronously count simplified characters."""
+         return sum(1 for ch in text if ch in self.SIMP)
+
+     def _sync_identify_zh_hanzi(self, text: str) -> str:
+         """Identify Chinese type using character set analysis."""
+         if self.is_simplified(text):
+             return "zh-CN"
+         if self.is_traditional(text):
+             return "zh-TW"
+         return "unknown"
+
+     def _sync_identify_zh_opencc(self, text: str) -> str:
+         """Identify Chinese type using pure OpenCC comparison."""
+         # If text remains unchanged after converting to traditional
+         if text == self.cc_s2t.convert(text):
+             return "zh-TW"  # it's already traditional
+         # If text remains unchanged after converting to simplified
+         if text == self.cc_t2s.convert(text):
+             return "zh-CN"  # it's already simplified
+         return "zh-TW"  # Default to Traditional if mixed
+
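The invariance check works because OpenCC only rewrites characters that differ between the two scripts: Simplified input changes under `s2t`, while Traditional input survives untouched. A minimal, self-contained illustration (outputs assume OpenCC's default s2t/t2s tables):

    from opencc import OpenCC

    s2t = OpenCC("s2t")
    t2s = OpenCC("t2s")

    text = "汉字"                      # Simplified input
    print(s2t.convert(text))           # "漢字" -- changed, so the text was not Traditional
    print(text == t2s.convert(text))   # True -- unchanged under t2s, so it is Simplified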
+     @lru_cache(maxsize=1024)
+     def detect_by_ratio_analysis(self, text: str) -> str | None:
+         """
+         Detect Chinese locale using ratio-based analysis (synchronous).
+         Uses a combination of:
+         - Character ratio analysis (Traditional vs Simplified count)
+         - OpenCC conversion comparison
+         - Character set analysis
+         """
+         # Extract Chinese characters
+         hanzi = self.extract_hanzi(text)
+         if not hanzi:
+             return None
+
+         # OpenCC shape detection
+         to_t = self.cc_s2t.convert(text)
+         to_s = self.cc_t2s.convert(text)
+         if text == to_t:
+             base = "T"
+         elif text == to_s:
+             base = "S"
+         else:
+             base = "M"
+
+         # Character set analysis
+         kind = self.identify(text)
+
+         # Calculate Traditional/Simplified ratio
+         trad_count = self._sync_count_trad(text)
+         simp_count = self._sync_count_simp(text)
+         total = trad_count + simp_count or 1
+         trad_ratio = trad_count / total
+         simp_ratio = simp_count / total
+
+         # Decision logic
+         if base == "T" and kind in (self.TRADITIONAL, self.BOTH):
+             return "zh-TW"
+         if base == "S" and kind in (self.SIMPLIFIED, self.BOTH):
+             return "zh-CN"
+         if trad_ratio > 0.6 and trad_ratio > simp_ratio:
+             return "zh-TW"
+         if simp_ratio > 0.6 and simp_ratio > trad_ratio:
+             return "zh-CN"
+         return "zh-TW" if trad_ratio >= simp_ratio else "zh-CN"
+
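To see which rule fires when the OpenCC and character-set signals disagree, here is the same decision cascade isolated as a pure function with hypothetical count inputs (a sketch for illustration, not part of the package):

    def decide(base: str, kind: str, trad: int, simp: int) -> str:
        # Mirrors the cascade above: OpenCC + character-set agreement first,
        # then the 0.6 ratio thresholds, then a tie-break favouring Traditional.
        total = trad + simp or 1
        trad_ratio, simp_ratio = trad / total, simp / total
        if base == "T" and kind in ("TRADITIONAL", "BOTH"):
            return "zh-TW"
        if base == "S" and kind in ("SIMPLIFIED", "BOTH"):
            return "zh-CN"
        if trad_ratio > 0.6 and trad_ratio > simp_ratio:
            return "zh-TW"
        if simp_ratio > 0.6 and simp_ratio > trad_ratio:
            return "zh-CN"
        return "zh-TW" if trad_ratio >= simp_ratio else "zh-CN"

    print(decide("M", "MIXED", trad=7, simp=2))  # zh-TW: 7/9 ≈ 0.78 clears the 0.6 threshold
    print(decide("M", "MIXED", trad=5, simp=5))  # zh-TW: a tie falls through to the >= rule
    print(decide("M", "MIXED", trad=2, simp=7))  # zh-CN: 7/9 ≈ 0.78 clears the 0.6 threshold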
+     async def adetect_by_ratio_analysis(self, text: str) -> str | None:
+         """
+         Detect Chinese locale using ratio-based analysis (asynchronous).
+         Uses the same comprehensive analysis as detect_by_ratio_analysis but
+         parallelizes CPU-intensive operations for better performance.
+         """
+         # Extract Chinese characters first (this is fast enough to do synchronously)
+         hanzi = self.extract_hanzi(text)
+         if not hanzi:
+             return None
+
+         loop = asyncio.get_event_loop()
+
+         # Run CPU-intensive operations in the thread pool
+         to_t_future = loop.run_in_executor(None, self.cc_s2t.convert, text)
+         to_s_future = loop.run_in_executor(None, self.cc_t2s.convert, text)
+         kind_future = loop.run_in_executor(None, self.identify, text)
+         trad_count_future = loop.run_in_executor(None, self._sync_count_trad, text)
+         simp_count_future = loop.run_in_executor(None, self._sync_count_simp, text)
+
+         # Wait for all thread pool tasks to complete
+         to_t, to_s, kind, trad_count, simp_count = await asyncio.gather(
+             to_t_future, to_s_future, kind_future, trad_count_future, simp_count_future
+         )
+
+         # Determine base state
+         if text == to_t:
+             base = "T"
+         elif text == to_s:
+             base = "S"
+         else:
+             base = "M"
+
+         # Calculate ratios
+         total = trad_count + simp_count or 1
+         trad_ratio = trad_count / total
+         simp_ratio = simp_count / total
+
+         # Decision logic
+         if base == "T" and kind in (self.TRADITIONAL, self.BOTH):
+             return "zh-TW"
+         if base == "S" and kind in (self.SIMPLIFIED, self.BOTH):
+             return "zh-CN"
+         if trad_ratio > 0.6 and trad_ratio > simp_ratio:
+             return "zh-TW"
+         if simp_ratio > 0.6 and simp_ratio > trad_ratio:
+             return "zh-CN"
+         return "zh-TW" if trad_ratio >= simp_ratio else "zh-CN"
+
+     async def adetect_by_conversion_analysis(self, text: str) -> str:
+         """
+         Quick Chinese locale detection using conversion-based analysis (asynchronous).
+         Uses a combination of:
+         - Character set membership check
+         - OpenCC conversion comparison
+         Faster, but might be less accurate for mixed text.
+         """
+         # Run both detection methods in parallel using asyncio
+         loop = asyncio.get_event_loop()
+         hanzi_task = loop.run_in_executor(None, self._sync_identify_zh_hanzi, text)
+         opencc_task = loop.run_in_executor(None, self._sync_identify_zh_opencc, text)
+
+         # Wait for both tasks to complete
+         hanzi_result, opencc_result = await asyncio.gather(hanzi_task, opencc_task)
+
+         if hanzi_result == "unknown":
+             return "zh-CN"
+         if opencc_result == "zh-TW":
+             return "zh-TW"
+         return "zh-CN"
+
+     async def adetect_hybrid(self, text: str, mode: Literal["conversion", "ratio", "both"] = "both") -> str:
+         """
+         Hybrid async detection method that can use both approaches.
+
+         Args:
+             text: Input text to analyze
+             mode: Detection mode:
+                 - 'conversion': Use conversion-based analysis (faster)
+                 - 'ratio': Use ratio-based analysis (more accurate)
+                 - 'both': Use both approaches and combine results (default)
+
+         Returns:
+             str: 'zh-TW' for Traditional Chinese, 'zh-CN' for Simplified Chinese
+         """
+         if not text.strip():
+             return "zh-TW"  # Default for empty text
+
+         if mode == "conversion":
+             return await self.adetect_by_conversion_analysis(text)
+         if mode == "ratio":
+             result = await self.adetect_by_ratio_analysis(text)
+             return result if result else "zh-TW"
+         # mode == "both"
+         # Run both detection methods in parallel
+         conversion_task = self.adetect_by_conversion_analysis(text)
+         ratio_task = self.adetect_by_ratio_analysis(text)
+
+         conversion_result, ratio_result = await asyncio.gather(conversion_task, ratio_task)
+
+         # If the ratio-based method returns None or the results differ, trust the conversion method
+         if not ratio_result or conversion_result != ratio_result:
+             return conversion_result
+
+         return ratio_result
+
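Since the `adetect_*` methods are coroutines, they need an event loop. A minimal driver, with sample strings chosen for illustration and the locales the cascade is expected to produce:

    import asyncio

    async def main() -> None:
        detector = LocaleDetector()
        print(await detector.adetect_hybrid("這是一段繁體中文"))                      # expected: zh-TW
        print(await detector.adetect_hybrid("这是一段简体中文", mode="conversion"))  # expected: zh-CN

    asyncio.run(main())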
+     async def detect(self, text: str) -> str:
+         """
+         Detect the locale of text and map it to an i18n code.
+         First identifies the language with FastText, then applies Chinese variant
+         detection if the text is Chinese.
+
+         Args:
+             text: Input text to analyze
+
+         Returns:
+             str: Detected i18n locale code (e.g., 'en-US', 'zh-TW', 'zh-CN', 'ja', etc.)
+         """
+         # First detect language with FastText
+         lang_result = self.detect_language(text)
+         language = lang_result["lang"]
+
+         # If not Chinese, map the FastText result to i18n code
+         if language != "zh":
+             return self._map_to_i18n_code(language)
+
+         # For Chinese, use specialized detection with the ratio method;
+         # the Chinese variants are already i18n codes, so no further mapping is needed
+         chinese_locale = await self.adetect_by_ratio_analysis(text)
+         return chinese_locale if chinese_locale else "zh-TW"  # Default to zh-TW if detection fails
+
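Note that `detect` is itself a coroutine despite its synchronous-sounding name (`adetect` below simply delegates to it), so it must be awaited with the same driver pattern as above. Illustrative calls from inside an async function:

    print(await detector.detect("Guten Morgen"))  # FastText yields "de", then LOCALE_MAP maps it
    print(await detector.detect("今天天氣很好"))  # "zh" -> ratio analysis -> "zh-TW"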
+     async def adetect(self, text: str) -> str:
+         """
+         Asynchronously detect the locale of text and map it to an i18n code.
+         First identifies the language with FastText, then applies Chinese variant
+         detection if the text is Chinese.
+
+         Args:
+             text: Input text to analyze
+
+         Returns:
+             str: Detected i18n locale code (e.g., 'en-US', 'zh-TW', 'zh-CN', 'ja', etc.)
+         """
+         return await self.detect(text)
+
+     async def adetect_with_details(self, text: str, mode: Literal["conversion", "ratio", "both"] = "ratio") -> dict[str, str | float]:
+         """
+         Detect the locale of text with detailed information and map it to an i18n code.
+
+         Args:
+             text: Input text to analyze
+             mode: Detection mode for Chinese texts:
+                 - 'conversion': Use conversion-based analysis (faster)
+                 - 'ratio': Use ratio-based analysis (more accurate - default)
+                 - 'both': Use both approaches and combine results (mixture)
+
+         Returns:
+             dict: {
+                 "locale": detected i18n locale code,
+                 "language": base language code,
+                 "score": confidence score
+             }
+         """
+         # First detect language with FastText
+         lang_result = self.detect_language(text)
+         language = lang_result["lang"]
+         score = lang_result["score"]
+
+         # If not Chinese, map the FastText result to i18n code
+         if language != "zh":
+             i18n_code = self._map_to_i18n_code(language)
+             return {"locale": i18n_code, "language": language, "score": score}
+
+         # For Chinese, use specialized detection based on mode
+         if mode == "conversion":
+             chinese_locale = await self.adetect_by_conversion_analysis(text)
+         elif mode == "ratio":
+             chinese_locale = await self.adetect_by_ratio_analysis(text)
+             if not chinese_locale:
+                 chinese_locale = "zh-TW"  # Default
+         else:  # mode == "both"
+             chinese_locale = await self.adetect_hybrid(text, mode="both")
+
+         # Chinese variants are already i18n codes
+         return {"locale": chinese_locale, "language": "zh", "score": score}
+
+     async def adetect_batch(self, texts: list[str], mode: Literal["conversion", "ratio", "both"] = "ratio") -> list[str]:
+         """
+         Batch-process multiple texts concurrently and map the results to i18n codes.
+         First identifies the language with FastText for each text, then applies
+         Chinese variant detection to texts identified as Chinese.
+
+         Args:
+             texts: List of texts to analyze
+             mode: Detection mode for Chinese texts:
+                 - 'conversion': Use conversion-based analysis (faster)
+                 - 'ratio': Use ratio-based analysis (more accurate - default)
+                 - 'both': Use both approaches and combine results (mixture)
+
+         Returns:
+             list[str]: List of detected i18n locale codes
+         """
+         tasks = [self.adetect_with_details(text, mode=mode) for text in texts]
+         results = await asyncio.gather(*tasks)
+         return [result["locale"] for result in results]
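Finally, a usage sketch tying the batch API together (inputs and expected locales are illustrative, and assume LOCALE_MAP maps "en" to "en-US"):

    import asyncio

    async def main() -> None:
        detector = LocaleDetector()
        locales = await detector.adetect_batch(
            ["Hello world", "今天天氣很好", "今天天气很好"],
            mode="ratio",
        )
        print(locales)  # expected: ["en-US", "zh-TW", "zh-CN"]

    asyncio.run(main())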