jtcg_locale_detector 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +37 -0
- data/PACKAGING_SUMMARY.md +195 -0
- data/README.md +226 -0
- data/bin/locale-detector +159 -0
- data/jtcg_locale_detector.gemspec +48 -0
- data/lib/locale_detector/client.rb +163 -0
- data/lib/locale_detector/detector.rb +46 -0
- data/lib/locale_detector/version.rb +3 -0
- data/lib/locale_detector.rb +25 -0
- data/locale_detector.gemspec +46 -0
- data/python/cli.py +220 -0
- data/python/requirements.txt +8 -0
- data/python/src/__init__.py +10 -0
- data/python/src/__pycache__/__init__.cpython-311.pyc +0 -0
- data/python/src/__pycache__/__init__.cpython-313.pyc +0 -0
- data/python/src/__pycache__/locale_data.cpython-311.pyc +0 -0
- data/python/src/__pycache__/locale_data.cpython-313.pyc +0 -0
- data/python/src/__pycache__/locale_detector.cpython-311.pyc +0 -0
- data/python/src/__pycache__/locale_detector.cpython-313.pyc +0 -0
- data/python/src/artifacts/fasttext/lid.176.bin +0 -0
- data/python/src/artifacts/fasttext/lid.176.ftz +0 -0
- data/python/src/download_fasttext.py +69 -0
- data/python/src/locale_data.py +178 -0
- data/python/src/locale_detector.py +534 -0
- data/python/src/locale_detector_c.c +403 -0
- data/python/src/locale_detector_c.h +37 -0
- data/python/src/locale_detector_cy.cpp +23126 -0
- data/python/src/locale_detector_cy.cpython-311-darwin.so +0 -0
- data/python/src/locale_detector_cy.cpython-313-darwin.so +0 -0
- data/python/src/locale_detector_cy.html +6460 -0
- data/python/src/locale_detector_cy.pyx +501 -0
- data/python/src/utils/__init__.py +1 -0
- data/python/src/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- data/python/src/utils/__pycache__/__init__.cpython-313.pyc +0 -0
- data/python/src/utils/__pycache__/data_utils.cpython-311.pyc +0 -0
- data/python/src/utils/__pycache__/data_utils.cpython-313.pyc +0 -0
- data/python/src/utils/data_utils.py +50 -0
- data/python/src/utils/data_utils_cy.cpp +10086 -0
- data/python/src/utils/data_utils_cy.cpython-311-darwin.so +0 -0
- data/python/src/utils/data_utils_cy.cpython-313-darwin.so +0 -0
- data/python/src/utils/data_utils_cy.html +600 -0
- data/python/src/utils/data_utils_cy.pyx +94 -0
- data/python/src/zhon/__init__.py +7 -0
- data/python/src/zhon/__pycache__/__init__.cpython-311.pyc +0 -0
- data/python/src/zhon/__pycache__/hanzi.cpython-311.pyc +0 -0
- data/python/src/zhon/__pycache__/pinyin.cpython-311.pyc +0 -0
- data/python/src/zhon/__pycache__/zhuyin.cpython-311.pyc +0 -0
- data/python/src/zhon/cedict/__init__.py +14 -0
- data/python/src/zhon/cedict/__pycache__/__init__.cpython-311.pyc +0 -0
- data/python/src/zhon/cedict/__pycache__/all.cpython-311.pyc +0 -0
- data/python/src/zhon/cedict/__pycache__/simplified.cpython-311.pyc +0 -0
- data/python/src/zhon/cedict/__pycache__/traditional.cpython-311.pyc +0 -0
- data/python/src/zhon/cedict/all.py +4 -0
- data/python/src/zhon/cedict/simplified.py +4 -0
- data/python/src/zhon/cedict/traditional.py +4 -0
- data/python/src/zhon/hanzi.py +81 -0
- data/python/src/zhon/pinyin.py +187 -0
- data/python/src/zhon/zhuyin.py +46 -0
- metadata +198 -0
|
@@ -0,0 +1,534 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
import sys
|
|
5
|
+
import shutil
|
|
6
|
+
from functools import lru_cache
|
|
7
|
+
from typing import Literal
|
|
8
|
+
|
|
9
|
+
import fasttext
|
|
10
|
+
import requests
|
|
11
|
+
from opencc import OpenCC
|
|
12
|
+
from zhon import cedict
|
|
13
|
+
|
|
14
|
+
from .locale_data import LOCALE_MAP
|
|
15
|
+
from .utils.data_utils import logger
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class LocaleDetector:
    """Detect the locale of arbitrary text, with dedicated zh-TW/zh-CN handling.

    FastText performs the initial language identification. Text classified as
    Chinese ("zh") is analyzed further to decide between Traditional (zh-TW)
    and Simplified (zh-CN) Chinese using a combination of:

    - character-set membership (Traditional vs. Simplified character counts),
    - OpenCC round-trip conversion comparison,
    - Traditional/Simplified character-ratio analysis.

    Detected language codes are normalized to i18n codes via LOCALE_MAP.
    """

    # Labels describing which Chinese character set a string belongs to.
    UNKNOWN = "UNKNOWN"
    TRADITIONAL = "TRADITIONAL"
    SIMPLIFIED = "SIMPLIFIED"
    BOTH = "BOTH"
    MIXED = "MIXED"

    # Class-level cache of loaded FastText models (one slot per memory profile),
    # shared by every detector instance.
    MODELS = {"low_mem": None, "high_mem": None}
    # Model artifacts directory shipped next to this module.
    FTLANG_CACHE = os.path.join(os.path.dirname(__file__), "artifacts", "fasttext")

    # Fallback locale used when a detected code cannot be mapped.
    DEFAULT_LOCALE = "en-US"
def __init__(self, low_memory: bool = False):
|
|
50
|
+
"""
|
|
51
|
+
Initialize the locale detector.
|
|
52
|
+
|
|
53
|
+
Args:
|
|
54
|
+
low_memory: Whether to use the smaller FastText model (lid.176.ftz)
|
|
55
|
+
instead of the full model (lid.176.bin)
|
|
56
|
+
"""
|
|
57
|
+
# Initialize converters for Chinese detection
|
|
58
|
+
self.cc_s2t = OpenCC("s2t")
|
|
59
|
+
self.cc_t2s = OpenCC("t2s")
|
|
60
|
+
|
|
61
|
+
# Constants for character sets
|
|
62
|
+
self.TRAD = set(cedict.traditional)
|
|
63
|
+
self.SIMP = set(cedict.simplified)
|
|
64
|
+
self.SHARED = self.TRAD.intersection(self.SIMP)
|
|
65
|
+
self.ALL_HAN = cedict.all
|
|
66
|
+
self.HANZI_RE = re.compile(f"[^{self.ALL_HAN}]")
|
|
67
|
+
|
|
68
|
+
# FastText configuration
|
|
69
|
+
self.low_memory = low_memory
|
|
70
|
+
|
|
71
|
+
# Load FastText model (required)
|
|
72
|
+
self._load_fasttext_model()
|
|
73
|
+
|
|
74
|
+
def _download_model(self, name: str) -> str:
|
|
75
|
+
"""
|
|
76
|
+
Get the FastText model path, download if not already cached.
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
name: Model filename to get/download
|
|
80
|
+
|
|
81
|
+
Returns:
|
|
82
|
+
str: Path to the model file
|
|
83
|
+
"""
|
|
84
|
+
# 模型快取目錄
|
|
85
|
+
cache_dir = os.path.expanduser("~/.cache/locale-detector")
|
|
86
|
+
os.makedirs(cache_dir, exist_ok=True)
|
|
87
|
+
cache_path = os.path.join(cache_dir, name)
|
|
88
|
+
|
|
89
|
+
# 若快取已存在,優先使用
|
|
90
|
+
if os.path.exists(cache_path):
|
|
91
|
+
logger.info({"Using cached FastText model": cache_path})
|
|
92
|
+
return cache_path
|
|
93
|
+
|
|
94
|
+
# 若在 PyInstaller 執行環境,將模型從 sys._MEIPASS 複製到快取
|
|
95
|
+
if hasattr(sys, "_MEIPASS"):
|
|
96
|
+
pyinstaller_model_path = os.path.join(sys._MEIPASS, "src", "artifacts", "fasttext", name)
|
|
97
|
+
if os.path.exists(pyinstaller_model_path):
|
|
98
|
+
shutil.copy2(pyinstaller_model_path, cache_path)
|
|
99
|
+
logger.info({"Copied FastText model from bundle to cache": cache_path})
|
|
100
|
+
return cache_path
|
|
101
|
+
|
|
102
|
+
target_path = os.path.join(self.FTLANG_CACHE, name)
|
|
103
|
+
|
|
104
|
+
# Check if model exists locally
|
|
105
|
+
if os.path.exists(target_path):
|
|
106
|
+
logger.info({"Using local FastText model": target_path})
|
|
107
|
+
return target_path
|
|
108
|
+
|
|
109
|
+
# Try to create cache directory and download
|
|
110
|
+
try:
|
|
111
|
+
logger.info({"Downloading FastText model": name})
|
|
112
|
+
url = f"https://dl.fbaipublicfiles.com/fasttext/supervised-models/{name}"
|
|
113
|
+
os.makedirs(self.FTLANG_CACHE, exist_ok=True)
|
|
114
|
+
|
|
115
|
+
response = requests.get(url, timeout=300)
|
|
116
|
+
response.raise_for_status()
|
|
117
|
+
|
|
118
|
+
with open(target_path, "wb") as fp:
|
|
119
|
+
fp.write(response.content)
|
|
120
|
+
logger.info({"Downloaded FastText model": target_path})
|
|
121
|
+
return target_path
|
|
122
|
+
|
|
123
|
+
except Exception as e:
|
|
124
|
+
logger.error({"FastText model download failed": str(e)})
|
|
125
|
+
raise RuntimeError(f"Failed to download FastText model {name}: {e}")
|
|
126
|
+
|
|
127
|
+
def _load_fasttext_model(self):
|
|
128
|
+
"""
|
|
129
|
+
Load the FastText language detection model.
|
|
130
|
+
This is required for the detector to function.
|
|
131
|
+
"""
|
|
132
|
+
try:
|
|
133
|
+
if self.low_memory:
|
|
134
|
+
if not self.MODELS.get("low_mem"):
|
|
135
|
+
model_path = self._download_model("lid.176.ftz")
|
|
136
|
+
self.MODELS["low_mem"] = fasttext.load_model(model_path)
|
|
137
|
+
self.ft_model = self.MODELS["low_mem"]
|
|
138
|
+
else:
|
|
139
|
+
if not self.MODELS.get("high_mem"):
|
|
140
|
+
model_path = self._download_model("lid.176.bin")
|
|
141
|
+
self.MODELS["high_mem"] = fasttext.load_model(model_path)
|
|
142
|
+
self.ft_model = self.MODELS["high_mem"]
|
|
143
|
+
logger.info({"FastText model loaded": self.low_memory})
|
|
144
|
+
except Exception as e:
|
|
145
|
+
logger.error({"Failed to load FastText model": str(e)})
|
|
146
|
+
raise RuntimeError({"FastText model loading failed": str(e)})
|
|
147
|
+
|
|
148
|
+
def _preprocess_text(self, text: str) -> str:
|
|
149
|
+
"""
|
|
150
|
+
Preprocess text to handle escape characters that might break FastText.
|
|
151
|
+
|
|
152
|
+
Args:
|
|
153
|
+
text: Input text to preprocess
|
|
154
|
+
|
|
155
|
+
Returns:
|
|
156
|
+
str: Preprocessed text safe for FastText
|
|
157
|
+
"""
|
|
158
|
+
if not text:
|
|
159
|
+
return ""
|
|
160
|
+
|
|
161
|
+
# Replace problematic escape sequences
|
|
162
|
+
text = text.replace("\n", " ")
|
|
163
|
+
text = text.replace("\r", " ")
|
|
164
|
+
text = text.replace("\t", " ")
|
|
165
|
+
|
|
166
|
+
# Remove multiple spaces
|
|
167
|
+
return " ".join(text.split())
|
|
168
|
+
|
|
169
|
+
def _map_to_i18n_code(self, lang_code: str) -> str:
|
|
170
|
+
"""
|
|
171
|
+
Map a detected language code to an i18n code using LOCALE_MAP.
|
|
172
|
+
|
|
173
|
+
Args:
|
|
174
|
+
lang_code: The language code to map
|
|
175
|
+
|
|
176
|
+
Returns:
|
|
177
|
+
str: The mapped i18n code or the original code if mapping fails
|
|
178
|
+
"""
|
|
179
|
+
if lang_code in LOCALE_MAP:
|
|
180
|
+
i18n_code = LOCALE_MAP[lang_code]["i18n_code"]
|
|
181
|
+
if i18n_code != "not_supported":
|
|
182
|
+
return i18n_code
|
|
183
|
+
|
|
184
|
+
# Special case for Chinese variants which aren't in the map
|
|
185
|
+
if lang_code in ["zh-TW", "zh-CN"]:
|
|
186
|
+
return lang_code
|
|
187
|
+
|
|
188
|
+
# If mapping fails, return the original code or default
|
|
189
|
+
return lang_code if lang_code else self.DEFAULT_LOCALE
|
|
190
|
+
|
|
191
|
+
def detect_language(self, text: str) -> dict[str, str | float]:
|
|
192
|
+
"""
|
|
193
|
+
Detect the language of the text using FastText.
|
|
194
|
+
|
|
195
|
+
Args:
|
|
196
|
+
text: Input text to analyze
|
|
197
|
+
|
|
198
|
+
Returns:
|
|
199
|
+
dict: {"lang": detected language code, "score": confidence score}
|
|
200
|
+
"""
|
|
201
|
+
if not text or not text.strip():
|
|
202
|
+
return {"lang": "unknown", "score": 0.0}
|
|
203
|
+
|
|
204
|
+
# Preprocess text to handle escape characters
|
|
205
|
+
processed_text = self._preprocess_text(text)
|
|
206
|
+
if not processed_text:
|
|
207
|
+
return {"lang": "unknown", "score": 0.0}
|
|
208
|
+
|
|
209
|
+
try:
|
|
210
|
+
labels, scores = self.ft_model.predict(processed_text)
|
|
211
|
+
label = labels[0].replace("__label__", "")
|
|
212
|
+
score = min(float(scores[0]), 1.0)
|
|
213
|
+
return {"lang": label, "score": score}
|
|
214
|
+
except Exception as e:
|
|
215
|
+
logger.error({"FastText prediction error": str(e)})
|
|
216
|
+
return {"lang": "unknown", "score": 0.0}
|
|
217
|
+
|
|
218
|
+
def extract_hanzi(self, s: str) -> set[str]:
|
|
219
|
+
"""Extract only Chinese characters from text."""
|
|
220
|
+
return set(self.HANZI_RE.sub("", s))
|
|
221
|
+
|
|
222
|
+
def identify(self, s: str) -> str:
|
|
223
|
+
"""
|
|
224
|
+
Identify what kind of Chinese characters a string contains.
|
|
225
|
+
|
|
226
|
+
Returns:
|
|
227
|
+
str: One of TRADITIONAL, SIMPLIFIED, BOTH, MIXED, or UNKNOWN
|
|
228
|
+
"""
|
|
229
|
+
chinese = self.extract_hanzi(s)
|
|
230
|
+
if not chinese:
|
|
231
|
+
return self.UNKNOWN
|
|
232
|
+
if chinese.issubset(self.SHARED):
|
|
233
|
+
return self.BOTH
|
|
234
|
+
if chinese.issubset(self.TRAD):
|
|
235
|
+
return self.TRADITIONAL
|
|
236
|
+
if chinese.issubset(self.SIMP):
|
|
237
|
+
return self.SIMPLIFIED
|
|
238
|
+
return self.MIXED
|
|
239
|
+
|
|
240
|
+
def is_traditional(self, s: str) -> bool:
|
|
241
|
+
"""Check if a string's Chinese characters are Traditional."""
|
|
242
|
+
chinese_chars = self.extract_hanzi(s)
|
|
243
|
+
if not chinese_chars:
|
|
244
|
+
return False
|
|
245
|
+
if chinese_chars.issubset(self.SHARED):
|
|
246
|
+
return True
|
|
247
|
+
return bool(chinese_chars.issubset(self.TRAD))
|
|
248
|
+
|
|
249
|
+
def is_simplified(self, s: str) -> bool:
|
|
250
|
+
"""Check if a string's Chinese characters are Simplified."""
|
|
251
|
+
chinese_chars = self.extract_hanzi(s)
|
|
252
|
+
if not chinese_chars:
|
|
253
|
+
return False
|
|
254
|
+
if chinese_chars.issubset(self.SHARED):
|
|
255
|
+
return True
|
|
256
|
+
return bool(chinese_chars.issubset(self.SIMP))
|
|
257
|
+
|
|
258
|
+
def _sync_count_trad(self, text: str) -> int:
|
|
259
|
+
"""Synchronously count traditional characters."""
|
|
260
|
+
return sum(1 for ch in text if ch in self.TRAD)
|
|
261
|
+
|
|
262
|
+
def _sync_count_simp(self, text: str) -> int:
|
|
263
|
+
"""Synchronously count simplified characters."""
|
|
264
|
+
return sum(1 for ch in text if ch in self.SIMP)
|
|
265
|
+
|
|
266
|
+
def _sync_identify_zh_hanzi(self, text: str) -> str:
|
|
267
|
+
"""Identify Chinese type using character set analysis."""
|
|
268
|
+
if self.is_simplified(text):
|
|
269
|
+
return "zh-CN"
|
|
270
|
+
if self.is_traditional(text):
|
|
271
|
+
return "zh-TW"
|
|
272
|
+
return "unknown"
|
|
273
|
+
|
|
274
|
+
def _sync_identify_zh_opencc(self, text: str) -> str:
|
|
275
|
+
"""Identify Chinese type using pure OpenCC comparison."""
|
|
276
|
+
# If text remains unchanged after converting to traditional
|
|
277
|
+
if text == self.cc_s2t.convert(text):
|
|
278
|
+
return "zh-TW" # it's already traditional
|
|
279
|
+
# If text remains unchanged after converting to simplified
|
|
280
|
+
if text == self.cc_t2s.convert(text):
|
|
281
|
+
return "zh-CN" # it's already simplified
|
|
282
|
+
return "zh-TW" # Default to Traditional if mixed
|
|
283
|
+
|
|
284
|
+
@lru_cache(maxsize=1024)
|
|
285
|
+
def detect_by_ratio_analysis(self, text: str) -> str | None:
|
|
286
|
+
"""
|
|
287
|
+
Detect Chinese locale using ratio-based analysis (synchronous).
|
|
288
|
+
Uses a combination of:
|
|
289
|
+
- Character ratio analysis (Traditional vs Simplified count)
|
|
290
|
+
- OpenCC conversion comparison
|
|
291
|
+
- Character set analysis
|
|
292
|
+
"""
|
|
293
|
+
# Extract Chinese characters
|
|
294
|
+
hanzi = self.extract_hanzi(text)
|
|
295
|
+
if not hanzi:
|
|
296
|
+
return None
|
|
297
|
+
|
|
298
|
+
# OpenCC shape detection
|
|
299
|
+
to_t = self.cc_s2t.convert(text)
|
|
300
|
+
to_s = self.cc_t2s.convert(text)
|
|
301
|
+
if text == to_t:
|
|
302
|
+
base = "T"
|
|
303
|
+
elif text == to_s:
|
|
304
|
+
base = "S"
|
|
305
|
+
else:
|
|
306
|
+
base = "M"
|
|
307
|
+
|
|
308
|
+
# Character set analysis
|
|
309
|
+
kind = self.identify(text)
|
|
310
|
+
|
|
311
|
+
# Calculate Traditional/Simplified ratio
|
|
312
|
+
trad_count = self._sync_count_trad(text)
|
|
313
|
+
simp_count = self._sync_count_simp(text)
|
|
314
|
+
total = trad_count + simp_count or 1
|
|
315
|
+
trad_ratio = trad_count / total
|
|
316
|
+
simp_ratio = simp_count / total
|
|
317
|
+
|
|
318
|
+
# Decision logic
|
|
319
|
+
if base == "T" and kind in (self.TRADITIONAL, self.BOTH):
|
|
320
|
+
return "zh-TW"
|
|
321
|
+
if base == "S" and kind in (self.SIMPLIFIED, self.BOTH):
|
|
322
|
+
return "zh-CN"
|
|
323
|
+
if trad_ratio > 0.6 and trad_ratio > simp_ratio:
|
|
324
|
+
return "zh-TW"
|
|
325
|
+
if simp_ratio > 0.6 and simp_ratio > trad_ratio:
|
|
326
|
+
return "zh-CN"
|
|
327
|
+
return "zh-TW" if trad_ratio >= simp_ratio else "zh-CN"
|
|
328
|
+
|
|
329
|
+
async def adetect_by_ratio_analysis(self, text: str) -> str | None:
|
|
330
|
+
"""
|
|
331
|
+
Detect Chinese locale using ratio-based analysis (asynchronous).
|
|
332
|
+
Uses the same comprehensive analysis as detect_by_ratio_analysis but
|
|
333
|
+
parallelizes CPU-intensive operations for better performance.
|
|
334
|
+
"""
|
|
335
|
+
# Extract Chinese characters first (this is fast enough to do synchronously)
|
|
336
|
+
hanzi = self.extract_hanzi(text)
|
|
337
|
+
if not hanzi:
|
|
338
|
+
return None
|
|
339
|
+
|
|
340
|
+
loop = asyncio.get_event_loop()
|
|
341
|
+
|
|
342
|
+
# Run CPU-intensive operations in thread pool
|
|
343
|
+
to_t_future = loop.run_in_executor(None, self.cc_s2t.convert, text)
|
|
344
|
+
to_s_future = loop.run_in_executor(None, self.cc_t2s.convert, text)
|
|
345
|
+
kind_future = loop.run_in_executor(None, self.identify, text)
|
|
346
|
+
trad_count_future = loop.run_in_executor(None, self._sync_count_trad, text)
|
|
347
|
+
simp_count_future = loop.run_in_executor(None, self._sync_count_simp, text)
|
|
348
|
+
|
|
349
|
+
# Wait for all thread pool tasks to complete
|
|
350
|
+
to_t, to_s, kind, trad_count, simp_count = await asyncio.gather(
|
|
351
|
+
to_t_future, to_s_future, kind_future, trad_count_future, simp_count_future
|
|
352
|
+
)
|
|
353
|
+
|
|
354
|
+
# Determine base state
|
|
355
|
+
if text == to_t:
|
|
356
|
+
base = "T"
|
|
357
|
+
elif text == to_s:
|
|
358
|
+
base = "S"
|
|
359
|
+
else:
|
|
360
|
+
base = "M"
|
|
361
|
+
|
|
362
|
+
# Calculate ratios
|
|
363
|
+
total = trad_count + simp_count or 1
|
|
364
|
+
trad_ratio = trad_count / total
|
|
365
|
+
simp_ratio = simp_count / total
|
|
366
|
+
|
|
367
|
+
# Decision logic
|
|
368
|
+
if base == "T" and kind in (self.TRADITIONAL, self.BOTH):
|
|
369
|
+
return "zh-TW"
|
|
370
|
+
if base == "S" and kind in (self.SIMPLIFIED, self.BOTH):
|
|
371
|
+
return "zh-CN"
|
|
372
|
+
if trad_ratio > 0.6 and trad_ratio > simp_ratio:
|
|
373
|
+
return "zh-TW"
|
|
374
|
+
if simp_ratio > 0.6 and simp_ratio > trad_ratio:
|
|
375
|
+
return "zh-CN"
|
|
376
|
+
return "zh-TW" if trad_ratio >= simp_ratio else "zh-CN"
|
|
377
|
+
|
|
378
|
+
async def adetect_by_conversion_analysis(self, text: str) -> str:
|
|
379
|
+
"""
|
|
380
|
+
Quick Chinese locale detection using conversion-based analysis (asynchronous).
|
|
381
|
+
Uses a combination of:
|
|
382
|
+
- Character set membership check
|
|
383
|
+
- OpenCC conversion comparison
|
|
384
|
+
Faster but might be less accurate for mixed text.
|
|
385
|
+
"""
|
|
386
|
+
# Run both detection methods in parallel using asyncio
|
|
387
|
+
loop = asyncio.get_event_loop()
|
|
388
|
+
hanzi_task = loop.run_in_executor(None, self._sync_identify_zh_hanzi, text)
|
|
389
|
+
opencc_task = loop.run_in_executor(None, self._sync_identify_zh_opencc, text)
|
|
390
|
+
|
|
391
|
+
# Wait for both tasks to complete
|
|
392
|
+
hanzi_result, opencc_result = await asyncio.gather(hanzi_task, opencc_task)
|
|
393
|
+
|
|
394
|
+
if hanzi_result == "unknown":
|
|
395
|
+
return "zh-CN"
|
|
396
|
+
if opencc_result == "zh-TW":
|
|
397
|
+
return "zh-TW"
|
|
398
|
+
return "zh-CN"
|
|
399
|
+
|
|
400
|
+
async def adetect_hybrid(self, text: str, mode: Literal["conversion", "ratio", "both"] = "both") -> str:
|
|
401
|
+
"""
|
|
402
|
+
Hybrid async detection method that can use both approaches.
|
|
403
|
+
|
|
404
|
+
Args:
|
|
405
|
+
text: Input text to analyze
|
|
406
|
+
mode: Detection mode:
|
|
407
|
+
- 'conversion': Use conversion-based analysis (faster)
|
|
408
|
+
- 'ratio': Use ratio-based analysis (more accurate)
|
|
409
|
+
- 'both': Use both approaches and combine results (default)
|
|
410
|
+
|
|
411
|
+
Returns:
|
|
412
|
+
str: 'zh-TW' for Traditional Chinese, 'zh-CN' for Simplified Chinese
|
|
413
|
+
"""
|
|
414
|
+
if not text.strip():
|
|
415
|
+
return "zh-TW" # Default for empty text
|
|
416
|
+
|
|
417
|
+
if mode == "conversion":
|
|
418
|
+
return await self.adetect_by_conversion_analysis(text)
|
|
419
|
+
if mode == "ratio":
|
|
420
|
+
result = await self.adetect_by_ratio_analysis(text)
|
|
421
|
+
return result if result else "zh-TW"
|
|
422
|
+
# mode == "both"
|
|
423
|
+
# Run both detection methods in parallel
|
|
424
|
+
conversion_task = self.adetect_by_conversion_analysis(text)
|
|
425
|
+
ratio_task = self.adetect_by_ratio_analysis(text)
|
|
426
|
+
|
|
427
|
+
conversion_result, ratio_result = await asyncio.gather(conversion_task, ratio_task)
|
|
428
|
+
|
|
429
|
+
# If ratio-based method returns None or results differ, trust conversion method
|
|
430
|
+
if not ratio_result or conversion_result != ratio_result:
|
|
431
|
+
return conversion_result
|
|
432
|
+
|
|
433
|
+
return ratio_result
|
|
434
|
+
|
|
435
|
+
async def detect(self, text: str) -> str:
|
|
436
|
+
"""
|
|
437
|
+
Detect the locale of text and map to i18n code.
|
|
438
|
+
First identifies language with FastText, then applies Chinese variant
|
|
439
|
+
detection if the text is Chinese.
|
|
440
|
+
|
|
441
|
+
Args:
|
|
442
|
+
text: Input text to analyze
|
|
443
|
+
|
|
444
|
+
Returns:
|
|
445
|
+
str: Detected i18n locale code (e.g., 'en-US', 'zh-TW', 'zh-CN', 'ja', etc.)
|
|
446
|
+
"""
|
|
447
|
+
# First detect language with FastText
|
|
448
|
+
lang_result = self.detect_language(text)
|
|
449
|
+
language = lang_result["lang"]
|
|
450
|
+
|
|
451
|
+
# If not Chinese, map the FastText result to i18n code
|
|
452
|
+
if language != "zh":
|
|
453
|
+
return self._map_to_i18n_code(language)
|
|
454
|
+
|
|
455
|
+
# For Chinese, use specialized detection with the ratio method
|
|
456
|
+
chinese_locale = await self.adetect_by_ratio_analysis(text)
|
|
457
|
+
return chinese_locale if chinese_locale else "zh-TW" # Default to zh-TW if detection fails
|
|
458
|
+
|
|
459
|
+
# Chinese variants are already i18n codes
|
|
460
|
+
|
|
461
|
+
async def adetect(self, text: str) -> str:
|
|
462
|
+
"""
|
|
463
|
+
Asynchronously detect the locale of text and map to i18n code.
|
|
464
|
+
First identifies language with FastText, then applies Chinese variant
|
|
465
|
+
detection if the text is Chinese.
|
|
466
|
+
|
|
467
|
+
Args:
|
|
468
|
+
text: Input text to analyze
|
|
469
|
+
|
|
470
|
+
Returns:
|
|
471
|
+
str: Detected i18n locale code (e.g., 'en-US', 'zh-TW', 'zh-CN', 'ja', etc.)
|
|
472
|
+
"""
|
|
473
|
+
return await self.detect(text)
|
|
474
|
+
|
|
475
|
+
async def adetect_with_details(self, text: str, mode: Literal["conversion", "ratio", "both"] = "ratio") -> dict[str, str | float]:
|
|
476
|
+
"""
|
|
477
|
+
Detect the locale of text with detailed information and map to i18n code.
|
|
478
|
+
|
|
479
|
+
Args:
|
|
480
|
+
text: Input text to analyze
|
|
481
|
+
mode: Detection mode for Chinese texts:
|
|
482
|
+
- 'conversion': Use conversion-based analysis (faster)
|
|
483
|
+
- 'ratio': Use ratio-based analysis (more accurate - default)
|
|
484
|
+
- 'both': Use both approaches and combine results (mixture)
|
|
485
|
+
|
|
486
|
+
Returns:
|
|
487
|
+
dict: {
|
|
488
|
+
"locale": detected i18n locale code,
|
|
489
|
+
"language": base language code,
|
|
490
|
+
"score": confidence score
|
|
491
|
+
}
|
|
492
|
+
"""
|
|
493
|
+
# First detect language with FastText
|
|
494
|
+
lang_result = self.detect_language(text)
|
|
495
|
+
language = lang_result["lang"]
|
|
496
|
+
score = lang_result["score"]
|
|
497
|
+
|
|
498
|
+
# If not Chinese, map the FastText result to i18n code
|
|
499
|
+
if language != "zh":
|
|
500
|
+
i18n_code = self._map_to_i18n_code(language)
|
|
501
|
+
return {"locale": i18n_code, "language": language, "score": score}
|
|
502
|
+
|
|
503
|
+
# For Chinese, use specialized detection based on mode
|
|
504
|
+
if mode == "conversion":
|
|
505
|
+
chinese_locale = await self.adetect_by_conversion_analysis(text)
|
|
506
|
+
elif mode == "ratio":
|
|
507
|
+
chinese_locale = await self.adetect_by_ratio_analysis(text)
|
|
508
|
+
if not chinese_locale:
|
|
509
|
+
chinese_locale = "zh-TW" # Default
|
|
510
|
+
else: # mode == "both"
|
|
511
|
+
chinese_locale = await self.adetect_hybrid(text, mode="both")
|
|
512
|
+
|
|
513
|
+
# Chinese variants are already i18n codes
|
|
514
|
+
return {"locale": chinese_locale, "language": "zh", "score": score}
|
|
515
|
+
|
|
516
|
+
async def adetect_batch(self, texts: list[str], mode: Literal["conversion", "ratio", "both"] = "ratio") -> list[str]:
|
|
517
|
+
"""
|
|
518
|
+
Batch process multiple texts concurrently and map results to i18n codes.
|
|
519
|
+
First identifies language with FastText for each text, then applies
|
|
520
|
+
Chinese variant detection for texts identified as Chinese.
|
|
521
|
+
|
|
522
|
+
Args:
|
|
523
|
+
texts: List of texts to analyze
|
|
524
|
+
mode: Detection mode for Chinese texts:
|
|
525
|
+
- 'conversion': Use conversion-based analysis (faster)
|
|
526
|
+
- 'ratio': Use ratio-based analysis (more accurate - default)
|
|
527
|
+
- 'both': Use both approaches and combine results (mixture)
|
|
528
|
+
|
|
529
|
+
Returns:
|
|
530
|
+
list[str]: List of detected i18n locale codes
|
|
531
|
+
"""
|
|
532
|
+
tasks = [self.adetect_with_details(text, mode=mode) for text in texts]
|
|
533
|
+
results = await asyncio.gather(*tasks)
|
|
534
|
+
return [result["locale"] for result in results]
|