jtcg_locale_detector 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +37 -0
- data/PACKAGING_SUMMARY.md +195 -0
- data/README.md +226 -0
- data/bin/locale-detector +159 -0
- data/jtcg_locale_detector.gemspec +48 -0
- data/lib/locale_detector/client.rb +163 -0
- data/lib/locale_detector/detector.rb +46 -0
- data/lib/locale_detector/version.rb +3 -0
- data/lib/locale_detector.rb +25 -0
- data/locale_detector.gemspec +46 -0
- data/python/cli.py +220 -0
- data/python/requirements.txt +8 -0
- data/python/src/__init__.py +10 -0
- data/python/src/__pycache__/__init__.cpython-311.pyc +0 -0
- data/python/src/__pycache__/__init__.cpython-313.pyc +0 -0
- data/python/src/__pycache__/locale_data.cpython-311.pyc +0 -0
- data/python/src/__pycache__/locale_data.cpython-313.pyc +0 -0
- data/python/src/__pycache__/locale_detector.cpython-311.pyc +0 -0
- data/python/src/__pycache__/locale_detector.cpython-313.pyc +0 -0
- data/python/src/artifacts/fasttext/lid.176.bin +0 -0
- data/python/src/artifacts/fasttext/lid.176.ftz +0 -0
- data/python/src/download_fasttext.py +69 -0
- data/python/src/locale_data.py +178 -0
- data/python/src/locale_detector.py +534 -0
- data/python/src/locale_detector_c.c +403 -0
- data/python/src/locale_detector_c.h +37 -0
- data/python/src/locale_detector_cy.cpp +23126 -0
- data/python/src/locale_detector_cy.cpython-311-darwin.so +0 -0
- data/python/src/locale_detector_cy.cpython-313-darwin.so +0 -0
- data/python/src/locale_detector_cy.html +6460 -0
- data/python/src/locale_detector_cy.pyx +501 -0
- data/python/src/utils/__init__.py +1 -0
- data/python/src/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- data/python/src/utils/__pycache__/__init__.cpython-313.pyc +0 -0
- data/python/src/utils/__pycache__/data_utils.cpython-311.pyc +0 -0
- data/python/src/utils/__pycache__/data_utils.cpython-313.pyc +0 -0
- data/python/src/utils/data_utils.py +50 -0
- data/python/src/utils/data_utils_cy.cpp +10086 -0
- data/python/src/utils/data_utils_cy.cpython-311-darwin.so +0 -0
- data/python/src/utils/data_utils_cy.cpython-313-darwin.so +0 -0
- data/python/src/utils/data_utils_cy.html +600 -0
- data/python/src/utils/data_utils_cy.pyx +94 -0
- data/python/src/zhon/__init__.py +7 -0
- data/python/src/zhon/__pycache__/__init__.cpython-311.pyc +0 -0
- data/python/src/zhon/__pycache__/hanzi.cpython-311.pyc +0 -0
- data/python/src/zhon/__pycache__/pinyin.cpython-311.pyc +0 -0
- data/python/src/zhon/__pycache__/zhuyin.cpython-311.pyc +0 -0
- data/python/src/zhon/cedict/__init__.py +14 -0
- data/python/src/zhon/cedict/__pycache__/__init__.cpython-311.pyc +0 -0
- data/python/src/zhon/cedict/__pycache__/all.cpython-311.pyc +0 -0
- data/python/src/zhon/cedict/__pycache__/simplified.cpython-311.pyc +0 -0
- data/python/src/zhon/cedict/__pycache__/traditional.cpython-311.pyc +0 -0
- data/python/src/zhon/cedict/all.py +4 -0
- data/python/src/zhon/cedict/simplified.py +4 -0
- data/python/src/zhon/cedict/traditional.py +4 -0
- data/python/src/zhon/hanzi.py +81 -0
- data/python/src/zhon/pinyin.py +187 -0
- data/python/src/zhon/zhuyin.py +46 -0
- metadata +198 -0
|
@@ -0,0 +1,501 @@
|
|
|
1
|
+
# cython: language_level=3
|
|
2
|
+
# cython: boundscheck=False
|
|
3
|
+
# cython: wraparound=False
|
|
4
|
+
# cython: nonecheck=False
|
|
5
|
+
# cython: cdivision=True
|
|
6
|
+
# cython: embedsignature=True
|
|
7
|
+
# cython: optimize.use_switch=True
|
|
8
|
+
# cython: optimize.unpack_method_calls=True
|
|
9
|
+
|
|
10
|
+
import asyncio
|
|
11
|
+
import os
|
|
12
|
+
import re
|
|
13
|
+
from functools import lru_cache
|
|
14
|
+
from typing import Literal
|
|
15
|
+
cimport cython
|
|
16
|
+
from libc.stdlib cimport malloc, free
|
|
17
|
+
from libc.string cimport strlen
|
|
18
|
+
|
|
19
|
+
import fasttext
|
|
20
|
+
import requests
|
|
21
|
+
from opencc import OpenCC
|
|
22
|
+
# Embed the Traditional/Simplified character sets directly, removing the dependency on zhon
|
|
23
|
+
TRAD_CHARS = """制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛丂七集河市弦喜嘴張舌堵區工業姊妹星架構巧彩扭歪拼湊餘熱曜武州爺浮屠美鄉老階樹葷素碎落能魄鰓鰻珠丄丅丆万俟丈尚摸母娘量管群亞虎必我堂令申件裝伏位博俠義界表女墟臺戲臭皮匠勝諸葛亮賽頂倍催請運算包立叉戟離疫苗土史志演圍揭瓦曬夷姑婆帝村寶爛尖杉鹼屜桌山岔島由紀峽壩庫鎮廢從德後拗湯治旬食明昧曹朋友框欄極權冪曲歸依貓民氟硼氯磷鐵江侗自旅法司洋浦梅園溫暖灣焦班幸用田略番疊皇炮捶硝苯酸腺苷稜草鏡穗跳遠索錦綱聚氰胺聯店胚膲愛色堇紫羅蘭芝茶飯菱雲蟲藏藩亂叛蘇親債凳學座恐戀柱測肌腹衩錐係貂企烏跪叩軍車農題迭都甘油屯奏鍵短阿姨陪姐隻顧茅廬槽駕魂鮮鹿頁其菜單乘任供勢午齒漢組織吊調瀉唇坡城報墳外夸將尉建築岸崗公床揚新劍昇杭林栗校樓標款汽社浣海商館劇院鋼華港機械廣媒環球融第醫科證券綜財樂育游漲猶嶺疏癮瞼確兵領導繳肢膛船艾瑟爾蒼蔡虞傚衫覆訪訴課諭議軌述野鉤限敵鞋頜頷顎饒首齦站例修凡劃垂屆屬崽頦廚拜挫擺放旋削棋榻檻禮沉注滑營獄畫确儀聘花葬詔員跌轄週達酒錨閘陷陸雨雪飛威丌于丹久乏予理評產亢卑亦乎舞己悲矩圓詞害誌但住佞佳便俗信票案幅翁倦倫假偏倚斜虧鬼敲停備傷脾胃僅此像儉匱免宜穴焉戴兼容許凍伯仲負彼晝皂軒輊實刊划顛衛戰哥比省非好黃飾別拘束掩奶睬選擇搖擾煩苦枚寫協厭及格受歡迎約只估侵犯割狀告或缺抗拒挽撤救藥喻磨滅端倪少逆逾越避靠適吉譽吝玉含延咎歹聽啻淵善謀均勻堪忍夠太惹妙妥妨孕症孝術室完納推冠積宣疑辯慄碴稱屈撓屑干涉衡待很忙惡忿怎麼怠急恥恭息悅惑惜惟想愉愧怍慌憤啟懂懈懷材才緊招認扣抵拉捨也罷插揣冒搭撞南牆擴核支攻敢雷攀敬裡嗎需景智暇曾罪遇朽枉止況競爭辱求癒渝溶濟左右袒困補爽特寂寞示弱找謝畏強疾徐痛癢冤符眠睦瞅董何厚云措活疲羞者輕玻璃祥兆禁移稂莠穩佛換答簡結果盟絕縷途給談否羈翼耐肖脛毋寧興舒若菲萊痕跡窠臼虛衰臉兔撒鷹棺範該詳諱抬泰讓鬚眉象眾貲賬費灰賴奇慮訓輟辨菽麥辛近送透逞徒速續逮捕遂遑違遜斧鉞艱醉鏽隨觀棄顯飽脂肪使丏丐幫丒且慢末丕替桃宗王尊涼爵各圖屋脊糧署錄壇吾祿職胄襲君廈丗北壑桐疹損逢陵鷸丙寅戌氨腈唑綸辰酮脫氫酶醚丞丟現掉紗帽弄扯砲碗丠両丣坐存激肩臻蒂蓮悖序驅丨丩丫挺杈髻鬟細介俄伊犁京尼布訂普渡央委監察檢查劑圈設警隊斯督剩震境航舶革防托播促質版蠑螈鋒研藝歷殘消頻譜精密製造陲郵候埔堅壓壢凹匯執府究邦俘攝寮彬狼嶽肺腫庸英訊診埋粒胞括控碼韓暑槍樞砥澳哇牟壽甸鑽探篇簽綴縫繼耳肯照婦埃懸璧軸櫃檯辣擱淺邪跑纖阮陽私囊魔丮丰姿采丱燒丳丵丶丷丸參寨朗桂瑞砂衷霞貌"""
|
|
24
|
+
SIMP_CHARS = """制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁移稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌"""
|
|
25
|
+
|
|
26
|
+
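# Work around relative-import problems: put this file's directory on sys.path so that `locale_data` and `utils` import as top-level modules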
|
|
27
|
+
import sys
|
|
28
|
+
import os
|
|
29
|
+
sys.path.insert(0, os.path.dirname(__file__))
|
|
30
|
+
|
|
31
|
+
from locale_data import LOCALE_MAP
|
|
32
|
+
from utils.data_utils import logger
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
cdef class LocaleDetectorCy:
    """
    High-performance Cython implementation of the multilingual locale detector.

    The language of a text is identified with a FastText model. Text that is
    identified as Chinese is further classified as Traditional (``zh-TW``) or
    Simplified (``zh-CN``) by combining the embedded character sets, C-level
    lookup bitmaps and OpenCC conversion comparison.
    """

    # Result labels returned by identify(); assigned in __init__
    cdef readonly str UNKNOWN, TRADITIONAL, SIMPLIFIED, BOTH, MIXED
    cdef readonly str DEFAULT_LOCALE
    cdef readonly str FTLANG_CACHE

    # Instance state
    cdef public object cc_s2t, cc_t2s  # OpenCC converters
    cdef object ft_model  # FastText model
    cdef set TRAD, SIMP, SHARED  # Python-level character sets
    cdef object ALL_HAN, HANZI_RE
    cdef bint low_memory

    # C-level bitmaps for fast lookup (one byte per BMP code point)
    cdef unsigned char* trad_bitmap
    cdef unsigned char* simp_bitmap
    cdef int bitmap_size

    # Model cache. NOTE(review): this dict is rebuilt by every __init__, so it
    # does not share loaded models between instances.
    cdef object MODELS

    # Bounded per-instance cache for detect_by_ratio_analysis()
    cdef dict _ratio_cache
    cdef Py_ssize_t _ratio_cache_max

    def __init__(self, bint low_memory=False):
        """
        Initialize the detector.

        Args:
            low_memory: Use the smaller FastText model (``lid.176.ftz``)
                instead of the full one (``lid.176.bin``).

        Raises:
            MemoryError: If the lookup bitmaps cannot be allocated.
            RuntimeError: If the FastText model cannot be downloaded or loaded.
        """
        cdef Py_ssize_t i
        cdef int cp

        # Result labels and default locale
        self.UNKNOWN = "UNKNOWN"
        self.TRADITIONAL = "TRADITIONAL"
        self.SIMPLIFIED = "SIMPLIFIED"
        self.BOTH = "BOTH"
        self.MIXED = "MIXED"
        self.DEFAULT_LOCALE = "en-US"

        # FastText model cache and on-disk location
        self.MODELS = {"low_mem": None, "high_mem": None}
        self.FTLANG_CACHE = os.path.join(os.path.dirname(__file__), "artifacts", "fasttext")

        # OpenCC converters
        self.cc_s2t = OpenCC("s2t")
        self.cc_t2s = OpenCC("t2s")

        # Character sets
        self.TRAD = set(TRAD_CHARS)
        self.SIMP = set(SIMP_CHARS)
        self.SHARED = self.TRAD.intersection(self.SIMP)
        self.ALL_HAN = "".join(sorted(self.TRAD | self.SIMP))
        self.HANZI_RE = re.compile(f"[^{self.ALL_HAN}]")

        self.low_memory = low_memory

        self._ratio_cache = {}
        self._ratio_cache_max = 1024

        # Build the C-level bitmaps (BMP 0~65535). Buffers left over from an
        # earlier __init__ call on the same object are released first;
        # free(NULL) is a no-op on a fresh object.
        self.bitmap_size = 65536
        free(self.trad_bitmap)
        free(self.simp_bitmap)
        self.trad_bitmap = NULL
        self.simp_bitmap = NULL
        self.trad_bitmap = <unsigned char*>malloc(self.bitmap_size * sizeof(unsigned char))
        self.simp_bitmap = <unsigned char*>malloc(self.bitmap_size * sizeof(unsigned char))
        if self.trad_bitmap == NULL or self.simp_bitmap == NULL:
            # __dealloc__ releases whichever allocation succeeded.
            raise MemoryError("Failed to allocate character lookup bitmaps")
        for i in range(self.bitmap_size):
            self.trad_bitmap[i] = 0
            self.simp_bitmap[i] = 0
        for ch in self.TRAD:
            cp = ord(ch)
            if cp < self.bitmap_size:
                self.trad_bitmap[cp] = 1
        for ch in self.SIMP:
            cp = ord(ch)
            if cp < self.bitmap_size:
                self.simp_bitmap[cp] = 1

        # Load the FastText model
        self._load_fasttext_model()

    def __dealloc__(self):
        """Release the malloc'ed lookup bitmaps."""
        free(self.trad_bitmap)
        free(self.simp_bitmap)
        self.trad_bitmap = NULL
        self.simp_bitmap = NULL

    def _download_model(self, str name):
        """
        Return the local path of FastText model *name*, downloading it if needed.

        The download is streamed to a temporary ``.part`` file and renamed into
        place only after it completes, so an interrupted download never leaves
        a truncated file at the final path.

        Args:
            name: Model file name, e.g. ``lid.176.bin``.

        Returns:
            str: Path of the model file inside ``FTLANG_CACHE``.

        Raises:
            RuntimeError: If the download or the file write fails.
        """
        cdef str target_path = os.path.join(self.FTLANG_CACHE, name)
        cdef str partial_path = target_path + ".part"

        if os.path.exists(target_path):
            logger.info({"Using local FastText model": target_path})
            return target_path

        try:
            logger.info({"Downloading FastText model": name})
            url = f"https://dl.fbaipublicfiles.com/fasttext/supervised-models/{name}"
            os.makedirs(self.FTLANG_CACHE, exist_ok=True)

            # Stream in chunks instead of holding the whole model in memory.
            with requests.get(url, timeout=300, stream=True) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as fp:
                    for chunk in response.iter_content(chunk_size=1 << 20):
                        fp.write(chunk)
            os.replace(partial_path, target_path)
            logger.info({"Downloaded FastText model": target_path})
            return target_path

        except Exception as e:
            # Best-effort cleanup of the partial file; the original error is
            # the one worth reporting.
            try:
                os.remove(partial_path)
            except OSError:
                pass
            logger.error({"FastText model download failed": str(e)})
            raise RuntimeError(f"Failed to download FastText model {name}: {e}") from e

    def _load_fasttext_model(self):
        """
        Load the FastText language-identification model into ``ft_model``.

        The model is chosen by ``low_memory`` and stored in ``MODELS`` so a
        repeated call on the same instance does not load it again.

        Raises:
            RuntimeError: If the model cannot be downloaded or loaded.
        """
        cdef str cache_key, model_name
        if self.low_memory:
            cache_key, model_name = "low_mem", "lid.176.ftz"
        else:
            cache_key, model_name = "high_mem", "lid.176.bin"

        try:
            if not self.MODELS.get(cache_key):
                model_path = self._download_model(model_name)
                self.MODELS[cache_key] = fasttext.load_model(model_path)
            self.ft_model = self.MODELS[cache_key]
            logger.info({"FastText model loaded": self.low_memory})
        except Exception as e:
            logger.error({"Failed to load FastText model": str(e)})
            raise RuntimeError(f"FastText model loading failed: {e}") from e

    cdef str _preprocess_text(self, str text):
        """
        Collapse all whitespace in *text* to single spaces.

        Newlines, carriage returns and tabs are whitespace for ``str.split()``,
        so they are removed together with repeated spaces. An empty or ``None``
        input yields ``""``.
        """
        if not text:
            return ""
        return " ".join(text.split())

    cdef str _map_to_i18n_code(self, str lang_code):
        """
        Map a detected language code to an i18n locale code.

        Codes found in ``LOCALE_MAP`` with a supported ``i18n_code`` are
        translated. Everything else, including ``zh-TW``/``zh-CN``, is returned
        unchanged; an empty code falls back to ``DEFAULT_LOCALE``.
        """
        if lang_code in LOCALE_MAP:
            i18n_code = LOCALE_MAP[lang_code]["i18n_code"]
            if i18n_code != "not_supported":
                return i18n_code

        return lang_code if lang_code else self.DEFAULT_LOCALE

    def detect_language(self, str text):
        """
        Detect the language of *text* with FastText.

        Returns:
            dict: ``{"lang": <label>, "score": <float capped at 1.0>}``. Empty
            or whitespace-only input, and any prediction error, yield
            ``{"lang": "unknown", "score": 0.0}``.
        """
        # Whitespace-only input collapses to "" here as well.
        cdef str processed_text = self._preprocess_text(text)
        if not processed_text:
            return {"lang": "unknown", "score": 0.0}

        try:
            labels, scores = self.ft_model.predict(processed_text)
            label = labels[0].replace("__label__", "")
            score = min(float(scores[0]), 1.0)
            return {"lang": label, "score": score}
        except Exception as e:
            logger.error({"FastText prediction error": str(e)})
            return {"lang": "unknown", "score": 0.0}

    cpdef set extract_hanzi(self, str s):
        """
        Return the set of known Chinese characters that occur in *s*.

        Strings of at most 16 characters are scanned against the C bitmaps;
        longer strings use the compiled regular expression.
        """
        # Declarations stay at function level: Cython rejects cdef statements
        # inside control structures.
        cdef set result
        cdef Py_UCS4 ch
        cdef Py_ssize_t i, n = len(s)

        if n > 16:
            return set(self.HANZI_RE.sub("", s))

        result = set()
        for i in range(n):
            ch = s[i]
            if ch < <Py_UCS4>self.bitmap_size and (self.trad_bitmap[ch] or self.simp_bitmap[ch]):
                result.add(ch)
        return result

    cpdef str identify(self, str s):
        """
        Classify the Chinese characters contained in *s*.

        Returns:
            str: One of TRADITIONAL, SIMPLIFIED, BOTH, MIXED or UNKNOWN.
            BOTH means every Chinese character is in both sets. MIXED means
            traditional-only and simplified-only characters both occur.
            UNKNOWN means no known Chinese character was found.
        """
        cdef bint has_trad_only = False
        cdef bint has_simp_only = False
        cdef bint has_shared = False
        cdef bint in_trad, in_simp
        cdef Py_UCS4 ch
        cdef Py_ssize_t i, n
        cdef set chinese

        # Diagnostic only: the full input text is not logged at INFO level.
        logger.debug({
            "identify_call": {
                "text": s,
                "len": len(s),
                "mem": sys.getsizeof(s)
            }
        })

        n = len(s)
        if n <= 16:
            # Short text: classify straight from the C bitmaps.
            for i in range(n):
                ch = s[i]
                if ch >= <Py_UCS4>self.bitmap_size:
                    continue
                in_trad = self.trad_bitmap[ch]
                in_simp = self.simp_bitmap[ch]
                if in_trad and in_simp:
                    has_shared = True
                elif in_trad:
                    has_trad_only = True
                elif in_simp:
                    has_simp_only = True
            # Shared characters do not make a text MIXED: traditional-only plus
            # shared characters is still TRADITIONAL, matching the subset tests
            # used for longer text below.
            if has_trad_only and has_simp_only:
                return self.MIXED
            if has_trad_only:
                return self.TRADITIONAL
            if has_simp_only:
                return self.SIMPLIFIED
            if has_shared:
                return self.BOTH
            return self.UNKNOWN

        # Longer text: set-based classification.
        chinese = self.extract_hanzi(s)
        if not chinese:
            return self.UNKNOWN
        if chinese.issubset(self.SHARED):
            return self.BOTH
        if chinese.issubset(self.TRAD):
            return self.TRADITIONAL
        if chinese.issubset(self.SIMP):
            return self.SIMPLIFIED
        return self.MIXED

    cpdef bint is_traditional(self, str s):
        """
        Return True when *s* contains Chinese characters and all of them are
        in the traditional set (shared characters are part of that set).
        """
        cdef set chinese_chars = self.extract_hanzi(s)
        if not chinese_chars:
            return False
        return chinese_chars.issubset(self.TRAD)

    cpdef bint is_simplified(self, str s):
        """
        Return True when *s* contains Chinese characters and all of them are
        in the simplified set (shared characters are part of that set).
        """
        cdef set chinese_chars = self.extract_hanzi(s)
        if not chinese_chars:
            return False
        return chinese_chars.issubset(self.SIMP)

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cpdef long _fast_count_trad(self, str text):
        """Count the characters of *text* that are set in the traditional bitmap."""
        cdef long count = 0
        cdef Py_ssize_t i, n = len(text)
        cdef Py_UCS4 ch
        for i in range(n):
            ch = text[i]
            if ch < <Py_UCS4>self.bitmap_size and self.trad_bitmap[ch]:
                count += 1
        return count

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cpdef long _fast_count_simp(self, str text):
        """Count the characters of *text* that are set in the simplified bitmap."""
        cdef long count = 0
        cdef Py_ssize_t i, n = len(text)
        cdef Py_UCS4 ch
        for i in range(n):
            ch = text[i]
            if ch < <Py_UCS4>self.bitmap_size and self.simp_bitmap[ch]:
                count += 1
        return count

    cdef str _fast_identify_zh_hanzi(self, str text):
        """
        Identify the Chinese variant of *text* from the character sets alone.

        Returns "zh-CN" when the text is simplified, otherwise "zh-TW" when it
        is traditional, otherwise "unknown".
        """
        if self.is_simplified(text):
            return "zh-CN"
        if self.is_traditional(text):
            return "zh-TW"
        return "unknown"

    cdef str _fast_identify_zh_opencc(self, str text):
        """
        Identify the Chinese variant of *text* by OpenCC conversion comparison.

        Text that is unchanged by simplified-to-traditional conversion is
        "zh-TW"; text unchanged by traditional-to-simplified conversion is
        "zh-CN"; mixed text defaults to "zh-TW".
        """
        if text == self.cc_s2t.convert(text):
            return "zh-TW"  # already traditional
        if text == self.cc_t2s.convert(text):
            return "zh-CN"  # already simplified
        return "zh-TW"  # mixed text defaults to traditional

    cdef str _decide_zh_locale(self, str text, object to_t, object to_s, object kind,
                               long trad_count, long simp_count):
        """
        Combine the individual signals into a ``zh-TW`` / ``zh-CN`` decision.

        Shared by the synchronous and asynchronous ratio analyses.

        Args:
            text: The original text.
            to_t: *text* converted simplified-to-traditional by OpenCC.
            to_s: *text* converted traditional-to-simplified by OpenCC.
            kind: Result of identify() for *text*.
            trad_count: Result of _fast_count_trad() for *text*.
            simp_count: Result of _fast_count_simp() for *text*.
        """
        cdef str base
        cdef long total = trad_count + simp_count
        cdef double trad_ratio, simp_ratio

        # OpenCC shape: "T" unchanged by s2t, "S" unchanged by t2s, else mixed.
        if text == to_t:
            base = "T"
        elif text == to_s:
            base = "S"
        else:
            base = "M"

        if total == 0:
            total = 1  # avoid dividing by zero; both ratios become 0
        trad_ratio = <double>trad_count / <double>total
        simp_ratio = <double>simp_count / <double>total

        if base == "T" and kind in (self.TRADITIONAL, self.BOTH):
            return "zh-TW"
        if base == "S" and kind in (self.SIMPLIFIED, self.BOTH):
            return "zh-CN"
        if trad_ratio > 0.6 and trad_ratio > simp_ratio:
            return "zh-TW"
        if simp_ratio > 0.6 and simp_ratio > trad_ratio:
            return "zh-CN"
        return "zh-TW" if trad_ratio >= simp_ratio else "zh-CN"

    cdef object _ratio_analysis(self, str text):
        """Uncached ratio analysis; returns None when *text* has no Chinese characters."""
        if not self.extract_hanzi(text):
            return None
        return self._decide_zh_locale(
            text,
            self.cc_s2t.convert(text),
            self.cc_t2s.convert(text),
            self.identify(text),
            self._fast_count_trad(text),
            self._fast_count_simp(text),
        )

    def detect_by_ratio_analysis(self, str text):
        """
        Detect the Chinese locale of *text* by ratio analysis.

        Combines:
        - character ratio analysis (traditional vs simplified counts)
        - OpenCC conversion comparison
        - character-set analysis

        Results are kept in a per-instance least-recently-used cache of at
        most 1024 texts. A class-wide ``lru_cache`` would hold a reference to
        every instance it has seen and keep its bitmaps allocated.

        Returns:
            "zh-TW", "zh-CN", or None when *text* has no Chinese characters.
        """
        cdef dict cache = self._ratio_cache
        if text in cache:
            # Re-insert so the entry becomes the most recently used one.
            result = cache.pop(text)
            cache[text] = result
            return result

        result = self._ratio_analysis(text)
        if len(cache) >= self._ratio_cache_max:
            # dicts preserve insertion order: the first key is the oldest.
            del cache[next(iter(cache))]
        cache[text] = result
        return result

    async def adetect_by_ratio_analysis(self, str text):
        """
        Detect the Chinese locale of *text* by ratio analysis (async version).

        Performs the same combined analysis as detect_by_ratio_analysis(), but
        submits the OpenCC conversions, identify() and the two counters to the
        event loop's default executor and awaits them together.

        Returns:
            "zh-TW", "zh-CN", or None when *text* has no Chinese characters.
        """
        # Extraction runs synchronously, before anything is handed to the executor.
        if not self.extract_hanzi(text):
            return None

        # Inside a coroutine a loop is always running.
        loop = asyncio.get_running_loop()

        to_t, to_s, kind, trad_count, simp_count = await asyncio.gather(
            loop.run_in_executor(None, self.cc_s2t.convert, text),
            loop.run_in_executor(None, self.cc_t2s.convert, text),
            loop.run_in_executor(None, self.identify, text),
            loop.run_in_executor(None, self._fast_count_trad, text),
            loop.run_in_executor(None, self._fast_count_simp, text),
        )

        return self._decide_zh_locale(text, to_t, to_s, kind, trad_count, simp_count)

    async def detect(self, str text):
        """
        Detect the locale of *text* and map it to an i18n code.

        FastText identifies the language first; Chinese text then goes through
        Chinese-variant detection.

        Args:
            text: Input text to analyze.

        Returns:
            str: Detected i18n locale code (e.g. 'en-US', 'zh-TW', 'zh-CN', 'ja').
            Chinese text whose variant cannot be determined yields 'zh-TW'.
        """
        details = await self.adetect_with_details(text)
        return details["locale"]

    async def adetect_with_details(self, str text, str mode="ratio"):
        """
        Detect the locale of *text* and return it with details.

        Args:
            text: Input text to analyze.
            mode: Detection mode for Chinese text. Only 'ratio' (ratio
                analysis) exists; the value is currently not inspected.

        Returns:
            dict: {
                "locale": detected i18n locale code,
                "language": base language code,
                "score": confidence score
            }
        """
        # Language identification with FastText
        lang_result = self.detect_language(text)
        language = lang_result["lang"]
        score = lang_result["score"]

        # Non-Chinese text: map the FastText label to an i18n code
        if language != "zh":
            i18n_code = self._map_to_i18n_code(language)
            return {"locale": i18n_code, "language": language, "score": score}

        # Chinese text: dedicated variant detection, defaulting to zh-TW
        chinese_locale = await self.adetect_by_ratio_analysis(text)
        if not chinese_locale:
            chinese_locale = "zh-TW"

        # The Chinese variants already are i18n codes
        return {"locale": chinese_locale, "language": "zh", "score": score}

    async def adetect_batch(self, list texts, str mode="ratio"):
        """
        Detect the locales of several texts concurrently.

        Each text is processed by adetect_with_details().

        Args:
            texts: List of texts to analyze.
            mode: Detection mode for Chinese text, forwarded to
                adetect_with_details().

        Returns:
            list[str]: Detected i18n locale codes, in the order of *texts*.
        """
        tasks = [self.adetect_with_details(text, mode=mode) for text in texts]
        results = await asyncio.gather(*tasks)
        return [result["locale"] for result in results]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Utils package
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data utilities module providing logging functionality.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import sys
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class StructuredLogger:
    """
    Structured logger that accepts both string and dictionary log messages.

    Dictionary messages are flattened to ``key: value`` pairs joined by
    `` | ``; any other object is converted with ``str()``. A message is only
    formatted when its level is enabled, so disabled calls cost nothing more
    than a level check.
    """

    def __init__(self, name: str = "locale_detector"):
        """
        Wrap the standard-library logger called *name*.

        A stderr handler and the INFO level are installed only when the
        logger has no handlers yet, so constructing the wrapper again for the
        same name does not duplicate output.
        """
        self.logger = logging.getLogger(name)
        if not self.logger.handlers:
            handler = logging.StreamHandler(sys.stderr)
            formatter = logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            )
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)

    def _format_message(self, msg: Any) -> str:
        """Return *msg* as text: dicts become ``k: v | k: v``, others ``str(msg)``."""
        if isinstance(msg, dict):
            return " | ".join(f"{k}: {v}" for k, v in msg.items())
        return str(msg)

    def _log(self, level: int, msg: Any) -> None:
        """Format and emit *msg* at *level*, skipping the formatting when disabled."""
        if self.logger.isEnabledFor(level):
            self.logger.log(level, self._format_message(msg))

    def info(self, msg: Any) -> None:
        """Log *msg* at INFO level."""
        self._log(logging.INFO, msg)

    def error(self, msg: Any) -> None:
        """Log *msg* at ERROR level."""
        self._log(logging.ERROR, msg)

    def warning(self, msg: Any) -> None:
        """Log *msg* at WARNING level."""
        self._log(logging.WARNING, msg)

    def debug(self, msg: Any) -> None:
        """Log *msg* at DEBUG level."""
        self._log(logging.DEBUG, msg)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# 全域 logger 實例
|
|
50
|
+
# Module-level StructuredLogger instance, created at import time with the
# default logger name ("locale_detector").
logger = StructuredLogger()
|