jtcg_locale_detector 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +37 -0
  3. data/PACKAGING_SUMMARY.md +195 -0
  4. data/README.md +226 -0
  5. data/bin/locale-detector +159 -0
  6. data/jtcg_locale_detector.gemspec +48 -0
  7. data/lib/locale_detector/client.rb +163 -0
  8. data/lib/locale_detector/detector.rb +46 -0
  9. data/lib/locale_detector/version.rb +3 -0
  10. data/lib/locale_detector.rb +25 -0
  11. data/locale_detector.gemspec +46 -0
  12. data/python/cli.py +220 -0
  13. data/python/requirements.txt +8 -0
  14. data/python/src/__init__.py +10 -0
  15. data/python/src/__pycache__/__init__.cpython-311.pyc +0 -0
  16. data/python/src/__pycache__/__init__.cpython-313.pyc +0 -0
  17. data/python/src/__pycache__/locale_data.cpython-311.pyc +0 -0
  18. data/python/src/__pycache__/locale_data.cpython-313.pyc +0 -0
  19. data/python/src/__pycache__/locale_detector.cpython-311.pyc +0 -0
  20. data/python/src/__pycache__/locale_detector.cpython-313.pyc +0 -0
  21. data/python/src/artifacts/fasttext/lid.176.bin +0 -0
  22. data/python/src/artifacts/fasttext/lid.176.ftz +0 -0
  23. data/python/src/download_fasttext.py +69 -0
  24. data/python/src/locale_data.py +178 -0
  25. data/python/src/locale_detector.py +534 -0
  26. data/python/src/locale_detector_c.c +403 -0
  27. data/python/src/locale_detector_c.h +37 -0
  28. data/python/src/locale_detector_cy.cpp +23126 -0
  29. data/python/src/locale_detector_cy.cpython-311-darwin.so +0 -0
  30. data/python/src/locale_detector_cy.cpython-313-darwin.so +0 -0
  31. data/python/src/locale_detector_cy.html +6460 -0
  32. data/python/src/locale_detector_cy.pyx +501 -0
  33. data/python/src/utils/__init__.py +1 -0
  34. data/python/src/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  35. data/python/src/utils/__pycache__/__init__.cpython-313.pyc +0 -0
  36. data/python/src/utils/__pycache__/data_utils.cpython-311.pyc +0 -0
  37. data/python/src/utils/__pycache__/data_utils.cpython-313.pyc +0 -0
  38. data/python/src/utils/data_utils.py +50 -0
  39. data/python/src/utils/data_utils_cy.cpp +10086 -0
  40. data/python/src/utils/data_utils_cy.cpython-311-darwin.so +0 -0
  41. data/python/src/utils/data_utils_cy.cpython-313-darwin.so +0 -0
  42. data/python/src/utils/data_utils_cy.html +600 -0
  43. data/python/src/utils/data_utils_cy.pyx +94 -0
  44. data/python/src/zhon/__init__.py +7 -0
  45. data/python/src/zhon/__pycache__/__init__.cpython-311.pyc +0 -0
  46. data/python/src/zhon/__pycache__/hanzi.cpython-311.pyc +0 -0
  47. data/python/src/zhon/__pycache__/pinyin.cpython-311.pyc +0 -0
  48. data/python/src/zhon/__pycache__/zhuyin.cpython-311.pyc +0 -0
  49. data/python/src/zhon/cedict/__init__.py +14 -0
  50. data/python/src/zhon/cedict/__pycache__/__init__.cpython-311.pyc +0 -0
  51. data/python/src/zhon/cedict/__pycache__/all.cpython-311.pyc +0 -0
  52. data/python/src/zhon/cedict/__pycache__/simplified.cpython-311.pyc +0 -0
  53. data/python/src/zhon/cedict/__pycache__/traditional.cpython-311.pyc +0 -0
  54. data/python/src/zhon/cedict/all.py +4 -0
  55. data/python/src/zhon/cedict/simplified.py +4 -0
  56. data/python/src/zhon/cedict/traditional.py +4 -0
  57. data/python/src/zhon/hanzi.py +81 -0
  58. data/python/src/zhon/pinyin.py +187 -0
  59. data/python/src/zhon/zhuyin.py +46 -0
  60. metadata +198 -0
@@ -0,0 +1,403 @@
1
+ /*
2
+ * Pure C implementation of locale detector
3
+ * High-performance character set analysis with bitmap lookup
4
+ */
5
+
6
+ #include <Python.h>
7
+ #include <string.h>
8
+ #include <stdint.h>
9
+ #include <time.h>
10
+
11
+ /* Classification returned by identify_locale_type(). */
12
+ typedef enum {
13
+ LOCALE_UNKNOWN = 0, /* no character from either table was counted */
14
+ LOCALE_TRADITIONAL = 1, /* only traditional-only characters were counted */
15
+ LOCALE_SIMPLIFIED = 2, /* only simplified-only characters were counted */
16
+ LOCALE_BOTH = 3, /* only characters present in both tables were counted */
17
+ LOCALE_MIXED = 4 /* any other combination of the three categories */
18
+ } locale_type_t;
19
+
20
+ /* Per-category tallies produced by count_hanzi_chars(). */
21
+ typedef struct {
22
+ long traditional; /* characters found only in the traditional table */
23
+ long simplified; /* characters found only in the simplified table */
24
+ long shared; /* characters found in both tables */
25
+ long total; /* every counted character; each one also lands in exactly one field above */
26
+ } hanzi_counts_t;
27
+
28
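+ /* UTF-8 character tables: TRAD_CHARS lists the characters treated as Traditional and SIMP_CHARS those treated as Simplified; a character that appears in both tables is classified as shared. */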
29
+ static const char* TRAD_CHARS = "制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛";
30
+
31
+ static const char* SIMP_CHARS = "制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷";
32
+
33
+ /* Lookup tables indexed by BMP code point (0..65535): one byte per code point, non-zero meaning "member". Filled on first use by init_bitmap(). */
34
+ static unsigned char trad_bitmap[65536] = {0}; /* code points listed in TRAD_CHARS */
35
+ static unsigned char simp_bitmap[65536] = {0}; /* code points listed in SIMP_CHARS */
36
+ static unsigned char shared_bitmap[65536] = {0}; /* code points set in both tables above */
37
+ static int bitmap_initialized = 0; /* set to 1 once init_bitmap() has completed */
38
+
39
+ /* Unicode utilities */
40
/*
 * Decode one UTF-8 sequence starting at str.
 *
 * str            - NUL-terminated byte string; must point at a readable byte.
 * bytes_consumed - out: number of bytes the caller should advance by (always >= 1).
 *
 * Returns the decoded code point, or U+FFFD (consuming exactly one byte) when
 * the lead byte is invalid or an expected continuation byte is missing.
 *
 * Continuation bytes are validated one at a time, left to right. A sequence
 * that is cut short by the terminating NUL is therefore rejected before any
 * byte beyond the terminator is read, and the caller never advances past it.
 */
static uint32_t utf8_to_codepoint(const char* str, int* bytes_consumed) {
    const unsigned char* s = (const unsigned char*)str;
    uint32_t cp;
    int len;

    if (s[0] < 0x80) {
        /* ASCII: 0xxxxxxx (this also covers the NUL terminator itself) */
        *bytes_consumed = 1;
        return s[0];
    }

    if ((s[0] & 0xE0) == 0xC0) {
        /* 2-byte: 110xxxxx 10xxxxxx */
        len = 2;
        cp = s[0] & 0x1F;
    } else if ((s[0] & 0xF0) == 0xE0) {
        /* 3-byte: 1110xxxx 10xxxxxx 10xxxxxx */
        len = 3;
        cp = s[0] & 0x0F;
    } else if ((s[0] & 0xF8) == 0xF0) {
        /* 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        len = 4;
        cp = s[0] & 0x07;
    } else {
        /* Stray continuation byte or invalid lead byte */
        *bytes_consumed = 1;
        return 0xFFFD;
    }

    for (int i = 1; i < len; i++) {
        if ((s[i] & 0xC0) != 0x80) {
            /* Not 10xxxxxx (possibly the NUL terminator): resynchronise on the next byte */
            *bytes_consumed = 1;
            return 0xFFFD;
        }
        cp = (cp << 6) | (uint32_t)(s[i] & 0x3F);
    }

    *bytes_consumed = len;
    return cp;
}
68
+
69
+ /* Initialize bitmap arrays from character strings */
70
+ static void init_bitmap() {
71
+ if (bitmap_initialized) return;
72
+
73
+ const char* p;
74
+ int bytes;
75
+ uint32_t cp;
76
+
77
+ /* Initialize traditional characters bitmap */
78
+ p = TRAD_CHARS;
79
+ while (*p) {
80
+ cp = utf8_to_codepoint(p, &bytes);
81
+ if (cp < 65536) {
82
+ trad_bitmap[cp] = 1;
83
+ }
84
+ p += bytes;
85
+ }
86
+
87
+ /* Initialize simplified characters bitmap */
88
+ p = SIMP_CHARS;
89
+ while (*p) {
90
+ cp = utf8_to_codepoint(p, &bytes);
91
+ if (cp < 65536) {
92
+ simp_bitmap[cp] = 1;
93
+ }
94
+ p += bytes;
95
+ }
96
+
97
+ /* Initialize shared characters bitmap */
98
+ for (int i = 0; i < 65536; i++) {
99
+ if (trad_bitmap[i] && simp_bitmap[i]) {
100
+ shared_bitmap[i] = 1;
101
+ }
102
+ }
103
+
104
+ bitmap_initialized = 1;
105
+ }
106
+
107
+ /* Fast character classification functions */
108
+ static inline int is_traditional_char(uint32_t cp) {
109
+ return (cp < 65536) ? trad_bitmap[cp] : 0;
110
+ }
111
+
112
+ static inline int is_simplified_char(uint32_t cp) {
113
+ return (cp < 65536) ? simp_bitmap[cp] : 0;
114
+ }
115
+
116
+ static inline int is_shared_char(uint32_t cp) {
117
+ return (cp < 65536) ? shared_bitmap[cp] : 0;
118
+ }
119
+
120
+ static inline int is_hanzi_char(uint32_t cp) {
121
+ return (cp < 65536) ? (trad_bitmap[cp] || simp_bitmap[cp]) : 0;
122
+ }
123
+
124
+ /* Core analysis functions */
125
+ static hanzi_counts_t count_hanzi_chars(const char* text) {
126
+ if (!bitmap_initialized) init_bitmap();
127
+
128
+ hanzi_counts_t counts = {0, 0, 0, 0};
129
+
130
+ const char* p = text;
131
+ int bytes;
132
+ uint32_t cp;
133
+
134
+ while (*p) {
135
+ cp = utf8_to_codepoint(p, &bytes);
136
+
137
+ if (is_hanzi_char(cp)) {
138
+ counts.total++;
139
+
140
+ if (is_shared_char(cp)) {
141
+ counts.shared++;
142
+ } else if (is_traditional_char(cp)) {
143
+ counts.traditional++;
144
+ } else if (is_simplified_char(cp)) {
145
+ counts.simplified++;
146
+ }
147
+ }
148
+
149
+ p += bytes;
150
+ }
151
+
152
+ return counts;
153
+ }
154
+
155
+ static locale_type_t identify_locale_type(const char* text) {
156
+ hanzi_counts_t counts = count_hanzi_chars(text);
157
+
158
+ if (counts.total == 0) {
159
+ return LOCALE_UNKNOWN;
160
+ }
161
+
162
+ /* Pure traditional (only traditional chars, no simplified) */
163
+ if (counts.traditional > 0 && counts.simplified == 0 && counts.shared == 0) {
164
+ return LOCALE_TRADITIONAL;
165
+ }
166
+
167
+ /* Pure simplified (only simplified chars, no traditional) */
168
+ if (counts.simplified > 0 && counts.traditional == 0 && counts.shared == 0) {
169
+ return LOCALE_SIMPLIFIED;
170
+ }
171
+
172
+ /* Only shared characters */
173
+ if (counts.shared > 0 && counts.traditional == 0 && counts.simplified == 0) {
174
+ return LOCALE_BOTH;
175
+ }
176
+
177
+ /* Mixed content */
178
+ return LOCALE_MIXED;
179
+ }
180
+
181
+ static const char* identify_locale_string(const char* text) {
182
+ locale_type_t type = identify_locale_type(text);
183
+
184
+ switch (type) {
185
+ case LOCALE_TRADITIONAL: return "TRADITIONAL";
186
+ case LOCALE_SIMPLIFIED: return "SIMPLIFIED";
187
+ case LOCALE_BOTH: return "BOTH";
188
+ case LOCALE_MIXED: return "MIXED";
189
+ default: return "UNKNOWN";
190
+ }
191
+ }
192
+
193
+ static int is_traditional_text(const char* text) {
194
+ hanzi_counts_t counts = count_hanzi_chars(text);
195
+
196
+ if (counts.total == 0) return 0;
197
+
198
+ /* Traditional if: no simplified chars present */
199
+ return (counts.simplified == 0);
200
+ }
201
+
202
+ static int is_simplified_text(const char* text) {
203
+ hanzi_counts_t counts = count_hanzi_chars(text);
204
+
205
+ if (counts.total == 0) return 0;
206
+
207
+ /* Simplified if: no traditional chars present */
208
+ return (counts.traditional == 0);
209
+ }
210
+
211
+ static const char* detect_chinese_locale(const char* text) {
212
+ hanzi_counts_t counts = count_hanzi_chars(text);
213
+
214
+ if (counts.total == 0) {
215
+ return "unknown";
216
+ }
217
+
218
+ /* Calculate ratios */
219
+ double trad_ratio = (double)counts.traditional / counts.total;
220
+ double simp_ratio = (double)counts.simplified / counts.total;
221
+
222
+ /* Decision logic based on character analysis */
223
+ if (counts.simplified == 0 && (counts.traditional > 0 || counts.shared > 0)) {
224
+ return "zh-TW"; /* No simplified chars present */
225
+ }
226
+
227
+ if (counts.traditional == 0 && (counts.simplified > 0 || counts.shared > 0)) {
228
+ return "zh-CN"; /* No traditional chars present */
229
+ }
230
+
231
+ /* For mixed content, use ratio analysis */
232
+ if (trad_ratio > 0.6 && trad_ratio > simp_ratio) {
233
+ return "zh-TW";
234
+ }
235
+
236
+ if (simp_ratio > 0.6 && simp_ratio > trad_ratio) {
237
+ return "zh-CN";
238
+ }
239
+
240
+ /* Default to traditional if ratios are close */
241
+ return trad_ratio >= simp_ratio ? "zh-TW" : "zh-CN";
242
+ }
243
+
244
+ /* Fast extraction of hanzi characters */
245
+ static int extract_hanzi_chars(const char* text, char* result, int max_result_len) {
246
+ if (!bitmap_initialized) init_bitmap();
247
+
248
+ const char* p = text;
249
+ char* out = result;
250
+ int result_len = 0;
251
+ int bytes;
252
+ uint32_t cp;
253
+
254
+ while (*p && result_len < max_result_len - 4) { /* Reserve space for UTF-8 char + null */
255
+ cp = utf8_to_codepoint(p, &bytes);
256
+
257
+ if (is_hanzi_char(cp)) {
258
+ /* Copy the UTF-8 bytes */
259
+ for (int i = 0; i < bytes; i++) {
260
+ *out++ = p[i];
261
+ result_len++;
262
+ }
263
+ }
264
+
265
+ p += bytes;
266
+ }
267
+
268
+ *out = '\0';
269
+ return result_len;
270
+ }
271
+
272
+ /* Python C API wrapper functions */
273
+ static PyObject* py_count_hanzi_chars(PyObject *self, PyObject *args) {
274
+ const char* text;
275
+ if (!PyArg_ParseTuple(args, "s", &text)) {
276
+ return NULL;
277
+ }
278
+
279
+ hanzi_counts_t counts = count_hanzi_chars(text);
280
+
281
+ return Py_BuildValue("{s:l,s:l,s:l,s:l}",
282
+ "traditional", counts.traditional,
283
+ "simplified", counts.simplified,
284
+ "shared", counts.shared,
285
+ "total", counts.total);
286
+ }
287
+
288
+ static PyObject* py_identify_locale(PyObject *self, PyObject *args) {
289
+ const char* text;
290
+ if (!PyArg_ParseTuple(args, "s", &text)) {
291
+ return NULL;
292
+ }
293
+
294
+ const char* result = identify_locale_string(text);
295
+ return PyUnicode_FromString(result);
296
+ }
297
+
298
+ static PyObject* py_detect_chinese_locale(PyObject *self, PyObject *args) {
299
+ const char* text;
300
+ if (!PyArg_ParseTuple(args, "s", &text)) {
301
+ return NULL;
302
+ }
303
+
304
+ const char* result = detect_chinese_locale(text);
305
+ return PyUnicode_FromString(result);
306
+ }
307
+
308
+ static PyObject* py_is_traditional(PyObject *self, PyObject *args) {
309
+ const char* text;
310
+ if (!PyArg_ParseTuple(args, "s", &text)) {
311
+ return NULL;
312
+ }
313
+
314
+ int result = is_traditional_text(text);
315
+ return PyBool_FromLong(result);
316
+ }
317
+
318
+ static PyObject* py_is_simplified(PyObject *self, PyObject *args) {
319
+ const char* text;
320
+ if (!PyArg_ParseTuple(args, "s", &text)) {
321
+ return NULL;
322
+ }
323
+
324
+ int result = is_simplified_text(text);
325
+ return PyBool_FromLong(result);
326
+ }
327
+
328
+ static PyObject* py_extract_hanzi(PyObject *self, PyObject *args) {
329
+ const char* text;
330
+ if (!PyArg_ParseTuple(args, "s", &text)) {
331
+ return NULL;
332
+ }
333
+
334
+ /* Allocate buffer for result (conservatively estimate UTF-8 size) */
335
+ int text_len = strlen(text);
336
+ char* result = malloc(text_len + 1);
337
+ if (!result) {
338
+ return PyErr_NoMemory();
339
+ }
340
+
341
+ extract_hanzi_chars(text, result, text_len + 1);
342
+
343
+ PyObject* py_result = PyUnicode_FromString(result);
344
+ free(result);
345
+
346
+ return py_result;
347
+ }
348
+
349
+ static PyObject* py_benchmark(PyObject *self, PyObject *args) {
350
+ const char* text;
351
+ int iterations = 10000;
352
+
353
+ if (!PyArg_ParseTuple(args, "s|i", &text, &iterations)) {
354
+ return NULL;
355
+ }
356
+
357
+ clock_t start = clock();
358
+
359
+ for (int i = 0; i < iterations; i++) {
360
+ const char* result = detect_chinese_locale(text);
361
+ (void)result; /* Suppress unused variable warning */
362
+ }
363
+
364
+ clock_t end = clock();
365
+ double time_taken = ((double)(end - start)) / CLOCKS_PER_SEC;
366
+ double us_per_iteration = (time_taken * 1000000) / iterations;
367
+
368
+ return Py_BuildValue("{s:i,s:d,s:d}",
369
+ "iterations", iterations,
370
+ "total_time", time_taken,
371
+ "microseconds_per_iteration", us_per_iteration);
372
+ }
373
+
374
+ static int is_in_charset(Py_UCS4 ch, const char *charset) {
375
+ // 線性查找,極短文本下最快
376
+ for (const char *p = charset; *p; ++p) {
377
+ if ((unsigned char)*p == ch) return 1;
378
+ }
379
+ return 0;
380
+ }
381
+
382
+ static PyMethodDef LocaleDetectorMethods[] = { /* method table of the locale_detector_c module; each entry is {python name, C function, calling convention, docstring} */
383
+ {"count_hanzi_chars", py_count_hanzi_chars, METH_VARARGS, "統計中文字符類型數量"}, /* docstring: "Count Chinese characters by type" */
384
+ {"identify_locale", py_identify_locale, METH_VARARGS, "識別文本中文類型"}, /* docstring: "Identify the Chinese type of the text" */
385
+ {"detect_chinese_locale", py_detect_chinese_locale, METH_VARARGS, "檢測中文地區設定"}, /* docstring: "Detect the Chinese locale" */
386
+ {"is_traditional", py_is_traditional, METH_VARARGS, "判斷是否為繁體文本"}, /* docstring: "Check whether the text is Traditional Chinese" */
387
+ {"is_simplified", py_is_simplified, METH_VARARGS, "判斷是否為簡體文本"}, /* docstring: "Check whether the text is Simplified Chinese" */
388
+ {"extract_hanzi", py_extract_hanzi, METH_VARARGS, "提取中文字符"}, /* docstring: "Extract Chinese characters" */
389
+ {"benchmark", py_benchmark, METH_VARARGS, "效能測試"}, /* docstring: "Performance test" */
390
+ {NULL, NULL, 0, NULL} /* sentinel entry terminating the table */
391
+ };
392
+
393
+ static struct PyModuleDef locale_detectormodule = {
394
+ PyModuleDef_HEAD_INIT,
395
+ "locale_detector_c", // module name
396
+ NULL, // module doc
397
+ -1, // size of per-interpreter state of the module
398
+ LocaleDetectorMethods
399
+ };
400
+
401
+ PyMODINIT_FUNC PyInit_locale_detector_c(void) {
402
+ return PyModule_Create(&locale_detectormodule);
403
+ }
@@ -0,0 +1,37 @@
1
+ /*
2
+ * Pure C locale detector header file
3
+ */
4
+
5
+ #ifndef LOCALE_DETECTOR_C_H
6
+ #define LOCALE_DETECTOR_C_H
7
+
8
+ #include <stdint.h>
9
+
10
+ /* Classification returned by identify_locale_type(). */
11
+ typedef enum {
12
+ LOCALE_UNKNOWN = 0, /* no character from either table was counted */
13
+ LOCALE_TRADITIONAL = 1, /* only traditional-only characters were counted */
14
+ LOCALE_SIMPLIFIED = 2, /* only simplified-only characters were counted */
15
+ LOCALE_BOTH = 3, /* only characters present in both tables were counted */
16
+ LOCALE_MIXED = 4 /* any other combination of the three categories */
17
+ } locale_type_t;
18
+
19
+ /* Per-category tallies returned by count_hanzi_chars(). */
20
+ typedef struct {
21
+ long traditional; /* characters found only in the traditional table */
22
+ long simplified; /* characters found only in the simplified table */
23
+ long shared; /* characters found in both tables */
24
+ long total; /* every counted character */
25
+ } hanzi_counts_t;
26
+
27
+ /* Core C functions - can be used directly without Python. NOTE(review): locale_detector_c.c as shown in this diff defines these functions as `static` and does not include this header, so they would not be linkable from another translation unit - confirm the intended linkage. All `text` arguments are NUL-terminated UTF-8 strings. */
28
+ hanzi_counts_t count_hanzi_chars(const char* text); /* tally traditional-only / simplified-only / shared characters */
29
+ locale_type_t identify_locale_type(const char* text); /* classify text from those tallies */
30
+ const char* identify_locale_string(const char* text); /* same classification as a string literal ("TRADITIONAL", "SIMPLIFIED", "BOTH", "MIXED", "UNKNOWN") */
31
+ int is_traditional_text(const char* text); /* 1 if any character was counted and none is simplified-only */
32
+ int is_simplified_text(const char* text); /* 1 if any character was counted and none is traditional-only */
33
+ const char* detect_chinese_locale(const char* text); /* "zh-TW", "zh-CN" or "unknown" (string literal) */
34
+ int extract_hanzi_chars(const char* text, char* result, int max_result_len); /* copy counted characters into result; returns bytes written excluding the terminator */
35
+ void benchmark_locale_detection(const char* text, int iterations); /* NOTE(review): no definition of this function is visible in locale_detector_c.c - confirm it exists or drop the declaration */
36
+
37
+ #endif /* LOCALE_DETECTOR_C_H */