rbtn 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rbtn-0.1.0/LICENSE ADDED
@@ -0,0 +1,7 @@
1
+ Copyright (c) 2026 Wu-Hao Li
2
+
3
+ Permission is hereby granted to use, copy, modify, and distribute this software for academic, research, and educational purposes only.
4
+
5
+ Commercial use is strictly prohibited without prior written permission from the copyright holder.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
rbtn-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,42 @@
1
+ Metadata-Version: 2.4
2
+ Name: rbtn
3
+ Version: 0.1.0
4
+ Summary: RBTN C code to Python ctypes wrapper
5
+ Author-email: "WuHao, Li" <hank12451@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/cewarman/rbtn
8
+ Requires-Python: >=3.8
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Dynamic: license-file
12
+
13
+ # Rule-based text normalization system
14
+
15
+ ## ✨ Features
16
+ 消除文字的歧異,為文字轉語音系統的前置工作。<br>
17
+ 如:<br>
18
+ &emsp;7/19是星期五 -> 七月十九號是星期五<br>
19
+ &emsp;2/3的人不喜歡香菜 -> 三分之二的人不喜歡香菜<br>
20
+ ## 🚀 Quick Start
21
+ ```terminal
22
+ import rbtn.cetn as rbcetn
23
+
24
+ tn=rbcetn.textnormalizer()
25
+ nt=tn.get_normalized_text(['30年', '', '0050'])
26
+ print(nt)
27
+ ```
28
+ ## 📊 Demo / Results
29
+ <img width="1852" height="604" alt="image" src="https://github.com/user-attachments/assets/8a8a9544-a769-48b2-bbec-e12771630ceb" />
30
+
31
+ ## 📁 Project Structure
32
+
33
+ <img width="630" height="154" alt="RBTN" src="https://github.com/user-attachments/assets/2fd389f3-b4ec-42c2-9c7c-a53c40b35981" />
34
+
35
+
36
+ ## ⚙️ Configuration
37
+ ## 📚 Citation
38
+ ## 📜 License
39
+
40
+ This project is available for academic and research use only.
41
+
42
+ Commercial use requires explicit permission from the author.
rbtn-0.1.0/README.md ADDED
@@ -0,0 +1,30 @@
1
+ # Rule-based text normalization system
2
+
3
+ ## ✨ Features
4
+ 消除文字的歧異,為文字轉語音系統的前置工作。<br>
5
+ 如:<br>
6
+ &emsp;7/19是星期五 -> 七月十九號是星期五<br>
7
+ &emsp;2/3的人不喜歡香菜 -> 三分之二的人不喜歡香菜<br>
8
+ ## 🚀 Quick Start
9
+ ```python
10
+ import rbtn.cetn as rbcetn
11
+
12
+ tn=rbcetn.textnormalizer()
13
+ nt=tn.get_normalized_text(['30年', '', '0050'])
14
+ print(nt)
15
+ ```
16
+ ## 📊 Demo / Results
17
+ <img width="1852" height="604" alt="image" src="https://github.com/user-attachments/assets/8a8a9544-a769-48b2-bbec-e12771630ceb" />
18
+
19
+ ## 📁 Project Structure
20
+
21
+ <img width="630" height="154" alt="RBTN" src="https://github.com/user-attachments/assets/2fd389f3-b4ec-42c2-9c7c-a53c40b35981" />
22
+
23
+
24
+ ## ⚙️ Configuration
25
+ ## 📚 Citation
26
+ ## 📜 License
27
+
28
+ This project is available for academic and research use only.
29
+
30
+ Commercial use requires explicit permission from the author.
@@ -0,0 +1,23 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "rbtn" # distribution name (lowercase, no spaces)
7
+ version = "0.1.0"
8
+ description = "RBTN C code to Python ctypes wrapper"
9
+ readme = "README.md"
10
+ license = {file = "LICENSE"}
11
+ authors = [{name = "WuHao, Li", email = "hank12451@gmail.com"}]
12
+ requires-python = ">=3.8"
13
+ dependencies = []
14
+
15
+ [project.urls]
16
+ Homepage = "https://github.com/cewarman/rbtn"
17
+
18
+ [tool.setuptools]
19
+ package-dir = {"" = "src"}
20
+ packages = ["rbtn"]
21
+
22
+ [tool.setuptools.package-data]
23
+ rbtn = ["*.so", "*.dylib", "*.dll", "rules_config.txt"] # ship the shared library and the rule file
rbtn-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,30 @@
1
+ import os
2
+ from ctypes import *
3
+
4
class textnormalizer:
    """ctypes front end for the rule-based text normalizer.

    On construction the class loads ``pypiRBTN.so`` and hands the path of
    ``rules_config.txt`` to the library's ``pypiloadrules`` entry point; both
    files are looked up in the directory that contains this module.

    NOTE(review): the library file name is fixed to ``pypiRBTN.so``, so the
    ``*.dylib`` / ``*.dll`` variants are never loaded by this class.
    """

    # Bytes reserved in the output buffer for every input character.
    _OUT_BYTES_PER_CHAR = 32 * 4

    def __init__(self):
        """Load the shared library and its rule file.

        Raises:
            OSError: if ``CDLL`` cannot load the shared library.
        """
        base_dir = os.path.dirname(__file__)
        so_path = os.path.join(base_dir, "pypiRBTN.so")
        rules_config_path = os.path.join(base_dir, "rules_config.txt")
        self.lib = CDLL(so_path)
        # The rule-file path is passed to the C side as UTF-8 bytes.
        self.lib.pypiloadrules(rules_config_path.encode('utf-8'))

    def get_normalized_text(self, raw_txt):
        """Normalize every string in *raw_txt* and return the results.

        Args:
            raw_txt: a list of ``str``. Empty strings are passed through
                unchanged without calling into the library.

        Returns:
            A new list with one normalized string per input item, in the
            same order. When *raw_txt* is not a list, a message is printed
            and ``None`` is returned (kept for backward compatibility).

        Raises:
            TypeError: if an item of *raw_txt* is not a ``str``.
        """
        if not isinstance(raw_txt, list):
            print('input must be a list.')
            return None

        ret_list = []
        for text in raw_txt:
            if not isinstance(text, str):
                raise TypeError(
                    'every item of raw_txt must be a str, got '
                    + type(text).__name__
                )
            if text == '':
                ret_list.append('')
                continue
            # The library is expected to write its result into this buffer.
            buf = create_string_buffer(len(text) * self._OUT_BYTES_PER_CHAR)
            # Two-slot char* array: the text followed by an empty string.
            # NOTE(review): the empty entry looks like a terminator for the
            # C side -- confirm against the library source.
            c_args = (c_char_p * 2)(text.encode('utf-8'), b'')
            self.lib.pypiconversion(c_args, buf)
            ret_list.append(buf.value.decode('utf-8'))
        return ret_list
26
+
27
if __name__ == '__main__':
    # Manual smoke test: normalize a few sample strings and print the result.
    normalizer = textnormalizer()
    print(normalizer.get_normalized_text(['20年', '', '0050']))
Binary file
@@ -0,0 +1,818 @@
1
+ #<NAN> = not a number
2
+ #<NAE> = not an English letter
3
+ #<NAM> = not a mandarin
4
+ #<IAN> = is a number
5
+ #<IAE> = is an English letter
6
+ #<IAM> = is a mandarin
7
+
8
+ ~r substitution 256
9
+ ~s "2" :"二" 4
10
+ # ~t 第2<NAN>
11
+ ~t <NAN>2月
12
+ ~t <NAN>2日
13
+ # ~t 之2<NAN>
14
+ # ~s "2%" : "百分之二" 1
15
+ # ~t <NAN>2%
16
+ ~s "AT&T" : "A-T-and-T" 1
17
+ ~t <NAE>AT&T<NAE>
18
+ ~s "S&P 500" : "標普五百" 1
19
+ ~t <NAE>S&P 500<NAN>
20
+ ~s "S&P500" : "標普五百" 1
21
+ ~t <NAE>S&P500<NAN>
22
+ ~s ".com" : " dot com" 1
23
+ ~t <IAE>.com<NAE>
24
+ ~s ".tw" : " dot tw" 1
25
+ ~t <IAE>.tw<NAE>
26
+ ~s ".www" : " dot www" 1
27
+ ~t <IAE>.www<NAE>
28
+ ~s "TOEFL" : "toefl" 1
29
+ ~t <NAE>TOEFL<NAE>
30
+ ~s "7" : "seven " 2
31
+ ~t <NAE>iPhone 7<NAN>
32
+ ~t <NAE>Windows 7<NAN>
33
+ ~s "8" : "eight " 2
34
+ ~t <NAE>iPhone 8<NAN>
35
+ ~t <NAE>Windows 8<NAN>
36
+ ~s "10" : "ten " 2
37
+ ~t <NAE>iPhone 10<NAN>
38
+ ~t <NAE>Windows 10<NAN>
39
+ ~s "X" : "ten " 1
40
+ ~t <NAE>iPhone X
41
+ # ~s "19" : "nineteen " 3
42
+ # ~t <NAE>COVID-19<NAE>
43
+ # ~t <NAE>Covid-19<NAE>
44
+ # ~t <NAE>covid-19<NAE>
45
+ ~s "+" : " plus " 3
46
+ ~t <NAE>S9+<NAN>
47
+ ~t <NAE>S10+<NAN>
48
+ ~t <NAE>S20+<NAN>
49
+ ~s "+" : "加" 1
50
+ ~t <NAE>+
51
+ ~s "+" : "加" 1
52
+ ~t +
53
+ ~s "﹢" : "加" 1
54
+ ~t ﹢
55
+ ~s "50" : "五十" 1
56
+ ~t 台灣50<NAN>
57
+ ~s "500" : "五百" 2
58
+ ~t 標普500<NAN>
59
+ ~t 世界500強
60
+ ~s "7-11" : "SEVEN ELEVEN " 1
61
+ ~t <NAN>7-11<NAN>
62
+ ~s "7-ELEVEN" : "SEVEN ELEVEN " 1
63
+ ~t <NAN>7-ELEVEN<NAN>
64
+ ~s "G7" : "G SEVEN " 1
65
+ ~t <NAE>G7<NAN>
66
+ ~s "1" : " ONE " 2
67
+ ~t <NAE>B1<NAN>
68
+ ~t <NAE>A1<NAN>
69
+ ~s "B2B" : "Business to Business " 1
70
+ ~t <NAN>B2B<NAN>
71
+ ~s "C2C" : "Customer to Customer " 1
72
+ ~t <NAN>C2C<NAN>
73
+ ~s "B2C" : "Business to Customer " 1
74
+ ~t <NAN>B2C<NAN>
75
+ ~s "C2B" : "Customer to Business " 1
76
+ ~t <NAN>C2B<NAN>
77
+ ~s "O2O" : "Online to Offlines " 1
78
+ ~t <NAN>O2O<NAN>
79
+ # ~s "NVIDIA" : "nvidia" 1
80
+ # ~t <NAE>NVIDIA<NAE>
81
+ ~s "ft." : "featuring" 1
82
+ ~t <NAE>ft.<NAE>
83
+ ~s "007" : "零零七" 1
84
+ ~t <NAN>007<NAN>
85
+ ~s "×" : "乘" 1
86
+ ~t ×
87
+ ~s "=" : "等於" 1
88
+ ~t =
89
+ ~s "㏄" : "CC" 1
90
+ ~t ㏄
91
+ ~s "@" : "at " 1
92
+ ~t @
93
+ ~s "@" : "at " 1
94
+ ~t @
95
+ ~s "﹫" : "at " 1
96
+ ~t ﹫
97
+ ~s "㎎" : "毫克" 1
98
+ ~t ㎎
99
+ ~s "㎏" : "公斤" 1
100
+ ~t ㎏
101
+ ~s "㎜" : "毫米" 1
102
+ ~t ㎜
103
+ ~s "㎝" : "公分" 1
104
+ ~t ㎝
105
+ ~s "㎞" : "公里" 1
106
+ ~t ㎞
107
+ ~s "㎡" : "平方公尺" 1
108
+ ~t ㎡
109
+ ~s "㏎" : "公里" 1
110
+ ~t ㏎
111
+ ~s "㏑" : "natural log " 1
112
+ ~t ㏑
113
+ ~s "㏒" : "log " 1
114
+ ~t ㏒
115
+ ~s "㏕" : "米爾" 1
116
+ ~t ㏕
117
+ ~s "Α" : "Alpha " 1
118
+ ~t Α
119
+ ~s "Β" : "Beta " 1
120
+ ~t Β
121
+ ~s "Γ" : "Gamma " 1
122
+ ~t Γ
123
+ ~s "Δ" : "Delta " 1
124
+ ~t Δ
125
+ ~s "Ε" : "Epsilon " 1
126
+ ~t Ε
127
+ ~s "Ζ" : "Zeta " 1
128
+ ~t Ζ
129
+ ~s "Η" : "Eta " 1
130
+ ~t Η
131
+ ~s "Θ" : "Theta " 1
132
+ ~t Θ
133
+ ~s "Ι" : "Iota " 1
134
+ ~t Ι
135
+ ~s "Κ" : "Kappa " 1
136
+ ~t Κ
137
+ ~s "Λ" : "Lambda " 1
138
+ ~t Λ
139
+ ~s "Μ" : "Mu " 1
140
+ ~t Μ
141
+ ~s "Ν" : "Nu " 1
142
+ ~t Ν
143
+ ~s "Ξ" : "Xi " 1
144
+ ~t Ξ
145
+ ~s "Ο" : "Omicron " 1
146
+ ~t Ο
147
+ ~s "Π" : "Pi " 1
148
+ ~t Π
149
+ ~s "Ρ" : "Rho " 1
150
+ ~t Ρ
151
+ ~s "Σ" : "Sigma " 1
152
+ ~t Σ
153
+ ~s "Τ" : "Tau " 1
154
+ ~t Τ
155
+ ~s "Υ" : "Upsilon " 1
156
+ ~t Υ
157
+ ~s "Φ" : "Phi " 1
158
+ ~t Φ
159
+ ~s "Χ" : "Chi " 1
160
+ ~t Χ
161
+ ~s "Ψ" : "Psi " 1
162
+ ~t Ψ
163
+ ~s "Ω" : "Omega " 1
164
+ ~t Ω
165
+ ~s "α" : "alpha " 1
166
+ ~t α
167
+ ~s "β" : "beta " 1
168
+ ~t β
169
+ ~s "γ" : "gamma " 1
170
+ ~t γ
171
+ ~s "δ" : "delta " 1
172
+ ~t δ
173
+ ~s "ε" : "epsilon " 1
174
+ ~t ε
175
+ ~s "ζ" : "zeta " 1
176
+ ~t ζ
177
+ ~s "η" : "eta " 1
178
+ ~t η
179
+ ~s "θ" : "theta " 1
180
+ ~t θ
181
+ ~s "ι" : "iota " 1
182
+ ~t ι
183
+ ~s "κ" : "kappa " 1
184
+ ~t κ
185
+ ~s "λ" : "lambda " 1
186
+ ~t λ
187
+ ~s "μ" : "mu " 1
188
+ ~t μ
189
+ ~s "ν" : "nu " 1
190
+ ~t ν
191
+ ~s "ξ" : "xi " 1
192
+ ~t ξ
193
+ ~s "ο" : "omicron " 1
194
+ ~t ο
195
+ ~s "π" : "pi " 1
196
+ ~t π
197
+ ~s "ρ" : "rho " 1
198
+ ~t ρ
199
+ ~s "σ" : "sigma " 1
200
+ ~t σ
201
+ ~s "τ" : "tau " 1
202
+ ~t τ
203
+ ~s "υ" : "upsilon " 1
204
+ ~t υ
205
+ ~s "φ" : "phi " 1
206
+ ~t φ
207
+ ~s "χ" : "chi " 1
208
+ ~t χ
209
+ ~s "ψ" : "psi " 1
210
+ ~t ψ
211
+ ~s "ω" : "omega " 1
212
+ ~t ω
213
+ ~s "瓩" : "千瓦" 1
214
+ ~t 瓩
215
+ ~s "á" : "a" 1
216
+ ~t Taiwán
217
+ ~s "SoC" : "S-o-C" 1
218
+ ~t <NAE>SoC<NAE>
219
+ ~s "〇" : "零" 1
220
+ ~t 〇
221
+ ~e #end_of_substitution
222
+
223
+ #****************************************************************************************************************#
224
+ #NTISFW numerals_to_individually_spoken_form_word
225
+ #DPTSFW decimal_point_to_spoken_form_word
226
+ #NWMSFW numerals_with_measure_word_to_spoken_form_word(reserve measure unit)
227
+ #N2SWND non-standard word to spoken-form word(numerical) with non digital input reserved
228
+ #N2IWND non-standard word to spoken-form word(individual) with non digital input reserved
229
+ #TADSRB replace dash and tilde symbols to '到'
230
+ #NWDSFW numerals_with_date_word_to_spoken_form_word
231
+ #FRACSFW fraction_to_spoken_form_word
232
+ #PTSFW proportion_to_spoken_form_word
233
+ #~k : transform rule for numerals
234
+ #~p : prefix language(1 for Mandarin, 2 for English) "string_token"
235
+ #~f : suffix language(1 for Mandarin, 2 for English) "string_token"
236
+ #~v : replace set prefix
237
+ #~m : replace pair 0/1_for_on_off "raw" : "replace"
238
+
239
+ ~r transformation 256
240
+ ~c R1 on 16 0 #3,333、10.5、3,333.5 數值唸法
241
+ ~k DPTSFW
242
+
243
+ ~c R2 on 16 32 #money mark $100
244
+ ~k MMTDSF
245
+
246
+ ~c R3 on 1 7 #英文字母+英文字母、 covid-vaccine+ i.e. A+B -> A加B、 AZ+->AZ加
247
+ ~k NULL
248
+ ~v 4
249
+ ~m 1 "+" : "加"
250
+ ~m 1 "+" : "加"
251
+ ~m 1 "﹢" : "加"
252
+
253
+ ~c R4 on 6 10 #年份 e.g. 2004鼠年,2008龜山
254
+ ~k N2SSTF
255
+
256
+ ~c R5 on 16 13 #1992/09~1995/06
257
+ ~k YMTYMT
258
+
259
+ ~c R6 on 16 0 #9to5
260
+ ~k NESWND
261
+
262
+ ~c R7 on 6 10 #日月年
263
+ ~k DMYTSF
264
+
265
+ ~c R8 on 6 7 #日/月-日/月 i.e. 21/02-28/03 古今天外(第五季)
266
+ ~k DMTODM
267
+
268
+ ~c R9 on 6 10 #年月日
269
+ ~k DTDYMD
270
+
271
+ ~c R10 on 6 10 #西元月年
272
+ ~k DMYTSF
273
+
274
+ ~c R11 on 6 7 #分數,分子分母都大於13 i.e. 13/29
275
+ ~k FSFWTF
276
+
277
+ ~c R12 on 16 7 #西元年念法 2008/11
278
+ ~k YMTSFW
279
+
280
+ ~c R12_subspecies on 16 7 #西元年念法 2021/22
281
+ ~k N2SSTF_LD
282
+
283
+ ~c R13 on 16 4 #3. 數字發音
284
+ ~k DPTSFW
285
+ # ~p 1 "第"
286
+ # ~f 1 ","
287
+
288
+ ~c R14 on 6 1 #serial number i.e. ISBN 978-4-592-14517-2 2013年12月23日 EAN 471-7702-25832-0
289
+ ~k N2IWND
290
+
291
+ ~c R15 on 16 10 #Elizabeth vii
292
+ ~k PRNTPG
293
+
294
+ ~c R16 on 6 1 #三位數:三位數、三位數:二位數、二位數:三位數、二位數(25以上):二位數 幾比幾
295
+ ~k RSFSTF
296
+
297
+ ~c R17 on 3 1 #14位數值以上 數字發音
298
+ ~k NTISFW
299
+
300
+ ~c R18 on 4 1 #羅馬數字轉英文 ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫⅬⅭⅮⅯⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹⅺⅻⅼⅽⅾⅿ
301
+ ~k RACTEA
302
+
303
+ ~c R19 on 16 10 #37℃ 攝氏
304
+ ~k DPWAID
305
+ ~p 1 "攝氏"
306
+ ~f 1 "度"
307
+
308
+ ~c R20 on 16 10 #numeral~numeral
309
+ ~k NULL
310
+ ~v 4
311
+ ~m 1 "~" : "到"
312
+ ~m 1 "~" : "到"
313
+
314
+ ~c R21 on 6 10 #在2/26、3/4、3/9這幾個交易日
315
+ ~k DSPFWW
316
+
317
+ ~c R22 on 16 0 #第number
318
+ ~k N2SWND
319
+ ~v 7
320
+ ~m 1 "~" : "到"
321
+ ~m 1 "~" : "到"
322
+ ~m 1 "-" : "到"
323
+ ~m 1 "-" : "到"
324
+ ~m 1 "–" : "到"
325
+ ~m 1 "—" : "到"
326
+ ~m 1 "−" : "到"
327
+ ~c R23 on 1 4 #四○・八、五・一、五・八%
328
+ ~k CMNTSW
329
+ ~v 3
330
+ ~m 1 "%" : "趴"
331
+ ~m 1 "%" : "趴"
332
+ ~m 1 "﹪" : "趴"
333
+
334
+ ~c R24 on 6 0 #1000+單位 數值唸法
335
+ ~k NWMSFW
336
+
337
+ ~c R25 on 6 0 #1,000+單位 數值唸法
338
+ ~k NWMSFW
339
+
340
+ ~c R26 on 16 10 #37℉ 華氏
341
+ ~k DPWAID
342
+ ~p 1 "華氏"
343
+ ~f 1 "度"
344
+
345
+ ~c R27 on 16 10 #":"章節 i.e 約翰一書2:2;哥林多後書5:21;羅馬書3:23-26
346
+ ~k N2SCAC
347
+
348
+ ~c R28 on 6 100 #1000+英文單位 數值唸法
349
+ ~k NWMSFW
350
+ ~v 100
351
+ ~m 0 "V" : "伏特"
352
+ ~m 0 "A" : "安培"
353
+ ~m 0 "hp" : "匹馬力"
354
+ ~m 0 "g" : "公克"
355
+ ~m 0 "kg" : "公斤"
356
+ ~m 1 "Hz" : " Hertz"
357
+ ~m 1 "GHz" : " giga Hertz"
358
+ ~m 1 "MHz" : " mega Hertz"
359
+ ~m 1 "km" : "公里"
360
+ ~m 0 "cm" : "公分"
361
+ ~m 1 "mm" : "毫米"
362
+ ~m 1 "nm" : "奈米"
363
+ ~m 1 "Gbps" : " giga bits per second"
364
+ ~m 1 "C.C." : "毫升"
365
+ ~m 1 "㏄" : "毫升"
366
+ ~m 1 "/s" : "每秒"
367
+ ~m 1 "/min" : "每分"
368
+ ~m 1 "/h" : "每小時"
369
+ ~m 1 "/hr" : "每小時"
370
+ ~m 1 "/L" : "每升"
371
+
372
+ ~c R29 on 6 100 #1,000+英文單位 數值唸法
373
+ ~k N2SWND
374
+ ~v 100
375
+ ~m 0 "V" : "伏特"
376
+ ~m 0 "A" : "安培"
377
+ ~m 0 "hp" : "匹馬力"
378
+ ~m 0 "g" : "公克"
379
+ ~m 0 "kg" : "公斤"
380
+ ~m 1 "Hz" : " Hertz"
381
+ ~m 1 "MHz" : " mega Hertz"
382
+ ~m 1 "km" : "公里"
383
+ ~m 0 "cm" : "公分"
384
+ ~m 1 "mm" : "毫米"
385
+ ~m 1 "nm" : "奈米"
386
+ ~m 1 "Gbps" : " giga bits per second"
387
+ ~m 1 "C.C." : "毫升"
388
+ ~m 1 "㏄" : "毫升"
389
+ ~m 1 "/" : "每"
390
+
391
+ ~c R30 on 16 1 #昨(31)日 數值唸法
392
+ ~k N2SWND
393
+
394
+ ~c R31 on 16 0 #1、2+單位 數值唸法
395
+ ~k N2SWND_FT
396
+
397
+ ~c R32 on 6 0 #1~2+單位,1-2+單位 到,數值唸法
398
+ ~k TADSRB
399
+
400
+ ~c R33 on 8 0 #http//www.yuanzaicoin.com 網址結構
401
+ ~k URLNOR
402
+
403
+ ~c R34 on 16 0 #滿5000送500、投11中6 數值唸法
404
+ ~k N2SWND
405
+
406
+ ~c R35 on 16 10 #(02)2349-4666 數字唸法
407
+ ~k N2IWND
408
+ ~v 6
409
+ ~m 1 "#" : "轉分機"
410
+ ~m 1 "#" : "轉分機"
411
+ ~m 1 "﹟" : "轉分機"
412
+ ~m 1 "+" : "加"
413
+ ~m 1 "+" : "加"
414
+ ~m 1 "﹢" : "加"
415
+
416
+ ~c R36 on 16 10 #9月16到17日 日期數值唸法
417
+ ~k YMOMDT
418
+
419
+ ~c R37 on 16 0 #傳真:數字、代碼:數字 數字唸法
420
+ ~k N2IWND
421
+
422
+ ~c R38 on 8 0 #e-mail
423
+ ~k EMAILN
424
+
425
+ ~c R39 on 16 0 #covid-19 , number percent , TOP5->TOP five
426
+ ~k NESWND
427
+
428
+ ~c R40 on 6 0 #07/19~07/28
429
+ ~k MDTOMD
430
+
431
+ ~c R41 on 6 0 #03/22(二)
432
+ ~k MDWNDI
433
+
434
+ ~c R42 on 6 0 #03/21~25
435
+ ~k MDTOMD
436
+
437
+ ~c R43 on 6 0 #10:00~18:00
438
+ ~k HMTOHM
439
+
440
+ ~c R44 on 6 0 #802.11ac
441
+ ~k COMPRO
442
+
443
+ ~c R45 on 6 10 #plus sign and minus sign symmetry, i.e. +、- , ++/--, 2,000萬元-1,200萬元-6萬元-2.85萬元
444
+ ~k NULL
445
+ ~v 8
446
+ ~m 1 "+" : "加"
447
+ ~m 1 "+" : "加"
448
+ ~m 1 "﹢" : "加"
449
+ ~m 1 "-" : "減"
450
+ ~m 1 "-" : "減"
451
+ ~m 1 "–" : "減"
452
+ ~m 1 "—" : "減"
453
+ ~m 1 "−" : "減"
454
+
455
+ ~c R46 on 16 0 #台88線、F-16戰機 數值念法(雜)
456
+ ~k N2SWND
457
+ ~v 3
458
+ ~m 1 "~" : "到"
459
+ ~m 1 "~" : "到"
460
+
461
+ ~c R47 on 6 0 #滬深300指數、史坦普500
462
+ ~k SFWFSC
463
+
464
+ ~c R48 on 16 5 #案18590和案18397互相不認識
465
+ ~k NFNLT4
466
+
467
+
468
+ ~c R49 on 6 0 #3:0完封、0:3落敗
469
+ ~k RSFWTF
470
+
471
+ ~c R50 on 16 0 #4比6、3缺1、2選1
472
+ ~k N2SWND
473
+ ~v 5
474
+ ~m 1 "x" : "乘"
475
+ ~m 1 "X" : "乘"
476
+ ~m 1 "x" : "乘"
477
+ ~m 1 "X" : "乘"
478
+ ~m 1 "×" : "乘"
479
+
480
+ ~c R51 on 6 0 #分數念法 1/2學分
481
+ ~k FSFWTF
482
+
483
+ ~c R52 on 1 7 #&#10003; decimal html code
484
+ ~k HICTU8
485
+
486
+ ~c R53 on 6 0 #22:00
487
+ ~k HMTSFW
488
+
489
+ ~c R54 on 16 1 #單獨一數字
490
+ ~k N2IWND
491
+
492
+ ~c unknow_R54 on 16 10 #55688 、 831 、 711
493
+ ~k NSFWSP
494
+
495
+ ~c R55 on 0 16 #24h
496
+ ~k TTFHSF
497
+
498
+ ~c R56 on 16 0 #iphone 11(限制念中文發音)
499
+ ~k CH_N2SWND
500
+
501
+ ~c R57 on 16 0 #'engchar' '+' 'num' ie. BD+34 -> BD plus 三四
502
+ ~k N2IWND
503
+ ~v 7
504
+ ~m 1 "+" : " plus"
505
+ ~m 1 "+" : " plus"
506
+ ~m 1 "﹢" : " plus"
507
+
508
+ ~c R58 on 3 1 #.364的打擊率->點三六四的打擊率 銀色.50口徑Mark XIX型,命名為“.50 GS”,
509
+ ~k DCSOIN
510
+
511
+ ~c R59 on 30 0 #26th->twenty-sixth
512
+ ~k CONAFF
513
+
514
+ ~c R60 on 16 10 #slash -> or, i.e.熟成55/75天的
515
+ ~k N2SWND
516
+ ~v 4
517
+ ~m 1 "/" : "或"
518
+ ~m 1 "/" : "或"
519
+ ~m 1 "∕" : "或"
520
+ ~m 1 "╱" : "或"
521
+
522
+ ~c R61 on 16 10 #2+2, 2x4米, 7x24小時
523
+ ~k CMFSFW
524
+
525
+ ~c R62 on 1 8 #& -> and
526
+ ~k CASSWA
527
+
528
+ ~c R63 on 6 8 #SmartTag+ 、C# 、Disney+ -> Disney plus
529
+ ~k ECCWPS
530
+
531
+ ~c R64 on 4 8 #2018-19賽季
532
+ ~k ABOYTY
533
+
534
+ ~c R65 on 4 8 #PER值計算公式為:(得分+籃板+助攻+搶斷+蓋帽)-(出手次數-命中次數)-(罰球次數-罰球命中次數)-失誤次數/球員上場比賽的場次。
535
+ ~k GFGCAS
536
+
537
+ ~c R66 on 4 0 # 英文字母"O"->"零" i.e.2O2O年
538
+ ~k RUOEAO
539
+
540
+ ~c R67 on 4 16 # ?te -> why-te, 9M88, M.2, CO2
541
+ ~k TPEFRU
542
+
543
+ ~c R68 on 4 16 # Ⅰ期 -> 一期, Ⅱ期 -> 二期
544
+ ~k RUTMSF
545
+
546
+ ~c R69 on 16 16 # 711、7-11 -> seven-eleven
547
+ ~k TN77SE
548
+
549
+ ~c unknow_R69 on 4 1 # 7-11 -> 七一一
550
+ ~k TN77SE
551
+
552
+ ~c R70 on 2 0 # 廿->二十、 卅->三十、 卌->四十
553
+ ~k NULL
554
+ ~v 3
555
+ ~m 1 "廿" : "二十"
556
+ ~m 1 "卅" : "三十"
557
+ ~m 1 "卌" : "四十"
558
+
559
+ ~c R71 on 6 10 #日期 分數 難以區分MMDD[YYYY], i.e. 7/20、 8/5、 05/21/2013
560
+ ~k FFDFCD
561
+
562
+ ~c R72 on 6 10 #year_prefix+(year-year), i.e. 計劃(2023-2027)
563
+ ~k ABOYTY
564
+
565
+ ~c R73 on 16 1 #model_prefix+model_name, i.e. 包括 A2640、A2643 和 A2645 ; P2P、B2C
566
+ ~k N2SSTF_LD
567
+ ~v 3
568
+ ~m 1 "+" : " plus"
569
+ ~m 1 "+" : " plus"
570
+ ~m 1 "﹢" : " plus"
571
+ ~c R74 on 4 1 #numerals+festival, i.e. 包括 99重陽節
572
+ ~k N2IWND
573
+
574
+ ~c R75 on 16 1 #English+"-"+number, i.e. AIM-120
575
+ ~k N2SSTF_LD
576
+
577
+ ~c R76 on 16 1 #English+("."+number)*N,N>=1, i.e. M.2、B.1.621
578
+ ~k EDNMNT
579
+
580
+ ~c R77 on 16 1 #[第]+numerals+期, i.e. 第110065期、3期
581
+ ~k NINB4I
582
+
583
+ ~c R78 on 16 1 #數學符號, i.e. ≠、≤、≥、÷
584
+ ~k MSTSFW
585
+
586
+ ~c R79 on 16 32 #百分比統整3% -10% 12~18% 降低15%
587
+ ~k UAPPUN
588
+
589
+ ~c R80 on 16 1 #型號 ex: i9-12900
590
+ ~k NNLIRW
591
+
592
+ ~c R81 on 16 1 #iPhone 8+、 S20+
593
+ ~k N2SWND
594
+ ~v 7
595
+ ~m 1 "+" : "plus"
596
+ ~m 1 "+" : "plus"
597
+ ~m 1 "﹢" : "plus"
598
+
599
+ ~c R82 on 16 1 #波音737-800型客機
600
+ ~k NILNRW
601
+
602
+
603
+ ~c R83 on 16 10 #開價35~40萬/坪
604
+ ~k MYSHUT
605
+
606
+
607
+ ~c R84 on 16 10 #可以參考書中P134~P136寫的
608
+ ~k RWPOST
609
+ #~p 2 "pages "
610
+ #~f 1 "頁"
611
+
612
+ ~c R85 on 16 10 #有4000+建案成交
613
+ ~k N2SWND
614
+ ~v 6
615
+ ~m 1 "+" : "以上"
616
+ ~m 1 "+" : "以上"
617
+ ~m 1 "﹢" : "以上"
618
+
619
+ ~c R86 on 16 10 #9/17(三)~9/19(5)
620
+ ~k MSDWDW
621
+
622
+ ~c R87 on 16 10 #8/15(2)起~2023/11
623
+ ~k MDD2YM
624
+
625
+ ~c R88 on 16 10 #T+0
626
+ ~k N2SWND
627
+ ~v 6
628
+ ~m 1 "+" : "加"
629
+ ~m 1 "+" : "加"
630
+ ~m 1 "﹢" : "加"
631
+
632
+ ~c R89 on 16 10 #IC+IT
633
+ ~k NULL
634
+
635
+ ~c R90 on 16 10 #A-1+
636
+ ~k NESWND
637
+ ~v 8
638
+ ~m 1 "+" : "plus "
639
+ ~m 1 "+" : "plus "
640
+ ~m 1 "﹢" : "plus "
641
+ ~m 1 "-" : ""
642
+ ~m 1 "-" : ""
643
+ ~m 1 "–" : ""
644
+ ~m 1 "—" : ""
645
+ ~m 1 "−" : ""
646
+
647
+ ~c R91 on 16 10 #2分之1
648
+ ~k N2SWND
649
+
650
+ ~c R92 on 4 10 #XA~XS、即日起-5/16(一)可參加報名, 10月13至11月26日
651
+ ~k N2SWND
652
+ ~v 7
653
+ ~m 1 "~" : "到"
654
+ ~m 1 "~" : "到"
655
+ ~m 1 "-" : "到"
656
+ ~m 1 "-" : "到"
657
+ ~m 1 "–" : "到"
658
+ ~m 1 "—" : "到"
659
+ ~m 1 "−" : "到"
660
+
661
+ ~c R93 on 16 10 # iOS 16.0.3
662
+ ~k VMPNDN
663
+
664
+ ~c R94 on 16 10 # 時間帶秒 i.e.17:53:28
665
+ ~k THCMCS
666
+
667
+ ~c R95 on 16 32 #千分比統整3‰ -10‰ 12~18‰ 降低15‰
668
+ ~k UAPPMU
669
+
670
+ ~c R96 on 16 10 #semi_Numerical_suffix, i.e. 2024對賴清德、17對情侶
671
+ ~k SEMINS
672
+
673
+ ~c R97 on 16 0 #路2段(雜) 2番戰
674
+ ~k N2IWND
675
+
676
+ ~c R98 on 16 0 #IP address 192.168.50.69
677
+ ~k N2IWND
678
+ ~v 13
679
+ ~m 1 "." : "點"
680
+ ~m 1 "." : "點"
681
+ ~m 1 "‧" : "點"
682
+ ~m 1 "~" : "到"
683
+ ~m 1 "~" : "到"
684
+ ~m 1 "-" : "到"
685
+ ~m 1 "-" : "到"
686
+ ~m 1 "–" : "到"
687
+ ~m 1 "—" : "到"
688
+ ~m 1 "−" : "到"
689
+
690
+ ~c R99 on 16 0 #U+2030(unicode number)
691
+ ~k N2IWND
692
+
693
+ ~c R100 on 16 0 #E-BD2/3、E-BG2/3
694
+ ~k N2SWND
695
+
696
+ ~c R101 on 16 0 #60°26′10″N 134°15′02″W
697
+ ~k MASAFW
698
+
699
+ ~c R102 on 16 0 #No.1 Part.1 vol.1
700
+ ~k EAWWON
701
+
702
+ ~c R103 on 16 0 #2H+ + O2− → H2O
703
+ ~k CFTSFW
704
+
705
+ ~c R104 on 16 0 #2009-2010 10001-11000
706
+ ~k RWAPAS
707
+
708
+ ~c R105 on 16 0 #filler annoying English char CHEM.is.TRY、H.Spectrum+
709
+ ~k NULL
710
+ ~v 3
711
+ ~m 1 "+" : " plus"
712
+ ~m 1 "+" : " plus"
713
+ ~m 1 "﹢" : " plus"
714
+
715
+ ~c R106 on 16 0 #比賽成績 i.e. 2:13.51 +7.28
716
+ ~k CRTSFW
717
+
718
+ ~c R107 on 16 0 #Latex i.e. 10^{-24}
719
+ ~k LATEXT
720
+
721
+ ~c R108 on 16 0 #competition_result i.e. 意大利隊以2-1取得勝利
722
+ ~k CH_N2SWND
723
+ ~v 5
724
+ ~m 1 "-" : "比"
725
+ ~m 1 "-" : "比"
726
+ ~m 1 "–" : "比"
727
+ ~m 1 "—" : "比"
728
+ ~m 1 "−" : "比"
729
+
730
+ ~c R109 on 16 0 #廣播電台 i.e. FM98.7
731
+ ~k N2IWND
732
+ ~v 3
733
+ ~m 1 "." : "點"
734
+ ~m 1 "." : "點"
735
+ ~m 1 "‧" : "點"
736
+
737
+ ~c R110 on 16 0 #個別發音的範圍 i.e. 車次總範圍為6001-7598,其中:. (車次爲6001-7598)
738
+ ~k RSFSTF
739
+ ~v 7
740
+ ~m 1 "~" : "到"
741
+ ~m 1 "~" : "到"
742
+ ~m 1 "-" : "到"
743
+ ~m 1 "-" : "到"
744
+ ~m 1 "–" : "到"
745
+ ~m 1 "—" : "到"
746
+ ~m 1 "−" : "到"
747
+
748
+ ~c R111 on 16 0 #'^'當作次方 3^2=9 10^2 -5^2 10^0.5 1.5^8 2^2.71828
749
+ ~k HSAESF
750
+
751
+ ~c R112 on 16 0 #列舉 i.e. 分別為「1、4、13、17、34、39」
752
+ ~k N2SWND
753
+ ~v 3
754
+ ~m 1 "+" : " plus"
755
+ ~m 1 "+" : " plus"
756
+ ~m 1 "﹢" : " plus"
757
+
758
+ ~c R113 on 16 0 #法規 i.e. 第23條、23-1條規定辦理的兩年回訓一次制度
759
+ ~k DAFROL
760
+
761
+ ~c R114 on 16 0 #門牌號 i.e. 台北市文山區萬隆街47-12號 區塊開發3-2期
762
+ ~k N2SWND
763
+ ~v 5
764
+ ~m 1 "-" : "之"
765
+ ~m 1 "-" : "之"
766
+ ~m 1 "–" : "之"
767
+ ~m 1 "—" : "之"
768
+ ~m 1 "−" : "之"
769
+
770
+ ~c R115 on 16 0 #'/' -> '每' i.e.微克/毫升
771
+ ~k NULL
772
+ ~v 4
773
+ ~m 1 "/" : "每"
774
+ ~m 1 "/" : "每"
775
+ ~m 1 "∕" : "每"
776
+ ~m 1 "╱" : "每"
777
+
778
+ ~c R116 on 16 0 #幾月幾日幾時幾分-幾月幾日幾時幾分
779
+ ~k MDHMTO
780
+
781
+ ~c R117 on 16 0 #幾分幾秒 i.e. 18:11:、41:08:
782
+ ~k THMSSF
783
+
784
+ ~c R118 on 16 0 #Numerical_format-Numerical_format i.e. 3,500-3,667
785
+ ~k NFSNFS
786
+
787
+ ~c R119 on 16 0 #numerical for Numerals less than 3 times ; shortest first for others
788
+ ~k NFNLT3
789
+
790
+ ~c R120 on 16 0 #Plus->正、Minus->負
791
+ ~k N2SWND
792
+ ~v 8
793
+ ~m 1 "+" : "正"
794
+ ~m 1 "+" : "正"
795
+ ~m 1 "﹢" : "正"
796
+ ~m 1 "-" : "負"
797
+ ~m 1 "-" : "負"
798
+ ~m 1 "–" : "負"
799
+ ~m 1 "—" : "負"
800
+ ~m 1 "−" : "負"
801
+
802
+ ~c R121 on 16 0 #">" -> "大於"
803
+ ~k N2SWND
804
+ ~v 3
805
+ ~m 1 ">" : "大於"
806
+ ~m 1 ">" : "大於"
807
+ ~m 1 "﹥" : "大於"
808
+
809
+ ~c R122 on 16 0 #2,000萬元-1,200萬元-6萬元-2.85萬元
810
+ ~k NUDLOP
811
+
812
+ ~c R123 on 4 0 #三○○億元、二○二二年度
813
+ ~k CNWAAN
814
+
815
+ ~c R124 on 4 0 #0開頭英文字母結尾 i.e. 他列00981A僅漲0.71% 網酸
816
+ ~k N2IWND
817
+
818
+ ~e #end_of_transformation
@@ -0,0 +1,42 @@
1
+ Metadata-Version: 2.4
2
+ Name: rbtn
3
+ Version: 0.1.0
4
+ Summary: RBTN C code to Python ctypes wrapper
5
+ Author-email: "WuHao, Li" <hank12451@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/cewarman/rbtn
8
+ Requires-Python: >=3.8
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Dynamic: license-file
12
+
13
+ # Rule-based text normalization system
14
+
15
+ ## ✨ Features
16
+ 消除文字的歧異,為文字轉語音系統的前置工作。<br>
17
+ 如:<br>
18
+ &emsp;7/19是星期五 -> 七月十九號是星期五<br>
19
+ &emsp;2/3的人不喜歡香菜 -> 三分之二的人不喜歡香菜<br>
20
+ ## 🚀 Quick Start
21
+ ```terminal
22
+ import rbtn.cetn as rbcetn
23
+
24
+ tn=rbcetn.textnormalizer()
25
+ nt=tn.get_normalized_text(['30年', '', '0050'])
26
+ print(nt)
27
+ ```
28
+ ## 📊 Demo / Results
29
+ <img width="1852" height="604" alt="image" src="https://github.com/user-attachments/assets/8a8a9544-a769-48b2-bbec-e12771630ceb" />
30
+
31
+ ## 📁 Project Structure
32
+
33
+ <img width="630" height="154" alt="RBTN" src="https://github.com/user-attachments/assets/2fd389f3-b4ec-42c2-9c7c-a53c40b35981" />
34
+
35
+
36
+ ## ⚙️ Configuration
37
+ ## 📚 Citation
38
+ ## 📜 License
39
+
40
+ This project is available for academic and research use only.
41
+
42
+ Commercial use requires explicit permission from the author.
@@ -0,0 +1,11 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/rbtn/__init__.py
5
+ src/rbtn/cetn.py
6
+ src/rbtn/pypiRBTN.so
7
+ src/rbtn/rules_config.txt
8
+ src/rbtn.egg-info/PKG-INFO
9
+ src/rbtn.egg-info/SOURCES.txt
10
+ src/rbtn.egg-info/dependency_links.txt
11
+ src/rbtn.egg-info/top_level.txt
@@ -0,0 +1 @@
1
+ rbtn