phoonnx 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phoonnx/__init__.py +0 -0
- phoonnx/config.py +490 -0
- phoonnx/locale/ca/phonetic_spellings.txt +2 -0
- phoonnx/locale/en/phonetic_spellings.txt +1 -0
- phoonnx/locale/gl/phonetic_spellings.txt +2 -0
- phoonnx/locale/pt/phonetic_spellings.txt +2 -0
- phoonnx/phoneme_ids.py +453 -0
- phoonnx/phonemizers/__init__.py +45 -0
- phoonnx/phonemizers/ar.py +42 -0
- phoonnx/phonemizers/base.py +216 -0
- phoonnx/phonemizers/en.py +250 -0
- phoonnx/phonemizers/fa.py +46 -0
- phoonnx/phonemizers/gl.py +142 -0
- phoonnx/phonemizers/he.py +67 -0
- phoonnx/phonemizers/ja.py +119 -0
- phoonnx/phonemizers/ko.py +97 -0
- phoonnx/phonemizers/mul.py +606 -0
- phoonnx/phonemizers/vi.py +44 -0
- phoonnx/phonemizers/zh.py +308 -0
- phoonnx/thirdparty/__init__.py +0 -0
- phoonnx/thirdparty/arpa2ipa.py +249 -0
- phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
- phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
- phoonnx/thirdparty/hangul2ipa.py +783 -0
- phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
- phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
- phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
- phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
- phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
- phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
- phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
- phoonnx/thirdparty/ko_tables/yale.csv +22 -0
- phoonnx/thirdparty/kog2p/__init__.py +385 -0
- phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
- phoonnx/thirdparty/mantoq/__init__.py +67 -0
- phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
- phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
- phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
- phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
- phoonnx/thirdparty/mantoq/num2words.py +37 -0
- phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
- phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
- phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
- phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
- phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
- phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
- phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
- phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
- phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
- phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
- phoonnx/thirdparty/tashkeel/LICENSE +22 -0
- phoonnx/thirdparty/tashkeel/SOURCE +1 -0
- phoonnx/thirdparty/tashkeel/__init__.py +212 -0
- phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
- phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
- phoonnx/thirdparty/tashkeel/model.onnx +0 -0
- phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
- phoonnx/thirdparty/zh_num.py +238 -0
- phoonnx/util.py +705 -0
- phoonnx/version.py +6 -0
- phoonnx/voice.py +521 -0
- phoonnx-0.0.0.dist-info/METADATA +255 -0
- phoonnx-0.0.0.dist-info/RECORD +86 -0
- phoonnx-0.0.0.dist-info/WHEEL +5 -0
- phoonnx-0.0.0.dist-info/top_level.txt +2 -0
- phoonnx_train/__main__.py +151 -0
- phoonnx_train/export_onnx.py +109 -0
- phoonnx_train/norm_audio/__init__.py +92 -0
- phoonnx_train/norm_audio/trim.py +54 -0
- phoonnx_train/norm_audio/vad.py +54 -0
- phoonnx_train/preprocess.py +420 -0
- phoonnx_train/vits/__init__.py +0 -0
- phoonnx_train/vits/attentions.py +427 -0
- phoonnx_train/vits/commons.py +147 -0
- phoonnx_train/vits/config.py +330 -0
- phoonnx_train/vits/dataset.py +214 -0
- phoonnx_train/vits/lightning.py +352 -0
- phoonnx_train/vits/losses.py +58 -0
- phoonnx_train/vits/mel_processing.py +139 -0
- phoonnx_train/vits/models.py +732 -0
- phoonnx_train/vits/modules.py +527 -0
- phoonnx_train/vits/monotonic_align/__init__.py +20 -0
- phoonnx_train/vits/monotonic_align/setup.py +13 -0
- phoonnx_train/vits/transforms.py +212 -0
- phoonnx_train/vits/utils.py +16 -0
- phoonnx_train/vits/wavfile.py +860 -0
phoonnx/util.py
ADDED
@@ -0,0 +1,705 @@
|
|
1
|
+
import datetime
|
2
|
+
import logging
|
3
|
+
import re
|
4
|
+
import string
|
5
|
+
from datetime import date
|
6
|
+
|
7
|
+
from ovos_date_parser import nice_time, nice_date
|
8
|
+
from ovos_number_parser import pronounce_number, is_fractional, pronounce_fraction
|
9
|
+
from ovos_number_parser.util import is_numeric
|
10
|
+
from unicode_rbnf import RbnfEngine, FormatPurpose
|
11
|
+
|
12
|
+
# Module-level logger shared by every helper in this file. The logger name is
# the fixed string "normalize" rather than being derived from __name__.
LOG = logging.getLogger("normalize")
|
13
|
+
|
14
|
+
# Contractions and their expanded forms, keyed first by primary language code
# and then by the contraction itself. Keys are case-sensitive ("I'm" and
# "Whatcha" are capitalised, everything else is lower case). Only English is
# populated; other languages simply have no entry.
CONTRACTIONS = {
    "en": {
        "I'd": "I would",
        "I'll": "I will",
        "I'm": "I am",
        "I've": "I have",
        "ain't": "is not",
        "aren't": "are not",
        "can't": "can not",
        "could've": "could have",
        "couldn't": "could not",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "gonna": "going to",
        "gotta": "got to",
        "hadn't": "had not",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'll": "he will",
        "he's": "he is",
        "how'd": "how did",
        "how'll": "how will",
        "how's": "how is",
        "isn't": "is not",
        "it'd": "it would",
        "it'll": "it will",
        "it's": "it is",
        "might've": "might have",
        "mightn't": "might not",
        "must've": "must have",
        "mustn't": "must not",
        "needn't": "need not",
        "oughtn't": "ought not",
        "shan't": "shall not",
        "she'd": "she would",
        "she'll": "she will",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "somebody's": "somebody is",
        "someone'd": "someone would",
        "someone'll": "someone will",
        "someone's": "someone is",
        "that'd": "that would",
        "that'll": "that will",
        "that's": "that is",
        "there'd": "there would",
        "there're": "there are",
        "there's": "there is",
        "they'd": "they would",
        "they'll": "they will",
        "they're": "they are",
        "they've": "they have",
        "wasn't": "was not",
        "we'd": "we would",
        "we'll": "we will",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'd": "what did",
        "what'll": "what will",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "whats": "what is",
        "when'd": "when did",
        "when's": "when is",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'd": "who would",
        "who'd've": "who would have",
        "who'll": "who will",
        "who're": "who are",
        "who's": "who is",
        "who've": "who have",
        "why'd": "why did",
        "why're": "why are",
        "why's": "why is",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'ain't": "you are not",
        "y'aint": "you are not",
        "y'all": "you all",
        "ya'll": "you all",
        "you'd": "you would",
        "you'd've": "you would have",
        "you'll": "you will",
        "you're": "you are",
        "you've": "you have",
        "I'm'a": "I am going to",
        "I'm'o": "I am going to",
        "I'll've": "I will have",
        "I'd've": "I would have",
        "Whatcha": "What are you",
        "amn't": "am not",
        "'cause": "because",
        "can't've": "cannot have",
        "couldn't've": "could not have",
        "daren't": "dare not",
        "daresn't": "dare not",
        "dasn't": "dare not",
        "everyone's": "everyone is",
        "gimme": "give me",
        "gon't": "go not",
        "hadn't've": "had not have",
        # "he've" contracts "he have"; "he would have" is "he'd've" below.
        "he've": "he have",
        "he'll've": "he will have",
        "he'd've": "he would have",
        "here's": "here is",
        "how're": "how are",
        "how'd'y": "how do you do",
        "howd'y": "how do you do",
        "howdy": "how do you do",
        "'tis": "it is",
        "'twas": "it was",
        "it'll've": "it will have",
        "it'd've": "it would have",
        "kinda": "kind of",
        "let's": "let us",
        "ma'am": "madam",
        "may've": "may have",
        "mayn't": "may not",
        "mightn't've": "might not have",
        "mustn't've": "must not have",
        "needn't've": "need not have",
        "ol'": "old",
        "oughtn't've": "ought not have",
        "sha'n't": "shall not",
        "shalln't": "shall not",
        "shan't've": "shall not have",
        "she'd've": "she would have",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so is",
        "something's": "something is",
        "that're": "that are",
        "that'd've": "that would have",
        "there'll": "there will",
        "there'd've": "there would have",
        "these're": "these are",
        "they'll've": "they will have",
        "they'd've": "they would have",
        "this's": "this is",
        "this'll": "this will",
        "this'd": "this would",
        "those're": "those are",
        "to've": "to have",
        "wanna": "want to",
        "we'll've": "we will have",
        "we'd've": "we would have",
        "what'll've": "what will have",
        "when've": "when have",
        "where're": "where are",
        "which's": "which is",
        "who'll've": "who will have",
        "why've": "why have",
        "will've": "will have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "you'll've": "you will have",
    }
}
|
187
|
+
|
188
|
+
# Abbreviated titles/honorifics and their spoken forms, keyed first by primary
# language code and then by the abbreviation exactly as written (including any
# trailing period). Looked up per whitespace-separated word by _normalize_word.
TITLES = {
    "en": {
        "Dr.": "Doctor",
        "Mr.": "Mister",
        "Prof.": "Professor",
    },
    "ca": {
        "Dr.": "Doctor",
        "Sr.": "Senyor",
        "Sra.": "Senyora",
        "Prof.": "Professor",
    },
    "es": {
        "Dr.": "Doctor",
        "Sr.": "Señor",
        "Sra.": "Señora",
        "Prof.": "Profesor",
        "D.": "Don",
        "Dña.": "Doña",
    },
    "pt": {
        "Dr.": "Doutor",
        "Sr.": "Senhor",
        "Sra.": "Senhora",
        "Prof.": "Professor",
        "Drª.": "Doutora",
        "Eng.": "Engenheiro",
        "D.": "Dom",
        "Dª": "Dona",
    },
    "gl": {
        "Dr.": "Doutor",
        "Sr.": "Señor",
        "Sra.": "Señora",
        "Prof.": "Profesor",
        "Srta.": "Señorita",
    },
    "fr": {
        "Dr.": "Docteur",
        "M.": "Monsieur",
        "Mme": "Madame",
        "Mlle": "Mademoiselle",
        "Prof.": "Professeur",
        "Pr.": "Professeur",
    },
    "it": {
        "Dr.": "Dottore",
        "Sig.": "Signore",
        "Sig.ra": "Signora",
        "Prof.": "Professore",
        "Dott.ssa": "Dottoressa",
        "Sig.na": "Signorina",
    },
    "nl": {
        "Dr.": "Dokter",
        "Dhr.": "De Heer",
        "Mevr.": "Mevrouw",
        "Prof.": "Professor",
        # Dutch academic title "drs." is short for "doctorandus".
        "Drs.": "Doctorandus",
        "Ing.": "Ingenieur",
    },
    "de": {
        "Dr.": "Doktor",
        "Prof.": "Professor",
    },
}
|
255
|
+
|
256
|
+
# Unit symbols and their spoken (plural) forms, keyed first by primary
# language code and then by the symbol as written. _normalize_units splits
# each table into "symbolic" keys (str.isalnum() is False, e.g. "%", "€") and
# "alphanumeric" keys (e.g. "kg", "ºC") and builds one regex per group.
UNITS = {
    "en": {
        "€": "euros",
        "%": "per cent",
        "ºC": "degrees celsius",
        "ºF": "degrees fahrenheit",
        "ºK": "degrees kelvin",
        "°": "degrees",
        "$": "dollars",
        "£": "pounds",
        "km": "kilometers",
        "m": "meters",
        "cm": "centimeters",
        "mm": "millimeters",
        "ft": "feet",
        "in": "inches",
        "yd": "yards",
        "mi": "miles",
        "kg": "kilograms",
        "g": "grams",
        "lb": "pounds",
        "oz": "ounces",
        "L": "liters",
        "mL": "milliliters",
        "gal": "gallons",
        "qt": "quarts",
        "pt": "pints",
        "hr": "hours",
        "min": "minutes",
        "s": "seconds",
    },
    "pt": {
        "€": "euros",
        "%": "por cento",
        "ºC": "graus celsius",
        "ºF": "graus fahrenheit",
        "ºK": "graus kelvin",
        "°": "graus",
        "$": "dólares",
        "£": "libras",
        "km": "quilômetros",
        "m": "metros",
        "cm": "centímetros",
        "mm": "milímetros",
        "kg": "quilogramas",
        "g": "gramas",
        "L": "litros",
        "mL": "mililitros",
        "h": "horas",
        "min": "minutos",
        "s": "segundos",
    },
    "es": {
        "€": "euros",
        "%": "por ciento",
        "ºC": "grados celsius",
        "ºF": "grados fahrenheit",
        "ºK": "grados kelvin",
        "°": "grados",
        "$": "dólares",
        "£": "libras",
        "km": "kilómetros",
        "m": "metros",
        "cm": "centímetros",
        "kg": "kilogramos",
        "g": "gramos",
        "L": "litros",
        # Spanish spells the prefix "mili-" with a single "l".
        "mL": "mililitros",
    },
    "fr": {
        "€": "euros",
        "%": "pour cent",
        "ºC": "degrés celsius",
        "ºF": "degrés fahrenheit",
        "ºK": "degrés kelvin",
        "°": "degrés",
        "$": "dollars",
        "£": "livres",
        "km": "kilomètres",
        "m": "mètres",
        "cm": "centimètres",
        "kg": "kilogrammes",
        "g": "grammes",
        "L": "litres",
        "mL": "millilitres",
    },
    "de": {
        "€": "Euro",
        "%": "Prozent",
        "ºC": "Grad Celsius",
        "ºF": "Grad Fahrenheit",
        "ºK": "Grad Kelvin",
        "°": "Grad",
        "$": "Dollar",
        "£": "Pfund",
        "km": "Kilometer",
        "m": "Meter",
        "cm": "Zentimeter",
        "kg": "Kilogramm",
        "g": "Gramm",
        "L": "Liter",
        "mL": "Milliliter",
    },
}
|
360
|
+
|
361
|
+
|
362
|
+
def _get_number_separators(full_lang: str) -> tuple[str, str]:
|
363
|
+
"""
|
364
|
+
Determines decimal and thousands separators based on language.
|
365
|
+
Defaults to '.' decimal and ',' thousands for most languages.
|
366
|
+
Special cases:
|
367
|
+
- 'pt', 'es', 'fr', 'de': ',' decimal and '.' thousands.
|
368
|
+
"""
|
369
|
+
lang_code = full_lang.split("-")[0]
|
370
|
+
decimal_separator = '.'
|
371
|
+
thousands_separator = ','
|
372
|
+
if lang_code in ["pt", "es", "fr", "de"]:
|
373
|
+
decimal_separator = ','
|
374
|
+
thousands_separator = '.'
|
375
|
+
return decimal_separator, thousands_separator
|
376
|
+
|
377
|
+
|
378
|
+
def _normalize_number_word(word: str, full_lang: str, rbnf_engine) -> str:
    """
    Spell out a single whitespace-delimited token if it is a number or fraction.

    Trailing punctuation (``string.punctuation``) is stripped before parsing
    and re-appended to the result, so "12," becomes "<twelve>,".

    Args:
        word: the raw token.
        full_lang: language tag passed through to the ovos-number-parser
            helpers; its primary subtag selects the decimal/thousands
            separators via ``_get_number_separators``.
        rbnf_engine: object exposing ``format_number(number, purpose)`` whose
            result has a ``.text`` attribute, or a falsy value when no engine
            is available. Used only as a fallback.

    Returns:
        The spoken form plus any trailing punctuation, or ``word`` unchanged
        when the token is not numeric or every pronunciation attempt failed.
    """
    cleaned_word = word.rstrip(string.punctuation)
    trailing = word[len(cleaned_word):]

    # Handle fractions like '3/3'
    if is_fraction(cleaned_word):
        try:
            return pronounce_fraction(cleaned_word, full_lang) + trailing
        except Exception as e:
            LOG.error(f"ovos-number-parser failed to pronounce fraction: {word} - ({e})")
            return word

    # Rewrite locale-specific separators into a plain "1234.56" form.
    decimal_separator, thousands_separator = _get_number_separators(full_lang)
    temp_cleaned_word = cleaned_word

    # Formats such as '123.456,78': a thousands separator that appears before
    # the decimal separator.
    has_thousands_and_decimal = (
        thousands_separator in temp_cleaned_word
        and decimal_separator in temp_cleaned_word
        and temp_cleaned_word.index(thousands_separator) < temp_cleaned_word.index(decimal_separator)
    )

    if has_thousands_and_decimal:
        temp_cleaned_word = temp_cleaned_word.replace(thousands_separator, "")
        temp_cleaned_word = temp_cleaned_word.replace(decimal_separator, ".")
    elif decimal_separator in temp_cleaned_word and is_numeric(temp_cleaned_word.replace(decimal_separator, ".", 1)):
        # Handle cases like '1,2' -> '1.2'
        temp_cleaned_word = temp_cleaned_word.replace(decimal_separator, ".")
    elif thousands_separator in temp_cleaned_word and is_numeric(temp_cleaned_word.replace(thousands_separator, "", 1)):
        # Handle cases like '1.234' -> '1234'
        temp_cleaned_word = temp_cleaned_word.replace(thousands_separator, "")

    if is_numeric(temp_cleaned_word):
        try:
            num = float(temp_cleaned_word) if "." in temp_cleaned_word else int(temp_cleaned_word)
            return pronounce_number(num, lang=full_lang) + trailing
        except Exception as e:
            # Do not give up yet: fall through so the RBNF engine can still
            # handle plain integers when ovos-number-parser cannot.
            LOG.error(f"ovos-number-parser failed to pronounce number: {word} - ({e})")

    # Fallback: RBNF cardinal spelling for plain digit strings.
    if rbnf_engine and cleaned_word.isdigit():
        try:
            pronounced_number = rbnf_engine.format_number(cleaned_word, FormatPurpose.CARDINAL).text
            return pronounced_number + trailing
        except Exception as e:
            LOG.error(f"unicode-rbnf failed to pronounce number: {word} - ({e})")

    return word
|
433
|
+
|
434
|
+
|
435
|
+
# --- Date and Time Pronunciation ---
|
436
|
+
def pronounce_date(date_obj: date, full_lang: str) -> str:
    """
    Return the spoken form of ``date_obj`` for ``full_lang``.

    Thin wrapper that delegates to ``nice_date`` from ovos-date-parser.
    """
    spoken_date = nice_date(date_obj, full_lang)
    return spoken_date
|
441
|
+
|
442
|
+
|
443
|
+
def pronounce_time(time_string: str, full_lang: str) -> str:
    """
    Pronounce a 24-hour time written as "<hours>h<minutes>", e.g. "15h01".

    The string is split on the letter "h" (either case, because the caller's
    pattern matches it case-insensitively), converted to a ``datetime.time``
    and handed to ``nice_time`` from ovos-date-parser with 24-hour speech
    output.

    Args:
        time_string: text such as "15h01" or "15H01".
        full_lang: language tag forwarded to ``nice_time``.

    Returns:
        The spoken time. If the string cannot be parsed (wrong shape, hour or
        minute out of range) or ``nice_time`` fails, a warning is logged and
        the input is returned with every "h"/"H" replaced by a space.
    """
    try:
        # Exactly two pieces are required; anything else raises ValueError
        # on unpacking and is handled below.
        hours, mins = re.split("h", time_string, flags=re.IGNORECASE)
        time_obj = datetime.time(int(hours), int(mins))
        # Use nice_time from ovos-date-parser
        return nice_time(time_obj, full_lang, speech=True, use_24hour=True, use_ampm=False)
    except Exception as e:
        LOG.warning(f"Failed to parse time string '{time_string}': {e}")
        return re.sub("h", " ", time_string, flags=re.IGNORECASE)
|
457
|
+
|
458
|
+
|
459
|
+
def _normalize_dates_and_times(text: str, full_lang: str, date_format: str = "DMY") -> str:
|
460
|
+
"""
|
461
|
+
Helper function to normalize dates and times using regular expressions.
|
462
|
+
This prepares the strings for pronunciation.
|
463
|
+
"""
|
464
|
+
lang_code = full_lang.split("-")[0]
|
465
|
+
# Pre-process with regex to handle English am/pm times
|
466
|
+
if lang_code == "en":
|
467
|
+
text = re.sub(r"(?i)(\d+)(am|pm)", r"\1 \2", text)
|
468
|
+
# Handle the pronunciation for TTS
|
469
|
+
text = text.replace("am", "A M").replace("pm", "P M")
|
470
|
+
|
471
|
+
# Normalize times like "15h01" to words
|
472
|
+
time_pattern = re.compile(r"(\d{1,2})h(\d{2})", re.IGNORECASE)
|
473
|
+
|
474
|
+
def replace_time(match):
|
475
|
+
time_str = match.group(0)
|
476
|
+
return pronounce_time(time_str, full_lang)
|
477
|
+
|
478
|
+
text = time_pattern.sub(replace_time, text)
|
479
|
+
|
480
|
+
# Find dates like "DD/MM/YYYY" or "YYYY/MM/DD"
|
481
|
+
date_pattern = re.compile(r"(\d{1,4})[/-](\d{1,2})[/-](\d{1,4})")
|
482
|
+
|
483
|
+
match = date_pattern.search(text)
|
484
|
+
|
485
|
+
if match:
|
486
|
+
# Get the three parts of the date string
|
487
|
+
part1_str, part2_str, part3_str = match.groups()
|
488
|
+
p1, p2, p3 = int(part1_str), int(part2_str), int(part3_str)
|
489
|
+
|
490
|
+
# Initialize month, day, and year
|
491
|
+
month, day, year = None, None, None
|
492
|
+
|
493
|
+
# Determine year first based on length (4 digits)
|
494
|
+
if len(part1_str) == 4:
|
495
|
+
year, rest_parts = p1, [p2, p3]
|
496
|
+
elif len(part3_str) == 4:
|
497
|
+
year, rest_parts = p3, [p1, p2]
|
498
|
+
else:
|
499
|
+
# If no 4-digit year, it's ambiguous, assume a 2-digit year.
|
500
|
+
# We'll assume the last part is the year based on common patterns.
|
501
|
+
year = p3
|
502
|
+
# Expand 2-digit year to 4-digit year
|
503
|
+
if year < 100:
|
504
|
+
# Assume years 00-29 are 2000-2029, 30-99 are 1930-1999
|
505
|
+
year = 2000 + year if year < 30 else 1900 + year
|
506
|
+
rest_parts = [p1, p2]
|
507
|
+
|
508
|
+
# From the remaining parts, try to determine day and month
|
509
|
+
if day is None and any(p > 12 and len(str(p)) == 2 for p in rest_parts):
|
510
|
+
# If a two-digit number is > 12, it's a day
|
511
|
+
day_candidate = next((p for p in rest_parts if p > 12), None)
|
512
|
+
if day_candidate:
|
513
|
+
day = day_candidate
|
514
|
+
rest_parts.remove(day_candidate)
|
515
|
+
month = rest_parts[0]
|
516
|
+
|
517
|
+
# Fallback to date_format if day/month are still ambiguous
|
518
|
+
if day is None or month is None:
|
519
|
+
if date_format.lower() == "mdy":
|
520
|
+
month, day = rest_parts[0], rest_parts[1]
|
521
|
+
else: # default to DD/MM/YY
|
522
|
+
day, month = rest_parts[0], rest_parts[1]
|
523
|
+
|
524
|
+
try:
|
525
|
+
date_obj = date(year, month, day)
|
526
|
+
pronounced_date_str = pronounce_date(date_obj, full_lang)
|
527
|
+
text = text.replace(match.group(0), pronounced_date_str)
|
528
|
+
except (ValueError, IndexError) as e:
|
529
|
+
LOG.warning(f"Could not parse date from '{match.group(0)}': {e}")
|
530
|
+
|
531
|
+
return text
|
532
|
+
|
533
|
+
|
534
|
+
def _normalize_word_hyphen_digit(text: str) -> str:
|
535
|
+
"""
|
536
|
+
Helper function to normalize words attached to digits with a hyphen,
|
537
|
+
such as 'sub-23' -> 'sub 23'.
|
538
|
+
"""
|
539
|
+
# Regex to find a word (\w+) followed by a hyphen and a digit (\d+)
|
540
|
+
pattern = re.compile(r"(\w+)-(\d+)")
|
541
|
+
text = pattern.sub(r"\1 \2", text)
|
542
|
+
return text
|
543
|
+
|
544
|
+
|
545
|
+
def _normalize_units(text: str, full_lang: str) -> str:
    """
    Expand "<number><unit>" pairs (e.g. '10kg', '0.5%') into spoken words.

    Units come from ``UNITS`` for the primary subtag of ``full_lang``; text in
    a language without a table is returned unchanged. Symbolic units (keys
    for which ``str.isalnum()`` is False, such as '%' or '€') are replaced
    first; alphanumeric units (such as 'kg') are replaced next and must end
    at a word boundary so that e.g. '5mm' is not read as '5m' + 'm'.

    Unit matching is case-insensitive: an exact-case key wins, otherwise the
    case-folded symbol is looked up. A match that cannot be resolved or
    pronounced is left exactly as written.

    Args:
        text: input sentence.
        full_lang: language tag; also selects the decimal/thousands
            separators and is forwarded to ``pronounce_number``.

    Returns:
        The text with recognised number+unit pairs spelled out.
    """
    lang_code = full_lang.split("-")[0]
    if lang_code not in UNITS:
        return text

    # Determine number separators for the language
    decimal_separator, thousands_separator = _get_number_separators(full_lang)
    # Digits with at most one thousands separator and one decimal separator.
    number_pattern_str = rf"(\d+[{re.escape(thousands_separator)}]?\d*[{re.escape(decimal_separator)}]?\d*)"

    def expand(current: str, unit_table: dict, suffix: str) -> str:
        """Replace every number followed by a unit from ``unit_table``."""
        if not unit_table:
            return current

        # Longest symbols first so 'mm' is tried before 'm'.
        ordered = sorted(unit_table, key=len, reverse=True)
        alternatives = "|".join(re.escape(unit) for unit in ordered)
        if not alternatives:
            return current
        pattern = re.compile(number_pattern_str + r"\s*(" + alternatives + r")" + suffix, re.IGNORECASE)

        # The pattern ignores case, so the lookup has to as well; otherwise
        # '10KG' would match and then fail with KeyError.
        folded_table = {unit.casefold(): spoken for unit, spoken in unit_table.items()}

        def replace(match):
            number_str, unit_symbol = match.group(1), match.group(2)
            unit_word = unit_table.get(unit_symbol)
            if unit_word is None:
                unit_word = folded_table.get(unit_symbol.casefold())
            if unit_word is None:
                return match.group(0)
            # Remove thousands separator and replace decimal separator for parsing
            number = number_str.replace(thousands_separator, "").replace(decimal_separator, ".")
            try:
                return f"{pronounce_number(float(number), full_lang)} {unit_word}"
            except Exception as e:
                LOG.error(f"Failed to pronounce number with unit: {number_str}{unit_symbol} - ({e})")
                return match.group(0)

        return pattern.sub(replace, current)

    # Separate units into symbolic and alphanumeric
    symbolic_units = {k: v for k, v in UNITS[lang_code].items() if not k.isalnum()}
    alphanumeric_units = {k: v for k, v in UNITS[lang_code].items() if k.isalnum()}

    text = expand(text, symbolic_units, "")
    text = expand(text, alphanumeric_units, r"\b")
    return text
|
599
|
+
|
600
|
+
|
601
|
+
def _normalize_word(word: str, full_lang: str, rbnf_engine) -> str:
    """
    Normalize one whitespace-delimited token.

    Lookup order:
      1. ``CONTRACTIONS`` then ``TITLES`` for the primary language subtag,
         first with the token exactly as written, then with trailing
         punctuation peeled off one character at a time ("don't," ->
         "do not,", "Dr.," -> "Doctor,"). Peeled punctuation is re-appended.
      2. Otherwise the token is handed to ``_normalize_number_word``.

    Args:
        word: the raw token.
        full_lang: language tag (e.g. "en-US").
        rbnf_engine: optional RBNF engine forwarded to the number helper.

    Returns:
        The expanded token, or ``word`` unchanged if nothing applied.
    """
    lang_code = full_lang.split("-")[0]
    contractions = CONTRACTIONS.get(lang_code, {})
    titles = TITLES.get(lang_code, {})

    # The first pass (cut == len(word)) is the exact-match lookup; later
    # passes drop one trailing punctuation character each. Exact matches are
    # tried first because many keys themselves end in punctuation ("Dr.").
    cut = len(word)
    while cut > 0:
        core, tail = word[:cut], word[cut:]
        if core in contractions:
            return contractions[core] + tail
        if core in titles:
            return titles[core] + tail
        if word[cut - 1] not in string.punctuation:
            break
        cut -= 1

    # Delegate number parsing to the number helper; it returns the word
    # unchanged when the token is not numeric.
    return _normalize_number_word(word, full_lang, rbnf_engine)
|
619
|
+
|
620
|
+
|
621
|
+
def is_fraction(word: str) -> bool:
    """Return True when ``word`` is '<digits>/<digits>' with exactly one slash."""
    numerator, slash, denominator = word.partition("/")
    # A second slash ends up in ``denominator`` and makes isdigit() fail.
    return slash == "/" and numerator.isdigit() and denominator.isdigit()
|
629
|
+
|
630
|
+
|
631
|
+
def normalize(text: str, lang: str) -> str:
    """
    Prepare ``text`` for speech synthesis in language ``lang``.

    The pipeline runs four passes in order: dates and times, "word-digit"
    hyphen splitting, number+unit expansion, and finally a token-by-token
    pass that expands contractions, titles, numbers and fractions. Tokens are
    produced by ``str.split()`` and re-joined with single spaces, so runs of
    whitespace in the input collapse to one space.

    Args:
        text: the sentence to normalize.
        lang: language tag such as "en-US" or "pt".

    Returns:
        The normalized text.
    """
    full_lang = lang
    lang_code = full_lang.split("-")[0]

    # Pass 1: dates and times. Only the exact tag "en-us" (any case) reads
    # ambiguous dates as month/day; everything else uses day/month.
    if full_lang.lower() == "en-us":
        date_format = "MDY"
    else:
        date_format = "DMY"
    staged = _normalize_dates_and_times(text, full_lang, date_format)

    # Pass 2: 'sub-23' -> 'sub 23'.
    staged = _normalize_word_hyphen_digit(staged)

    # Pass 3: numbers with attached units.
    staged = _normalize_units(staged, full_lang)

    # Pass 4: token-level normalization, with an optional RBNF engine that
    # is simply absent when the language is not supported.
    tokens = staged.split()
    try:
        rbnf_engine = RbnfEngine.for_language(lang_code)
    except (ValueError, KeyError) as e:
        rbnf_engine = None
        LOG.debug(f"RBNF engine not available for language '{lang_code}': {e}")

    return " ".join(_normalize_word(token, full_lang, rbnf_engine) for token in tokens)
|
662
|
+
|
663
|
+
|
664
|
+
# Manual smoke test: running this module as a script prints normalized
# versions of a set of sample sentences. Nothing is asserted; the output is
# meant to be inspected by eye.
if __name__ == "__main__":
    # --- Example usage for demonstration purposes ---

    # General normalization examples: contraction, titles, fraction,
    # percentage, currency and units in a single sentence.
    print("General English example: " + normalize('I\'m Dr. Prof. 3/3 0.5% of 12345€, 5ft, and 10kg', 'en'))
    print(f"General Portuguese example: {normalize('Dr. Prof. 3/3 0.5% de 12345€, 5m, e 10kg', 'pt')}")

    # Portuguese examples with comma decimal separator
    print("\n--- Portuguese Decimal Separator Examples ---")
    print(
        f"Original: 'A coima aplicada é de 1,2 milhões de euros.' Normalized: '{normalize('A coima aplicada é de 1,2 milhões de euros.', 'pt')}'")
    print(
        f"Original: 'Agora, tem 1,88 metros e muito para contar.' Normalized: '{normalize('Agora, tem 1,88 metros e muito para contar.', 'pt')}'")
    print(
        f"Original: 'Ainda temos 1,7 milhões de pobres!' Normalized: '{normalize('Ainda temos 1,7 milhões de pobres!', 'pt')}'")
    # Thousands separator and decimal separator in the same number.
    print(f"Original: 'O lucro foi de 123.456,78€.' Normalized: '{normalize('O lucro foi de 123.456,78€.', 'pt')}'")
    # Same sentence with a regional tag; only the primary subtag selects separators.
    print(f"Normalized: '{normalize('O lucro foi de 123.456,78€.', 'pt-PT')}'")

    # English dates and times ('en-US' makes normalize() pass date_format="MDY")
    print("\n--- English Date & Time Examples ---")
    print(f"English date (MDY format): {normalize('The date is 08/03/2025', 'en-US')}")
    # 15 is greater than 12, so it is taken as the day even under MDY.
    print(f"English ambiguous date (MDY assumed): {normalize('The report is due 15/05/2025', 'en-US')}")
    print(f"English date with dashes: {normalize('The event is on 11-04-2025', 'en-US')}")
    print(f"English AM/PM time: {normalize('The meeting is at 10am', 'en-US')}")
    print(f"English military time: {normalize('The party is at 19h30', 'en-US')}")
    # No numeric date here: only the bare numbers are candidates for expansion.
    print(f"English month name: {normalize('The report is due 15 May 2025', 'en-US')}")

    # Portuguese dates and times (any tag other than 'en-US' uses DMY)
    print("\n--- Portuguese Date & Time Examples ---")
    print(f"Portuguese date (DMY format): {normalize('A data é 03/08/2025', 'pt')}")
    print(f"Portuguese ambiguous date (DMY assumed): {normalize('O relatório é para 15/05/2025', 'pt')}")
    print(f"Portuguese date with dashes: {normalize('O evento é no dia 25-10-2024', 'pt')}")
    print(f"Portuguese military time: {normalize('O encontro é às 14h30', 'pt')}")

    # Other examples
    print(f"\n--- Other Examples ---")
    print(f"English fraction: {normalize('The fraction is 1/2', 'en')}")
    print(f"English plural fraction: {normalize('There are 3/4 of a cup', 'en')}")
    # NOTE(review): the sentence is English but the language is 'es'; this
    # exercises the Spanish unit table only.
    print(f"Spanish example with units: {normalize('The temperature is 25ºC', 'es')}")
    print(f"Portuguese with punctuation: {normalize('12345€, 5m e 10kg', 'pt')}")
    print(
        f"Portuguese word-digit: {normalize('Esta temporada leva oito jogos ao serviço da equipa sub-23 leonina.', 'pt')}")
|