phoonnx 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. phoonnx/__init__.py +0 -0
  2. phoonnx/config.py +490 -0
  3. phoonnx/locale/ca/phonetic_spellings.txt +2 -0
  4. phoonnx/locale/en/phonetic_spellings.txt +1 -0
  5. phoonnx/locale/gl/phonetic_spellings.txt +2 -0
  6. phoonnx/locale/pt/phonetic_spellings.txt +2 -0
  7. phoonnx/phoneme_ids.py +453 -0
  8. phoonnx/phonemizers/__init__.py +45 -0
  9. phoonnx/phonemizers/ar.py +42 -0
  10. phoonnx/phonemizers/base.py +216 -0
  11. phoonnx/phonemizers/en.py +250 -0
  12. phoonnx/phonemizers/fa.py +46 -0
  13. phoonnx/phonemizers/gl.py +142 -0
  14. phoonnx/phonemizers/he.py +67 -0
  15. phoonnx/phonemizers/ja.py +119 -0
  16. phoonnx/phonemizers/ko.py +97 -0
  17. phoonnx/phonemizers/mul.py +606 -0
  18. phoonnx/phonemizers/vi.py +44 -0
  19. phoonnx/phonemizers/zh.py +308 -0
  20. phoonnx/thirdparty/__init__.py +0 -0
  21. phoonnx/thirdparty/arpa2ipa.py +249 -0
  22. phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
  23. phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
  24. phoonnx/thirdparty/hangul2ipa.py +783 -0
  25. phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
  26. phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
  27. phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
  28. phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
  29. phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
  30. phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
  31. phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
  32. phoonnx/thirdparty/ko_tables/yale.csv +22 -0
  33. phoonnx/thirdparty/kog2p/__init__.py +385 -0
  34. phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
  35. phoonnx/thirdparty/mantoq/__init__.py +67 -0
  36. phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
  37. phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
  38. phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
  39. phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
  40. phoonnx/thirdparty/mantoq/num2words.py +37 -0
  41. phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
  42. phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
  43. phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
  44. phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
  45. phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
  46. phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
  47. phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
  48. phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
  49. phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
  50. phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
  51. phoonnx/thirdparty/tashkeel/LICENSE +22 -0
  52. phoonnx/thirdparty/tashkeel/SOURCE +1 -0
  53. phoonnx/thirdparty/tashkeel/__init__.py +212 -0
  54. phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
  55. phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
  56. phoonnx/thirdparty/tashkeel/model.onnx +0 -0
  57. phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
  58. phoonnx/thirdparty/zh_num.py +238 -0
  59. phoonnx/util.py +705 -0
  60. phoonnx/version.py +6 -0
  61. phoonnx/voice.py +521 -0
  62. phoonnx-0.0.0.dist-info/METADATA +255 -0
  63. phoonnx-0.0.0.dist-info/RECORD +86 -0
  64. phoonnx-0.0.0.dist-info/WHEEL +5 -0
  65. phoonnx-0.0.0.dist-info/top_level.txt +2 -0
  66. phoonnx_train/__main__.py +151 -0
  67. phoonnx_train/export_onnx.py +109 -0
  68. phoonnx_train/norm_audio/__init__.py +92 -0
  69. phoonnx_train/norm_audio/trim.py +54 -0
  70. phoonnx_train/norm_audio/vad.py +54 -0
  71. phoonnx_train/preprocess.py +420 -0
  72. phoonnx_train/vits/__init__.py +0 -0
  73. phoonnx_train/vits/attentions.py +427 -0
  74. phoonnx_train/vits/commons.py +147 -0
  75. phoonnx_train/vits/config.py +330 -0
  76. phoonnx_train/vits/dataset.py +214 -0
  77. phoonnx_train/vits/lightning.py +352 -0
  78. phoonnx_train/vits/losses.py +58 -0
  79. phoonnx_train/vits/mel_processing.py +139 -0
  80. phoonnx_train/vits/models.py +732 -0
  81. phoonnx_train/vits/modules.py +527 -0
  82. phoonnx_train/vits/monotonic_align/__init__.py +20 -0
  83. phoonnx_train/vits/monotonic_align/setup.py +13 -0
  84. phoonnx_train/vits/transforms.py +212 -0
  85. phoonnx_train/vits/utils.py +16 -0
  86. phoonnx_train/vits/wavfile.py +860 -0
phoonnx/util.py ADDED
@@ -0,0 +1,705 @@
1
+ import datetime
2
+ import logging
3
+ import re
4
+ import string
5
+ from datetime import date
6
+
7
+ from ovos_date_parser import nice_time, nice_date
8
+ from ovos_number_parser import pronounce_number, is_fractional, pronounce_fraction
9
+ from ovos_number_parser.util import is_numeric
10
+ from unicode_rbnf import RbnfEngine, FormatPurpose
11
+
12
+ LOG = logging.getLogger("normalize")
13
+
14
# A dictionary of common contractions and their expanded forms.
# This list is very comprehensive for English.
# Keyed by bare language code (no region); `_normalize_word` does exact,
# case-sensitive whole-token lookups against this table.
CONTRACTIONS = {
    "en": {
        "I'd": "I would",
        "I'll": "I will",
        "I'm": "I am",
        "I've": "I have",
        # NOTE(review): "ain't" is context-dependent ("am not"/"are not"/"is not");
        # a single expansion is a deliberate simplification.
        "ain't": "is not",
        "aren't": "are not",
        "can't": "can not",
        "could've": "could have",
        "couldn't": "could not",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "gonna": "going to",
        "gotta": "got to",
        "hadn't": "had not",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'll": "he will",
        "he's": "he is",
        "how'd": "how did",
        "how'll": "how will",
        "how's": "how is",
        "isn't": "is not",
        "it'd": "it would",
        "it'll": "it will",
        "it's": "it is",
        "might've": "might have",
        "mightn't": "might not",
        "must've": "must have",
        "mustn't": "must not",
        "needn't": "need not",
        "oughtn't": "ought not",
        "shan't": "shall not",
        "she'd": "she would",
        "she'll": "she will",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "somebody's": "somebody is",
        "someone'd": "someone would",
        "someone'll": "someone will",
        "someone's": "someone is",
        "that'd": "that would",
        "that'll": "that will",
        "that's": "that is",
        "there'd": "there would",
        "there're": "there are",
        "there's": "there is",
        "they'd": "they would",
        "they'll": "they will",
        "they're": "they are",
        "they've": "they have",
        "wasn't": "was not",
        "we'd": "we would",
        "we'll": "we will",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'd": "what did",
        "what'll": "what will",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "whats": "what is",
        "when'd": "when did",
        "when's": "when is",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'd": "who would",
        "who'd've": "who would have",
        "who'll": "who will",
        "who're": "who are",
        "who's": "who is",
        "who've": "who have",
        "why'd": "why did",
        "why're": "why are",
        "why's": "why is",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'ain't": "you are not",
        "y'aint": "you are not",
        "y'all": "you all",
        "ya'll": "you all",
        "you'd": "you would",
        "you'd've": "you would have",
        "you'll": "you will",
        "you're": "you are",
        "you've": "you have",
        "I'm'a": "I am going to",
        "I'm'o": "I am going to",
        "I'll've": "I will have",
        "I'd've": "I would have",
        "Whatcha": "What are you",
        "amn't": "am not",
        "'cause": "because",
        "can't've": "cannot have",
        "couldn't've": "could not have",
        "daren't": "dare not",
        "daresn't": "dare not",
        "dasn't": "dare not",
        "everyone's": "everyone is",
        "gimme": "give me",
        "gon't": "go not",
        "hadn't've": "had not have",
        "he've": "he would have",
        "he'll've": "he will have",
        "he'd've": "he would have",
        "here's": "here is",
        "how're": "how are",
        "how'd'y": "how do you do",
        "howd'y": "how do you do",
        "howdy": "how do you do",
        "'tis": "it is",
        "'twas": "it was",
        "it'll've": "it will have",
        "it'd've": "it would have",
        "kinda": "kind of",
        "let's": "let us",
        "ma'am": "madam",
        "may've": "may have",
        "mayn't": "may not",
        "mightn't've": "might not have",
        "mustn't've": "must not have",
        "needn't've": "need not have",
        "ol'": "old",
        "oughtn't've": "ought not have",
        "sha'n't": "shall not",
        # NOTE(review): duplicate key — "shan't" already appears earlier in this
        # literal with the same value, so the repeat is harmless but redundant.
        "shan't": "shall not",
        "shalln't": "shall not",
        "shan't've": "shall not have",
        "she'd've": "she would have",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so is",
        "something's": "something is",
        "that're": "that are",
        "that'd've": "that would have",
        "there'll": "there will",
        "there'd've": "there would have",
        "these're": "these are",
        "they'll've": "they will have",
        "they'd've": "they would have",
        "this's": "this is",
        "this'll": "this will",
        "this'd": "this would",
        "those're": "those are",
        "to've": "to have",
        "wanna": "want to",
        "we'll've": "we will have",
        "we'd've": "we would have",
        "what'll've": "what will have",
        "when've": "when have",
        "where're": "where are",
        "which's": "which is",
        "who'll've": "who will have",
        "why've": "why have",
        "will've": "will have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "you'll've": "you will have"
    }
}
187
+
188
# Dictionaries for titles, units, and their full word equivalents.
# Keyed by bare language code; `_normalize_word` expands an exact token match
# (including the trailing period) into the spoken form.
TITLES = {
    "en": {
        "Dr.": "Doctor",
        "Mr.": "Mister",
        "Prof.": "Professor"
    },
    "ca": {  # Catalan
        "Dr.": "Doctor",
        "Sr.": "Senyor",
        "Sra.": "Senyora",
        "Prof.": "Professor"
    },
    "es": {  # Spanish
        "Dr.": "Doctor",
        "Sr.": "Señor",
        "Sra.": "Señora",
        "Prof.": "Profesor",
        "D.": "Don",
        "Dña.": "Doña"
    },
    "pt": {  # Portuguese
        "Dr.": "Doutor",
        "Sr.": "Senhor",
        "Sra.": "Senhora",
        "Prof.": "Professor",
        "Drª.": "Doutora",
        "Eng.": "Engenheiro",
        "D.": "Dom",
        "Dª": "Dona"
    },
    "gl": {  # Galician
        "Dr.": "Doutor",
        "Sr.": "Señor",
        "Sra.": "Señora",
        "Prof.": "Profesor",
        "Srta.": "Señorita"
    },
    "fr": {  # French
        "Dr.": "Docteur",
        "M.": "Monsieur",
        "Mme": "Madame",
        "Mlle": "Mademoiselle",
        "Prof.": "Professeur",
        "Pr.": "Professeur"
    },
    "it": {  # Italian
        "Dr.": "Dottore",
        "Sig.": "Signore",
        "Sig.ra": "Signora",
        "Prof.": "Professore",
        "Dott.ssa": "Dottoressa",
        "Sig.na": "Signorina"
    },
    "nl": {  # Dutch
        "Dr.": "Dokter",
        "Dhr.": "De Heer",
        "Mevr.": "Mevrouw",
        "Prof.": "Professor",
        "Drs.": "Dokterandus",
        "Ing.": "Ingenieur"
    },
    "de": {  # German
        "Dr.": "Doktor",
        "Prof.": "Professor"
    }
}
255
+
256
# Unit symbols and their spoken equivalents, keyed by bare language code.
# Consumed by `_normalize_units`, which matches these after a number
# (case-insensitively).
# NOTE(review): very short alphanumeric keys ("m", "s", "in", "pt") can match
# ordinary words that follow a number — see the caveat in `_normalize_units`.
UNITS = {
    "en": {
        "€": "euros",
        "%": "per cent",
        "ºC": "degrees celsius",
        "ºF": "degrees fahrenheit",
        "ºK": "degrees kelvin",
        "°": "degrees",
        "$": "dollars",
        "£": "pounds",
        "km": "kilometers",
        "m": "meters",
        "cm": "centimeters",
        "mm": "millimeters",
        "ft": "feet",
        "in": "inches",
        "yd": "yards",
        "mi": "miles",
        "kg": "kilograms",
        "g": "grams",
        "lb": "pounds",
        "oz": "ounces",
        "L": "liters",
        "mL": "milliliters",
        "gal": "gallons",
        "qt": "quarts",
        "pt": "pints",
        "hr": "hours",
        "min": "minutes",
        "s": "seconds"
    },
    "pt": {
        "€": "euros",
        "%": "por cento",
        "ºC": "graus celsius",
        "ºF": "graus fahrenheit",
        "ºK": "graus kelvin",
        "°": "graus",
        "$": "dólares",
        "£": "libras",
        "km": "quilômetros",
        "m": "metros",
        "cm": "centímetros",
        "mm": "milímetros",
        "kg": "quilogramas",
        "g": "gramas",
        "L": "litros",
        "mL": "mililitros",
        "h": "horas",
        "min": "minutos",
        "s": "segundos"
    },
    "es": {
        "€": "euros",
        "%": "por ciento",
        "ºC": "grados celsius",
        "ºF": "grados fahrenheit",
        "ºK": "grados kelvin",
        "°": "grados",
        "$": "dólares",
        "£": "libras",
        "km": "kilómetros",
        "m": "metros",
        "cm": "centímetros",
        "kg": "kilogramos",
        "g": "gramos",
        "L": "litros",
        "mL": "millilitros"
    },
    "fr": {
        "€": "euros",
        "%": "pour cent",
        "ºC": "degrés celsius",
        "ºF": "degrés fahrenheit",
        "ºK": "degrés kelvin",
        "°": "degrés",
        "$": "dollars",
        "£": "livres",
        "km": "kilomètres",
        "m": "mètres",
        "cm": "centimètres",
        "kg": "kilogrammes",
        "g": "grammes",
        "L": "litres",
        "mL": "millilitres"
    },
    "de": {
        "€": "Euro",
        "%": "Prozent",
        "ºC": "Grad Celsius",
        "ºF": "Grad Fahrenheit",
        "ºK": "Grad Kelvin",
        "°": "Grad",
        "$": "Dollar",
        "£": "Pfund",
        "km": "Kilometer",
        "m": "Meter",
        "cm": "Zentimeter",
        "kg": "Kilogramm",
        "g": "Gramm",
        "L": "Liter",
        "mL": "Milliliter"
    }
}
360
+
361
+
362
+ def _get_number_separators(full_lang: str) -> tuple[str, str]:
363
+ """
364
+ Determines decimal and thousands separators based on language.
365
+ Defaults to '.' decimal and ',' thousands for most languages.
366
+ Special cases:
367
+ - 'pt', 'es', 'fr', 'de': ',' decimal and '.' thousands.
368
+ """
369
+ lang_code = full_lang.split("-")[0]
370
+ decimal_separator = '.'
371
+ thousands_separator = ','
372
+ if lang_code in ["pt", "es", "fr", "de"]:
373
+ decimal_separator = ','
374
+ thousands_separator = '.'
375
+ return decimal_separator, thousands_separator
376
+
377
+
378
def _normalize_number_word(word: str, full_lang: str, rbnf_engine) -> str:
    """
    Helper function to normalize a single word that is a number, handling
    decimal and thousands separators based on locale.

    Fractions ('3/3') go through pronounce_fraction; plain numbers through
    pronounce_number; digit-only words fall back to the RBNF engine when the
    number parser cannot handle them. Trailing punctuation stripped for the
    check is re-appended to the pronounced result. On any failure the word
    is returned unchanged.

    :param word: the token to normalize (may carry trailing punctuation)
    :param full_lang: full language code, e.g. "pt-PT"
    :param rbnf_engine: an RbnfEngine instance or None when unavailable
    :return: the pronounced word, or *word* unchanged
    """
    cleaned_word = word.rstrip(string.punctuation)

    # Handle fractions like '3/3'
    if is_fraction(cleaned_word):
        try:
            return pronounce_fraction(cleaned_word, full_lang) + word[len(cleaned_word):]
        except Exception as e:
            LOG.error(f"ovos-number-parser failed to pronounce fraction: {word} - ({e})")
            return word

    # Handle numbers with locale-specific separators
    decimal_separator, thousands_separator = _get_number_separators(full_lang)
    temp_cleaned_word = cleaned_word

    # Check if the word contains a thousands separator followed by digits and a decimal separator
    # This is a specific check for formats like '123.456,78'
    has_thousands_and_decimal = (
        thousands_separator in temp_cleaned_word and
        decimal_separator in temp_cleaned_word and
        temp_cleaned_word.index(thousands_separator) < temp_cleaned_word.index(decimal_separator)
    )

    if has_thousands_and_decimal:
        # Strip all thousands separators, then canonicalize the decimal point.
        temp_cleaned_word = temp_cleaned_word.replace(thousands_separator, "")
        temp_cleaned_word = temp_cleaned_word.replace(decimal_separator, ".")
    elif decimal_separator in temp_cleaned_word and is_numeric(temp_cleaned_word.replace(decimal_separator, ".", 1)):
        # Handle cases like '1,2' -> '1.2'
        temp_cleaned_word = temp_cleaned_word.replace(decimal_separator, ".")
    elif thousands_separator in temp_cleaned_word and is_numeric(temp_cleaned_word.replace(thousands_separator, "", 1)):
        # Handle cases like '1.234' -> '1234'
        temp_cleaned_word = temp_cleaned_word.replace(thousands_separator, "")

    # Check if the word is a valid number after processing
    if is_numeric(temp_cleaned_word):
        try:
            # Preserve int-ness so pronounce_number does not read "5" as "5.0".
            num = float(temp_cleaned_word) if "." in temp_cleaned_word else int(temp_cleaned_word)
            return pronounce_number(num, lang=full_lang) + word[len(cleaned_word):]
        except Exception as e:
            LOG.error(f"ovos-number-parser failed to pronounce number: {word} - ({e})")
            return word

    elif rbnf_engine and cleaned_word.isdigit():
        # Fallback: Unicode RBNF cardinal rules for digit-only tokens.
        try:
            pronounced_number = rbnf_engine.format_number(cleaned_word, FormatPurpose.CARDINAL).text
            return pronounced_number + word[len(cleaned_word):]
        except Exception as e:
            LOG.error(f"unicode-rbnf failed to pronounce number: {word} - ({e})")
            return word

    return word
433
+
434
+
435
# --- Date and Time Pronunciation ---
def pronounce_date(date_obj: date, full_lang: str) -> str:
    """Spell out *date_obj* in words using ovos-date-parser's nice_date."""
    return nice_date(date_obj, full_lang)
441
+
442
+
443
def pronounce_time(time_string: str, full_lang: str) -> str:
    """
    Pronounce a military-style time string such as "15h01".

    The string is split on the "h" separator, converted to a
    datetime.time object and passed to ovos-date-parser's nice_time.
    On any parse failure the separator is replaced by a space as a
    degraded fallback.

    :param time_string: time like "15h01"; the separator is accepted in
        either case because callers match it case-insensitively
    :param full_lang: full language code, e.g. "pt-PT"
    :return: the spoken form, or the fallback string on failure
    """
    try:
        # FIX: the original split only on lowercase "h", so "19H30"
        # (matched case-insensitively upstream) always hit the fallback.
        hours, mins = re.split("h", time_string, flags=re.IGNORECASE)
        time_obj = datetime.time(int(hours), int(mins))
        # Use nice_time from ovos-date-parser
        return nice_time(time_obj, full_lang, speech=True, use_24hour=True, use_ampm=False)
    except Exception as e:
        LOG.warning(f"Failed to parse time string '{time_string}': {e}")
        # FIX: also strip an uppercase separator in the fallback path.
        return time_string.replace("h", " ").replace("H", " ")
457
+
458
+
459
def _normalize_dates_and_times(text: str, full_lang: str, date_format: str = "DMY") -> str:
    """
    Normalize date and time substrings into speakable words.

    Handles English am/pm markers, military times ("15h01") and numeric
    dates ("DD/MM/YYYY", "YYYY-MM-DD", ...). Only the first date found in
    *text* is replaced.

    :param text: input text possibly containing dates and times
    :param full_lang: full language code, e.g. "en-US"
    :param date_format: "MDY" or "DMY"; used only when the date fields are
        ambiguous
    :return: text with dates and times expanded into words
    """
    lang_code = full_lang.split("-")[0]
    # Pre-process English am/pm times for TTS ("10am" -> "10 A M").
    if lang_code == "en":
        # FIX: the original did text.replace("am", "A M") on the whole
        # string, which corrupted ordinary words ("camera" -> "cA Mera",
        # "I am" -> "I A M"). The replacement is now anchored to a
        # preceding number and word boundaries.
        def _expand_ampm(match):
            suffix = "A M" if match.group(2).lower() == "am" else "P M"
            return f"{match.group(1)} {suffix}"

        text = re.sub(r"(?i)\b(\d+)\s*(am|pm)\b", _expand_ampm, text)

    # Normalize times like "15h01" to words
    time_pattern = re.compile(r"(\d{1,2})h(\d{2})", re.IGNORECASE)
    text = time_pattern.sub(lambda m: pronounce_time(m.group(0), full_lang), text)

    # Find dates like "DD/MM/YYYY" or "YYYY/MM/DD"
    date_pattern = re.compile(r"(\d{1,4})[/-](\d{1,2})[/-](\d{1,4})")
    match = date_pattern.search(text)
    if not match:
        return text

    part1_str, part2_str, part3_str = match.groups()
    p1, p2, p3 = int(part1_str), int(part2_str), int(part3_str)
    month, day, year = None, None, None

    # A 4-digit field is unambiguously the year.
    if len(part1_str) == 4:
        year, rest_parts = p1, [p2, p3]
    elif len(part3_str) == 4:
        year, rest_parts = p3, [p1, p2]
    else:
        # No 4-digit year: assume the last field is a 2-digit year.
        year = p3
        if year < 100:
            # Assume years 00-29 are 2000-2029, 30-99 are 1930-1999
            year = 2000 + year if year < 30 else 1900 + year
        rest_parts = [p1, p2]

    # A two-digit value greater than 12 can only be the day.
    if any(p > 12 and len(str(p)) == 2 for p in rest_parts):
        day_candidate = next((p for p in rest_parts if p > 12), None)
        if day_candidate:
            day = day_candidate
            rest_parts.remove(day_candidate)
            month = rest_parts[0]

    # Still ambiguous: fall back to the caller-provided field order.
    if day is None or month is None:
        if date_format.lower() == "mdy":
            month, day = rest_parts[0], rest_parts[1]
        else:  # default to DMY
            day, month = rest_parts[0], rest_parts[1]

    try:
        date_obj = date(year, month, day)
        text = text.replace(match.group(0), pronounce_date(date_obj, full_lang))
    except (ValueError, IndexError) as e:
        LOG.warning(f"Could not parse date from '{match.group(0)}': {e}")

    return text
532
+
533
+
534
+ def _normalize_word_hyphen_digit(text: str) -> str:
535
+ """
536
+ Helper function to normalize words attached to digits with a hyphen,
537
+ such as 'sub-23' -> 'sub 23'.
538
+ """
539
+ # Regex to find a word (\w+) followed by a hyphen and a digit (\d+)
540
+ pattern = re.compile(r"(\w+)-(\d+)")
541
+ text = pattern.sub(r"\1 \2", text)
542
+ return text
543
+
544
+
545
def _normalize_units(text: str, full_lang: str) -> str:
    """
    Expand units attached to numbers ("10kg" -> "ten kilograms").

    Symbolic units (€, %, °, ...) are replaced first without word
    boundaries; alphanumeric units (km, kg, ...) are replaced second with
    a trailing word boundary, so "10kgs" is not half-matched.

    :param text: input text
    :param full_lang: full language code, e.g. "pt-PT"
    :return: text with number+unit pairs spelled out; unchanged when the
        language has no UNITS table
    """
    lang_code = full_lang.split("-")[0]
    if lang_code not in UNITS:
        return text

    # Determine number separators for the language
    decimal_separator, thousands_separator = _get_number_separators(full_lang)
    # Number with optional thousands/decimal separators, e.g. "123.456,78"
    number_pattern_str = rf"(\d+[{re.escape(thousands_separator)}]?\d*[{re.escape(decimal_separator)}]?\d*)"

    # Separate units into symbolic and alphanumeric
    symbolic_units = {k: v for k, v in UNITS[lang_code].items() if not k.isalnum()}
    alphanumeric_units = {k: v for k, v in UNITS[lang_code].items() if k.isalnum()}

    def _speak(number_str, unit_word, original):
        # Strip thousands separators and canonicalize the decimal point
        # before handing the value to pronounce_number.
        number = number_str.replace(thousands_separator, "").replace(decimal_separator, ".")
        try:
            return f"{pronounce_number(float(number), full_lang)} {unit_word}"
        except Exception as e:
            # FIX: the original alphanumeric branch had no error handling,
            # so a pronounce_number failure raised instead of degrading.
            LOG.error(f"Failed to pronounce number with unit: {original} - ({e})")
            return original

    # Longest-first alternation so "mL" wins over "m", "ºC" over "°", etc.
    sorted_symbolic = sorted(symbolic_units.keys(), key=len, reverse=True)
    symbolic_pattern_str = "|".join(re.escape(unit) for unit in sorted_symbolic)
    if symbolic_pattern_str:
        symbolic_pattern = re.compile(number_pattern_str + r"\s*(" + symbolic_pattern_str + r")", re.IGNORECASE)
        text = symbolic_pattern.sub(
            lambda m: _speak(m.group(1), symbolic_units[m.group(2)], m.group(0)), text)

    sorted_alphanumeric = sorted(alphanumeric_units.keys(), key=len, reverse=True)
    alphanumeric_pattern_str = "|".join(re.escape(unit) for unit in sorted_alphanumeric)
    if alphanumeric_pattern_str:
        # NOTE(review): very short units like "in", "m" or "s" can match
        # ordinary words after a number ("5 in the morning"); the \b alone
        # does not prevent that.
        alphanumeric_pattern = re.compile(number_pattern_str + r"\s*(" + alphanumeric_pattern_str + r")\b",
                                          re.IGNORECASE)
        text = alphanumeric_pattern.sub(
            lambda m: _speak(m.group(1), alphanumeric_units[m.group(2)], m.group(0)), text)

    return text
599
+
600
+
601
def _normalize_word(word: str, full_lang: str, rbnf_engine) -> str:
    """
    Normalize a single token: expand a contraction or title, or pronounce
    it as a number; otherwise return it unchanged.

    :param word: the token to normalize
    :param full_lang: full language code, e.g. "en-US"
    :param rbnf_engine: RbnfEngine instance or None
    :return: the expanded/pronounced token, or *word* unchanged
    """
    lang_code = full_lang.split("-")[0]

    contractions = CONTRACTIONS.get(lang_code, {})
    if word in contractions:
        return contractions[word]

    titles = TITLES.get(lang_code, {})
    if word in titles:
        return titles[word]

    # Number handling returns the word untouched when it is not numeric,
    # so the result can be returned directly.
    return _normalize_number_word(word, full_lang, rbnf_engine)
619
+
620
+
621
def is_fraction(word: str) -> bool:
    """Return True when *word* is a simple digit fraction such as '3/3'."""
    parts = word.split("/")
    return len(parts) == 2 and all(part.isdigit() for part in parts)
629
+
630
+
631
def normalize(text: str, lang: str) -> str:
    """
    Normalize *text* for speech synthesis.

    Expands contractions and titles, and pronounces numbers, units,
    fractions, dates and times in the given language.

    :param text: raw input text
    :param lang: full language code, e.g. "en-US"
    :return: the normalized text
    """
    lang_code = lang.split("-")[0]

    # Step 1: dates and times (US English reads dates month-first).
    fmt = "MDY" if lang.lower() == "en-us" else "DMY"
    result = _normalize_dates_and_times(text, lang, fmt)

    # Step 2: split word-digit hyphens like "sub-23".
    result = _normalize_word_hyphen_digit(result)

    # Step 3: expand number+unit pairs like "10kg".
    result = _normalize_units(result, lang)

    # Step 4: word-by-word pass (contractions, titles, bare numbers).
    try:
        rbnf_engine = RbnfEngine.for_language(lang_code)
    except (ValueError, KeyError) as e:
        LOG.debug(f"RBNF engine not available for language '{lang_code}': {e}")
        rbnf_engine = None

    return " ".join(_normalize_word(token, lang, rbnf_engine)
                    for token in result.split())
662
+
663
+
664
if __name__ == "__main__":
    # --- Example usage for demonstration purposes ---
    # Each print exercises one normalization path; none of this runs on import.

    # General normalization examples
    print("General English example: " + normalize('I\'m Dr. Prof. 3/3 0.5% of 12345€, 5ft, and 10kg', 'en'))
    print(f"General Portuguese example: {normalize('Dr. Prof. 3/3 0.5% de 12345€, 5m, e 10kg', 'pt')}")

    # Portuguese examples with comma decimal separator
    print("\n--- Portuguese Decimal Separator Examples ---")
    print(
        f"Original: 'A coima aplicada é de 1,2 milhões de euros.' Normalized: '{normalize('A coima aplicada é de 1,2 milhões de euros.', 'pt')}'")
    print(
        f"Original: 'Agora, tem 1,88 metros e muito para contar.' Normalized: '{normalize('Agora, tem 1,88 metros e muito para contar.', 'pt')}'")
    print(
        f"Original: 'Ainda temos 1,7 milhões de pobres!' Normalized: '{normalize('Ainda temos 1,7 milhões de pobres!', 'pt')}'")
    print(f"Original: 'O lucro foi de 123.456,78€.' Normalized: '{normalize('O lucro foi de 123.456,78€.', 'pt')}'")
    print(f"Normalized: '{normalize('O lucro foi de 123.456,78€.', 'pt-PT')}'")

    # English dates and times
    print("\n--- English Date & Time Examples ---")
    print(f"English date (MDY format): {normalize('The date is 08/03/2025', 'en-US')}")
    print(f"English ambiguous date (MDY assumed): {normalize('The report is due 15/05/2025', 'en-US')}")
    print(f"English date with dashes: {normalize('The event is on 11-04-2025', 'en-US')}")
    print(f"English AM/PM time: {normalize('The meeting is at 10am', 'en-US')}")
    print(f"English military time: {normalize('The party is at 19h30', 'en-US')}")
    print(f"English month name: {normalize('The report is due 15 May 2025', 'en-US')}")

    # Portuguese dates and times
    print("\n--- Portuguese Date & Time Examples ---")
    print(f"Portuguese date (DMY format): {normalize('A data é 03/08/2025', 'pt')}")
    print(f"Portuguese ambiguous date (DMY assumed): {normalize('O relatório é para 15/05/2025', 'pt')}")
    print(f"Portuguese date with dashes: {normalize('O evento é no dia 25-10-2024', 'pt')}")
    print(f"Portuguese military time: {normalize('O encontro é às 14h30', 'pt')}")

    # Other examples
    print(f"\n--- Other Examples ---")
    print(f"English fraction: {normalize('The fraction is 1/2', 'en')}")
    print(f"English plural fraction: {normalize('There are 3/4 of a cup', 'en')}")
    print(f"Spanish example with units: {normalize('The temperature is 25ºC', 'es')}")
    print(f"Portuguese with punctuation: {normalize('12345€, 5m e 10kg', 'pt')}")
    print(
        f"Portuguese word-digit: {normalize('Esta temporada leva oito jogos ao serviço da equipa sub-23 leonina.', 'pt')}")