tugaphone 0.0.2__tar.gz → 0.1.0a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ Metadata-Version: 2.4
2
+ Name: tugaphone
3
+ Version: 0.1.0a1
4
+ Home-page: https://github.com/TigreGotico/tugaphone
5
+ Author: JarbasAi
6
+ Author-email: jarbasai@mailfence.com
7
+ Requires-Dist: brill-postagger
8
+ Requires-Dist: unicode-rbnf
9
+ Dynamic: author
10
+ Dynamic: author-email
11
+ Dynamic: home-page
12
+ Dynamic: requires-dist
@@ -0,0 +1,131 @@
1
+ import os
2
+ from typing import Optional
3
+
4
+ from tugaphone.lexicon import TugaLexicon
5
+ from tugaphone.pos import TugaTagger
6
+ from tugaphone.tokenizer import Sentence as Tokenizer, EuropeanPortuguese, BrazilianPortuguese
7
+
8
+
9
class TugaPhonemizer:
    """
    Dialect-aware phonemizer for Portuguese.

    Supported dialects:
    - pt-PT (Portugal)
    - pt-BR (Brazil)
    - pt-AO (Angola)
    - pt-MZ (Mozambique)
    - pt-TL (Timor-Leste)
    """
    _DIALECT_REGIONS = {
        "pt-PT": "lbx",
        "pt-BR": "rjx",
        "pt-AO": "lda",
        "pt-MZ": "mpx",
        "pt-TL": "dli",
    }

    # POS tags tried in order when the requested tag has no lexicon entry.
    _POS_FALLBACKS = ["NOUN", "PRON", "ADP", "DET", "ADJ", "VERB", "ADV", "SCONJ"]

    def __init__(self, dictionary_path: str = None,
                 postag_engine="auto",
                 postag_model="pt_core_news_lg"):
        """
        Load the regional lexicon and set up the part-of-speech tagger.

        Parameters:
            dictionary_path (str): Path to a CSV lexicon file; when omitted the
                bundled "regional_dict.csv" next to this module is used.
            postag_engine (str): Engine selector forwarded to TugaTagger
                (e.g. "auto" lets the tagger pick the best available engine).
            postag_model (str): Model identifier for engines that take one.
        """
        default_path = os.path.join(os.path.dirname(__file__), "regional_dict.csv")
        self.dictionary_path = dictionary_path or default_path
        self.lexicon = TugaLexicon(self.dictionary_path)
        self.postag = TugaTagger(postag_engine, postag_model)

    def _lang_to_region(self, lang: str) -> str:
        """
        Translate an ISO dialect code into the lexicon's internal region code.

        Parameters:
            lang (str): ISO dialect code such as "pt-PT" or "pt-BR".

        Returns:
            str: Internal region code (e.g. "lbx").

        Raises:
            ValueError: When `lang` is not a supported dialect.
        """
        region = self._DIALECT_REGIONS.get(lang)
        if region is None:
            raise ValueError(f"Unsupported dialect: {lang}")
        return region

    def _get_phones(self, word: str, lang: str, pos: str,
                    region: Optional[str] = None) -> str:
        """
        Phonemize a single word for the requested dialect.

        First tries the regional lexicon with the supplied POS tag and then a
        fixed list of fallback tags; when no entry exists, falls back to a
        rule-based IPA transcription from the tokenizer (Brazilian rules for
        pt-BR, European rules otherwise).

        Parameters:
            word (str): Word to phonemize (lowercased and stripped internally).
            lang (str): ISO dialect code used to derive the default region.
            pos (str): Preferred POS tag for the lexicon lookup.
            region (Optional[str]): Explicit region code; derived from `lang`
                when not given.

        Returns:
            str: Lexicon phoneme string when available, otherwise a tokenizer
                 IPA transcription.
        """
        region = region or self._lang_to_region(lang)
        word = word.lower().strip()

        for candidate_pos in [pos] + self._POS_FALLBACKS:
            phones = self.lexicon.get_phonemes(word, candidate_pos, region)
            if phones:
                return phones

        # No lexicon entry: synthesize IPA with the rule-based tokenizer.
        dialect = BrazilianPortuguese() if lang == "pt-BR" else EuropeanPortuguese()
        return Tokenizer(surface=word, dialect=dialect).ipa

    def phonemize_sentence(self, sentence: str, lang: str = "pt-PT") -> str:
        """
        Phonemize a whole sentence for the given dialect.

        Parameters:
            sentence (str): Input sentence.
            lang (str): ISO dialect code ("pt-PT", "pt-BR", "pt-AO", "pt-MZ",
                "pt-TL").

        Returns:
            str: Space-separated phoneme tokens; punctuation tokens are kept
                 verbatim.
        """
        pieces = []
        for tok, pos in self.postag.tag(sentence):
            if pos == "PUNCT":
                pieces.append(tok)
            else:
                pieces.append(self._get_phones(word=tok, lang=lang, pos=pos))
        return " ".join(pieces)
109
+
110
+
111
if __name__ == "__main__":
    phonemizer = TugaPhonemizer()

    demo_sentences = [
        "O gato dorme.",
        "Tu falas português muito bem.",
        "O comboio chegou à estação.",
        "A menina comeu o pão todo.",
        "Vou pôr a manteiga no frigorífico.",
        "Ele está a trabalhar no escritório.",
        "Choveu muito ontem à noite.",
        "A rapariga comprou um telemóvel novo.",
        "Vamos tomar um pequeno-almoço.",
        "O carro ficou sem gasolina."
    ]
    dialect_codes = ["pt-PT", "pt-BR", "pt-AO", "pt-MZ", "pt-TL"]

    # Print every demo sentence phonemized in each supported dialect.
    for sentence in demo_sentences:
        print(sentence)
        for code in dialect_codes:
            print(f"{code} → {phonemizer.phonemize_sentence(sentence, code)}")
        print("######")
@@ -0,0 +1,169 @@
1
+ import os
2
+ from typing import Dict, List, Tuple, Optional
3
+
4
# Typing helpers
# region -> word -> POS tag -> phoneme string
IPA_MAP = Dict[str, Dict[str, Dict[str, str]]]
# region -> word -> syllable segments.
# NOTE: the previous alias (Dict[str, List[str]]) omitted the per-word
# nesting level that _load_lang_map actually builds.
SYLLABLE_MAP = Dict[str, Dict[str, List[str]]]
7
+
8
+
9
class TugaLexicon:
    """
    Regional pronunciation lexicon for Portuguese.

    Maps words to IPA phoneme strings (per POS tag) and syllable
    segmentations, keyed by a dataset region code.

    Supports:
    - pt-PT (Portugal)
    - pt-BR (Brazil)
    - pt-AO (Angola)
    - pt-MZ (Mozambique)
    - pt-TL (Timor-Leste)
    """
    _DIALECT_REGIONS = {
        "pt-PT": "lbx",
        "pt-BR": "rjx",
        "pt-AO": "lda",
        "pt-MZ": "mpx",
        "pt-TL": "dli",
    }

    def __init__(self, dictionary_path: str = None):
        """
        Initialize the TugaLexicon and load regional phoneme and syllable data.

        Parameters:
            dictionary_path (str, optional): Path to the CSV file containing
                regional phoneme and syllable mappings. Defaults to
                "regional_dict.csv" located next to this module.
        """
        self.dictionary_path = dictionary_path or os.path.join(
            os.path.dirname(__file__), "regional_dict.csv"
        )
        # ipa: region -> word -> POS -> phonemes; syllables: region -> word -> segments
        self.ipa, self.syllables = self._load_lang_map(self.dictionary_path)

    @staticmethod
    def _load_lang_map(path: str) -> "Tuple[IPA_MAP, SYLLABLE_MAP]":
        """
        Load phoneme and syllable mappings from a CSV into region-indexed tables.

        Expected column order (header skipped): _, word, pos, _, phonemes, syllables, region.
        - `phonemes` uses `|` as separator in the file; entries are stored with `·`.
        - The whole line is lowercased, so words and regions are normalized.

        Parameters:
            path (str): Path to the CSV file.

        Returns:
            Tuple of:
            - ipa: region -> word -> POS (uppercase) -> phoneme string.
            - syllables: region -> word -> list of syllable segments.
        """
        ipa = {}
        syllables = {}

        with open(path, "r", encoding="utf-8") as f:
            for line in f.read().splitlines()[1:]:  # skip header row
                _, word, pos, _, phonemes, syl, region = line.lower().split(",", 6)
                phonemes = phonemes.replace("|", "·").strip()
                word = word.strip().lower()
                region = region.strip()
                # Accept both "ca sa" and "ca|sa" style segmentations.
                syllables.setdefault(region, {})[word] = (
                    syl.strip().replace(" ", "|").split("|")
                )
                ipa.setdefault(region, {}).setdefault(word, {})[pos.upper()] = phonemes

        return ipa, syllables

    def lang_to_region(self, lang: str) -> str:
        """
        Map an ISO Portuguese dialect code to the internal dataset region code.

        Parameters:
            lang (str): ISO dialect code (e.g., "pt-PT", "pt-BR").

        Returns:
            region (str): Corresponding dataset region code (e.g., "lbx", "rjx").

        Raises:
            ValueError: If the provided dialect code is not supported.
        """
        try:
            return self._DIALECT_REGIONS[lang]
        except KeyError as e:
            raise ValueError(f"Unsupported dialect: {lang}") from e

    def get_phonemes(self, word: str, pos: str = "NOUN", region: str = "lbx") -> Optional[str]:
        """
        Retrieve the phoneme transcription for a word in a region and POS.

        Parameters:
            word: The word to look up; entries are normalized to lowercase.
            pos: Part-of-speech tag selecting the variant (default: "NOUN").
            region: Region code identifying the dialect dataset (e.g., "lbx").

        Returns:
            The phoneme string for the word/POS in the region, or None if
            no entry exists.

        Raises:
            ValueError: If `region` is not a known dataset region.
        """
        try:
            entries = self.ipa[region]
        except KeyError as e:
            # Consistent with get_syllables/get_wordlist: unknown regions
            # raise ValueError instead of leaking a bare KeyError.
            raise ValueError(f"Unsupported dialect: {region}") from e
        return entries.get(word, {}).get(pos)

    def get_syllables(self, word: str, region: str = "lbx") -> Optional[List[str]]:
        """
        Retrieve the syllable segments for a word in the given region.

        Parameters:
            word (str): The target word, normalized to lowercase.
            region (str): Dataset region code (e.g. 'lbx', 'rjx', 'lda', 'mpx', 'dli').

        Returns:
            list[str]: Syllable strings for the word if present, otherwise
            None (matching the contract documented by `get`; previously an
            empty dict was returned, which is equally falsy).

        Raises:
            ValueError: If `region` is not a known dataset region.
        """
        try:
            return self.syllables[region].get(word)
        except KeyError as e:
            raise ValueError(f"Unsupported dialect: {region}") from e

    def get(self, word: str, pos: str = "NOUN", region: str = "lbx") -> dict:
        """
        Retrieve syllables and phonemes for a word in one call.

        Parameters:
            word (str): The lookup word (case-insensitive).
            pos (str): Part-of-speech tag selecting the phoneme variant.
            region (str): Region code to query (default: "lbx").

        Returns:
            dict: A mapping with keys:
            - "syllables": list of syllable segments, or None if not found.
            - "phonemes": phoneme transcription for the POS, or None if not found.
        """
        return {
            "syllables": self.get_syllables(word, region),
            "phonemes": self.get_phonemes(word, pos, region),
        }

    def get_wordlist(self, region: str = "lbx") -> List[str]:
        """
        Return a sorted list of words available for the given region.

        Parameters:
            region (str): Region code (e.g., "lbx") identifying the dataset.

        Returns:
            List[str]: Words available in the lexicon, sorted alphabetically.

        Raises:
            ValueError: If `region` is not a known dataset region.
        """
        try:
            return sorted(self.syllables[region].keys())
        except KeyError as e:
            raise ValueError(f"Unsupported dialect: {region}") from e
162
+
163
if __name__ == "__main__":
    lexicon = TugaLexicon()

    # Dump every word of the default region that has a phoneme entry.
    for entry in lexicon.get_wordlist():
        phonemes = lexicon.get_phonemes(entry)
        if phonemes:
            print(entry, phonemes)
@@ -0,0 +1,320 @@
1
+ import string
2
+ from typing import Optional
3
+
4
+ from unicode_rbnf import RbnfEngine, FormatPurpose
5
+
6
+
7
class NumberParser:
    """
    A utility class to convert digits into their spelled-out Portuguese equivalent.

    In Portuguese, numbers must agree in gender (masculine/feminine) and
    type (cardinal/ordinal) with the nouns they modify.
    Example: '1' can be 'um' (masc.), 'uma' (fem.), 'primeiro' (1st masc.), or 'primeira' (1st fem.).

    NOTE: RbnfEngine defaults to short-scale for pt-BR and long-scale for pt-PT.
    https://en.wikipedia.org/wiki/Long_and_short_scales
    https://pt.wikipedia.org/wiki/Escalas_curta_e_longa

    Limitations:
    - Can not set number scale independently of language
    - Can not handle very large numbers TODO: document max value
    """
    engine_pt = RbnfEngine.for_language("pt_PT")
    engine_br = RbnfEngine.for_language("pt")

    # Symbols used in PT to denote ordinals (like the English 'st', 'nd', 'rd')
    ORDINAL_MALE = "º"  # e.g., 1º (primeiro)
    ORDINAL_FEMALE = "ª"  # e.g., 1ª (primeira)
    ORDINAL_TOKENS = [ORDINAL_MALE, ORDINAL_FEMALE]

    @classmethod
    def pronounce_number_word(cls, word: str,
                              prev_word: Optional[str] = None,
                              next_word: Optional[str] = None,
                              gender: Optional[str] = None,
                              as_ordinal: Optional[bool] = None,
                              is_brazilian=False) -> str:
        """
        Convert a numeric token into its spelled-out Portuguese form using surrounding context.

        Parameters:
            word (str): Numeric string to convert (e.g., "1", "2.5", "1e9", "1º").
            prev_word (Optional[str]): Word immediately before `word`, used to infer gender.
            next_word (Optional[str]): Word immediately after `word`, used to infer ordinality and gender.
            gender (Optional[str]): Explicit gender override ("masculine" or "feminine"); if omitted a heuristic is applied.
            as_ordinal (Optional[bool]): If provided, forces ordinal (True) or cardinal (False); otherwise context decides.
            is_brazilian (bool): If True, use Brazilian Portuguese rules (pt-BR); otherwise pt-PT.

        Returns:
            str: The spelled-out form of the number in Portuguese.
        """
        # TODO: allow scale independent from language code
        # ie. enable pt-PT+short-scale and pt-BR+long-scale
        if cls.is_scientific_notation(word):
            return cls.pronounce_scientific(word, is_brazilian=is_brazilian)

        # 1. Determine if the number is an ordinal (1st, 2nd) or cardinal (1, 2)
        is_ord = cls.is_ordinal(word, next_word) if as_ordinal is None else as_ordinal

        # 2. Determine grammatical gender (numbers 1, 2, and hundreds change in PT)
        gender = gender or cls.get_number_gender(word, prev_word, next_word)
        fmt = FormatPurpose.ORDINAL if is_ord else FormatPurpose.CARDINAL

        # 3. Generate the base text using RBNF (Rule-Based Number Format).
        # Strip ordinal markers so RBNF receives a plain numeric string
        # ("1º" is not parseable as a number); ordinality is already
        # encoded in `fmt` above.
        word = word.replace(" º", "º").replace(" ª", "ª").strip()
        word = word.strip(cls.ORDINAL_MALE + cls.ORDINAL_FEMALE)
        engine = cls.engine_br if is_brazilian else cls.engine_pt
        spelled = engine.format_number(word, fmt)

        # Select the specific ruleset based on grammar results
        kind = "ordinal" if is_ord else "cardinal"
        return spelled.text_by_ruleset[f"spellout-{kind}-{gender}"]

    # digit/string conversion
    @classmethod
    def to_int(cls, word: str) -> Optional[int]:
        """
        Parse a numeric token into an integer after stripping ordinal markers and surrounding whitespace.

        Parameters:
            word (str): Input token which may contain ordinal symbols (º, ª) or surrounding whitespace.

        Returns:
            int: The parsed integer value on success.
            None: If the token contains a decimal point (treated as a non-integer)
                  or cannot be parsed as an integer after cleaning.
        """
        if "." in word:
            return None  # may be a decimal
        try:
            # Remove ordinal markers and surrounding whitespace
            word = word.strip(cls.ORDINAL_MALE +
                              cls.ORDINAL_FEMALE +
                              string.whitespace)
            return int(word)
        except (ValueError, TypeError):
            return None

    @classmethod
    def is_int(cls, word: str) -> bool:
        """
        Determine whether a token represents an integer (no decimal point).

        Parameters:
            word (str): Input token; ordinal markers (º, ª) and whitespace are ignored.

        Returns:
            bool: True if the token parses to an integer after cleaning, False otherwise.
        """
        return cls.to_int(word) is not None

    @classmethod
    def to_float(cls, word: str) -> Optional[float]:
        """
        Convert a numeric string (possibly with ordinal markers or whitespace) into a float.

        Parameters:
            word (str): The input string; may include ordinal symbols (º, ª) or whitespace.

        Returns:
            float: The parsed value on success, None if conversion fails.
        """
        try:
            # Remove ordinal markers and surrounding whitespace
            word = word.strip(cls.ORDINAL_MALE +
                              cls.ORDINAL_FEMALE +
                              string.whitespace)
            return float(word)
        except (ValueError, TypeError):
            return None

    @classmethod
    def is_float(cls, word: str) -> bool:
        """
        Determine whether a string represents a floating point number.

        NOTE: `float()` also accepts scientific notation (e.g. "1e9").
        TODO: differentiate float and decimal.

        Returns:
            bool: True if the string can be parsed as a float, False otherwise.
        """
        return cls.to_float(word) is not None

    @classmethod
    def is_scientific_notation(cls, word: str) -> bool:
        """
        Check whether a token uses scientific notation: a decimal mantissa and
        an integer exponent separated by 'e' (case-insensitive).

        Parameters:
            word (str): Token to test; the mantissa may include a decimal
                point, and the exponent may be negative.

        Returns:
            bool: True for tokens like "1.5e10" or "1e-3", False otherwise.
        """
        nums = word.lower().split("e")
        if len(nums) != 2:
            return False
        # NOTE: can't use .isdigit() in order to allow decimals and negative numbers
        return cls.is_float(nums[0]) and cls.is_int(nums[1])

    @classmethod
    def pronounce_scientific(cls, word: str, is_brazilian=False) -> str:
        """
        Convert a number in scientific notation into its Portuguese spoken form.

        Parameters:
            word (str): A numeric string in scientific notation (e.g., "1.5e10").
            is_brazilian (bool): If True, use Brazilian Portuguese variants; otherwise Portugal variants.

        Returns:
            str: The spelled-out phrase combining mantissa and exponent
                 (e.g., "um vírgula cinco vezes dez elevado a dez").

        Raises:
            ValueError: If `word` is not valid scientific notation.
        """
        if not cls.is_scientific_notation(word):
            raise ValueError(f"word is not scientific notation: '{word}'")
        a, b = word.lower().split("e")
        a_str = cls.pronounce_number_word(a, is_brazilian=is_brazilian)
        b_str = cls.pronounce_number_word(b, is_brazilian=is_brazilian)
        return f"{a_str} vezes dez elevado a {b_str}"

    # contextual rules
    @classmethod
    def is_ordinal(cls, word: str, next_word: Optional[str] = None) -> bool:
        """
        Determine whether a token represents an ordinal number.

        Parameters:
            word (str): The token to check.
            next_word (Optional[str]): The following token; used to detect a separated ordinal marker ("º", "ª").

        Returns:
            bool: True if the word contains an ordinal marker or next_word is one, False otherwise.
        """
        # Check if the symbol is a separate token or attached to the number
        if next_word in cls.ORDINAL_TOKENS:
            return True
        elif any(t in word for t in cls.ORDINAL_TOKENS):
            return True
        return False

    @classmethod
    def get_number_gender(cls, word: str,
                          prev_word: Optional[str] = None,
                          next_word: Optional[str] = None) -> str:
        """
        Determine the grammatical gender a numeric token should take in Portuguese.

        Parameters:
            word (str): The numeric token (may include ordinal symbols 'º' or 'ª').
            prev_word (Optional[str]): Preceding word, used for heuristic cues (articles).
            next_word (Optional[str]): Following word, used to infer the counted noun's gender.

        Returns:
            str: "feminine" if the number should agree as feminine, "masculine" otherwise.
        """
        # Rule A: Ordinal symbols explicitly dictate gender (º = masc, ª = fem)
        if (next_word and next_word == cls.ORDINAL_FEMALE) or cls.ORDINAL_FEMALE in word:
            return "feminine"

        # Rule B: Check preceding articles/prepositions (a, as, da, das are feminine)
        if prev_word and prev_word in ["a", "as", "da", "das"]:
            return "feminine"

        # Rule C: Check the following noun (the object being counted)
        if next_word:
            # Simple check: Words ending in 'a' are usually feminine (e.g., 'casa').
            # rstrip('s') drops a plural suffix; the previous strip('s') also
            # removed a leading 's' (e.g. "salas" -> "ala"), which was unintended.
            if next_word.rstrip("s").lower().endswith("a"):
                # 1 casa (house) -> uma casa (female)
                # 1 cão (dog) -> um cão (male)
                return "feminine"

            # Rule D: Handle tricky '-e' endings
            # Words ending in -dade, -age, or -agem are consistently feminine.
            elif next_word.rstrip("sm").lower().endswith("e"):
                # words ending with "e" may be either male, female or both
                # a wordlist is needed to be sure
                # 1 ponte (bridge) -> uma ponte (female)
                # 1 dente (tooth) -> um dente (male)
                # 1 cliente -> um(a) cliente
                female_endings = ["dade", "age", "agem"]
                # -dade (Feminine): felicidade (happiness), cidade (city), liberdade (freedom).
                # -age / -agem (Feminine): viagem (trip), coragem (courage).
                if any(next_word.endswith(f) for f in female_endings):
                    return "feminine"
        # by default numbers are male in portuguese
        return "masculine"
251
+
252
+
253
def normalize_numbers(text: str, lang: str = "pt-PT", strict=True) -> str:
    """
    Spell out numeric tokens of a sentence in contextually correct Portuguese.

    The language tag is normalized (any "pt-br" variant becomes "pt-BR"),
    detached ordinal markers are glued to their number ("1 º" -> "1º"),
    and each integer, float or scientific-notation token is replaced by its
    written form. All other tokens pass through untouched.

    Parameters:
        text (str): Input sentence containing numeric and non-numeric tokens.
        lang (str): Language variant for spelling rules (default "pt-PT").
        strict (bool): raise or ignore exceptions in RbnfEngine

    Returns:
        str: The sentence with numbers replaced by their spelled-out forms.
    """
    if "pt-br" in lang.lower():
        lang = "pt-BR"
    brazilian = lang == "pt-BR"

    # Pre-process: glue symbols like "1 º" into "1º" for easier parsing
    tokens = text.replace(" º", "º").replace(" ª", "ª").split()
    spelled_out = []

    for idx, token in enumerate(tokens):
        # Non-numeric tokens are kept verbatim.
        if not (NumberParser.is_int(token) or NumberParser.is_float(token)):
            spelled_out.append(token)
            continue

        # Lookbehind / lookahead for grammatical context
        before = tokens[idx - 1] if idx > 0 else None
        after = tokens[idx + 1] if idx + 1 < len(tokens) else None
        try:
            spelled_out.append(NumberParser.pronounce_number_word(
                token, before, after, is_brazilian=brazilian))
        except Exception:
            if strict:
                raise
            # Best-effort mode: keep the raw token on failure.
            spelled_out.append(token)

    return " ".join(spelled_out)
295
+
296
+
297
if __name__ == "__main__":
    # Test regional spelling (19): "dezenove" (BR) vs "dezanove" (PT)
    print(f"BR: {NumberParser.pronounce_number_word('19', is_brazilian=True)}")
    print(f"PT: {NumberParser.pronounce_number_word('19', is_brazilian=False)}")

    # Test gender agreement
    print(normalize_numbers("vou comprar 1 casa"))  # uma (fem)
    print(normalize_numbers("vou comprar 2 casas"))  # duas (fem)
    print(normalize_numbers("vou adotar 1 cão"))  # um (masc)
    print(normalize_numbers("vou adotar 2 cães"))  # dois (masc)

    # Test -e suffix rule (cidade = fem)
    print(normalize_numbers("visitei 1 cidade"))  # uma (fem)

    # Scale differences: pt-PT uses the long scale, pt-BR the short scale
    print(normalize_numbers("897654356789098", "pt-PT"))  # long-scale
    # oitocentos e noventa e sete biliões seiscentos e cinquenta e quatro mil milhões trezentos e cinquenta e seis milhões setecentos e oitenta e nove mil e noventa e oito
    print(normalize_numbers("897654356789098", "pt-BR"))  # short-scale
    # oitocentos e noventa e sete trilhões seiscentos e cinquenta e quatro bilhões trezentos e cinquenta e seis milhões setecentos e oitenta e nove mil e noventa e oito

    # Scientific notation
    print(normalize_numbers("1e-3"))  # negative exponent — expected "um vezes dez elevado a menos três" (TODO confirm RBNF wording for negatives; the old comment wrongly said "nove")
    print(normalize_numbers("1e9"))  # um vezes dez elevado a nove
    print(normalize_numbers("1.5e10"))  # um vírgula cinco vezes dez elevado a dez
    print(normalize_numbers("1.5e10000000"))  # um vírgula cinco vezes dez elevado a dez milhões