tugaphone 0.0.2__tar.gz → 0.1.0a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tugaphone-0.1.0a1/PKG-INFO +12 -0
- tugaphone-0.1.0a1/tugaphone/__init__.py +131 -0
- tugaphone-0.1.0a1/tugaphone/lexicon.py +169 -0
- tugaphone-0.1.0a1/tugaphone/number_utils.py +320 -0
- tugaphone-0.1.0a1/tugaphone/pos.py +154 -0
- {tugaphone-0.0.2 → tugaphone-0.1.0a1}/tugaphone/regional_dict.csv +271 -271
- tugaphone-0.1.0a1/tugaphone/syl.py +1203 -0
- tugaphone-0.1.0a1/tugaphone/tokenizer.py +3689 -0
- {tugaphone-0.0.2 → tugaphone-0.1.0a1}/tugaphone/version.py +3 -3
- tugaphone-0.1.0a1/tugaphone.egg-info/PKG-INFO +12 -0
- {tugaphone-0.0.2 → tugaphone-0.1.0a1}/tugaphone.egg-info/SOURCES.txt +5 -2
- tugaphone-0.1.0a1/tugaphone.egg-info/requires.txt +2 -0
- tugaphone-0.0.2/PKG-INFO +0 -6
- tugaphone-0.0.2/tugaphone/__init__.py +0 -125
- tugaphone-0.0.2/tugaphone/espeak.py +0 -164
- tugaphone-0.0.2/tugaphone/util.py +0 -713
- tugaphone-0.0.2/tugaphone.egg-info/PKG-INFO +0 -6
- tugaphone-0.0.2/tugaphone.egg-info/requires.txt +0 -1
- {tugaphone-0.0.2 → tugaphone-0.1.0a1}/README.md +0 -0
- {tugaphone-0.0.2 → tugaphone-0.1.0a1}/setup.cfg +0 -0
- {tugaphone-0.0.2 → tugaphone-0.1.0a1}/setup.py +0 -0
- {tugaphone-0.0.2 → tugaphone-0.1.0a1}/tugaphone.egg-info/dependency_links.txt +0 -0
- {tugaphone-0.0.2 → tugaphone-0.1.0a1}/tugaphone.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tugaphone
|
|
3
|
+
Version: 0.1.0a1
|
|
4
|
+
Home-page: https://github.com/TigreGotico/tugaphone
|
|
5
|
+
Author: JarbasAi
|
|
6
|
+
Author-email: jarbasai@mailfence.com
|
|
7
|
+
Requires-Dist: brill-postagger
|
|
8
|
+
Requires-Dist: unicode-rbnf
|
|
9
|
+
Dynamic: author
|
|
10
|
+
Dynamic: author-email
|
|
11
|
+
Dynamic: home-page
|
|
12
|
+
Dynamic: requires-dist
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from tugaphone.lexicon import TugaLexicon
|
|
5
|
+
from tugaphone.pos import TugaTagger
|
|
6
|
+
from tugaphone.tokenizer import Sentence as Tokenizer, EuropeanPortuguese, BrazilianPortuguese
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TugaPhonemizer:
    """
    TugaPhonemizer applies dialect-aware Portuguese phonemization.

    Supports:
    - pt-PT (Portugal)
    - pt-BR (Brazil)
    - pt-AO (Angola)
    - pt-MZ (Mozambique)
    - pt-TL (Timor-Leste)
    """
    # ISO dialect code -> internal region code used by the lexicon dataset.
    _DIALECT_REGIONS = {
        "pt-PT": "lbx",
        "pt-BR": "rjx",
        "pt-AO": "lda",
        "pt-MZ": "mpx",
        "pt-TL": "dli",
    }

    def __init__(self, dictionary_path: str = None,
                 postag_engine="auto",
                 postag_model="pt_core_news_lg"):
        """
        Initialize the TugaPhonemizer by loading the regional lexicon and configuring the part-of-speech tagger.

        Parameters:
            dictionary_path (str): Path to a CSV lexicon file; if omitted, defaults to the bundled "regional_dict.csv" located next to this module.
            postag_engine (str): Tagging engine selection passed to TugaTagger (e.g., "auto" to let the tagger choose the best available engine).
            postag_model (str): Model name or identifier used by the POS tagger (for engines that accept a model parameter).
        """
        self.dictionary_path = dictionary_path or os.path.join(
            os.path.dirname(__file__), "regional_dict.csv"
        )
        self.lexicon = TugaLexicon(self.dictionary_path)
        self.postag = TugaTagger(postag_engine, postag_model)

    def _lang_to_region(self, lang: str) -> str:
        """
        Map an ISO Portuguese dialect code to the internal region code used by the lexicon.

        Parameters:
            lang (str): ISO dialect code (e.g., "pt-PT", "pt-BR").

        Returns:
            str: The corresponding internal region code.

        Raises:
            ValueError: If `lang` is not a supported dialect.
        """
        try:
            return self._DIALECT_REGIONS[lang]
        except KeyError as e:
            raise ValueError(f"Unsupported dialect: {lang}") from e

    def _get_phones(self, word: str, lang: str, pos: str,
                    region: Optional[str] = None) -> str:
        """
        Retrieve the phonemic transcription for a single word in a specified Portuguese dialect.

        Attempts to find a lexicon entry for the lowercased word using the given part-of-speech tag and a predefined sequence of POS fallbacks; if no lexicon entry is found, produces a dialect-appropriate IPA transcription via the tokenizer (European Portuguese for non-pt-BR dialects, Brazilian Portuguese for pt-BR).

        Parameters:
            word (str): The word to phonemize.
            lang (str): ISO dialect code (e.g., "pt-PT", "pt-BR") used to determine the default region when `region` is not provided.
            pos (str): The part-of-speech tag to prefer when looking up lexicon entries.
            region (Optional[str]): Optional explicit region code to use for lexicon lookup; when omitted, the region is derived from `lang`.

        Returns:
            str: A phoneme string from the regional lexicon when available, otherwise an IPA transcription produced by the tokenizer.
        """
        region = region or self._lang_to_region(lang)
        word = word.lower().strip()

        # Try the tagged POS first, then common POS classes as fallbacks.
        # Skip duplicates so each (word, pos, region) combination is looked
        # up only once (e.g. when the tagged POS is already "NOUN").
        seen = set()
        for fallback in [pos, "NOUN", "PRON", "ADP", "DET", "ADJ", "VERB", "ADV", "SCONJ"]:
            if fallback in seen:
                continue
            seen.add(fallback)
            gold_pho = self.lexicon.get_phonemes(word, fallback, region)
            if gold_pho:
                return gold_pho

        # Fallback: rule-based grapheme-to-phoneme via the tokenizer.
        dialect = EuropeanPortuguese() if lang != "pt-BR" else BrazilianPortuguese()
        return Tokenizer(surface=word, dialect=dialect).ipa

    def phonemize_sentence(self, sentence: str, lang: str = "pt-PT") -> str:
        """
        Phonemizes a sentence for the given Portuguese dialect.

        Parameters:
            sentence (str): Input sentence to phonemize.
            lang (str): ISO dialect code to target (e.g., "pt-PT", "pt-BR", "pt-AO", "pt-MZ", "pt-TL").

        Returns:
            phonemized (str): Space-separated phoneme tokens for each word; punctuation tokens are preserved unchanged.
        """
        phonemized = [self._get_phones(word=tok, lang=lang, pos=pos)
                      if pos != "PUNCT" else tok
                      for tok, pos in self.postag.tag(sentence)]

        return " ".join(phonemized)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
if __name__ == "__main__":
    phonemizer = TugaPhonemizer()

    # Sample European Portuguese sentences covering common vocabulary.
    demo_sentences = (
        "O gato dorme.",
        "Tu falas português muito bem.",
        "O comboio chegou à estação.",
        "A menina comeu o pão todo.",
        "Vou pôr a manteiga no frigorífico.",
        "Ele está a trabalhar no escritório.",
        "Choveu muito ontem à noite.",
        "A rapariga comprou um telemóvel novo.",
        "Vamos tomar um pequeno-almoço.",
        "O carro ficou sem gasolina."
    )
    dialect_codes = ("pt-PT", "pt-BR", "pt-AO", "pt-MZ", "pt-TL")

    # Print each sentence phonemized in every supported dialect.
    for sentence in demo_sentences:
        print(sentence)
        for code in dialect_codes:
            print(f"{code} → {phonemizer.phonemize_sentence(sentence, code)}")
        print("######")
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Dict, List, Tuple, Optional
|
|
3
|
+
|
|
4
|
+
# Typing helpers
|
|
5
|
+
IPA_MAP = Dict[str, Dict[str, Dict[str, str]]]
|
|
6
|
+
SYLLABLE_MAP = Dict[str, List[str]]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TugaLexicon:
    """
    Regional Portuguese lexicon backed by a CSV of phoneme and syllable data.

    Supports:
    - pt-PT (Portugal)
    - pt-BR (Brazil)
    - pt-AO (Angola)
    - pt-MZ (Mozambique)
    - pt-TL (Timor-Leste)
    """
    # ISO dialect code -> dataset region code.
    _DIALECT_REGIONS = {
        "pt-PT": "lbx",
        "pt-BR": "rjx",
        "pt-AO": "lda",
        "pt-MZ": "mpx",
        "pt-TL": "dli",
    }

    def __init__(self, dictionary_path: str = None):
        """
        Initialize the TugaLexicon and load regional phoneme and syllable data.

        Parameters:
            dictionary_path (str, optional): Path to the CSV file containing regional phoneme and syllable mappings.
                If not provided, defaults to "regional_dict.csv" located in the same directory as this module.

        Description:
            Loads the dataset at `dictionary_path` and populates `self.ipa` and `self.syllables` with region-specific
            phoneme and syllable mappings.
        """
        self.dictionary_path = dictionary_path or os.path.join(
            os.path.dirname(__file__), "regional_dict.csv"
        )
        self.ipa, self.syllables = self._load_lang_map(self.dictionary_path)

    @staticmethod
    def _load_lang_map(path: str) -> Tuple[Dict[str, Dict[str, Dict[str, str]]], Dict[str, Dict[str, List[str]]]]:
        """
        Load phoneme and syllable mappings from a CSV into region-indexed lookup structures.

        The CSV is expected to have columns in this order (header is skipped): _, word, pos, _, phonemes, syl, region.
        - `phonemes` entries use `|` as an internal separator in the file and will be represented using the middle dot `·` in the returned data.
        - `word` and `region` values are normalized to lowercase.

        Parameters:
            path (str): Path to the CSV file.

        Returns:
            Tuple[Dict[str, Dict[str, Dict[str, str]]], Dict[str, Dict[str, List[str]]]]:
                - ipa: mapping region -> word -> POS -> phoneme string (POS keys are uppercase).
                - syllables: mapping region -> word -> list of syllable segments.
        """
        ipa = {}
        syllables = {}

        with open(path, "r", encoding="utf-8") as f:
            for line in f.read().splitlines()[1:]:  # skip header
                # NOTE(review): naive split — assumes no quoted commas inside
                # fields. Also note the whole line is lowercased, which
                # lowercases the phoneme string as well; fine for this
                # dataset, but worth confirming if the CSV ever carries
                # case-sensitive transcriptions.
                _, word, pos, _, phonemes, syl, region = line.lower().split(",", 6)
                phonemes = phonemes.replace("|", "·").strip()
                word = word.strip().lower()
                region = region.strip()
                syllables.setdefault(region, {})
                ipa.setdefault(region, {}).setdefault(word, {})
                syllables[region][word] = syl.strip().replace(" ", "|").split("|")
                ipa[region][word][pos.upper()] = phonemes

        return ipa, syllables

    def lang_to_region(self, lang: str) -> str:
        """
        Map an ISO Portuguese dialect code to the internal dataset region code.

        Parameters:
            lang (str): ISO dialect code (e.g., "pt-PT", "pt-BR").

        Returns:
            region (str): Corresponding dataset region code (e.g., "lbx", "rjx").

        Raises:
            ValueError: If the provided dialect code is not supported.
        """
        try:
            return self._DIALECT_REGIONS[lang]
        except KeyError as e:
            raise ValueError(f"Unsupported dialect: {lang}") from e

    def get_phonemes(self, word: str, pos: str = "NOUN", region: str = "lbx") -> Optional[str]:
        """
        Retrieve the phoneme transcription for a word in a specific region and part of speech.

        Parameters:
            word: The word to look up; matching is case-insensitive because entries are normalized to lowercase.
            pos: Part-of-speech tag to select the transcription variant (default: "NOUN").
            region: Region code identifying the dialect dataset (e.g., "lbx").

        Returns:
            The phoneme string for the specified word and POS in the region, or None if no entry exists.

        Raises:
            KeyError: If the region is unknown.
                NOTE(review): sibling methods raise ValueError for an unknown
                region; this one propagates the raw KeyError. Kept as-is for
                backward compatibility with existing callers.
        """
        return self.ipa[region].get(word, {}).get(pos)

    def get_syllables(self, word: str, region: str = "lbx") -> List[str]:
        """
        Retrieve the syllable segments for a word in the given region.

        Parameters:
            word (str): The target word, normalized to lowercase.
            region (str): Dataset region code (e.g. 'lbx', 'rjx', 'lda', 'mpx', 'dli').

        Returns:
            List[str]: Syllable strings for the word, or an empty list when the
            word is not present in the region's dataset.

        Raises:
            ValueError: If the region code is not supported.
        """
        try:
            # Default to an empty list (not an empty dict) so the return type
            # matches the documented List[str] contract.
            return self.syllables[region].get(word, [])
        except KeyError as e:
            raise ValueError(f"Unsupported dialect: {region}") from e

    def get(self, word: str, pos: str = "NOUN", region: str = "lbx") -> Dict[str, str]:
        """
        Retrieve both syllable segmentation and phoneme transcription for a word in a given region and part of speech.

        Parameters:
            word (str): The lookup word (case-insensitive).
            pos (str): Part-of-speech tag to select the phoneme variant (default: "NOUN").
            region (str): Region code to query (e.g. "lbx") (default: "lbx").

        Returns:
            dict: A mapping with keys:
                - "syllables": list of syllable segments for the word in the region (empty list if not found).
                - "phonemes": phoneme transcription for the given POS and region, or `None` if not found.
        """
        return {
            "syllables": self.get_syllables(word, region),
            "phonemes": self.get_phonemes(word, pos, region),
        }

    def get_wordlist(self, region: str = "lbx") -> List[str]:
        """
        Return a sorted list of words available for the given region.

        Parameters:
            region (str): Region code (e.g., "lbx") identifying the dataset to query.

        Returns:
            List[str]: Words available in the lexicon for the region, sorted alphabetically.

        Raises:
            ValueError: If the region code is not supported.
        """
        try:
            return sorted(self.syllables[region].keys())
        except KeyError as e:
            raise ValueError(f"Unsupported dialect: {region}") from e
|
|
162
|
+
|
|
163
|
+
if __name__ == "__main__":
    lexicon = TugaLexicon()

    # Dump every word in the default region that has a phoneme entry.
    for entry in lexicon.get_wordlist():
        transcription = lexicon.get_phonemes(entry)
        if transcription:
            print(entry, transcription)
|
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
import string
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from unicode_rbnf import RbnfEngine, FormatPurpose
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class NumberParser:
    """
    A utility class to convert digits into their spelled-out Portuguese equivalent.

    In Portuguese, numbers must agree in gender (masculine/feminine) and
    type (cardinal/ordinal) with the nouns they modify.
    Example: '1' can be 'um' (masc.), 'uma' (fem.), 'primeiro' (1st masc.), or 'primeira' (1st fem.).

    NOTE: RbnfEngine defaults to short-scale for pt-BR and long-scale for pt-PT.
    https://en.wikipedia.org/wiki/Long_and_short_scales
    https://pt.wikipedia.org/wiki/Escalas_curta_e_longa

    Limitations:
    - Can not set number scale independently of language
    - Can not handle very large numbers TODO: document max value
    """
    # Engines are built once at class-definition time and shared by all calls.
    engine_pt = RbnfEngine.for_language("pt_PT")
    engine_br = RbnfEngine.for_language("pt")

    # Symbols used in PT to denote ordinals (like the English 'st', 'nd', 'rd')
    ORDINAL_MALE = "º"  # e.g., 1º (primeiro)
    ORDINAL_FEMALE = "ª"  # e.g., 1ª (primeira)
    ORDINAL_TOKENS = [ORDINAL_MALE, ORDINAL_FEMALE]

    @classmethod
    def pronounce_number_word(cls, word: str,
                              prev_word: Optional[str] = None,
                              next_word: Optional[str] = None,
                              gender: Optional[str] = None,
                              as_ordinal: Optional[bool] = None,
                              is_brazilian=False) -> str:
        """
        Convert a numeric token into its spelled-out Portuguese form using surrounding context.

        Parameters:
            word (str): Numeric string to convert (e.g., "1", "2.5", "1e9").
            prev_word (Optional[str]): Word immediately before `word`, used to infer gender.
            next_word (Optional[str]): Word immediately after `word`, used to infer ordinality and gender.
            gender (Optional[str]): Explicit gender override ("masculine" or "feminine"); if omitted a heuristic is applied.
            as_ordinal (Optional[bool]): If provided, forces ordinal (`True`) or cardinal (`False`) interpretation; otherwise context is used.
            is_brazilian (bool): If True, use Brazilian Portuguese formatting rules (pt-BR); otherwise use pt-PT.

        Returns:
            str: The spelled-out form of the number in Portuguese.

        Raises:
            KeyError: If the RBNF engine did not produce text for the selected ruleset.
        """
        # TODO: allow scale independent from language code
        # ie. enable pt-PT+short-scale and pt-BR+long-scale
        if cls.is_scientific_notation(word):
            return cls.pronounce_scientific(word, is_brazilian=is_brazilian)

        # 1. Determine if the number is an ordinal (1st, 2nd) or cardinal (1, 2)
        is_ord = cls.is_ordinal(word, next_word) if as_ordinal is None else as_ordinal

        # 2. Determine grammatical gender (numbers 1, 2, and hundreds change in PT)
        gender = gender or cls.get_number_gender(word, prev_word, next_word)
        fmt = FormatPurpose.ORDINAL if is_ord else FormatPurpose.CARDINAL

        # 3. Generate the base text using RBNF (Rule-Based Number Format)
        # Re-attach ordinal markers separated by a space (e.g. "1 º" -> "1º").
        # NOTE(review): the ordinal marker itself is not stripped before the
        # format_number call below — this assumes RbnfEngine accepts tokens
        # like "1º"; confirm against unicode_rbnf's input handling.
        word = word.replace(" º", "º").replace(" ª", "ª").strip()
        spelled = cls.engine_br.format_number(word, fmt) if is_brazilian else cls.engine_pt.format_number(word, fmt)

        # Select the specific ruleset based on grammar results
        if is_ord:
            key = f'spellout-ordinal-{gender}'
        else:
            key = f'spellout-cardinal-{gender}'

        text = spelled.text_by_ruleset[key]
        return text

    # digit/string conversion
    @classmethod
    def to_int(cls, word: str) -> Optional[int]:
        """
        Parse a numeric token into an integer after stripping ordinal markers and surrounding whitespace.

        Parameters:
            word (str): Input token which may contain ordinal symbols (º, ª) or surrounding whitespace.

        Returns:
            int: The parsed integer value on success.
            None: If the token contains a decimal point (treated as a non-integer) or cannot be parsed as an integer after cleaning.
        """
        if "." in word:
            return None  # may be a decimal
        try:
            # Strip ordinal markers and whitespace from both ends only
            word = word.strip(cls.ORDINAL_MALE +
                              cls.ORDINAL_FEMALE +
                              string.whitespace)
            return int(word)
        except (ValueError, TypeError):
            return None

    @classmethod
    def is_int(cls, word: str) -> bool:
        """
        Determine whether a token represents an integer (no decimal point).

        Parameters:
            word (str): Input token; ordinal markers (º, ª) and surrounding whitespace are ignored during validation.

        Returns:
            bool: `True` if the token can be parsed to an integer after cleaning, `False` otherwise.
        """
        return cls.to_int(word) is not None

    @classmethod
    def to_float(cls, word: str) -> Optional[float]:
        """
        Convert a numeric string (possibly containing ordinal markers or surrounding whitespace) into a float.

        Parameters:
            word (str): The input string to parse; may include ordinal symbols (º, ª) or whitespace.

        Returns:
            float: The parsed numeric value if conversion succeeds, `None` if the input cannot be converted to a float.
        """
        try:
            # Strip ordinal markers and whitespace from both ends only
            word = word.strip(cls.ORDINAL_MALE +
                              cls.ORDINAL_FEMALE +
                              string.whitespace)
            return float(word)
        except (ValueError, TypeError):
            return None

    @classmethod
    def is_float(cls, word: str) -> bool:
        """
        Determine whether a string represents a decimal/floating point number.

        NOTE: float() also accepts scientific notation and special values
        such as "inf"/"nan", so this is broader than a plain decimal check.
        TODO: differentiate float and decimal , float also handles scientific notation

        Returns:
            `true` if the string can be parsed as a float, `false` otherwise.
        """
        return cls.to_float(word) is not None

    @classmethod
    def is_scientific_notation(cls, word: str) -> bool:
        """
        Check whether a token uses scientific notation with a decimal mantissa and an integer exponent separated by 'e' (case-insensitive).

        Parameters:
            word (str): Token to test; the mantissa may include a decimal point or sign, and the exponent must parse as an integer (signs allowed).

        Returns:
            `true` if the token is scientific notation (e.g., "1.5e10"), `false` otherwise.
        """
        nums = word.lower().split("e")
        if len(nums) != 2:
            return False
        # NOTE: cant use .isdigit() in order to allow decimals and negative numbers
        return cls.is_float(nums[0]) and cls.is_int(nums[1])

    @classmethod
    def pronounce_scientific(cls, word: str, is_brazilian=False) -> str:
        """
        Convert a number in scientific notation into its Portuguese spoken form.

        Parameters:
            word (str): A numeric string in scientific notation (e.g., "1.5e10").
            is_brazilian (bool): If True, use Brazilian Portuguese variants; otherwise use Portugal variants.

        Returns:
            spoken (str): The spelled-out Portuguese phrase for the notation, combining mantissa and exponent (e.g., "um vírgula cinco vezes dez elevado a dez").

        Raises:
            ValueError: If `word` is not valid scientific notation.
        """
        if not cls.is_scientific_notation(word):
            raise ValueError(f"word is not scientific notation: '{word}'")
        # Spell mantissa and exponent separately, then join with the
        # Portuguese "times ten to the power of" phrase.
        a, b = word.lower().split("e")
        a_str = cls.pronounce_number_word(a, is_brazilian=is_brazilian)
        b_str = cls.pronounce_number_word(b, is_brazilian=is_brazilian)
        return f"{a_str} vezes dez elevado a {b_str}"

    # contextual rules
    @classmethod
    def is_ordinal(cls, word: str, next_word: Optional[str] = None) -> bool:
        """
        Determine whether a token represents an ordinal number.

        Parameters:
            word (str): The token to check.
            next_word (Optional[str]): The following token; used to detect a separated ordinal marker (e.g., "º", "ª").

        Returns:
            `true` if the word contains an ordinal marker or the next_word is an ordinal marker, `false` otherwise.
        """
        # Check if the symbol is a separate token or attached to the number
        if next_word in cls.ORDINAL_TOKENS:
            return True
        elif any(t in word for t in cls.ORDINAL_TOKENS):
            return True
        return False

    @classmethod
    def get_number_gender(cls, word: str,
                          prev_word: Optional[str] = None,
                          next_word: Optional[str] = None) -> str:
        """
        Determine the grammatical gender (masculine or feminine) that a numeric token should take in Portuguese.

        Parameters:
            word (str): The numeric token (may include ordinal symbols like 'º' or 'ª').
            prev_word (Optional[str]): The preceding word in context, used for heuristic cues (e.g., articles).
            next_word (Optional[str]): The following word in context, used to infer the gender of the counted noun.

        Returns:
            str: "feminine" if the number should agree as feminine, "masculine" otherwise.
        """
        # Rule A: Ordinal symbols explicitly dictate gender (º = masc, ª = fem)
        if (next_word and next_word == cls.ORDINAL_FEMALE) or cls.ORDINAL_FEMALE in word:
            return "feminine"

        # Rule B: Check preceding articles/prepositions (a, as, da, das are feminine)
        if prev_word and prev_word in ["a", "as", "da", "das"]:
            return "feminine"

        # Rule C: Check the following noun (the object being counted)
        if next_word:
            # Simple check: Words ending in 'a' are usually feminine (e.g., 'casa')
            # We strip 's' to account for plural nouns.
            if next_word.strip("s").lower().endswith("a"):
                # 1 casa (house) -> uma casa (female)
                # 1 cão (dog) -> um cão (male)
                return "feminine"

            # Rule D: Handle tricky '-e' endings
            # Words ending in -dade, -age, or -agem are consistently feminine.
            elif next_word.rstrip("sm").lower().endswith("e"):
                # words ending with "e" may be either male, female or both
                # a wordlist is needed to be sure
                # 1 ponte (bridge) -> uma ponte (female)
                # 1 dente (tooth) -> um dente (male)
                # 1 cliente -> um(a) cliente
                female_endings = ["dade", "age", "agem"]
                # -dade (Feminine): Words like felicidade (happiness), cidade (city), and liberdade (freedom) are always feminine.
                # -age / -agem (Feminine): Words like viagem (trip) or coragem (courage) are feminine.
                if any(next_word.endswith(f) for f in female_endings):
                    return "feminine"
        # by default numbers are male in portuguese
        return "masculine"
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def normalize_numbers(text: str, lang: str = "pt-PT", strict=True) -> str:
    """
    Replace numeric tokens in a sentence with their contextually correct Portuguese written forms.

    This function normalizes the language tag (treating any variant of "pt-br" as "pt-BR"), collapses spaced ordinal markers (e.g., "1 º" -> "1º") for parsing, and converts integer, float and scientific-notation tokens into their spelled-out Portuguese equivalents, preserving other tokens and surrounding context.

    Parameters:
        text (str): Input sentence containing numeric and non-numeric tokens.
        lang (str): Language variant to use for spelling rules (defaults to "pt-PT"; any "pt-br" variant is treated as "pt-BR").
        strict (bool): If True, re-raise exceptions from the RBNF engine; if False, keep the raw token on failure.

    Returns:
        str: The input sentence with numeric tokens replaced by their spelled-out Portuguese forms.
    """
    if "pt-br" in lang.lower():
        lang = "pt-BR"

    # Pre-process: ensure symbols like 1 º become 1º for easier parsing
    words = text.replace(" º", "º").replace(" ª", "ª").split()
    normalized_words = []

    for idx, word in enumerate(words):
        # Non-numeric tokens pass through untouched.
        if not (NumberParser.is_int(word) or NumberParser.is_float(word)):
            normalized_words.append(word)
            continue

        # Lookahead and lookbehind for grammatical context
        next_word = words[idx + 1] if idx + 1 < len(words) else None
        prev_word = words[idx - 1] if idx > 0 else None

        # spell out the number
        try:
            spelled = NumberParser.pronounce_number_word(
                word, prev_word, next_word, is_brazilian=lang == "pt-BR"
            )
            normalized_words.append(spelled)
        except Exception:
            if strict:
                raise  # bare raise preserves the original traceback
            # best-effort mode: keep the raw token
            normalized_words.append(word)

    return " ".join(normalized_words)
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
if __name__ == "__main__":
    # Test regional spelling (19)
    print(f"BR: {NumberParser.pronounce_number_word('19', is_brazilian=True)}")
    print(f"PT: {NumberParser.pronounce_number_word('19', is_brazilian=False)}")

    # Test gender agreement
    print(normalize_numbers("vou comprar 1 casa"))  # uma (fem)
    print(normalize_numbers("vou comprar 2 casas"))  # duas (fem)
    print(normalize_numbers("vou adotar 1 cão"))  # um (masc)
    print(normalize_numbers("vou adotar 2 cães"))  # dois (masc)

    # Test -e suffix rule (cidade = fem)
    print(normalize_numbers("visitei 1 cidade"))  # uma (fem)

    # Test long- vs short-scale naming of large numbers
    print(normalize_numbers("897654356789098", "pt-PT"))  # long-scale
    # oitocentos e noventa e sete biliões seiscentos e cinquenta e quatro mil milhões trezentos e cinquenta e seis milhões setecentos e oitenta e nove mil e noventa e oito
    print(normalize_numbers("897654356789098", "pt-BR"))  # short-scale
    # oitocentos e noventa e sete trilhões seiscentos e cinquenta e quatro bilhões trezentos e cinquenta e seis milhões setecentos e oitenta e nove mil e noventa e oito

    # Test scientific notation (including a negative exponent)
    print(normalize_numbers("1e-3"))  # um vezes dez elevado a menos três
    print(normalize_numbers("1e9"))  # um vezes dez elevado a nove
    print(normalize_numbers("1.5e10"))  # um vírgula cinco vezes dez elevado a dez
    print(normalize_numbers("1.5e10000000"))  # um vírgula cinco vezes dez elevado a dez milhões
|