tugaphone 0.0.2a1__py3-none-any.whl → 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tugaphone/__init__.py +66 -60
- tugaphone/lexicon.py +169 -0
- tugaphone/number_utils.py +320 -0
- tugaphone/pos.py +154 -0
- tugaphone/regional_dict.csv +271 -271
- tugaphone/syl.py +1203 -0
- tugaphone/tokenizer.py +3689 -0
- tugaphone/version.py +2 -2
- tugaphone-0.1.0a1.dist-info/METADATA +12 -0
- tugaphone-0.1.0a1.dist-info/RECORD +12 -0
- {tugaphone-0.0.2a1.dist-info → tugaphone-0.1.0a1.dist-info}/WHEEL +1 -1
- tugaphone/espeak.py +0 -164
- tugaphone/util.py +0 -713
- tugaphone-0.0.2a1.dist-info/METADATA +0 -8
- tugaphone-0.0.2a1.dist-info/RECORD +0 -9
- {tugaphone-0.0.2a1.dist-info → tugaphone-0.1.0a1.dist-info}/top_level.txt +0 -0
tugaphone/tokenizer.py
ADDED
|
@@ -0,0 +1,3689 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Portuguese Orthography → IPA Transcription System
|
|
3
|
+
|
|
4
|
+
This module provides comprehensive conversion from Portuguese orthography to
|
|
5
|
+
International Phonetic Alphabet (IPA) notation, following prescriptive norms
|
|
6
|
+
for European Portuguese (pt-PT), Brazilian Portuguese (pt-BR), and African
|
|
7
|
+
Portuguese variants (pt-AO, pt-MZ, pt-TL).
|
|
8
|
+
|
|
9
|
+
LINGUISTIC BACKGROUND:
|
|
10
|
+
======================
|
|
11
|
+
Portuguese orthography uses Latin script with diacritical marks to represent
|
|
12
|
+
a rich phonological system. The relationship between spelling and pronunciation
|
|
13
|
+
is relatively regular but includes context-sensitive rules, silent letters,
|
|
14
|
+
and dialectal variations.
|
|
15
|
+
|
|
16
|
+
DIALECTAL VARIATION:
|
|
17
|
+
====================
|
|
18
|
+
Portuguese exhibits significant phonological variation across regions:
|
|
19
|
+
|
|
20
|
+
1. EUROPEAN PORTUGUESE (pt-PT):
|
|
21
|
+
- Heavy vowel reduction in unstressed positions
|
|
22
|
+
- Post-alveolar fricatives for syllable-final /s, z/
|
|
23
|
+
- Velarized/dark [ɫ] in coda position
|
|
24
|
+
- Uvular [ʁ] for strong R in most regions
|
|
25
|
+
|
|
26
|
+
2. BRAZILIAN PORTUGUESE (pt-BR):
|
|
27
|
+
- Less vowel reduction (fuller vowel quality)
|
|
28
|
+
- Palatalization: /t, d/ → [tʃ, dʒ] before [i]
|
|
29
|
+
- L-vocalization: coda /l/ → [w] (creates new diphthongs)
|
|
30
|
+
- Glottal/velar [h, x] for strong R (region-dependent)
|
|
31
|
+
- Alveolar [s] for syllable-final /s/ (not palatalized)
|
|
32
|
+
- Nasal vowels less nasalized than European
|
|
33
|
+
|
|
34
|
+
3. ANGOLAN PORTUGUESE (pt-AO):
|
|
35
|
+
- Similar to European but with substrate influence
|
|
36
|
+
- Less vowel reduction than European
|
|
37
|
+
- Consistent alveolar trill [r] for R
|
|
38
|
+
- Substrate-influenced prosody from Bantu languages
|
|
39
|
+
|
|
40
|
+
4. MOZAMBICAN PORTUGUESE (pt-MZ):
|
|
41
|
+
- Similar to European with Bantu substrate
|
|
42
|
+
- Less vowel reduction
|
|
43
|
+
- May preserve distinctions lost in European
|
|
44
|
+
- Regional variation (north vs. south)
|
|
45
|
+
|
|
46
|
+
5. TIMORESE PORTUGUESE (pt-TL):
|
|
47
|
+
- Influenced by Tetum and other Austronesian languages
|
|
48
|
+
- Similar to European base with local adaptations
|
|
49
|
+
- Less widespread native use (L2 features common)
|
|
50
|
+
|
|
51
|
+
KEY PHONOLOGICAL CONCEPTS:
|
|
52
|
+
--------------------------
|
|
53
|
+
1. STRESS: Portuguese uses lexical stress (word-level prominence of syllables)
|
|
54
|
+
- Proparoxytone: stress on antepenultimate (third-to-last) syllable - rare, always marked
|
|
55
|
+
- Paroxytone: stress on penultimate (second-to-last) syllable - most common
|
|
56
|
+
- Oxytone: stress on final syllable - less common, specific phonological contexts
|
|
57
|
+
|
|
58
|
+
2. VOWEL QUALITY: Stressed vs unstressed vowels differ in quality and reduction
|
|
59
|
+
- Stressed: fuller realization, can be open [ɛ, ɔ] or closed [e, o]
|
|
60
|
+
- Unstressed: typically reduced to [ɨ] or [ɐ] in European Portuguese
|
|
61
|
+
- Brazilian: less reduction, maintains [e, o, a] quality
|
|
62
|
+
|
|
63
|
+
3. NASALIZATION: Vowels can be oral or nasal
|
|
64
|
+
- Marked by a tilde (ã, õ) or by a following nasal consonant (m, n) in coda position
|
|
65
|
+
- Creates distinct phonemes, not just allophones
|
|
66
|
+
- Less nasalized in Brazilian Portuguese
|
|
67
|
+
|
|
68
|
+
4. DIPHTHONGS: Sequences of vowel + semivowel or semivowel + vowel
|
|
69
|
+
- Falling/descending: vowel → semivowel (rei [ˈʁej])
|
|
70
|
+
- Rising/ascending: semivowel → vowel (piano [ˈpjɐnu])
|
|
71
|
+
- Can be oral or nasal
|
|
72
|
+
- Brazilian: additional diphthongs from L-vocalization
|
|
73
|
+
|
|
74
|
+
IMPLEMENTATION ARCHITECTURE:
|
|
75
|
+
============================
|
|
76
|
+
The code uses a hierarchical tokenization model that mirrors linguistic structure:
|
|
77
|
+
|
|
78
|
+
Sentence → Words → Graphemes → Characters
|
|
79
|
+
|
|
80
|
+
- Character: Single letter/symbol
|
|
81
|
+
- Grapheme: Minimal spelling unit (can be digraph like 'ch' or diphthong like 'ai')
|
|
82
|
+
- Word: Sequence of graphemes with syllable structure
|
|
83
|
+
- Sentence: Sequence of words with prosodic information
|
|
84
|
+
|
|
85
|
+
All indices are computed top-down during initialization to avoid circular dependencies.
|
|
86
|
+
Context-sensitive rules are applied bottom-up during IPA generation.
|
|
87
|
+
|
|
88
|
+
QUICK REFERENCES:
|
|
89
|
+
=================
|
|
90
|
+
- http://www.portaldalinguaportuguesa.org
|
|
91
|
+
- https://en.wiktionary.org/wiki/Wiktionary:International_Phonetic_Alphabet
|
|
92
|
+
- https://en.wiktionary.org/wiki/Appendix:Portuguese_pronunciation
|
|
93
|
+
- https://en.wiktionary.org/wiki/Appendix:Portuguese_spellings
|
|
94
|
+
- https://european-portuguese.info/vowels
|
|
95
|
+
- https://pt.wikipedia.org/wiki/L%C3%ADngua_portuguesa
|
|
96
|
+
- https://pt.wikipedia.org/wiki/Ortografia_da_l%C3%ADngua_portuguesa
|
|
97
|
+
- https://pt.wikipedia.org/wiki/Gram%C3%A1tica_da_l%C3%ADngua_portuguesa
|
|
98
|
+
- https://pt.wikipedia.org/wiki/Fonologia_da_l%C3%ADngua_portuguesa
|
|
99
|
+
- https://pt.wikipedia.org/wiki/Processo_do_vocalismo_%C3%A1tono_do_portugu%C3%AAs_europeu
|
|
100
|
+
- https://pt.wikipedia.org/wiki/Ditongo
|
|
101
|
+
- https://pt.wikipedia.org/wiki/Tritongo
|
|
102
|
+
- https://pt.wikipedia.org/wiki/Hiato_(lingu%C3%ADstica)
|
|
103
|
+
- https://pt.wikipedia.org/wiki/D%C3%ADgrafo
|
|
104
|
+
- https://pt.wikipedia.org/wiki/Fonema
|
|
105
|
+
- https://pt.wikipedia.org/wiki/Alofonia
|
|
106
|
+
"""
|
|
107
|
+
|
|
108
|
+
import dataclasses
|
|
109
|
+
import string
|
|
110
|
+
from functools import cached_property
|
|
111
|
+
from typing import List, Optional, Dict, Set
|
|
112
|
+
|
|
113
|
+
from tugaphone.number_utils import normalize_numbers
|
|
114
|
+
from tugaphone.syl import syllabify
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# =============================================================================
|
|
118
|
+
# DIALECT INVENTORY: Phonological Rules and Mappings
|
|
119
|
+
# =============================================================================
|
|
120
|
+
|
|
121
|
+
@dataclasses.dataclass()
|
|
122
|
+
class DialectInventory:
|
|
123
|
+
"""
|
|
124
|
+
Encapsulates all dialect-specific phonological rules and mappings.
|
|
125
|
+
|
|
126
|
+
This class serves as a lookup table and rule repository for converting
|
|
127
|
+
Portuguese orthography to IPA. Different Portuguese dialects (European,
|
|
128
|
+
Brazilian, etc.) can define different inventories.
|
|
129
|
+
|
|
130
|
+
DESIGN RATIONALE:
|
|
131
|
+
-----------------
|
|
132
|
+
Centralizing dialect rules in one class allows:
|
|
133
|
+
- Easy comparison between dialects
|
|
134
|
+
- Clean separation of data from logic
|
|
135
|
+
- Simple addition of new dialects
|
|
136
|
+
- Maintenance of linguistic rules in one location
|
|
137
|
+
|
|
138
|
+
Attributes:
|
|
139
|
+
dialect_code: IETF BCP 47 language tag (e.g., 'pt-PT', 'pt-BR')
|
|
140
|
+
"""
|
|
141
|
+
|
|
142
|
+
dialect_code: str = "pt-PT"
|
|
143
|
+
|
|
144
|
+
# =========================================================================
|
|
145
|
+
# SYMBOLIC CONSTANTS
|
|
146
|
+
# =========================================================================
|
|
147
|
+
# These are used in IPA output to represent prosodic features
|
|
148
|
+
|
|
149
|
+
HIATUS_TOKEN: str = "·" # Syllable boundary marker
|
|
150
|
+
PRIMARY_STRESS_TOKEN: str = "ˈ" # IPA primary stress marker (before stressed syllable)
|
|
151
|
+
SECONDARY_STRESS_TOKEN: str = "ˌ" # IPA secondary stress marker
|
|
152
|
+
|
|
153
|
+
# =========================================================================
|
|
154
|
+
# PUNCTUATION MAPPING
|
|
155
|
+
# =========================================================================
|
|
156
|
+
# Maps orthographic punctuation to prosodic IPA markers
|
|
157
|
+
# Rationale: Punctuation affects speech rhythm and pausing
|
|
158
|
+
|
|
159
|
+
PUNCT2IPA: Dict[str, str] = dataclasses.field(default_factory=dict)
|
|
160
|
+
|
|
161
|
+
# =========================================================================
|
|
162
|
+
# CHARACTER SETS
|
|
163
|
+
# =========================================================================
|
|
164
|
+
# Organized by linguistic function for efficient categorization
|
|
165
|
+
|
|
166
|
+
PUNCT_CHARS: Set[str] = dataclasses.field(default_factory=set)
|
|
167
|
+
|
|
168
|
+
# Base vowels: a, e, i, o, u
|
|
169
|
+
# Portuguese vowel system is asymmetric - more distinctions in stressed position
|
|
170
|
+
VOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
|
|
171
|
+
|
|
172
|
+
# DIACRITICS ON VOWELS:
|
|
173
|
+
# Portuguese uses diacritics to mark stress, vowel quality, and nasalization
|
|
174
|
+
|
|
175
|
+
# Acute accent (´): Marks primary stress AND open vowel quality
|
|
176
|
+
# Signals open quality only on a, e, o (vowels with an open/closed distinction); on í, ú it marks stress alone
|
|
177
|
+
# Examples: café [kɐˈfɛ], está [ɨʃˈta], avó [ɐˈvɔ]
|
|
178
|
+
ACUTE_VOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
|
|
179
|
+
|
|
180
|
+
# Grave accent (`): ARCHAIC - marked secondary stress (pre-1973 Portugal, pre-1971 Brazil)
|
|
181
|
+
# Modern usage: only 'à' (contraction a + a = à)
|
|
182
|
+
# Historical: sòmente, cafèzinho
|
|
183
|
+
GRAVE_VOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
|
|
184
|
+
|
|
185
|
+
# Circumflex (^): Marks primary stress AND closed vowel quality
|
|
186
|
+
# Only valid on a, e, o
|
|
187
|
+
# Examples: você [voˈse], avô [ɐˈvo], âmbito [ˈɐ̃bitu]
|
|
188
|
+
CIRCUM_VOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
|
|
189
|
+
|
|
190
|
+
# Tilde (~): Marks nasalization (air flow through nose)
|
|
191
|
+
# Modern Portuguese: only ã, õ are valid
|
|
192
|
+
# ẽ, ĩ, ũ: archaic or foreign words
|
|
193
|
+
# Examples: mão [ˈmɐ̃w̃], põe [ˈpõj̃]
|
|
194
|
+
TILDE_VOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
|
|
195
|
+
|
|
196
|
+
# Diaeresis/Trema (¨): ARCHAIC - marked pronounced 'u' in 'gu/qu' contexts
|
|
197
|
+
# Abolished in 1945 (Portugal) and 2009 (Brazil)
|
|
198
|
+
# Historical: lingüiça [lĩˈgwisɐ] vs linguiça [lĩˈgisɐ]
|
|
199
|
+
# Modern German names: Müller, Göring
|
|
200
|
+
TREMA_VOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
|
|
201
|
+
|
|
202
|
+
# Semivowels: Can function as vowel or consonant depending on position
|
|
203
|
+
# In Portuguese: /j/ (written i, e) and /w/ (written u, o)
|
|
204
|
+
# Examples: rei [ˈʁej] - 'i' is semivowel; rima [ˈʁimɐ] - 'i' is vowel
|
|
205
|
+
SEMIVOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
|
|
206
|
+
|
|
207
|
+
# Foreign letters: Not in traditional Portuguese alphabet
|
|
208
|
+
# k, w, y: used in loanwords, foreign names, scientific terms
|
|
209
|
+
# Examples: kilo, whisky, yen
|
|
210
|
+
FOREIGN_CHARS: Set[str] = dataclasses.field(default_factory=set)
|
|
211
|
+
|
|
212
|
+
# Front vowels: Tongue positioned forward in mouth
|
|
213
|
+
# Relevant for palatalization rules (c→s, g→ʒ before front vowels)
|
|
214
|
+
FRONT_VOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
|
|
215
|
+
|
|
216
|
+
# STRESS MARKERS (for automatic stress detection)
|
|
217
|
+
# Primary: acute accent and tilde (õ, ã are always stressed when final)
|
|
218
|
+
PRIMARY_STRESS_MARKERS: Set[str] = dataclasses.field(default_factory=set)
|
|
219
|
+
# Secondary: grave, circumflex and trema (diaeresis)
|
|
220
|
+
SECONDARY_STRESS_MARKERS: Set[str] = dataclasses.field(default_factory=set)
|
|
221
|
+
|
|
222
|
+
# =========================================================================
|
|
223
|
+
# IPA VOWEL INVENTORY
|
|
224
|
+
# =========================================================================
|
|
225
|
+
# Portuguese has one of the richest vowel systems in Romance languages
|
|
226
|
+
|
|
227
|
+
# ORAL VOWELS (air flows only through mouth):
|
|
228
|
+
# High: i [i] (si), ɨ [ɨ] (pedir-unstressed), u [u] (tu)
|
|
229
|
+
# Mid-closed: e [e] (você), o [o] (avô)
|
|
230
|
+
# Mid-open: ɛ [ɛ] (pé), ɔ [ɔ] (pó)
|
|
231
|
+
# Low: a [a] (lá-stressed), ɐ [ɐ] (casa-unstressed), ə [ə] (reduction)
|
|
232
|
+
ORAL_VOWELS: Set[str] = dataclasses.field(default_factory=set)
|
|
233
|
+
|
|
234
|
+
# NASAL VOWELS (air flows through nose AND mouth):
|
|
235
|
+
# Nasalization is phonemic in Portuguese (changes meaning)
|
|
236
|
+
# Examples: mato [ˈmatu] "bush" vs manto [ˈmɐ̃tu] "cloak"
|
|
237
|
+
NASAL_VOWELS: Set[str] = dataclasses.field(default_factory=set)
|
|
238
|
+
|
|
239
|
+
# VOWEL CATEGORIES BY OPENNESS (relevant for stress rules):
|
|
240
|
+
# These categories determine whether acute (´) or circumflex (^) is used
|
|
241
|
+
CLOSED_VOWELS: Set[str] = dataclasses.field(default_factory=set) # High vowels
|
|
242
|
+
SEMI_CLOSED_VOWELS: Set[str] = dataclasses.field(default_factory=set) # Mid-closed
|
|
243
|
+
OPEN_VOWELS: Set[str] = dataclasses.field(default_factory=set) # Low
|
|
244
|
+
SEMI_OPEN_VOWELS: Set[str] = dataclasses.field(default_factory=set) # Mid-open
|
|
245
|
+
|
|
246
|
+
ALL_VOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
|
|
247
|
+
|
|
248
|
+
# =========================================================================
|
|
249
|
+
# DIPHTHONG INVENTORIES
|
|
250
|
+
# =========================================================================
|
|
251
|
+
# Diphthongs are single-syllable vowel sequences
|
|
252
|
+
# Structure: V+G (vowel + glide/semivowel) or G+V
|
|
253
|
+
|
|
254
|
+
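# ORAL DIPHTHONGS (NOTE(review): this header originally read "FALLING (vowel → semivowel)", but the field below is named RISING_ORAL_DIPHTHONGS — confirm which direction is intended)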
|
|
255
|
+
# Format: IPA → orthographic representation
|
|
256
|
+
# The /j/ glide is written 'i' or 'e', /w/ glide is written 'u' or 'o'
|
|
257
|
+
RISING_ORAL_DIPHTHONGS: Dict[str, str] = dataclasses.field(default_factory=dict)
|
|
258
|
+
|
|
259
|
+
# NASAL DIPHTHONGS
|
|
260
|
+
# Nasalization extends across the entire diphthong
|
|
261
|
+
# Examples: mãe [ˈmɐ̃j̃], cão [ˈkɐ̃w̃], põe [ˈpõj̃]
|
|
262
|
+
FALLING_NASAL_DIPHTHONGS: Dict[str, str] = dataclasses.field(default_factory=dict)
|
|
263
|
+
|
|
264
|
+
# BRAZILIAN PORTUGUESE SPECIAL DIPHTHONGS
|
|
265
|
+
# In Brazilian dialects, coda /l/ vocalizes to [w]
|
|
266
|
+
# This creates diphthongs not present in European Portuguese
|
|
267
|
+
# Examples: Brasil [bɾaˈziw] vs [bɾɐˈziɫ] (European)
|
|
268
|
+
PTBR_DIPHTHONGS: Dict[str, str] = dataclasses.field(default_factory=dict)
|
|
269
|
+
|
|
270
|
+
# =========================================================================
|
|
271
|
+
# NORMALIZATION MAPPINGS
|
|
272
|
+
# =========================================================================
|
|
273
|
+
# Maps archaic/invalid diacritics to modern standard equivalents
|
|
274
|
+
# Rationale: Historical texts use obsolete orthography
|
|
275
|
+
|
|
276
|
+
NORMALIZED_VOWELS: Dict[str, str] = dataclasses.field(default_factory=dict)
|
|
277
|
+
|
|
278
|
+
# =========================================================================
|
|
279
|
+
# GRAPHEME → IPA MAPPINGS
|
|
280
|
+
# =========================================================================
|
|
281
|
+
# Organized by complexity: multigraphs first, then digraphs, then single chars
|
|
282
|
+
|
|
283
|
+
# TETRAGRAPHS (4-letter sequences with special pronunciation)
|
|
284
|
+
TETRAGRAM2IPA: Dict[str, str] = dataclasses.field(default_factory=dict)
|
|
285
|
+
|
|
286
|
+
# TRIGRAPHS (3-letter sequences)
|
|
287
|
+
TRIGRAM2IPA: Dict[str, str] = dataclasses.field(default_factory=dict)
|
|
288
|
+
|
|
289
|
+
# TRIPHTHONGS (vowel + semivowel + vowel in one syllable)
|
|
290
|
+
# Rare in Portuguese: mostly in derived forms
|
|
291
|
+
# Example: Paraguai [pɐɾɐˈgwaj]
|
|
292
|
+
TRIPHTHONG2IPA: Dict[str, str] = dataclasses.field(default_factory=dict)
|
|
293
|
+
|
|
294
|
+
# DIPHTHONGS (reverse mapping: orthography → IPA)
|
|
295
|
+
DIPHTHONG2IPA: Dict[str, str] = dataclasses.field(default_factory=dict)
|
|
296
|
+
|
|
297
|
+
# DIGRAPHS - CONSONANTAL
|
|
298
|
+
# Two letters representing one consonant phoneme
|
|
299
|
+
# nh [ɲ]: palatal nasal (like Spanish ñ, Italian gn)
|
|
300
|
+
# lh [ʎ]: palatal lateral (like Italian gl)
|
|
301
|
+
# ch [ʃ]: voiceless postalveolar fricative (like English sh)
|
|
302
|
+
# rr [ʀ]: uvular trill (strong R); often realized as the uvular fricative [ʁ]
|
|
303
|
+
# ss [s]: voiceless between vowels (otherwise 's' → [z])
|
|
304
|
+
DIGRAPH2IPA: Dict[str, str] = dataclasses.field(default_factory=dict)
|
|
305
|
+
|
|
306
|
+
# DIGRAPHS - NASAL VOWELS
|
|
307
|
+
# Vowel + nasal consonant (m/n) at syllable boundary → nasal vowel
|
|
308
|
+
# The 'm/n' is not pronounced separately; it nasalizes the vowel
|
|
309
|
+
# Examples: campo [ˈkɐ̃pu], antes [ˈɐ̃tɨʃ]
|
|
310
|
+
NASAL_DIGRAPHS: Dict[str, str] = dataclasses.field(default_factory=dict)
|
|
311
|
+
|
|
312
|
+
# CONSONANT HIATUS (intervocalic consonant clusters)
|
|
313
|
+
# These clusters span syllable boundaries with preserved articulation
|
|
314
|
+
# Examples: ficção [fik·ˈsɐ̃w̃], pacto [ˈpak·tu]
|
|
315
|
+
HETEROSYLLABIC_CLUSTERS: Dict[str, str] = dataclasses.field(default_factory=dict)
|
|
316
|
+
|
|
317
|
+
# ARCHAIC SILENT CONSONANTS
|
|
318
|
+
# Pre-2009 orthography included etymological consonants
|
|
319
|
+
# These were eliminated in Acordo Ortográfico
|
|
320
|
+
# Example: assumpção → assunção
|
|
321
|
+
ARCHAIC_MUTE_P: Dict[str, Set[str]] = dataclasses.field(default_factory=dict)
|
|
322
|
+
|
|
323
|
+
# FOREIGN DIGRAPHS (in loanwords)
|
|
324
|
+
FOREIGN_DIGRAPH2IPA: Dict[str, str] = dataclasses.field(default_factory=dict)
|
|
325
|
+
|
|
326
|
+
# =========================================================================
|
|
327
|
+
# HIATUS CONTEXTS
|
|
328
|
+
# =========================================================================
|
|
329
|
+
# Prefixes that force vowel separation (prevent diphthong formation)
|
|
330
|
+
# Example: bi·aturar [bi.ɐtu.ˈɾaɾ] not *[bjɐ.tu.ˈɾaɾ]
|
|
331
|
+
HIATUS_PREFIXES: Set[str] = dataclasses.field(default_factory=set)
|
|
332
|
+
|
|
333
|
+
# =========================================================================
|
|
334
|
+
# DEFAULT CHARACTER MAPPINGS
|
|
335
|
+
# =========================================================================
|
|
336
|
+
# Single character → IPA mapping (context-free baseline)
|
|
337
|
+
# Many characters have context-sensitive variants applied later
|
|
338
|
+
DEFAULT_CHAR2PHONEMES: Dict[str, str] = dataclasses.field(default_factory=dict)
|
|
339
|
+
|
|
340
|
+
# =========================================================================
|
|
341
|
+
# IRREGULAR WORD MAPPINGS
|
|
342
|
+
# =========================================================================
|
|
343
|
+
# Words with exceptional pronunciations that don't follow regular rules
|
|
344
|
+
# These override all other rules
|
|
345
|
+
IRREGULAR_WORDS: Dict[str, str] = dataclasses.field(default_factory=dict)
|
|
346
|
+
|
|
347
|
+
# =========================================================================
|
|
348
|
+
# STRESS RULES
|
|
349
|
+
# =========================================================================
|
|
350
|
+
# Portuguese stress is semi-predictable based on word endings
|
|
351
|
+
|
|
352
|
+
# OXYTONE ENDINGS (stress on final syllable)
|
|
353
|
+
# Words ending in these patterns are stressed on final syllable
|
|
354
|
+
# Examples: café, funil, rapaz, caju
|
|
355
|
+
OXYTONE_ENDINGS: Set[str] = dataclasses.field(default_factory=set)
|
|
356
|
+
|
|
357
|
+
# =========================================================================
|
|
358
|
+
# COMPILED GRAPHEME INVENTORY
|
|
359
|
+
# =========================================================================
|
|
360
|
+
# All valid multi-character graphemes for tokenization
|
|
361
|
+
# Ordered by length (longest first) for greedy matching
|
|
362
|
+
GRAPHEME_INVENTORY: List[str] = dataclasses.field(default_factory=list)
|
|
363
|
+
|
|
364
|
+
def __post_init__(self):
    """
    Run every inventory initializer, then attach the archaic word list.

    Invoked automatically after the dataclass-generated ``__init__``.
    The helper methods are called exactly once each, in the fixed order
    listed in ``setup_steps``; ``_compile_grapheme_inventory`` runs last,
    after all the mapping initializers. Subclasses can customise a
    dialect by overriding individual helpers.

    Side effects:
        Sets ``self.ARCHAIC_WORDS`` unconditionally (it is not a declared
        dataclass field, so it cannot be supplied through ``__init__``).
    """
    setup_steps = (
        "_initialize_char_lists",
        "_initialize_normalized_vowels",
        "_initialize_punctuation",
        "_initialize_consonant_digraphs",
        "_initialize_nasal_digraphs",
        "_initialize_consonant_hiatus",
        "_initialize_archaic_forms",
        "_initialize_foreign_digraphs",
        "_initialize_hiatus_prefixes",
        "_initialize_diphthongs",
        "_initialize_triphthongs",
        "_initialize_trigrams",
        "_initialize_tetragrams",
        "_initialize_default_chars",
        "_initialize_stress_rules",
        "_compile_grapheme_inventory",
    )
    # Look each helper up immediately before calling it so subclass
    # overrides are honoured and the call order stays explicit.
    for step_name in setup_steps:
        getattr(self, step_name)()

    # Until the early 20th century, both Portugal and Brazil followed an
    # orthography that, as a rule, spelled each word according to its
    # Latin or Greek etymon.
    # TODO: mapping to modern word equivalent, normalize for IPA parsing
    etymological_spellings = (
        "architectura",
        "caravella",
        "diccionario",
        "diphthongo",
        "estylo",
        "grammatica",
        "lyrio",
        "parochia",
        "kilometro",
        "orthographia",
        "pharmacia",
        "phleugma",
        "prompto",
        "psychologia",
        "psalmo",
        "rheumatismo",
        "sanccionar",
        "theatro",
    )
    self.ARCHAIC_WORDS = set(etymological_spellings)
|
|
418
|
+
|
|
419
|
+
def _initialize_char_lists(self):
    """
    Give every still-empty character class its default contents.

    Each attribute named in ``fallbacks`` is tested for truthiness; only
    falsy (empty) ones are replaced, so values supplied by the caller are
    kept. Entries are processed top to bottom, and the derived sets
    (stress markers, ``ALL_VOWEL_CHARS``) are built lazily from whatever
    the component sets hold at that moment — including caller-supplied
    overrides.
    """
    fallbacks = (
        # --- orthographic character classes --------------------------
        ("PUNCT_CHARS", lambda: set(string.punctuation)),
        ("VOWEL_CHARS", lambda: set("aeiou")),
        ("ACUTE_VOWEL_CHARS", lambda: set("áéíóú")),
        ("GRAVE_VOWEL_CHARS", lambda: set("àèìòù")),
        ("CIRCUM_VOWEL_CHARS", lambda: set("âêîôû")),
        ("TILDE_VOWEL_CHARS", lambda: set("ãõẽĩũ")),
        ("TREMA_VOWEL_CHARS", lambda: set("äëïöü")),
        ("SEMIVOWEL_CHARS", lambda: set("iueo")),
        ("FOREIGN_CHARS", lambda: set("wkyÿ")),
        ("FRONT_VOWEL_CHARS", lambda: set("eiéêí")),
        # --- derived classes (depend on the sets above) --------------
        (
            "PRIMARY_STRESS_MARKERS",
            lambda: self.ACUTE_VOWEL_CHARS | self.TILDE_VOWEL_CHARS,
        ),
        (
            "SECONDARY_STRESS_MARKERS",
            lambda: (
                self.GRAVE_VOWEL_CHARS
                | self.CIRCUM_VOWEL_CHARS
                | self.TREMA_VOWEL_CHARS
            ),
        ),
        # NOTE(review): TILDE_VOWEL_CHARS is not part of this union, so
        # ã/õ are absent from ALL_VOWEL_CHARS — confirm this is intended.
        (
            "ALL_VOWEL_CHARS",
            lambda: (
                self.VOWEL_CHARS
                | self.ACUTE_VOWEL_CHARS
                | self.GRAVE_VOWEL_CHARS
                | self.CIRCUM_VOWEL_CHARS
                | self.TREMA_VOWEL_CHARS
            ),
        ),
        # --- IPA vowel classes ---------------------------------------
        ("ORAL_VOWELS", lambda: set("ieɛɨɐəauoɔ")),
        # NOTE(review): set() iterates code points, and "ɐ̃" has no
        # precomposed form, so this set holds plain "ɐ" and a bare
        # combining tilde as separate members — confirm consumers
        # expect per-code-point membership.
        ("NASAL_VOWELS", lambda: set("ĩẽɐ̃ũõ")),
        ("CLOSED_VOWELS", lambda: set("iɨu")),
        ("SEMI_CLOSED_VOWELS", lambda: set("eo")),
        ("OPEN_VOWELS", lambda: set("a")),
        ("SEMI_OPEN_VOWELS", lambda: set("ɛɐɔ")),
    )
    for attr_name, build_default in fallbacks:
        if not getattr(self, attr_name):
            setattr(self, attr_name, build_default())
|
|
461
|
+
|
|
462
|
+
def _initialize_normalized_vowels(self):
    """
    Install the default archaic/foreign → modern vowel-letter table.

    Does nothing when ``NORMALIZED_VOWELS`` is already non-empty, so a
    caller-supplied table is kept as is. Otherwise the attribute is set
    to a dict mapping each obsolete or foreign accented letter to the
    replacement used for further processing.

    Orthographic background (reforms that retired these marks):
        - 1911: major reform in Portugal
        - 1943: Brazil's orthographic convention
        - 1945: Portugal aligns with Brazil
        - 1971/1973: further simplifications
        - 1990/2009: Acordo Ortográfico (unified orthography)
    """
    if self.NORMALIZED_VOWELS:
        return

    # Circumflex on high vowels: /i, u/ have no open/closed contrast,
    # so the accent carries no quality information and is dropped.
    circumflex_high = {
        "î": "i",
        "û": "u",
    }

    # Tilde on vowels other than a/o (mid ẽ, high ĩ/ũ): found in foreign
    # or archaic spellings; mapped to the closed mid / plain high vowel.
    tilde_non_standard = {
        "ẽ": "ê",
        "ĩ": "i",
        "ũ": "u",
    }

    # Grave accent: pre-1973 secondary-stress mark in suffixed words
    # (e.g. só + -mente → sòmente); rewritten with the acute.
    grave_obsolete = {
        "è": "é",
        "ì": "í",
        "ò": "ó",
        "ù": "ú",
    }

    # Diaeresis/trema: ü once flagged a pronounced /w/ after g/q
    # (lingüiça [lĩˈgwisɐ] vs linguiça [lĩˈgisɐ]); it is the one entry
    # that maps to "w" rather than to an accented vowel.
    trema_obsolete = {
        "ä": "á",
        "ë": "é",
        "ï": "í",
        "ö": "ó",
        "ü": "w",
        "ÿ": "í",
    }

    # Merge in the same key order as the groups above.
    self.NORMALIZED_VOWELS = {
        **circumflex_high,
        **tilde_non_standard,
        **grave_obsolete,
        **trema_obsolete,
    }
|
|
513
|
+
|
|
514
|
+
def _initialize_punctuation(self):
    """
    Install the default punctuation → prosodic-marker table.

    Does nothing when ``PUNCT2IPA`` is already non-empty. Otherwise each
    supported punctuation mark is mapped to a string built from
    ``HIATUS_TOKEN`` (and, for "!", ``PRIMARY_STRESS_TOKEN``), so custom
    token values on the instance flow into the table.

    Pause length is encoded by repeating the hiatus token:
        - short pause:  1 token  (hyphen, comma)
        - medium pause: 2 tokens (semicolon)
        - long pause:   3 tokens (period)

    "!" and "?" get a simplified marker followed by a short pause; full
    intonation annotation is outside segmental IPA transcription.
    """
    if self.PUNCT2IPA:
        return

    pause = self.HIATUS_TOKEN
    emphasis = self.PRIMARY_STRESS_TOKEN

    table = {}
    table["-"] = pause  # hyphen: brief pause
    table[","] = pause  # comma: brief pause
    table[";"] = pause * 2  # semicolon: medium pause
    table["."] = pause * 3  # period: long pause
    table["!"] = emphasis + pause  # exclamation: stress + pause
    table["?"] = "↗" + pause  # question: rising intonation + pause
    self.PUNCT2IPA = table
|
|
541
|
+
|
|
542
|
+
def _initialize_consonant_digraphs(self):
    """
    Install the default consonant-digraph → IPA table.

    Does nothing when ``DIGRAPH2IPA`` is already non-empty. Otherwise
    the attribute is set to a dict of two-letter spellings that stand
    for a single consonant.

    Current digraphs:
        - nh → palatal nasal, e.g. vinho [ˈviɲu]
        - lh → palatal lateral, e.g. filho [ˈfiʎu]
        - ch → voiceless postalveolar fricative, e.g. chuva [ˈʃuvɐ]
        - rr → strong R, written only between vowels;
          caro [ˈkaɾu] "expensive" vs carro [ˈkaʁu] "car".
          The table emits the uvular trill symbol; the examples here use
          the fricative symbol [ʁ], which is an alternative realization.
        - ss → voiceless [s] between vowels, where a single 's' is [z];
          casa [ˈkazɐ] "house" vs cassa [ˈkasɐ]

    Etymological digraphs (th, rh, ph) come from pre-reform spellings
    such as pharmacia → farmácia and are read as plain t, r, f.
    """
    if self.DIGRAPH2IPA:
        return

    modern = {
        "nh": "ɲ",
        "lh": "ʎ",
        "ch": "ʃ",
        "rr": "ʀ",  # Alternative: ʁ for uvular fricative
        "ss": "s",
    }

    # Abolished by the 1911 Orthographic Reform.
    # The digraph "ph" was replaced by the letter "f"; its [f]
    # pronunciation nevertheless survives, chiefly in proper names and
    # everyday brand names. Examples: iPhone, Philips and Phebo.
    etymological = {
        "th": "t",
        "rh": "r",
        "ph": "f",
    }

    # Modern entries first, then the etymological ones.
    self.DIGRAPH2IPA = {**modern, **etymological}
|
|
589
|
+
|
|
590
|
+
def _initialize_nasal_digraphs(self):
|
|
591
|
+
"""
|
|
592
|
+
Define vowel + nasal consonant sequences that create nasal vowels.
|
|
593
|
+
|
|
594
|
+
NASALIZATION RULES:
|
|
595
|
+
-------------------
|
|
596
|
+
In Portuguese, nasal vowels have two orthographic realizations:
|
|
597
|
+
|
|
598
|
+
1. Tilde: ã, õ (direct nasal marking)
|
|
599
|
+
2. Vowel + m/n: Nasalizes the vowel when m/n is in coda position
|
|
600
|
+
|
|
601
|
+
CODA POSITION DEFINITION:
|
|
602
|
+
--------------------------
|
|
603
|
+
m/n is in coda (nasalizes vowel) when:
|
|
604
|
+
- Word-final: tem [ˈtẽj̃], bom [ˈbõ]
|
|
605
|
+
- Before consonant: campo [ˈkɐ̃pu], ponte [ˈpõtɨ]
|
|
606
|
+
|
|
607
|
+
m/n is in onset (does NOT nasalize) when:
|
|
608
|
+
- Before vowel: caminho [kɐˈmiɲu], bonito [buˈnitu]
|
|
609
|
+
|
|
610
|
+
PHONETIC RESULT:
|
|
611
|
+
----------------
|
|
612
|
+
The nasal consonant is not pronounced separately;
|
|
613
|
+
it triggers nasal airflow throughout the vowel.
|
|
614
|
+
|
|
615
|
+
ALLOPHONIC VARIATION:
|
|
616
|
+
---------------------
|
|
617
|
+
Exact nasal vowel quality varies by context:
|
|
618
|
+
- /am/, /an/ → [ɐ̃] in most contexts
|
|
619
|
+
- /am/, /an/ → [ə̃] in European Portuguese final position
|
|
620
|
+
|
|
621
|
+
We use phonemic representations, abstracting over fine detail.
|
|
622
|
+
"""
|
|
623
|
+
if not self.NASAL_DIGRAPHS:
|
|
624
|
+
self.NASAL_DIGRAPHS = {
|
|
625
|
+
# Low vowel nasalization: /a/ + nasal
|
|
626
|
+
"am": "ɐ̃", # Example: campo [ˈkɐ̃pu]
|
|
627
|
+
"âm": "ɐ̃", # With circumflex (stress marker)
|
|
628
|
+
"an": "ɐ̃", # Example: santo [ˈsɐ̃tu]
|
|
629
|
+
"ân": "ɐ̃",
|
|
630
|
+
|
|
631
|
+
# Mid-high vowel nasalization: /e/ + nasal
|
|
632
|
+
"em": "ẽ", # Example: tempo [ˈtẽpu]
|
|
633
|
+
"êm": "ẽ",
|
|
634
|
+
"en": "ẽ", # Example: dente [ˈdẽtɨ]
|
|
635
|
+
"ên": "ẽ",
|
|
636
|
+
|
|
637
|
+
# High front vowel nasalization: /i/ + nasal
|
|
638
|
+
"im": "ĩ", # Example: sim [ˈsĩ]
|
|
639
|
+
"in": "ĩ", # Example: tinta [ˈtĩtɐ]
|
|
640
|
+
|
|
641
|
+
# Mid-back vowel nasalization: /o/ + nasal
|
|
642
|
+
"om": "õ", # Example: som [ˈsõ]
|
|
643
|
+
"ôm": "õ",
|
|
644
|
+
"on": "õ", # Example: fonte [ˈfõtɨ]
|
|
645
|
+
"ôn": "õ",
|
|
646
|
+
|
|
647
|
+
# High back vowel nasalization: /u/ + nasal
|
|
648
|
+
"um": "ũ", # Example: um [ˈũ]
|
|
649
|
+
"un": "ũ", # Example: fundo [ˈfũdu]
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
def _initialize_consonant_hiatus(self):
|
|
653
|
+
"""
|
|
654
|
+
Define consonant clusters that span syllable boundaries.
|
|
655
|
+
|
|
656
|
+
SYLLABIFICATION PRINCIPLE:
|
|
657
|
+
--------------------------
|
|
658
|
+
Portuguese syllables prefer CV (consonant-vowel) structure.
|
|
659
|
+
Certain consonant clusters cannot be parsed as single onsets,
|
|
660
|
+
so they split across syllables with a hiatus (break).
|
|
661
|
+
|
|
662
|
+
HETEROSYLLABIC CLUSTERS:
|
|
663
|
+
------------------------
|
|
664
|
+
These clusters are always split:
|
|
665
|
+
|
|
666
|
+
- cc, cç [k·s]: Represents /ks/ cluster
|
|
667
|
+
Examples: ficção [fik·ˈsɐ̃w̃], acção [ak·ˈsɐ̃w̃]
|
|
668
|
+
Note: Modern spelling often simplifies to ç: ação
|
|
669
|
+
|
|
670
|
+
- ct [k·t]: Voiceless stops across syllable boundary
|
|
671
|
+
Examples: pacto [ˈpak·tu], convicto [kõˈvik·tu]
|
|
672
|
+
|
|
673
|
+
- pt [p·t]: Bilabial + alveolar across boundary
|
|
674
|
+
Examples: apto [ˈap·tu], eucalipto [ew·kɐˈlip·tu]
|
|
675
|
+
Note: In some archaic words, 'p' was silent
|
|
676
|
+
|
|
677
|
+
- pç, pc [p·s]: Bilabial + fricative
|
|
678
|
+
Examples: opção [op·ˈsɐ̃w̃], núpcias [ˈnup·sjɐʃ]
|
|
679
|
+
|
|
680
|
+
SYLLABIFICATION ALGORITHM:
|
|
681
|
+
--------------------------
|
|
682
|
+
The syllabifier should recognize these as split clusters,
|
|
683
|
+
not as single onsets. The hiatus token (·) marks the boundary.
|
|
684
|
+
"""
|
|
685
|
+
if not self.HETEROSYLLABIC_CLUSTERS:
|
|
686
|
+
self.HETEROSYLLABIC_CLUSTERS = {
|
|
687
|
+
"cç": "k·s", # convicção, ficção, friccionar,
|
|
688
|
+
"cc": "k·s", # friccionar, cóccix, facciosa, ficcionado, infecciologia, fraccionamento
|
|
689
|
+
"ct": "k·t", # compacto, convicto, pacto, pictural;
|
|
690
|
+
"pt": "p·t", # adepto, apto, díptico, inepto, rapto. eucalipto,
|
|
691
|
+
"pç": "p·s", # erupção, opção, recepção
|
|
692
|
+
"pc": "p·s", # núpcias
|
|
693
|
+
}
|
|
694
|
+
|
|
695
|
+
def _initialize_archaic_forms(self):
|
|
696
|
+
"""
|
|
697
|
+
Define archaic silent consonant patterns.
|
|
698
|
+
|
|
699
|
+
HISTORICAL ORTHOGRAPHY:
|
|
700
|
+
-----------------------
|
|
701
|
+
Pre-2009, Portuguese preserved etymological consonants from Latin
|
|
702
|
+
even when not pronounced. The Acordo Ortográfico eliminated these.
|
|
703
|
+
|
|
704
|
+
SILENT CONSONANT RULES:
|
|
705
|
+
-----------------------
|
|
706
|
+
When 'p' appeared in clusters mpc, mpç, mpt:
|
|
707
|
+
- If 'p' was silent: m + p → n (in modern spelling)
|
|
708
|
+
- If 'p' was pronounced: cluster retained
|
|
709
|
+
|
|
710
|
+
Examples of elimination:
|
|
711
|
+
- assumpcão → assunção [ɐsũˈsɐ̃w̃]
|
|
712
|
+
- assumptível → assuntível
|
|
713
|
+
- peremptório → perentório
|
|
714
|
+
|
|
715
|
+
IMPLEMENTATION CHALLENGE:
|
|
716
|
+
-------------------------
|
|
717
|
+
Modern texts may still contain archaic spellings.
|
|
718
|
+
We need word lists to distinguish:
|
|
719
|
+
- Truly archaic: 'p' silent in both old and new spelling
|
|
720
|
+
- Etymological retention: 'p' pronounced (Egito [ˈɛʒitu] retained)
|
|
721
|
+
|
|
722
|
+
For now, we flag known archaic forms.
|
|
723
|
+
Future: Integrate comprehensive etymological dictionary.
|
|
724
|
+
"""
|
|
725
|
+
if not self.ARCHAIC_MUTE_P:
|
|
726
|
+
self.ARCHAIC_MUTE_P = {
|
|
727
|
+
"mpc": {"assumpcionista"}, # → assuncionista
|
|
728
|
+
"mpç": {"assumpção"}, # → assunção
|
|
729
|
+
"mpt": {
|
|
730
|
+
"assumptível", # → assuntível
|
|
731
|
+
"peremptório", # → perentório
|
|
732
|
+
"sumptuoso", # → suntuoso
|
|
733
|
+
"sumptuosidade" # → suntuosidade
|
|
734
|
+
},
|
|
735
|
+
}
|
|
736
|
+
|
|
737
|
+
def _initialize_foreign_digraphs(self):
|
|
738
|
+
"""
|
|
739
|
+
Define digraphs from loanwords and foreign names.
|
|
740
|
+
|
|
741
|
+
ADAPTATION RULES:
|
|
742
|
+
-----------------
|
|
743
|
+
Portuguese adapts foreign orthography to native phonology:
|
|
744
|
+
|
|
745
|
+
- ff [f]: Geminate f in Italian, French loanwords
|
|
746
|
+
Realized as single [f] in Portuguese
|
|
747
|
+
Examples: graffiti, buffet
|
|
748
|
+
|
|
749
|
+
- ll [l]: Geminate l (not palatal ʎ)
|
|
750
|
+
Realized as single [l]
|
|
751
|
+
Examples: Llosa, villa
|
|
752
|
+
|
|
753
|
+
- sh [ʃ]: English/Russian orthography
|
|
754
|
+
Adapted to Portuguese [ʃ]
|
|
755
|
+
Examples: show, shopping, Shostakovich
|
|
756
|
+
|
|
757
|
+
- th [t] or [d]: English/Greek orthography
|
|
758
|
+
Usually adapted to [t] (voiceless) or [d] (voiced)
|
|
759
|
+
Examples: thriller [ˈtɾilɛɾ], Athens [ɐˈtenɐʃ]
|
|
760
|
+
Note: Some speakers use [θ] (interdental), but non-standard
|
|
761
|
+
|
|
762
|
+
PRONUNCIATION VARIATION:
|
|
763
|
+
------------------------
|
|
764
|
+
Loanword pronunciation varies by:
|
|
765
|
+
- Speaker's education/exposure
|
|
766
|
+
- Degree of word integration
|
|
767
|
+
- Formality of context
|
|
768
|
+
|
|
769
|
+
We provide standard Portuguese adaptations.
|
|
770
|
+
"""
|
|
771
|
+
if not self.FOREIGN_DIGRAPH2IPA:
|
|
772
|
+
self.FOREIGN_DIGRAPH2IPA = {
|
|
773
|
+
"ff": "f", # Italian/French: graffiti
|
|
774
|
+
"ll": "l", # Spanish: paella (note: not palatal)
|
|
775
|
+
"sh": "ʃ", # English: show, shopping
|
|
776
|
+
"th": "t", # English: thriller (some use [d])
|
|
777
|
+
}
|
|
778
|
+
|
|
779
|
+
def _initialize_hiatus_prefixes(self):
|
|
780
|
+
"""
|
|
781
|
+
Define prefixes that force vowel hiatus (block diphthong formation).
|
|
782
|
+
|
|
783
|
+
HIATUS vs DIPHTHONG:
|
|
784
|
+
--------------------
|
|
785
|
+
When two vowels meet, they can form:
|
|
786
|
+
1. Diphthong: Single syllable (e.g., pai [ˈpaj])
|
|
787
|
+
2. Hiatus: Separate syllables (e.g., pa·ís [pɐˈiʃ])
|
|
788
|
+
|
|
789
|
+
MORPHOLOGICAL HIATUS:
|
|
790
|
+
---------------------
|
|
791
|
+
Prefix boundaries often block diphthongization:
|
|
792
|
+
- bi- + auricular → bi·auricular [bi.aw.ɾi.ku.ˈlaɾ]
|
|
793
|
+
NOT *[bjaw.ɾi.ku.ˈlaɾ]
|
|
794
|
+
- semi- + automático → semi·automático
|
|
795
|
+
- ante- + ontem → ante·ontem
|
|
796
|
+
|
|
797
|
+
PHONOLOGICAL MOTIVATION:
|
|
798
|
+
------------------------
|
|
799
|
+
Hiatus preservation maintains morphological transparency
|
|
800
|
+
(clear prefix + root boundaries) and aids comprehension.
|
|
801
|
+
|
|
802
|
+
IMPLEMENTATION:
|
|
803
|
+
---------------
|
|
804
|
+
During grapheme tokenization, if a prefix is detected,
|
|
805
|
+
insert a syllable boundary marker to prevent diphthong parsing.
|
|
806
|
+
"""
|
|
807
|
+
if not self.HIATUS_PREFIXES:
|
|
808
|
+
self.HIATUS_PREFIXES = {
|
|
809
|
+
"ante", # ante-histórico, ante-ontem
|
|
810
|
+
"bi", # bi-auricular, bi-anual
|
|
811
|
+
"semi", # semi-automático, semi-urbano
|
|
812
|
+
"mini", # mini-autocarro, mini-ópera
|
|
813
|
+
"anti", # anti-inflação, anti-oxidante
|
|
814
|
+
"multi", # multi-étnico, multi-uso
|
|
815
|
+
"auto", # auto-observação (when doubled)
|
|
816
|
+
"contra", # contra-ataque
|
|
817
|
+
"extra", # extra-oficial
|
|
818
|
+
"hiper", # hiper-ativo
|
|
819
|
+
"inter", # inter-urbano
|
|
820
|
+
"intra", # intra-ocular
|
|
821
|
+
"neo", # neo-ortodoxo
|
|
822
|
+
"pré", # pré-escolar
|
|
823
|
+
"pró", # pró-ativo
|
|
824
|
+
"re", # re-eleger (when doubled)
|
|
825
|
+
"sub", # sub-humano
|
|
826
|
+
"super", # super-homem
|
|
827
|
+
"supra", # supra-ocular
|
|
828
|
+
"ultra", # ultra-ortodoxo
|
|
829
|
+
}
|
|
830
|
+
|
|
831
|
+
# TODO - hiatus suffixes. eg. for suffix "inha" - Vinha -> V.inha
|
|
832
|
+
|
|
833
|
+
def _initialize_diphthongs(self):
|
|
834
|
+
"""
|
|
835
|
+
Define all Portuguese diphthongs (oral and nasal).
|
|
836
|
+
|
|
837
|
+
DIPHTHONG STRUCTURE:
|
|
838
|
+
--------------------
|
|
839
|
+
A diphthong is a vocalic sequence pronounced in one syllable,
|
|
840
|
+
consisting of a vowel (nucleus) and a semivowel (glide).
|
|
841
|
+
|
|
842
|
+
Classification:
|
|
843
|
+
1. By direction:
|
|
844
|
+
- Falling/descending: V + G (rei, pau)
|
|
845
|
+
- Rising/ascending: G + V (piano, água)
|
|
846
|
+
|
|
847
|
+
2. By nasalization:
|
|
848
|
+
- Oral: only oral airflow (rei)
|
|
849
|
+
- Nasal: nasal + oral airflow (mãe, cão)
|
|
850
|
+
|
|
851
|
+
FALLING ORAL DIPHTHONGS:
|
|
852
|
+
------------------------
|
|
853
|
+
Ending in [j] (spelled i or e):
|
|
854
|
+
- [aj]: pai, cai, vai
|
|
855
|
+
- [ɐj]: unstressed variant (casa > casais)
|
|
856
|
+
- [ɛj]: rei, papéis
|
|
857
|
+
- [ej]: leite, sei
|
|
858
|
+
- [ɔj]: herói, dói
|
|
859
|
+
- [oj]: boi, foi
|
|
860
|
+
- [uj]: fui, azuis
|
|
861
|
+
|
|
862
|
+
Ending in [w] (spelled u or o):
|
|
863
|
+
- [iw]: viu, partiu
|
|
864
|
+
- [ew]: meu, seu
|
|
865
|
+
- [ɛw]: céu, véu
|
|
866
|
+
- [aw]: mau, pau
|
|
867
|
+
- [ɐw]: unstressed (casa > casão)
|
|
868
|
+
- [ow]: sou, ou
|
|
869
|
+
|
|
870
|
+
FALLING NASAL DIPHTHONGS:
|
|
871
|
+
-------------------------
|
|
872
|
+
- [ɐ̃j̃]: mãe, cães (spelled ãe)
|
|
873
|
+
- [ẽj̃]: bem, também (spelled em final)
|
|
874
|
+
- [õj̃]: põe, õfões (spelled õe)
|
|
875
|
+
- [ɐ̃w̃]: cão, mão (spelled ão)
|
|
876
|
+
- [ũj̃]: muito, muitos (special case)
|
|
877
|
+
|
|
878
|
+
BRAZILIAN PORTUGUESE L-VOCALIZATION:
|
|
879
|
+
------------------------------------
|
|
880
|
+
In most Brazilian dialects, syllable-final /l/ → [w]:
|
|
881
|
+
- mal [ˈmaw] (European: [ˈmaɫ])
|
|
882
|
+
- sol [ˈsɔw] (European: [ˈsɔɫ])
|
|
883
|
+
- Brasil [bɾaˈziw] (European: [bɾɐˈziɫ])
|
|
884
|
+
|
|
885
|
+
This creates additional diphthongs not present in European Portuguese.
|
|
886
|
+
"""
|
|
887
|
+
if not self.RISING_ORAL_DIPHTHONGS:
|
|
888
|
+
self.RISING_ORAL_DIPHTHONGS = {
|
|
889
|
+
# Falling diphthongs ending in [j]
|
|
890
|
+
"aj": "ai", # pai, cai (stressed)
|
|
891
|
+
"ɐj": "ai", # variant (unstressed)
|
|
892
|
+
"ɛj": "éi", # rei, papéis
|
|
893
|
+
"ej": "ei", # leite, sei
|
|
894
|
+
"ɔj": "ói", # herói, dói
|
|
895
|
+
"oj": "oi", # boi, foi
|
|
896
|
+
"uj": "ui", # fui, azuis
|
|
897
|
+
|
|
898
|
+
# Falling diphthongs ending in [w]
|
|
899
|
+
"iw": "iu", # viu, partiu
|
|
900
|
+
"ew": "eu", # meu, seu
|
|
901
|
+
"ɛw": "éu", # céu, véu
|
|
902
|
+
"aw": "au", # mau, pau
|
|
903
|
+
"ɐw": "ao", # unstressed variant
|
|
904
|
+
"ow": "ou", # sou, ou
|
|
905
|
+
}
|
|
906
|
+
|
|
907
|
+
if not self.FALLING_NASAL_DIPHTHONGS:
|
|
908
|
+
self.FALLING_NASAL_DIPHTHONGS = {
|
|
909
|
+
"ɐ̃j": "ãe", # mãe, cães, pães
|
|
910
|
+
"ẽj": "em", # bem, também (final position)
|
|
911
|
+
"õj": "õe", # põe, limões
|
|
912
|
+
"ɐ̃w": "ão", # cão, mão, pão
|
|
913
|
+
}
|
|
914
|
+
|
|
915
|
+
if not self.PTBR_DIPHTHONGS:
|
|
916
|
+
# Brazilian Portuguese L-vocalization diphthongs
|
|
917
|
+
self.PTBR_DIPHTHONGS = {
|
|
918
|
+
"aw": "al", # mal [ˈmaw]
|
|
919
|
+
"ɛw": "el", # mel [ˈmɛw]
|
|
920
|
+
"ew": "el", # feltro [ˈfew.tɾu]
|
|
921
|
+
"iw": "il", # funil [fu.ˈniw]
|
|
922
|
+
"ɔw": "ol", # sol [ˈsɔw]
|
|
923
|
+
"ow": "ol", # soldado [sow.ˈda.du]
|
|
924
|
+
"uw": "ul", # azul [a.ˈzuw]
|
|
925
|
+
}
|
|
926
|
+
|
|
927
|
+
# Compile reverse mapping: orthography → IPA
|
|
928
|
+
if not self.DIPHTHONG2IPA:
|
|
929
|
+
self.DIPHTHONG2IPA = {
|
|
930
|
+
**{v: k for k, v in self.RISING_ORAL_DIPHTHONGS.items()},
|
|
931
|
+
**{v: k for k, v in self.FALLING_NASAL_DIPHTHONGS.items()},
|
|
932
|
+
}
|
|
933
|
+
|
|
934
|
+
def _initialize_triphthongs(self):
|
|
935
|
+
"""
|
|
936
|
+
Define Portuguese triphthongs (rare, mostly in foreign words).
|
|
937
|
+
|
|
938
|
+
TRIPHTHONG DEFINITION:
|
|
939
|
+
----------------------
|
|
940
|
+
A sequence of three vowel-like sounds in one syllable:
|
|
941
|
+
semivowel + vowel + semivowel (G-V-G)
|
|
942
|
+
|
|
943
|
+
Examples:
|
|
944
|
+
- Uruguai [u.ɾu.ˈgwaj]: G[w] + V[a] + G[j]
|
|
945
|
+
- Paraguai [pɐ.ɾɐ.ˈgwaj]
|
|
946
|
+
- miau [ˈmjaw]: G[j] + V[a] + G[w]
|
|
947
|
+
|
|
948
|
+
PHONETIC REALITY:
|
|
949
|
+
-----------------
|
|
950
|
+
True triphthongs are rare cross-linguistically.
|
|
951
|
+
Many apparent triphthongs are:
|
|
952
|
+
- Diphthong + separate vowel across syllable boundary
|
|
953
|
+
- Regional variants that simplify to diphthongs
|
|
954
|
+
|
|
955
|
+
In European Portuguese, many potential triphthongs reduce:
|
|
956
|
+
- iei → [jej] or [jɐj] depending on dialect
|
|
957
|
+
Examples: fieira, macieira
|
|
958
|
+
|
|
959
|
+
ORTHOGRAPHIC AMBIGUITY:
|
|
960
|
+
-----------------------
|
|
961
|
+
Portuguese orthography doesn't distinguish triphthongs clearly.
|
|
962
|
+
Syllabification and stress determine the parse:
|
|
963
|
+
- ca.iu [kɐ.ˈju]: hiatus (two syllables)
|
|
964
|
+
- miau [ˈmjaw]: triphthong (one syllable)
|
|
965
|
+
|
|
966
|
+
We include common patterns and flag for special handling.
|
|
967
|
+
"""
|
|
968
|
+
if not self.TRIPHTHONG2IPA:
|
|
969
|
+
self.TRIPHTHONG2IPA = {
|
|
970
|
+
# [w-a-j] sequence
|
|
971
|
+
"uai": "waj", # rare: Uruguai, Paraguai
|
|
972
|
+
# [w-ɐ̃-j] nasal sequence
|
|
973
|
+
"uão": "wɐ̃w", # rare: saguão
|
|
974
|
+
}
|
|
975
|
+
|
|
976
|
+
def _initialize_trigrams(self):
|
|
977
|
+
"""
|
|
978
|
+
Define three-letter graphemes with special pronunciations.
|
|
979
|
+
|
|
980
|
+
TYPES OF TRIGRAPHS:
|
|
981
|
+
-------------------
|
|
982
|
+
1. QU/GU before E/I with explicit vowel
|
|
983
|
+
2. Vowel sequences in hiatus or special contexts
|
|
984
|
+
3. Foreign word patterns
|
|
985
|
+
|
|
986
|
+
QUE/QUI/GUE/GUI AMBIGUITY:
|
|
987
|
+
--------------------------
|
|
988
|
+
These sequences are ambiguous in modern Portuguese:
|
|
989
|
+
- 'u' can be silent: quero [ˈkeɾu], guerra [ˈɡɛʁɐ]
|
|
990
|
+
- 'u' can be pronounced: equino [eˈkwinu], ambíguo [ɐ̃ˈbiɡwu]
|
|
991
|
+
|
|
992
|
+
Historical solution: Trema (ü) marked pronounced u
|
|
993
|
+
- lingüiça [lĩˈgwisɐ]: u pronounced
|
|
994
|
+
- linguiça [lĩˈgisɐ]: u silent
|
|
995
|
+
|
|
996
|
+
Modern challenge: No marking, must learn from context/etymology
|
|
997
|
+
|
|
998
|
+
DOUBLE-O SEQUENCES:
|
|
999
|
+
-------------------
|
|
1000
|
+
When prefix/root boundary creates -oo-, typically pronounced as:
|
|
1001
|
+
- Separate syllables: co.operação [ko.o.pɛ.ɾɐ.ˈsɐ̃w̃]
|
|
1002
|
+
- But may reduce in rapid speech
|
|
1003
|
+
|
|
1004
|
+
Special cases:
|
|
1005
|
+
- voo [ˈvo.u] or [ˈvow]: "flight" (noun from voar)
|
|
1006
|
+
- zoo [ˈzɔ.u] or [ˈzow]: "zoo"
|
|
1007
|
+
|
|
1008
|
+
We mark these for context-sensitive handling.
|
|
1009
|
+
"""
|
|
1010
|
+
if not self.TRIGRAM2IPA:
|
|
1011
|
+
self.TRIGRAM2IPA = {
|
|
1012
|
+
"tch": "tʃ", # the only true trigraph in portuguese
|
|
1013
|
+
|
|
1014
|
+
# QU/GU patterns (context-dependent, flagged for special handling)
|
|
1015
|
+
"que": "kɨ", # quero (default: u silent)
|
|
1016
|
+
"qui": "ki", # quia
|
|
1017
|
+
"gue": "ɡɨ", # guerra
|
|
1018
|
+
"gui": "ɡi", # guia
|
|
1019
|
+
"qué": "kɛ", # with explicit stress
|
|
1020
|
+
"gué": "ɡɛ",
|
|
1021
|
+
"quê": "ke",
|
|
1022
|
+
"guê": "ɡe",
|
|
1023
|
+
|
|
1024
|
+
# Double-O patterns (prefix boundaries)
|
|
1025
|
+
"coo": "ku.u", # cooperar, coordenar
|
|
1026
|
+
"joo": "ʒo.u", # enjoo
|
|
1027
|
+
"noo": "nu.u", # noológico
|
|
1028
|
+
"zoo": "zu.u", # zoologia, zoo
|
|
1029
|
+
"voo": "vo.u", # voo, revoo
|
|
1030
|
+
|
|
1031
|
+
# Foreign patterns
|
|
1032
|
+
"boo": "bu.u", # booleano
|
|
1033
|
+
"too": "tu.u", # cartoonista
|
|
1034
|
+
"woo": "wu.u", # Hollywood
|
|
1035
|
+
"hoo": "u.u", # hooliganismo
|
|
1036
|
+
|
|
1037
|
+
# Nasal patterns
|
|
1038
|
+
"ção": "sɐ̃w̃", # -ção suffix (very common)
|
|
1039
|
+
"ões": "õj̃ʃ", # plural -ões
|
|
1040
|
+
}
|
|
1041
|
+
|
|
1042
|
+
def _initialize_tetragrams(self):
|
|
1043
|
+
"""
|
|
1044
|
+
Define four-letter graphemes (very rare).
|
|
1045
|
+
|
|
1046
|
+
TETRAGRAPH CONTEXTS:
|
|
1047
|
+
--------------------
|
|
1048
|
+
Four-letter sequences with special pronunciation arise from:
|
|
1049
|
+
1. Suffix attachment: -ense, -iano
|
|
1050
|
+
2. Compound formation
|
|
1051
|
+
3. Loanwords
|
|
1052
|
+
|
|
1053
|
+
Most are analyzable as diphthong + digraph or similar,
|
|
1054
|
+
but we list them explicitly for pattern recognition.
|
|
1055
|
+
|
|
1056
|
+
GENTILICS (DEMONYMS):
|
|
1057
|
+
---------------------
|
|
1058
|
+
-iense suffix (indicating origin) creates potential tetragraphs:
|
|
1059
|
+
- gaiense: from Gaia [ɡɐj.ˈẽ.sɨ] or [ɡɐ.jẽ.sɨ]
|
|
1060
|
+
- praiense: from Praia
|
|
1061
|
+
- xangaiense: from Shanghai
|
|
1062
|
+
|
|
1063
|
+
Syllabification is variable and dialect-dependent.
|
|
1064
|
+
"""
|
|
1065
|
+
if not self.TETRAGRAM2IPA:
|
|
1066
|
+
self.TETRAGRAM2IPA = {
|
|
1067
|
+
"aien": "ɐj.ẽ", # gaiense, praiense, xangaiense
|
|
1068
|
+
|
|
1069
|
+
# Foreign words / proper nouns
|
|
1070
|
+
"guai": "gwaj", # Uruguai, Paraguai
|
|
1071
|
+
"quai": "kwaj",
|
|
1072
|
+
|
|
1073
|
+
# hiatus
|
|
1074
|
+
"iaiá": "i.ɐ.ˈja", # iaiá (Brazilian: nanny, lady)
|
|
1075
|
+
}
|
|
1076
|
+
|
|
1077
|
+
def _initialize_default_chars(self):
|
|
1078
|
+
"""
|
|
1079
|
+
Define baseline character-to-phoneme mappings.
|
|
1080
|
+
|
|
1081
|
+
DESIGN PRINCIPLE:
|
|
1082
|
+
-----------------
|
|
1083
|
+
These are CONTEXT-FREE default mappings.
|
|
1084
|
+
Many characters have context-sensitive realizations
|
|
1085
|
+
that override these defaults. Context rules are applied
|
|
1086
|
+
during IPA generation in the CharToken class.
|
|
1087
|
+
|
|
1088
|
+
VOWELS:
|
|
1089
|
+
-------
|
|
1090
|
+
Portuguese has 9 oral vowel phonemes in stressed position:
|
|
1091
|
+
/i, e, ɛ, a, ɐ, ɔ, o, u/ (plus nasal vowels)
|
|
1092
|
+
|
|
1093
|
+
Unstressed vowels reduce to smaller inventory:
|
|
1094
|
+
/i, ɨ, u, ɐ/ (European Portuguese)
|
|
1095
|
+
/i, u, a/ (Brazilian Portuguese - less reduction)
|
|
1096
|
+
|
|
1097
|
+
Default mapping uses neutral/unstressed values where ambiguous.
|
|
1098
|
+
|
|
1099
|
+
CONSONANTS:
|
|
1100
|
+
-----------
|
|
1101
|
+
Most consonants have straightforward mappings.
|
|
1102
|
+
Exceptions (context-sensitive):
|
|
1103
|
+
- c: [k] default, but [s] before e/i
|
|
1104
|
+
- g: [ɡ] default, but [ʒ] before e/i
|
|
1105
|
+
- r: [ɾ] default (tap), but [ʁ]/[ʀ] word-initially or after n/l/s
|
|
1106
|
+
- s: [s] default, but [z] intervocalically
|
|
1107
|
+
- x: [ʃ] default, but can be [ks], [z], [s], [gz] contextually
|
|
1108
|
+
- z: [z] default, but [ʃ]/[s] word-finally
|
|
1109
|
+
|
|
1110
|
+
SILENT LETTERS:
|
|
1111
|
+
---------------
|
|
1112
|
+
- h: Always silent except in digraphs (ch, nh, lh)
|
|
1113
|
+
- u: Silent in que/qui, gue/gui contexts (modern orthography)
|
|
1114
|
+
"""
|
|
1115
|
+
if not self.DEFAULT_CHAR2PHONEMES:
|
|
1116
|
+
self.DEFAULT_CHAR2PHONEMES = {
|
|
1117
|
+
# VOWELS
|
|
1118
|
+
# Low vowel: stressed [a], unstressed [ɐ]
|
|
1119
|
+
"a": "ɐ", # Default: reduced (unstressed) value
|
|
1120
|
+
"á": "a", # Acute: stressed open value
|
|
1121
|
+
"à": "a", # Grave: (rare) stressed
|
|
1122
|
+
"â": "ɐ", # Circumflex: stressed closed value (often [ɐ])
|
|
1123
|
+
"ã": "ɐ̃", # Nasal low vowel
|
|
1124
|
+
|
|
1125
|
+
# Mid-front vowel: stressed [e] or [ɛ], unstressed [ɨ]
|
|
1126
|
+
"e": "ɨ", # Default: reduced (European Portuguese)
|
|
1127
|
+
"é": "ɛ", # Acute: stressed open value
|
|
1128
|
+
"ê": "e", # Circumflex: stressed closed value
|
|
1129
|
+
|
|
1130
|
+
# High-front vowel: always [i]
|
|
1131
|
+
"i": "i",
|
|
1132
|
+
"í": "i", # Stress marker only (no quality change)
|
|
1133
|
+
|
|
1134
|
+
# Mid-back vowel: stressed [o] or [ɔ], unstressed [u]
|
|
1135
|
+
"o": "u", # Default: reduced (European Portuguese)
|
|
1136
|
+
"ó": "ɔ", # Acute: stressed open value
|
|
1137
|
+
"ô": "o", # Circumflex: stressed closed value
|
|
1138
|
+
"õ": "õ", # Nasal mid-back vowel
|
|
1139
|
+
|
|
1140
|
+
# High-back vowel: always [u]
|
|
1141
|
+
"u": "u",
|
|
1142
|
+
"ú": "u", # Stress marker only
|
|
1143
|
+
|
|
1144
|
+
# CONSONANTS
|
|
1145
|
+
# Stops
|
|
1146
|
+
"p": "p", # Voiceless bilabial stop
|
|
1147
|
+
"b": "b", # Voiced bilabial stop
|
|
1148
|
+
"t": "t", # Voiceless alveolar stop
|
|
1149
|
+
"d": "d", # Voiced alveolar stop
|
|
1150
|
+
"k": "k", # Voiceless velar stop (foreign)
|
|
1151
|
+
"c": "k", # Default: voiceless velar stop
|
|
1152
|
+
"q": "k", # Always voiceless velar (+ u)
|
|
1153
|
+
"g": "ɡ", # Default: voiced velar stop
|
|
1154
|
+
|
|
1155
|
+
# Fricatives
|
|
1156
|
+
"f": "f", # Voiceless labiodental fricative
|
|
1157
|
+
"v": "v", # Voiced labiodental fricative
|
|
1158
|
+
"s": "s", # Default: voiceless alveolar fricative
|
|
1159
|
+
"z": "z", # Voiced alveolar fricative
|
|
1160
|
+
"ç": "s", # Voiceless alveolar fricative (c-cedilla)
|
|
1161
|
+
"j": "ʒ", # Voiced postalveolar fricative
|
|
1162
|
+
"x": "ʃ", # Default: voiceless postalveolar fricative
|
|
1163
|
+
|
|
1164
|
+
# Nasals
|
|
1165
|
+
"m": "m", # Bilabial nasal
|
|
1166
|
+
"n": "n", # Alveolar nasal
|
|
1167
|
+
|
|
1168
|
+
# Liquids
|
|
1169
|
+
"l": "l", # Alveolar lateral
|
|
1170
|
+
"r": "ɾ", # Default: alveolar tap
|
|
1171
|
+
|
|
1172
|
+
# Semivowels (in consonantal position)
|
|
1173
|
+
"w": "w", # Labiovelar approximant (foreign)
|
|
1174
|
+
"y": "j", # Palatal approximant (foreign, rare)
|
|
1175
|
+
|
|
1176
|
+
# Silent
|
|
1177
|
+
"h": "", # Always silent in Portuguese
|
|
1178
|
+
}
|
|
1179
|
+
|
|
1180
|
+
def _initialize_stress_rules(self):
|
|
1181
|
+
"""
|
|
1182
|
+
Define patterns that predict stress placement.
|
|
1183
|
+
|
|
1184
|
+
PORTUGUESE STRESS SYSTEM:
|
|
1185
|
+
-------------------------
|
|
1186
|
+
Portuguese stress is SEMI-PREDICTABLE based on word shape:
|
|
1187
|
+
|
|
1188
|
+
DEFAULT RULE (Paroxytone):
|
|
1189
|
+
- Stress falls on penultimate (second-to-last) syllable
|
|
1190
|
+
- Applies to ~80% of words
|
|
1191
|
+
- Examples: casa, livro, falam
|
|
1192
|
+
|
|
1193
|
+
OXYTONE EXCEPTIONS (final syllable stress):
|
|
1194
|
+
- Words ending in: -r, -l, -z, -im, -um, nasal vowels
|
|
1195
|
+
- Examples: falar, azul, rapaz, jardim, atum, maçã
|
|
1196
|
+
- Loanwords often follow this pattern: hotel, bar
|
|
1197
|
+
|
|
1198
|
+
PROPAROXYTONE (antepenultimate stress):
|
|
1199
|
+
- ALWAYS marked with written accent
|
|
1200
|
+
- Less common (~5% of words)
|
|
1201
|
+
- Examples: médico, lâmpada, ótimo
|
|
1202
|
+
- Mostly erudite words, Latin borrowings
|
|
1203
|
+
|
|
1204
|
+
MONOSYLLABLES:
|
|
1205
|
+
- Inherently stressed (no choice of syllable)
|
|
1206
|
+
- May have accent for semantic distinction:
|
|
1207
|
+
- pé [ˈpɛ] "foot" vs pê [ˈpe] "letter P"
|
|
1208
|
+
|
|
1209
|
+
WRITTEN ACCENT RULES:
|
|
1210
|
+
---------------------
|
|
1211
|
+
Accents are written to mark:
|
|
1212
|
+
1. Unexpected stress position (proparoxytones)
|
|
1213
|
+
2. Vowel quality (é [ɛ] vs ê [e])
|
|
1214
|
+
3. Disambiguation (pára "stops" vs para "for")
|
|
1215
|
+
|
|
1216
|
+
NOTE: The 1990/2009 Acordo Ortográfico changed some rules,
|
|
1217
|
+
eliminating some accents (e.g., trema) and disambiguators.
|
|
1218
|
+
"""
|
|
1219
|
+
if not self.OXYTONE_ENDINGS:
|
|
1220
|
+
self.OXYTONE_ENDINGS = {
|
|
1221
|
+
# Consonant endings that trigger final stress
|
|
1222
|
+
"r", # falar, comer, partir
|
|
1223
|
+
"l", # azul, papel, farol
|
|
1224
|
+
"z", # rapaz, feliz, capaz
|
|
1225
|
+
"x", # fax, latex (loanwords)
|
|
1226
|
+
|
|
1227
|
+
# Nasal endings (final nasal vowels are stressed)
|
|
1228
|
+
"m", # jardim, atum, homem
|
|
1229
|
+
"n", # hífen (rare, mostly foreign)
|
|
1230
|
+
"ão", # cão, mão (diphthong)
|
|
1231
|
+
"ãe", # mãe, cães
|
|
1232
|
+
"õe", # põe, limões
|
|
1233
|
+
|
|
1234
|
+
# Diphthong endings
|
|
1235
|
+
"éi", # papéis, hotéis
|
|
1236
|
+
"éu", # troféu, céu
|
|
1237
|
+
"ói", # herói, anzóis
|
|
1238
|
+
"au", # grau, pau
|
|
1239
|
+
"áu", #
|
|
1240
|
+
|
|
1241
|
+
# Explicit stress markers (always stressed)
|
|
1242
|
+
"á", "é", "í", "ó", "ú",
|
|
1243
|
+
"â", "ê", "ô",
|
|
1244
|
+
"ã", "õ",
|
|
1245
|
+
}
|
|
1246
|
+
|
|
1247
|
+
def _compile_grapheme_inventory(self):
|
|
1248
|
+
"""
|
|
1249
|
+
Compile sorted list of all multi-character graphemes.
|
|
1250
|
+
|
|
1251
|
+
PURPOSE:
|
|
1252
|
+
--------
|
|
1253
|
+
During tokenization, we need to recognize multi-character units
|
|
1254
|
+
(digraphs, diphthongs, etc.) before processing individual characters.
|
|
1255
|
+
|
|
1256
|
+
GREEDY MATCHING PRINCIPLE:
|
|
1257
|
+
--------------------------
|
|
1258
|
+
Longer sequences must be checked first to avoid incorrect parses:
|
|
1259
|
+
- Incorrect: "ch" → ['c', 'h'] → [k, (silent)]
|
|
1260
|
+
- Correct: "ch" → ['ch'] → [ʃ]
|
|
1261
|
+
|
|
1262
|
+
SORTING:
|
|
1263
|
+
--------
|
|
1264
|
+
We sort by length (descending) to ensure longest match wins.
|
|
1265
|
+
Within same length, alphabetical order for deterministic behavior.
|
|
1266
|
+
|
|
1267
|
+
INVENTORY SOURCES:
|
|
1268
|
+
------------------
|
|
1269
|
+
- Tetragraphs (4 chars)
|
|
1270
|
+
- Trigraphs (3 chars)
|
|
1271
|
+
- Triphthongs (3 chars)
|
|
1272
|
+
- Digraphs: consonant, nasal, foreign
|
|
1273
|
+
- Diphthongs (2 chars)
|
|
1274
|
+
- Consonant hiatus patterns
|
|
1275
|
+
- Hiatus prefixes
|
|
1276
|
+
- Archaic forms
|
|
1277
|
+
|
|
1278
|
+
Single characters are NOT included (handled separately).
|
|
1279
|
+
"""
|
|
1280
|
+
if not self.GRAPHEME_INVENTORY:
|
|
1281
|
+
# Collect all multi-character graphemes
|
|
1282
|
+
all_graphemes = set()
|
|
1283
|
+
|
|
1284
|
+
# add vowel inventory
|
|
1285
|
+
all_graphemes.update(self.ALL_VOWEL_CHARS)
|
|
1286
|
+
|
|
1287
|
+
# Add from all mapping dictionaries
|
|
1288
|
+
all_graphemes.update(self.TETRAGRAM2IPA.keys())
|
|
1289
|
+
all_graphemes.update(self.TRIGRAM2IPA.keys())
|
|
1290
|
+
all_graphemes.update(self.TRIPHTHONG2IPA.keys())
|
|
1291
|
+
all_graphemes.update(self.DIGRAPH2IPA.keys())
|
|
1292
|
+
all_graphemes.update(self.DIPHTHONG2IPA.keys())
|
|
1293
|
+
all_graphemes.update(self.FOREIGN_DIGRAPH2IPA.keys())
|
|
1294
|
+
all_graphemes.update(self.HETEROSYLLABIC_CLUSTERS.keys())
|
|
1295
|
+
all_graphemes.update(self.NASAL_DIGRAPHS.keys())
|
|
1296
|
+
|
|
1297
|
+
# Add prefixes and archaic forms
|
|
1298
|
+
all_graphemes.update(self.HIATUS_PREFIXES)
|
|
1299
|
+
all_graphemes.update(self.ARCHAIC_MUTE_P.keys())
|
|
1300
|
+
|
|
1301
|
+
# Add single characters for completeness
|
|
1302
|
+
all_graphemes.update(string.ascii_lowercase)
|
|
1303
|
+
all_graphemes.update(string.punctuation)
|
|
1304
|
+
|
|
1305
|
+
# Sort: longest first (for greedy matching), then alphabetical
|
|
1306
|
+
self.GRAPHEME_INVENTORY = sorted(
|
|
1307
|
+
all_graphemes,
|
|
1308
|
+
key=lambda x: (-len(x), x)
|
|
1309
|
+
)
|
|
1310
|
+
|
|
1311
|
+
|
|
1312
|
+
# Baseline inventory shared by the dialect classes below, created with the
# generic dialect code "pt" and no table overrides.
# The base rule set follows the Acordo Ortográfico de 1990, in effect since 2009:
# https://pt.wikipedia.org/wiki/Acordo_Ortogr%C3%A1fico_de_1990
# http://www.portaldalinguaportuguesa.org/acordo.php
AO1990 = DialectInventory(dialect_code="pt")
|
|
1316
|
+
|
|
1317
|
+
|
|
1318
|
+
# =============================================================================
|
|
1319
|
+
# DIALECT INSTANCES
|
|
1320
|
+
# =============================================================================
|
|
1321
|
+
|
|
1322
|
+
class EuropeanPortuguese(DialectInventory):
    """
    Phonological inventory for European Portuguese (pt-PT).

    CHARACTERISTIC FEATURES:
    ------------------------
    1. VOWEL REDUCTION: unstressed vowels reduce heavily
       - /a/ → [ɐ], /e/ → [ɨ], /o/ → [u]
       - "pedir" [pɨˈdiɾ], "casa" [ˈkazɐ]

    2. FRICATIVE PALATALIZATION: coda /s, z/ → [ʃ, ʒ]
       - "três" [ˈtɾeʃ], "luz" [ˈluʃ]
       - [ʃ] before voiceless consonants, [ʒ] before voiced ones

    3. DARK L: coda /l/ is velarized [ɫ]
       - "Brasil" [bɾɐˈziɫ], "mal" [ˈmaɫ]

    4. UVULAR R: strong /R/ is often uvular [ʁ]
       - "rato" [ˈʁatu], "carro" [ˈkaʁu]
       - some regions keep an alveolar trill [r]

    5. NASAL VOWELS: strongly nasalized
       - "mão" [ˈmɐ̃w̃], "bem" [ˈbẽj̃]

    The constructor takes the AO1990 baseline tables and extends them
    with the pt-PT specific entries described in __init__.
    """

    def __init__(self):
        """
        Create the pt-PT inventory.

        Extends the AO1990 nasal diphthongs with the nasalized "ui" of
        "muito", extends the AO1990 triphthongs with "iei" and "iau", and
        registers whole-word transcriptions for "muito" and "miau".
        """
        # Baseline nasal diphthongs plus the special nasalized "ui" (muito).
        # NOTE(review): if the base class derives a spelling → IPA map by
        # inverting this table, "ui" may resolve to the nasal value for
        # every word, not only "muito" - confirm.
        nasal_diphthongs = dict(AO1990.FALLING_NASAL_DIPHTHONGS)
        nasal_diphthongs["ũj"] = "ui"

        # Baseline triphthongs plus the pt-PT additions.
        triphthongs = dict(AO1990.TRIPHTHONG2IPA)
        # [j-e-j]: chieira, macieira, pardieiro
        # (an alternative Lisbon realization with vowel reduction is [jɐj])
        triphthongs["iei"] = "jej"
        # [j-a-w]: miau
        triphthongs["iau"] = "jaw"

        # Whole-word overrides.
        # NOTE(review): unlike the two tables above, this one is not merged
        # with AO1990.IRREGULAR_WORDS - confirm that is intended.
        irregular_words = {
            "muito": "ˈmũj.tu",  # "ui" is nasalized in "muito"
            "miau": "ˈmjaw",  # single-syllable special case
        }

        super().__init__(
            dialect_code="pt-PT",
            FALLING_NASAL_DIPHTHONGS=nasal_diphthongs,
            TRIPHTHONG2IPA=triphthongs,
            IRREGULAR_WORDS=irregular_words,
        )
|
|
1376
|
+
|
|
1377
|
+
|
|
1378
|
+
# =============================================================================
|
|
1379
|
+
# BRAZILIAN PORTUGUESE (pt-BR)
|
|
1380
|
+
# =============================================================================
|
|
1381
|
+
|
|
1382
|
+
class BrazilianPortuguese(DialectInventory):
    """
    Phonological inventory for Brazilian Portuguese (pt-BR).

    HOW IT DIFFERS FROM EUROPEAN PORTUGUESE:
    ----------------------------------------
    1. LESS VOWEL REDUCTION: unstressed vowels keep their quality
       - European "pedir" [pɨˈdiɾ] vs. Brazilian "pedir" [peˈdʒiɾ]
       - European "casa" [ˈkazɐ] vs. Brazilian "casa" [ˈkaza]
       - /a/ stays [a] (no reduction to [ɐ])
       - /e/ stays [e] (no reduction to [ɨ])
       - /o/ stays [o] (no reduction to [u])

    2. PALATALIZATION: /t, d/ → [tʃ, dʒ] before [i]
       - "tia" [ˈtʃiɐ] (European: [ˈtiɐ])
       - "dia" [ˈdʒiɐ] (European: [ˈdiɐ])
       - "noite" [ˈnojtʃi] (European: [ˈnojtɨ])
       - "grande" [ˈɡɾɐ̃dʒi] (European: [ˈɡɾɐ̃dɨ])

    3. L-VOCALIZATION: syllable-final /l/ → [w]
       - "Brasil" [bɾaˈziw] (European: [bɾɐˈziɫ])
       - "mal" [ˈmaw] (European: [ˈmaɫ])
       - "sol" [ˈsɔw] (European: [ˈsɔɫ])
       - Produces new diphthongs from -al, -el, -il, -ol, -ul

    4. R SOUNDS: vary by region
       - São Paulo/South: [ɾ] (tap) and [x]/[h] (velar/glottal fricative)
       - Rio: [ʁ] (uvular) and [x]/[h]
       - Rural areas: may keep the alveolar trill [r]
       - "carro" [ˈkaxu] (SP) vs. [ˈkaʁu] (Rio) vs. [ˈkaru] (rural)

    5. FINAL /s/: remains [s], no palatalization
       - "três" [ˈtɾes] (European: [ˈtɾeʃ])
       - "nós" [ˈnɔs] (European: [ˈnɔʃ])

    6. LIGHTER NASALITY: nasal vowels are less nasalized than in European
       - Nasalization is lighter
       - Nasal quality may be shorter

    7. OPEN VOWELS UNDER STRESS:
       - Stronger tendency toward open [ɛ, ɔ] in stressed syllables
       - "café" [kaˈfɛ]
       - "avó" [aˈvɔ]
    """

    def __init__(self):
        """Build the pt-BR tables from the AO1990 defaults plus overrides."""
        # Digraphs: only the strong-R spelling diverges.
        digraphs = dict(AO1990.DIGRAPH2IPA)
        digraphs["rr"] = "h"  # DIVERGENCE: Brazilian uses [h] or [x] instead of [ʁ]

        # Single characters: unreduced vowels and a tapped single "r".
        phonemes = dict(AO1990.DEFAULT_CHAR2PHONEMES)
        phonemes.update({
            # VOWELS - LESS REDUCTION IN BRAZILIAN
            "a": "a",  # DIVERGENCE: stays [a], not [ɐ]
            "â": "a",  # DIVERGENCE: stays [a], not [ɐ]
            "e": "e",  # DIVERGENCE: stays [e], not [ɨ]
            "o": "o",  # DIVERGENCE: stays [o], not [u]
            # CONSONANTS
            "r": "ɾ",  # DIVERGENCE: tap, strong R is [h]
        })

        super().__init__(
            dialect_code="pt-BR",
            DIGRAPH2IPA=digraphs,
            DEFAULT_CHAR2PHONEMES=phonemes,
        )
|
|
1445
|
+
|
|
1446
|
+
|
|
1447
|
+
# =============================================================================
|
|
1448
|
+
# ANGOLAN PORTUGUESE (pt-AO)
|
|
1449
|
+
# =============================================================================
|
|
1450
|
+
|
|
1451
|
+
class AngolanPortuguese(DialectInventory):
    """
    Phonological inventory for Angolan Portuguese (pt-AO).

    MAIN TRAITS:
    ------------
    1. BASE: close to European Portuguese, with modifications

    2. VOWEL REDUCTION: weaker than European, stronger than Brazilian
       - Sits between the European and Brazilian systems
       - Shaped by the Bantu substrate (Kimbundu, Umbundu, Kikongo)

    3. R SOUNDS: consistent alveolar trill [r]
       - Keeps the tap [ɾ] / trill [r] distinction
       - More conservative than European or Brazilian
       - "carro" [ˈkaru] (not [ˈkaʁu] or [ˈkaxu])

    4. PROSODY: influenced by Bantu tone languages
       - Intonation patterns may differ
       - Stress patterns similar to European

    5. FINAL /s/: generally [ʃ], as in European
       - "três" [ˈtɾeʃ]

    6. SUBSTRATE INFLUENCE: phonological features from Bantu languages
       - May preserve some consonant distinctions
       - Prosodic patterns influenced by L1 Bantu speakers
    """

    def __init__(self):
        """Build the pt-AO tables from the AO1990 defaults plus overrides."""
        digraphs = dict(AO1990.DIGRAPH2IPA)
        digraphs["rr"] = "r"  # DIVERGENCE: Angolan uses alveolar trill [r]

        # Moderate vowel reduction (between European and Brazilian)
        phonemes = dict(AO1990.DEFAULT_CHAR2PHONEMES)
        phonemes.update({
            "e": "e",  # DIVERGENCE: Less reduction than European [ɨ]
            "o": "o",  # DIVERGENCE: Less reduction than European [u]
            "r": "ɾ",  # DIVERGENCE: Strong R is [r], not [ʁ]
        })

        super().__init__(
            dialect_code="pt-AO",
            DIGRAPH2IPA=digraphs,
            DEFAULT_CHAR2PHONEMES=phonemes,
        )
|
|
1494
|
+
|
|
1495
|
+
|
|
1496
|
+
# =============================================================================
|
|
1497
|
+
# MOZAMBICAN PORTUGUESE (pt-MZ)
|
|
1498
|
+
# =============================================================================
|
|
1499
|
+
|
|
1500
|
+
class MozambicanPortuguese(DialectInventory):
    """
    Phonological inventory for Mozambican Portuguese (pt-MZ).

    MAIN TRAITS:
    ------------
    1. BASE: close to European Portuguese, with a Bantu substrate

    2. VOWEL REDUCTION: variable, generally weaker than European
       - Influenced by substrate languages (Makhuwa, Tsonga, Sena)
       - May preserve more vowel distinctions

    3. R SOUNDS: alveolar trill [r] is common
       - Similar to Angolan Portuguese
       - "carro" [ˈkaru]

    4. REGIONAL VARIATION:
       - North (Nampula): more substrate influence
       - South (Maputo): closer to European/South African Portuguese

    5. FINAL /s/: generally [ʃ], as in European
       - "nós" [ˈnɔʃ]

    6. PROSODY: Bantu-influenced intonation
       - Rhythm patterns may differ
    """

    def __init__(self):
        """Build the pt-MZ tables from the AO1990 defaults plus overrides."""
        digraphs = dict(AO1990.DIGRAPH2IPA)
        digraphs["rr"] = "r"  # DIVERGENCE: Mozambican uses alveolar trill [r]

        # Moderate vowel reduction (between European and Brazilian)
        phonemes = dict(AO1990.DEFAULT_CHAR2PHONEMES)
        phonemes.update({
            "e": "e",  # DIVERGENCE: Less reduction than European [ɨ]
            "o": "o",  # DIVERGENCE: Less reduction than European [u]
            "r": "ɾ",  # DIVERGENCE: Strong R is [r], not [ʁ]
        })

        super().__init__(
            dialect_code="pt-MZ",
            DIGRAPH2IPA=digraphs,
            DEFAULT_CHAR2PHONEMES=phonemes,
        )
|
|
1541
|
+
|
|
1542
|
+
|
|
1543
|
+
# =============================================================================
|
|
1544
|
+
# TIMORESE PORTUGUESE (pt-TL)
|
|
1545
|
+
# =============================================================================
|
|
1546
|
+
|
|
1547
|
+
class TimoresePortuguese(DialectInventory):
    """
    Timorese Portuguese (East Timor) phonological inventory.

    CHARACTERISTIC FEATURES:
    ------------------------
    1. BASE: European Portuguese with Austronesian substrate influence
       - Primary substrate: Tetum
       - Also influenced by Indonesian

    2. L2 FEATURES: Portuguese often learned as second language
       - May show substrate transfer from Tetum
       - More conservative/formal pronunciation
       - Less naturalistic reduction

    3. VOWEL SYSTEM: Similar to European but may be simpler
       - Less vowel reduction than European
       - May neutralize some distinctions

    4. R SOUNDS: Variable
       - May use alveolar tap [ɾ] and trill [r]
       - Less uvular [ʁ] than European

    5. FINAL /s/: Generally [ʃ] like European
       - "nós" [ˈnɔʃ]

    6. SMALLER SPEAKER BASE: Portuguese is official but less widely native
       - More formal/prescriptive forms common
       - Less dialectal innovation
    """

    def __init__(self):
        """
        Build the pt-TL tables from the AO1990 defaults plus overrides.

        The base initializer is called exactly once, with the Timorese
        dialect code. Previously a second call re-initialized the
        instance with dialect_code="pt-MZ", so the inventory identified
        itself as Mozambican.
        """
        super().__init__(
            dialect_code="pt-TL",
            DIGRAPH2IPA={
                **AO1990.DIGRAPH2IPA,
                "rr": "r",  # DIVERGENCE: Timorese uses alveolar trill [r]
            },
            # Reduced vowel reduction compared with European
            DEFAULT_CHAR2PHONEMES={
                **AO1990.DEFAULT_CHAR2PHONEMES,
                "a": "a",  # DIVERGENCE: Less reduction
                "e": "e",  # DIVERGENCE: Less reduction than European [ɨ]
                "o": "o",  # DIVERGENCE: Less reduction than European [u]
                "r": "ɾ",  # DIVERGENCE: Strong R is [r], not [ʁ]
            },
        )
|
|
1595
|
+
|
|
1596
|
+
|
|
1597
|
+
# =============================================================================
|
|
1598
|
+
# Helper Functions
|
|
1599
|
+
# =============================================================================
|
|
1600
|
+
|
|
1601
|
+
def detect_stress_position(word: str, syllables: List[str], dialect: DialectInventory) -> int:
    """
    Return the 0-based index of the syllable that carries primary stress.

    DECISION ORDER:
    ---------------
    1. A single syllable is always the stressed one.
    2. The first syllable containing any character from
       ``dialect.PRIMARY_STRESS_MARKERS`` wins.
    3. If ``word`` ends with any of ``dialect.OXYTONE_ENDINGS``, the
       final syllable is stressed.
    4. Otherwise the penultimate syllable is stressed (paroxytone rule).

    Args:
        word: Normalized word string
        syllables: Syllables of ``word``, in order
        dialect: Inventory providing PRIMARY_STRESS_MARKERS and OXYTONE_ENDINGS

    Returns:
        Index into ``syllables`` of the stressed syllable

    Examples:
        >>> detect_stress_position("café", ["ca", "fé"], dialect)
        1  # explicit accent on the last syllable

        >>> detect_stress_position("casa", ["ca", "sa"], dialect)
        0  # default penultimate

        >>> detect_stress_position("falar", ["fa", "lar"], dialect)
        1  # oxytone ending -r
    """
    count = len(syllables)

    # Monosyllables are inherently stressed.
    if count == 1:
        return 0

    # First syllable that carries an explicit stress mark, if any.
    accented = next(
        (
            position
            for position, chunk in enumerate(syllables)
            if any(mark in chunk for mark in dialect.PRIMARY_STRESS_MARKERS)
        ),
        None,
    )
    if accented is not None:
        return accented

    # Oxytone endings put the stress on the last syllable.
    if any(word.endswith(tail) for tail in dialect.OXYTONE_ENDINGS):
        return count - 1

    # Paroxytone default; an empty syllable list falls back to 0.
    if count >= 2:
        return count - 2
    return 0
|
|
1647
|
+
|
|
1648
|
+
|
|
1649
|
+
def is_grapheme_silent(grapheme: str, context_before: str, context_after: str,
                       word: str, dialect: DialectInventory) -> bool:
    """
    Determine if a grapheme has no phonetic realization (silent).

    SILENT CATEGORIES:
    ------------------
    1. H: Always silent except in digraphs (handled separately)
    2. U in QU/GU: Silent before e/i in modern orthography
       - Exception: a small built-in list of words with pronounced [w]
    3. Archaic consonants: p in mpt/mpc/mpç (pre-2009 spelling), but only
       for words listed in ``dialect.ARCHAIC_MUTE_P``
    4. First letter of doubled consonants in digraphs (rr, ss, ...):
       not handled here, because such digraphs arrive as a single grapheme

    Args:
        grapheme: The grapheme to check
        context_before: Text before the grapheme; only its last character
            is inspected, so either the single preceding character or the
            whole word prefix may be passed
        context_after: Text after the grapheme; only its first character
            is inspected
        word: Full word (for exception/irregular word lookup)
        dialect: DialectInventory providing ARCHAIC_MUTE_P

    Returns:
        True if grapheme is silent, False otherwise

    Examples:
        >>> is_grapheme_silent('h', '', 'oje', 'hoje', dialect)
        True  # h is always silent

        >>> is_grapheme_silent('u', 'q', 'ero', 'quero', dialect)
        True  # u silent in 'que'

        >>> is_grapheme_silent('u', 'eq', 'ino', 'equino', dialect)
        False  # u pronounced in 'equino' [eˈkwinu] (listed exception)
    """
    g = grapheme.lower()
    before = context_before.lower()
    after = context_after.lower()
    word_lc = word.lower()
    following = after[:1]  # "" when nothing follows

    # H is always silent in Portuguese
    if g == "h":
        return True

    # U after Q or G before E or I (modern orthography default: silent).
    # Historical note: the trema (ü) used to mark a pronounced u; modern
    # spelling needs a word list to tell the cases apart.
    # endswith() is used so that a full prefix such as "eq" works as well
    # as the single preceding character "q".
    if g == "u" and before.endswith(("q", "g")) and following in ("e", "i"):
        # Known exceptions where the u is pronounced. "antiguidade" and
        # "linguiça" are separate entries; they used to be fused into one
        # string and therefore never matched.
        pronounced_u = (
            "equino", "antiguidade", "linguiça", "pinguim",
            "frequente", "frequentemente",
        )
        # Otherwise assume silent (most common)
        return word_lc not in pronounced_u

    # Archaic silent P in mpc/mpç/mpt; only words present in the dialect's
    # archaic word list qualify, so regular words like "campo" stay voiced.
    if g == "p" and before.endswith("m") and following in ("c", "ç", "t"):
        for cluster, words in dialect.ARCHAIC_MUTE_P.items():
            if cluster in word_lc and word_lc in words:
                return True

    # Geminate digraphs (rr, ss, ff, ll, mm) are handled at grapheme level:
    # the grapheme would be "rr" as a unit, not two separate letters.
    return False
|
|
1713
|
+
|
|
1714
|
+
|
|
1715
|
+
# =============================================================================
|
|
1716
|
+
# CHARACTER TOKEN
|
|
1717
|
+
# =============================================================================
|
|
1718
|
+
|
|
1719
|
+
@dataclasses.dataclass
|
|
1720
|
+
class CharToken:
|
|
1721
|
+
"""
|
|
1722
|
+
Represents a single character with its phonological context.
|
|
1723
|
+
|
|
1724
|
+
LINGUISTIC ROLE:
|
|
1725
|
+
----------------
|
|
1726
|
+
Characters are the atomic units of orthography.
|
|
1727
|
+
Their phonetic realization depends on:
|
|
1728
|
+
- Inherent properties (vowel/consonant, diacritics)
|
|
1729
|
+
- Linear context (preceding/following characters)
|
|
1730
|
+
- Hierarchical context (parent grapheme, syllable, word)
|
|
1731
|
+
- Prosodic context (stress, position in word)
|
|
1732
|
+
|
|
1733
|
+
DESIGN RATIONALE:
|
|
1734
|
+
-----------------
|
|
1735
|
+
We track both the character itself and its context
|
|
1736
|
+
to enable context-sensitive phonological rules.
|
|
1737
|
+
All indices are computed during initialization to avoid
|
|
1738
|
+
circular dependencies.
|
|
1739
|
+
|
|
1740
|
+
Attributes:
|
|
1741
|
+
surface: The actual character string (may include diacritics)
|
|
1742
|
+
char_idx: Position within parent grapheme (0-based)
|
|
1743
|
+
parent_grapheme: GraphemeToken containing this character
|
|
1744
|
+
dialect: DialectInventory with phonological rules
|
|
1745
|
+
"""
|
|
1746
|
+
|
|
1747
|
+
surface: str
|
|
1748
|
+
char_idx: int = 0 # parent_grapheme.characters[idx] == self
|
|
1749
|
+
parent_grapheme: Optional["GraphemeToken"] = None
|
|
1750
|
+
dialect: DialectInventory = dataclasses.field(default_factory=EuropeanPortuguese)
|
|
1751
|
+
|
|
1752
|
+
# Precomputed indices (set during initialization)
|
|
1753
|
+
_idx_in_word: int = -1
|
|
1754
|
+
_idx_in_sentence: int = -1
|
|
1755
|
+
|
|
1756
|
+
def __post_init__(self):
|
|
1757
|
+
"""
|
|
1758
|
+
Validate and precompute indices.
|
|
1759
|
+
|
|
1760
|
+
Indices are computed top-down during sentence initialization
|
|
1761
|
+
to avoid circular dependency issues.
|
|
1762
|
+
"""
|
|
1763
|
+
# Validation
|
|
1764
|
+
if len(self.surface) != 1:
|
|
1765
|
+
raise ValueError(f"CharToken must contain exactly one character, got: {self.surface}")
|
|
1766
|
+
|
|
1767
|
+
self._idx_in_word = self.parent_grapheme.idx_in_word + self.char_idx
|
|
1768
|
+
self._idx_in_sentence = self.parent_grapheme.idx_in_sentence + self.char_idx
|
|
1769
|
+
|
|
1770
|
+
# =========================================================================
|
|
1771
|
+
# BASIC PROPERTIES
|
|
1772
|
+
# =========================================================================
|
|
1773
|
+
|
|
1774
|
+
@cached_property
|
|
1775
|
+
def normalized(self) -> str:
|
|
1776
|
+
"""
|
|
1777
|
+
Lowercase, normalized form of character.
|
|
1778
|
+
|
|
1779
|
+
Normalization maps archaic/foreign diacritics to standard equivalents.
|
|
1780
|
+
Examples:
|
|
1781
|
+
- ü → w (represents [w] sound)
|
|
1782
|
+
- è → é (obsolete grave → modern acute)
|
|
1783
|
+
- î → i (redundant circumflex → plain)
|
|
1784
|
+
"""
|
|
1785
|
+
s = self.surface.lower().strip()
|
|
1786
|
+
return self.dialect.NORMALIZED_VOWELS.get(s, s)
|
|
1787
|
+
|
|
1788
|
+
# =========================================================================
|
|
1789
|
+
# INDICES AND CONTEXT
|
|
1790
|
+
# =========================================================================
|
|
1791
|
+
|
|
1792
|
+
@property
|
|
1793
|
+
def idx_in_word(self) -> int:
|
|
1794
|
+
"""Index of this character in parent word."""
|
|
1795
|
+
return self._idx_in_word
|
|
1796
|
+
|
|
1797
|
+
@property
|
|
1798
|
+
def idx_in_sentence(self) -> int:
|
|
1799
|
+
"""Index of this character in parent sentence."""
|
|
1800
|
+
return self._idx_in_sentence
|
|
1801
|
+
|
|
1802
|
+
@cached_property
|
|
1803
|
+
def parent_word(self) -> Optional['WordToken']:
|
|
1804
|
+
"""The word containing this character."""
|
|
1805
|
+
if self.parent_grapheme:
|
|
1806
|
+
return self.parent_grapheme.parent_word
|
|
1807
|
+
return None
|
|
1808
|
+
|
|
1809
|
+
@cached_property
|
|
1810
|
+
def parent_sentence(self) -> Optional['Sentence']:
|
|
1811
|
+
"""The sentence containing this character."""
|
|
1812
|
+
if not self.parent_word:
|
|
1813
|
+
return None
|
|
1814
|
+
return self.parent_word.parent_sentence
|
|
1815
|
+
|
|
1816
|
+
@cached_property
|
|
1817
|
+
def prev_char(self) -> Optional['CharToken']:
|
|
1818
|
+
"""Previous character in the grapheme, or None if first."""
|
|
1819
|
+
if not self.parent_grapheme:
|
|
1820
|
+
return None
|
|
1821
|
+
if self.char_idx == 0:
|
|
1822
|
+
# TODO: go to prev grapheme
|
|
1823
|
+
return None
|
|
1824
|
+
return self.parent_grapheme.characters[self.char_idx - 1]
|
|
1825
|
+
|
|
1826
|
+
@cached_property
|
|
1827
|
+
def next_char(self) -> Optional['CharToken']:
|
|
1828
|
+
"""Next character in the grapheme, or None if last."""
|
|
1829
|
+
if self.char_idx == -1 or not self.parent_grapheme:
|
|
1830
|
+
return None
|
|
1831
|
+
if self.char_idx >= len(self.parent_grapheme.characters) - 1:
|
|
1832
|
+
# TODO: go to next grapheme
|
|
1833
|
+
return None
|
|
1834
|
+
return self.parent_grapheme.characters[self.char_idx + 1]
|
|
1835
|
+
|
|
1836
|
+
# -------------------------------
|
|
1837
|
+
# Look-behind/ahead
|
|
1838
|
+
# -------------------------------
|
|
1839
|
+
@property
|
|
1840
|
+
def prefix(self) -> str:
|
|
1841
|
+
return self.parent_grapheme.prefix + "".join([c.normalized for c in self._prev_chars])
|
|
1842
|
+
|
|
1843
|
+
@property
|
|
1844
|
+
def suffix(self) -> str:
|
|
1845
|
+
return "".join([c.normalized for c in self._next_chars]) + self.parent_grapheme.suffix
|
|
1846
|
+
|
|
1847
|
+
@cached_property
|
|
1848
|
+
def _prev_chars(self) -> List['CharToken']:
|
|
1849
|
+
if self.char_idx == 0:
|
|
1850
|
+
return []
|
|
1851
|
+
return [w for w in self.parent_grapheme.characters if w.char_idx < self.char_idx]
|
|
1852
|
+
|
|
1853
|
+
@cached_property
|
|
1854
|
+
def _next_chars(self) -> List['CharToken']:
|
|
1855
|
+
return [w for w in self.parent_grapheme.characters if w.char_idx > self.char_idx]
|
|
1856
|
+
|
|
1857
|
+
# =========================================================================
|
|
1858
|
+
# CHARACTER CLASSIFICATION
|
|
1859
|
+
# =========================================================================
|
|
1860
|
+
|
|
1861
|
+
@cached_property
|
|
1862
|
+
def is_punct(self) -> bool:
|
|
1863
|
+
"""True if character is punctuation."""
|
|
1864
|
+
return self.surface in self.dialect.PUNCT_CHARS
|
|
1865
|
+
|
|
1866
|
+
@cached_property
|
|
1867
|
+
def is_vowel(self) -> bool:
|
|
1868
|
+
"""
|
|
1869
|
+
True if character represents a vowel (with or without diacritics).
|
|
1870
|
+
|
|
1871
|
+
Portuguese vowels: a, e, i, o, u
|
|
1872
|
+
With diacritics: á, à, â, ã, é, ê, í, ó, ô, õ, ú
|
|
1873
|
+
Archaic: è, ì, ò, ù, ẽ, ĩ, ũ, ä, ë, ï, ö, ü, ÿ
|
|
1874
|
+
"""
|
|
1875
|
+
return self.normalized in (
|
|
1876
|
+
self.dialect.VOWEL_CHARS |
|
|
1877
|
+
self.dialect.ACUTE_VOWEL_CHARS |
|
|
1878
|
+
self.dialect.GRAVE_VOWEL_CHARS |
|
|
1879
|
+
self.dialect.CIRCUM_VOWEL_CHARS |
|
|
1880
|
+
self.dialect.TILDE_VOWEL_CHARS |
|
|
1881
|
+
self.dialect.TREMA_VOWEL_CHARS
|
|
1882
|
+
)
|
|
1883
|
+
|
|
1884
|
+
@cached_property
|
|
1885
|
+
def is_semivowel(self) -> bool:
|
|
1886
|
+
"""
|
|
1887
|
+
True if character can function as semivowel (glide).
|
|
1888
|
+
|
|
1889
|
+
Semivowels in Portuguese:
|
|
1890
|
+
- [j]: written as 'i' or 'e'
|
|
1891
|
+
- [w]: written as 'u' or 'o'
|
|
1892
|
+
|
|
1893
|
+
Whether it actually IS a semivowel depends on position:
|
|
1894
|
+
- In diphthong: semivowel
|
|
1895
|
+
- As syllable nucleus: vowel
|
|
1896
|
+
"""
|
|
1897
|
+
return self.normalized in self.dialect.SEMIVOWEL_CHARS
|
|
1898
|
+
|
|
1899
|
+
@cached_property
|
|
1900
|
+
def is_consonant(self) -> bool:
|
|
1901
|
+
"""True if character represents a consonant."""
|
|
1902
|
+
return not self.is_vowel and not self.is_punct
|
|
1903
|
+
|
|
1904
|
+
@cached_property
|
|
1905
|
+
def is_nasal_vowel(self) -> bool:
|
|
1906
|
+
"""
|
|
1907
|
+
True if vowel is phonemically nasal.
|
|
1908
|
+
|
|
1909
|
+
Two orthographic realizations:
|
|
1910
|
+
1. Tilde: ã, õ (and archaic ẽ, ĩ, ũ)
|
|
1911
|
+
2. Vowel + nasal consonant: am, an, em, en, etc.
|
|
1912
|
+
"""
|
|
1913
|
+
if not self.is_vowel:
|
|
1914
|
+
return False
|
|
1915
|
+
|
|
1916
|
+
# Explicit tilde marking
|
|
1917
|
+
if self.normalized in self.dialect.TILDE_VOWEL_CHARS:
|
|
1918
|
+
return True
|
|
1919
|
+
|
|
1920
|
+
# Followed by m/n in coda position
|
|
1921
|
+
if self.next_char and self.next_char.normalized in "mn":
|
|
1922
|
+
# Check if next char is in coda (not before vowel)
|
|
1923
|
+
next_next = self.next_char.next_char
|
|
1924
|
+
if not next_next or next_next.is_consonant:
|
|
1925
|
+
return True
|
|
1926
|
+
|
|
1927
|
+
return False
|
|
1928
|
+
|
|
1929
|
+
@cached_property
|
|
1930
|
+
def is_foreign(self) -> bool:
|
|
1931
|
+
"""
|
|
1932
|
+
True if character is not in traditional Portuguese alphabet.
|
|
1933
|
+
|
|
1934
|
+
Foreign letters: k, w, y
|
|
1935
|
+
Used in: loanwords, foreign names, scientific terms
|
|
1936
|
+
"""
|
|
1937
|
+
return self.normalized in self.dialect.FOREIGN_CHARS
|
|
1938
|
+
|
|
1939
|
+
@cached_property
|
|
1940
|
+
def has_diacritics(self) -> bool:
|
|
1941
|
+
"""True if character has diacritical marks."""
|
|
1942
|
+
return self.normalized in (
|
|
1943
|
+
self.dialect.ACUTE_VOWEL_CHARS |
|
|
1944
|
+
self.dialect.GRAVE_VOWEL_CHARS |
|
|
1945
|
+
self.dialect.CIRCUM_VOWEL_CHARS |
|
|
1946
|
+
self.dialect.TILDE_VOWEL_CHARS |
|
|
1947
|
+
self.dialect.TREMA_VOWEL_CHARS
|
|
1948
|
+
)
|
|
1949
|
+
|
|
1950
|
+
@cached_property
def is_silent(self) -> bool:
    """
    Whether the character has no phonetic realization.

    Delegates to is_grapheme_silent(), passing this character's
    normalized form, its prefix and suffix, the normalized parent word
    (empty string when there is none) and the dialect.

    Categories covered by that helper:
    1. H: always silent (except in digraphs ch, nh, lh)
    2. U in QU/GU: silent before e/i (modern orthography)
    3. Archaic P: silent in mpc/mpç/mpt clusters
    4. First letter in doubled consonant digraphs
    """
    owner = self.parent_word
    whole_word = owner.normalized if owner else ""
    return is_grapheme_silent(
        self.normalized,
        self.prefix,
        self.suffix,
        whole_word,
        self.dialect,
    )
|
|
1970
|
+
|
|
1971
|
+
# =========================================================================
|
|
1972
|
+
# VOWEL QUALITY CLASSIFICATION
|
|
1973
|
+
# =========================================================================
|
|
1974
|
+
|
|
1975
|
+
@cached_property
|
|
1976
|
+
def is_open_vowel(self) -> bool:
|
|
1977
|
+
"""
|
|
1978
|
+
True if vowel is phonetically open (low tongue position).
|
|
1979
|
+
|
|
1980
|
+
Open vowels: [a, ɛ, ɔ]
|
|
1981
|
+
Marked with acute accent: á, é, ó
|
|
1982
|
+
|
|
1983
|
+
Linguistic note: Only a, e, o have open/closed distinction.
|
|
1984
|
+
i and u are always closed (high vowels).
|
|
1985
|
+
"""
|
|
1986
|
+
return self.normalized in self.dialect.ACUTE_VOWEL_CHARS or self.normalized == "a"
|
|
1987
|
+
|
|
1988
|
+
@cached_property
|
|
1989
|
+
def is_closed_vowel(self) -> bool:
|
|
1990
|
+
"""
|
|
1991
|
+
True if vowel is phonetically closed (high tongue position).
|
|
1992
|
+
|
|
1993
|
+
Closed vowels: [i, e, o, u, ɨ]
|
|
1994
|
+
High vowels i, u are always closed.
|
|
1995
|
+
Mid vowels e, o are closed when marked with circumflex: ê, ô
|
|
1996
|
+
"""
|
|
1997
|
+
return self.normalized in ["i", "u", "ê", "ô"]
|
|
1998
|
+
|
|
1999
|
+
# =========================================================================
|
|
2000
|
+
# POSITIONAL PROPERTIES
|
|
2001
|
+
# =========================================================================
|
|
2002
|
+
|
|
2003
|
+
@cached_property
|
|
2004
|
+
def is_first_word_letter(self) -> bool:
|
|
2005
|
+
"""True if this is the first letter of the word."""
|
|
2006
|
+
return self.idx_in_word == 0
|
|
2007
|
+
|
|
2008
|
+
@cached_property
|
|
2009
|
+
def is_last_word_letter(self) -> bool:
|
|
2010
|
+
"""True if this is the last letter of the word."""
|
|
2011
|
+
if not self.parent_word:
|
|
2012
|
+
return False
|
|
2013
|
+
return self.idx_in_word == len(self.parent_word.normalized) - 1
|
|
2014
|
+
|
|
2015
|
+
@cached_property
|
|
2016
|
+
def is_intervocalic(self) -> bool:
|
|
2017
|
+
"""
|
|
2018
|
+
True if character is between two vowels (V-C-V context).
|
|
2019
|
+
|
|
2020
|
+
Relevant for:
|
|
2021
|
+
- S voicing: casa [ˈkazɐ] (s → [z] between vowels)
|
|
2022
|
+
- R strengthening: caro vs carro
|
|
2023
|
+
"""
|
|
2024
|
+
prev_is_vowel = self.prev_char.is_vowel if self.prev_char else False
|
|
2025
|
+
next_is_vowel = self.next_char.is_vowel if self.next_char else False
|
|
2026
|
+
return prev_is_vowel and next_is_vowel
|
|
2027
|
+
|
|
2028
|
+
@cached_property
|
|
2029
|
+
def is_between_consonant_vowel(self) -> bool:
|
|
2030
|
+
"""
|
|
2031
|
+
True if pattern is C-S-V.
|
|
2032
|
+
|
|
2033
|
+
Relevant for S voicing rules.
|
|
2034
|
+
"""
|
|
2035
|
+
prev_is_cons = self.prev_char.is_consonant if self.prev_char else False
|
|
2036
|
+
next_is_vowel = self.next_char.is_vowel if self.next_char else False
|
|
2037
|
+
return prev_is_cons and next_is_vowel
|
|
2038
|
+
|
|
2039
|
+
@cached_property
|
|
2040
|
+
def is_between_vowel_consonant(self) -> bool:
|
|
2041
|
+
"""
|
|
2042
|
+
True if pattern is V-S-C.
|
|
2043
|
+
|
|
2044
|
+
Relevant for syllable-final consonant rules.
|
|
2045
|
+
"""
|
|
2046
|
+
prev_is_vowel = self.prev_char.is_vowel if self.prev_char else False
|
|
2047
|
+
next_is_cons = self.next_char.is_consonant if self.next_char else False
|
|
2048
|
+
return prev_is_vowel and next_is_cons
|
|
2049
|
+
|
|
2050
|
+
# =========================================================================
|
|
2051
|
+
# STRESS PROPERTIES
|
|
2052
|
+
# =========================================================================
|
|
2053
|
+
|
|
2054
|
+
@cached_property
|
|
2055
|
+
def has_primary_stress(self) -> bool:
|
|
2056
|
+
"""
|
|
2057
|
+
True if this vowel carries primary stress.
|
|
2058
|
+
|
|
2059
|
+
For diacritically marked vowels (á, é, etc.), stress is explicit.
|
|
2060
|
+
For unmarked vowels, stress is determined by syllable-level rules
|
|
2061
|
+
in the parent grapheme/word.
|
|
2062
|
+
"""
|
|
2063
|
+
# Explicit stress markers
|
|
2064
|
+
if self.normalized in self.dialect.PRIMARY_STRESS_MARKERS:
|
|
2065
|
+
return True
|
|
2066
|
+
|
|
2067
|
+
# Defer to parent grapheme's stress determination
|
|
2068
|
+
if self.parent_grapheme:
|
|
2069
|
+
return self.parent_grapheme.has_primary_stress
|
|
2070
|
+
|
|
2071
|
+
return False
|
|
2072
|
+
|
|
2073
|
+
@cached_property
|
|
2074
|
+
def has_secondary_stress(self) -> bool:
|
|
2075
|
+
"""
|
|
2076
|
+
True if this vowel carries secondary stress.
|
|
2077
|
+
|
|
2078
|
+
Circumflex and grave accents can mark secondary stress
|
|
2079
|
+
in compound words and some historical contexts.
|
|
2080
|
+
"""
|
|
2081
|
+
# Explicit secondary stress markers
|
|
2082
|
+
if self.normalized in self.dialect.SECONDARY_STRESS_MARKERS:
|
|
2083
|
+
return True
|
|
2084
|
+
|
|
2085
|
+
if self.is_vowel and self.prev_char and self.prev_char.normalized == "h":
|
|
2086
|
+
return True
|
|
2087
|
+
|
|
2088
|
+
# Defer to parent grapheme
|
|
2089
|
+
if self.parent_grapheme:
|
|
2090
|
+
return self.parent_grapheme.has_secondary_stress
|
|
2091
|
+
|
|
2092
|
+
return False
|
|
2093
|
+
|
|
2094
|
+
# =========================================================================
|
|
2095
|
+
# IPA GENERATION
|
|
2096
|
+
# =========================================================================
|
|
2097
|
+
|
|
2098
|
+
def _ipa_for_vowel(self) -> str:
|
|
2099
|
+
"""
|
|
2100
|
+
Generate IPA for vowel character.
|
|
2101
|
+
|
|
2102
|
+
VOWEL REALIZATION RULES:
|
|
2103
|
+
------------------------
|
|
2104
|
+
1. Explicit quality: á→[a], é→[ɛ], ê→[e], ó→[ɔ], ô→[o]
|
|
2105
|
+
2. Stress-dependent:
|
|
2106
|
+
- Stressed a → [a]
|
|
2107
|
+
- Unstressed a → [ɐ]
|
|
2108
|
+
- Stressed e → [ɛ] or [e] (depends on syllable)
|
|
2109
|
+
- Unstressed e → [ɨ] (European) or [e] (Brazilian)
|
|
2110
|
+
3. Nasal: ã→[ɐ̃], õ→[õ], a+m/n→[ɐ̃], etc.
|
|
2111
|
+
|
|
2112
|
+
Returns:
|
|
2113
|
+
IPA string for this vowel
|
|
2114
|
+
"""
|
|
2115
|
+
s = self.normalized
|
|
2116
|
+
|
|
2117
|
+
# Explicit diacritical marking
|
|
2118
|
+
if s in self.dialect.DEFAULT_CHAR2PHONEMES:
|
|
2119
|
+
base_ipa = self.dialect.DEFAULT_CHAR2PHONEMES[s]
|
|
2120
|
+
|
|
2121
|
+
word = self.parent_word.normalized if self.parent_word else ""
|
|
2122
|
+
|
|
2123
|
+
# TODO: per dialect handling
|
|
2124
|
+
|
|
2125
|
+
# Special case: Single-vowel words
|
|
2126
|
+
if word == "a":
|
|
2127
|
+
return "ɐ"
|
|
2128
|
+
elif word == "e":
|
|
2129
|
+
return "i"
|
|
2130
|
+
elif word == "é":
|
|
2131
|
+
return "ɛ"
|
|
2132
|
+
elif word == "o":
|
|
2133
|
+
return "u"
|
|
2134
|
+
|
|
2135
|
+
# Special case: prepositions
|
|
2136
|
+
preps = ["a", "o", "as", "os",
|
|
2137
|
+
"de", "em", "por"]
|
|
2138
|
+
# Special case: determinants
|
|
2139
|
+
dets = ["da", "do", "das", "dos"]
|
|
2140
|
+
# Special case: contractions
|
|
2141
|
+
# em a/o -> na/o
|
|
2142
|
+
# para -> pra | para a -> prá
|
|
2143
|
+
contr = ["na", "no", "nas", "nos", "pra"]
|
|
2144
|
+
# Special case: oblique pronouns
|
|
2145
|
+
prons = ["me", "te", "se",
|
|
2146
|
+
"le", "lo", "la",
|
|
2147
|
+
"les", "los", "las",
|
|
2148
|
+
"lhe", "lho", "lha",
|
|
2149
|
+
"lhes", "lhos", "lhas"]
|
|
2150
|
+
if word in preps + dets + prons + contr:
|
|
2151
|
+
# Brazilian Portuguese: less reduction
|
|
2152
|
+
if self.dialect.dialect_code.startswith("pt-BR"):
|
|
2153
|
+
if s == "a":
|
|
2154
|
+
return "a" # Less reduction
|
|
2155
|
+
if s == "e":
|
|
2156
|
+
return "e" # Less reduction
|
|
2157
|
+
if s == "o":
|
|
2158
|
+
return "o" # Less reduction
|
|
2159
|
+
else:
|
|
2160
|
+
# European/African: more reduction
|
|
2161
|
+
if s == "a":
|
|
2162
|
+
return "ɐ"
|
|
2163
|
+
if s == "e":
|
|
2164
|
+
return "ɨ"
|
|
2165
|
+
if s == "o":
|
|
2166
|
+
return "u"
|
|
2167
|
+
|
|
2168
|
+
# Override with stress-based quality for ambiguous vowels
|
|
2169
|
+
if s == "a":
|
|
2170
|
+
return "a" if self.has_primary_stress or self.has_secondary_stress else "ɐ"
|
|
2171
|
+
elif s == "e":
|
|
2172
|
+
if self.dialect.dialect_code.startswith("pt-PT"):
|
|
2173
|
+
return "ɛ" if self.has_primary_stress else "ɨ"
|
|
2174
|
+
return "ɛ" if self.has_primary_stress else "e"
|
|
2175
|
+
elif s == "o":
|
|
2176
|
+
return "ɔ" if self.has_primary_stress or self.has_secondary_stress else "u"
|
|
2177
|
+
|
|
2178
|
+
return base_ipa
|
|
2179
|
+
|
|
2180
|
+
return s # Fallback
|
|
2181
|
+
|
|
2182
|
+
def _ipa_for_consonant(self) -> str:
    """
    Generate IPA for a consonant character.

    CONTEXT-SENSITIVE CONSONANT RULES (checked in this order):
    -----------------------------------------------------------
    1. T/D: [tʃ]/[dʒ] before "i" (Brazilian only)
    2. L: [w] word-finally or before a consonant (Brazilian only)
    3. C: [s] before a front vowel
    4. G: [ʒ] before a front vowel
    5. R: strong R word-initially or after l/n/s
       ([h] Brazilian, [ʁ] European, [r] otherwise)
    6. S: [z] intervocalically; between consonant and vowel [s],
       except [z] in the trans- prefix
    7. X: delegated to ``_ipa_for_x``
    8. Z: word-finally [s] (Brazilian) or [ʃ] (other dialects)
    9. L: word-finally [ɫ] (European)
    10. Anything else: ``dialect.DEFAULT_CHAR2PHONEMES`` lookup, falling
        back to the character itself.

    Returns:
        IPA string for this consonant
    """
    s = self.normalized
    next_char = self.next_char.normalized if self.next_char else ""
    prev_char = self.prev_char.normalized if self.prev_char else ""

    dialect_code = self.dialect.dialect_code
    is_brazilian = dialect_code.startswith("pt-BR")
    is_european = dialect_code.startswith("pt-PT")

    if is_brazilian:
        # t/d palatalization before [i]
        if next_char == "i" and s in ("t", "d"):
            return "tʃ" if s == "t" else "dʒ"

        # L-vocalization in coda position. Kept inside the Brazilian
        # branch so that the European dark-L rule below stays reachable.
        if s == "l" and (
            self.is_last_word_letter
            or (self.next_char and self.next_char.is_consonant)
        ):
            return "w"

    # C / G before front vowels → [s] / [ʒ].
    # `next_char` is checked for emptiness first so the membership test
    # cannot succeed vacuously if FRONT_VOWEL_CHARS happens to be a str.
    if s in ("c", "g") and next_char and next_char in self.dialect.FRONT_VOWEL_CHARS:
        return "s" if s == "c" else "ʒ"

    # Strong R: word-initial, or after l / n / s.
    # A tuple is used instead of the string "lns": `"" in "lns"` is True,
    # which would treat a missing previous character as a trigger.
    if s == "r" and (self.is_first_word_letter or prev_char in ("l", "n", "s")):
        if is_brazilian:
            return "h"  # Brazilian [h] or [x]
        if is_european:
            return "ʁ"  # European uvular
        return "r"  # African/Timorese alveolar trill

    if s == "s":
        # S between vowels → [z]
        if self.is_intervocalic:
            return "z"

        # S between consonant and vowel → [s], except in the trans- prefix
        if self.is_between_consonant_vowel:
            word = self.parent_word.normalized if self.parent_word else ""
            in_trans_prefix = (
                word.startswith(("trans", "trâns")) and self.idx_in_word == 4
            )
            if in_trans_prefix and self.next_char and self.next_char.is_vowel:
                # e.g. transação [tɾɐ̃zɐˈsɐ̃w]
                return "z"
            return "s"

    # X rules (complex, context-dependent)
    if s == "x":
        return self._ipa_for_x()

    # Z word-finally → [s] (Brazilian) or [ʃ] (European/African)
    if s == "z" and self.is_last_word_letter:
        return "s" if is_brazilian else "ʃ"

    # L word-finally → dark L in European Portuguese
    if s == "l" and is_european and self.is_last_word_letter:
        return "ɫ"

    # Default mapping
    return self.dialect.DEFAULT_CHAR2PHONEMES.get(s, s)
|
|
2274
|
+
|
|
2275
|
+
def _ipa_for_x(self) -> str:
    """
    Generate IPA for the letter X (highly context-dependent).

    X PRONUNCIATION RULES (checked in this order):
    ----------------------------------------------
    1. Word-initial: [ʃ] - xadrez, xícara
    2. Word-final: [ks] - tórax, fax
    3. Intervocalic:
        a. [gz]: word starts with "hexa" and X is its third letter
        b. [s]: word starts with "próxim" and X is its fourth letter
        c. [z]: X closes a word-initial ex- prefix (optionally preceded
           by in-, co-, re-, pre-) and is followed by a plain or
           acute-accented vowel - exemplo, exato, inexato
        d. [ks]: after "e" or an acute-accented vowel - sexo, tóxico
           (exception: after "ú" → [ʃ], esdrúxulo)
        e. [ʃ]: every other intervocalic X - peixe, caixa
    4. Anything else: [ʃ]

    Rule (c) deliberately checks the word prefix. Matching any
    "e" + X + vowel sequence would swallow rule (d) and turn words such
    as "sexo" into [z].

    Returns:
        IPA string for X
    """
    prev_char = self.prev_char.normalized if self.prev_char else ""
    next_char = self.next_char.normalized if self.next_char else ""
    word = self.parent_word.normalized if self.parent_word else ""

    # Word-initial: [ʃ]
    if self.is_first_word_letter:
        return "ʃ"

    # Word-final: [ks]
    if self.is_last_word_letter:
        return "ks"

    # Outside the intervocalic context only the default applies
    if not self.is_intervocalic:
        return "ʃ"

    position = self.idx_in_word

    # hexa- prefix: [gz] variant
    if word.startswith("hexa") and position == 2:
        return "gz"

    # próxim-: [s]
    if word.startswith("próxim") and position == 3:
        return "s"

    # ex- prefix (bare or behind in-/co-/re-/pre-) before a vowel: [z]
    ex_prefixes = ("ex", "inex", "coex", "reex", "preex")
    voicing_vowels = frozenset("aeiouáéíóú")
    if prev_char == "e" and next_char in voicing_vowels:
        # The X must be the last letter of the prefix itself
        if any(word.startswith(p) and position == len(p) - 1 for p in ex_prefixes):
            return "z"

    # After "e" or an acute-accented vowel: [ks]
    if prev_char in self.dialect.ACUTE_VOWEL_CHARS | set("e"):
        # Exception: esdrúxulo keeps [ʃ]
        return "ʃ" if prev_char == "ú" else "ks"

    # Default intervocalic: [ʃ]
    return "ʃ"
|
|
2335
|
+
|
|
2336
|
+
@cached_property
def ipa(self) -> str:
    """
    Produce the IPA transcription of this single character.

    Steps, first match wins:
        1. Punctuation is mapped through ``dialect.PUNCT2IPA``; unknown
           punctuation falls back to ``dialect.HIATUS_TOKEN``.
        2. Silent characters contribute nothing.
        3. Everything else is handed to the vowel or the consonant rule
           set, depending on ``is_vowel``.

    Returns:
        IPA string (empty for silent characters)
    """
    # 1. Punctuation → prosodic marker
    if self.is_punct:
        punct_table = self.dialect.PUNCT2IPA
        return punct_table.get(self.normalized, self.dialect.HIATUS_TOKEN)

    # 2. Silent characters emit no sound
    if self.is_silent:
        return ""

    # 3. Pick the rule set matching the character class and apply it
    transcribe = self._ipa_for_vowel if self.is_vowel else self._ipa_for_consonant
    return transcribe()
|
|
2364
|
+
|
|
2365
|
+
# =========================================================================
|
|
2366
|
+
# FEATURE EXTRACTION
|
|
2367
|
+
# =========================================================================
|
|
2368
|
+
|
|
2369
|
+
@property
def features(self) -> Dict[str, any]:
    """
    Collect this character's linguistic features into one dictionary.

    Handy for machine-learning feature vectors, debugging and
    linguistic analysis.

    Returns:
        Dictionary mapping feature names to values, in a fixed key order
    """
    # NOTE(review): the `any` in the return annotation is the builtin
    # function, not `typing.Any`; left untouched here.

    # Output keys whose backing attribute has a different name
    attribute_for = {
        "text": "normalized",
        "is_first_letter": "is_first_word_letter",
        "is_last_letter": "is_last_word_letter",
    }
    # Output keys, in emission order
    feature_keys = (
        "text",
        "ipa",
        "is_first_letter",
        "is_last_letter",
        "is_punct",
        "is_vowel",
        "is_semivowel",
        "is_nasal_vowel",
        "is_open_vowel",
        "is_closed_vowel",
        "is_consonant",
        "is_foreign",
        "is_silent",
        "is_intervocalic",
        "is_between_consonant_vowel",
        "is_between_vowel_consonant",
        "has_diacritics",
        "has_primary_stress",
        "has_secondary_stress",
    )
    return {
        key: getattr(self, attribute_for.get(key, key))
        for key in feature_keys
    }
|
|
2403
|
+
|
|
2404
|
+
def __eq__(self, other) -> bool:
    """Compare against a plain string by surface form; otherwise defer to the parent class."""
    if not isinstance(other, str):
        # Non-string operands get the inherited comparison
        return super().__eq__(other)
    return self.surface == other
|
|
2409
|
+
|
|
2410
|
+
def __repr__(self) -> str:
    """Debug representation showing the surface form and its IPA."""
    return "CharToken('{}' → [{}])".format(self.surface, self.ipa)
|
|
2413
|
+
|
|
2414
|
+
|
|
2415
|
+
# =============================================================================
|
|
2416
|
+
# GRAPHEME TOKEN
|
|
2417
|
+
# =============================================================================
|
|
2418
|
+
|
|
2419
|
+
@dataclasses.dataclass
|
|
2420
|
+
class GraphemeToken:
|
|
2421
|
+
"""
|
|
2422
|
+
Represents a grapheme - the minimal distinctive unit of writing.
|
|
2423
|
+
|
|
2424
|
+
GRAPHEME DEFINITION:
|
|
2425
|
+
--------------------
|
|
2426
|
+
A grapheme is the smallest unit of a writing system.
|
|
2427
|
+
In alphabetic systems like Portuguese, graphemes can be:
|
|
2428
|
+
- Single letters: a, b, c
|
|
2429
|
+
- Digraphs: ch, nh, lh, rr, ss
|
|
2430
|
+
- Diphthongs: ai, ou, ei
|
|
2431
|
+
- Trigraphs: que, gui, coo
|
|
2432
|
+
- Tetragraphs: aien (rare)
|
|
2433
|
+
|
|
2434
|
+
LINGUISTIC MOTIVATION:
|
|
2435
|
+
----------------------
|
|
2436
|
+
Portuguese orthography uses multi-character sequences to represent:
|
|
2437
|
+
1. Single phonemes: ch → [ʃ], nh → [ɲ]
|
|
2438
|
+
2. Phoneme clusters: qu → [kw] or [k] depending on context
|
|
2439
|
+
3. Diphthongs: ai → [aj], ou → [ow]
|
|
2440
|
+
|
|
2441
|
+
Tokenizing at grapheme level (not character level) respects
|
|
2442
|
+
the structure of the writing system.
|
|
2443
|
+
|
|
2444
|
+
Attributes:
|
|
2445
|
+
surface: The grapheme string (1-4 characters)
|
|
2446
|
+
grapheme_idx: Position in parent word's grapheme list
|
|
2447
|
+
syllable_idx: Which syllable this grapheme belongs to
|
|
2448
|
+
characters: List of CharToken objects composing this grapheme
|
|
2449
|
+
parent_word: WordToken containing this grapheme
|
|
2450
|
+
dialect: DialectInventory with rules
|
|
2451
|
+
"""
|
|
2452
|
+
|
|
2453
|
+
surface: str
|
|
2454
|
+
grapheme_idx: int = 0 # parent_word.graphemes[idx] == self
|
|
2455
|
+
syllable_idx: int = 0 # parent_word.normalized_syllables[idx] == self.surface
|
|
2456
|
+
characters: List[CharToken] = dataclasses.field(default_factory=list)
|
|
2457
|
+
parent_word: Optional["WordToken"] = None
|
|
2458
|
+
dialect: DialectInventory = dataclasses.field(default_factory=EuropeanPortuguese)
|
|
2459
|
+
|
|
2460
|
+
# Precomputed indices
|
|
2461
|
+
_idx_in_word: int = -1
|
|
2462
|
+
_idx_in_sentence: int = -1
|
|
2463
|
+
|
|
2464
|
+
def __post_init__(self):
|
|
2465
|
+
"""
|
|
2466
|
+
Initialize character tokens and compute indices.
|
|
2467
|
+
|
|
2468
|
+
Characters are created and their indices are computed here
|
|
2469
|
+
to avoid circular dependencies during IPA generation.
|
|
2470
|
+
"""
|
|
2471
|
+
if not self.characters:
|
|
2472
|
+
self.characters = [
|
|
2473
|
+
CharToken(
|
|
2474
|
+
surface=c,
|
|
2475
|
+
char_idx=i,
|
|
2476
|
+
parent_grapheme=self,
|
|
2477
|
+
dialect=self.dialect
|
|
2478
|
+
)
|
|
2479
|
+
for i, c in enumerate(self.surface)
|
|
2480
|
+
]
|
|
2481
|
+
|
|
2482
|
+
# =========================================================================
|
|
2483
|
+
# BASIC PROPERTIES
|
|
2484
|
+
# =========================================================================
|
|
2485
|
+
|
|
2486
|
+
@cached_property
|
|
2487
|
+
def normalized(self) -> str:
|
|
2488
|
+
"""Lowercase form of grapheme."""
|
|
2489
|
+
return self.surface.lower()
|
|
2490
|
+
|
|
2491
|
+
@property
|
|
2492
|
+
def n_chars(self) -> int:
|
|
2493
|
+
"""Number of characters in this grapheme."""
|
|
2494
|
+
return len(self.characters)
|
|
2495
|
+
|
|
2496
|
+
@property
|
|
2497
|
+
def first_char(self) -> CharToken:
|
|
2498
|
+
"""First character of grapheme."""
|
|
2499
|
+
return self.characters[0]
|
|
2500
|
+
|
|
2501
|
+
@property
|
|
2502
|
+
def last_char(self) -> CharToken:
|
|
2503
|
+
"""Last character of grapheme."""
|
|
2504
|
+
return self.characters[-1]
|
|
2505
|
+
|
|
2506
|
+
# =========================================================================
|
|
2507
|
+
# INDICES AND CONTEXT
|
|
2508
|
+
# =========================================================================
|
|
2509
|
+
|
|
2510
|
+
@property
|
|
2511
|
+
def idx_in_word(self) -> int:
|
|
2512
|
+
"""Character index of first char in parent word."""
|
|
2513
|
+
return self._idx_in_word
|
|
2514
|
+
|
|
2515
|
+
@property
|
|
2516
|
+
def idx_in_sentence(self) -> int:
|
|
2517
|
+
"""Character index of first char in parent sentence."""
|
|
2518
|
+
return self._idx_in_sentence
|
|
2519
|
+
|
|
2520
|
+
@cached_property
|
|
2521
|
+
def parent_sentence(self) -> Optional['Sentence']:
|
|
2522
|
+
"""The sentence containing this grapheme."""
|
|
2523
|
+
if not self.parent_word:
|
|
2524
|
+
return None
|
|
2525
|
+
return self.parent_word.parent_sentence
|
|
2526
|
+
|
|
2527
|
+
@cached_property
|
|
2528
|
+
def parent_syllable(self) -> Optional[str]:
|
|
2529
|
+
"""The syllable string containing this grapheme."""
|
|
2530
|
+
if not self.parent_word or self.syllable_idx < 0:
|
|
2531
|
+
return None
|
|
2532
|
+
if self.syllable_idx >= len(self.parent_word.syllables):
|
|
2533
|
+
return None
|
|
2534
|
+
return self.parent_word.syllables[self.syllable_idx]
|
|
2535
|
+
|
|
2536
|
+
# ------------------------------------------------------------------
|
|
2537
|
+
# Prefix/suffix context
|
|
2538
|
+
# ------------------------------------------------------------------
|
|
2539
|
+
@property
|
|
2540
|
+
def prefix(self) -> str:
|
|
2541
|
+
"""
|
|
2542
|
+
All text before this grapheme in the word.
|
|
2543
|
+
|
|
2544
|
+
Used for checking morphological boundaries (prefixes).
|
|
2545
|
+
Example: In "biauricular", prefix of "au" is "bi"
|
|
2546
|
+
"""
|
|
2547
|
+
if not self.parent_word:
|
|
2548
|
+
return ""
|
|
2549
|
+
|
|
2550
|
+
prev_graphemes = [
|
|
2551
|
+
g.normalized for g in self.parent_word.graphemes
|
|
2552
|
+
if g.grapheme_idx < self.grapheme_idx
|
|
2553
|
+
]
|
|
2554
|
+
return "".join(prev_graphemes)
|
|
2555
|
+
|
|
2556
|
+
@property
|
|
2557
|
+
def suffix(self) -> str:
|
|
2558
|
+
"""
|
|
2559
|
+
All text after this grapheme in the word.
|
|
2560
|
+
|
|
2561
|
+
Used for checking word endings and contexts.
|
|
2562
|
+
"""
|
|
2563
|
+
if not self.parent_word:
|
|
2564
|
+
return ""
|
|
2565
|
+
|
|
2566
|
+
next_graphemes = [
|
|
2567
|
+
g.normalized for g in self.parent_word.graphemes
|
|
2568
|
+
if g.grapheme_idx > self.grapheme_idx
|
|
2569
|
+
]
|
|
2570
|
+
return "".join(next_graphemes)
|
|
2571
|
+
|
|
2572
|
+
@cached_property
|
|
2573
|
+
def prev_grapheme(self) -> Optional['GraphemeToken']:
|
|
2574
|
+
"""Previous grapheme in word, or None if first."""
|
|
2575
|
+
if self.grapheme_idx == 0 or not self.parent_word:
|
|
2576
|
+
return None
|
|
2577
|
+
return self.parent_word.graphemes[self.grapheme_idx - 1]
|
|
2578
|
+
|
|
2579
|
+
@cached_property
|
|
2580
|
+
def next_grapheme(self) -> Optional['GraphemeToken']:
|
|
2581
|
+
"""Next grapheme in word, or None if last."""
|
|
2582
|
+
if not self.parent_word:
|
|
2583
|
+
return None
|
|
2584
|
+
if self.grapheme_idx >= len(self.parent_word.graphemes) - 1:
|
|
2585
|
+
return None
|
|
2586
|
+
return self.parent_word.graphemes[self.grapheme_idx + 1]
|
|
2587
|
+
|
|
2588
|
+
# ------------------------------------------------------------------
|
|
2589
|
+
# Syllabic context
|
|
2590
|
+
# ------------------------------------------------------------------
|
|
2591
|
+
@cached_property
|
|
2592
|
+
def prev_syllable(self) -> Optional[str]:
|
|
2593
|
+
"""Previous syllable string, or None if first."""
|
|
2594
|
+
if self.syllable_idx == 0 or not self.parent_syllable:
|
|
2595
|
+
return None
|
|
2596
|
+
return self.parent_word.normalized_syllables[self.syllable_idx - 1]
|
|
2597
|
+
|
|
2598
|
+
@cached_property
|
|
2599
|
+
def next_syllable(self) -> Optional[str]:
|
|
2600
|
+
"""Next syllable string, or None if last."""
|
|
2601
|
+
if self.syllable_idx == -1 or not self.parent_syllable:
|
|
2602
|
+
return None
|
|
2603
|
+
if self.syllable_idx >= len(self.parent_word.normalized_syllables) - 1:
|
|
2604
|
+
return None
|
|
2605
|
+
return self.parent_word.normalized_syllables[self.syllable_idx + 1]
|
|
2606
|
+
|
|
2607
|
+
# =========================================================================
|
|
2608
|
+
# GRAPHEME CLASSIFICATION
|
|
2609
|
+
# =========================================================================
|
|
2610
|
+
|
|
2611
|
+
@cached_property
|
|
2612
|
+
def is_archaism(self) -> bool:
|
|
2613
|
+
"""
|
|
2614
|
+
True if grapheme uses archaic orthography.
|
|
2615
|
+
|
|
2616
|
+
Archaic patterns:
|
|
2617
|
+
1. Trema (ü): Pre-1945/2009 marker for pronounced u
|
|
2618
|
+
2. Grave accents (except à): Pre-1973 secondary stress
|
|
2619
|
+
3. Archaic words with ph, mpt, mpc, mpç
|
|
2620
|
+
4. Obsolete circumflex: êle
|
|
2621
|
+
|
|
2622
|
+
These may appear in historical texts or proper names.
|
|
2623
|
+
"""
|
|
2624
|
+
s = self.normalized
|
|
2625
|
+
|
|
2626
|
+
# Trema (abolished 1945/2009)
|
|
2627
|
+
if "ü" in s:
|
|
2628
|
+
return True
|
|
2629
|
+
|
|
2630
|
+
# Grave accents (except à contraction)
|
|
2631
|
+
archaic_graves = [c for c in self.dialect.GRAVE_VOWEL_CHARS if c != "à"]
|
|
2632
|
+
if any(c in s for c in archaic_graves):
|
|
2633
|
+
return True
|
|
2634
|
+
|
|
2635
|
+
# In paroxytones, when the same form existed with an open and a closed vowel, a circumflex accent was placed in the word with the closed vowel.
|
|
2636
|
+
# Example: êle (“he”) (/ˈe.li/) and ele (“name of the letter L”) (/ˈɛ.li/).
|
|
2637
|
+
# This usage was made obsolete by the 1945 spelling reform in Portugal, and by the 1971 spelling reform in Brazil.
|
|
2638
|
+
archaic_words = ["êle"]
|
|
2639
|
+
if s in archaic_words:
|
|
2640
|
+
return True
|
|
2641
|
+
|
|
2642
|
+
# ph -> /f/ eg. "pharmacia"
|
|
2643
|
+
if s == "ph":
|
|
2644
|
+
return True
|
|
2645
|
+
|
|
2646
|
+
# Quando, nas seqüências interiores "mpc", "mpç" e "mpt" se eliminar o "p",
|
|
2647
|
+
# o "m" passa a "n", escrevendo-se, respectivamente "nc", "nç" e "nt":
|
|
2648
|
+
if s in self.dialect.ARCHAIC_MUTE_P:
|
|
2649
|
+
# NOTE: a word list is needed, in modern orthography none of the letters is silent
|
|
2650
|
+
# we do not know if input text is modern or archaic (before acordo ortográfico)
|
|
2651
|
+
# exemplos:
|
|
2652
|
+
# assumpcionista e assuncionista;
|
|
2653
|
+
# assumpção e assunção;
|
|
2654
|
+
# assumptível e assuntível;
|
|
2655
|
+
# peremptório e perentório,
|
|
2656
|
+
# sumptuoso e suntuoso,
|
|
2657
|
+
# sumptuosidade e suntuosidade
|
|
2658
|
+
return self.parent_word.normalized in self.dialect.ARCHAIC_MUTE_P[s]
|
|
2659
|
+
return False
|
|
2660
|
+
|
|
2661
|
+
@cached_property
|
|
2662
|
+
def is_nasal(self) -> bool:
|
|
2663
|
+
"""
|
|
2664
|
+
True if grapheme represents nasal sound(s).
|
|
2665
|
+
|
|
2666
|
+
Nasal patterns:
|
|
2667
|
+
1. Nasal digraphs: am, an, em, en, im, in, om, on, um, un
|
|
2668
|
+
2. Tilde vowels: ã, õ (and archaic ẽ, ĩ, ũ)
|
|
2669
|
+
3. Nasal diphthongs: ão, ãe, õe, em (final)
|
|
2670
|
+
"""
|
|
2671
|
+
s = self.normalized
|
|
2672
|
+
|
|
2673
|
+
# Nasal digraph lookup
|
|
2674
|
+
if s in self.dialect.NASAL_DIGRAPHS:
|
|
2675
|
+
return True
|
|
2676
|
+
|
|
2677
|
+
# Tilde vowels
|
|
2678
|
+
if any(c in s for c in self.dialect.TILDE_VOWEL_CHARS):
|
|
2679
|
+
return True
|
|
2680
|
+
|
|
2681
|
+
return False
|
|
2682
|
+
|
|
2683
|
+
# ------------------------------------------------------------------
|
|
2684
|
+
# Diphthong classification
|
|
2685
|
+
# ------------------------------------------------------------------
|
|
2686
|
+
@property
|
|
2687
|
+
def is_vocalic_hiatus(self) -> bool:
|
|
2688
|
+
# Hiato é quando duas vogais estão juntas porém em sílabas vizinhas.
|
|
2689
|
+
# O hiato diferencia-se de um ditongo e de um tritongo pelo fato de ser constituído por duas sílabas e,
|
|
2690
|
+
# consequentemente, ser pronunciado em dois esforços de voz.
|
|
2691
|
+
|
|
2692
|
+
# Os outros casos que na escrita costumam estar representados por «i» + vogal ou «u» mais vogal
|
|
2693
|
+
# (ou, no português europeu, «e» + vogal ou «o» + vogal),
|
|
2694
|
+
# costumam ser considerados como hiatos.
|
|
2695
|
+
return False # TODO
|
|
2696
|
+
|
|
2697
|
+
@cached_property
|
|
2698
|
+
def is_diphthong(self) -> bool:
|
|
2699
|
+
"""
|
|
2700
|
+
True if grapheme represents a diphthong.
|
|
2701
|
+
|
|
2702
|
+
Diphthongs are two-vowel sequences in one syllable.
|
|
2703
|
+
Examples: ai, ei, ou, ão, ãe
|
|
2704
|
+
|
|
2705
|
+
Note: Diphthong vs hiatus is determined by syllabification.
|
|
2706
|
+
Same spelling can be different:
|
|
2707
|
+
- caiu [kɐˈju]: hiatus (ca.iu, two syllables)
|
|
2708
|
+
- cai [ˈkaj]: diphthong (one syllable)
|
|
2709
|
+
"""
|
|
2710
|
+
s = self.normalized
|
|
2711
|
+
# Observação: qu"em : não é um encontro vocálico, pois não se pronuncia o U.
|
|
2712
|
+
# Portanto, "qu" é um dígrafo e "ue" não é um ditongo.
|
|
2713
|
+
if s == "ue" and self.parent_word.normalized == "quem":
|
|
2714
|
+
return False
|
|
2715
|
+
return s in self.dialect.DIPHTHONG2IPA
|
|
2716
|
+
|
|
2717
|
+
@cached_property
|
|
2718
|
+
def is_triphthong(self) -> bool:
|
|
2719
|
+
"""
|
|
2720
|
+
True if grapheme represents a triphthong.
|
|
2721
|
+
|
|
2722
|
+
Triphthongs are three-vowel sequences in one syllable: G-V-G
|
|
2723
|
+
Examples: miau [ˈmjaw], Uruguai [uɾuˈɡwaj]
|
|
2724
|
+
|
|
2725
|
+
Very rare in Portuguese.
|
|
2726
|
+
"""
|
|
2727
|
+
return self.normalized in self.dialect.TRIPHTHONG2IPA
|
|
2728
|
+
|
|
2729
|
+
@cached_property
|
|
2730
|
+
def is_falling_diphthong(self) -> bool:
|
|
2731
|
+
"""
|
|
2732
|
+
True if diphthong has vowel before semivowel (V-G).
|
|
2733
|
+
|
|
2734
|
+
Examples: pai, rei, meu, céu
|
|
2735
|
+
Direction: vowel [a] → glide [j]
|
|
2736
|
+
|
|
2737
|
+
Most Portuguese diphthongs are falling.
|
|
2738
|
+
"""
|
|
2739
|
+
if not self.is_diphthong:
|
|
2740
|
+
return False
|
|
2741
|
+
|
|
2742
|
+
if self.dialect.dialect_code.startswith("pt-BR"):
|
|
2743
|
+
# Em muitos dialetos brasileiros, devido à Vocalização do fonema /l/ em fim de sílaba,
|
|
2744
|
+
# também são considerados ditongos decrescentes os seguintes casos.
|
|
2745
|
+
if self.normalized in self.dialect.PTBR_DIPHTHONGS.values():
|
|
2746
|
+
# exemplos:
|
|
2747
|
+
# funil /fu.ˈniw/
|
|
2748
|
+
# feltro /few.tɾu/
|
|
2749
|
+
# mel /ˈmɛw/
|
|
2750
|
+
# mal /ˈmaw/
|
|
2751
|
+
# Sol /ˈsɔw/
|
|
2752
|
+
# soldado /sow.ˈda.du/
|
|
2753
|
+
# azul /aˈzuw/
|
|
2754
|
+
return True
|
|
2755
|
+
# Quando a vogal vem antes da semivogal, o ditongo é classificado como ditongo decrescente
|
|
2756
|
+
# exemplos:
|
|
2757
|
+
# leite /ˈlej.ti/ - /ˈlɐj.tɨ/ (Lisboa)
|
|
2758
|
+
# cai /ˈcaj/
|
|
2759
|
+
# dói /ˈdɔj/
|
|
2760
|
+
# foi /ˈfoj/
|
|
2761
|
+
# cuidado /cuj.ˈda.du/
|
|
2762
|
+
# viu /ˈviw/
|
|
2763
|
+
# meu /ˈmew/
|
|
2764
|
+
# céu /ˈcɛw/
|
|
2765
|
+
# mau /ˈmaw/
|
|
2766
|
+
# sou /ˈsow/
|
|
2767
|
+
return self.first_char.normalized not in self.dialect.SEMIVOWEL_CHARS
|
|
2768
|
+
|
|
2769
|
+
@cached_property
|
|
2770
|
+
def is_rising_diphthong(self) -> bool:
|
|
2771
|
+
"""
|
|
2772
|
+
True if diphthong has semivowel before vowel (G-V).
|
|
2773
|
+
|
|
2774
|
+
Examples: piano, água, qual
|
|
2775
|
+
Direction: glide [j] → vowel [a]
|
|
2776
|
+
|
|
2777
|
+
Less common than falling diphthongs in Portuguese.
|
|
2778
|
+
"""
|
|
2779
|
+
if not self.is_diphthong:
|
|
2780
|
+
return False
|
|
2781
|
+
return self.first_char.normalized in self.dialect.SEMIVOWEL_CHARS
|
|
2782
|
+
|
|
2783
|
+
@cached_property
|
|
2784
|
+
def is_nasal_diphthong(self) -> bool:
|
|
2785
|
+
"""
|
|
2786
|
+
True if diphthong is nasalized.
|
|
2787
|
+
|
|
2788
|
+
Examples: mãe [ˈmɐ̃j̃], cão [ˈkɐ̃w̃], põe [ˈpõj̃]
|
|
2789
|
+
|
|
2790
|
+
Nasalization extends across entire diphthong.
|
|
2791
|
+
"""
|
|
2792
|
+
if not self.is_diphthong:
|
|
2793
|
+
return False
|
|
2794
|
+
return self.first_char.normalized in self.dialect.TILDE_VOWEL_CHARS
|
|
2795
|
+
|
|
2796
|
+
@cached_property
|
|
2797
|
+
def is_oral_diphthong(self) -> bool:
|
|
2798
|
+
"""
|
|
2799
|
+
True if diphthong is oral (not nasal).
|
|
2800
|
+
|
|
2801
|
+
Examples: pai, rei, meu, boi
|
|
2802
|
+
"""
|
|
2803
|
+
return self.is_diphthong and not self.is_nasal_diphthong
|
|
2804
|
+
|
|
2805
|
+
@cached_property
|
|
2806
|
+
def is_digraph(self) -> bool:
|
|
2807
|
+
"""
|
|
2808
|
+
True if grapheme is a consonant digraph.
|
|
2809
|
+
|
|
2810
|
+
Consonant digraphs (two letters, one consonant phoneme):
|
|
2811
|
+
- nh [ɲ]: palatal nasal
|
|
2812
|
+
- lh [ʎ]: palatal lateral
|
|
2813
|
+
- ch [ʃ]: postalveolar fricative
|
|
2814
|
+
- rr [ʁ]: strong R
|
|
2815
|
+
- ss [s]: voiceless between vowels
|
|
2816
|
+
- ph [f]: archaic
|
|
2817
|
+
|
|
2818
|
+
Does NOT include nasal digraphs (am, em, etc.) - see is_nasal.
|
|
2819
|
+
"""
|
|
2820
|
+
return self.normalized in self.dialect.DIGRAPH2IPA
|
|
2821
|
+
|
|
2822
|
+
@cached_property
|
|
2823
|
+
def is_foreign_digraph(self) -> bool:
|
|
2824
|
+
"""
|
|
2825
|
+
True if grapheme is a foreign digraph.
|
|
2826
|
+
|
|
2827
|
+
Examples from loanwords:
|
|
2828
|
+
- sh [ʃ]: show, shopping
|
|
2829
|
+
- th [t]: thriller
|
|
2830
|
+
- ff [f]: graffiti
|
|
2831
|
+
- ll [l]: villa
|
|
2832
|
+
"""
|
|
2833
|
+
return self.normalized in self.dialect.FOREIGN_DIGRAPH2IPA
|
|
2834
|
+
|
|
2835
|
+
@cached_property
|
|
2836
|
+
def is_trigraph(self) -> bool:
|
|
2837
|
+
"""
|
|
2838
|
+
True if grapheme is a trigraph (3-letter unit).
|
|
2839
|
+
|
|
2840
|
+
Examples:
|
|
2841
|
+
- que, qui: q + u + vowel
|
|
2842
|
+
- coo: prefix boundary
|
|
2843
|
+
- ção: common suffix
|
|
2844
|
+
"""
|
|
2845
|
+
return self.normalized in self.dialect.TRIGRAM2IPA
|
|
2846
|
+
|
|
2847
|
+
@cached_property
|
|
2848
|
+
def is_consonant_hiatus(self) -> bool:
|
|
2849
|
+
"""
|
|
2850
|
+
True if grapheme is a consonant cluster spanning syllable boundary.
|
|
2851
|
+
|
|
2852
|
+
Examples:
|
|
2853
|
+
- ct: pac.to [ˈpak.tu]
|
|
2854
|
+
- cç: fic.ção [fik.ˈsɐ̃w]
|
|
2855
|
+
|
|
2856
|
+
These are NOT pronounced as single units; they split across syllables.
|
|
2857
|
+
"""
|
|
2858
|
+
return self.normalized in self.dialect.HETEROSYLLABIC_CLUSTERS
|
|
2859
|
+
|
|
2860
|
+
# =========================================================================
|
|
2861
|
+
# STRESS PROPERTIES
|
|
2862
|
+
# =========================================================================
|
|
2863
|
+
|
|
2864
|
+
@cached_property
|
|
2865
|
+
def has_primary_stress(self) -> bool:
|
|
2866
|
+
"""
|
|
2867
|
+
True if this grapheme carries primary word stress.
|
|
2868
|
+
|
|
2869
|
+
Stress determination:
|
|
2870
|
+
1. Explicit: any character has primary stress marker (á, é, ã, etc.)
|
|
2871
|
+
2. Implicit: this grapheme's syllable is the stressed syllable
|
|
2872
|
+
|
|
2873
|
+
For words with explicit accent marks, that syllable is stressed.
|
|
2874
|
+
For unmarked words, stress is predicted by word ending and syllable count.
|
|
2875
|
+
"""
|
|
2876
|
+
if self.parent_word.n_syllables == 1:
|
|
2877
|
+
return True
|
|
2878
|
+
# Check if any character in this grapheme has explicit primary stress
|
|
2879
|
+
if any(c.normalized in self.dialect.PRIMARY_STRESS_MARKERS for c in self.characters):
|
|
2880
|
+
return True
|
|
2881
|
+
|
|
2882
|
+
# Check if syllable-level stress applies to this grapheme's syllable
|
|
2883
|
+
if not self.parent_word:
|
|
2884
|
+
return False
|
|
2885
|
+
|
|
2886
|
+
# Determine stressed syllable index
|
|
2887
|
+
stressed_syllable_idx = detect_stress_position(
|
|
2888
|
+
self.parent_word.normalized,
|
|
2889
|
+
self.parent_word.syllables,
|
|
2890
|
+
self.dialect
|
|
2891
|
+
)
|
|
2892
|
+
|
|
2893
|
+
return self.syllable_idx == stressed_syllable_idx
|
|
2894
|
+
|
|
2895
|
+
@cached_property
|
|
2896
|
+
def has_secondary_stress(self) -> bool:
|
|
2897
|
+
"""
|
|
2898
|
+
True if this grapheme carries secondary stress.
|
|
2899
|
+
|
|
2900
|
+
Secondary stress occurs in:
|
|
2901
|
+
- Compound words: semi-automático
|
|
2902
|
+
- Long words with complex morphology
|
|
2903
|
+
- Historical grave accent usage (obsolete)
|
|
2904
|
+
|
|
2905
|
+
Marked by circumflex or grave accents in non-primary position.
|
|
2906
|
+
"""
|
|
2907
|
+
if self.has_primary_stress:
|
|
2908
|
+
return False
|
|
2909
|
+
|
|
2910
|
+
return any(
|
|
2911
|
+
c.normalized in self.dialect.SECONDARY_STRESS_MARKERS
|
|
2912
|
+
for c in self.characters
|
|
2913
|
+
)
|
|
2914
|
+
|
|
2915
|
+
# =========================================================================
|
|
2916
|
+
# IPA GENERATION
|
|
2917
|
+
# =========================================================================
|
|
2918
|
+
|
|
2919
|
+
@cached_property
|
|
2920
|
+
def ipa(self) -> str:
|
|
2921
|
+
"""
|
|
2922
|
+
Generate IPA transcription for this grapheme.
|
|
2923
|
+
|
|
2924
|
+
ALGORITHM:
|
|
2925
|
+
----------
|
|
2926
|
+
1. Check irregular word list (highest priority)
|
|
2927
|
+
2. Check multi-character lookups (tetragraph → trigraph → digraph)
|
|
2928
|
+
3. Fall back to character-by-character IPA
|
|
2929
|
+
|
|
2930
|
+
For multi-character graphemes (digraphs, diphthongs),
|
|
2931
|
+
the lookup returns a single IPA unit, not individual characters.
|
|
2932
|
+
|
|
2933
|
+
Returns:
|
|
2934
|
+
IPA string for this grapheme
|
|
2935
|
+
"""
|
|
2936
|
+
s = self.normalized
|
|
2937
|
+
word = self.parent_word.normalized if self.parent_word else ""
|
|
2938
|
+
|
|
2939
|
+
# Special case: "ui" nasalized in "muito"
|
|
2940
|
+
if s == "ui" and word == "muito":
|
|
2941
|
+
return "ũj"
|
|
2942
|
+
|
|
2943
|
+
# Check multi-character lookups (longest first)
|
|
2944
|
+
if s in self.dialect.TETRAGRAM2IPA:
|
|
2945
|
+
return self.dialect.TETRAGRAM2IPA[s]
|
|
2946
|
+
|
|
2947
|
+
if s in self.dialect.TRIGRAM2IPA:
|
|
2948
|
+
return self.dialect.TRIGRAM2IPA[s]
|
|
2949
|
+
|
|
2950
|
+
if s in self.dialect.NASAL_DIGRAPHS:
|
|
2951
|
+
return self.dialect.NASAL_DIGRAPHS[s]
|
|
2952
|
+
|
|
2953
|
+
if s in self.dialect.DIPHTHONG2IPA:
|
|
2954
|
+
return self.dialect.DIPHTHONG2IPA[s]
|
|
2955
|
+
|
|
2956
|
+
if s in self.dialect.DIGRAPH2IPA:
|
|
2957
|
+
return self.dialect.DIGRAPH2IPA[s]
|
|
2958
|
+
|
|
2959
|
+
if s in self.dialect.HETEROSYLLABIC_CLUSTERS:
|
|
2960
|
+
return self.dialect.HETEROSYLLABIC_CLUSTERS[s]
|
|
2961
|
+
|
|
2962
|
+
# Fall back to character-by-character
|
|
2963
|
+
return "".join(c.ipa for c in self.characters)
|
|
2964
|
+
|
|
2965
|
+
# =========================================================================
|
|
2966
|
+
# FEATURE EXTRACTION
|
|
2967
|
+
# =========================================================================
|
|
2968
|
+
|
|
2969
|
+
@property
def features(self) -> Dict[str, any]:
    """
    Collect this grapheme's linguistic features into one flat dictionary.

    The grapheme-level entries come first ("n_chars", "text", "ipa", the
    syllable links, then the boolean classification and stress flags).
    They are followed by every feature of every character, stored under
    the key "char_<char_idx>_<feature name>".

    Returns:
        Dictionary with grapheme features and flattened character features
    """
    # "text" is the only key whose name differs from the attribute it reads.
    collected = {"n_chars": self.n_chars, "text": self.normalized}

    # Remaining grapheme-level features share their name with the attribute.
    for attr in (
        "ipa",
        "parent_syllable",
        "prev_syllable",
        "next_syllable",
        "is_archaism",
        "is_nasal",
        "is_digraph",
        "is_trigraph",
        "is_foreign_digraph",
        "is_consonant_hiatus",
        "is_diphthong",
        "is_triphthong",
        "is_falling_diphthong",
        "is_rising_diphthong",
        "is_nasal_diphthong",
        "is_oral_diphthong",
        "has_primary_stress",
        "has_secondary_stress",
    ):
        collected[attr] = getattr(self, attr)

    # Flatten character-level features, namespaced by character index.
    collected.update(
        (f"char_{char.char_idx}_{key}", value)
        for char in self.characters
        for key, value in char.features.items()
    )
    return collected
|
|
3006
|
+
|
|
3007
|
+
def __eq__(self, other) -> bool:
    """
    Compare against a plain string by surface form.

    Anything that is not a ``str`` is handed to the parent class's
    ``__eq__`` unchanged.
    """
    if not isinstance(other, str):
        return super().__eq__(other)
    return self.surface == other
|
|
3012
|
+
|
|
3013
|
+
def __repr__(self) -> str:
    """Debug representation showing the surface form and its IPA."""
    surface, ipa = self.surface, self.ipa
    return f"GraphemeToken('{surface}' → [{ipa}])"
|
|
3016
|
+
|
|
3017
|
+
|
|
3018
|
+
# =============================================================================
|
|
3019
|
+
# WORD TOKEN
|
|
3020
|
+
# =============================================================================
|
|
3021
|
+
|
|
3022
|
+
@dataclasses.dataclass
class WordToken:
    """
    Represents a word with syllable structure and grapheme tokenization.

    LINGUISTIC STRUCTURE:
    ---------------------
    A word is analyzed at multiple levels:
    1. Orthographic: sequence of characters
    2. Graphemic: sequence of graphemes (digraphs, diphthongs, etc.)
    3. Syllabic: sequence of syllables
    4. Phonological: stress pattern and IPA transcription

    SYLLABIFICATION:
    ----------------
    Syllables are produced by the external ``syllabify()`` helper unless
    the caller supplies them. Before graphemes are cut, syllable
    boundaries that split a doubled consonant (rr, ss, ff, ll) are shifted
    so both letters land in the following syllable.

    STRESS ASSIGNMENT:
    ------------------
    Delegated to ``detect_stress_position()``; see ``stressed_syllable_idx``.

    Attributes:
        surface: The word as it appears in text
        word_idx: Position in parent sentence
        graphemes: List of GraphemeToken objects
        syllables: List of syllable strings
        parent_sentence: Sentence containing this word
        dialect: DialectInventory with rules
    """

    surface: str
    word_idx: int  # parent_sentence.words[word_idx] is this token
    graphemes: List[GraphemeToken] = dataclasses.field(default_factory=list)
    syllables: List[str] = dataclasses.field(default_factory=list)
    parent_sentence: Optional["Sentence"] = None
    dialect: DialectInventory = dataclasses.field(default_factory=EuropeanPortuguese)

    # Character offset of this word's first letter inside the sentence.
    # -1 means "not known" (e.g. a word built outside of a Sentence).
    _idx_in_sentence: int = -1

    def __post_init__(self):
        """
        Initialize syllables and graphemes with proper indexing.

        INITIALIZATION ORDER:
        ---------------------
        1. Syllabify the word, unless syllables were supplied
        2. Tokenize into graphemes, unless graphemes were supplied
        3. Stamp character offsets onto graphemes and their characters

        NOTE: step 3 reads ``_idx_in_sentence`` as it is at construction
        time. Assigning ``_idx_in_sentence`` afterwards does not update the
        graphemes unless ``_compute_indices()`` is called again.
        """
        # Step 1: Syllabification
        if not self.syllables:
            self.syllables = syllabify(self.normalized)

        # Step 2: Grapheme tokenization with syllable alignment
        if not self.graphemes:
            self.graphemes = self._tokenize_graphemes()

        # Step 3: Compute all indices top-down
        self._compute_indices()

    def _tokenize_graphemes(self) -> List[GraphemeToken]:
        """
        Tokenize word into graphemes aligned with syllables.

        TOKENIZATION STRATEGY:
        ----------------------
        Each normalized syllable (see ``_normalize_syllables``) is scanned
        left to right. At every position the first entry of
        ``dialect.GRAPHEME_INVENTORY`` that matches there is taken; when no
        entry matches, a single character is taken instead. Graphemes never
        span a syllable boundary because each syllable is scanned on its
        own.

        NOTE(review): "first match" equals "longest match" only if
        GRAPHEME_INVENTORY is ordered longest-first - presumably it is;
        confirm against the dialect definition.

        Returns:
            List of GraphemeToken objects, each carrying its running
            grapheme index and the index of the syllable it came from
        """
        tokens: List[GraphemeToken] = []
        inventory = self.dialect.GRAPHEME_INVENTORY

        for syl_idx, syllable in enumerate(self._normalize_syllables()):
            pos = 0
            while pos < len(syllable):
                # Width of the first inventory entry matching at `pos`,
                # or 1 for the single-character fallback. Empty entries
                # are ignored: a zero-width match would never advance.
                # startswith(prefix, start) avoids re-slicing the syllable
                # for every probe.
                width = next(
                    (
                        len(candidate)
                        for candidate in inventory
                        if candidate and syllable.startswith(candidate, pos)
                    ),
                    1,
                )
                tokens.append(
                    GraphemeToken(
                        surface=syllable[pos:pos + width],
                        grapheme_idx=len(tokens),
                        syllable_idx=syl_idx,
                        parent_word=self,
                        dialect=self.dialect,
                    )
                )
                pos += width

        return tokens

    def _normalize_syllables(self) -> List[str]:
        """
        Normalize syllables for doubled consonant handling.

        When a syllable ends in r, s, f or l and the next syllable starts
        with that same letter, the first copy is moved across the boundary:

        - car.ro → ca.rro
        - baír.ris.mo → baí.rris.mo

        This keeps the doubled letters together in one syllable so they can
        be matched as a single grapheme. ``self.syllables`` itself is not
        modified; the number of syllables is unchanged.

        Returns:
            New list of normalized syllable strings
        """
        result = list(self.syllables)

        for idx in range(len(result) - 1):
            # Re-read result[idx]: it may already carry a letter that the
            # previous iteration moved onto its front.
            current, following = result[idx], result[idx + 1]

            for consonant in ("r", "s", "f", "l"):
                if current.endswith(consonant) and following.startswith(consonant):
                    result[idx] = current[:-1]
                    result[idx + 1] = consonant + following
                    break

        return result

    @staticmethod
    def _build_char_to_syllable_map(syllables: List[str]) -> Dict[int, int]:
        """
        Map character index to syllable index.

        Character positions count across the concatenated syllables,
        starting at 0.

        Args:
            syllables: List of syllable strings

        Returns:
            Dictionary mapping character position to syllable index
        """
        owners = (
            syl_idx
            for syl_idx, syllable in enumerate(syllables)
            for _ in syllable
        )
        return dict(enumerate(owners))

    def _compute_indices(self):
        """
        Stamp character offsets onto every grapheme and character.

        For each grapheme, ``_idx_in_word`` is the offset of its first
        character within the word; each character receives its own offset.
        ``_idx_in_sentence`` is the word's sentence offset plus that value.

        When the word's own sentence offset is unknown (negative), the
        sentence offsets of graphemes and characters are set to -1 as well,
        instead of adding the word offset to the sentinel.
        """
        base = self._idx_in_sentence
        offset = 0

        for grapheme in self.graphemes:
            grapheme._idx_in_word = offset
            grapheme._idx_in_sentence = base + offset if base >= 0 else -1

            for char in grapheme.characters:
                char._idx_in_word = offset
                char._idx_in_sentence = base + offset if base >= 0 else -1
                offset += 1

    # =========================================================================
    # BASIC PROPERTIES
    # =========================================================================

    @cached_property
    def normalized(self) -> str:
        """Lowercase, stripped form of word."""
        return self.surface.lower().strip()

    @cached_property
    def normalized_syllables(self) -> List[str]:
        """Syllables after consonant doubling normalization (cached)."""
        return self._normalize_syllables()

    @property
    def n_syllables(self) -> int:
        """Number of syllables in word."""
        return len(self.syllables)

    @property
    def idx_in_sentence(self) -> int:
        """Character index of first letter in sentence (-1 if unknown)."""
        return self._idx_in_sentence

    @cached_property
    def is_archaic(self) -> bool:
        """True if the normalized word is listed in dialect.ARCHAIC_WORDS."""
        return self.normalized in self.dialect.ARCHAIC_WORDS

    # =========================================================================
    # LINKED PROPERTIES
    # =========================================================================

    @cached_property
    def prev_word(self) -> Optional['WordToken']:
        """Previous word in sentence, or None if first or unattached."""
        # `<= 0` rather than `== 0`: a negative index would otherwise wrap
        # around through Python's negative indexing.
        if self.word_idx <= 0 or not self.parent_sentence:
            return None
        return self.parent_sentence.words[self.word_idx - 1]

    @cached_property
    def next_word(self) -> Optional['WordToken']:
        """Next word in sentence, or None if last or unattached."""
        if self.word_idx < 0 or not self.parent_sentence:
            return None
        siblings = self.parent_sentence.words
        if self.word_idx >= len(siblings) - 1:
            return None
        return siblings[self.word_idx + 1]

    # =========================================================================
    # STRESS PROPERTIES
    # =========================================================================

    @cached_property
    def stressed_syllable_idx(self) -> int:
        """
        Index of syllable carrying primary stress.

        Computed by ``detect_stress_position()`` from the normalized word,
        its (un-normalized) syllables and the dialect.
        """
        return detect_stress_position(
            self.normalized,
            self.syllables,
            self.dialect
        )

    # =========================================================================
    # IPA GENERATION
    # =========================================================================

    @cached_property
    def ipa(self) -> str:
        """
        Generate IPA transcription for entire word.

        ALGORITHM:
        ----------
        1. If the normalized word is in dialect.IRREGULAR_WORDS, return
           that entry verbatim (no further processing)
        2. Collect each grapheme's IPA into the bucket of its syllable,
           skipping graphemes whose IPA is empty (silent)
        3. Prefix the stressed syllable with dialect.PRIMARY_STRESS_TOKEN
        4. Join the syllables with dialect.HIATUS_TOKEN

        Graphemes whose syllable index is out of range, and a stressed
        index that is out of range, are silently ignored.

        Returns:
            Full IPA transcription with stress and syllable marks
        """
        irregular = self.dialect.IRREGULAR_WORDS
        if self.normalized in irregular:
            return irregular[self.normalized]

        # One bucket of IPA fragments per syllable.
        buckets = [[] for _ in self.syllables]
        for grapheme in self.graphemes:
            syl_idx = grapheme.syllable_idx
            if 0 <= syl_idx < len(buckets):
                fragment = grapheme.ipa
                if fragment:  # Skip empty (silent) graphemes
                    buckets[syl_idx].append(fragment)

        syllable_strings = ["".join(bucket) for bucket in buckets]

        # IPA convention: the stress mark precedes the stressed syllable.
        stressed_idx = self.stressed_syllable_idx
        if 0 <= stressed_idx < len(syllable_strings):
            syllable_strings[stressed_idx] = (
                self.dialect.PRIMARY_STRESS_TOKEN + syllable_strings[stressed_idx]
            )

        return self.dialect.HIATUS_TOKEN.join(syllable_strings)

    # =========================================================================
    # FEATURE EXTRACTION
    # =========================================================================

    # NOTE(review): `any` in the annotation below is the builtin function,
    # not typing.Any; left untouched to avoid a new import.
    @property
    def features(self) -> Dict[str, any]:
        """
        Extract all linguistic features.

        Word-level entries come first; every grapheme's features follow
        under the key "graph_<grapheme_idx>_<feature name>".

        Returns:
            Dictionary with word features and flattened grapheme features
        """
        feats = {
            "n_syllables": self.n_syllables,
            "idx_in_sentence": self.idx_in_sentence,
            "stressed_syllable_idx": self.stressed_syllable_idx,
        }

        for grapheme in self.graphemes:
            for key, value in grapheme.features.items():
                feats[f"graph_{grapheme.grapheme_idx}_{key}"] = value

        return feats

    def __eq__(self, other) -> bool:
        """
        Allow comparison with string (by surface form).

        Non-string operands are passed to the parent class's ``__eq__``.
        """
        if isinstance(other, str):
            return self.surface == other
        return super().__eq__(other)

    def __repr__(self) -> str:
        """String representation for debugging."""
        syllables_str = ".".join(self.syllables)
        return f"WordToken('{self.surface}' [{syllables_str}] → [{self.ipa}])"
|
|
3396
|
+
|
|
3397
|
+
|
|
3398
|
+
# =============================================================================
|
|
3399
|
+
# SENTENCE
|
|
3400
|
+
# =============================================================================
|
|
3401
|
+
|
|
3402
|
+
@dataclasses.dataclass
class Sentence:
    """
    Represents a sentence with full phonological analysis.

    SENTENCE-LEVEL PHONOLOGY:
    -------------------------
    While most phonological rules operate at word level,
    sentences introduce:
    1. Liaison: linking between words (resyllabification)
    2. Phrasal stress: prominence patterns across words
    3. Intonation: pitch contours for questions, statements, etc.

    CURRENT IMPLEMENTATION:
    -----------------------
    This version works word by word: the sentence is split into words,
    each word is transcribed independently, and the results are joined
    with spaces. Liaison, phrasal stress and intonation are not handled.

    Attributes:
        surface: Raw sentence text
        words: List of WordToken objects
        dialect: DialectInventory with rules
    """

    surface: str
    words: List[WordToken] = dataclasses.field(default_factory=list)
    dialect: DialectInventory = dataclasses.field(default_factory=EuropeanPortuguese)

    def __post_init__(self):
        """
        Build the word tokens, unless the caller supplied them.

        TOKENIZATION:
        -------------
        The normalized sentence is split on whitespace, with hyphens
        treated as separators too. Punctuation inside the sentence stays
        attached to its word.

        WORD OFFSETS:
        -------------
        Each word is searched for in the lowercased surface text, starting
        just after the previous match. The offset is passed to the
        WordToken constructor so that the offsets the word derives for its
        graphemes and characters during its own initialization use the
        real value. A word that cannot be located (for example a token
        produced by number normalization) gets offset -1 and leaves the
        search position untouched.
        """
        if self.words:
            return

        haystack = self.surface.lower()
        cursor = 0

        # Tokenize on whitespace and hyphen
        for idx, word_surface in enumerate(self.normalized.replace('-', ' ').split()):
            word_start = haystack.find(word_surface, cursor)

            self.words.append(
                WordToken(
                    surface=word_surface,
                    word_idx=idx,
                    parent_sentence=self,
                    dialect=self.dialect,
                    _idx_in_sentence=word_start,
                )
            )

            # Advance past the word and one separator - but only when the
            # word was actually found; a -1 would rewind the cursor.
            if word_start >= 0:
                cursor = word_start + len(word_surface) + 1

    # =========================================================================
    # BASIC PROPERTIES
    # =========================================================================
    @cached_property
    def normalized(self) -> str:
        """
        Normalized form of the sentence.

        The surface text is lowercased, leading/trailing ASCII punctuation
        and whitespace are stripped, and the result is passed through
        ``normalize_numbers()``.
        """
        text = self.surface.lower().strip(string.punctuation + string.whitespace)
        return normalize_numbers(text)

    @property
    def n_words(self) -> int:
        """Number of words in sentence."""
        return len(self.words)

    # =========================================================================
    # IPA GENERATION
    # =========================================================================

    @cached_property
    def ipa(self) -> str:
        """
        Generate IPA transcription for entire sentence.

        ALGORITHM:
        ----------
        1. Generate IPA for each word
        2. Join with word boundary markers (space)

        SIMPLIFICATION:
        ---------------
        This treats each word independently.
        A full implementation would handle:
        - Liaison across word boundaries
        - Resyllabification (e.g., "os amigos" → "o.za.mi.gos")
        - Phrasal stress patterns

        Returns:
            Space-separated IPA transcription
        """
        return " ".join(word.ipa for word in self.words)

    # =========================================================================
    # FEATURE EXTRACTION
    # =========================================================================

    # NOTE(review): `any` in the annotation below is the builtin function,
    # not typing.Any; left untouched to avoid a new import.
    @property
    def features(self) -> Dict[str, any]:
        """
        Extract all linguistic features.

        WARNING: Can produce very large feature dictionaries
        for long sentences. Consider alternative representations
        (e.g., arrays, DataFrames) for ML applications.

        Sentence-level entries come first; every word's features follow
        under the key "word_<word_idx>_<feature name>".

        Returns:
            Dictionary with sentence features and flattened word features
        """
        feats = {
            "n_words": self.n_words,
            # One less than the word count (so -1 for a sentence with no words).
            "n_whitespaces": self.n_words - 1,
        }

        for word in self.words:
            for key, value in word.features.items():
                feats[f"word_{word.word_idx}_{key}"] = value

        return feats

    def __eq__(self, other) -> bool:
        """
        Allow comparison with string (by raw surface text).

        Non-string operands are passed to the parent class's ``__eq__``.
        """
        if isinstance(other, str):
            return self.surface == other
        return super().__eq__(other)

    def __repr__(self) -> str:
        """String representation for debugging."""
        return f"Sentence('{self.surface}' → [{self.ipa}])"
|
|
3553
|
+
|
|
3554
|
+
|
|
3555
|
+
# =============================================================================
|
|
3556
|
+
# UTILITY FUNCTIONS FOR TESTING AND DEMONSTRATION
|
|
3557
|
+
# =============================================================================
|
|
3558
|
+
|
|
3559
|
+
def demonstrate_transcription(text: str, dialect: DialectInventory = None):
    """
    Print a step-by-step view of how a text is transcribed to IPA.

    The sentence and its IPA are printed first. Then, for every word:
    its 1-based position, surface form and IPA, its syllables joined by
    ".", the index of its stressed syllable, and its graphemes, each
    shown as ``surface[ipa]`` and tagged "(diphthong)" and/or "(digraph)"
    where applicable.

    Args:
        text: Portuguese text to transcribe
        dialect: DialectInventory to use (default: European Portuguese)

    Example:
        >>> demonstrate_transcription("O cão comeu o pão.")
        Sentence: O cão comeu o pão.
        IPA: [u ˈkɐ̃w ko·ˈmew u ˈpɐ̃w]

        Words:
        1. o [u]
         Syllables: o
         Stress: syllable 0
         ...
    """
    chosen = EuropeanPortuguese() if dialect is None else dialect
    sentence = Sentence(text, dialect=chosen)

    print(f"Sentence: {sentence.surface}")
    print(f"IPA: [{sentence.ipa}]")
    print()
    print("Words:")

    for word in sentence.words:
        print(f"{word.word_idx + 1}. {word.surface} [{word.ipa}]")
        print(f" Syllables: {'.'.join(word.syllables)}")
        print(f" Stress: syllable {word.stressed_syllable_idx}")

        # One annotated label per grapheme.
        labels = []
        for g in word.graphemes:
            label = f"{g.surface}[{g.ipa}]"
            label += "(diphthong)" if g.is_diphthong else ""
            label += "(digraph)" if g.is_digraph else ""
            labels.append(label)

        print(f" Graphemes: {' '.join(labels)}")
        print()
|
|
3613
|
+
|
|
3614
|
+
|
|
3615
|
+
# =============================================================================
|
|
3616
|
+
# MAIN DEMONSTRATION
|
|
3617
|
+
# =============================================================================
|
|
3618
|
+
|
|
3619
|
+
if __name__ == "__main__":
    # Demonstration script.
    #
    # Part 1 prints a detailed European Portuguese analysis of sentences
    # chosen to exercise stress patterns, diphthongs and nasal vowels,
    # consonant digraphs and the variants of "x".
    # Part 2 prints the same sentences side by side in every dialect.
    # Part 3 prints one detailed Brazilian Portuguese analysis.
    rule = "=" * 80

    print(rule)
    print("PORTUGUESE ORTHOGRAPHY → IPA TRANSCRIPTION SYSTEM")
    print(rule)
    print()

    # --- Part 1: detailed analysis, European Portuguese ---------------------
    european = EuropeanPortuguese()
    showcase = (
        "O cão comeu o pão.",                # nasal diphthongs
        "O médico português está no café.",  # stress patterns
        "A rainha viu o vinho.",             # diphthongs and digraphs
        "O carro chegou rápido.",            # complex consonants
        "O exemplo do táxi é exato.",        # X variants
        "Um homem tem compaixão.",           # nasal patterns
    )
    for sample in showcase:
        demonstrate_transcription(sample, european)
        print(rule)
        print()

    print("\nTranscription complete!")

    # --- Part 2: one line per dialect ---------------------------------------
    comparison = (
        "O cão comeu o pão.",
        "Três tigres tristes.",
        "Brasil é bonito.",
        "A tia comeu muito.",
    )
    dialects = [
        ("European", EuropeanPortuguese()),
        ("Brazilian", BrazilianPortuguese()),
        ("Angolan", AngolanPortuguese()),
        ("Mozambican", MozambicanPortuguese()),
        ("Timorese", TimoresePortuguese()),
    ]

    for sample in comparison:
        print(f"\nExample: {sample}")
        print("-" * 80)
        for name, dialect in dialects:
            sent = Sentence(sample, dialect=dialect)
            print(f"{name:15} [{dialect.dialect_code}]: {sent.ipa}")
        print()

    # --- Part 3: detailed analysis, Brazilian Portuguese ---------------------
    print("\nDetailed analysis: pt-BR")
    print(rule)
    demonstrate_transcription("A tia comeu muito pão.", BrazilianPortuguese())
|