tugaphone 0.0.2a1__py3-none-any.whl → 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tugaphone/tokenizer.py ADDED
@@ -0,0 +1,3689 @@
1
+ """
2
+ Portuguese Orthography → IPA Transcription System
3
+
4
+ This module provides comprehensive conversion from Portuguese orthography to
5
+ International Phonetic Alphabet (IPA) notation, following prescriptive norms
6
+ for European Portuguese (pt-PT), Brazilian Portuguese (pt-BR), and African
7
+ Portuguese variants (pt-AO, pt-MZ, pt-TL).
8
+
9
+ LINGUISTIC BACKGROUND:
10
+ ======================
11
+ Portuguese orthography uses Latin script with diacritical marks to represent
12
+ a rich phonological system. The relationship between spelling and pronunciation
13
+ is relatively regular but includes context-sensitive rules, silent letters,
14
+ and dialectal variations.
15
+
16
+ DIALECTAL VARIATION:
17
+ ====================
18
+ Portuguese exhibits significant phonological variation across regions:
19
+
20
+ 1. EUROPEAN PORTUGUESE (pt-PT):
21
+ - Heavy vowel reduction in unstressed positions
22
+ - Post-alveolar fricatives for syllable-final /s, z/
23
+ - Velarized/dark [ɫ] in coda position
24
+ - Uvular [ʁ] for strong R in most regions
25
+
26
+ 2. BRAZILIAN PORTUGUESE (pt-BR):
27
+ - Less vowel reduction (fuller vowel quality)
28
+ - Palatalization: /t, d/ → [tʃ, dʒ] before [i]
29
+ - L-vocalization: coda /l/ → [w] (creates new diphthongs)
30
+ - Glottal/velar [h, x] for strong R (region-dependent)
31
+ - Alveolar [s] for syllable-final /s/ (not palatalized)
32
+ - Nasal vowels less nasalized than European
33
+
34
+ 3. ANGOLAN PORTUGUESE (pt-AO):
35
+ - Similar to European but with substrate influence
36
+ - Less vowel reduction than European
37
+ - Consistent alveolar trill [r] for R
38
+ - Substrate-influenced prosody from Bantu languages
39
+
40
+ 4. MOZAMBICAN PORTUGUESE (pt-MZ):
41
+ - Similar to European with Bantu substrate
42
+ - Less vowel reduction
43
+ - May preserve distinctions lost in European
44
+ - Regional variation (north vs. south)
45
+
46
+ 5. TIMORESE PORTUGUESE (pt-TL):
47
+ - Influenced by Tetum and other Austronesian languages
48
+ - Similar to European base with local adaptations
49
+ - Less widespread native use (L2 features common)
50
+
51
+ KEY PHONOLOGICAL CONCEPTS:
52
+ --------------------------
53
+ 1. STRESS: Portuguese uses lexical stress (word-level prominence of syllables)
54
+ - Proparoxytone: stress on antepenultimate (third-to-last) syllable - rare, always marked
55
+ - Paroxytone: stress on penultimate (second-to-last) syllable - most common
56
+ - Oxytone: stress on final syllable - less common, specific phonological contexts
57
+
58
+ 2. VOWEL QUALITY: Stressed vs unstressed vowels differ in quality and reduction
59
+ - Stressed: fuller realization, can be open [ɛ, ɔ] or closed [e, o]
60
+ - Unstressed: typically reduced to [ɨ] or [ɐ] in European Portuguese
61
+ - Brazilian: less reduction, maintains [e, o, a] quality
62
+
63
+ 3. NASALIZATION: Vowels can be oral or nasal
64
+ - Marked by tilde (ã, õ) or followed by nasal consonant (m, n)
65
+ - Creates distinct phonemes, not just allophones
66
+ - Less nasalized in Brazilian Portuguese
67
+
68
+ 4. DIPHTHONGS: Sequences of vowel + semivowel or semivowel + vowel
69
+ - Falling/descending: vowel → semivowel (rei [ˈʁej])
70
+ - Rising/ascending: semivowel → vowel (piano [ˈpjɐnu])
71
+ - Can be oral or nasal
72
+ - Brazilian: additional diphthongs from L-vocalization
73
+
74
+ IMPLEMENTATION ARCHITECTURE:
75
+ ============================
76
+ The code uses a hierarchical tokenization model that mirrors linguistic structure:
77
+
78
+ Sentence → Words → Graphemes → Characters
79
+
80
+ - Character: Single letter/symbol
81
+ - Grapheme: Minimal spelling unit (can be digraph like 'ch' or diphthong like 'ai')
82
+ - Word: Sequence of graphemes with syllable structure
83
+ - Sentence: Sequence of words with prosodic information
84
+
85
+ All indices are computed top-down during initialization to avoid circular dependencies.
86
+ Context-sensitive rules are applied bottom-up during IPA generation.
87
+
88
+ QUICK REFERENCES:
89
+ ===========
90
+ - http://www.portaldalinguaportuguesa.org
91
+ - https://en.wiktionary.org/wiki/Wiktionary:International_Phonetic_Alphabet
92
+ - https://en.wiktionary.org/wiki/Appendix:Portuguese_pronunciation
93
+ - https://en.wiktionary.org/wiki/Appendix:Portuguese_spellings
94
+ - https://european-portuguese.info/vowels
95
+ - https://pt.wikipedia.org/wiki/L%C3%ADngua_portuguesa
96
+ - https://pt.wikipedia.org/wiki/Ortografia_da_l%C3%ADngua_portuguesa
97
+ - https://pt.wikipedia.org/wiki/Gram%C3%A1tica_da_l%C3%ADngua_portuguesa
98
+ - https://pt.wikipedia.org/wiki/Fonologia_da_língua_portuguesa
99
+ - https://pt.wikipedia.org/wiki/Processo_do_vocalismo_%C3%A1tono_do_portugu%C3%AAs_europeu
100
+ - https://pt.wikipedia.org/wiki/Ditongo
101
+ - https://pt.wikipedia.org/wiki/Tritongo
102
+ - https://pt.wikipedia.org/wiki/Hiato_(lingu%C3%ADstica)
103
+ - https://pt.wikipedia.org/wiki/D%C3%ADgrafo
104
+ - https://pt.wikipedia.org/wiki/Fonema
105
+ - https://pt.wikipedia.org/wiki/Alofonia
106
+ """
107
+
108
+ import dataclasses
109
+ import string
110
+ from functools import cached_property
111
+ from typing import List, Optional, Dict, Set
112
+
113
+ from tugaphone.number_utils import normalize_numbers
114
+ from tugaphone.syl import syllabify
115
+
116
+
117
+ # =============================================================================
118
+ # DIALECT INVENTORY: Phonological Rules and Mappings
119
+ # =============================================================================
120
+
121
+ @dataclasses.dataclass()
122
+ class DialectInventory:
123
+ """
124
+ Encapsulates all dialect-specific phonological rules and mappings.
125
+
126
+ This class serves as a lookup table and rule repository for converting
127
+ Portuguese orthography to IPA. Different Portuguese dialects (European,
128
+ Brazilian, etc.) can define different inventories.
129
+
130
+ DESIGN RATIONALE:
131
+ -----------------
132
+ Centralizing dialect rules in one class allows:
133
+ - Easy comparison between dialects
134
+ - Clean separation of data from logic
135
+ - Simple addition of new dialects
136
+ - Maintenance of linguistic rules in one location
137
+
138
+ Attributes:
139
+ dialect_code: IETF BCP 47 language tag (e.g., 'pt-PT', 'pt-BR')
140
+ """
141
+
142
+ dialect_code: str = "pt-PT"
143
+
144
+ # =========================================================================
145
+ # SYMBOLIC CONSTANTS
146
+ # =========================================================================
147
+ # These are used in IPA output to represent prosodic features
148
+
149
+ HIATUS_TOKEN: str = "·" # Syllable boundary marker
150
+ PRIMARY_STRESS_TOKEN: str = "ˈ" # IPA primary stress marker (before stressed syllable)
151
+ SECONDARY_STRESS_TOKEN: str = "ˌ" # IPA secondary stress marker
152
+
153
+ # =========================================================================
154
+ # PUNCTUATION MAPPING
155
+ # =========================================================================
156
+ # Maps orthographic punctuation to prosodic IPA markers
157
+ # Rationale: Punctuation affects speech rhythm and pausing
158
+
159
+ PUNCT2IPA: Dict[str, str] = dataclasses.field(default_factory=dict)
160
+
161
+ # =========================================================================
162
+ # CHARACTER SETS
163
+ # =========================================================================
164
+ # Organized by linguistic function for efficient categorization
165
+
166
+ PUNCT_CHARS: Set[str] = dataclasses.field(default_factory=set)
167
+
168
+ # Base vowels: a, e, i, o, u
169
+ # Portuguese vowel system is asymmetric - more distinctions in stressed position
170
+ VOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
171
+
172
+ # DIACRITICS ON VOWELS:
173
+ # Portuguese uses diacritics to mark stress, vowel quality, and nasalization
174
+
175
+ # Acute accent (´): Marks primary stress AND open vowel quality
176
+ # Only valid on a, e, o (vowels with open/closed distinction)
177
+ # Examples: café [kɐˈfɛ], está [ɨʃˈta], avó [ɐˈvɔ]
178
+ ACUTE_VOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
179
+
180
+ # Grave accent (`): ARCHAIC - marked secondary stress (pre-1973 Portugal, pre-1971 Brazil)
181
+ # Modern usage: only 'à' (contraction a + a = à)
182
+ # Historical: sòmente, cafèzinho
183
+ GRAVE_VOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
184
+
185
+ # Circumflex (^): Marks primary stress AND closed vowel quality
186
+ # Only valid on a, e, o
187
+ # Examples: você [voˈse], avô [ɐˈvo], âmbito [ˈɐ̃bitu]
188
+ CIRCUM_VOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
189
+
190
+ # Tilde (~): Marks nasalization (air flow through nose)
191
+ # Modern Portuguese: only ã, õ are valid
192
+ # ẽ, ĩ, ũ: archaic or foreign words
193
+ # Examples: mão [ˈmɐ̃w̃], põe [ˈpõj̃]
194
+ TILDE_VOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
195
+
196
+ # Diaeresis/Trema (¨): ARCHAIC - marked pronounced 'u' in 'gu/qu' contexts
197
+ # Abolished in 1945 (Portugal) and 2009 (Brazil)
198
+ # Historical: lingüiça [lĩˈgwisɐ] vs linguiça [lĩˈgisɐ]
199
+ # Modern German names: Müller, Göring
200
+ TREMA_VOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
201
+
202
+ # Semivowels: Can function as vowel or consonant depending on position
203
+ # In Portuguese: /j/ (written i, e) and /w/ (written u, o)
204
+ # Examples: rei [ˈʁej] - 'i' is semivowel; rima [ˈʁimɐ] - 'i' is vowel
205
+ SEMIVOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
206
+
207
+ # Foreign letters: Not in traditional Portuguese alphabet
208
+ # k, w, y: used in loanwords, foreign names, scientific terms
209
+ # Examples: kilo, whisky, yen
210
+ FOREIGN_CHARS: Set[str] = dataclasses.field(default_factory=set)
211
+
212
+ # Front vowels: Tongue positioned forward in mouth
213
+ # Relevant for palatalization rules (c→s, g→ʒ before front vowels)
214
+ FRONT_VOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
215
+
216
+ # STRESS MARKERS (for automatic stress detection)
217
+ # Primary: acute accent and tilde (õ, ã are always stressed when final)
218
+ PRIMARY_STRESS_MARKERS: Set[str] = dataclasses.field(default_factory=set)
219
+ # Secondary: grave, circumflex and trema (diaeresis)
220
+ SECONDARY_STRESS_MARKERS: Set[str] = dataclasses.field(default_factory=set)
221
+
222
+ # =========================================================================
223
+ # IPA VOWEL INVENTORY
224
+ # =========================================================================
225
+ # Portuguese has one of the richest vowel systems in Romance languages
226
+
227
+ # ORAL VOWELS (air flows only through mouth):
228
+ # High: i [i] (si), ɨ [ɨ] (pedir-unstressed), u [u] (tu)
229
+ # Mid-closed: e [e] (você), o [o] (avô)
230
+ # Mid-open: ɛ [ɛ] (pé), ɔ [ɔ] (pó)
231
+ # Low: a [a] (lá-stressed), ɐ [ɐ] (casa-unstressed), ə [ə] (reduction)
232
+ ORAL_VOWELS: Set[str] = dataclasses.field(default_factory=set)
233
+
234
+ # NASAL VOWELS (air flows through nose AND mouth):
235
+ # Nasalization is phonemic in Portuguese (changes meaning)
236
+ # Examples: mato [ˈmatu] "bush" vs manto [ˈmɐ̃tu] "cloak"
237
+ NASAL_VOWELS: Set[str] = dataclasses.field(default_factory=set)
238
+
239
+ # VOWEL CATEGORIES BY OPENNESS (relevant for stress rules):
240
+ # These categories determine whether acute (´) or circumflex (^) is used
241
+ CLOSED_VOWELS: Set[str] = dataclasses.field(default_factory=set) # High vowels
242
+ SEMI_CLOSED_VOWELS: Set[str] = dataclasses.field(default_factory=set) # Mid-closed
243
+ OPEN_VOWELS: Set[str] = dataclasses.field(default_factory=set) # Low
244
+ SEMI_OPEN_VOWELS: Set[str] = dataclasses.field(default_factory=set) # Mid-open
245
+
246
+ ALL_VOWEL_CHARS: Set[str] = dataclasses.field(default_factory=set)
247
+
248
+ # =========================================================================
249
+ # DIPHTHONG INVENTORIES
250
+ # =========================================================================
251
+ # Diphthongs are single-syllable vowel sequences
252
+ # Structure: V+G (vowel + glide/semivowel) or G+V
253
+
254
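+ # ORAL FALLING DIPHTHONGS (vowel → semivowel) — NOTE(review): the field below is named RISING_ORAL_DIPHTHONGS; one of the two labels is wrong, confirm against _initialize_diphthongs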
255
+ # Format: IPA → orthographic representation
256
+ # The /j/ glide is written 'i' or 'e', /w/ glide is written 'u' or 'o'
257
+ RISING_ORAL_DIPHTHONGS: Dict[str, str] = dataclasses.field(default_factory=dict)
258
+
259
+ # NASAL DIPHTHONGS
260
+ # Nasalization extends across the entire diphthong
261
+ # Examples: mãe [ˈmɐ̃j̃], cão [ˈkɐ̃w̃], põe [ˈpõj̃]
262
+ FALLING_NASAL_DIPHTHONGS: Dict[str, str] = dataclasses.field(default_factory=dict)
263
+
264
+ # BRAZILIAN PORTUGUESE SPECIAL DIPHTHONGS
265
+ # In Brazilian dialects, coda /l/ vocalizes to [w]
266
+ # This creates diphthongs not present in European Portuguese
267
+ # Examples: Brasil [bɾaˈziw] vs [bɾɐˈziɫ] (European)
268
+ PTBR_DIPHTHONGS: Dict[str, str] = dataclasses.field(default_factory=dict)
269
+
270
+ # =========================================================================
271
+ # NORMALIZATION MAPPINGS
272
+ # =========================================================================
273
+ # Maps archaic/invalid diacritics to modern standard equivalents
274
+ # Rationale: Historical texts use obsolete orthography
275
+
276
+ NORMALIZED_VOWELS: Dict[str, str] = dataclasses.field(default_factory=dict)
277
+
278
+ # =========================================================================
279
+ # GRAPHEME → IPA MAPPINGS
280
+ # =========================================================================
281
+ # Organized by complexity: multigraphs first, then digraphs, then single chars
282
+
283
+ # TETRAGRAPHS (4-letter sequences with special pronunciation)
284
+ TETRAGRAM2IPA: Dict[str, str] = dataclasses.field(default_factory=dict)
285
+
286
+ # TRIGRAPHS (3-letter sequences)
287
+ TRIGRAM2IPA: Dict[str, str] = dataclasses.field(default_factory=dict)
288
+
289
+ # TRIPHTHONGS (vowel + semivowel + vowel in one syllable)
290
+ # Rare in Portuguese: mostly in derived forms
291
+ # Example: Paraguai [pɐɾɐˈgwaj]
292
+ TRIPHTHONG2IPA: Dict[str, str] = dataclasses.field(default_factory=dict)
293
+
294
+ # DIPHTHONGS (reverse mapping: orthography → IPA)
295
+ DIPHTHONG2IPA: Dict[str, str] = dataclasses.field(default_factory=dict)
296
+
297
+ # DIGRAPHS - CONSONANTAL
298
+ # Two letters representing one consonant phoneme
299
+ # nh [ɲ]: palatal nasal (like Spanish ñ, Italian gn)
300
+ # lh [ʎ]: palatal lateral (like Italian gl)
301
+ # ch [ʃ]: voiceless postalveolar fricative (like English sh)
302
+ # rr [ʀ]: uvular trill (strong R; [ʁ] is the uvular fricative realization)
303
+ # ss [s]: voiceless between vowels (otherwise 's' → [z])
304
+ DIGRAPH2IPA: Dict[str, str] = dataclasses.field(default_factory=dict)
305
+
306
+ # DIGRAPHS - NASAL VOWELS
307
+ # Vowel + nasal consonant (m/n) at syllable boundary → nasal vowel
308
+ # The 'm/n' is not pronounced separately; it nasalizes the vowel
309
+ # Examples: campo [ˈkɐ̃pu], antes [ˈɐ̃tɨʃ]
310
+ NASAL_DIGRAPHS: Dict[str, str] = dataclasses.field(default_factory=dict)
311
+
312
+ # CONSONANT HIATUS (intervocalic consonant clusters)
313
+ # These clusters span syllable boundaries with preserved articulation
314
+ # Examples: ficção [fik·ˈsɐ̃w̃], pacto [ˈpak·tu]
315
+ HETEROSYLLABIC_CLUSTERS: Dict[str, str] = dataclasses.field(default_factory=dict)
316
+
317
+ # ARCHAIC SILENT CONSONANTS
318
+ # Pre-2009 orthography included etymological consonants
319
+ # These were eliminated in Acordo Ortográfico
320
+ # Example: assumpção → assunção
321
+ ARCHAIC_MUTE_P: Dict[str, Set[str]] = dataclasses.field(default_factory=dict)
322
+
323
+ # FOREIGN DIGRAPHS (in loanwords)
324
+ FOREIGN_DIGRAPH2IPA: Dict[str, str] = dataclasses.field(default_factory=dict)
325
+
326
+ # =========================================================================
327
+ # HIATUS CONTEXTS
328
+ # =========================================================================
329
+ # Prefixes that force vowel separation (prevent diphthong formation)
330
+ # Example: bi·aturar [bi.ɐtu.ˈɾaɾ] not *[bjɐ.tu.ˈɾaɾ]
331
+ HIATUS_PREFIXES: Set[str] = dataclasses.field(default_factory=set)
332
+
333
+ # =========================================================================
334
+ # DEFAULT CHARACTER MAPPINGS
335
+ # =========================================================================
336
+ # Single character → IPA mapping (context-free baseline)
337
+ # Many characters have context-sensitive variants applied later
338
+ DEFAULT_CHAR2PHONEMES: Dict[str, str] = dataclasses.field(default_factory=dict)
339
+
340
+ # =========================================================================
341
+ # IRREGULAR WORD MAPPINGS
342
+ # =========================================================================
343
+ # Words with exceptional pronunciations that don't follow regular rules
344
+ # These override all other rules
345
+ IRREGULAR_WORDS: Dict[str, str] = dataclasses.field(default_factory=dict)
346
+
347
+ # =========================================================================
348
+ # STRESS RULES
349
+ # =========================================================================
350
+ # Portuguese stress is semi-predictable based on word endings
351
+
352
+ # OXYTONE ENDINGS (stress on final syllable)
353
+ # Words ending in these patterns are stressed on final syllable
354
+ # Examples: café, funil, rapaz, caju
355
+ OXYTONE_ENDINGS: Set[str] = dataclasses.field(default_factory=set)
356
+
357
+ # =========================================================================
358
+ # COMPILED GRAPHEME INVENTORY
359
+ # =========================================================================
360
+ # All valid multi-character graphemes for tokenization
361
+ # Ordered by length (longest first) for greedy matching
362
+ GRAPHEME_INVENTORY: List[str] = dataclasses.field(default_factory=list)
363
+
364
def __post_init__(self):
    """
    Populate every rule table that was not supplied at construction time.

    Runs automatically after the dataclass-generated ``__init__``. The
    ``_initialize_*`` helpers are invoked in a fixed order, followed by
    ``_compile_grapheme_inventory``. The helpers shown in this file only
    fill a table when it is still empty, so values passed to the
    constructor are kept.

    ``ARCHAIC_WORDS`` is not a dataclass field; it is attached here as a
    plain instance attribute. It receives its default content only when no
    non-empty value is already present (for example a class-level attribute
    defined by a subclass), which matches the "fill only if empty"
    convention of the helpers and keeps subclass overrides intact.
    """
    self._initialize_char_lists()
    self._initialize_normalized_vowels()
    self._initialize_punctuation()
    self._initialize_consonant_digraphs()
    self._initialize_nasal_digraphs()
    self._initialize_consonant_hiatus()
    self._initialize_archaic_forms()
    self._initialize_foreign_digraphs()
    self._initialize_hiatus_prefixes()
    self._initialize_diphthongs()
    self._initialize_triphthongs()
    self._initialize_trigrams()
    self._initialize_tetragrams()
    self._initialize_default_chars()
    self._initialize_stress_rules()
    self._compile_grapheme_inventory()

    # Until the early 20th century, both Portugal and Brazil followed an
    # orthography that, as a rule, spelled each word after its Latin or
    # Greek etymon.
    # TODO: map each entry to its modern spelling and normalize it before
    # IPA parsing.
    # Guarded so that a subclass defining ARCHAIC_WORDS at class level is
    # not silently overwritten on every instantiation.
    if not getattr(self, "ARCHAIC_WORDS", None):
        self.ARCHAIC_WORDS = {
            "architectura",
            "caravella",
            "diccionario",
            "diphthongo",
            "estylo",
            "grammatica",
            "lyrio",
            "parochia",
            "kilometro",
            "orthographia",
            "pharmacia",
            "phleugma",
            "prompto",
            "psychologia",
            "psalmo",
            "rheumatismo",
            "sanccionar",
            "theatro",
        }
418
+
419
+ def _initialize_char_lists(self):
420
+ if not self.PUNCT_CHARS:
421
+ self.PUNCT_CHARS = set(string.punctuation)
422
+ if not self.VOWEL_CHARS:
423
+ self.VOWEL_CHARS = set("aeiou")
424
+ if not self.ACUTE_VOWEL_CHARS:
425
+ self.ACUTE_VOWEL_CHARS = set("áéíóú")
426
+ if not self.GRAVE_VOWEL_CHARS:
427
+ self.GRAVE_VOWEL_CHARS = set("àèìòù")
428
+ if not self.CIRCUM_VOWEL_CHARS:
429
+ self.CIRCUM_VOWEL_CHARS = set("âêîôû")
430
+ if not self.TILDE_VOWEL_CHARS:
431
+ self.TILDE_VOWEL_CHARS = set("ãõẽĩũ")
432
+ if not self.TREMA_VOWEL_CHARS:
433
+ self.TREMA_VOWEL_CHARS = set("äëïöü")
434
+ if not self.SEMIVOWEL_CHARS:
435
+ self.SEMIVOWEL_CHARS = set("iueo")
436
+ if not self.FOREIGN_CHARS:
437
+ self.FOREIGN_CHARS = set("wkyÿ")
438
+ if not self.FRONT_VOWEL_CHARS:
439
+ self.FRONT_VOWEL_CHARS = set("eiéêí")
440
+ if not self.PRIMARY_STRESS_MARKERS:
441
+ self.PRIMARY_STRESS_MARKERS = self.ACUTE_VOWEL_CHARS | self.TILDE_VOWEL_CHARS
442
+ if not self.SECONDARY_STRESS_MARKERS:
443
+ self.SECONDARY_STRESS_MARKERS = self.GRAVE_VOWEL_CHARS | self.CIRCUM_VOWEL_CHARS | self.TREMA_VOWEL_CHARS
444
+
445
+ if not self.ALL_VOWEL_CHARS:
446
+ self.ALL_VOWEL_CHARS = self.VOWEL_CHARS | self.ACUTE_VOWEL_CHARS | self.GRAVE_VOWEL_CHARS | self.CIRCUM_VOWEL_CHARS | self.TREMA_VOWEL_CHARS
447
+
448
+ # IPA vowel mappings
449
+ if not self.ORAL_VOWELS:
450
+ self.ORAL_VOWELS = set("ieɛɨɐəauoɔ")
451
+ if not self.NASAL_VOWELS:
452
+ self.NASAL_VOWELS = set("ĩẽɐ̃ũõ")
453
+ if not self.CLOSED_VOWELS:
454
+ self.CLOSED_VOWELS = set("iɨu")
455
+ if not self.SEMI_CLOSED_VOWELS:
456
+ self.SEMI_CLOSED_VOWELS = set("eo")
457
+ if not self.OPEN_VOWELS:
458
+ self.OPEN_VOWELS = set("a")
459
+ if not self.SEMI_OPEN_VOWELS:
460
+ self.SEMI_OPEN_VOWELS = set("ɛɐɔ")
461
+
462
+ def _initialize_normalized_vowels(self):
463
+ """
464
+ Map archaic and foreign diacritics to modern Portuguese equivalents.
465
+
466
+ LINGUISTIC BACKGROUND:
467
+ ----------------------
468
+ Portuguese orthography has evolved through several reforms:
469
+ - 1911: Major reform in Portugal
470
+ - 1943: Brazil's orthographic convention
471
+ - 1945: Portugal aligns with Brazil
472
+ - 1971/1973: Further simplifications
473
+ - 1990/2009: Acordo Ortográfico (unified orthography)
474
+
475
+ Obsolete marks must be normalized for consistent processing.
476
+ """
477
+ if not self.NORMALIZED_VOWELS:
478
+ self.NORMALIZED_VOWELS = {
479
+ # CIRCUMFLEX ON HIGH VOWELS (î, û)
480
+ # Rule: High vowels /i, u/ have no open/closed distinction
481
+ # Therefore circumflex is redundant → removed
482
+ "î": "i", # Historical: used for emphasis
483
+ "û": "u", # Historical: used for emphasis
484
+
485
+ # TILDE ON MID VOWELS (ẽ, ĩ, ũ)
486
+ # Rule: Nasalization of mid/high vowels is allophonic
487
+ # Only /ɐ̃/ and /õ/ are phonemic
488
+ # These appear in foreign words or archaic texts
489
+ "ẽ": "ê", # Maps to closed mid vowel
490
+ "ĩ": "i", # Maps to high vowel
491
+ "ũ": "u", # Maps to high vowel
492
+
493
+ # GRAVE ACCENT (obsolete stress marker)
494
+ # Pre-1973: marked secondary stress in suffixed words
495
+ # Example: só + -mente → sòmente
496
+ # Modern: stress is not marked in these contexts
497
+ "è": "é",
498
+ "ì": "í",
499
+ "ò": "ó",
500
+ "ù": "ú",
501
+
502
+ # DIAERESIS/TREMA (obsolete hiatus marker)
503
+ # Pre-1945/2009: ü indicated pronounced /w/ after g/q
504
+ # Example: lingüiça [lĩˈgwisɐ] vs linguiça [lĩˈgisɐ]
505
+ # Modern: context must be learned (etymology required)
506
+ "ä": "á",
507
+ "ë": "é",
508
+ "ï": "í",
509
+ "ö": "ó",
510
+ "ü": "w", # Special: indicates [w] realization
511
+ "ÿ": "í"
512
+ }
513
+
514
+ def _initialize_punctuation(self):
515
+ """
516
+ Map orthographic punctuation to prosodic IPA representations.
517
+
518
+ PROSODIC INTERPRETATION:
519
+ ------------------------
520
+ Punctuation affects speech prosody (rhythm, pausing, intonation).
521
+ While full prosodic annotation requires ToBI or similar systems,
522
+ we use simplified IPA conventions.
523
+
524
+ Hiatus tokens (·) represent pause length:
525
+ - Short pause: 1 token (comma, hyphen)
526
+ - Medium pause: 2 tokens (semicolon)
527
+ - Long pause: 3 tokens (period)
528
+
529
+ Intonation markers (!, ?) require dedicated tone notation
530
+ which is beyond standard IPA segmental transcription.
531
+ """
532
+ if not self.PUNCT2IPA:
533
+ self.PUNCT2IPA = {
534
+ "-": self.HIATUS_TOKEN, # Hyphen: brief pause
535
+ ",": self.HIATUS_TOKEN, # Comma: brief pause
536
+ ";": self.HIATUS_TOKEN * 2, # Semicolon: medium pause
537
+ ".": self.HIATUS_TOKEN * 3, # Period: long pause
538
+ "!": self.PRIMARY_STRESS_TOKEN + self.HIATUS_TOKEN, # Exclamation: stress + pause
539
+ "?": "↗" + self.HIATUS_TOKEN, # Question: rising intonation + pause
540
+ }
541
+
542
+ def _initialize_consonant_digraphs(self):
543
+ """
544
+ Define two-letter sequences representing single consonant phonemes.
545
+
546
+ PHONETIC BACKGROUND:
547
+ --------------------
548
+ Portuguese inherited Latin digraphs and developed new ones:
549
+
550
+ - NH [ɲ]: Palatal nasal (tongue blade touches hard palate)
551
+ Etymology: Latin -gn- > Portuguese -nh-
552
+ Examples: vinho [ˈviɲu] < Latin vīnum
553
+
554
+ - LH [ʎ]: Palatal lateral (lateral with palatal contact)
555
+ Etymology: Latin -ll-, -cl-, -gl- > Portuguese -lh-
556
+ Examples: filho [ˈfiʎu] < Latin fīlius
557
+
558
+ - CH [ʃ]: Voiceless postalveolar fricative
559
+ Etymology: Latin -cl-, -pl-, fl- > Portuguese -ch-
560
+ Examples: chuva [ˈʃuvɐ] < Latin plŭvia
561
+
562
+ - RR [ʁ]: Strong R (uvular/velar fricative or trill)
563
+ Rule: 'rr' only occurs intervocalically
564
+ Contrast: caro [ˈkaɾu] "expensive" vs carro [ˈkaʁu] "car"
565
+
566
+ - SS [s]: Ensures voiceless [s] between vowels
567
+ Rule: single 's' between vowels → [z]
568
+ Contrast: casa [ˈkazɐ] "house" vs cassa [ˈkasɐ] (archaic "cancel")
569
+
570
+ - PH [f]: Archaic Greek etymological spelling
571
+ Modern: ph → f in orthographic reforms
572
+ Examples: pharmacia → farmácia
573
+ """
574
+ if not self.DIGRAPH2IPA:
575
+ self.DIGRAPH2IPA = {
576
+ "nh": "ɲ",
577
+ "lh": "ʎ",
578
+ "ch": "ʃ",
579
+ "rr": "ʀ", # Alternative: ʁ for uvular fricative
580
+ "ss": "s",
581
+
582
+ # Abolidos na Reforma Ortográfica de 1911
583
+ "th": "t",
584
+ "rh": "r",
585
+ "ph": "f" # O dígrafo ph foi substituído pela letra f.
586
+ # No entanto, manteve-se a pronúncia do ph com som de f, sobretudo no caso de nomes próprios e marcas comerciais de uso corrente.
587
+ # Exemplo: iPhone, Philips e Phebo.
588
+ }
589
+
590
+ def _initialize_nasal_digraphs(self):
591
+ """
592
+ Define vowel + nasal consonant sequences that create nasal vowels.
593
+
594
+ NASALIZATION RULES:
595
+ -------------------
596
+ In Portuguese, nasal vowels have two orthographic realizations:
597
+
598
+ 1. Tilde: ã, õ (direct nasal marking)
599
+ 2. Vowel + m/n: Nasalizes the vowel when m/n is in coda position
600
+
601
+ CODA POSITION DEFINITION:
602
+ --------------------------
603
+ m/n is in coda (nasalizes vowel) when:
604
+ - Word-final: tem [ˈtẽj̃], bom [ˈbõ]
605
+ - Before consonant: campo [ˈkɐ̃pu], ponte [ˈpõtɨ]
606
+
607
+ m/n is in onset (does NOT nasalize) when:
608
+ - Before vowel: caminho [kɐˈmiɲu], bonito [buˈnitu]
609
+
610
+ PHONETIC RESULT:
611
+ ----------------
612
+ The nasal consonant is not pronounced separately;
613
+ it triggers nasal airflow throughout the vowel.
614
+
615
+ ALLOPHONIC VARIATION:
616
+ ---------------------
617
+ Exact nasal vowel quality varies by context:
618
+ - /am/, /an/ → [ɐ̃] in most contexts
619
+ - /am/, /an/ → [ə̃] in European Portuguese final position
620
+
621
+ We use phonemic representations, abstracting over fine detail.
622
+ """
623
+ if not self.NASAL_DIGRAPHS:
624
+ self.NASAL_DIGRAPHS = {
625
+ # Low vowel nasalization: /a/ + nasal
626
+ "am": "ɐ̃", # Example: campo [ˈkɐ̃pu]
627
+ "âm": "ɐ̃", # With circumflex (stress marker)
628
+ "an": "ɐ̃", # Example: santo [ˈsɐ̃tu]
629
+ "ân": "ɐ̃",
630
+
631
+ # Mid-high vowel nasalization: /e/ + nasal
632
+ "em": "ẽ", # Example: tempo [ˈtẽpu]
633
+ "êm": "ẽ",
634
+ "en": "ẽ", # Example: dente [ˈdẽtɨ]
635
+ "ên": "ẽ",
636
+
637
+ # High front vowel nasalization: /i/ + nasal
638
+ "im": "ĩ", # Example: sim [ˈsĩ]
639
+ "in": "ĩ", # Example: tinta [ˈtĩtɐ]
640
+
641
+ # Mid-back vowel nasalization: /o/ + nasal
642
+ "om": "õ", # Example: som [ˈsõ]
643
+ "ôm": "õ",
644
+ "on": "õ", # Example: fonte [ˈfõtɨ]
645
+ "ôn": "õ",
646
+
647
+ # High back vowel nasalization: /u/ + nasal
648
+ "um": "ũ", # Example: um [ˈũ]
649
+ "un": "ũ", # Example: fundo [ˈfũdu]
650
+ }
651
+
652
+ def _initialize_consonant_hiatus(self):
653
+ """
654
+ Define consonant clusters that span syllable boundaries.
655
+
656
+ SYLLABIFICATION PRINCIPLE:
657
+ --------------------------
658
+ Portuguese syllables prefer CV (consonant-vowel) structure.
659
+ Certain consonant clusters cannot be parsed as single onsets,
660
+ so they split across syllables with a hiatus (break).
661
+
662
+ HETEROSYLLABIC CLUSTERS:
663
+ ------------------------
664
+ These clusters are always split:
665
+
666
+ - cc, cç [k·s]: Represents /ks/ cluster
667
+ Examples: ficção [fik·ˈsɐ̃w̃], acção [ak·ˈsɐ̃w̃]
668
+ Note: Modern spelling often simplifies to ç: ação
669
+
670
+ - ct [k·t]: Voiceless stops across syllable boundary
671
+ Examples: pacto [ˈpak·tu], convicto [kõˈvik·tu]
672
+
673
+ - pt [p·t]: Bilabial + alveolar across boundary
674
+ Examples: apto [ˈap·tu], eucalipto [ew·kɐˈlip·tu]
675
+ Note: In some archaic words, 'p' was silent
676
+
677
+ - pç, pc [p·s]: Bilabial + fricative
678
+ Examples: opção [op·ˈsɐ̃w̃], núpcias [ˈnup·sjɐʃ]
679
+
680
+ SYLLABIFICATION ALGORITHM:
681
+ --------------------------
682
+ The syllabifier should recognize these as split clusters,
683
+ not as single onsets. The hiatus token (·) marks the boundary.
684
+ """
685
+ if not self.HETEROSYLLABIC_CLUSTERS:
686
+ self.HETEROSYLLABIC_CLUSTERS = {
687
+ "cç": "k·s", # convicção, ficção, friccionar,
688
+ "cc": "k·s", # friccionar, cóccix, facciosa, ficcionado, infecciologia, fraccionamento
689
+ "ct": "k·t", # compacto, convicto, pacto, pictural;
690
+ "pt": "p·t", # adepto, apto, díptico, inepto, rapto. eucalipto,
691
+ "pç": "p·s", # erupção, opção, recepção
692
+ "pc": "p·s", # núpcias
693
+ }
694
+
695
+ def _initialize_archaic_forms(self):
696
+ """
697
+ Define archaic silent consonant patterns.
698
+
699
+ HISTORICAL ORTHOGRAPHY:
700
+ -----------------------
701
+ Pre-2009, Portuguese preserved etymological consonants from Latin
702
+ even when not pronounced. The Acordo Ortográfico eliminated these.
703
+
704
+ SILENT CONSONANT RULES:
705
+ -----------------------
706
+ When 'p' appeared in clusters mpc, mpç, mpt:
707
+ - If 'p' was silent: m + p → n (in modern spelling)
708
+ - If 'p' was pronounced: cluster retained
709
+
710
+ Examples of elimination:
711
+ - assumpcão → assunção [ɐsũˈsɐ̃w̃]
712
+ - assumptível → assuntível
713
+ - peremptório → perentório
714
+
715
+ IMPLEMENTATION CHALLENGE:
716
+ -------------------------
717
+ Modern texts may still contain archaic spellings.
718
+ We need word lists to distinguish:
719
+ - Truly archaic: 'p' silent in both old and new spelling
720
+ - Etymological retention: 'p' pronounced (Egito [ˈɛʒitu] retained)
721
+
722
+ For now, we flag known archaic forms.
723
+ Future: Integrate comprehensive etymological dictionary.
724
+ """
725
+ if not self.ARCHAIC_MUTE_P:
726
+ self.ARCHAIC_MUTE_P = {
727
+ "mpc": {"assumpcionista"}, # → assuncionista
728
+ "mpç": {"assumpção"}, # → assunção
729
+ "mpt": {
730
+ "assumptível", # → assuntível
731
+ "peremptório", # → perentório
732
+ "sumptuoso", # → suntuoso
733
+ "sumptuosidade" # → suntuosidade
734
+ },
735
+ }
736
+
737
+ def _initialize_foreign_digraphs(self):
738
+ """
739
+ Define digraphs from loanwords and foreign names.
740
+
741
+ ADAPTATION RULES:
742
+ -----------------
743
+ Portuguese adapts foreign orthography to native phonology:
744
+
745
+ - ff [f]: Geminate f in Italian, French loanwords
746
+ Realized as single [f] in Portuguese
747
+ Examples: graffiti, buffet
748
+
749
+ - ll [l]: Geminate l (not palatal ʎ)
750
+ Realized as single [l]
751
+ Examples: Llosa, villa
752
+
753
+ - sh [ʃ]: English/Russian orthography
754
+ Adapted to Portuguese [ʃ]
755
+ Examples: show, shopping, Shostakovich
756
+
757
+ - th [t] or [d]: English/Greek orthography
758
+ Usually adapted to [t] (voiceless) or [d] (voiced)
759
+ Examples: thriller [ˈtɾilɛɾ], Athens [ɐˈtenɐʃ]
760
+ Note: Some speakers use [θ] (interdental), but non-standard
761
+
762
+ PRONUNCIATION VARIATION:
763
+ ------------------------
764
+ Loanword pronunciation varies by:
765
+ - Speaker's education/exposure
766
+ - Degree of word integration
767
+ - Formality of context
768
+
769
+ We provide standard Portuguese adaptations.
770
+ """
771
+ if not self.FOREIGN_DIGRAPH2IPA:
772
+ self.FOREIGN_DIGRAPH2IPA = {
773
+ "ff": "f", # Italian/French: graffiti
774
+ "ll": "l", # Spanish: paella (note: not palatal)
775
+ "sh": "ʃ", # English: show, shopping
776
+ "th": "t", # English: thriller (some use [d])
777
+ }
778
+
779
+ def _initialize_hiatus_prefixes(self):
780
+ """
781
+ Define prefixes that force vowel hiatus (block diphthong formation).
782
+
783
+ HIATUS vs DIPHTHONG:
784
+ --------------------
785
+ When two vowels meet, they can form:
786
+ 1. Diphthong: Single syllable (e.g., pai [ˈpaj])
787
+ 2. Hiatus: Separate syllables (e.g., pa·ís [pɐˈiʃ])
788
+
789
+ MORPHOLOGICAL HIATUS:
790
+ ---------------------
791
+ Prefix boundaries often block diphthongization:
792
+ - bi- + auricular → bi·auricular [bi.aw.ɾi.ku.ˈlaɾ]
793
+ NOT *[bjaw.ɾi.ku.ˈlaɾ]
794
+ - semi- + automático → semi·automático
795
+ - ante- + ontem → ante·ontem
796
+
797
+ PHONOLOGICAL MOTIVATION:
798
+ ------------------------
799
+ Hiatus preservation maintains morphological transparency
800
+ (clear prefix + root boundaries) and aids comprehension.
801
+
802
+ IMPLEMENTATION:
803
+ ---------------
804
+ During grapheme tokenization, if a prefix is detected,
805
+ insert a syllable boundary marker to prevent diphthong parsing.
806
+ """
807
+ if not self.HIATUS_PREFIXES:
808
+ self.HIATUS_PREFIXES = {
809
+ "ante", # ante-histórico, ante-ontem
810
+ "bi", # bi-auricular, bi-anual
811
+ "semi", # semi-automático, semi-urbano
812
+ "mini", # mini-autocarro, mini-ópera
813
+ "anti", # anti-inflação, anti-oxidante
814
+ "multi", # multi-étnico, multi-uso
815
+ "auto", # auto-observação (when doubled)
816
+ "contra", # contra-ataque
817
+ "extra", # extra-oficial
818
+ "hiper", # hiper-ativo
819
+ "inter", # inter-urbano
820
+ "intra", # intra-ocular
821
+ "neo", # neo-ortodoxo
822
+ "pré", # pré-escolar
823
+ "pró", # pró-ativo
824
+ "re", # re-eleger (when doubled)
825
+ "sub", # sub-humano
826
+ "super", # super-homem
827
+ "supra", # supra-ocular
828
+ "ultra", # ultra-ortodoxo
829
+ }
830
+
831
+ # TODO - hiatus suffixes. eg. for suffix "inha" - Vinha -> V.inha
832
+
833
+ def _initialize_diphthongs(self):
834
+ """
835
+ Define all Portuguese diphthongs (oral and nasal).
836
+
837
+ DIPHTHONG STRUCTURE:
838
+ --------------------
839
+ A diphthong is a vocalic sequence pronounced in one syllable,
840
+ consisting of a vowel (nucleus) and a semivowel (glide).
841
+
842
+ Classification:
843
+ 1. By direction:
844
+ - Falling/descending: V + G (rei, pau)
845
+ - Rising/ascending: G + V (piano, água)
846
+
847
+ 2. By nasalization:
848
+ - Oral: only oral airflow (rei)
849
+ - Nasal: nasal + oral airflow (mãe, cão)
850
+
851
+ FALLING ORAL DIPHTHONGS:
852
+ ------------------------
853
+ Ending in [j] (spelled i or e):
854
+ - [aj]: pai, cai, vai
855
+ - [ɐj]: unstressed variant (casa > casais)
856
+ - [ɛj]: rei, papéis
857
+ - [ej]: leite, sei
858
+ - [ɔj]: herói, dói
859
+ - [oj]: boi, foi
860
+ - [uj]: fui, azuis
861
+
862
+ Ending in [w] (spelled u or o):
863
+ - [iw]: viu, partiu
864
+ - [ew]: meu, seu
865
+ - [ɛw]: céu, véu
866
+ - [aw]: mau, pau
867
+ - [ɐw]: unstressed (casa > casão)
868
+ - [ow]: sou, ou
869
+
870
+ FALLING NASAL DIPHTHONGS:
871
+ -------------------------
872
+ - [ɐ̃j̃]: mãe, cães (spelled ãe)
873
+ - [ẽj̃]: bem, também (spelled em final)
874
+ - [õj̃]: põe, õfões (spelled õe)
875
+ - [ɐ̃w̃]: cão, mão (spelled ão)
876
+ - [ũj̃]: muito, muitos (special case)
877
+
878
+ BRAZILIAN PORTUGUESE L-VOCALIZATION:
879
+ ------------------------------------
880
+ In most Brazilian dialects, syllable-final /l/ → [w]:
881
+ - mal [ˈmaw] (European: [ˈmaɫ])
882
+ - sol [ˈsɔw] (European: [ˈsɔɫ])
883
+ - Brasil [bɾaˈziw] (European: [bɾɐˈziɫ])
884
+
885
+ This creates additional diphthongs not present in European Portuguese.
886
+ """
887
+ if not self.RISING_ORAL_DIPHTHONGS:
888
+ self.RISING_ORAL_DIPHTHONGS = {
889
+ # Falling diphthongs ending in [j]
890
+ "aj": "ai", # pai, cai (stressed)
891
+ "ɐj": "ai", # variant (unstressed)
892
+ "ɛj": "éi", # rei, papéis
893
+ "ej": "ei", # leite, sei
894
+ "ɔj": "ói", # herói, dói
895
+ "oj": "oi", # boi, foi
896
+ "uj": "ui", # fui, azuis
897
+
898
+ # Falling diphthongs ending in [w]
899
+ "iw": "iu", # viu, partiu
900
+ "ew": "eu", # meu, seu
901
+ "ɛw": "éu", # céu, véu
902
+ "aw": "au", # mau, pau
903
+ "ɐw": "ao", # unstressed variant
904
+ "ow": "ou", # sou, ou
905
+ }
906
+
907
+ if not self.FALLING_NASAL_DIPHTHONGS:
908
+ self.FALLING_NASAL_DIPHTHONGS = {
909
+ "ɐ̃j": "ãe", # mãe, cães, pães
910
+ "ẽj": "em", # bem, também (final position)
911
+ "õj": "õe", # põe, limões
912
+ "ɐ̃w": "ão", # cão, mão, pão
913
+ }
914
+
915
+ if not self.PTBR_DIPHTHONGS:
916
+ # Brazilian Portuguese L-vocalization diphthongs
917
+ self.PTBR_DIPHTHONGS = {
918
+ "aw": "al", # mal [ˈmaw]
919
+ "ɛw": "el", # mel [ˈmɛw]
920
+ "ew": "el", # feltro [ˈfew.tɾu]
921
+ "iw": "il", # funil [fu.ˈniw]
922
+ "ɔw": "ol", # sol [ˈsɔw]
923
+ "ow": "ol", # soldado [sow.ˈda.du]
924
+ "uw": "ul", # azul [a.ˈzuw]
925
+ }
926
+
927
+ # Compile reverse mapping: orthography → IPA
928
+ if not self.DIPHTHONG2IPA:
929
+ self.DIPHTHONG2IPA = {
930
+ **{v: k for k, v in self.RISING_ORAL_DIPHTHONGS.items()},
931
+ **{v: k for k, v in self.FALLING_NASAL_DIPHTHONGS.items()},
932
+ }
933
+
934
+ def _initialize_triphthongs(self):
935
+ """
936
+ Define Portuguese triphthongs (rare, mostly in foreign words).
937
+
938
+ TRIPHTHONG DEFINITION:
939
+ ----------------------
940
+ A sequence of three vowel-like sounds in one syllable:
941
+ semivowel + vowel + semivowel (G-V-G)
942
+
943
+ Examples:
944
+ - Uruguai [u.ɾu.ˈgwaj]: G[w] + V[a] + G[j]
945
+ - Paraguai [pɐ.ɾɐ.ˈgwaj]
946
+ - miau [ˈmjaw]: G[j] + V[a] + G[w]
947
+
948
+ PHONETIC REALITY:
949
+ -----------------
950
+ True triphthongs are rare cross-linguistically.
951
+ Many apparent triphthongs are:
952
+ - Diphthong + separate vowel across syllable boundary
953
+ - Regional variants that simplify to diphthongs
954
+
955
+ In European Portuguese, many potential triphthongs reduce:
956
+ - iei → [jej] or [jɐj] depending on dialect
957
+ Examples: fieira, macieira
958
+
959
+ ORTHOGRAPHIC AMBIGUITY:
960
+ -----------------------
961
+ Portuguese orthography doesn't distinguish triphthongs clearly.
962
+ Syllabification and stress determine the parse:
963
+ - ca.iu [kɐ.ˈju]: hiatus (two syllables)
964
+ - miau [ˈmjaw]: triphthong (one syllable)
965
+
966
+ We include common patterns and flag for special handling.
967
+ """
968
+ if not self.TRIPHTHONG2IPA:
969
+ self.TRIPHTHONG2IPA = {
970
+ # [w-a-j] sequence
971
+ "uai": "waj", # rare: Uruguai, Paraguai
972
+ # [w-ɐ̃-j] nasal sequence
973
+ "uão": "wɐ̃w", # rare: saguão
974
+ }
975
+
976
+ def _initialize_trigrams(self):
977
+ """
978
+ Define three-letter graphemes with special pronunciations.
979
+
980
+ TYPES OF TRIGRAPHS:
981
+ -------------------
982
+ 1. QU/GU before E/I with explicit vowel
983
+ 2. Vowel sequences in hiatus or special contexts
984
+ 3. Foreign word patterns
985
+
986
+ QUE/QUI/GUE/GUI AMBIGUITY:
987
+ --------------------------
988
+ These sequences are ambiguous in modern Portuguese:
989
+ - 'u' can be silent: quero [ˈkeɾu], guerra [ˈɡɛʁɐ]
990
+ - 'u' can be pronounced: equino [eˈkwinu], ambíguo [ɐ̃ˈbiɡwu]
991
+
992
+ Historical solution: Trema (ü) marked pronounced u
993
+ - lingüiça [lĩˈgwisɐ]: u pronounced
994
+ - linguiça [lĩˈgisɐ]: u silent
995
+
996
+ Modern challenge: No marking, must learn from context/etymology
997
+
998
+ DOUBLE-O SEQUENCES:
999
+ -------------------
1000
+ When prefix/root boundary creates -oo-, typically pronounced as:
1001
+ - Separate syllables: co.operação [ko.o.pɛ.ɾɐ.ˈsɐ̃w̃]
1002
+ - But may reduce in rapid speech
1003
+
1004
+ Special cases:
1005
+ - voo [ˈvo.u] or [ˈvow]: "flight" (noun from voar)
1006
+ - zoo [ˈzɔ.u] or [ˈzow]: "zoo"
1007
+
1008
+ We mark these for context-sensitive handling.
1009
+ """
1010
+ if not self.TRIGRAM2IPA:
1011
+ self.TRIGRAM2IPA = {
1012
+ "tch": "tʃ", # the only true trigraph in portuguese
1013
+
1014
+ # QU/GU patterns (context-dependent, flagged for special handling)
1015
+ "que": "kɨ", # quero (default: u silent)
1016
+ "qui": "ki", # quia
1017
+ "gue": "ɡɨ", # guerra
1018
+ "gui": "ɡi", # guia
1019
+ "qué": "kɛ", # with explicit stress
1020
+ "gué": "ɡɛ",
1021
+ "quê": "ke",
1022
+ "guê": "ɡe",
1023
+
1024
+ # Double-O patterns (prefix boundaries)
1025
+ "coo": "ku.u", # cooperar, coordenar
1026
+ "joo": "ʒo.u", # enjoo
1027
+ "noo": "nu.u", # noológico
1028
+ "zoo": "zu.u", # zoologia, zoo
1029
+ "voo": "vo.u", # voo, revoo
1030
+
1031
+ # Foreign patterns
1032
+ "boo": "bu.u", # booleano
1033
+ "too": "tu.u", # cartoonista
1034
+ "woo": "wu.u", # Hollywood
1035
+ "hoo": "u.u", # hooliganismo
1036
+
1037
+ # Nasal patterns
1038
+ "ção": "sɐ̃w̃", # -ção suffix (very common)
1039
+ "ões": "õj̃ʃ", # plural -ões
1040
+ }
1041
+
1042
+ def _initialize_tetragrams(self):
1043
+ """
1044
+ Define four-letter graphemes (very rare).
1045
+
1046
+ TETRAGRAPH CONTEXTS:
1047
+ --------------------
1048
+ Four-letter sequences with special pronunciation arise from:
1049
+ 1. Suffix attachment: -ense, -iano
1050
+ 2. Compound formation
1051
+ 3. Loanwords
1052
+
1053
+ Most are analyzable as diphthong + digraph or similar,
1054
+ but we list them explicitly for pattern recognition.
1055
+
1056
+ GENTILICS (DEMONYMS):
1057
+ ---------------------
1058
+ -iense suffix (indicating origin) creates potential tetragraphs:
1059
+ - gaiense: from Gaia [ɡɐj.ˈẽ.sɨ] or [ɡɐ.jẽ.sɨ]
1060
+ - praiense: from Praia
1061
+ - xangaiense: from Shanghai
1062
+
1063
+ Syllabification is variable and dialect-dependent.
1064
+ """
1065
+ if not self.TETRAGRAM2IPA:
1066
+ self.TETRAGRAM2IPA = {
1067
+ "aien": "ɐj.ẽ", # gaiense, praiense, xangaiense
1068
+
1069
+ # Foreign words / proper nouns
1070
+ "guai": "gwaj", # Uruguai, Paraguai
1071
+ "quai": "kwaj",
1072
+
1073
+ # hiatus
1074
+ "iaiá": "i.ɐ.ˈja", # iaiá (Brazilian: nanny, lady)
1075
+ }
1076
+
1077
+ def _initialize_default_chars(self):
1078
+ """
1079
+ Define baseline character-to-phoneme mappings.
1080
+
1081
+ DESIGN PRINCIPLE:
1082
+ -----------------
1083
+ These are CONTEXT-FREE default mappings.
1084
+ Many characters have context-sensitive realizations
1085
+ that override these defaults. Context rules are applied
1086
+ during IPA generation in the CharToken class.
1087
+
1088
+ VOWELS:
1089
+ -------
1090
+ Portuguese has 9 oral vowel phonemes in stressed position:
1091
+ /i, e, ɛ, a, ɐ, ɔ, o, u/ (plus nasal vowels)
1092
+
1093
+ Unstressed vowels reduce to smaller inventory:
1094
+ /i, ɨ, u, ɐ/ (European Portuguese)
1095
+ /i, u, a/ (Brazilian Portuguese - less reduction)
1096
+
1097
+ Default mapping uses neutral/unstressed values where ambiguous.
1098
+
1099
+ CONSONANTS:
1100
+ -----------
1101
+ Most consonants have straightforward mappings.
1102
+ Exceptions (context-sensitive):
1103
+ - c: [k] default, but [s] before e/i
1104
+ - g: [ɡ] default, but [ʒ] before e/i
1105
+ - r: [ɾ] default (tap), but [ʁ]/[ʀ] word-initially or after n/l/s
1106
+ - s: [s] default, but [z] intervocalically
1107
+ - x: [ʃ] default, but can be [ks], [z], [s], [gz] contextually
1108
+ - z: [z] default, but [ʃ]/[s] word-finally
1109
+
1110
+ SILENT LETTERS:
1111
+ ---------------
1112
+ - h: Always silent except in digraphs (ch, nh, lh)
1113
+ - u: Silent in que/qui, gue/gui contexts (modern orthography)
1114
+ """
1115
+ if not self.DEFAULT_CHAR2PHONEMES:
1116
+ self.DEFAULT_CHAR2PHONEMES = {
1117
+ # VOWELS
1118
+ # Low vowel: stressed [a], unstressed [ɐ]
1119
+ "a": "ɐ", # Default: reduced (unstressed) value
1120
+ "á": "a", # Acute: stressed open value
1121
+ "à": "a", # Grave: (rare) stressed
1122
+ "â": "ɐ", # Circumflex: stressed closed value (often [ɐ])
1123
+ "ã": "ɐ̃", # Nasal low vowel
1124
+
1125
+ # Mid-front vowel: stressed [e] or [ɛ], unstressed [ɨ]
1126
+ "e": "ɨ", # Default: reduced (European Portuguese)
1127
+ "é": "ɛ", # Acute: stressed open value
1128
+ "ê": "e", # Circumflex: stressed closed value
1129
+
1130
+ # High-front vowel: always [i]
1131
+ "i": "i",
1132
+ "í": "i", # Stress marker only (no quality change)
1133
+
1134
+ # Mid-back vowel: stressed [o] or [ɔ], unstressed [u]
1135
+ "o": "u", # Default: reduced (European Portuguese)
1136
+ "ó": "ɔ", # Acute: stressed open value
1137
+ "ô": "o", # Circumflex: stressed closed value
1138
+ "õ": "õ", # Nasal mid-back vowel
1139
+
1140
+ # High-back vowel: always [u]
1141
+ "u": "u",
1142
+ "ú": "u", # Stress marker only
1143
+
1144
+ # CONSONANTS
1145
+ # Stops
1146
+ "p": "p", # Voiceless bilabial stop
1147
+ "b": "b", # Voiced bilabial stop
1148
+ "t": "t", # Voiceless alveolar stop
1149
+ "d": "d", # Voiced alveolar stop
1150
+ "k": "k", # Voiceless velar stop (foreign)
1151
+ "c": "k", # Default: voiceless velar stop
1152
+ "q": "k", # Always voiceless velar (+ u)
1153
+ "g": "ɡ", # Default: voiced velar stop
1154
+
1155
+ # Fricatives
1156
+ "f": "f", # Voiceless labiodental fricative
1157
+ "v": "v", # Voiced labiodental fricative
1158
+ "s": "s", # Default: voiceless alveolar fricative
1159
+ "z": "z", # Voiced alveolar fricative
1160
+ "ç": "s", # Voiceless alveolar fricative (c-cedilla)
1161
+ "j": "ʒ", # Voiced postalveolar fricative
1162
+ "x": "ʃ", # Default: voiceless postalveolar fricative
1163
+
1164
+ # Nasals
1165
+ "m": "m", # Bilabial nasal
1166
+ "n": "n", # Alveolar nasal
1167
+
1168
+ # Liquids
1169
+ "l": "l", # Alveolar lateral
1170
+ "r": "ɾ", # Default: alveolar tap
1171
+
1172
+ # Semivowels (in consonantal position)
1173
+ "w": "w", # Labiovelar approximant (foreign)
1174
+ "y": "j", # Palatal approximant (foreign, rare)
1175
+
1176
+ # Silent
1177
+ "h": "", # Always silent in Portuguese
1178
+ }
1179
+
1180
+ def _initialize_stress_rules(self):
1181
+ """
1182
+ Define patterns that predict stress placement.
1183
+
1184
+ PORTUGUESE STRESS SYSTEM:
1185
+ -------------------------
1186
+ Portuguese stress is SEMI-PREDICTABLE based on word shape:
1187
+
1188
+ DEFAULT RULE (Paroxytone):
1189
+ - Stress falls on penultimate (second-to-last) syllable
1190
+ - Applies to ~80% of words
1191
+ - Examples: casa, livro, falam
1192
+
1193
+ OXYTONE EXCEPTIONS (final syllable stress):
1194
+ - Words ending in: -r, -l, -z, -im, -um, nasal vowels
1195
+ - Examples: falar, azul, rapaz, jardim, atum, maçã
1196
+ - Loanwords often follow this pattern: hotel, bar
1197
+
1198
+ PROPAROXYTONE (antepenultimate stress):
1199
+ - ALWAYS marked with written accent
1200
+ - Less common (~5% of words)
1201
+ - Examples: médico, lâmpada, ótimo
1202
+ - Mostly erudite words, Latin borrowings
1203
+
1204
+ MONOSYLLABLES:
1205
+ - Inherently stressed (no choice of syllable)
1206
+ - May have accent for semantic distinction:
1207
+ - pé [ˈpɛ] "foot" vs pê [ˈpe] "letter P"
1208
+
1209
+ WRITTEN ACCENT RULES:
1210
+ ---------------------
1211
+ Accents are written to mark:
1212
+ 1. Unexpected stress position (proparoxytones)
1213
+ 2. Vowel quality (é [ɛ] vs ê [e])
1214
+ 3. Disambiguation (pára "stops" vs para "for")
1215
+
1216
+ NOTE: The 1990/2009 Acordo Ortográfico changed some rules,
1217
+ eliminating some accents (e.g., trema) and disambiguators.
1218
+ """
1219
+ if not self.OXYTONE_ENDINGS:
1220
+ self.OXYTONE_ENDINGS = {
1221
+ # Consonant endings that trigger final stress
1222
+ "r", # falar, comer, partir
1223
+ "l", # azul, papel, farol
1224
+ "z", # rapaz, feliz, capaz
1225
+ "x", # fax, latex (loanwords)
1226
+
1227
+ # Nasal endings (final nasal vowels are stressed)
1228
+ "m", # jardim, atum, homem
1229
+ "n", # hífen (rare, mostly foreign)
1230
+ "ão", # cão, mão (diphthong)
1231
+ "ãe", # mãe, cães
1232
+ "õe", # põe, limões
1233
+
1234
+ # Diphthong endings
1235
+ "éi", # papéis, hotéis
1236
+ "éu", # troféu, céu
1237
+ "ói", # herói, anzóis
1238
+ "au", # grau, pau
1239
+ "áu", #
1240
+
1241
+ # Explicit stress markers (always stressed)
1242
+ "á", "é", "í", "ó", "ú",
1243
+ "â", "ê", "ô",
1244
+ "ã", "õ",
1245
+ }
1246
+
1247
+ def _compile_grapheme_inventory(self):
1248
+ """
1249
+ Compile sorted list of all multi-character graphemes.
1250
+
1251
+ PURPOSE:
1252
+ --------
1253
+ During tokenization, we need to recognize multi-character units
1254
+ (digraphs, diphthongs, etc.) before processing individual characters.
1255
+
1256
+ GREEDY MATCHING PRINCIPLE:
1257
+ --------------------------
1258
+ Longer sequences must be checked first to avoid incorrect parses:
1259
+ - Incorrect: "ch" → ['c', 'h'] → [k, (silent)]
1260
+ - Correct: "ch" → ['ch'] → [ʃ]
1261
+
1262
+ SORTING:
1263
+ --------
1264
+ We sort by length (descending) to ensure longest match wins.
1265
+ Within same length, alphabetical order for deterministic behavior.
1266
+
1267
+ INVENTORY SOURCES:
1268
+ ------------------
1269
+ - Tetragraphs (4 chars)
1270
+ - Trigraphs (3 chars)
1271
+ - Triphthongs (3 chars)
1272
+ - Digraphs: consonant, nasal, foreign
1273
+ - Diphthongs (2 chars)
1274
+ - Consonant hiatus patterns
1275
+ - Hiatus prefixes
1276
+ - Archaic forms
1277
+
1278
+ Single characters are NOT included (handled separately).
1279
+ """
1280
+ if not self.GRAPHEME_INVENTORY:
1281
+ # Collect all multi-character graphemes
1282
+ all_graphemes = set()
1283
+
1284
+ # add vowel inventory
1285
+ all_graphemes.update(self.ALL_VOWEL_CHARS)
1286
+
1287
+ # Add from all mapping dictionaries
1288
+ all_graphemes.update(self.TETRAGRAM2IPA.keys())
1289
+ all_graphemes.update(self.TRIGRAM2IPA.keys())
1290
+ all_graphemes.update(self.TRIPHTHONG2IPA.keys())
1291
+ all_graphemes.update(self.DIGRAPH2IPA.keys())
1292
+ all_graphemes.update(self.DIPHTHONG2IPA.keys())
1293
+ all_graphemes.update(self.FOREIGN_DIGRAPH2IPA.keys())
1294
+ all_graphemes.update(self.HETEROSYLLABIC_CLUSTERS.keys())
1295
+ all_graphemes.update(self.NASAL_DIGRAPHS.keys())
1296
+
1297
+ # Add prefixes and archaic forms
1298
+ all_graphemes.update(self.HIATUS_PREFIXES)
1299
+ all_graphemes.update(self.ARCHAIC_MUTE_P.keys())
1300
+
1301
+ # Add single characters for completeness
1302
+ all_graphemes.update(string.ascii_lowercase)
1303
+ all_graphemes.update(string.punctuation)
1304
+
1305
+ # Sort: longest first (for greedy matching), then alphabetical
1306
+ self.GRAPHEME_INVENTORY = sorted(
1307
+ all_graphemes,
1308
+ key=lambda x: (-len(x), x)
1309
+ )
1310
+
1311
+
1312
# Shared base inventory, constructed with only dialect_code="pt" so that no
# table is overridden. The dialect subclasses defined below copy tables from
# this instance (e.g. AO1990.DIGRAPH2IPA, AO1990.DEFAULT_CHAR2PHONEMES) and
# then layer their own divergences on top.
#
# the base ruleset is based on Acordo Ortográfico de 1990, in effect since 2009
# https://pt.wikipedia.org/wiki/Acordo_Ortogr%C3%A1fico_de_1990
# http://www.portaldalinguaportuguesa.org/acordo.php
AO1990 = DialectInventory(dialect_code="pt")
1316
+
1317
+
1318
+ # =============================================================================
1319
+ # DIALECT INSTANCES
1320
+ # =============================================================================
1321
+
1322
class EuropeanPortuguese(DialectInventory):
    """
    European Portuguese (Portugal) phonological inventory.

    CHARACTERISTIC FEATURES:
    ------------------------
    1. VOWEL REDUCTION: unstressed vowels reduce heavily
       - /a/ → [ɐ], /e/ → [ɨ] (close central), /o/ → [u]
       Example: "pedir" [pɨˈdiɾ], "casa" [ˈkazɐ]

    2. FRICATIVE PALATALIZATION: final /s, z/ → [ʃ, ʒ]
       - "três" [ˈtɾeʃ], "luz" [ˈluʃ]
       - Before voiceless consonants: /s/ → [ʃ]
       - Before voiced consonants: /s/ → [ʒ]

    3. DARK L: coda /l/ realized as velarized [ɫ]
       - "Brasil" [bɾɐˈziɫ], "mal" [ˈmaɫ]

    4. UVULAR R: strong /R/ often realized as uvular [ʁ]
       - "rato" [ˈʁatu], "carro" [ˈkaʁu]
       (some regions use the alveolar trill [r])

    5. NASAL VOWELS: highly nasalized
       - "mão" [ˈmɐ̃w̃], "bem" [ˈbẽj̃]

    OVERRIDES ON TOP OF AO1990:
    ---------------------------
    - FALLING_NASAL_DIPHTHONGS: adds the nasalized "ui" of "muito"
    - TRIPHTHONG2IPA: adds "iei" and "iau"
    - IRREGULAR_WORDS: whole-word transcriptions for "muito" and "miau"
    """

    def __init__(self):
        nasal_diphthongs = dict(AO1990.FALLING_NASAL_DIPHTHONGS)
        # muito (special nasalized case)
        # NOTE(review): if the spelling → IPA diphthong lookup is built by
        # reversing the oral table and then this one, the spelling "ui"
        # would resolve to the nasal value for every word (fui, azuis),
        # not only "muito" — confirm against the consumer of that lookup.
        nasal_diphthongs["ũj"] = "ui"

        triphthongs = dict(AO1990.TRIPHTHONG2IPA)
        # [j-e-j] sequence: chieira, macieira, pardieiro
        # (alternative Lisbon realization with vowel reduction: "jɐj")
        triphthongs["iei"] = "jej"
        # [j-a-w] sequence: miau
        triphthongs["iau"] = "jaw"

        irregular_words = {
            # "ui" nasalized in "muito"
            "muito": "ˈmũj.tu",
            # Single-syllable special cases
            "miau": "ˈmjaw",
        }

        super().__init__(
            dialect_code="pt-PT",
            FALLING_NASAL_DIPHTHONGS=nasal_diphthongs,
            TRIPHTHONG2IPA=triphthongs,
            IRREGULAR_WORDS=irregular_words,
        )
1376
+
1377
+
1378
+ # =============================================================================
1379
+ # BRAZILIAN PORTUGUESE (pt-BR)
1380
+ # =============================================================================
1381
+
1382
class BrazilianPortuguese(DialectInventory):
    """
    Brazilian Portuguese phonological inventory.

    MAJOR DIFFERENCES FROM EUROPEAN:
    --------------------------------
    1. LESS VOWEL REDUCTION: unstressed vowels maintain quality
       - European "pedir" [pɨˈdiɾ] vs. Brazilian "pedir" [peˈdʒiɾ]
       - European "casa" [ˈkazɐ] vs. Brazilian "casa" [ˈkaza]
       - /a/ stays [a], /e/ stays [e], /o/ stays [o]

    2. PALATALIZATION: /t, d/ → [tʃ, dʒ] before [i]
       - "tia" [ˈtʃiɐ] (European: [ˈtiɐ])
       - "dia" [ˈdʒiɐ] (European: [ˈdiɐ])
       - "noite" [ˈnojtʃi] (European: [ˈnojtɨ])
       - "grande" [ˈɡɾɐ̃dʒi] (European: [ˈɡɾɐ̃dɨ])

    3. L-VOCALIZATION: syllable-final /l/ → [w]
       - "Brasil" [bɾaˈziw] (European: [bɾɐˈziɫ])
       - "mal" [ˈmaw], "sol" [ˈsɔw]
       - Creates new diphthongs: -al, -el, -il, -ol, -ul

    4. DIFFERENT R SOUNDS: regional variation
       - São Paulo/South: [ɾ] (tap) and [x]/[h]
       - Rio: [ʁ] (uvular) and [x]/[h]
       - Rural areas: may preserve alveolar trill [r]
       - "carro" [ˈkaxu] (SP) vs. [ˈkaʁu] (Rio) vs. [ˈkaru] (rural)

    5. FINAL /s/: stays [s], doesn't palatalize
       - "três" [ˈtɾes] (European: [ˈtɾeʃ])
       - "nós" [ˈnɔs] (European: [ˈnɔʃ])

    6. LESS NASAL: nasal vowels less nasalized than European

    7. OPEN VOWELS IN STRESSED POSITION:
       - Greater tendency toward open vowels [ɛ, ɔ] when stressed
       - "café" [kaˈfɛ], "avó" [aˈvɔ]

    OVERRIDES ON TOP OF AO1990:
    ---------------------------
    Only the strong-R digraph and the default values of a, â, e, o, r
    are overridden here.
    """

    def __init__(self):
        digraphs = dict(AO1990.DIGRAPH2IPA)
        # DIVERGENCE: Brazilian uses [h] or [x] instead of [ʁ]
        digraphs["rr"] = "h"

        char_defaults = dict(AO1990.DEFAULT_CHAR2PHONEMES)
        # VOWELS - LESS REDUCTION IN BRAZILIAN
        char_defaults["a"] = "a"  # DIVERGENCE: stays [a], not [ɐ]
        char_defaults["â"] = "a"  # DIVERGENCE: stays [a], not [ɐ]
        char_defaults["e"] = "e"  # DIVERGENCE: stays [e], not [ɨ]
        char_defaults["o"] = "o"  # DIVERGENCE: stays [o], not [u]
        # CONSONANTS
        char_defaults["r"] = "ɾ"  # DIVERGENCE: tap, strong R is [h]

        super().__init__(
            dialect_code="pt-BR",
            DIGRAPH2IPA=digraphs,
            DEFAULT_CHAR2PHONEMES=char_defaults,
        )
1445
+
1446
+
1447
+ # =============================================================================
1448
+ # ANGOLAN PORTUGUESE (pt-AO)
1449
+ # =============================================================================
1450
+
1451
class AngolanPortuguese(DialectInventory):
    """
    Angolan Portuguese phonological inventory.

    CHARACTERISTIC FEATURES:
    ------------------------
    1. BASE: similar to European Portuguese but with modifications

    2. VOWEL REDUCTION: less than European, more than Brazilian
       - Influenced by Bantu substrate (Kimbundu, Umbundu, Kikongo)

    3. R SOUNDS: consistent alveolar trill [r]
       - Preserves the distinction between tap [ɾ] and trill [r]
       - "carro" [ˈkaru] (not [ˈkaʁu] or [ˈkaxu])

    4. PROSODY: influenced by Bantu tone languages
       - May have different intonation patterns
       - Stress patterns similar to European

    5. FINAL /s/: generally [ʃ] like European
       - "três" [ˈtɾeʃ]

    6. SUBSTRATE INFLUENCE: phonological features from Bantu languages

    OVERRIDES ON TOP OF AO1990:
    ---------------------------
    Only the strong-R digraph and the default values of e, o, r are
    overridden here.
    """

    def __init__(self):
        digraphs = dict(AO1990.DIGRAPH2IPA)
        # DIVERGENCE: Angolan uses alveolar trill [r]
        digraphs["rr"] = "r"

        # Moderate vowel reduction (between European and Brazilian)
        char_defaults = dict(AO1990.DEFAULT_CHAR2PHONEMES)
        char_defaults["e"] = "e"  # DIVERGENCE: Less reduction than European [ɨ]
        char_defaults["o"] = "o"  # DIVERGENCE: Less reduction than European [u]
        char_defaults["r"] = "ɾ"  # DIVERGENCE: Strong R is [r], not [ʁ]

        super().__init__(
            dialect_code="pt-AO",
            DIGRAPH2IPA=digraphs,
            DEFAULT_CHAR2PHONEMES=char_defaults,
        )
1494
+
1495
+
1496
+ # =============================================================================
1497
+ # MOZAMBICAN PORTUGUESE (pt-MZ)
1498
+ # =============================================================================
1499
+
1500
class MozambicanPortuguese(DialectInventory):
    """
    Mozambican Portuguese phonological inventory.

    CHARACTERISTIC FEATURES:
    ------------------------
    1. BASE: similar to European Portuguese with Bantu substrate

    2. VOWEL REDUCTION: variable, generally less than European
       - Influenced by substrate languages (Makhuwa, Tsonga, Sena)
       - May preserve more vowel distinctions

    3. R SOUNDS: alveolar trill [r] common
       - Similar to Angolan Portuguese
       - "carro" [ˈkaru]

    4. REGIONAL VARIATION:
       - North (Nampula): more substrate influence
       - South (Maputo): closer to European/South African Portuguese

    5. FINAL /s/: generally [ʃ] like European
       - "nós" [ˈnɔʃ]

    6. PROSODY: Bantu-influenced intonation
       - May have different rhythm patterns

    OVERRIDES ON TOP OF AO1990:
    ---------------------------
    Only the strong-R digraph and the default values of e, o, r are
    overridden here.
    """

    def __init__(self):
        digraphs = dict(AO1990.DIGRAPH2IPA)
        # DIVERGENCE: Mozambican uses alveolar trill [r]
        digraphs["rr"] = "r"

        # Moderate vowel reduction (between European and Brazilian)
        char_defaults = dict(AO1990.DEFAULT_CHAR2PHONEMES)
        char_defaults["e"] = "e"  # DIVERGENCE: Less reduction than European [ɨ]
        char_defaults["o"] = "o"  # DIVERGENCE: Less reduction than European [u]
        char_defaults["r"] = "ɾ"  # DIVERGENCE: Strong R is [r], not [ʁ]

        super().__init__(
            dialect_code="pt-MZ",
            DIGRAPH2IPA=digraphs,
            DEFAULT_CHAR2PHONEMES=char_defaults,
        )
1541
+
1542
+
1543
+ # =============================================================================
1544
+ # TIMORESE PORTUGUESE (pt-TL)
1545
+ # =============================================================================
1546
+
1547
class TimoresePortuguese(DialectInventory):
    """
    Timorese Portuguese (East Timor) phonological inventory.

    CHARACTERISTIC FEATURES:
    ------------------------
    1. BASE: European Portuguese with Austronesian substrate influence
       - Primary substrate: Tetum
       - Also influenced by Indonesian

    2. L2 FEATURES: Portuguese often learned as second language
       - May show substrate transfer from Tetum
       - More conservative/formal pronunciation
       - Less naturalistic reduction

    3. VOWEL SYSTEM: Similar to European but may be simpler
       - Less vowel reduction than European
       - May neutralize some distinctions

    4. R SOUNDS: Variable
       - May use alveolar tap [ɾ] and trill [r]
       - Less uvular [ʁ] than European

    5. FINAL /s/: Generally [ʃ] like European
       - "nós" [ˈnɔʃ]

    6. SMALLER SPEAKER BASE: Portuguese is official but less widely native
       - More formal/prescriptive forms common
       - Less dialectal innovation

    OVERRIDES ON TOP OF AO1990:
    ---------------------------
    The strong-R digraph and the default values of a, e, o, r.
    """

    def __init__(self):
        # Initialize exactly once, and with the Timorese code. A second
        # super().__init__ call with dialect_code="pt-MZ" used to run
        # after the first one and relabelled this inventory as Mozambican.
        super().__init__(
            dialect_code="pt-TL",
            DIGRAPH2IPA={
                **AO1990.DIGRAPH2IPA,
                "rr": "r",  # DIVERGENCE: alveolar trill [r]
            },
            # Moderate vowel reduction (between European and Brazilian)
            DEFAULT_CHAR2PHONEMES={
                **AO1990.DEFAULT_CHAR2PHONEMES,
                "a": "a",  # DIVERGENCE: Less reduction
                "e": "e",  # DIVERGENCE: Less reduction than European [ɨ]
                "o": "o",  # DIVERGENCE: Less reduction than European [u]
                "r": "ɾ",  # DIVERGENCE: Strong R is [r], not [ʁ]
            },
        )
1595
+
1596
+
1597
+ # =============================================================================
1598
+ # Helper Functions
1599
+ # =============================================================================
1600
+
1601
def detect_stress_position(word: str, syllables: List[str], dialect: "DialectInventory") -> int:
    """
    Determine which syllable carries primary stress.

    ALGORITHM:
    ----------
    1. Zero or one syllable → index 0
    2. First syllable containing an explicit accent mark
       (any character of dialect.PRIMARY_STRESS_MARKERS) → that syllable
    3. Word ends with one of dialect.OXYTONE_ENDINGS → final syllable
    4. Otherwise → penultimate syllable (paroxytone rule)

    Args:
        word: Normalized word string
        syllables: List of syllables
        dialect: DialectInventory providing PRIMARY_STRESS_MARKERS and
            OXYTONE_ENDINGS

    Returns:
        Index of stressed syllable (0-based). Never negative: an empty
        syllable list yields 0 rather than an out-of-range index.

    Examples:
        >>> detect_stress_position("café", ["ca", "fé"], dialect)
        1  # Final syllable (explicit accent)

        >>> detect_stress_position("casa", ["ca", "sa"], dialect)
        0  # Penultimate (default)

        >>> detect_stress_position("falar", ["fa", "lar"], dialect)
        1  # Final (ends in -r)
    """
    n_syllables = len(syllables)

    # Monosyllables are inherently stressed; an empty list has no other
    # meaningful answer and must not fall through to "n - 1" below.
    if n_syllables <= 1:
        return 0

    # Check for explicit accent marks (primary stress markers)
    stress_markers = dialect.PRIMARY_STRESS_MARKERS
    for idx, syllable in enumerate(syllables):
        if any(char in syllable for char in stress_markers):
            return idx

    # Check for oxytone word endings (final stress); str.endswith accepts
    # a tuple of candidate suffixes.
    if word.endswith(tuple(dialect.OXYTONE_ENDINGS)):
        return n_syllables - 1

    # Default: paroxytone (penultimate stress); n_syllables >= 2 here.
    return n_syllables - 2
1647
+
1648
+
1649
def is_grapheme_silent(grapheme: str, context_before: str, context_after: str,
                       word: str, dialect: DialectInventory) -> bool:
    """
    Determine if a grapheme has no phonetic realization (silent).

    SILENT CATEGORIES:
    ------------------
    1. H: Always reported silent here (digraphs ch/nh/lh are expected to be
       handled as units elsewhere)
    2. U in QU/GU: Silent before e/i in modern orthography
       - Exception: a small built-in list of words with pronounced [w]
    3. Archaic consonants: p in mpt/mpc/mpç, but only for words listed in
       ``dialect.ARCHAIC_MUTE_P`` under a cluster contained in the word
    4. Doubled consonants (rr, ss, ...) are NOT handled here; they are
       expected to arrive as a single grapheme unit

    Args:
        grapheme: The grapheme to check
        context_before: Text to the left of the grapheme. May be a single
            character or the whole word prefix; only its last character
            is inspected.
        context_after: Text to the right of the grapheme; only its first
            character is inspected.
        word: Full word (for irregular word lookup)
        dialect: DialectInventory providing ARCHAIC_MUTE_P

    Returns:
        True if grapheme is silent, False otherwise

    Examples:
        >>> is_grapheme_silent('h', '', 'oje', 'hoje', dialect)
        True   # h is always silent

        >>> is_grapheme_silent('u', 'q', 'ero', 'quero', dialect)
        True   # u silent in 'que'

        >>> is_grapheme_silent('u', 'eq', 'ino', 'equino', dialect)
        False  # listed exception: u pronounced
    """
    g = grapheme.lower()
    before = context_before.lower()
    after = context_after.lower()
    word_lc = word.lower()
    following = after[:1]  # "" when there is no right context

    # H is always silent in Portuguese
    if g == "h":
        return True

    # U after Q or G before E or I (modern orthography default: silent).
    # endswith() is used so the rule also fires when the caller supplies the
    # whole word prefix (e.g. "aq" for "aquele"), not just one character.
    if g == "u" and before.endswith(("q", "g")) and following in ("e", "i"):
        # Known exceptions where the u is pronounced [w]. Each word must be
        # its own entry; "antiguidade" and "linguiça" were previously fused
        # into one string and therefore never matched.
        pronounced_u = {
            "equino", "antiguidade", "linguiça", "pinguim",
            "frequente", "frequentemente",
        }
        return word_lc not in pronounced_u

    # Archaic silent P in mpc/mpç/mpt. The dialect word lists are the final
    # authority: the P is only muted for explicitly listed words.
    if g == "p" and before.endswith("m") and following in ("c", "ç", "t"):
        for cluster, words in dialect.ARCHAIC_MUTE_P.items():
            if cluster in word_lc and (word in words or word_lc in words):
                return True

    # First consonant in geminate digraphs (rr, ss, ff, ll, mm):
    # handled at grapheme level, not here (the grapheme would be "rr" as a
    # unit, not two separate 'r's).
    return False
1713
+
1714
+
1715
+ # =============================================================================
1716
+ # CHARACTER TOKEN
1717
+ # =============================================================================
1718
+
1719
@dataclasses.dataclass
class CharToken:
    """
    Represents a single character with its phonological context.

    LINGUISTIC ROLE:
    ----------------
    Characters are the atomic units of orthography.
    Their phonetic realization depends on:
    - Inherent properties (vowel/consonant, diacritics)
    - Linear context (preceding/following characters)
    - Hierarchical context (parent grapheme, syllable, word)
    - Prosodic context (stress, position in word)

    DESIGN RATIONALE:
    -----------------
    We track both the character itself and its context to enable
    context-sensitive phonological rules. Word/sentence indices are derived
    from the parent grapheme in ``__post_init__``. A token created without a
    parent grapheme is allowed: it keeps whatever indices were passed in
    (default -1) and reports empty prefix/suffix context.

    Attributes:
        surface: The actual character string (may include diacritics);
            must be exactly one character
        char_idx: Position within parent grapheme (0-based)
        parent_grapheme: GraphemeToken containing this character, or None
        dialect: DialectInventory with phonological rules
    """

    # Function words (prepositions, determiners, contractions, oblique
    # pronouns) whose vowels get a fixed clitic realization. Un-annotated on
    # purpose so the dataclass machinery does not treat them as fields.
    _FUNCTION_WORDS = frozenset({
        # prepositions / articles
        "a", "o", "as", "os", "de", "em", "por",
        # determiners
        "da", "do", "das", "dos",
        # contractions (em + a/o -> na/no; para -> pra)
        "na", "no", "nas", "nos", "pra",
        # oblique pronouns
        "me", "te", "se",
        "le", "lo", "la",
        "les", "los", "las",
        "lhe", "lho", "lha",
        "lhes", "lhos", "lhas",
    })
    # Clitic vowel qualities: Brazilian keeps fuller vowels, all other
    # dialects use the reduced set.
    _CLITIC_VOWELS_BR = {"a": "a", "e": "e", "o": "o"}
    _CLITIC_VOWELS_REDUCED = {"a": "ɐ", "e": "ɨ", "o": "u"}

    surface: str
    char_idx: int = 0  # parent_grapheme.characters[char_idx] == self
    parent_grapheme: Optional["GraphemeToken"] = None
    dialect: DialectInventory = dataclasses.field(default_factory=EuropeanPortuguese)

    # Precomputed indices (set during initialization)
    _idx_in_word: int = -1
    _idx_in_sentence: int = -1

    def __post_init__(self):
        """
        Validate the surface and derive word/sentence indices.

        Raises:
            ValueError: if ``surface`` is not exactly one character.
        """
        if len(self.surface) != 1:
            raise ValueError(f"CharToken must contain exactly one character, got: {self.surface}")

        # parent_grapheme defaults to None; only derive indices when a parent
        # exists instead of failing with AttributeError.
        if self.parent_grapheme is not None:
            self._idx_in_word = self.parent_grapheme.idx_in_word + self.char_idx
            self._idx_in_sentence = self.parent_grapheme.idx_in_sentence + self.char_idx

    # =========================================================================
    # BASIC PROPERTIES
    # =========================================================================

    @cached_property
    def normalized(self) -> str:
        """
        Lowercase, stripped form of the character, remapped through
        ``dialect.NORMALIZED_VOWELS`` when an entry exists.

        Per the original author's notes the table maps archaic/foreign
        diacritics to standard equivalents (e.g. ü → w, è → é, î → i).
        """
        s = self.surface.lower().strip()
        return self.dialect.NORMALIZED_VOWELS.get(s, s)

    # =========================================================================
    # INDICES AND CONTEXT
    # =========================================================================

    @property
    def idx_in_word(self) -> int:
        """Index of this character in parent word (-1 if never computed)."""
        return self._idx_in_word

    @property
    def idx_in_sentence(self) -> int:
        """Index of this character in parent sentence (-1 if never computed)."""
        return self._idx_in_sentence

    @cached_property
    def parent_word(self) -> Optional['WordToken']:
        """The word containing this character, or None without a parent grapheme."""
        if self.parent_grapheme:
            return self.parent_grapheme.parent_word
        return None

    @cached_property
    def parent_sentence(self) -> Optional['Sentence']:
        """The sentence containing this character, or None without a parent word."""
        if not self.parent_word:
            return None
        return self.parent_word.parent_sentence

    @cached_property
    def prev_char(self) -> Optional['CharToken']:
        """
        Previous character in the SAME grapheme, or None if first.

        NOTE(review): lookup does not cross grapheme boundaries, so every
        property built on it (is_intervocalic, ...) only sees context
        inside a multi-character grapheme.
        """
        if not self.parent_grapheme:
            return None
        if self.char_idx == 0:
            # TODO: go to prev grapheme
            return None
        return self.parent_grapheme.characters[self.char_idx - 1]

    @cached_property
    def next_char(self) -> Optional['CharToken']:
        """
        Next character in the SAME grapheme, or None if last.

        NOTE(review): like ``prev_char``, this does not cross grapheme
        boundaries.
        """
        if self.char_idx == -1 or not self.parent_grapheme:
            return None
        if self.char_idx >= len(self.parent_grapheme.characters) - 1:
            # TODO: go to next grapheme
            return None
        return self.parent_grapheme.characters[self.char_idx + 1]

    # -------------------------------
    # Look-behind/ahead
    # -------------------------------
    @property
    def prefix(self) -> str:
        """All normalized text before this character in the word ("" without parent)."""
        if self.parent_grapheme is None:
            return ""
        return self.parent_grapheme.prefix + "".join(c.normalized for c in self._prev_chars)

    @property
    def suffix(self) -> str:
        """All normalized text after this character in the word ("" without parent)."""
        if self.parent_grapheme is None:
            return ""
        return "".join(c.normalized for c in self._next_chars) + self.parent_grapheme.suffix

    @cached_property
    def _prev_chars(self) -> List['CharToken']:
        """Characters of the parent grapheme positioned before this one."""
        if self.char_idx == 0 or self.parent_grapheme is None:
            return []
        return [c for c in self.parent_grapheme.characters if c.char_idx < self.char_idx]

    @cached_property
    def _next_chars(self) -> List['CharToken']:
        """Characters of the parent grapheme positioned after this one."""
        if self.parent_grapheme is None:
            return []
        return [c for c in self.parent_grapheme.characters if c.char_idx > self.char_idx]

    # =========================================================================
    # CHARACTER CLASSIFICATION
    # =========================================================================

    @cached_property
    def is_punct(self) -> bool:
        """True if the raw surface character is in ``dialect.PUNCT_CHARS``."""
        return self.surface in self.dialect.PUNCT_CHARS

    @cached_property
    def is_vowel(self) -> bool:
        """
        True if character represents a vowel (with or without diacritics).

        Membership is tested against the union of the dialect's plain,
        acute, grave, circumflex, tilde and trema vowel sets.
        """
        return self.normalized in (
                self.dialect.VOWEL_CHARS |
                self.dialect.ACUTE_VOWEL_CHARS |
                self.dialect.GRAVE_VOWEL_CHARS |
                self.dialect.CIRCUM_VOWEL_CHARS |
                self.dialect.TILDE_VOWEL_CHARS |
                self.dialect.TREMA_VOWEL_CHARS
        )

    @cached_property
    def is_semivowel(self) -> bool:
        """
        True if character CAN function as a semivowel (glide), i.e. it is in
        ``dialect.SEMIVOWEL_CHARS``.

        Whether it actually is a glide depends on position (diphthong vs
        syllable nucleus), which is not decided here.
        """
        return self.normalized in self.dialect.SEMIVOWEL_CHARS

    @cached_property
    def is_consonant(self) -> bool:
        """True if character is neither a vowel nor punctuation."""
        return not self.is_vowel and not self.is_punct

    @cached_property
    def is_nasal_vowel(self) -> bool:
        """
        True if vowel is phonemically nasal.

        Two orthographic realizations:
        1. Tilde vowel (member of ``dialect.TILDE_VOWEL_CHARS``)
        2. Vowel + m/n where the nasal is in coda (nothing or a consonant
           follows it within the grapheme)
        """
        if not self.is_vowel:
            return False

        # Explicit tilde marking
        if self.normalized in self.dialect.TILDE_VOWEL_CHARS:
            return True

        # Followed by m/n in coda position. A tuple is used so an empty
        # normalized string can never match by substring accident.
        following = self.next_char
        if following and following.normalized in ("m", "n"):
            after_nasal = following.next_char
            if not after_nasal or after_nasal.is_consonant:
                return True

        return False

    @cached_property
    def is_foreign(self) -> bool:
        """True if character is in ``dialect.FOREIGN_CHARS`` (e.g. k, w, y)."""
        return self.normalized in self.dialect.FOREIGN_CHARS

    @cached_property
    def has_diacritics(self) -> bool:
        """True if character belongs to any of the dialect's accented vowel sets."""
        return self.normalized in (
                self.dialect.ACUTE_VOWEL_CHARS |
                self.dialect.GRAVE_VOWEL_CHARS |
                self.dialect.CIRCUM_VOWEL_CHARS |
                self.dialect.TILDE_VOWEL_CHARS |
                self.dialect.TREMA_VOWEL_CHARS
        )

    @cached_property
    def is_silent(self) -> bool:
        """
        True if character has no phonetic realization.

        Delegates to ``is_grapheme_silent`` with this character's normalized
        form, its full in-word prefix and suffix, the normalized parent
        word ("" if none) and the dialect.
        """
        return is_grapheme_silent(
            self.normalized,
            self.prefix,
            self.suffix,
            self.parent_word.normalized if self.parent_word else "",
            self.dialect
        )

    # =========================================================================
    # VOWEL QUALITY CLASSIFICATION
    # =========================================================================

    @cached_property
    def is_open_vowel(self) -> bool:
        """
        True if the character is plain "a" or any acute-accented vowel.

        NOTE(review): this also reports í/ú as open if they are part of
        ACUTE_VOWEL_CHARS, although high vowels are phonetically closed.
        """
        return self.normalized in self.dialect.ACUTE_VOWEL_CHARS or self.normalized == "a"

    @cached_property
    def is_closed_vowel(self) -> bool:
        """
        True if vowel is phonetically closed (high tongue position).

        High vowels i, u are always closed; mid vowels e, o count as closed
        only when written with a circumflex (ê, ô).
        """
        return self.normalized in ["i", "u", "ê", "ô"]

    # =========================================================================
    # POSITIONAL PROPERTIES
    # =========================================================================

    @cached_property
    def is_first_word_letter(self) -> bool:
        """True if this is the first letter of the word."""
        return self.idx_in_word == 0

    @cached_property
    def is_last_word_letter(self) -> bool:
        """True if this is the last letter of the word (False without a parent word)."""
        if not self.parent_word:
            return False
        return self.idx_in_word == len(self.parent_word.normalized) - 1

    @cached_property
    def is_intervocalic(self) -> bool:
        """
        True if character is between two vowels (V-C-V context).

        Relevant for S voicing (casa [ˈkazɐ]) and R strength (caro vs carro).
        """
        prev_is_vowel = self.prev_char.is_vowel if self.prev_char else False
        next_is_vowel = self.next_char.is_vowel if self.next_char else False
        return prev_is_vowel and next_is_vowel

    @cached_property
    def is_between_consonant_vowel(self) -> bool:
        """True if pattern is C-S-V. Relevant for S voicing rules."""
        prev_is_cons = self.prev_char.is_consonant if self.prev_char else False
        next_is_vowel = self.next_char.is_vowel if self.next_char else False
        return prev_is_cons and next_is_vowel

    @cached_property
    def is_between_vowel_consonant(self) -> bool:
        """True if pattern is V-S-C. Relevant for syllable-final consonant rules."""
        prev_is_vowel = self.prev_char.is_vowel if self.prev_char else False
        next_is_cons = self.next_char.is_consonant if self.next_char else False
        return prev_is_vowel and next_is_cons

    # =========================================================================
    # STRESS PROPERTIES
    # =========================================================================

    @cached_property
    def has_primary_stress(self) -> bool:
        """
        True if this character carries primary stress.

        Explicit markers (``dialect.PRIMARY_STRESS_MARKERS``) decide
        directly; otherwise the answer is deferred to the parent grapheme.
        """
        # Explicit stress markers
        if self.normalized in self.dialect.PRIMARY_STRESS_MARKERS:
            return True

        # Defer to parent grapheme's stress determination
        if self.parent_grapheme:
            return self.parent_grapheme.has_primary_stress

        return False

    @cached_property
    def has_secondary_stress(self) -> bool:
        """
        True if this character carries secondary stress.

        Checked in order: explicit ``dialect.SECONDARY_STRESS_MARKERS``; a
        vowel directly preceded by "h" inside the same grapheme; otherwise
        the parent grapheme's own determination.
        """
        # Explicit secondary stress markers
        if self.normalized in self.dialect.SECONDARY_STRESS_MARKERS:
            return True

        if self.is_vowel and self.prev_char and self.prev_char.normalized == "h":
            return True

        # Defer to parent grapheme
        if self.parent_grapheme:
            return self.parent_grapheme.has_secondary_stress

        return False

    # =========================================================================
    # IPA GENERATION
    # =========================================================================

    def _ipa_for_vowel(self) -> str:
        """
        Generate IPA for vowel character.

        VOWEL REALIZATION RULES (in evaluation order):
        ----------------------------------------------
        1. Characters missing from ``dialect.DEFAULT_CHAR2PHONEMES`` are
           returned unchanged.
        2. One-letter words: a→[ɐ], e→[i], é→[ɛ], o→[u].
        3. Function words (see ``_FUNCTION_WORDS``): fixed clitic quality,
           fuller in pt-BR, reduced elsewhere.
        4. Plain a/e/o: quality chosen from stress.
        5. Anything else: the dialect's default mapping.

        Returns:
            IPA string for this vowel
        """
        s = self.normalized
        table = self.dialect.DEFAULT_CHAR2PHONEMES
        if s not in table:
            return s  # Fallback

        base_ipa = table[s]
        word = self.parent_word.normalized if self.parent_word else ""
        code = self.dialect.dialect_code

        # TODO: per dialect handling

        # Special case: Single-vowel words
        if word == "a":
            return "ɐ"
        elif word == "e":
            return "i"
        elif word == "é":
            return "ɛ"
        elif word == "o":
            return "u"

        # Special case: clitic function words
        if word in self._FUNCTION_WORDS:
            if code.startswith("pt-BR"):
                clitic = self._CLITIC_VOWELS_BR  # less reduction
            else:
                clitic = self._CLITIC_VOWELS_REDUCED  # European/African
            if s in clitic:
                return clitic[s]

        # Override with stress-based quality for ambiguous vowels
        if s == "a":
            return "a" if self.has_primary_stress or self.has_secondary_stress else "ɐ"
        elif s == "e":
            if code.startswith("pt-PT"):
                return "ɛ" if self.has_primary_stress else "ɨ"
            return "ɛ" if self.has_primary_stress else "e"
        elif s == "o":
            return "ɔ" if self.has_primary_stress or self.has_secondary_stress else "u"

        return base_ipa

    def _strong_r(self) -> str:
        """Dialect-specific realization of strong R: pt-BR [h], pt-PT [ʁ], otherwise [r]."""
        code = self.dialect.dialect_code
        if code.startswith("pt-BR"):
            return "h"  # Brazilian [h] or [x]
        if code.startswith("pt-PT"):
            return "ʁ"  # European uvular
        return "r"  # African/Timorese alveolar trill

    def _ipa_for_consonant(self) -> str:
        """
        Generate IPA for consonant character.

        CONTEXT-SENSITIVE CONSONANT RULES:
        -----------------------------------
        1. pt-BR only: t/d → [tʃ]/[dʒ] before i; coda l → [w]
        2. C/G: [s]/[ʒ] before front vowels
        3. R: strong R word-initially or after l/n/s
        4. S: [z] intervocalically; after a consonant [z] only in the
           trans- prefix, else [s]
        5. X: see ``_ipa_for_x``
        6. Z word-finally: [s] (pt-BR) or [ʃ] (others)
        7. L word-finally in pt-PT: dark [ɫ]
        8. Otherwise ``dialect.DEFAULT_CHAR2PHONEMES`` (or the char itself)

        Returns:
            IPA string for this consonant
        """
        s = self.normalized
        # "" means "no neighbour inside this grapheme". Every membership
        # test below must rule that case out explicitly, because
        # `"" in "abc"` is True for plain strings.
        next_char = self.next_char.normalized if self.next_char else ""
        prev_char = self.prev_char.normalized if self.prev_char else ""
        code = self.dialect.dialect_code

        # BRAZILIAN PORTUGUESE: t/d palatalization before [i]
        if code.startswith("pt-BR"):
            if s == "t" and next_char == "i":
                return "tʃ"
            if s == "d" and next_char == "i":
                return "dʒ"

            # L-vocalization in coda position
            if s == "l" and self.is_last_word_letter:
                return "w"
            if s == "l" and self.next_char and self.next_char.is_consonant:
                return "w"

        # C before front vowels → [s]
        if s == "c" and next_char and next_char in self.dialect.FRONT_VOWEL_CHARS:
            return "s"

        # G before front vowels → [ʒ]
        if s == "g" and next_char and next_char in self.dialect.FRONT_VOWEL_CHARS:
            return "ʒ"

        # Initial R → strong R
        if s == "r" and self.is_first_word_letter:
            return self._strong_r()

        # R after l, n, s → strong R. The truthiness guard matters: without
        # it an 'r' with no previous character matched vacuously.
        if s == "r" and prev_char and prev_char in "lns":
            return self._strong_r()

        # S between vowels → [z]
        if s == "s" and self.is_intervocalic:
            return "z"

        # S between consonant and vowel → context-dependent
        if s == "s" and self.is_between_consonant_vowel:
            # Special case: trans- prefix voices the s before a vowel
            word = self.parent_word.normalized if self.parent_word else ""
            if word.startswith(("trans", "trâns")) and self.idx_in_word == 4:
                if self.next_char and self.next_char.is_vowel:
                    # Exception: transação [tɾɐ̃zɐˈsɐ̃w]
                    return "z"
            return "s"

        # X rules (complex, context-dependent)
        if s == "x":
            return self._ipa_for_x()

        # Z word-finally → [ʃ] (European) or [s]
        if s == "z" and self.is_last_word_letter:
            if code.startswith("pt-BR"):
                return "s"  # Brazilian: [s]
            return "ʃ"  # European/African: [ʃ]

        # L word-finally (Brazilian vocalization handled above)
        if s == "l" and self.is_last_word_letter:
            if code.startswith("pt-PT"):
                return "ɫ"  # European dark L

        # Default mapping
        return self.dialect.DEFAULT_CHAR2PHONEMES.get(s, s)

    def _ipa_for_x(self) -> str:
        """
        Generate IPA for the letter X (highly context-dependent).

        X PRONUNCIATION RULES (in evaluation order):
        --------------------------------------------
        1. Word-initial: [ʃ] - xadrez, xícara
        2. Word-final: [ks] - tórax, fax
        3. Intervocalic:
           a. [gz]: word starts with "hexa" and x is at index 2
           b. [s]: word starts with "próxim" and x is at index 3
           c. [z]: preceded by plain e and followed by a/e/i/o/u (plain or
              acute) - exemplo, exato
           d. [ks]: preceded by an acute vowel or e (except ú → [ʃ])
           e. [ʃ]: default - peixe, caixa
        4. Anything else: [ʃ]

        NOTE(review): rule (c) is tested before (d), so a word such as
        "sexo" (e + x + o) yields [z] here, not the [ks] the rule (d)
        examples suggest. Left unchanged pending a linguistic decision.

        Returns:
            IPA string for X
        """
        prev_char = self.prev_char.normalized if self.prev_char else ""
        next_char = self.next_char.normalized if self.next_char else ""
        word = self.parent_word.normalized if self.parent_word else ""

        # Word-initial: [ʃ]
        if self.is_first_word_letter:
            return "ʃ"

        # Word-final: [ks]
        if self.is_last_word_letter:
            return "ks"

        # Intervocalic context (both neighbours exist here, so the string
        # membership tests below never see "")
        if self.is_intervocalic:
            # Check for hexa- prefix: [gz] variant
            if word.startswith("hexa") and self.idx_in_word == 2:
                return "gz"

            # Check for próxim-: [s]
            if word.startswith("próxim") and self.idx_in_word == 3:
                return "s"

            # Ex- prefix before vowel: [z]
            if prev_char == "e" and next_char in "aeiouáéíóú":
                # Examples: exemplo, exato, executivo
                return "z"

            # After stressed vowel with accent: [ks]
            if prev_char in self.dialect.ACUTE_VOWEL_CHARS | set("e"):
                # Examples: máximo, tóxico
                if prev_char == "ú":
                    # Exception: esdrúxulo [ʃ]
                    return "ʃ"
                return "ks"

            # Default intervocalic: [ʃ]
            return "ʃ"

        # Default: [ʃ]
        return "ʃ"

    @cached_property
    def ipa(self) -> str:
        """
        Generate IPA transcription for this character.

        ALGORITHM:
        ----------
        1. Punctuation → ``dialect.PUNCT2IPA`` entry, defaulting to
           ``dialect.HIATUS_TOKEN``
        2. Silent characters → empty string
        3. Dispatch to vowel vs consonant rules

        Returns:
            IPA string (may be empty for silent characters)
        """
        # Punctuation → prosodic markers
        if self.is_punct:
            return self.dialect.PUNCT2IPA.get(self.normalized, self.dialect.HIATUS_TOKEN)

        # Silent characters
        if self.is_silent:
            return ""

        # Dispatch based on vowel vs consonant
        if self.is_vowel:
            return self._ipa_for_vowel()
        return self._ipa_for_consonant()

    # =========================================================================
    # FEATURE EXTRACTION
    # =========================================================================

    @property
    def features(self) -> Dict[str, object]:
        """
        Extract all linguistic features as a dictionary.

        Useful for machine learning feature vectors, debugging and
        linguistic analysis.

        Returns:
            Dictionary mapping feature names to values
        """
        return {
            "text": self.normalized,
            "ipa": self.ipa,
            "is_first_letter": self.is_first_word_letter,
            "is_last_letter": self.is_last_word_letter,
            "is_punct": self.is_punct,
            "is_vowel": self.is_vowel,
            "is_semivowel": self.is_semivowel,
            "is_nasal_vowel": self.is_nasal_vowel,
            "is_open_vowel": self.is_open_vowel,
            "is_closed_vowel": self.is_closed_vowel,
            "is_consonant": self.is_consonant,
            "is_foreign": self.is_foreign,
            "is_silent": self.is_silent,
            "is_intervocalic": self.is_intervocalic,
            "is_between_consonant_vowel": self.is_between_consonant_vowel,
            "is_between_vowel_consonant": self.is_between_vowel_consonant,
            "has_diacritics": self.has_diacritics,
            "has_primary_stress": self.has_primary_stress,
            "has_secondary_stress": self.has_secondary_stress,
        }

    def __eq__(self, other) -> bool:
        """
        Compare with a string by surface form; any other operand falls back
        to ``object.__eq__`` (identity).
        """
        if isinstance(other, str):
            return self.surface == other
        return super().__eq__(other)

    def __repr__(self) -> str:
        """String representation for debugging (computes ``ipa``)."""
        return f"CharToken('{self.surface}' → [{self.ipa}])"
2413
+
2414
+
2415
+ # =============================================================================
2416
+ # GRAPHEME TOKEN
2417
+ # =============================================================================
2418
+
2419
+ @dataclasses.dataclass
2420
+ class GraphemeToken:
2421
+ """
2422
+ Represents a grapheme - the minimal distinctive unit of writing.
2423
+
2424
+ GRAPHEME DEFINITION:
2425
+ --------------------
2426
+ A grapheme is the smallest unit of a writing system.
2427
+ In alphabetic systems like Portuguese, graphemes can be:
2428
+ - Single letters: a, b, c
2429
+ - Digraphs: ch, nh, lh, rr, ss
2430
+ - Diphthongs: ai, ou, ei
2431
+ - Trigraphs: que, gui, coo
2432
+ - Tetragraphs: aien (rare)
2433
+
2434
+ LINGUISTIC MOTIVATION:
2435
+ ----------------------
2436
+ Portuguese orthography uses multi-character sequences to represent:
2437
+ 1. Single phonemes: ch → [ʃ], nh → [ɲ]
2438
+ 2. Phoneme clusters: qu → [kw] or [k] depending on context
2439
+ 3. Diphthongs: ai → [aj], ou → [ow]
2440
+
2441
+ Tokenizing at grapheme level (not character level) respects
2442
+ the structure of the writing system.
2443
+
2444
+ Attributes:
2445
+ surface: The grapheme string (1-4 characters)
2446
+ grapheme_idx: Position in parent word's grapheme list
2447
+ syllable_idx: Which syllable this grapheme belongs to
2448
+ characters: List of CharToken objects composing this grapheme
2449
+ parent_word: WordToken containing this grapheme
2450
+ dialect: DialectInventory with rules
2451
+ """
2452
+
2453
+ surface: str
2454
+ grapheme_idx: int = 0 # parent_word.graphemes[idx] == self
2455
+ syllable_idx: int = 0 # parent_word.normalized_syllables[idx] == self.surface
2456
+ characters: List[CharToken] = dataclasses.field(default_factory=list)
2457
+ parent_word: Optional["WordToken"] = None
2458
+ dialect: DialectInventory = dataclasses.field(default_factory=EuropeanPortuguese)
2459
+
2460
+ # Precomputed indices
2461
+ _idx_in_word: int = -1
2462
+ _idx_in_sentence: int = -1
2463
+
2464
+ def __post_init__(self):
2465
+ """
2466
+ Initialize character tokens and compute indices.
2467
+
2468
+ Characters are created and their indices are computed here
2469
+ to avoid circular dependencies during IPA generation.
2470
+ """
2471
+ if not self.characters:
2472
+ self.characters = [
2473
+ CharToken(
2474
+ surface=c,
2475
+ char_idx=i,
2476
+ parent_grapheme=self,
2477
+ dialect=self.dialect
2478
+ )
2479
+ for i, c in enumerate(self.surface)
2480
+ ]
2481
+
2482
+ # =========================================================================
2483
+ # BASIC PROPERTIES
2484
+ # =========================================================================
2485
+
2486
+ @cached_property
2487
+ def normalized(self) -> str:
2488
+ """Lowercase form of grapheme."""
2489
+ return self.surface.lower()
2490
+
2491
+ @property
2492
+ def n_chars(self) -> int:
2493
+ """Number of characters in this grapheme."""
2494
+ return len(self.characters)
2495
+
2496
+ @property
2497
+ def first_char(self) -> CharToken:
2498
+ """First character of grapheme."""
2499
+ return self.characters[0]
2500
+
2501
+ @property
2502
+ def last_char(self) -> CharToken:
2503
+ """Last character of grapheme."""
2504
+ return self.characters[-1]
2505
+
2506
+ # =========================================================================
2507
+ # INDICES AND CONTEXT
2508
+ # =========================================================================
2509
+
2510
+ @property
2511
+ def idx_in_word(self) -> int:
2512
+ """Character index of first char in parent word."""
2513
+ return self._idx_in_word
2514
+
2515
+ @property
2516
+ def idx_in_sentence(self) -> int:
2517
+ """Character index of first char in parent sentence."""
2518
+ return self._idx_in_sentence
2519
+
2520
+ @cached_property
2521
+ def parent_sentence(self) -> Optional['Sentence']:
2522
+ """The sentence containing this grapheme."""
2523
+ if not self.parent_word:
2524
+ return None
2525
+ return self.parent_word.parent_sentence
2526
+
2527
+ @cached_property
2528
+ def parent_syllable(self) -> Optional[str]:
2529
+ """The syllable string containing this grapheme."""
2530
+ if not self.parent_word or self.syllable_idx < 0:
2531
+ return None
2532
+ if self.syllable_idx >= len(self.parent_word.syllables):
2533
+ return None
2534
+ return self.parent_word.syllables[self.syllable_idx]
2535
+
2536
+ # ------------------------------------------------------------------
2537
+ # Prefix/suffix context
2538
+ # ------------------------------------------------------------------
2539
+ @property
2540
+ def prefix(self) -> str:
2541
+ """
2542
+ All text before this grapheme in the word.
2543
+
2544
+ Used for checking morphological boundaries (prefixes).
2545
+ Example: In "biauricular", prefix of "au" is "bi"
2546
+ """
2547
+ if not self.parent_word:
2548
+ return ""
2549
+
2550
+ prev_graphemes = [
2551
+ g.normalized for g in self.parent_word.graphemes
2552
+ if g.grapheme_idx < self.grapheme_idx
2553
+ ]
2554
+ return "".join(prev_graphemes)
2555
+
2556
+ @property
2557
+ def suffix(self) -> str:
2558
+ """
2559
+ All text after this grapheme in the word.
2560
+
2561
+ Used for checking word endings and contexts.
2562
+ """
2563
+ if not self.parent_word:
2564
+ return ""
2565
+
2566
+ next_graphemes = [
2567
+ g.normalized for g in self.parent_word.graphemes
2568
+ if g.grapheme_idx > self.grapheme_idx
2569
+ ]
2570
+ return "".join(next_graphemes)
2571
+
2572
@cached_property
def prev_grapheme(self) -> Optional['GraphemeToken']:
    """
    Grapheme immediately to the left within the parent word.

    Returns:
        The neighbouring grapheme, or None for the first grapheme or
        when there is no parent word.
    """
    word = self.parent_word
    if not word or self.grapheme_idx == 0:
        return None
    return word.graphemes[self.grapheme_idx - 1]
2578
+
2579
@cached_property
def next_grapheme(self) -> Optional['GraphemeToken']:
    """
    Grapheme immediately to the right within the parent word.

    Returns:
        The neighbouring grapheme, or None for the last grapheme or
        when there is no parent word.
    """
    word = self.parent_word
    if not word:
        return None
    following = self.grapheme_idx + 1
    if following >= len(word.graphemes):
        return None
    return word.graphemes[following]
2587
+
2588
+ # ------------------------------------------------------------------
2589
+ # Syllabic context
2590
+ # ------------------------------------------------------------------
2591
@cached_property
def prev_syllable(self) -> Optional[str]:
    """
    Normalized syllable preceding the one this grapheme belongs to.

    Returns:
        The previous entry of the parent word's ``normalized_syllables``,
        or None when this grapheme sits in the first syllable or has no
        resolvable parent syllable.
    """
    idx = self.syllable_idx
    has_previous = idx != 0 and bool(self.parent_syllable)
    if not has_previous:
        return None
    return self.parent_word.normalized_syllables[idx - 1]
2597
+
2598
@cached_property
def next_syllable(self) -> Optional[str]:
    """
    Normalized syllable following the one this grapheme belongs to.

    Returns:
        The next entry of the parent word's ``normalized_syllables``,
        or None when this grapheme sits in the last syllable, its
        syllable index is the -1 sentinel, or it has no resolvable
        parent syllable.
    """
    idx = self.syllable_idx
    if idx == -1 or not self.parent_syllable:
        return None
    syllables = self.parent_word.normalized_syllables
    upcoming = idx + 1
    return syllables[upcoming] if upcoming < len(syllables) else None
2606
+
2607
+ # =========================================================================
2608
+ # GRAPHEME CLASSIFICATION
2609
+ # =========================================================================
2610
+
2611
@cached_property
def is_archaism(self) -> bool:
    """
    True if grapheme uses archaic orthography.

    Patterns checked, in order:
    1. Trema (ü): pre-1945/2009 marker for a pronounced u
    2. Grave accents (except à): pre-1973 secondary stress
    3. Obsolete differential circumflex: êle
    4. ph for /f/, e.g. "pharmacia"
    5. mpc / mpç / mpt clusters found in ``dialect.ARCHAIC_MUTE_P``,
       but only when the parent word is listed for that cluster

    These may appear in historical texts or proper names.

    Returns:
        True when any pattern applies. A grapheme with no parent word
        cannot satisfy pattern 5, so that branch yields False instead
        of failing on a None dereference.
    """
    s = self.normalized

    # Trema (abolished 1945/2009)
    if "ü" in s:
        return True

    # Grave accents; "à" survives in modern spelling as the contraction.
    if any(c != "à" and c in s for c in self.dialect.GRAVE_VOWEL_CHARS):
        return True

    # In paroxytones, when the same form existed with an open and a closed
    # vowel, a circumflex was placed on the closed-vowel word, e.g.
    # êle ("he") vs ele (name of the letter L). Made obsolete by the 1945
    # reform in Portugal and the 1971 reform in Brazil.
    if s == "êle":
        return True

    # ph -> /f/ eg. "pharmacia"
    if s == "ph":
        return True

    # When the "p" is dropped from the word-internal sequences "mpc",
    # "mpç" and "mpt", the "m" becomes "n" ("nc", "nç", "nt"):
    #   assumpcionista / assuncionista, assumpção / assunção,
    #   assumptível / assuntível, peremptório / perentório,
    #   sumptuoso / suntuoso, sumptuosidade / suntuosidade
    # NOTE: a word list is needed; in modern orthography none of the
    # letters is silent, and we cannot tell whether the input predates
    # the orthographic agreement.
    mute_p = self.dialect.ARCHAIC_MUTE_P
    if s in mute_p:
        word = self.parent_word
        if not word:
            return False
        return word.normalized in mute_p[s]
    return False
2660
+
2661
@cached_property
def is_nasal(self) -> bool:
    """
    True if grapheme represents nasal sound(s).

    A grapheme counts as nasal when either holds:
    1. It is a key of ``dialect.NASAL_DIGRAPHS`` (am, an, em, en, ...)
    2. It contains any character from ``dialect.TILDE_VOWEL_CHARS``
       (ã, õ, ...), which also covers nasal diphthongs such as ão, ãe, õe
    """
    text = self.normalized
    dialect = self.dialect
    return text in dialect.NASAL_DIGRAPHS or any(
        ch in text for ch in dialect.TILDE_VOWEL_CHARS
    )
2682
+
2683
+ # ------------------------------------------------------------------
2684
+ # Diphthong classification
2685
+ # ------------------------------------------------------------------
2686
@property
def is_vocalic_hiatus(self) -> bool:
    """
    Placeholder for vowel-hiatus detection; currently always False.

    Background for the eventual implementation: a hiatus is two adjacent
    vowels that belong to neighbouring syllables. Unlike a diphthong or
    triphthong it spans two syllables and is therefore pronounced in two
    separate vocal efforts. Sequences written as "i" + vowel or "u" +
    vowel (and, in European Portuguese, "e" + vowel or "o" + vowel) are
    usually treated as hiatuses.
    """
    return False  # TODO
2696
+
2697
@cached_property
def is_diphthong(self) -> bool:
    """
    True if grapheme represents a diphthong.

    Diphthongs are two-vowel sequences in one syllable.
    Examples: ai, ei, ou, ão, ãe

    Note: Diphthong vs hiatus is determined by syllabification.
    Same spelling can be different:
    - caiu [kɐˈju]: hiatus (ca.iu, two syllables)
    - cai [ˈkaj]: diphthong (one syllable)

    Returns:
        Whether the normalized text is a key of ``dialect.DIPHTHONG2IPA``,
        except that "ue" inside the word "quem" is never a diphthong.
        A grapheme with no parent word skips that exception instead of
        failing on a None dereference.
    """
    s = self.normalized
    # In "quem" the u is not pronounced: "qu" is a digraph there, so "ue"
    # is not a vowel encounter and must not be reported as a diphthong.
    if s == "ue":
        word = self.parent_word
        if word and word.normalized == "quem":
            return False
    return s in self.dialect.DIPHTHONG2IPA
2716
+
2717
@cached_property
def is_triphthong(self) -> bool:
    """
    True if grapheme represents a triphthong.

    Triphthongs are glide-vowel-glide sequences in a single syllable,
    e.g. miau [ˈmjaw], Uruguai [uɾuˈɡwaj]. They are very rare in
    Portuguese. Membership is decided by ``dialect.TRIPHTHONG2IPA``.
    """
    text = self.normalized
    known_triphthongs = self.dialect.TRIPHTHONG2IPA
    return text in known_triphthongs
2728
+
2729
@cached_property
def is_falling_diphthong(self) -> bool:
    """
    True if diphthong has vowel before semivowel (V-G).

    Examples: pai, rei, meu, céu
    Direction: vowel [a] → glide [j]

    Most Portuguese diphthongs are falling.

    Returns:
        False for non-diphthongs. For Brazilian dialect codes, True when
        the grapheme is one of the ``PTBR_DIPHTHONGS`` values. Otherwise
        True exactly when the first character is not a semivowel.
    """
    if not self.is_diphthong:
        return False

    dialect = self.dialect
    # In many Brazilian dialects, syllable-final /l/ is vocalized, so these
    # sequences are also treated as falling diphthongs:
    #   funil /fu.ˈniw/, feltro /few.tɾu/, mel /ˈmɛw/, mal /ˈmaw/,
    #   Sol /ˈsɔw/, soldado /sow.ˈda.du/, azul /aˈzuw/
    if (
        dialect.dialect_code.startswith("pt-BR")
        and self.normalized in dialect.PTBR_DIPHTHONGS.values()
    ):
        return True

    # When the vowel comes before the semivowel the diphthong is falling:
    #   leite /ˈlej.ti/ - /ˈlɐj.tɨ/ (Lisbon), cai /ˈcaj/, dói /ˈdɔj/,
    #   foi /ˈfoj/, cuidado /cuj.ˈda.du/, viu /ˈviw/, meu /ˈmew/,
    #   céu /ˈcɛw/, mau /ˈmaw/, sou /ˈsow/
    leading = self.first_char.normalized
    return leading not in dialect.SEMIVOWEL_CHARS
2768
+
2769
@cached_property
def is_rising_diphthong(self) -> bool:
    """
    True if diphthong has semivowel before vowel (G-V).

    Examples: piano, água, qual
    Direction: glide [j] → vowel [a]

    Less common than falling diphthongs in Portuguese. Decided by
    whether the first character is one of ``dialect.SEMIVOWEL_CHARS``;
    non-diphthongs are always False.
    """
    return (
        self.first_char.normalized in self.dialect.SEMIVOWEL_CHARS
        if self.is_diphthong
        else False
    )
2782
+
2783
@cached_property
def is_nasal_diphthong(self) -> bool:
    """
    True if diphthong is nasalized.

    Examples: mãe [ˈmɐ̃j̃], cão [ˈkɐ̃w̃], põe [ˈpõj̃]

    Nasalization extends across the entire diphthong. Decided by whether
    the first character is one of ``dialect.TILDE_VOWEL_CHARS``;
    non-diphthongs are always False.
    """
    return (
        self.first_char.normalized in self.dialect.TILDE_VOWEL_CHARS
        if self.is_diphthong
        else False
    )
2795
+
2796
@cached_property
def is_oral_diphthong(self) -> bool:
    """
    True if diphthong is oral (not nasal).

    Examples: pai, rei, meu, boi
    """
    diphthong = self.is_diphthong
    if not diphthong:
        # Not a diphthong at all, so it cannot be an oral one.
        return diphthong
    return not self.is_nasal_diphthong
2804
+
2805
@cached_property
def is_digraph(self) -> bool:
    """
    True if grapheme is a consonant digraph.

    Consonant digraphs are two letters standing for one consonant phoneme:
    - nh [ɲ]: palatal nasal
    - lh [ʎ]: palatal lateral
    - ch [ʃ]: postalveolar fricative
    - rr [ʁ]: strong R
    - ss [s]: voiceless between vowels
    - ph [f]: archaic

    Membership is decided by ``dialect.DIGRAPH2IPA``. Nasal digraphs
    (am, em, etc.) are NOT covered here - see is_nasal.
    """
    text = self.normalized
    consonant_digraphs = self.dialect.DIGRAPH2IPA
    return text in consonant_digraphs
2821
+
2822
@cached_property
def is_foreign_digraph(self) -> bool:
    """
    True if grapheme is a foreign digraph.

    Loanword examples:
    - sh [ʃ]: show, shopping
    - th [t]: thriller
    - ff [f]: graffiti
    - ll [l]: villa

    Membership is decided by ``dialect.FOREIGN_DIGRAPH2IPA``.
    """
    text = self.normalized
    foreign_digraphs = self.dialect.FOREIGN_DIGRAPH2IPA
    return text in foreign_digraphs
2834
+
2835
@cached_property
def is_trigraph(self) -> bool:
    """
    True if grapheme is a trigraph (3-letter unit).

    Examples:
    - que, qui: q + u + vowel
    - coo: prefix boundary
    - ção: common suffix

    Membership is decided by ``dialect.TRIGRAM2IPA``.
    """
    text = self.normalized
    trigraphs = self.dialect.TRIGRAM2IPA
    return text in trigraphs
2846
+
2847
@cached_property
def is_consonant_hiatus(self) -> bool:
    """
    True if grapheme is a consonant cluster spanning a syllable boundary.

    Examples:
    - ct: pac.to [ˈpak.tu]
    - cç: fic.ção [fik.ˈsɐ̃w]

    Such clusters are NOT pronounced as single units; they split across
    syllables. Membership is decided by ``dialect.HETEROSYLLABIC_CLUSTERS``.
    """
    text = self.normalized
    split_clusters = self.dialect.HETEROSYLLABIC_CLUSTERS
    return text in split_clusters
2859
+
2860
+ # =========================================================================
2861
+ # STRESS PROPERTIES
2862
+ # =========================================================================
2863
+
2864
@cached_property
def has_primary_stress(self) -> bool:
    """
    True if this grapheme carries primary word stress.

    Decision order:
    1. The parent word has exactly one syllable → stressed.
    2. Any character of this grapheme is one of the dialect's
       ``PRIMARY_STRESS_MARKERS`` → stressed.
    3. Otherwise the grapheme is stressed when its ``syllable_idx``
       equals the parent word's stressed syllable index.

    The parent-word guard is applied before any attribute of the word is
    read, so a detached grapheme is judged on its own characters alone
    instead of failing on a None dereference. The stressed syllable is
    read from the word's cached ``stressed_syllable_idx`` rather than
    re-running stress detection once per grapheme.

    Returns:
        Whether the grapheme is in the stressed position.
    """
    word = self.parent_word
    if word and word.n_syllables == 1:
        return True

    # Explicit accent mark on any character of this grapheme.
    markers = self.dialect.PRIMARY_STRESS_MARKERS
    if any(c.normalized in markers for c in self.characters):
        return True

    # Syllable-level stress needs the word to locate the stressed syllable.
    if not word:
        return False

    return self.syllable_idx == word.stressed_syllable_idx
2894
+
2895
@cached_property
def has_secondary_stress(self) -> bool:
    """
    True if this grapheme carries secondary stress.

    Secondary stress occurs in:
    - Compound words: semi-automático
    - Long words with complex morphology
    - Historical grave accent usage (obsolete)

    A grapheme that already has primary stress never has secondary
    stress. Otherwise it does when any of its characters is one of the
    dialect's ``SECONDARY_STRESS_MARKERS``.
    """
    if self.has_primary_stress:
        return False

    for ch in self.characters:
        if ch.normalized in self.dialect.SECONDARY_STRESS_MARKERS:
            return True
    return False
2914
+
2915
+ # =========================================================================
2916
+ # IPA GENERATION
2917
+ # =========================================================================
2918
+
2919
@cached_property
def ipa(self) -> str:
    """
    Generate IPA transcription for this grapheme.

    ALGORITHM:
    ----------
    1. Special case: "ui" in the word "muito" is nasalized.
    2. Look the grapheme up in the dialect tables, first hit wins:
       tetragraphs → trigraphs → nasal digraphs → diphthongs →
       consonant digraphs → heterosyllabic clusters.
    3. Fall back to concatenating each character's own IPA.

    For multi-character graphemes (digraphs, diphthongs), a table hit
    returns one IPA unit for the whole grapheme rather than one per
    character.

    Returns:
        IPA string for this grapheme
    """
    s = self.normalized
    word = self.parent_word.normalized if self.parent_word else ""

    # Special case: "ui" nasalized in "muito"
    if s == "ui" and word == "muito":
        return "ũj"

    # Tables are fetched lazily, in priority order (longest units first).
    lookup_order = (
        "TETRAGRAM2IPA",
        "TRIGRAM2IPA",
        "NASAL_DIGRAPHS",
        "DIPHTHONG2IPA",
        "DIGRAPH2IPA",
        "HETEROSYLLABIC_CLUSTERS",
    )
    for table_name in lookup_order:
        table = getattr(self.dialect, table_name)
        if s in table:
            return table[s]

    # No table knows this grapheme: spell it out character by character.
    return "".join(c.ipa for c in self.characters)
2964
+
2965
+ # =========================================================================
2966
+ # FEATURE EXTRACTION
2967
+ # =========================================================================
2968
+
2969
@property
def features(self) -> Dict[str, any]:
    """
    Extract all linguistic features as a flat dictionary.

    Grapheme-level entries come first, followed by every feature of
    every character, keyed as ``char_<char_idx>_<feature name>``.

    Returns:
        Dictionary with grapheme features and flattened character features
    """
    # NOTE(review): ``any`` in the return annotation is the builtin
    # function, not ``typing.Any``; kept as-is to leave the signature
    # untouched.
    feats = dict(
        n_chars=self.n_chars,
        text=self.normalized,
        ipa=self.ipa,
        parent_syllable=self.parent_syllable,
        prev_syllable=self.prev_syllable,
        next_syllable=self.next_syllable,
        is_archaism=self.is_archaism,
        is_nasal=self.is_nasal,
        is_digraph=self.is_digraph,
        is_trigraph=self.is_trigraph,
        is_foreign_digraph=self.is_foreign_digraph,
        is_consonant_hiatus=self.is_consonant_hiatus,
        is_diphthong=self.is_diphthong,
        is_triphthong=self.is_triphthong,
        is_falling_diphthong=self.is_falling_diphthong,
        is_rising_diphthong=self.is_rising_diphthong,
        is_nasal_diphthong=self.is_nasal_diphthong,
        is_oral_diphthong=self.is_oral_diphthong,
        has_primary_stress=self.has_primary_stress,
        has_secondary_stress=self.has_secondary_stress,
    )

    # Flatten character-level features under a per-character prefix.
    feats.update({
        f"char_{c.char_idx}_{name}": value
        for c in self.characters
        for name, value in c.features.items()
    })
    return feats
3006
+
3007
+ def __eq__(self, other) -> bool:
3008
+ """Allow comparison with string."""
3009
+ if isinstance(other, str):
3010
+ return self.surface == other
3011
+ return super().__eq__(other)
3012
+
3013
+ def __repr__(self) -> str:
3014
+ """String representation for debugging."""
3015
+ return f"GraphemeToken('{self.surface}' → [{self.ipa}])"
3016
+
3017
+
3018
+ # =============================================================================
3019
+ # WORD TOKEN
3020
+ # =============================================================================
3021
+
3022
@dataclasses.dataclass
class WordToken:
    """
    Represents a word with syllable structure and grapheme tokenization.

    LINGUISTIC STRUCTURE:
    ---------------------
    A word is analyzed at multiple levels:
    1. Orthographic: sequence of characters
    2. Graphemic: sequence of graphemes (digraphs, diphthongs, etc.)
    3. Syllabic: sequence of syllables
    4. Phonological: stress pattern and IPA transcription

    SYLLABIFICATION:
    ----------------
    Portuguese syllables follow a preferred CV (consonant-vowel) structure.
    Syllables come from the external ``syllabify`` helper unless supplied
    by the caller.

    STRESS ASSIGNMENT:
    ------------------
    Delegated to the ``detect_stress_position`` helper.

    Attributes:
        surface: The word as it appears in text
        word_idx: Position in parent sentence
        graphemes: List of GraphemeToken objects
        syllables: List of syllable strings
        parent_sentence: Sentence containing this word
        dialect: DialectInventory with rules
        _idx_in_sentence: Character offset of the word in the sentence;
            -1 means "unknown" (e.g. a standalone word)
    """

    surface: str
    word_idx: int  # parent_sentence.words[idx] == self
    graphemes: List[GraphemeToken] = dataclasses.field(default_factory=list)
    syllables: List[str] = dataclasses.field(default_factory=list)
    parent_sentence: Optional["Sentence"] = None
    dialect: DialectInventory = dataclasses.field(default_factory=EuropeanPortuguese)

    # Precomputed index
    _idx_in_sentence: int = -1

    def __post_init__(self):
        """
        Initialize syllables and graphemes with proper indexing.

        INITIALIZATION ORDER:
        ---------------------
        1. Syllabify word (using external syllabifier), unless given
        2. Tokenize into graphemes with syllable alignment, unless given
        3. Compute character/grapheme indices top-down

        Indices are derived from ``_idx_in_sentence`` as it is at this
        moment, so the offset must be passed to the constructor to be
        reflected in the graphemes and characters.
        """
        # Step 1: Syllabification
        if not self.syllables:
            self.syllables = syllabify(self.normalized)

        # Step 2: Grapheme tokenization with syllable alignment
        if not self.graphemes:
            self.graphemes = self._tokenize_graphemes()

        # Step 3: Compute all indices top-down
        self._compute_indices()

    def _tokenize_graphemes(self) -> List[GraphemeToken]:
        """
        Tokenize word into graphemes aligned with syllables.

        TOKENIZATION STRATEGY:
        ----------------------
        For each normalized syllable, repeatedly take the first entry of
        ``dialect.GRAPHEME_INVENTORY`` that matches at the current
        position; when none matches, take a single character. Each
        grapheme records the index of the normalized syllable it came
        from, which is needed for stress assignment and phonological
        rules.

        The same cached ``normalized_syllables`` list that the graphemes
        later consult for their neighbouring syllables is used here, so
        indices always agree. Empty inventory entries are ignored: they
        match everywhere and would never advance the scan.

        Returns:
            List of GraphemeToken objects with syllable indices
        """
        inventory = self.dialect.GRAPHEME_INVENTORY
        graphemes: List[GraphemeToken] = []

        for syl_idx, syllable in enumerate(self.normalized_syllables):
            syl_pos = 0
            while syl_pos < len(syllable):
                # First inventory hit at this position, else one character.
                surface = next(
                    (
                        g for g in inventory
                        if g and syllable.startswith(g, syl_pos)
                    ),
                    syllable[syl_pos],
                )
                graphemes.append(
                    GraphemeToken(
                        surface=surface,
                        grapheme_idx=len(graphemes),
                        syllable_idx=syl_idx,
                        parent_word=self,
                        dialect=self.dialect,
                    )
                )
                syl_pos += len(surface)

        return graphemes

    def _normalize_syllables(self) -> List[str]:
        """
        Normalize syllables for doubled consonant handling.

        Syllabification splits doubled consonants across the boundary
        (carro → car.ro), but they are processed as one unit. When a
        syllable ends with r, s, f or l and the next one starts with the
        same letter, the first copy is moved to the next syllable:
        - car.ro → ca.rro
        - baír.ris.mo → baí.rris.mo

        Returns:
            New list of normalized syllable strings; ``self.syllables``
            is left untouched
        """
        norm_syllables = list(self.syllables)

        for idx in range(len(norm_syllables) - 1):
            current = norm_syllables[idx]
            next_syl = norm_syllables[idx + 1]

            # Does the boundary split a doubled consonant?
            for consonant in ("r", "s", "f", "l"):
                if current.endswith(consonant) and next_syl.startswith(consonant):
                    norm_syllables[idx] = current[:-1]
                    norm_syllables[idx + 1] = consonant + next_syl
                    break

        return norm_syllables

    @staticmethod
    def _build_char_to_syllable_map(syllables: List[str]) -> Dict[int, int]:
        """
        Map character index to syllable index.

        Args:
            syllables: List of syllable strings

        Returns:
            Dictionary mapping each character position (counted across
            the concatenated syllables) to the index of its syllable
        """
        char_to_syl: Dict[int, int] = {}
        char_pos = 0

        for syl_idx, syl in enumerate(syllables):
            for _ in syl:
                char_to_syl[char_pos] = syl_idx
                char_pos += 1

        return char_to_syl

    def _compute_indices(self):
        """
        Compute all character and grapheme indices top-down.

        Sets, for every grapheme and each of its characters:
        - ``_idx_in_word``: running character offset within the word
        - ``_idx_in_sentence``: word offset + character offset

        When the word's own offset is unknown (negative sentinel), the
        sentence index of its graphemes and characters is set to -1 as
        well, rather than to an arithmetic artefact of the sentinel.
        """
        base = self._idx_in_sentence
        char_idx_in_word = 0

        for grapheme in self.graphemes:
            grapheme._idx_in_word = char_idx_in_word
            grapheme._idx_in_sentence = (
                base + char_idx_in_word if base >= 0 else -1
            )

            for char in grapheme.characters:
                char._idx_in_word = char_idx_in_word
                char._idx_in_sentence = (
                    base + char_idx_in_word if base >= 0 else -1
                )
                char_idx_in_word += 1

    # =========================================================================
    # BASIC PROPERTIES
    # =========================================================================

    @cached_property
    def normalized(self) -> str:
        """Lowercase, stripped form of word."""
        return self.surface.lower().strip()

    @cached_property
    def normalized_syllables(self) -> List[str]:
        """Syllables after consonant doubling normalization."""
        return self._normalize_syllables()

    @property
    def n_syllables(self) -> int:
        """Number of syllables in word."""
        return len(self.syllables)

    @property
    def idx_in_sentence(self) -> int:
        """Character index of first letter in sentence (-1 if unknown)."""
        return self._idx_in_sentence

    @cached_property
    def is_archaic(self) -> bool:
        """True if the normalized word is listed in ``dialect.ARCHAIC_WORDS``."""
        return self.normalized in self.dialect.ARCHAIC_WORDS

    # =========================================================================
    # LINKED PROPERTIES
    # =========================================================================

    @cached_property
    def prev_word(self) -> Optional['WordToken']:
        """Previous word in sentence, or None if first."""
        if self.word_idx == 0 or not self.parent_sentence:
            return None
        return self.parent_sentence.words[self.word_idx - 1]

    @cached_property
    def next_word(self) -> Optional['WordToken']:
        """Next word in sentence, or None if last."""
        if self.word_idx == -1 or not self.parent_sentence:
            return None
        if self.word_idx >= len(self.parent_sentence.words) - 1:
            return None
        return self.parent_sentence.words[self.word_idx + 1]

    # =========================================================================
    # STRESS PROPERTIES
    # =========================================================================

    @cached_property
    def stressed_syllable_idx(self) -> int:
        """
        Index of syllable carrying primary stress.

        Uses detect_stress_position() helper function.
        """
        return detect_stress_position(
            self.normalized,
            self.syllables,
            self.dialect
        )

    # =========================================================================
    # IPA GENERATION
    # =========================================================================

    @cached_property
    def ipa(self) -> str:
        """
        Generate IPA transcription for entire word.

        ALGORITHM:
        ----------
        1. Check irregular word list (overrides all rules)
        2. Generate IPA for each grapheme, grouped by syllable
        3. Prefix the stressed syllable with the dialect's
           ``PRIMARY_STRESS_TOKEN``
        4. Join syllables with the dialect's ``HIATUS_TOKEN``

        Graphemes with an empty IPA (silent) or a syllable index outside
        the word's syllables contribute nothing.

        Returns:
            Full IPA transcription with stress and syllable marks
        """
        # Check irregular words first
        if self.normalized in self.dialect.IRREGULAR_WORDS:
            return self.dialect.IRREGULAR_WORDS[self.normalized]

        # Generate grapheme IPAs grouped by syllable
        syllable_ipas = [[] for _ in self.syllables]

        for grapheme in self.graphemes:
            syl_idx = grapheme.syllable_idx
            if 0 <= syl_idx < len(syllable_ipas):
                grapheme_ipa = grapheme.ipa
                if grapheme_ipa:  # Skip empty (silent) graphemes
                    syllable_ipas[syl_idx].append(grapheme_ipa)

        # Join graphemes within syllables
        syllable_strings = ["".join(ipa_list) for ipa_list in syllable_ipas]

        # Insert stress marker before stressed syllable
        stressed_idx = self.stressed_syllable_idx
        if 0 <= stressed_idx < len(syllable_strings):
            syllable_strings[stressed_idx] = (
                self.dialect.PRIMARY_STRESS_TOKEN + syllable_strings[stressed_idx]
            )

        # Join syllables with hiatus marker
        return self.dialect.HIATUS_TOKEN.join(syllable_strings)

    # =========================================================================
    # FEATURE EXTRACTION
    # =========================================================================

    @property
    def features(self) -> Dict[str, any]:
        """
        Extract all linguistic features.

        Returns:
            Dictionary with word features plus every grapheme feature,
            keyed as ``graph_<grapheme_idx>_<feature name>``
        """
        # NOTE(review): ``any`` in the annotation is the builtin function,
        # not ``typing.Any``; kept to leave the signature untouched.
        feats = {
            "n_syllables": self.n_syllables,
            "idx_in_sentence": self.idx_in_sentence,
            "stressed_syllable_idx": self.stressed_syllable_idx,
        }

        for grapheme in self.graphemes:
            for k, v in grapheme.features.items():
                feats[f"graph_{grapheme.grapheme_idx}_{k}"] = v

        return feats

    def __eq__(self, other) -> bool:
        """Allow comparison with string (matched against ``surface``)."""
        if isinstance(other, str):
            return self.surface == other
        return super().__eq__(other)

    def __repr__(self) -> str:
        """String representation for debugging."""
        syllables_str = ".".join(self.syllables)
        return f"WordToken('{self.surface}' [{syllables_str}] → [{self.ipa}])"
3396
+
3397
+
3398
+ # =============================================================================
3399
+ # SENTENCE
3400
+ # =============================================================================
3401
+
3402
+ @dataclasses.dataclass
3403
+ class Sentence:
3404
+ """
3405
+ Represents a sentence with full phonological analysis.
3406
+
3407
+ SENTENCE-LEVEL PHONOLOGY:
3408
+ -------------------------
3409
+ While most phonological rules operate at word level,
3410
+ sentences introduce:
3411
+ 1. Liaison: linking between words (resyllabification)
3412
+ 2. Phrasal stress: prominence patterns across words
3413
+ 3. Intonation: pitch contours for questions, statements, etc.
3414
+
3415
+ CURRENT IMPLEMENTATION:
3416
+ -----------------------
3417
+ This version focuses on word-level analysis.
3418
+ Sentence-level prosody (liaison, phrasal stress, intonation)
3419
+ is simplified or not yet implemented.
3420
+
3421
+ Future extensions could include:
3422
+ - Liaison rules (final consonant + initial vowel)
3423
+ - Phrasal stress patterns
3424
+ - Intonation contours (ToBI annotation)
3425
+
3426
+ Attributes:
3427
+ surface: Raw sentence text
3428
+ words: List of WordToken objects
3429
+ dialect: DialectInventory with rules
3430
+ """
3431
+
3432
+ surface: str
3433
+ words: List[WordToken] = dataclasses.field(default_factory=list)
3434
+ dialect: DialectInventory = dataclasses.field(default_factory=EuropeanPortuguese)
3435
+
3436
def __post_init__(self):
    """
    Initialize word tokens with computed indices.

    TOKENIZATION:
    -------------
    The normalized sentence is split on whitespace and hyphens.
    Other punctuation is kept attached to words for now.

    More sophisticated tokenization could handle:
    - Clitics: dar-lhe → dar + lhe
    - Contractions: do → de + o
    - Punctuation separation

    INDEXING:
    ---------
    Each word's character offset is located in the lowercased surface
    text and handed to the ``WordToken`` constructor, because the token
    derives its grapheme and character sentence indices while it is
    being initialized; assigning the offset afterwards would be too
    late for them. A word that cannot be located (``find`` returns -1,
    which can happen when normalization rewrote the text) gets offset
    -1 and does not move the search cursor.

    Nothing is done when ``words`` was supplied by the caller.
    """
    if self.words:
        return

    lowered_surface = self.surface.lower()
    # Tokenize on whitespace and hyphen
    word_surfaces = self.normalized.replace('-', ' ').split()

    char_position = 0
    for idx, word_surface in enumerate(word_surfaces):
        # Offset of this word at or after the end of the previous one.
        word_start = lowered_surface.find(word_surface, char_position)

        self.words.append(
            WordToken(
                surface=word_surface,
                word_idx=idx,
                parent_sentence=self,
                dialect=self.dialect,
                _idx_in_sentence=word_start,
            )
        )

        if word_start >= 0:
            # Skip past the word and the separator that follows it.
            char_position = word_start + len(word_surface) + 1
3473
+
3474
+ # =========================================================================
3475
+ # BASIC PROPERTIES
3476
+ # =========================================================================
3477
@cached_property
def normalized(self) -> str:
    """
    Lowercase form of the sentence with outer punctuation/whitespace removed.

    The trimmed text is then passed through ``normalize_numbers``.
    """
    trim_chars = string.punctuation + string.whitespace
    lowered = self.surface.lower()
    return normalize_numbers(lowered.strip(trim_chars))
3483
+
3484
@property
def n_words(self) -> int:
    """Number of word tokens currently held in ``words``."""
    return len(self.words)
3488
+
3489
+ # =========================================================================
3490
+ # IPA GENERATION
3491
+ # =========================================================================
3492
+
3493
@cached_property
def ipa(self) -> str:
    """
    IPA transcription of the whole sentence.

    ALGORITHM:
    ----------
    Each word token contributes its own ``ipa`` string, in sentence
    order, and the pieces are joined with a single space as the word
    boundary marker.

    SIMPLIFICATION:
    ---------------
    Words are transcribed independently of their neighbours, so
    cross-word phenomena are not modelled here:
    - Liaison across word boundaries
    - Resyllabification (e.g., "os amigos" → "o.za.mi.gos")
    - Phrasal stress patterns

    Returns:
        Space-separated IPA transcription (empty string when there
        are no words)
    """
    return " ".join(word.ipa for word in self.words)
3516
+
3517
+ # =========================================================================
3518
+ # FEATURE EXTRACTION
3519
+ # =========================================================================
3520
+
3521
@property
def features(self) -> Dict[str, object]:
    """
    Extract all linguistic features as one flat dictionary.

    Sentence-level entries:
    - "n_words": number of word tokens
    - "n_whitespaces": number of gaps between consecutive words,
      i.e. ``n_words - 1``, reported as 0 for a sentence with no words

    Every entry of each word's own ``features`` mapping is copied in
    under the key ``word_<word_idx>_<feature name>``.

    WARNING: Can produce very large feature dictionaries
    for long sentences. Consider alternative representations
    (e.g., arrays, DataFrames) for ML applications.

    Returns:
        Dictionary with sentence features and flattened word features
    """
    feats: Dict[str, object] = {
        "n_words": self.n_words,
        # A sentence without words has no separators; clamp so the
        # count never goes negative.
        "n_whitespaces": max(self.n_words - 1, 0),
    }

    for word in self.words:
        for k, v in word.features.items():
            feats[f"word_{word.word_idx}_{k}"] = v

    return feats
3543
+
3544
def __eq__(self, other) -> bool:
    """
    Compare against another object.

    A plain string is compared with the sentence's surface text;
    any other operand is delegated to the parent class comparison.
    """
    if not isinstance(other, str):
        return super().__eq__(other)
    return self.surface == other
3549
+
3550
def __repr__(self) -> str:
    """
    Debug representation showing the surface text and its IPA.

    Reading ``self.ipa`` here means the transcription is computed
    when the representation is requested.
    """
    return f"Sentence('{self.surface}' → [{self.ipa}])"
3553
+
3554
+
3555
+ # =============================================================================
3556
+ # UTILITY FUNCTIONS FOR TESTING AND DEMONSTRATION
3557
+ # =============================================================================
3558
+
3559
def demonstrate_transcription(text: str, dialect: DialectInventory = None):
    """
    Print an IPA transcription of *text* with a per-word breakdown.

    Intended as a pedagogical view of the transcription: everything is
    written to standard output and nothing is returned.

    Args:
        text: Portuguese text to transcribe
        dialect: DialectInventory to use; when None, a fresh
            EuropeanPortuguese() instance is used

    OUTPUT LAYOUT:
    --------------
    - "Sentence: <surface>" and "IPA: [<sentence ipa>]"
    - a "Words:" heading, then for every word token:
        * "<1-based position>. <surface> [<word ipa>]"
        * its syllables joined with "."
        * the index of its stressed syllable
        * its graphemes as "<surface>[<ipa>]", tagged "(diphthong)"
          and/or "(digraph)" where the grapheme reports those flags
    - a blank line after each word
    """
    chosen_dialect = EuropeanPortuguese() if dialect is None else dialect
    sentence = Sentence(text, dialect=chosen_dialect)

    # Sentence-level summary
    print(f"Sentence: {sentence.surface}")
    print(f"IPA: [{sentence.ipa}]")
    print()
    print("Words:")

    for word in sentence.words:
        print(f"{word.word_idx + 1}. {word.surface} [{word.ipa}]")
        print(f" Syllables: {'.'.join(word.syllables)}")
        print(f" Stress: syllable {word.stressed_syllable_idx}")

        # One label per grapheme; the tags are appended in a fixed
        # order (diphthong first, then digraph).
        grapheme_strs = [
            f"{g.surface}[{g.ipa}]"
            + ("(diphthong)" if g.is_diphthong else "")
            + ("(digraph)" if g.is_digraph else "")
            for g in word.graphemes
        ]

        print(f" Graphemes: {' '.join(grapheme_strs)}")
        print()
3613
+
3614
+
3615
+ # =============================================================================
3616
+ # MAIN DEMONSTRATION
3617
+ # =============================================================================
3618
+
3619
if __name__ == "__main__":
    # Script entry point: demonstrate the transcription system with
    # various Portuguese examples.
    #
    # The first pass prints a detailed European Portuguese analysis of
    # sentences showcasing different stress patterns, diphthongs and
    # nasal vowels, consonant digraphs, and challenging orthographic
    # patterns. The second pass prints the same sentences side by side
    # in every dialect, followed by one detailed pt-BR analysis.
    banner = "=" * 80

    print(banner)
    print("PORTUGUESE ORTHOGRAPHY → IPA TRANSCRIPTION SYSTEM")
    print(banner)
    print()

    # Example sentences showcasing different phenomena
    showcase_sentences = (
        "O cão comeu o pão.",                # basic, nasal diphthongs
        "O médico português está no café.",  # stress patterns
        "A rainha viu o vinho.",             # diphthongs and digraphs
        "O carro chegou rápido.",            # complex consonants
        "O exemplo do táxi é exato.",        # X variants
        "Um homem tem compaixão.",           # nasal patterns
    )

    european = EuropeanPortuguese()

    for example in showcase_sentences:
        demonstrate_transcription(example, european)
        print(banner)
        print()

    print("\nTranscription complete!")

    # Cross-dialect comparison: one line per dialect for each sentence.
    comparison_sentences = (
        "O cão comeu o pão.",
        "Três tigres tristes.",
        "Brasil é bonito.",
        "A tia comeu muito.",
    )

    # Each dialect is instantiated once and reused for every sentence.
    dialects = (
        ("European", EuropeanPortuguese()),
        ("Brazilian", BrazilianPortuguese()),
        ("Angolan", AngolanPortuguese()),
        ("Mozambican", MozambicanPortuguese()),
        ("Timorese", TimoresePortuguese()),
    )

    for example in comparison_sentences:
        print(f"\nExample: {example}")
        print("-" * 80)
        for name, dialect in dialects:
            sent = Sentence(example, dialect=dialect)
            print(f"{name:15} [{dialect.dialect_code}]: {sent.ipa}")
        print()

    print("\nDetailed analysis: pt-BR")
    print(banner)
    demonstrate_transcription("A tia comeu muito pão.", BrazilianPortuguese())