phoonnx 0.0.2a2__py3-none-any.whl → 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
phoonnx/phonemizers/ar.py CHANGED
@@ -1,11 +1,15 @@
1
1
  from phoonnx.phonemizers.base import BasePhonemizer
2
2
  from phoonnx.thirdparty.mantoq import g2p as mantoq
3
3
  from phoonnx.config import Alphabet
4
+ from phoonnx.thirdparty.bw2ipa import translate as bw2ipa
5
+
4
6
 
5
7
  class MantoqPhonemizer(BasePhonemizer):
6
8
 
7
def __init__(self, alphabet=Alphabet.MANTOQ):
    """
    Create a Mantoq-based Arabic phonemizer.

    Args:
        alphabet: Output alphabet; must be Alphabet.MANTOQ (the default)
            or Alphabet.IPA.

    Raises:
        ValueError: If any other alphabet is requested.
    """
    supported_alphabets = (Alphabet.IPA, Alphabet.MANTOQ)
    if alphabet in supported_alphabets:
        super().__init__(alphabet)
        return
    raise ValueError("unsupported alphabet")
9
13
 
10
14
  @classmethod
11
15
  def get_lang(cls, target_lang: str) -> str:
@@ -26,17 +30,77 @@ class MantoqPhonemizer(BasePhonemizer):
26
30
 
27
31
  def phonemize_string(self, text: str, lang: str = "ar") -> str:
28
32
  """
33
+ Phonemizes an Arabic string using the Mantoq G2P tool.
34
+ If the alphabet is set to IPA, it then converts the result using bw2ipa.
29
35
  """
30
36
  lang = self.get_lang(lang)
37
+ # The mantoq function returns a tuple of (normalized_text, phonemes)
31
38
  normalized_text, phonemes = mantoq(text)
32
- return "".join(phonemes).replace("_+_", " ")
39
+
40
+ # The phonemes are a list of characters, we join them into a string
41
+ # and replace the word separator token with a space.
42
+ phonemes = "".join(phonemes).replace("_+_", " ")
43
+
44
+ if self.alphabet == Alphabet.IPA:
45
+ # If the alphabet is IPA, we use the bw2ipa function to translate
46
+ # the Buckwalter-like phonemes into IPA.
47
+ return bw2ipa(phonemes)
48
+
49
+ # Otherwise, we return the phonemes in the default Mantoq alphabet.
50
+ return phonemes
33
51
 
34
52
 
35
53
  if __name__ == "__main__":
36
- text = "مرحبا بالعالم"
37
- # gets normalized to
38
- # مَرْحَبًا بِالْعالَم
54
+ # Initialize phonemizers for both MANTOQ and IPA alphabets
55
+ pho_mantoq = MantoqPhonemizer()
56
+ pho_ipa = MantoqPhonemizer(alphabet=Alphabet.IPA)
57
+
58
+ text1 = "مرحبا بالعالم"
59
+ print(f"Original Text: {text1}")
60
+ print(f" Mantoq Phonemizer: {pho_mantoq.phonemize_string(text1, 'ar')}")
61
+ print(f" IPA Phonemizer: {pho_ipa.phonemize_string(text1, 'ar')}")
62
+ print("-" * 20)
63
+
64
+ text2 = "ذهب الطالب إلى المكتبة لقراءة كتاب عن تاريخ الأندلس."
65
+ print(f"Original Text: {text2}")
66
+ print(f" Mantoq Phonemizer: {pho_mantoq.phonemize_string(text2, 'ar')}")
67
+ print(f" IPA Phonemizer: {pho_ipa.phonemize_string(text2, 'ar')}")
68
+ print("-" * 20)
69
+
70
+ # --- New Test Cases to check bw2ipa logic ---
71
+ print("--- New Test Cases for bw2ipa logic ---")
72
+
73
+ # 1. Test for gemination of a sun letter (e.g., ash-shams)
74
+ text3 = "الشمس"
75
+ print(f"Original Text: '{text3}'")
76
+ print(f" Mantoq Phonemizer: {pho_mantoq.phonemize_string(text3, 'ar')}")
77
+ print(f" IPA Phonemizer: {pho_ipa.phonemize_string(text3, 'ar')}")
78
+ print("-" * 20)
79
+
80
+ # 2. Test for long vowels (e.g., 'fil' - elephant)
81
+ text4 = "فيل"
82
+ print(f"Original Text: '{text4}'")
83
+ print(f" Mantoq Phonemizer: {pho_mantoq.phonemize_string(text4, 'ar')}")
84
+ print(f" IPA Phonemizer: {pho_ipa.phonemize_string(text4, 'ar')}")
85
+ print("-" * 20)
86
+
87
+ # 3. Test for glide (e.g., 'yawm' - day)
88
+ text5 = "يوم"
89
+ print(f"Original Text: '{text5}'")
90
+ print(f" Mantoq Phonemizer: {pho_mantoq.phonemize_string(text5, 'ar')}")
91
+ print(f" IPA Phonemizer: {pho_ipa.phonemize_string(text5, 'ar')}")
92
+ print("-" * 20)
93
+
94
+ # 4. Test for long vowels (e.g., 'suwr' - wall)
95
+ text6 = "سور"
96
+ print(f"Original Text: '{text6}'")
97
+ print(f" Mantoq Phonemizer: {pho_mantoq.phonemize_string(text6, 'ar')}")
98
+ print(f" IPA Phonemizer: {pho_ipa.phonemize_string(text6, 'ar')}")
99
+ print("-" * 20)
39
100
 
40
- pho = MantoqPhonemizer()
41
- print(pho.phonemize(text, "ar"))
42
- # [('m a r H a b a n aa b i l E aa l a m', '.', True)]
101
+ # 5. Test for glide (e.g., 'law' - if)
102
+ text7 = "لو"
103
+ print(f"Original Text: '{text7}'")
104
+ print(f" Mantoq Phonemizer: {pho_mantoq.phonemize_string(text7, 'ar')}")
105
+ print(f" IPA Phonemizer: {pho_ipa.phonemize_string(text7, 'ar')}")
106
+ print("-" * 20)
@@ -0,0 +1,66 @@
1
+ # -*- coding: UTF8 -*-
2
+ """
3
+ This script translates Buckwalter-transcribed Modern Standard Arabic to IPA.
4
+
5
+ This relies on Mantoq tokenization that uses separate tokens for vowel length and consonant gemination.
6
+
7
+ by Casimiro Ferreira with help of Gemini July 2025.
8
+ """
9
+
10
+ import re
11
+
12
# Maps a single Buckwalter character to its most common IPA equivalent.
# Every key is exactly one character long; translate() relies on that and
# passes any character missing from this table through unchanged.
char_dict = {
    'a': 'a', 'A': 'aː', 'b': 'b', 'c': 'x', 'd': 'd', 'D': 'dˤ', 'e': 'e', 'E': 'ʕ',
    'f': 'f', 'g': 'ɣ', 'h': 'h', 'H': 'ħ', 'i': 'i', 'I': 'iː', 'j': 'ʒ', 'k': 'k',
    'l': 'l', 'm': 'm', 'n': 'n', 'p': 'p', 'q': 'q', 'r': 'r', 'R': 'r', 's': 's',
    'S': 'sˤ', 't': 't', 'T': 'tˤ', 'u': 'u', 'U': 'uː', 'v': 'v', 'w': 'w', 'x': 'x',
    'y': 'j', 'z': 'z', 'Z': 'ðˤ', '\'': 'ʔ', '<': 'ʔ', 'o': 'o', '-': ' ',
    '*': 'ð', '$': 'ʃ',
}
# IPA vowels after which a doubling token means "long vowel" rather than
# "geminated consonant".
_vowels = {'a', 'i', 'u', 'aː', 'iː', 'uː'}

# Mantoq token that marks the preceding phoneme as doubled.
_DBL_TOKEN = '_dbl_'
# IPA length mark appended to a vowel to make it long.
_LENGTH_MARK = 'ː'


def translate(buckwalter_text: str) -> str:
    """
    Translates a Buckwalter-transcribed string into an IPA string.

    The input is scanned left to right. Each character is looked up in
    ``char_dict``; characters without a mapping (spaces, punctuation, ...)
    are copied to the output as-is. The ``_dbl_`` token modifies the
    phoneme emitted just before it:

    * after a short vowel it appends the IPA length mark,
    * after a phoneme that is already long it adds nothing, so the output
      never contains a doubled length mark,
    * after any other phoneme it repeats that phoneme (gemination),
    * at the very start of the input it is dropped.

    Args:
        buckwalter_text (str): The Buckwalter string to translate.

    Returns:
        str: The translated IPA string.
    """
    ipa_list = []
    i = 0
    text_length = len(buckwalter_text)
    while i < text_length:
        # The doubling token is the only multi-character unit in the input,
        # so it has to be recognised before the per-character lookup.
        if buckwalter_text.startswith(_DBL_TOKEN, i):
            if ipa_list:
                previous = ipa_list[-1]
                if previous.endswith(_LENGTH_MARK):
                    # Already long (e.g. 'A' -> 'aː'); a second mark would
                    # produce the invalid sequence 'ːː'.
                    pass
                elif previous in _vowels:
                    ipa_list.append(_LENGTH_MARK)
                else:
                    ipa_list.append(previous)
            i += len(_DBL_TOKEN)
            continue

        # char_dict only has one-character keys, so a single lookup with a
        # pass-through default covers every remaining case.
        current = buckwalter_text[i]
        ipa_list.append(char_dict.get(current, current))
        i += 1

    return ''.join(ipa_list)
phoonnx/version.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # Package version components (major / minor / build / alpha number).
  # NOTE(review): the START/END markers below look like anchors for release
  # tooling that rewrites these lines -- confirm before reformatting them.
  # START_VERSION_BLOCK
2
2
  VERSION_MAJOR = 0
3
- VERSION_MINOR = 0
4
- VERSION_BUILD = 2
5
- VERSION_ALPHA = 2
3
+ VERSION_MINOR = 1
4
+ VERSION_BUILD = 0
5
+ VERSION_ALPHA = 1
6
6
  # END_VERSION_BLOCK
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: phoonnx
3
- Version: 0.0.2a2
3
+ Version: 0.1.0a1
4
4
  Home-page: https://github.com/TigreGotico/phoonnx
5
5
  Author: JarbasAi
6
6
  Author-email: jarbasai@mailfence.com
@@ -2,14 +2,14 @@ phoonnx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  phoonnx/config.py,sha256=81H34oPG2BaiOA6UM1KapoT341n068LqRprKb5ER6mY,19451
3
3
  phoonnx/phoneme_ids.py,sha256=FiNgZwV6naEsBh6XwFLh3_FyOgPiCsK9qo7S0v-CmI4,13667
4
4
  phoonnx/util.py,sha256=XSjFEoqSFcujFTHxednacgC9GrSYyF-Il5L6Utmxmu4,25909
5
- phoonnx/version.py,sha256=E5lPoALEg4bqb4bmnVYnseCgozkzjZGnhpUom_OlcQI,114
5
+ phoonnx/version.py,sha256=95gLFCt-8xv9DgF7FIF6CljWmhm8SUhevumEBfo7Pl0,114
6
6
  phoonnx/voice.py,sha256=FR_LafK1vSi_anPERJjZBuH3Bb9vUIof0MAW6TnALlA,20024
7
7
  phoonnx/locale/ca/phonetic_spellings.txt,sha256=igv3t7jxLSRE5GHsdn57HOpxiWNcEmECPql6m02wbO0,47
8
8
  phoonnx/locale/en/phonetic_spellings.txt,sha256=xGQlWOABLzbttpQvopl9CU-NnwEJRqKx8iuylsdUoQA,27
9
9
  phoonnx/locale/gl/phonetic_spellings.txt,sha256=igv3t7jxLSRE5GHsdn57HOpxiWNcEmECPql6m02wbO0,47
10
10
  phoonnx/locale/pt/phonetic_spellings.txt,sha256=KntS8QMynEJ5A3Clvcjq4qlmL-ThSbhfD6v0nKSrlqs,49
11
11
  phoonnx/phonemizers/__init__.py,sha256=QGBZk0QUgJdg2MwUWY9Kpk6ucwrEJYtHb07YcNvXCV4,1647
12
- phoonnx/phonemizers/ar.py,sha256=rPAMGPlyXOlKY99IoHe_vWcLllepiG0YFUbOBia075E,1260
12
+ phoonnx/phonemizers/ar.py,sha256=29bCfYhlhx0QX3PQyx3EkUghzh8YfkxNAnMAICXX6I8,4148
13
13
  phoonnx/phonemizers/base.py,sha256=yPg6-dvscYpl3rR3JEULG1PRF-i8DWC_C3HAZGLbxOo,7648
14
14
  phoonnx/phonemizers/en.py,sha256=N2SVoVhplQao7Ej5TXbxJU-YkAgkY0Fr9iYBFnsjFSE,9271
15
15
  phoonnx/phonemizers/fa.py,sha256=d_DZM2wqomf4gcRH_rFcNA3VkQWKHru8vwBwaNG8Ll8,1452
@@ -22,6 +22,7 @@ phoonnx/phonemizers/vi.py,sha256=_XJc-Xeawr1Lxr7o8mE_hJao1aGcj4g01XYAOxC_Scg,131
22
22
  phoonnx/phonemizers/zh.py,sha256=88Ywq8h9LDanlyz8RHjRSCY_PRK_Dq808tBADyrgaP8,9657
23
23
  phoonnx/thirdparty/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
24
  phoonnx/thirdparty/arpa2ipa.py,sha256=Uj1G5NgP5oBBfSm26LGB8QoumdT-NqCLQTZHT165-_o,5850
25
+ phoonnx/thirdparty/bw2ipa.py,sha256=5FiWC4AP4KXkqtbclbinoXEsUnSYEjk4VWAPasMMcbg,2328
25
26
  phoonnx/thirdparty/hangul2ipa.py,sha256=e2c0WOy5lFMcf6GS7pNqIbauMKBX07S84lCczZAZJGA,27518
26
27
  phoonnx/thirdparty/zh_num.py,sha256=SESA6gvSJW3LZ0FLoybXn2SpbxqhQTi9Tg_U2IZ5JYY,7147
27
28
  phoonnx/thirdparty/cotovia/cotovia_aarch64,sha256=BsAWZN452Lm9kDU4i6rQGHFSlmxP3GfHRKhbJMUQrfA,6764592
@@ -80,7 +81,7 @@ phoonnx_train/vits/utils.py,sha256=exiyrtPHbnnGvcHWSbaH9-gR6srH5ZPHlKiqV2IHUrQ,4
80
81
  phoonnx_train/vits/wavfile.py,sha256=oQZiTIrdw0oLTbcVwKfGXye1WtKte6qK_52qVwiMvfc,26396
81
82
  phoonnx_train/vits/monotonic_align/__init__.py,sha256=5IdAOD1Z7UloMb6d_9NRFsXoNIjEQ3h9mvOSh_AtO3k,636
82
83
  phoonnx_train/vits/monotonic_align/setup.py,sha256=0K5iJJ2mKIklx6ncEfCQS34skm5hHPiz9vRlQEvevvY,266
83
- phoonnx-0.0.2a2.dist-info/METADATA,sha256=-uMrs5iV1A4gqenGg1r8ZyhZN1DhBd9HCsezIy4UxPE,8145
84
- phoonnx-0.0.2a2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
85
- phoonnx-0.0.2a2.dist-info/top_level.txt,sha256=ZrnHXe-4HqbOSX6fbdY-JiP7YEu2Bok9T0ji351MrmM,22
86
- phoonnx-0.0.2a2.dist-info/RECORD,,
84
+ phoonnx-0.1.0a1.dist-info/METADATA,sha256=9FZiRhA48da6ZbX1qCrKKVqsWMWQwfedz-bUXATd6Sk,8145
85
+ phoonnx-0.1.0a1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
86
+ phoonnx-0.1.0a1.dist-info/top_level.txt,sha256=ZrnHXe-4HqbOSX6fbdY-JiP7YEu2Bok9T0ji351MrmM,22
87
+ phoonnx-0.1.0a1.dist-info/RECORD,,