phoonnx 0.1.0a1__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
phoonnx/config.py CHANGED
@@ -33,7 +33,7 @@ class Alphabet(str, Enum):
33
33
  ERAAB = "eraab" # fa
34
34
  COTOVIA = "cotovia" # gl
35
35
  HANZI = "hanzi" # zh
36
- MANTOQ = "mantoq" # ar
36
+ BUCKWALTER = "buckwalter" # ar
37
37
 
38
38
 
39
39
 
@@ -48,6 +48,8 @@ class PhonemeType(str, Enum):
48
48
  EPITRAN = "epitran"
49
49
  BYT5 = "byt5"
50
50
  CHARSIU = "charsiu" # technically same as byt5, but needs special handling for whitespace
51
+ TRANSPHONE = "transphone"
52
+ MIRANDESE = "mwl_phonemizer"
51
53
 
52
54
  DEEPPHONEMIZER = "deepphonemizer" # en
53
55
  OPENPHONEMIZER = "openphonemizer" # en
@@ -382,6 +384,9 @@ class SynthesisConfig:
382
384
 
383
385
  enable_phonetic_spellings: bool = True
384
386
 
387
+ """for arabic and hebrew models"""
388
+ add_diacritics: bool = True
389
+
385
390
 
386
391
  def get_phonemizer(phoneme_type: PhonemeType,
387
392
  alphabet: Alphabet = Alphabet.IPA,
@@ -389,6 +394,7 @@ def get_phonemizer(phoneme_type: PhonemeType,
389
394
  from phoonnx.phonemizers import (EpitranPhonemizer, EspeakPhonemizer, OpenPhonemizer, OpenJTaklPhonemizer,
390
395
  ByT5Phonemizer, CharsiuPhonemizer, DeepPhonemizer, PersianPhonemizer,
391
396
  G2pCPhonemizer, G2pMPhonemizer, G2PKPhonemizer, G2PEnPhonemizer,
397
+ TransphonePhonemizer, MirandesePhonemizer,
392
398
  GruutPhonemizer, GraphemePhonemizer, MantoqPhonemizer, MisakiPhonemizer,
393
399
  KoG2PPhonemizer, PypinyinPhonemizer, PyKakasiPhonemizer, CotoviaPhonemizer,
394
400
  CutletPhonemizer, PhonikudPhonemizer, VIPhonemePhonemizer, XpinyinPhonemizer,
@@ -405,6 +411,10 @@ def get_phonemizer(phoneme_type: PhonemeType,
405
411
  phonemizer = EpitranPhonemizer()
406
412
  elif phoneme_type == PhonemeType.MISAKI:
407
413
  phonemizer = MisakiPhonemizer()
414
+ elif phoneme_type == PhonemeType.TRANSPHONE:
415
+ phonemizer = TransphonePhonemizer()
416
+ elif phoneme_type == PhonemeType.MIRANDESE:
417
+ phonemizer = MirandesePhonemizer()
408
418
  elif phoneme_type == PhonemeType.DEEPPHONEMIZER:
409
419
  phonemizer = DeepPhonemizer(model)
410
420
  elif phoneme_type == PhonemeType.OPENPHONEMIZER:
phoonnx/phonemizers/__init__.py CHANGED
@@ -12,7 +12,8 @@ from phoonnx.phonemizers.ko import KoG2PPhonemizer, G2PKPhonemizer
12
12
  from phoonnx.phonemizers.zh import (G2pCPhonemizer, G2pMPhonemizer, PypinyinPhonemizer,
13
13
  XpinyinPhonemizer, JiebaPhonemizer)
14
14
  from phoonnx.phonemizers.mul import (EspeakPhonemizer, EpitranPhonemizer, MisakiPhonemizer,
15
- GruutPhonemizer, ByT5Phonemizer, CharsiuPhonemizer)
15
+ GruutPhonemizer, ByT5Phonemizer, CharsiuPhonemizer, TransphonePhonemizer)
16
+ from phoonnx.phonemizers.mwl import MirandesePhonemizer
16
17
 
17
18
  Phonemizer = Union[
18
19
  MisakiPhonemizer,
@@ -22,6 +23,8 @@ Phonemizer = Union[
22
23
  EspeakPhonemizer,
23
24
  GruutPhonemizer,
24
25
  EpitranPhonemizer,
26
+ TransphonePhonemizer,
27
+ MirandesePhonemizer,
25
28
  OpenJTaklPhonemizer,
26
29
  CutletPhonemizer,
27
30
  PyKakasiPhonemizer,
phoonnx/phonemizers/ar.py CHANGED
@@ -1,13 +1,13 @@
1
- from phoonnx.phonemizers.base import BasePhonemizer
2
- from phoonnx.thirdparty.mantoq import g2p as mantoq
3
1
  from phoonnx.config import Alphabet
2
+ from phoonnx.phonemizers.base import BasePhonemizer
4
3
  from phoonnx.thirdparty.bw2ipa import translate as bw2ipa
4
+ from phoonnx.thirdparty.mantoq import g2p as mantoq
5
5
 
6
6
 
7
7
  class MantoqPhonemizer(BasePhonemizer):
8
8
 
9
- def __init__(self, alphabet=Alphabet.MANTOQ):
10
- if alphabet not in [Alphabet.IPA, Alphabet.MANTOQ]:
9
+ def __init__(self, alphabet=Alphabet.BUCKWALTER):
10
+ if alphabet not in [Alphabet.IPA, Alphabet.BUCKWALTER]:
11
11
  raise ValueError("unsupported alphabet")
12
12
  super().__init__(alphabet)
13
13
 
@@ -39,7 +39,7 @@ class MantoqPhonemizer(BasePhonemizer):
39
39
 
40
40
  # The phonemes are a list of characters, we join them into a string
41
41
  # and replace the word separator token with a space.
42
- phonemes = "".join(phonemes).replace("_+_", " ")
42
+ phonemes = "".join(phonemes).replace("_+_", " ")
43
43
 
44
44
  if self.alphabet == Alphabet.IPA:
45
45
  # If the alphabet is IPA, we use the bw2ipa function to translate
@@ -51,56 +51,48 @@ class MantoqPhonemizer(BasePhonemizer):
51
51
 
52
52
 
53
53
  if __name__ == "__main__":
54
+ from phoonnx.phonemizers.mul import EspeakPhonemizer
55
+
56
+ espeak = EspeakPhonemizer()
57
+
54
58
  # Initialize phonemizers for both MANTOQ and IPA alphabets
55
- pho_mantoq = MantoqPhonemizer()
56
- pho_ipa = MantoqPhonemizer(alphabet=Alphabet.IPA)
59
+ pho_mantoq = MantoqPhonemizer(alphabet=Alphabet.IPA)
60
+
61
+
62
+ def compare(text):
63
+ print(f"Original Text: {text}")
64
+ print(f" Mantoq: {pho_mantoq.phonemize_string(text, 'ar')}")
65
+ print(f" Espeak: {espeak.phonemize_string(text, 'ar')}")
66
+
67
+ ts = pho_mantoq.add_diacritics(text, 'ar')
68
+ print(f"Tashkeel Text: {ts}")
69
+ print(f" Mantoq: {pho_mantoq.phonemize_string(ts, 'ar')}")
70
+ print(f" Espeak: {espeak.phonemize_string(ts, 'ar')}")
71
+ print("\n#########################")
57
72
 
58
- text1 = "مرحبا بالعالم"
59
- print(f"Original Text: {text1}")
60
- print(f" Mantoq Phonemizer: {pho_mantoq.phonemize_string(text1, 'ar')}")
61
- print(f" IPA Phonemizer: {pho_ipa.phonemize_string(text1, 'ar')}")
62
- print("-" * 20)
63
73
 
64
- text2 = "ذهب الطالب إلى المكتبة لقراءة كتاب عن تاريخ الأندلس."
65
- print(f"Original Text: {text2}")
66
- print(f" Mantoq Phonemizer: {pho_mantoq.phonemize_string(text2, 'ar')}")
67
- print(f" IPA Phonemizer: {pho_ipa.phonemize_string(text2, 'ar')}")
68
- print("-" * 20)
74
+ text = "مرحبا بالعالم"
75
+ compare(text)
69
76
 
70
- # --- New Test Cases to check bw2ipa logic ---
71
- print("--- New Test Cases for bw2ipa logic ---")
77
+ text = "ذهب الطالب إلى المكتبة لقراءة كتاب عن تاريخ الأندلس."
78
+ compare(text)
72
79
 
73
80
  # 1. Test for gemination of a sun letter (e.g., ash-shams)
74
- text3 = "الشمس"
75
- print(f"Original Text: '{text3}'")
76
- print(f" Mantoq Phonemizer: {pho_mantoq.phonemize_string(text3, 'ar')}")
77
- print(f" IPA Phonemizer: {pho_ipa.phonemize_string(text3, 'ar')}")
78
- print("-" * 20)
81
+ text = "الشمس"
82
+ compare(text)
79
83
 
80
84
  # 2. Test for long vowels (e.g., 'fil' - elephant)
81
- text4 = "فيل"
82
- print(f"Original Text: '{text4}'")
83
- print(f" Mantoq Phonemizer: {pho_mantoq.phonemize_string(text4, 'ar')}")
84
- print(f" IPA Phonemizer: {pho_ipa.phonemize_string(text4, 'ar')}")
85
- print("-" * 20)
85
+ text = "فيل"
86
+ compare(text)
86
87
 
87
88
  # 3. Test for glide (e.g., 'yawm' - day)
88
- text5 = "يوم"
89
- print(f"Original Text: '{text5}'")
90
- print(f" Mantoq Phonemizer: {pho_mantoq.phonemize_string(text5, 'ar')}")
91
- print(f" IPA Phonemizer: {pho_ipa.phonemize_string(text5, 'ar')}")
92
- print("-" * 20)
89
+ text = "يوم"
90
+ compare(text)
93
91
 
94
92
  # 4. Test for long vowels (e.g., 'suwr' - wall)
95
- text6 = "سور"
96
- print(f"Original Text: '{text6}'")
97
- print(f" Mantoq Phonemizer: {pho_mantoq.phonemize_string(text6, 'ar')}")
98
- print(f" IPA Phonemizer: {pho_ipa.phonemize_string(text6, 'ar')}")
99
- print("-" * 20)
93
+ text = "سور"
94
+ compare(text)
100
95
 
101
96
  # 5. Test for glide (e.g., 'law' - if)
102
- text7 = "لو"
103
- print(f"Original Text: '{text7}'")
104
- print(f" Mantoq Phonemizer: {pho_mantoq.phonemize_string(text7, 'ar')}")
105
- print(f" IPA Phonemizer: {pho_ipa.phonemize_string(text7, 'ar')}")
106
- print("-" * 20)
97
+ text = "لو"
98
+ compare(text)
phoonnx/phonemizers/base.py CHANGED
@@ -8,6 +8,8 @@ from langcodes import tag_distance
8
8
  from quebra_frases import sentence_tokenize
9
9
  from phoonnx.config import Alphabet
10
10
  from phoonnx.util import normalize
11
+ from phoonnx.thirdparty.phonikud import PhonikudDiacritizer
12
+ from phoonnx.thirdparty.tashkeel import TashkeelDiacritizer
11
13
 
12
14
  # list of (substring, terminator, end_of_sentence) tuples.
13
15
  TextChunks = List[Tuple[str, str, bool]]
@@ -18,10 +20,27 @@ PhonemizedChunks = list[list[str]]
18
20
 
19
21
 
20
22
  class BasePhonemizer(metaclass=abc.ABCMeta):
21
- def __init__(self, alphabet: Alphabet = Alphabet.UNICODE):
23
+ def __init__(self, alphabet: Alphabet = Alphabet.UNICODE,
24
+ taskeen_threshold: Optional[float] = 0.8):
22
25
  super().__init__()
23
26
  self.alphabet = alphabet
24
27
 
28
+ self.taskeen_threshold = taskeen_threshold # arabic only
29
+ self._tashkeel: Optional[TashkeelDiacritizer] = None
30
+ self._phonikud: Optional[PhonikudDiacritizer] = None # hebrew only
31
+
32
+ @property
33
+ def phonikud(self) -> PhonikudDiacritizer:
34
+ if self._phonikud is None:
35
+ self._phonikud = PhonikudDiacritizer()
36
+ return self._phonikud
37
+
38
+ @property
39
+ def tashkeel(self) -> TashkeelDiacritizer:
40
+ if self._tashkeel is None:
41
+ self._tashkeel = TashkeelDiacritizer()
42
+ return self._tashkeel
43
+
25
44
  @abc.abstractmethod
26
45
  def phonemize_string(self, text: str, lang: str) -> str:
27
46
  raise NotImplementedError
@@ -29,6 +48,13 @@ class BasePhonemizer(metaclass=abc.ABCMeta):
29
48
  def phonemize_to_list(self, text: str, lang: str) -> List[str]:
30
49
  return list(self.phonemize_string(text, lang))
31
50
 
51
+ def add_diacritics(self, text: str, lang: str) -> str:
52
+ if lang.startswith("he"):
53
+ return self.phonikud.diacritize(text)
54
+ elif lang.startswith("ar"):
55
+ return self.tashkeel.diacritize(text, self.taskeen_threshold)
56
+ return text
57
+
32
58
  def phonemize(self, text: str, lang: str) -> PhonemizedChunks:
33
59
  if not text:
34
60
  return [('', '', True)]
phoonnx/phonemizers/he.py CHANGED
@@ -1,30 +1,12 @@
1
- import os.path
2
-
3
- import requests
4
-
5
- from phoonnx.phonemizers.base import BasePhonemizer
6
1
  from phoonnx.config import Alphabet
2
+ from phoonnx.phonemizers.base import BasePhonemizer
7
3
 
8
4
 
9
5
  class PhonikudPhonemizer(BasePhonemizer):
10
- dl_url = "https://huggingface.co/thewh1teagle/phonikud-onnx/resolve/main/phonikud-1.0.int8.onnx"
11
6
 
12
- def __init__(self, model: str = None, diacritics=True):
13
- from phonikud_onnx import Phonikud
7
+ def __init__(self):
14
8
  from phonikud import phonemize
15
9
  self.g2p = phonemize
16
- self.diacritics = diacritics
17
- if model is None:
18
- base_path = os.path.expanduser("~/.local/share/phonikud")
19
- fname = self.dl_url.split("/")[-1]
20
- model = f"{base_path}/{fname}"
21
- if not os.path.isfile(model):
22
- os.makedirs(base_path, exist_ok=True)
23
- # TODO - streaming download
24
- data = requests.get(self.dl_url).content
25
- with open(model, "wb") as f:
26
- f.write(data)
27
- self.phonikud = Phonikud(model) if diacritics else None
28
10
  super().__init__(Alphabet.IPA)
29
11
 
30
12
  @classmethod
@@ -48,20 +30,19 @@ class PhonikudPhonemizer(BasePhonemizer):
48
30
  """
49
31
  """
50
32
  lang = self.get_lang(lang)
51
- if self.diacritics:
52
- text = self.phonikud.add_diacritics(text)
53
33
  return self.g2p(text)
54
34
 
55
35
 
56
36
  if __name__ == "__main__":
57
- #text = "מתכת יקרה"
37
+ # text = "מתכת יקרה"
58
38
  text = 'שָׁלוֹם עוֹלָם'
59
39
 
60
- pho = PhonikudPhonemizer(diacritics=False)
40
+ pho = PhonikudPhonemizer()
61
41
  lang = "he"
62
42
 
63
43
  print(f"\n--- Getting phonemes for '{text}' ---")
44
+ # text = pho.add_diacritics(text, lang)
64
45
  phonemes = pho.phonemize(text, lang)
65
46
  print(f" Phonemes: {phonemes}")
66
47
  # --- Getting phonemes for 'שָׁלוֹם עוֹלָם' ---
67
- # Phonemes: [('ʃalˈom ʔolˈam', '.', True)]
48
+ # Phonemes: [('ʃalˈom ʔolˈam', '.', True)]