olaph 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- olaph-0.1.0/LICENSE +21 -0
- olaph-0.1.0/PKG-INFO +182 -0
- olaph-0.1.0/README.md +109 -0
- olaph-0.1.0/olaph.egg-info/PKG-INFO +182 -0
- olaph-0.1.0/olaph.egg-info/SOURCES.txt +8 -0
- olaph-0.1.0/olaph.egg-info/dependency_links.txt +1 -0
- olaph-0.1.0/olaph.egg-info/requires.txt +55 -0
- olaph-0.1.0/olaph.egg-info/top_level.txt +1 -0
- olaph-0.1.0/pyproject.toml +98 -0
- olaph-0.1.0/setup.cfg +4 -0
olaph-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Institute of Information Systems at Hof University
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
olaph-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: olaph
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A multilingual phonemizer combining lexica, NLP, and probabilistic scoring for improved phonemization accuracy.
|
|
5
|
+
Author-email: Johannes Wirth <johannes.wirth.3@iisys.de>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/iisys-hof/olaph
|
|
8
|
+
Project-URL: Documentation, https://github.com/iisys-hof/olaph#readme
|
|
9
|
+
Project-URL: Issues, https://github.com/iisys-hof/olaph/issues
|
|
10
|
+
Keywords: phonemizer,text-to-speech,linguistics,NLP,multilingual
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Requires-Python: >=3.12
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: spacy
|
|
19
|
+
Requires-Dist: lingua-language-detector==2.1.1
|
|
20
|
+
Requires-Dist: num2words==0.5.14
|
|
21
|
+
Requires-Dist: requests==2.32.5
|
|
22
|
+
Requires-Dist: annotated-types==0.7.0
|
|
23
|
+
Requires-Dist: blis==1.3.0
|
|
24
|
+
Requires-Dist: catalogue==2.0.10
|
|
25
|
+
Requires-Dist: certifi==2025.10.5
|
|
26
|
+
Requires-Dist: charset-normalizer==3.4.3
|
|
27
|
+
Requires-Dist: click==8.3.0
|
|
28
|
+
Requires-Dist: cloudpathlib==0.22.0
|
|
29
|
+
Requires-Dist: colorama==0.4.6
|
|
30
|
+
Requires-Dist: confection==0.1.5
|
|
31
|
+
Requires-Dist: cymem==2.0.11
|
|
32
|
+
Requires-Dist: de-core-news-sm
|
|
33
|
+
Requires-Dist: docopt==0.6.2
|
|
34
|
+
Requires-Dist: en-core-web-sm
|
|
35
|
+
Requires-Dist: es-core-news-sm
|
|
36
|
+
Requires-Dist: fr-core-news-sm
|
|
37
|
+
Requires-Dist: idna==3.10
|
|
38
|
+
Requires-Dist: jinja2==3.1.6
|
|
39
|
+
Requires-Dist: langcodes==3.5.0
|
|
40
|
+
Requires-Dist: language-data==1.3.0
|
|
41
|
+
Requires-Dist: marisa-trie==1.3.1
|
|
42
|
+
Requires-Dist: markdown-it-py==4.0.0
|
|
43
|
+
Requires-Dist: markupsafe==3.0.3
|
|
44
|
+
Requires-Dist: mdurl==0.1.2
|
|
45
|
+
Requires-Dist: murmurhash==1.0.13
|
|
46
|
+
Requires-Dist: numpy==2.3.3
|
|
47
|
+
Requires-Dist: packaging==25.0
|
|
48
|
+
Requires-Dist: preshed==3.0.10
|
|
49
|
+
Requires-Dist: pydantic==2.11.10
|
|
50
|
+
Requires-Dist: pydantic-core==2.33.2
|
|
51
|
+
Requires-Dist: pygments==2.19.2
|
|
52
|
+
Requires-Dist: rich==14.1.0
|
|
53
|
+
Requires-Dist: setuptools==80.9.0
|
|
54
|
+
Requires-Dist: shellingham==1.5.4
|
|
55
|
+
Requires-Dist: smart-open==7.3.1
|
|
56
|
+
Requires-Dist: spacy-legacy==3.0.12
|
|
57
|
+
Requires-Dist: spacy-loggers==1.0.5
|
|
58
|
+
Requires-Dist: srsly==2.5.1
|
|
59
|
+
Requires-Dist: thinc==8.3.6
|
|
60
|
+
Requires-Dist: tqdm==4.67.1
|
|
61
|
+
Requires-Dist: typer==0.19.2
|
|
62
|
+
Requires-Dist: typing-inspection==0.4.2
|
|
63
|
+
Requires-Dist: typing-extensions==4.15.0
|
|
64
|
+
Requires-Dist: urllib3==2.5.0
|
|
65
|
+
Requires-Dist: wasabi==1.1.3
|
|
66
|
+
Requires-Dist: weasel==0.4.1
|
|
67
|
+
Requires-Dist: wrapt==1.17.3
|
|
68
|
+
Provides-Extra: dev
|
|
69
|
+
Requires-Dist: pytest; extra == "dev"
|
|
70
|
+
Requires-Dist: black; extra == "dev"
|
|
71
|
+
Requires-Dist: flake8; extra == "dev"
|
|
72
|
+
Dynamic: license-file
|
|
73
|
+
|
|
74
|
+
# OLaPh — Optimal Language Phonemizer
|
|
75
|
+
|
|
76
|
+
[](https://pypi.org/project/olaph/)
|
|
77
|
+
[](https://pypi.org/project/olaph/)
|
|
78
|
+
[](https://opensource.org/licenses/MIT)
|
|
79
|
+
|
|
80
|
+
**OLaPh (Optimal Language Phonemizer)** is a multilingual phonemization framework that converts text into phonemes, surpassing the quality of comparable frameworks.
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Overview
|
|
85
|
+
|
|
86
|
+
Traditional phonemizers rely on simple rule-based mappings or lexicon lookups.
|
|
87
|
+
Neural and hybrid approaches improve generalization but still struggle with:
|
|
88
|
+
|
|
89
|
+
- Names and foreign words
|
|
90
|
+
- Abbreviations and acronyms
|
|
91
|
+
- Loanwords and compounds
|
|
92
|
+
- Ambiguous homographs
|
|
93
|
+
|
|
94
|
+
**OLaPh** tackles these challenges by combining:
|
|
95
|
+
|
|
96
|
+
- Extensive **language-specific dictionaries**
|
|
97
|
+
- **Abbreviation, number, and letter normalization**
|
|
98
|
+
- **Compound resolution with probabilistic scoring**
|
|
99
|
+
- **Cross-language handling**
|
|
100
|
+
- **NLP-based preprocessing** via [spaCy](https://spacy.io) and [Lingua](https://github.com/pemistahl/lingua-py)
|
|
101
|
+
|
|
102
|
+
Evaluations in **German** and **English** show improved accuracy and robustness over existing phonemizers, including on challenging multilingual datasets.
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Features
|
|
107
|
+
|
|
108
|
+
- Multilingual phonemization (DE, EN, FR, ES)
|
|
109
|
+
- Abbreviation and letter pronunciation dictionaries
|
|
110
|
+
- Number normalization
|
|
111
|
+
- Cross-language acronym detection
|
|
112
|
+
- Compound splitting with probabilistic scoring
|
|
113
|
+
- Freely available lexica for research and development derived from wiktionary.org.
|
|
114
|
+
|
|
115
|
+
## Large Language Model
|
|
116
|
+
An LLM trained on OLaPh output is also available. It is a GemmaX 2B model trained on ~10M sentences derived from the FineWeb corpus and phonemized with the OLaPh framework.
|
|
117
|
+
|
|
118
|
+
Find it on [Hugging Face](https://huggingface.co/iisys-hof/olaph).
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## Installation
|
|
123
|
+
|
|
124
|
+
### From PyPI
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
pip install olaph
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### From source
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
git clone https://github.com/iisys-hof/olaph.git
|
|
134
|
+
cd olaph
|
|
135
|
+
pip install -e .
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Example Usage
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from olaph import Olaph
|
|
142
|
+
|
|
143
|
+
phonemizer = Olaph()
|
|
144
|
+
|
|
145
|
+
output = phonemizer.phonemize_text("He ordered a Brezel and a beer in a tavern near München.", lang="en")
|
|
146
|
+
|
|
147
|
+
print(output)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## Dependencies
|
|
153
|
+
|
|
154
|
+
- [spaCy](https://spacy.io)
|
|
155
|
+
- [Lingua](https://github.com/pemistahl/lingua-py)
|
|
156
|
+
- [num2words](https://github.com/savoirfairelinux/num2words)
|
|
157
|
+
- [requests](https://requests.readthedocs.io)
|
|
158
|
+
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
## Research Summary
|
|
162
|
+
|
|
163
|
+
Phonemization, the conversion of text into phonemes, is a key step in text-to-speech. Traditional approaches use rule-based transformations and lexicon lookups, while more advanced methods apply preprocessing techniques or neural networks for improved accuracy on out-of-domain vocabulary. However, all systems struggle with names, loanwords, abbreviations, and homographs. This work presents OLaPh (Optimal Language Phonemizer), a framework that combines large lexica, multiple NLP techniques, and compound resolution with a probabilistic scoring function. Evaluations in German and English show improved accuracy over previous approaches, including on a challenging dataset. To further address unresolved cases, we train a large language model on OLaPh-generated data, which achieves even stronger generalization and performance. Together, the framework and LLM improve phonemization consistency and provide a freely available resource for future research.
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
## Citation
|
|
168
|
+
|
|
169
|
+
If you use OLaPh in academic work, please cite:
|
|
170
|
+
|
|
171
|
+
### Citation
|
|
172
|
+
```bibtex
|
|
173
|
+
@misc{wirth2025olaphoptimallanguagephonemizer,
|
|
174
|
+
title={OLaPh: Optimal Language Phonemizer},
|
|
175
|
+
author={Johannes Wirth},
|
|
176
|
+
year={2025},
|
|
177
|
+
eprint={2509.20086},
|
|
178
|
+
archivePrefix={arXiv},
|
|
179
|
+
primaryClass={cs.CL},
|
|
180
|
+
url={https://arxiv.org/abs/2509.20086},
|
|
181
|
+
}
|
|
182
|
+
```
|
olaph-0.1.0/README.md
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# OLaPh — Optimal Language Phonemizer
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/olaph/)
|
|
4
|
+
[](https://pypi.org/project/olaph/)
|
|
5
|
+
[](https://opensource.org/licenses/MIT)
|
|
6
|
+
|
|
7
|
+
**OLaPh (Optimal Language Phonemizer)** is a multilingual phonemization framework that converts text into phonemes, surpassing the quality of comparable frameworks.
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Overview
|
|
12
|
+
|
|
13
|
+
Traditional phonemizers rely on simple rule-based mappings or lexicon lookups.
|
|
14
|
+
Neural and hybrid approaches improve generalization but still struggle with:
|
|
15
|
+
|
|
16
|
+
- Names and foreign words
|
|
17
|
+
- Abbreviations and acronyms
|
|
18
|
+
- Loanwords and compounds
|
|
19
|
+
- Ambiguous homographs
|
|
20
|
+
|
|
21
|
+
**OLaPh** tackles these challenges by combining:
|
|
22
|
+
|
|
23
|
+
- Extensive **language-specific dictionaries**
|
|
24
|
+
- **Abbreviation, number, and letter normalization**
|
|
25
|
+
- **Compound resolution with probabilistic scoring**
|
|
26
|
+
- **Cross-language handling**
|
|
27
|
+
- **NLP-based preprocessing** via [spaCy](https://spacy.io) and [Lingua](https://github.com/pemistahl/lingua-py)
|
|
28
|
+
|
|
29
|
+
Evaluations in **German** and **English** show improved accuracy and robustness over existing phonemizers, including on challenging multilingual datasets.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Features
|
|
34
|
+
|
|
35
|
+
- Multilingual phonemization (DE, EN, FR, ES)
|
|
36
|
+
- Abbreviation and letter pronunciation dictionaries
|
|
37
|
+
- Number normalization
|
|
38
|
+
- Cross-language acronym detection
|
|
39
|
+
- Compound splitting with probabilistic scoring
|
|
40
|
+
- Freely available lexica for research and development derived from wiktionary.org.
|
|
41
|
+
|
|
42
|
+
## Large Language Model
|
|
43
|
+
An LLM trained on OLaPh output is also available. It is a GemmaX 2B model trained on ~10M sentences derived from the FineWeb corpus and phonemized with the OLaPh framework.
|
|
44
|
+
|
|
45
|
+
Find it on [Hugging Face](https://huggingface.co/iisys-hof/olaph).
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
### From PyPI
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install olaph
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### From source
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
git clone https://github.com/iisys-hof/olaph.git
|
|
61
|
+
cd olaph
|
|
62
|
+
pip install -e .
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Example Usage
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from olaph import Olaph
|
|
69
|
+
|
|
70
|
+
phonemizer = Olaph()
|
|
71
|
+
|
|
72
|
+
output = phonemizer.phonemize_text("He ordered a Brezel and a beer in a tavern near München.", lang="en")
|
|
73
|
+
|
|
74
|
+
print(output)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Dependencies
|
|
80
|
+
|
|
81
|
+
- [spaCy](https://spacy.io)
|
|
82
|
+
- [Lingua](https://github.com/pemistahl/lingua-py)
|
|
83
|
+
- [num2words](https://github.com/savoirfairelinux/num2words)
|
|
84
|
+
- [requests](https://requests.readthedocs.io)
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Research Summary
|
|
89
|
+
|
|
90
|
+
Phonemization, the conversion of text into phonemes, is a key step in text-to-speech. Traditional approaches use rule-based transformations and lexicon lookups, while more advanced methods apply preprocessing techniques or neural networks for improved accuracy on out-of-domain vocabulary. However, all systems struggle with names, loanwords, abbreviations, and homographs. This work presents OLaPh (Optimal Language Phonemizer), a framework that combines large lexica, multiple NLP techniques, and compound resolution with a probabilistic scoring function. Evaluations in German and English show improved accuracy over previous approaches, including on a challenging dataset. To further address unresolved cases, we train a large language model on OLaPh-generated data, which achieves even stronger generalization and performance. Together, the framework and LLM improve phonemization consistency and provide a freely available resource for future research.
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## Citation
|
|
95
|
+
|
|
96
|
+
If you use OLaPh in academic work, please cite:
|
|
97
|
+
|
|
98
|
+
### Citation
|
|
99
|
+
```bibtex
|
|
100
|
+
@misc{wirth2025olaphoptimallanguagephonemizer,
|
|
101
|
+
title={OLaPh: Optimal Language Phonemizer},
|
|
102
|
+
author={Johannes Wirth},
|
|
103
|
+
year={2025},
|
|
104
|
+
eprint={2509.20086},
|
|
105
|
+
archivePrefix={arXiv},
|
|
106
|
+
primaryClass={cs.CL},
|
|
107
|
+
url={https://arxiv.org/abs/2509.20086},
|
|
108
|
+
}
|
|
109
|
+
```
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: olaph
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A multilingual phonemizer combining lexica, NLP, and probabilistic scoring for improved phonemization accuracy.
|
|
5
|
+
Author-email: Johannes Wirth <johannes.wirth.3@iisys.de>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/iisys-hof/olaph
|
|
8
|
+
Project-URL: Documentation, https://github.com/iisys-hof/olaph#readme
|
|
9
|
+
Project-URL: Issues, https://github.com/iisys-hof/olaph/issues
|
|
10
|
+
Keywords: phonemizer,text-to-speech,linguistics,NLP,multilingual
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Requires-Python: >=3.12
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: spacy
|
|
19
|
+
Requires-Dist: lingua-language-detector==2.1.1
|
|
20
|
+
Requires-Dist: num2words==0.5.14
|
|
21
|
+
Requires-Dist: requests==2.32.5
|
|
22
|
+
Requires-Dist: annotated-types==0.7.0
|
|
23
|
+
Requires-Dist: blis==1.3.0
|
|
24
|
+
Requires-Dist: catalogue==2.0.10
|
|
25
|
+
Requires-Dist: certifi==2025.10.5
|
|
26
|
+
Requires-Dist: charset-normalizer==3.4.3
|
|
27
|
+
Requires-Dist: click==8.3.0
|
|
28
|
+
Requires-Dist: cloudpathlib==0.22.0
|
|
29
|
+
Requires-Dist: colorama==0.4.6
|
|
30
|
+
Requires-Dist: confection==0.1.5
|
|
31
|
+
Requires-Dist: cymem==2.0.11
|
|
32
|
+
Requires-Dist: de-core-news-sm
|
|
33
|
+
Requires-Dist: docopt==0.6.2
|
|
34
|
+
Requires-Dist: en-core-web-sm
|
|
35
|
+
Requires-Dist: es-core-news-sm
|
|
36
|
+
Requires-Dist: fr-core-news-sm
|
|
37
|
+
Requires-Dist: idna==3.10
|
|
38
|
+
Requires-Dist: jinja2==3.1.6
|
|
39
|
+
Requires-Dist: langcodes==3.5.0
|
|
40
|
+
Requires-Dist: language-data==1.3.0
|
|
41
|
+
Requires-Dist: marisa-trie==1.3.1
|
|
42
|
+
Requires-Dist: markdown-it-py==4.0.0
|
|
43
|
+
Requires-Dist: markupsafe==3.0.3
|
|
44
|
+
Requires-Dist: mdurl==0.1.2
|
|
45
|
+
Requires-Dist: murmurhash==1.0.13
|
|
46
|
+
Requires-Dist: numpy==2.3.3
|
|
47
|
+
Requires-Dist: packaging==25.0
|
|
48
|
+
Requires-Dist: preshed==3.0.10
|
|
49
|
+
Requires-Dist: pydantic==2.11.10
|
|
50
|
+
Requires-Dist: pydantic-core==2.33.2
|
|
51
|
+
Requires-Dist: pygments==2.19.2
|
|
52
|
+
Requires-Dist: rich==14.1.0
|
|
53
|
+
Requires-Dist: setuptools==80.9.0
|
|
54
|
+
Requires-Dist: shellingham==1.5.4
|
|
55
|
+
Requires-Dist: smart-open==7.3.1
|
|
56
|
+
Requires-Dist: spacy-legacy==3.0.12
|
|
57
|
+
Requires-Dist: spacy-loggers==1.0.5
|
|
58
|
+
Requires-Dist: srsly==2.5.1
|
|
59
|
+
Requires-Dist: thinc==8.3.6
|
|
60
|
+
Requires-Dist: tqdm==4.67.1
|
|
61
|
+
Requires-Dist: typer==0.19.2
|
|
62
|
+
Requires-Dist: typing-inspection==0.4.2
|
|
63
|
+
Requires-Dist: typing-extensions==4.15.0
|
|
64
|
+
Requires-Dist: urllib3==2.5.0
|
|
65
|
+
Requires-Dist: wasabi==1.1.3
|
|
66
|
+
Requires-Dist: weasel==0.4.1
|
|
67
|
+
Requires-Dist: wrapt==1.17.3
|
|
68
|
+
Provides-Extra: dev
|
|
69
|
+
Requires-Dist: pytest; extra == "dev"
|
|
70
|
+
Requires-Dist: black; extra == "dev"
|
|
71
|
+
Requires-Dist: flake8; extra == "dev"
|
|
72
|
+
Dynamic: license-file
|
|
73
|
+
|
|
74
|
+
# OLaPh — Optimal Language Phonemizer
|
|
75
|
+
|
|
76
|
+
[](https://pypi.org/project/olaph/)
|
|
77
|
+
[](https://pypi.org/project/olaph/)
|
|
78
|
+
[](https://opensource.org/licenses/MIT)
|
|
79
|
+
|
|
80
|
+
**OLaPh (Optimal Language Phonemizer)** is a multilingual phonemization framework that converts text into phonemes, surpassing the quality of comparable frameworks.
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Overview
|
|
85
|
+
|
|
86
|
+
Traditional phonemizers rely on simple rule-based mappings or lexicon lookups.
|
|
87
|
+
Neural and hybrid approaches improve generalization but still struggle with:
|
|
88
|
+
|
|
89
|
+
- Names and foreign words
|
|
90
|
+
- Abbreviations and acronyms
|
|
91
|
+
- Loanwords and compounds
|
|
92
|
+
- Ambiguous homographs
|
|
93
|
+
|
|
94
|
+
**OLaPh** tackles these challenges by combining:
|
|
95
|
+
|
|
96
|
+
- Extensive **language-specific dictionaries**
|
|
97
|
+
- **Abbreviation, number, and letter normalization**
|
|
98
|
+
- **Compound resolution with probabilistic scoring**
|
|
99
|
+
- **Cross-language handling**
|
|
100
|
+
- **NLP-based preprocessing** via [spaCy](https://spacy.io) and [Lingua](https://github.com/pemistahl/lingua-py)
|
|
101
|
+
|
|
102
|
+
Evaluations in **German** and **English** show improved accuracy and robustness over existing phonemizers, including on challenging multilingual datasets.
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Features
|
|
107
|
+
|
|
108
|
+
- Multilingual phonemization (DE, EN, FR, ES)
|
|
109
|
+
- Abbreviation and letter pronunciation dictionaries
|
|
110
|
+
- Number normalization
|
|
111
|
+
- Cross-language acronym detection
|
|
112
|
+
- Compound splitting with probabilistic scoring
|
|
113
|
+
- Freely available lexica for research and development derived from wiktionary.org.
|
|
114
|
+
|
|
115
|
+
## Large Language Model
|
|
116
|
+
An LLM trained on OLaPh output is also available. It is a GemmaX 2B model trained on ~10M sentences derived from the FineWeb corpus and phonemized with the OLaPh framework.
|
|
117
|
+
|
|
118
|
+
Find it on [Hugging Face](https://huggingface.co/iisys-hof/olaph).
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## Installation
|
|
123
|
+
|
|
124
|
+
### From PyPI
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
pip install olaph
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### From source
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
git clone https://github.com/iisys-hof/olaph.git
|
|
134
|
+
cd olaph
|
|
135
|
+
pip install -e .
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Example Usage
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from olaph import Olaph
|
|
142
|
+
|
|
143
|
+
phonemizer = Olaph()
|
|
144
|
+
|
|
145
|
+
output = phonemizer.phonemize_text("He ordered a Brezel and a beer in a tavern near München.", lang="en")
|
|
146
|
+
|
|
147
|
+
print(output)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## Dependencies
|
|
153
|
+
|
|
154
|
+
- [spaCy](https://spacy.io)
|
|
155
|
+
- [Lingua](https://github.com/pemistahl/lingua-py)
|
|
156
|
+
- [num2words](https://github.com/savoirfairelinux/num2words)
|
|
157
|
+
- [requests](https://requests.readthedocs.io)
|
|
158
|
+
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
## Research Summary
|
|
162
|
+
|
|
163
|
+
Phonemization, the conversion of text into phonemes, is a key step in text-to-speech. Traditional approaches use rule-based transformations and lexicon lookups, while more advanced methods apply preprocessing techniques or neural networks for improved accuracy on out-of-domain vocabulary. However, all systems struggle with names, loanwords, abbreviations, and homographs. This work presents OLaPh (Optimal Language Phonemizer), a framework that combines large lexica, multiple NLP techniques, and compound resolution with a probabilistic scoring function. Evaluations in German and English show improved accuracy over previous approaches, including on a challenging dataset. To further address unresolved cases, we train a large language model on OLaPh-generated data, which achieves even stronger generalization and performance. Together, the framework and LLM improve phonemization consistency and provide a freely available resource for future research.
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
## Citation
|
|
168
|
+
|
|
169
|
+
If you use OLaPh in academic work, please cite:
|
|
170
|
+
|
|
171
|
+
### Citation
|
|
172
|
+
```bibtex
|
|
173
|
+
@misc{wirth2025olaphoptimallanguagephonemizer,
|
|
174
|
+
title={OLaPh: Optimal Language Phonemizer},
|
|
175
|
+
author={Johannes Wirth},
|
|
176
|
+
year={2025},
|
|
177
|
+
eprint={2509.20086},
|
|
178
|
+
archivePrefix={arXiv},
|
|
179
|
+
primaryClass={cs.CL},
|
|
180
|
+
url={https://arxiv.org/abs/2509.20086},
|
|
181
|
+
}
|
|
182
|
+
```
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
spacy
|
|
2
|
+
lingua-language-detector==2.1.1
|
|
3
|
+
num2words==0.5.14
|
|
4
|
+
requests==2.32.5
|
|
5
|
+
annotated-types==0.7.0
|
|
6
|
+
blis==1.3.0
|
|
7
|
+
catalogue==2.0.10
|
|
8
|
+
certifi==2025.10.5
|
|
9
|
+
charset-normalizer==3.4.3
|
|
10
|
+
click==8.3.0
|
|
11
|
+
cloudpathlib==0.22.0
|
|
12
|
+
colorama==0.4.6
|
|
13
|
+
confection==0.1.5
|
|
14
|
+
cymem==2.0.11
|
|
15
|
+
de-core-news-sm
|
|
16
|
+
docopt==0.6.2
|
|
17
|
+
en-core-web-sm
|
|
18
|
+
es-core-news-sm
|
|
19
|
+
fr-core-news-sm
|
|
20
|
+
idna==3.10
|
|
21
|
+
jinja2==3.1.6
|
|
22
|
+
langcodes==3.5.0
|
|
23
|
+
language-data==1.3.0
|
|
24
|
+
marisa-trie==1.3.1
|
|
25
|
+
markdown-it-py==4.0.0
|
|
26
|
+
markupsafe==3.0.3
|
|
27
|
+
mdurl==0.1.2
|
|
28
|
+
murmurhash==1.0.13
|
|
29
|
+
numpy==2.3.3
|
|
30
|
+
packaging==25.0
|
|
31
|
+
preshed==3.0.10
|
|
32
|
+
pydantic==2.11.10
|
|
33
|
+
pydantic-core==2.33.2
|
|
34
|
+
pygments==2.19.2
|
|
35
|
+
rich==14.1.0
|
|
36
|
+
setuptools==80.9.0
|
|
37
|
+
shellingham==1.5.4
|
|
38
|
+
smart-open==7.3.1
|
|
39
|
+
spacy-legacy==3.0.12
|
|
40
|
+
spacy-loggers==1.0.5
|
|
41
|
+
srsly==2.5.1
|
|
42
|
+
thinc==8.3.6
|
|
43
|
+
tqdm==4.67.1
|
|
44
|
+
typer==0.19.2
|
|
45
|
+
typing-inspection==0.4.2
|
|
46
|
+
typing-extensions==4.15.0
|
|
47
|
+
urllib3==2.5.0
|
|
48
|
+
wasabi==1.1.3
|
|
49
|
+
weasel==0.4.1
|
|
50
|
+
wrapt==1.17.3
|
|
51
|
+
|
|
52
|
+
[dev]
|
|
53
|
+
pytest
|
|
54
|
+
black
|
|
55
|
+
flake8
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "olaph"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A multilingual phonemizer combining lexica, NLP, and probabilistic scoring for improved phonemization accuracy."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "Johannes Wirth", email = "johannes.wirth.3@iisys.de" }
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
keywords = ["phonemizer", "text-to-speech", "linguistics", "NLP", "multilingual"]
|
|
18
|
+
|
|
19
|
+
classifiers = [
|
|
20
|
+
"Intended Audience :: Developers",
|
|
21
|
+
"Topic :: Text Processing :: Linguistic",
|
|
22
|
+
"License :: OSI Approved :: MIT License",
|
|
23
|
+
"Programming Language :: Python :: 3"
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
dependencies = [
|
|
27
|
+
"spacy",
|
|
28
|
+
"lingua-language-detector==2.1.1",
|
|
29
|
+
"num2words==0.5.14",
|
|
30
|
+
"requests==2.32.5",
|
|
31
|
+
"annotated-types==0.7.0",
|
|
32
|
+
"blis==1.3.0",
|
|
33
|
+
"catalogue==2.0.10",
|
|
34
|
+
"certifi==2025.10.5",
|
|
35
|
+
"charset-normalizer==3.4.3",
|
|
36
|
+
"click==8.3.0",
|
|
37
|
+
"cloudpathlib==0.22.0",
|
|
38
|
+
"colorama==0.4.6",
|
|
39
|
+
"confection==0.1.5",
|
|
40
|
+
"cymem==2.0.11",
|
|
41
|
+
"de-core-news-sm",
|
|
42
|
+
"docopt==0.6.2",
|
|
43
|
+
"en-core-web-sm",
|
|
44
|
+
"es-core-news-sm",
|
|
45
|
+
"fr-core-news-sm",
|
|
46
|
+
"idna==3.10",
|
|
47
|
+
"jinja2==3.1.6",
|
|
48
|
+
"langcodes==3.5.0",
|
|
49
|
+
"language-data==1.3.0",
|
|
50
|
+
"marisa-trie==1.3.1",
|
|
51
|
+
"markdown-it-py==4.0.0",
|
|
52
|
+
"markupsafe==3.0.3",
|
|
53
|
+
"mdurl==0.1.2",
|
|
54
|
+
"murmurhash==1.0.13",
|
|
55
|
+
"numpy==2.3.3",
|
|
56
|
+
"packaging==25.0",
|
|
57
|
+
"preshed==3.0.10",
|
|
58
|
+
"pydantic==2.11.10",
|
|
59
|
+
"pydantic-core==2.33.2",
|
|
60
|
+
"pygments==2.19.2",
|
|
61
|
+
"rich==14.1.0",
|
|
62
|
+
"setuptools==80.9.0",
|
|
63
|
+
"shellingham==1.5.4",
|
|
64
|
+
"smart-open==7.3.1",
|
|
65
|
+
"spacy-legacy==3.0.12",
|
|
66
|
+
"spacy-loggers==1.0.5",
|
|
67
|
+
"srsly==2.5.1",
|
|
68
|
+
"thinc==8.3.6",
|
|
69
|
+
"tqdm==4.67.1",
|
|
70
|
+
"typer==0.19.2",
|
|
71
|
+
"typing-inspection==0.4.2",
|
|
72
|
+
"typing-extensions==4.15.0",
|
|
73
|
+
"urllib3==2.5.0",
|
|
74
|
+
"wasabi==1.1.3",
|
|
75
|
+
"weasel==0.4.1",
|
|
76
|
+
"wrapt==1.17.3",
|
|
77
|
+
]
|
|
78
|
+
|
|
79
|
+
[project.urls]
|
|
80
|
+
Homepage = "https://github.com/iisys-hof/olaph"
|
|
81
|
+
Documentation = "https://github.com/iisys-hof/olaph#readme"
|
|
82
|
+
Issues = "https://github.com/iisys-hof/olaph/issues"
|
|
83
|
+
|
|
84
|
+
[tool.setuptools.packages.find]
|
|
85
|
+
where = ["."]
|
|
86
|
+
include = ["olaph*"]
|
|
87
|
+
|
|
88
|
+
[tool.setuptools.package-data]
|
|
89
|
+
"olaph" = ["dictionaries/**/*.txt"]
|
|
90
|
+
|
|
91
|
+
[tool.uv.sources]
|
|
92
|
+
de-core-news-sm = { url = "https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl" }
|
|
93
|
+
en-core-web-sm = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" }
|
|
94
|
+
es-core-news-sm = { url = "https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl" }
|
|
95
|
+
fr-core-news-sm = { url = "https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl" }
|
|
96
|
+
|
|
97
|
+
[project.optional-dependencies]
|
|
98
|
+
dev = ["pytest", "black", "flake8"]
|
olaph-0.1.0/setup.cfg
ADDED