indic-tts-preprocess 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indic_tts_preprocess-0.1.1/LICENSE +21 -0
- indic_tts_preprocess-0.1.1/PKG-INFO +162 -0
- indic_tts_preprocess-0.1.1/README.md +116 -0
- indic_tts_preprocess-0.1.1/indic_tts_preprocess/__init__.py +25 -0
- indic_tts_preprocess-0.1.1/indic_tts_preprocess/core.py +56 -0
- indic_tts_preprocess-0.1.1/indic_tts_preprocess/languages/__init__.py +0 -0
- indic_tts_preprocess-0.1.1/indic_tts_preprocess/languages/english.py +124 -0
- indic_tts_preprocess-0.1.1/indic_tts_preprocess/languages/hindi.py +121 -0
- indic_tts_preprocess-0.1.1/indic_tts_preprocess/languages/marathi.py +112 -0
- indic_tts_preprocess-0.1.1/indic_tts_preprocess.egg-info/PKG-INFO +162 -0
- indic_tts_preprocess-0.1.1/indic_tts_preprocess.egg-info/SOURCES.txt +17 -0
- indic_tts_preprocess-0.1.1/indic_tts_preprocess.egg-info/dependency_links.txt +1 -0
- indic_tts_preprocess-0.1.1/indic_tts_preprocess.egg-info/top_level.txt +1 -0
- indic_tts_preprocess-0.1.1/pyproject.toml +35 -0
- indic_tts_preprocess-0.1.1/setup.cfg +4 -0
- indic_tts_preprocess-0.1.1/tests/test_core.py +44 -0
- indic_tts_preprocess-0.1.1/tests/test_english.py +60 -0
- indic_tts_preprocess-0.1.1/tests/test_hindi.py +72 -0
- indic_tts_preprocess-0.1.1/tests/test_marathi.py +50 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Dhruv
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: indic-tts-preprocess
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Convert numbers and dates in Indic text to spoken words for TTS models
|
|
5
|
+
Author-email: Dhruv Dornal <dhruvdornal2003@gmail.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2025 Dhruv
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Keywords: tts,indic,hindi,marathi,nlp,text-to-speech,preprocessing
|
|
29
|
+
Classifier: Development Status :: 3 - Alpha
|
|
30
|
+
Classifier: Intended Audience :: Developers
|
|
31
|
+
Classifier: Intended Audience :: Science/Research
|
|
32
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
33
|
+
Classifier: Programming Language :: Python :: 3
|
|
34
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
39
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
40
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
41
|
+
Classifier: Natural Language :: Hindi
|
|
42
|
+
Requires-Python: >=3.8
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
44
|
+
License-File: LICENSE
|
|
45
|
+
Dynamic: license-file
|
|
46
|
+
|
|
47
|
+
# indic-tts-preprocess
|
|
48
|
+
|
|
49
|
+
A small Python library that fixes one specific but very annoying problem with open-source Indic TTS models: **they cannot read numbers**.
|
|
50
|
+
|
|
51
|
+
Models like [ai4bharat/indic-parler-tts](https://huggingface.co/ai4bharat/indic-parler-tts) hallucinate badly when the input text contains raw digits. Feed them `"1997 में"` and you get garbled audio. Feed them `"उन्नीस सौ सत्तानवे में"` and it works perfectly.
|
|
52
|
+
|
|
53
|
+
This library does that conversion for you — numbers, years, dates — before the text ever reaches the tokenizer.
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Install
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install indic-tts-preprocess
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
No dependencies. Pure Python. Works on Python 3.8 and above.
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## Quick start
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from indic_tts_preprocess import preprocess
|
|
71
|
+
|
|
72
|
+
# Hindi
|
|
73
|
+
preprocess("उनका जन्म 5 अगस्त 1997 को हुआ", "hi")
|
|
74
|
+
# -> "उनका जन्म पाँच अगस्त उन्नीस सौ सत्तानवे को हुआ"
|
|
75
|
+
|
|
76
|
+
# Marathi
|
|
77
|
+
preprocess("त्यांचा जन्म 15 ऑगस्ट 1947 रोजी झाला", "mr")
|
|
78
|
+
# -> "त्यांचा जन्म पंधरा ऑगस्ट एकोणीस शे सत्तेचाळीस रोजी झाला"
|
|
79
|
+
|
|
80
|
+
# English
|
|
81
|
+
preprocess("He was born on 15 August 1947", "en")
|
|
82
|
+
# -> "He was born on fifteen August nineteen forty seven"
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## Supported languages
|
|
88
|
+
|
|
89
|
+
| Code | Language |
|
|
90
|
+
|------|----------|
|
|
91
|
+
| `hi` | Hindi |
|
|
92
|
+
| `mr` | Marathi |
|
|
93
|
+
| `en` | English |
|
|
94
|
+
|
|
95
|
+
If you pass an unsupported language code, the text comes back unchanged — nothing crashes.
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## What it handles
|
|
100
|
+
|
|
101
|
+
**Date formats**
|
|
102
|
+
|
|
103
|
+
| Input | Language | Output |
|
|
104
|
+
|-------|----------|--------|
|
|
105
|
+
| `5 अगस्त 2004` | hi | `पाँच अगस्त दो हज़ार चार` |
|
|
106
|
+
| `05/08/2004` | hi | `पाँच अगस्त दो हज़ार चार` |
|
|
107
|
+
| `05-08-2004` | hi | `पाँच अगस्त दो हज़ार चार` |
|
|
108
|
+
| `15 August 1947` | en | `fifteen August nineteen forty seven` |
|
|
109
|
+
| `15/08/1947` | en | `fifteen August nineteen forty seven` |
|
|
110
|
+
|
|
111
|
+
**Standalone numbers**
|
|
112
|
+
|
|
113
|
+
| Input | Language | Output |
|
|
114
|
+
|-------|----------|--------|
|
|
115
|
+
| `73` | hi | `तिहत्तर` |
|
|
116
|
+
| `1997` | hi | `उन्नीस सौ सत्तानवे` |
|
|
117
|
+
| `2024` | en | `two thousand twenty four` |
|
|
118
|
+
| `1905` | en | `nineteen oh five` |
|
|
119
|
+
|
|
120
|
+
**Year handling** (the tricky part)
|
|
121
|
+
|
|
122
|
+
Hindi and Marathi speakers say years in the 1900s differently from how you'd read them literally:
|
|
123
|
+
- `1997` → `उन्नीस सौ सत्तानवे` (not `एक हज़ार नौ सौ सत्तानवे`)
|
|
124
|
+
|
|
125
|
+
English speakers do the same:
|
|
126
|
+
- `1997` → `nineteen ninety seven` (not `one thousand nine hundred ninety seven`)
|
|
127
|
+
- `1905` → `nineteen oh five`
|
|
128
|
+
|
|
129
|
+
The library handles all of these correctly.
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## API reference
|
|
134
|
+
|
|
135
|
+
### `preprocess(text, lang)`
|
|
136
|
+
|
|
137
|
+
| Parameter | Type | Description |
|
|
138
|
+
|-----------|------|-------------|
|
|
139
|
+
| `text` | `str` | The raw input text containing digits/dates |
|
|
140
|
+
| `lang` | `str` | Language code: `"hi"`, `"mr"`, or `"en"` |
|
|
141
|
+
|
|
142
|
+
Returns `str` — the same text with all digits replaced by spoken words.
|
|
143
|
+
|
|
144
|
+
Raises `TypeError` if `text` is not a string.
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
## Contributing
|
|
149
|
+
|
|
150
|
+
Adding a new language is straightforward:
|
|
151
|
+
|
|
152
|
+
1. Create `indic_tts_preprocess/languages/yourlang.py` — look at `hindi.py` as a template
|
|
153
|
+
2. Add a `preprocess(text)` function and a `num_to_words(n)` function
|
|
154
|
+
3. Add the new language code in `core.py`
|
|
155
|
+
4. Add tests in `tests/test_yourlang.py`
|
|
156
|
+
5. Open a pull request
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## License
|
|
161
|
+
|
|
162
|
+
MIT
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# indic-tts-preprocess
|
|
2
|
+
|
|
3
|
+
A small Python library that fixes one specific but very annoying problem with open-source Indic TTS models: **they cannot read numbers**.
|
|
4
|
+
|
|
5
|
+
Models like [ai4bharat/indic-parler-tts](https://huggingface.co/ai4bharat/indic-parler-tts) hallucinate badly when the input text contains raw digits. Feed them `"1997 में"` and you get garbled audio. Feed them `"उन्नीस सौ सत्तानवे में"` and it works perfectly.
|
|
6
|
+
|
|
7
|
+
This library does that conversion for you — numbers, years, dates — before the text ever reaches the tokenizer.
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Install
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install indic-tts-preprocess
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
No dependencies. Pure Python. Works on Python 3.8 and above.
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Quick start
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
from indic_tts_preprocess import preprocess
|
|
25
|
+
|
|
26
|
+
# Hindi
|
|
27
|
+
preprocess("उनका जन्म 5 अगस्त 1997 को हुआ", "hi")
|
|
28
|
+
# -> "उनका जन्म पाँच अगस्त उन्नीस सौ सत्तानवे को हुआ"
|
|
29
|
+
|
|
30
|
+
# Marathi
|
|
31
|
+
preprocess("त्यांचा जन्म 15 ऑगस्ट 1947 रोजी झाला", "mr")
|
|
32
|
+
# -> "त्यांचा जन्म पंधरा ऑगस्ट एकोणीस शे सत्तेचाळीस रोजी झाला"
|
|
33
|
+
|
|
34
|
+
# English
|
|
35
|
+
preprocess("He was born on 15 August 1947", "en")
|
|
36
|
+
# -> "He was born on fifteen August nineteen forty seven"
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Supported languages
|
|
42
|
+
|
|
43
|
+
| Code | Language |
|
|
44
|
+
|------|----------|
|
|
45
|
+
| `hi` | Hindi |
|
|
46
|
+
| `mr` | Marathi |
|
|
47
|
+
| `en` | English |
|
|
48
|
+
|
|
49
|
+
If you pass an unsupported language code, the text comes back unchanged — nothing crashes.
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## What it handles
|
|
54
|
+
|
|
55
|
+
**Date formats**
|
|
56
|
+
|
|
57
|
+
| Input | Language | Output |
|
|
58
|
+
|-------|----------|--------|
|
|
59
|
+
| `5 अगस्त 2004` | hi | `पाँच अगस्त दो हज़ार चार` |
|
|
60
|
+
| `05/08/2004` | hi | `पाँच अगस्त दो हज़ार चार` |
|
|
61
|
+
| `05-08-2004` | hi | `पाँच अगस्त दो हज़ार चार` |
|
|
62
|
+
| `15 August 1947` | en | `fifteen August nineteen forty seven` |
|
|
63
|
+
| `15/08/1947` | en | `fifteen August nineteen forty seven` |
|
|
64
|
+
|
|
65
|
+
**Standalone numbers**
|
|
66
|
+
|
|
67
|
+
| Input | Language | Output |
|
|
68
|
+
|-------|----------|--------|
|
|
69
|
+
| `73` | hi | `तिहत्तर` |
|
|
70
|
+
| `1997` | hi | `उन्नीस सौ सत्तानवे` |
|
|
71
|
+
| `2024` | en | `two thousand twenty four` |
|
|
72
|
+
| `1905` | en | `nineteen oh five` |
|
|
73
|
+
|
|
74
|
+
**Year handling** (the tricky part)
|
|
75
|
+
|
|
76
|
+
Hindi and Marathi speakers read years from 1700 to 1999 century-first ("nineteen hundred ninety-seven") instead of spelling out the thousands:
|
|
77
|
+
- `1997` → `उन्नीस सौ सत्तानवे` (not `एक हज़ार नौ सौ सत्तानवे`)
|
|
78
|
+
|
|
79
|
+
English speakers do the same:
|
|
80
|
+
- `1997` → `nineteen ninety seven` (not `one thousand nine hundred ninety seven`)
|
|
81
|
+
- `1905` → `nineteen oh five`
|
|
82
|
+
|
|
83
|
+
The library handles all of these correctly.
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## API reference
|
|
88
|
+
|
|
89
|
+
### `preprocess(text, lang)`
|
|
90
|
+
|
|
91
|
+
| Parameter | Type | Description |
|
|
92
|
+
|-----------|------|-------------|
|
|
93
|
+
| `text` | `str` | The raw input text containing digits/dates |
|
|
94
|
+
| `lang` | `str` | Language code: `"hi"`, `"mr"`, or `"en"` |
|
|
95
|
+
|
|
96
|
+
Returns `str` — the same text with all digits replaced by spoken words.
|
|
97
|
+
|
|
98
|
+
Raises `TypeError` if `text` is not a string.
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Contributing
|
|
103
|
+
|
|
104
|
+
Adding a new language is straightforward:
|
|
105
|
+
|
|
106
|
+
1. Create `indic_tts_preprocess/languages/yourlang.py` — look at `hindi.py` as a template
|
|
107
|
+
2. Add a `preprocess(text)` function and a `num_to_words(n)` function
|
|
108
|
+
3. Add the new language code in `core.py`
|
|
109
|
+
4. Add tests in `tests/test_yourlang.py`
|
|
110
|
+
5. Open a pull request
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## License
|
|
115
|
+
|
|
116
|
+
MIT
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""
|
|
2
|
+
indic-tts-preprocess
|
|
3
|
+
--------------------
|
|
4
|
+
Converts numbers and dates in Indic text into spoken words,
|
|
5
|
+
so that open-source TTS models like ai4bharat/indic-parler-tts
|
|
6
|
+
stop hallucinating when they see raw digits.
|
|
7
|
+
|
|
8
|
+
Basic usage:
|
|
9
|
+
|
|
10
|
+
from indic_tts_preprocess import preprocess
|
|
11
|
+
|
|
12
|
+
clean = preprocess("उनका जन्म 5 अगस्त 1997 को हुआ", "hi")
|
|
13
|
+
# -> "उनका जन्म पाँच अगस्त उन्नीस सौ सत्तानवे को हुआ"
|
|
14
|
+
|
|
15
|
+
Supported languages:
|
|
16
|
+
"hi" - Hindi
|
|
17
|
+
"mr" - Marathi
|
|
18
|
+
"en" - English
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from indic_tts_preprocess.core import preprocess, SUPPORTED_LANGUAGES
|
|
22
|
+
|
|
23
|
+
__version__ = "0.1.1"
|
|
24
|
+
__author__ = "Dhruv Dornal"
|
|
25
|
+
__all__ = ["preprocess", "SUPPORTED_LANGUAGES"]
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""
|
|
2
|
+
core.py
|
|
3
|
+
|
|
4
|
+
The main entry point. This is the only function most people will ever need to call.
|
|
5
|
+
It figures out which language to use and sends the text to the right preprocessor.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from indic_tts_preprocess.languages import hindi, marathi, english
|
|
9
|
+
|
|
10
|
+
# All the language codes we support right now.
# If someone passes something not in this set, we just return their text unchanged.
SUPPORTED_LANGUAGES = {"hi", "mr", "en"}


def preprocess(text: str, lang: str) -> str:
    """
    Clean up text so Indic TTS models can read numbers and dates correctly.

    The models hallucinate badly when they see raw digits like "1997" or "73".
    This function converts them into the spoken form before the text hits the model.

    Parameters
    ----------
    text : str
        The raw input text with digits/dates in it.
    lang : str
        Language code. Use "hi" for Hindi, "mr" for Marathi, "en" for English.
        Surrounding whitespace and letter case are ignored.

    Returns
    -------
    str
        The same text but with all numbers replaced by spoken words.
        If ``lang`` is not a supported code (including a non-string value
        such as ``None``), ``text`` is returned unchanged.

    Raises
    ------
    TypeError
        If ``text`` is not a string.

    Examples
    --------
    >>> from indic_tts_preprocess import preprocess
    >>> preprocess("उनका जन्म 5 अगस्त 1997 को हुआ", "hi")
    'उनका जन्म पाँच अगस्त उन्नीस सौ सत्तानवे को हुआ'

    >>> preprocess("He was born on 15 August 1947", "en")
    'He was born on fifteen August nineteen forty seven'
    """
    if not isinstance(text, str):
        raise TypeError(f"text must be a string, got {type(text).__name__}")

    # A non-string code cannot name a supported language. Treat it like any
    # other unknown code instead of crashing on .strip().
    if not isinstance(lang, str):
        return text

    lang = lang.strip().lower()

    if lang == "hi":
        return hindi.preprocess(text)
    if lang == "mr":
        return marathi.preprocess(text)
    if lang == "en":
        return english.preprocess(text)

    # Unknown language - just return the text as-is so nothing breaks downstream
    return text
|
|
File without changes
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""
|
|
2
|
+
english.py
|
|
3
|
+
|
|
4
|
+
English number/date cleanup for TTS.
|
|
5
|
+
Handles years the way English speakers actually say them out loud.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
|
|
10
|
+
# Words for 0-19, indexed by value. Index 0 is blank so that round tens
# ("twenty") and bare hundreds never pick up a trailing word.
ONES = [
    "", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
    "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
]

# Tens, indexed by the tens digit. Slots 0 and 1 are unused (ONES covers 0-19).
TENS = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

# Month number -> English month name
MONTHS = {
    1: "January", 2: "February", 3: "March", 4: "April",
    5: "May", 6: "June", 7: "July", 8: "August",
    9: "September", 10: "October", 11: "November", 12: "December",
}

# Regex alternation of all month names (used to match written dates like "5 August 2004")
MONTHS_RE = "|".join(MONTHS.values())

# Scale words, largest first, used for plain (non-year) cardinals.
_SCALES = (
    (10 ** 12, "trillion"),
    (10 ** 9, "billion"),
    (10 ** 6, "million"),
    (10 ** 3, "thousand"),
)

# The (?<!\d) / (?!\d) guards stop a date from matching in the middle of a
# longer digit run (e.g. the "15 August 1947" inside "115 August 1947").
_WRITTEN_DATE_RE = re.compile(
    rf"(?<!\d)(\d{{1,2}})\s+({MONTHS_RE})\s+(\d{{4}})(?!\d)",
    re.IGNORECASE,
)
_NUMERIC_DATE_RE = re.compile(r"(?<!\d)(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})(?!\d)")
_NUMBER_RE = re.compile(r"\d+")


def _under_100(n: int) -> str:
    """Convert a number in the range 0-99 to English words (0 gives "")."""
    if n < 20:
        return ONES[n]
    tens, ones = divmod(n, 10)
    if ones == 0:
        return TENS[tens]
    return TENS[tens] + " " + ONES[ones]


def _cardinal(n: int) -> str:
    """Spell a positive integer as a plain cardinal, with no year-style reading."""
    parts = []
    for value, name in _SCALES:
        if n >= value:
            # The multiplier is itself a plain cardinal, never a "year".
            parts.append(_cardinal(n // value) + " " + name)
            n %= value
    if n >= 100:
        parts.append(ONES[n // 100] + " hundred")
        n %= 100
    if n > 0:
        parts.append(_under_100(n))
    return " ".join(parts)


def num_to_words(n: int) -> str:
    """
    Turn any whole number into spoken English.

    Years have their own rules because people say them differently:
    - 1700-1999: "seventeen hundred", "nineteen oh five", "nineteen ninety two"
    - 2000-2099: "two thousand", "two thousand twenty four"
    Everything else uses the standard hundred/thousand/million structure.
    Zero is "zero" and negative numbers are prefixed with "minus".
    """
    if n == 0:
        return "zero"
    if n < 0:
        return "minus " + num_to_words(-n)

    # 1700s-1900s: "nineteen ninety two", not "one thousand nine hundred ninety two"
    if 1700 <= n <= 1999:
        century, rest = divmod(n, 100)
        prefix = ONES[century]  # "seventeen" / "eighteen" / "nineteen"
        if rest == 0:
            return prefix + " hundred"
        if rest < 10:
            return prefix + " oh " + ONES[rest]  # "nineteen oh five"
        return prefix + " " + _under_100(rest)

    # 2000s: "two thousand", "two thousand twenty four"
    if 2000 <= n <= 2099:
        rest = n - 2000
        if rest == 0:
            return "two thousand"
        return "two thousand " + _under_100(rest)

    return _cardinal(n)


def preprocess(text: str) -> str:
    """
    Takes raw English text and replaces all numbers/dates with spoken English words.

    Three passes, in order:
    1. Written dates such as "5 August 2004" (month name matched case-insensitively
       and kept exactly as written).
    2. Numeric day/month/year dates such as "05/08/2004" or "05-08-2004";
       anything that is not a plausible date is left for pass 3.
    3. Every remaining run of digits, via num_to_words.
    """
    def _written_date(m):
        day, month_name, year = int(m.group(1)), m.group(2), int(m.group(3))
        # num_to_words (rather than _under_100) so that day "0" is spoken as
        # "zero" instead of vanishing and leaving a stray leading space.
        return f"{num_to_words(day)} {month_name} {num_to_words(year)}"

    text = _WRITTEN_DATE_RE.sub(_written_date, text)

    def _numeric_date(m):
        day, month, year = int(m.group(1)), int(m.group(2)), int(m.group(3))
        if 1 <= month <= 12 and 1 <= day <= 31:
            return f"{_under_100(day)} {MONTHS[month]} {num_to_words(year)}"
        return m.group(0)  # not a valid date, leave it for the number pass

    text = _NUMERIC_DATE_RE.sub(_numeric_date, text)

    # Anything else that's still a number
    return _NUMBER_RE.sub(lambda m: num_to_words(int(m.group(0))), text)
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""
|
|
2
|
+
hindi.py
|
|
3
|
+
|
|
4
|
+
All the Hindi-specific stuff lives here:
|
|
5
|
+
- Word lists for numbers 0-99
|
|
6
|
+
- Month name mappings
|
|
7
|
+
- The actual function that cleans up Hindi text for TTS
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
# Every number from 0 to 99 written out in Hindi.
# Index position = the number itself. So ONES[5] = "पाँच".
# Index 0 is blank so that round hundreds/thousands never pick up a trailing word;
# a standalone zero is handled separately in num_to_words.
ONES = [
    "", "एक", "दो", "तीन", "चार", "पाँच", "छह", "सात", "आठ", "नौ", "दस",
    "ग्यारह", "बारह", "तेरह", "चौदह", "पंद्रह", "सोलह", "सत्रह", "अठारह", "उन्नीस", "बीस",
    "इक्कीस", "बाईस", "तेईस", "चौबीस", "पच्चीस", "छब्बीस", "सत्ताईस", "अट्ठाईस", "उनतीस", "तीस",
    "इकतीस", "बत्तीस", "तैंतीस", "चौंतीस", "पैंतीस", "छत्तीस", "सैंतीस", "अड़तीस", "उनतालीस", "चालीस",
    "इकतालीस", "बयालीस", "तैंतालीस", "चौंतालीस", "पैंतालीस", "छियालीस", "सैंतालीस", "अड़तालीस", "उनचास", "पचास",
    "इक्यावन", "बावन", "तिरपन", "चौवन", "पचपन", "छप्पन", "सत्तावन", "अट्ठावन", "उनसठ", "साठ",
    "इकसठ", "बासठ", "तिरसठ", "चौंसठ", "पैंसठ", "छियासठ", "सड़सठ", "अड़सठ", "उनहत्तर", "सत्तर",
    "इकहत्तर", "बहत्तर", "तिहत्तर", "चौहत्तर", "पचहत्तर", "छिहत्तर", "सतहत्तर", "अठहत्तर", "उनासी", "अस्सी",
    "इक्यासी", "बयासी", "तिरासी", "चौरासी", "पचासी", "छियासी", "सत्तासी", "अट्ठासी", "नवासी", "नब्बे",
    "इक्यानबे", "बानबे", "तिरानबे", "चौरानबे", "पचानबे", "छियानबे", "सत्तानवे", "अट्ठानवे", "निन्यानवे",
]

# Month number -> Hindi month name
MONTHS = {
    1: "जनवरी", 2: "फरवरी", 3: "मार्च", 4: "अप्रैल",
    5: "मई", 6: "जून", 7: "जुलाई", 8: "अगस्त",
    9: "सितंबर", 10: "अक्टूबर", 11: "नवंबर", 12: "दिसंबर",
}

# Flip it so we can go from month name -> number too
MONTH_TO_NUM = {v: k for k, v in MONTHS.items()}

# Indian numbering scale words, largest first: crore (10^7), lakh (10^5), thousand.
_SCALES = (
    (10 ** 7, "करोड़"),
    (10 ** 5, "लाख"),
    (10 ** 3, "हज़ार"),
)

# Alternation matching any Hindi month name
_MONTH_PAT = "|".join(MONTH_TO_NUM)

# The (?<!\d) / (?!\d) guards stop a date from matching in the middle of a
# longer digit run.
_NAMED_DATE_RE = re.compile(
    rf"(?<!\d)(\d{{1,2}})\s+({_MONTH_PAT})[,\s]+(\d{{4}})(?!\d)"
)
_NUMERIC_DATE_RE = re.compile(r"(?<!\d)(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})(?!\d)")
_NUMBER_RE = re.compile(r"\d+")


def _cardinal(n: int) -> str:
    """Spell a positive integer as a plain Hindi cardinal, with no year-style reading."""
    parts = []
    for value, name in _SCALES:
        if n >= value:
            # The multiplier is itself a plain cardinal, never a "year".
            parts.append(_cardinal(n // value) + " " + name)
            n %= value
    if n >= 100:
        parts.append(ONES[n // 100] + " सौ")
        n %= 100
    if n > 0:
        parts.append(ONES[n])
    return " ".join(parts)


def num_to_words(n: int) -> str:
    """
    Turn any whole number into how a Hindi speaker would say it out loud.

    Years from 1700 to 1999 get special treatment: Hindi speakers say
    "उन्नीस सौ सत्तानवे" (nineteen hundred ninety-seven), not
    "एक हज़ार नौ सौ सत्तानवे". This matters a lot for birth years,
    historical dates, etc.

    Everything else is spelled with the Indian scale words
    (हज़ार, लाख, करोड़). Zero is "शून्य" and negative numbers are
    prefixed with "माइनस".
    """
    if n == 0:
        return "शून्य"
    if n < 0:
        return "माइनस " + num_to_words(-n)

    # Years in the 1700s-1900s - say it the natural way
    if 1700 <= n <= 1999:
        century, remainder = divmod(n, 100)
        words = ONES[century] + " सौ"  # "सत्रह सौ" / "अठारह सौ" / "उन्नीस सौ"
        if remainder == 0:
            return words
        return words + " " + ONES[remainder]

    return _cardinal(n)


def preprocess(text: str) -> str:
    """
    Takes raw Hindi text and replaces all numbers/dates with spoken Hindi words.

    Three patterns it handles, in order:
    1. "5 अगस्त 2004" or "5 अगस्त, 2004" -> day in words + month name + year in words
       (the separator between month and year is normalised to one space)
    2. "05/08/2004" or "05-08-2004" -> same thing but from numeric day/month/year
    3. Any leftover numbers like "73" -> "तिहत्तर"
    """
    def _named_date(m):
        day = int(m.group(1))
        year = int(m.group(3))
        # The matched text is already the canonical month name (it came from
        # MONTHS), so it is reused directly. num_to_words is used for the day
        # so that "0" is spoken as "शून्य" instead of vanishing.
        return f"{num_to_words(day)} {m.group(2)} {num_to_words(year)}"

    text = _NAMED_DATE_RE.sub(_named_date, text)

    def _numeric_date(m):
        day, month, year = int(m.group(1)), int(m.group(2)), int(m.group(3))
        if 1 <= month <= 12 and 1 <= day <= 31:
            return f"{ONES[day]} {MONTHS[month]} {num_to_words(year)}"
        return m.group(0)  # not a valid date, leave it for the number pass

    text = _NUMERIC_DATE_RE.sub(_numeric_date, text)

    # Anything else that's still a number
    return _NUMBER_RE.sub(lambda m: num_to_words(int(m.group())), text)
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""
|
|
2
|
+
marathi.py
|
|
3
|
+
|
|
4
|
+
All the Marathi-specific stuff lives here.
|
|
5
|
+
Same structure as hindi.py but with Marathi words throughout.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
|
|
10
|
+
# Every number from 0 to 99 written out in Marathi.
# Same idea as Hindi - index = the number. ONES[3] = "तीन".
# Index 0 is blank; a standalone zero is handled separately in num_to_words.
# NOTE(review): 89 is often written "एकोणनव्वद" in standard Marathi - confirm
# whether the "नव्याऐंशी" form used here is intended.
ONES = [
    "", "एक", "दोन", "तीन", "चार", "पाच", "सहा", "सात", "आठ", "नऊ", "दहा",
    "अकरा", "बारा", "तेरा", "चौदा", "पंधरा", "सोळा", "सतरा", "अठरा", "एकोणीस", "वीस",
    "एकवीस", "बावीस", "तेवीस", "चोवीस", "पंचवीस", "सव्वीस", "सत्तावीस", "अठ्ठावीस", "एकोणतीस", "तीस",
    "एकतीस", "बत्तीस", "तेहेतीस", "चौतीस", "पस्तीस", "छत्तीस", "सदतीस", "अडतीस", "एकोणचाळीस", "चाळीस",
    "एकेचाळीस", "बेचाळीस", "त्रेचाळीस", "चव्वेचाळीस", "पंचेचाळीस", "सेहेचाळीस", "सत्तेचाळीस", "अठ्ठेचाळीस", "एकोणपन्नास", "पन्नास",
    "एक्कावन्न", "बावन्न", "त्रेपन्न", "चोपन्न", "पंचावन्न", "छप्पन्न", "सत्तावन्न", "अठ्ठावन्न", "एकोणसाठ", "साठ",
    "एकसष्ट", "बासष्ट", "त्रेसष्ट", "चौसष्ट", "पासष्ट", "सहासष्ट", "सदुसष्ट", "अडुसष्ट", "एकोणसत्तर", "सत्तर",
    "एकाहत्तर", "बहात्तर", "त्र्याहत्तर", "चौऱ्याहत्तर", "पंच्याहत्तर", "शहात्तर", "सत्याहत्तर", "अठ्ठ्याहत्तर", "एकोणऐंशी", "ऐंशी",
    "एक्याऐंशी", "ब्याऐंशी", "त्र्याऐंशी", "चौऱ्याऐंशी", "पंच्याऐंशी", "शहाऐंशी", "सत्याऐंशी", "अठ्ठ्याऐंशी", "नव्याऐंशी", "नव्वद",
    "एक्याण्णव", "बाण्णव", "त्र्याण्णव", "चौऱ्याण्णव", "पंच्याण्णव", "शहाण्णव", "सत्त्याण्णव", "अठ्ठ्याण्णव", "नव्याण्णव",
]

# Month number -> Marathi month name
MONTHS = {
    1: "जानेवारी", 2: "फेब्रुवारी", 3: "मार्च", 4: "एप्रिल",
    5: "मे", 6: "जून", 7: "जुलै", 8: "ऑगस्ट",
    9: "सप्टेंबर", 10: "ऑक्टोबर", 11: "नोव्हेंबर", 12: "डिसेंबर",
}

# Flip it so we can go from month name -> number
MONTH_TO_NUM = {v: k for k, v in MONTHS.items()}

# Indian numbering scale words, largest first: koti (10^7), lakh (10^5), thousand.
_SCALES = (
    (10 ** 7, "कोटी"),
    (10 ** 5, "लाख"),
    (10 ** 3, "हजार"),
)

# Alternation matching any Marathi month name
_MONTH_PAT = "|".join(MONTH_TO_NUM)

# The (?<!\d) / (?!\d) guards stop a date from matching in the middle of a
# longer digit run.
_NAMED_DATE_RE = re.compile(
    rf"(?<!\d)(\d{{1,2}})\s+({_MONTH_PAT})[,\s]+(\d{{4}})(?!\d)"
)
_NUMERIC_DATE_RE = re.compile(r"(?<!\d)(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})(?!\d)")
_NUMBER_RE = re.compile(r"\d+")


def _cardinal(n: int) -> str:
    """Spell a positive integer as a plain Marathi cardinal, with no year-style reading."""
    parts = []
    for value, name in _SCALES:
        if n >= value:
            # The multiplier is itself a plain cardinal, never a "year".
            parts.append(_cardinal(n // value) + " " + name)
            n %= value
    if n >= 100:
        parts.append(ONES[n // 100] + " शे")
        n %= 100
    if n > 0:
        parts.append(ONES[n])
    return " ".join(parts)


def num_to_words(n: int) -> str:
    """
    Turn any whole number into how a Marathi speaker would say it out loud.

    Years from 1700 to 1999 are read century-first: "एकोणीस शे ..." instead
    of the long thousands form. Everything else is spelled with "शे" for
    hundred and the scale words हजार, लाख and कोटी. Zero is "शून्य" and
    negative numbers are prefixed with "वजा".
    """
    if n == 0:
        return "शून्य"
    if n < 0:
        return "वजा " + num_to_words(-n)

    # Years in the 1700s-1900s - say it the natural way
    if 1700 <= n <= 1999:
        century, remainder = divmod(n, 100)
        words = ONES[century] + " शे"  # "सतरा शे" / "अठरा शे" / "एकोणीस शे"
        if remainder == 0:
            return words
        return words + " " + ONES[remainder]

    return _cardinal(n)


def preprocess(text: str) -> str:
    """
    Takes raw Marathi text and replaces all numbers/dates with spoken Marathi words.

    Three patterns it handles, in order:
    1. "5 ऑगस्ट 2004" or "5 ऑगस्ट, 2004" -> day in words + month name + year in words
       (the separator between month and year is normalised to one space)
    2. "05/08/2004" or "05-08-2004" -> same thing but from numeric day/month/year
    3. Any leftover run of digits -> num_to_words
    """
    def _named_date(m):
        day = int(m.group(1))
        year = int(m.group(3))
        # The matched text is already the canonical month name (it came from
        # MONTHS), so it is reused directly. num_to_words is used for the day
        # so that "0" is spoken as "शून्य" instead of vanishing.
        return f"{num_to_words(day)} {m.group(2)} {num_to_words(year)}"

    text = _NAMED_DATE_RE.sub(_named_date, text)

    def _numeric_date(m):
        day, month, year = int(m.group(1)), int(m.group(2)), int(m.group(3))
        if 1 <= month <= 12 and 1 <= day <= 31:
            return f"{ONES[day]} {MONTHS[month]} {num_to_words(year)}"
        return m.group(0)  # not a valid date, leave it for the number pass

    text = _NUMERIC_DATE_RE.sub(_numeric_date, text)

    # Anything else that's still a number
    return _NUMBER_RE.sub(lambda m: num_to_words(int(m.group())), text)
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: indic-tts-preprocess
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Convert numbers and dates in Indic text to spoken words for TTS models
|
|
5
|
+
Author-email: Dhruv Dornal <dhruvdornal2003@gmail.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2025 Dhruv
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Keywords: tts,indic,hindi,marathi,nlp,text-to-speech,preprocessing
|
|
29
|
+
Classifier: Development Status :: 3 - Alpha
|
|
30
|
+
Classifier: Intended Audience :: Developers
|
|
31
|
+
Classifier: Intended Audience :: Science/Research
|
|
32
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
33
|
+
Classifier: Programming Language :: Python :: 3
|
|
34
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
39
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
40
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
41
|
+
Classifier: Natural Language :: Hindi
|
|
42
|
+
Requires-Python: >=3.8
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
44
|
+
License-File: LICENSE
|
|
45
|
+
Dynamic: license-file
|
|
46
|
+
|
|
47
|
+
# indic-tts-preprocess
|
|
48
|
+
|
|
49
|
+
A small Python library that fixes one specific but very annoying problem with open-source Indic TTS models: **they cannot read numbers**.
|
|
50
|
+
|
|
51
|
+
Models like [ai4bharat/indic-parler-tts](https://huggingface.co/ai4bharat/indic-parler-tts) hallucinate badly when the input text contains raw digits. Feed them `"1997 में"` and you get garbled audio. Feed them `"उन्नीस सौ सत्तानवे में"` and it works perfectly.
|
|
52
|
+
|
|
53
|
+
This library does that conversion for you — numbers, years, dates — before the text ever reaches the tokenizer.
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Install
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install indic-tts-preprocess
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
No dependencies. Pure Python. Works on Python 3.8 and above.
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## Quick start
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from indic_tts_preprocess import preprocess
|
|
71
|
+
|
|
72
|
+
# Hindi
|
|
73
|
+
preprocess("उनका जन्म 5 अगस्त 1997 को हुआ", "hi")
|
|
74
|
+
# -> "उनका जन्म पाँच अगस्त उन्नीस सौ सत्तानवे को हुआ"
|
|
75
|
+
|
|
76
|
+
# Marathi
|
|
77
|
+
preprocess("त्यांचा जन्म 15 ऑगस्ट 1947 रोजी झाला", "mr")
|
|
78
|
+
# -> "त्यांचा जन्म पंधरा ऑगस्ट एकोणीस शे सत्तेचाळीस रोजी झाला"
|
|
79
|
+
|
|
80
|
+
# English
|
|
81
|
+
preprocess("He was born on 15 August 1947", "en")
|
|
82
|
+
# -> "He was born on fifteen August nineteen forty seven"
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## Supported languages
|
|
88
|
+
|
|
89
|
+
| Code | Language |
|
|
90
|
+
|------|----------|
|
|
91
|
+
| `hi` | Hindi |
|
|
92
|
+
| `mr` | Marathi |
|
|
93
|
+
| `en` | English |
|
|
94
|
+
|
|
95
|
+
If you pass an unsupported language code, the text comes back unchanged — nothing crashes.
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## What it handles
|
|
100
|
+
|
|
101
|
+
**Date formats**
|
|
102
|
+
|
|
103
|
+
| Input | Language | Output |
|
|
104
|
+
|-------|----------|--------|
|
|
105
|
+
| `5 अगस्त 2004` | hi | `पाँच अगस्त दो हज़ार चार` |
|
|
106
|
+
| `05/08/2004` | hi | `पाँच अगस्त दो हज़ार चार` |
|
|
107
|
+
| `05-08-2004` | hi | `पाँच अगस्त दो हज़ार चार` |
|
|
108
|
+
| `15 August 1947` | en | `fifteen August nineteen forty seven` |
|
|
109
|
+
| `15/08/1947` | en | `fifteen August nineteen forty seven` |
|
|
110
|
+
|
|
111
|
+
**Standalone numbers**
|
|
112
|
+
|
|
113
|
+
| Input | Language | Output |
|
|
114
|
+
|-------|----------|--------|
|
|
115
|
+
| `73` | hi | `तिहत्तर` |
|
|
116
|
+
| `1997` | hi | `उन्नीस सौ सत्तानवे` |
|
|
117
|
+
| `2024` | en | `two thousand twenty four` |
|
|
118
|
+
| `1905` | en | `nineteen oh five` |
|
|
119
|
+
|
|
120
|
+
**Year handling** (the tricky part)
|
|
121
|
+
|
|
122
|
+
Hindi and Marathi speakers read years from the 1700s through the 1900s as a count of hundreds, not the way you'd read the number literally:
|
|
123
|
+
- `1997` → `उन्नीस सौ सत्तानवे` (not `एक हज़ार नौ सौ सत्तानवे`)
|
|
124
|
+
|
|
125
|
+
English speakers do the same:
|
|
126
|
+
- `1997` → `nineteen ninety seven` (not `one thousand nine hundred ninety seven`)
|
|
127
|
+
- `1905` → `nineteen oh five`
|
|
128
|
+
|
|
129
|
+
The library handles all of these correctly.
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## API reference
|
|
134
|
+
|
|
135
|
+
### `preprocess(text, lang)`
|
|
136
|
+
|
|
137
|
+
| Parameter | Type | Description |
|
|
138
|
+
|-----------|------|-------------|
|
|
139
|
+
| `text` | `str` | The raw input text containing digits/dates |
|
|
140
|
+
| `lang` | `str` | Language code: `"hi"`, `"mr"`, or `"en"` |
|
|
141
|
+
|
|
142
|
+
Returns `str` — the same text with all digits replaced by spoken words.
|
|
143
|
+
|
|
144
|
+
Raises `TypeError` if `text` is not a string.
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
## Contributing
|
|
149
|
+
|
|
150
|
+
Adding a new language is straightforward:
|
|
151
|
+
|
|
152
|
+
1. Create `indic_tts_preprocess/languages/yourlang.py` — look at `hindi.py` as a template
|
|
153
|
+
2. Add a `preprocess(text)` function and a `num_to_words(n)` function
|
|
154
|
+
3. Add the new language code in `core.py`
|
|
155
|
+
4. Add tests in `tests/test_yourlang.py`
|
|
156
|
+
5. Open a pull request
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## License
|
|
161
|
+
|
|
162
|
+
MIT
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
indic_tts_preprocess/__init__.py
|
|
5
|
+
indic_tts_preprocess/core.py
|
|
6
|
+
indic_tts_preprocess.egg-info/PKG-INFO
|
|
7
|
+
indic_tts_preprocess.egg-info/SOURCES.txt
|
|
8
|
+
indic_tts_preprocess.egg-info/dependency_links.txt
|
|
9
|
+
indic_tts_preprocess.egg-info/top_level.txt
|
|
10
|
+
indic_tts_preprocess/languages/__init__.py
|
|
11
|
+
indic_tts_preprocess/languages/english.py
|
|
12
|
+
indic_tts_preprocess/languages/hindi.py
|
|
13
|
+
indic_tts_preprocess/languages/marathi.py
|
|
14
|
+
tests/test_core.py
|
|
15
|
+
tests/test_english.py
|
|
16
|
+
tests/test_hindi.py
|
|
17
|
+
tests/test_marathi.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
indic_tts_preprocess
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "indic-tts-preprocess"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
description = "Convert numbers and dates in Indic text to spoken words for TTS models"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { file = "LICENSE" }
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Dhruv Dornal", email = "dhruvdornal2003@gmail.com" }
|
|
13
|
+
]
|
|
14
|
+
keywords = ["tts", "indic", "hindi", "marathi", "nlp", "text-to-speech", "preprocessing"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 3 - Alpha",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.8",
|
|
22
|
+
"Programming Language :: Python :: 3.9",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Topic :: Text Processing :: Linguistic",
|
|
27
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
28
|
+
"Natural Language :: Hindi",
|
|
29
|
+
]
|
|
30
|
+
requires-python = ">=3.8"
|
|
31
|
+
dependencies = []
|
|
32
|
+
|
|
33
|
+
[tool.setuptools.packages.find]
|
|
34
|
+
where = ["."]
|
|
35
|
+
include = ["indic_tts_preprocess*"]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tests for the top-level preprocess() function.
|
|
3
|
+
This is the main thing users will call.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pytest
|
|
7
|
+
from indic_tts_preprocess import preprocess, SUPPORTED_LANGUAGES
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TestPreprocess:
    """Checks for the language-dispatching preprocess(text, lang) entry point."""

    def test_hindi_routing(self):
        assert preprocess("1997", "hi") == "उन्नीस सौ सत्तानवे"

    def test_marathi_routing(self):
        assert preprocess("1997", "mr") == "एकोणीस शे सत्त्याण्णव"

    def test_english_routing(self):
        assert preprocess("1997", "en") == "nineteen ninety seven"

    def test_unknown_lang_returns_text_unchanged(self):
        # An unrecognised code must hand the input back as-is, not raise.
        assert preprocess("hello 123", "xx") == "hello 123"

    def test_lang_is_case_insensitive(self):
        # Upper-case and mixed-case codes must behave like the lower-case one.
        for code in ("HI", "Hi"):
            assert preprocess("5", code) == preprocess("5", "hi")

    def test_raises_on_non_string_input(self):
        with pytest.raises(TypeError):
            preprocess(12345, "hi")

    def test_supported_languages_constant(self):
        for code in ("hi", "mr", "en"):
            assert code in SUPPORTED_LANGUAGES

    def test_empty_string(self):
        for code in ("hi", "en"):
            assert preprocess("", code) == ""
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tests for the English preprocessor.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
from indic_tts_preprocess.languages.english import preprocess, num_to_words
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TestNumToWords:
    """Checks for the English num_to_words() converter."""

    def test_zero(self):
        spoken = num_to_words(0)
        assert spoken == "zero"

    def test_teens(self):
        spoken = num_to_words(15)
        assert spoken == "fifteen"

    def test_two_digit(self):
        spoken = num_to_words(42)
        assert spoken == "forty two"

    def test_1900s_year(self):
        spoken = num_to_words(1997)
        assert spoken == "nineteen ninety seven"

    def test_1900s_oh_year(self):
        # A single-digit remainder is read with "oh": 1905 -> "nineteen oh five"
        spoken = num_to_words(1905)
        assert spoken == "nineteen oh five"

    def test_1900_exact(self):
        spoken = num_to_words(1900)
        assert spoken == "nineteen hundred"

    def test_2000(self):
        spoken = num_to_words(2000)
        assert spoken == "two thousand"

    def test_2000s(self):
        spoken = num_to_words(2024)
        assert spoken == "two thousand twenty four"

    def test_negative(self):
        spoken = num_to_words(-10)
        assert spoken == "minus ten"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class TestPreprocess:
    """Checks for the English preprocess() text pass."""

    def test_written_date(self):
        assert preprocess("15 August 1747") == "fifteen August seventeen forty seven"

    def test_written_date_case_insensitive(self):
        # A lower-case month name must still be recognised as a date.
        spoken = preprocess("15 august 1947")
        for fragment in ("fifteen", "nineteen forty seven"):
            assert fragment in spoken

    def test_numeric_date(self):
        spoken = preprocess("15/08/1947")
        for fragment in ("fifteen", "August"):
            assert fragment in spoken

    def test_standalone_year(self):
        assert "nineteen forty seven" in preprocess("The year was 1947")

    def test_text_without_numbers_unchanged(self):
        plain = "Hello world"
        assert preprocess(plain) == plain
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tests for the Hindi preprocessor.
|
|
3
|
+
Run with: python -m pytest tests/
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pytest
|
|
7
|
+
from indic_tts_preprocess.languages.hindi import preprocess, num_to_words
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TestNumToWords:
    """Checks for the Hindi num_to_words() converter."""

    def test_zero(self):
        spoken = num_to_words(0)
        assert spoken == "शून्य"

    def test_single_digit(self):
        spoken = num_to_words(5)
        assert spoken == "पाँच"

    def test_two_digit(self):
        spoken = num_to_words(73)
        assert spoken == "तिहत्तर"

    def test_hundred(self):
        spoken = num_to_words(100)
        assert spoken == "एक सौ"

    def test_1900s_year(self):
        # Years in the 1900s are read as "उन्नीस सौ ...", not "एक हज़ार नौ सौ ..."
        spoken = num_to_words(1997)
        assert spoken == "उन्नीस सौ सत्तानवे"

    def test_1900_exact(self):
        spoken = num_to_words(1900)
        assert spoken == "उन्नीस सौ"

    def test_2000s_year(self):
        spoken = num_to_words(2004)
        assert spoken == "दो हज़ार चार"

    def test_negative(self):
        spoken = num_to_words(-5)
        assert spoken == "माइनस पाँच"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class TestPreprocess:
    """Checks for the Hindi preprocess() text pass."""

    def test_named_date(self):
        assert preprocess("5 अगस्त 2004") == "पाँच अगस्त दो हज़ार चार"

    def test_named_date_with_comma(self):
        assert preprocess("5 अगस्त, 2004") == "पाँच अगस्त दो हज़ार चार"

    def test_numeric_date_slash(self):
        assert preprocess("05/08/2004") == "पाँच अगस्त दो हज़ार चार"

    def test_numeric_date_dash(self):
        assert preprocess("05-08-2004") == "पाँच अगस्त दो हज़ार चार"

    def test_standalone_number_in_sentence(self):
        assert "तिहत्तर" in preprocess("उसके पास 73 किताबें हैं")

    def test_1900s_year_in_sentence(self):
        assert "उन्नीस सौ सैंतालीस" in preprocess("उनका जन्म 1947 में हुआ")

    def test_text_without_numbers_unchanged(self):
        plain = "यह एक साधारण वाक्य है"
        assert preprocess(plain) == plain

    def test_1800s_year_in_sentence(self):
        assert "अठारह सौ तीन" in preprocess("उनका जन्म 1803 में हुआ")

    def test_1700s_year_in_sentence(self):
        assert "सत्रह सौ नौ" in preprocess("उनका जन्म 1709 में हुआ")
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tests for the Marathi preprocessor.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
from indic_tts_preprocess.languages.marathi import preprocess, num_to_words
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TestNumToWords:
    """Checks for the Marathi num_to_words() converter."""

    def test_zero(self):
        spoken = num_to_words(0)
        assert spoken == "शून्य"

    def test_single_digit(self):
        spoken = num_to_words(5)
        assert spoken == "पाच"

    def test_1900s_year(self):
        # Years in the 1900s are read with the "एकोणीस शे" prefix
        spoken = num_to_words(1997)
        assert spoken == "एकोणीस शे सत्त्याण्णव"

    def test_1900_exact(self):
        spoken = num_to_words(1900)
        assert spoken == "एकोणीस शे"

    def test_negative(self):
        spoken = num_to_words(-3)
        assert spoken == "वजा तीन"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class TestPreprocess:
    """Checks for the Marathi preprocess() text pass."""

    def test_named_date(self):
        assert preprocess("5 ऑगस्ट 2004") == "पाच ऑगस्ट दोन हजार चार"

    def test_numeric_date(self):
        assert "ऑगस्ट" in preprocess("05/08/2004")

    def test_standalone_number(self):
        assert "दहा" in preprocess("त्याच्याकडे 10 पुस्तके आहेत")

    def test_text_without_numbers_unchanged(self):
        plain = "हे एक साधे वाक्य आहे"
        assert preprocess(plain) == plain

    def test_1800s_year_in_sentence_marathi(self):
        assert "अठरा शे सात" in preprocess("त्यांचा जन्म 1807 मध्ये झाला")

    def test_1700s_year_in_sentence_marathi(self):
        assert "सतरा शे दोन" in preprocess("त्यांचा जन्म 1702 मध्ये झाला")
|