indic-tts-preprocess 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Dhruv
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,162 @@
1
+ Metadata-Version: 2.4
2
+ Name: indic-tts-preprocess
3
+ Version: 0.1.1
4
+ Summary: Convert numbers and dates in Indic text to spoken words for TTS models
5
+ Author-email: Dhruv Dornal <dhruvdornal2003@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Dhruv
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Keywords: tts,indic,hindi,marathi,nlp,text-to-speech,preprocessing
29
+ Classifier: Development Status :: 3 - Alpha
30
+ Classifier: Intended Audience :: Developers
31
+ Classifier: Intended Audience :: Science/Research
32
+ Classifier: License :: OSI Approved :: MIT License
33
+ Classifier: Programming Language :: Python :: 3
34
+ Classifier: Programming Language :: Python :: 3.8
35
+ Classifier: Programming Language :: Python :: 3.9
36
+ Classifier: Programming Language :: Python :: 3.10
37
+ Classifier: Programming Language :: Python :: 3.11
38
+ Classifier: Programming Language :: Python :: 3.12
39
+ Classifier: Topic :: Text Processing :: Linguistic
40
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
41
+ Classifier: Natural Language :: Hindi
42
+ Requires-Python: >=3.8
43
+ Description-Content-Type: text/markdown
44
+ License-File: LICENSE
45
+ Dynamic: license-file
46
+
47
+ # indic-tts-preprocess
48
+
49
+ A small Python library that fixes one specific but very annoying problem with open-source Indic TTS models: **they cannot read numbers**.
50
+
51
+ Models like [ai4bharat/indic-parler-tts](https://huggingface.co/ai4bharat/indic-parler-tts) hallucinate badly when the input text contains raw digits. Feed them `"1997 में"` and you get garbled audio. Feed them `"उन्नीस सौ सत्तानवे में"` and it works perfectly.
52
+
53
+ This library does that conversion for you — numbers, years, dates — before the text ever reaches the tokenizer.
54
+
55
+ ---
56
+
57
+ ## Install
58
+
59
+ ```bash
60
+ pip install indic-tts-preprocess
61
+ ```
62
+
63
+ No dependencies. Pure Python. Works on Python 3.8 and above.
64
+
65
+ ---
66
+
67
+ ## Quick start
68
+
69
+ ```python
70
+ from indic_tts_preprocess import preprocess
71
+
72
+ # Hindi
73
+ preprocess("उनका जन्म 5 अगस्त 1997 को हुआ", "hi")
74
+ # -> "उनका जन्म पाँच अगस्त उन्नीस सौ सत्तानवे को हुआ"
75
+
76
+ # Marathi
77
+ preprocess("त्यांचा जन्म 15 ऑगस्ट 1947 रोजी झाला", "mr")
78
+ # -> "त्यांचा जन्म पंधरा ऑगस्ट एकोणीस शे सत्तेचाळीस रोजी झाला"
79
+
80
+ # English
81
+ preprocess("He was born on 15 August 1947", "en")
82
+ # -> "He was born on fifteen August nineteen forty seven"
83
+ ```
84
+
85
+ ---
86
+
87
+ ## Supported languages
88
+
89
+ | Code | Language |
90
+ |------|----------|
91
+ | `hi` | Hindi |
92
+ | `mr` | Marathi |
93
+ | `en` | English |
94
+
95
+ If you pass an unsupported language code, the text comes back unchanged — nothing crashes.
96
+
97
+ ---
98
+
99
+ ## What it handles
100
+
101
+ **Date formats**
102
+
103
+ | Input | Language | Output |
104
+ |-------|----------|--------|
105
+ | `5 अगस्त 2004` | hi | `पाँच अगस्त दो हज़ार चार` |
106
+ | `05/08/2004` | hi | `पाँच अगस्त दो हज़ार चार` |
107
+ | `05-08-2004` | hi | `पाँच अगस्त दो हज़ार चार` |
108
+ | `15 August 1947` | en | `fifteen August nineteen forty seven` |
109
+ | `15/08/1947` | en | `fifteen August nineteen forty seven` |
110
+
111
+ **Standalone numbers**
112
+
113
+ | Input | Language | Output |
114
+ |-------|----------|--------|
115
+ | `73` | hi | `तिहत्तर` |
116
+ | `1997` | hi | `उन्नीस सौ सत्तानवे` |
117
+ | `2024` | en | `two thousand twenty four` |
118
+ | `1905` | en | `nineteen oh five` |
119
+
120
+ **Year handling** (the tricky part)
121
+
122
+ Hindi and Marathi speakers say years in the 1900s differently from how you'd read them literally:
123
+ - `1997` → `उन्नीस सौ सत्तानवे` (not `एक हज़ार नौ सौ सत्तानवे`)
124
+
125
+ English speakers do the same:
126
+ - `1997` → `nineteen ninety seven` (not `one thousand nine hundred ninety seven`)
127
+ - `1905` → `nineteen oh five`
128
+
129
+ The library handles all of these correctly.
130
+
131
+ ---
132
+
133
+ ## API reference
134
+
135
+ ### `preprocess(text, lang)`
136
+
137
+ | Parameter | Type | Description |
138
+ |-----------|------|-------------|
139
+ | `text` | `str` | The raw input text containing digits/dates |
140
+ | `lang` | `str` | Language code: `"hi"`, `"mr"`, or `"en"` |
141
+
142
+ Returns `str` — the same text with all digits replaced by spoken words.
143
+
144
+ Raises `TypeError` if `text` is not a string.
145
+
146
+ ---
147
+
148
+ ## Contributing
149
+
150
+ Adding a new language is straightforward:
151
+
152
+ 1. Create `indic_tts_preprocess/languages/yourlang.py` — look at `hindi.py` as a template
153
+ 2. Add a `preprocess(text)` function and a `num_to_words(n)` function
154
+ 3. Add the new language code in `core.py`
155
+ 4. Add tests in `tests/test_yourlang.py`
156
+ 5. Open a pull request
157
+
158
+ ---
159
+
160
+ ## License
161
+
162
+ MIT
@@ -0,0 +1,116 @@
1
+ # indic-tts-preprocess
2
+
3
+ A small Python library that fixes one specific but very annoying problem with open-source Indic TTS models: **they cannot read numbers**.
4
+
5
+ Models like [ai4bharat/indic-parler-tts](https://huggingface.co/ai4bharat/indic-parler-tts) hallucinate badly when the input text contains raw digits. Feed them `"1997 में"` and you get garbled audio. Feed them `"उन्नीस सौ सत्तानवे में"` and it works perfectly.
6
+
7
+ This library does that conversion for you — numbers, years, dates — before the text ever reaches the tokenizer.
8
+
9
+ ---
10
+
11
+ ## Install
12
+
13
+ ```bash
14
+ pip install indic-tts-preprocess
15
+ ```
16
+
17
+ No dependencies. Pure Python. Works on Python 3.8 and above.
18
+
19
+ ---
20
+
21
+ ## Quick start
22
+
23
+ ```python
24
+ from indic_tts_preprocess import preprocess
25
+
26
+ # Hindi
27
+ preprocess("उनका जन्म 5 अगस्त 1997 को हुआ", "hi")
28
+ # -> "उनका जन्म पाँच अगस्त उन्नीस सौ सत्तानवे को हुआ"
29
+
30
+ # Marathi
31
+ preprocess("त्यांचा जन्म 15 ऑगस्ट 1947 रोजी झाला", "mr")
32
+ # -> "त्यांचा जन्म पंधरा ऑगस्ट एकोणीस शे सत्तेचाळीस रोजी झाला"
33
+
34
+ # English
35
+ preprocess("He was born on 15 August 1947", "en")
36
+ # -> "He was born on fifteen August nineteen forty seven"
37
+ ```
38
+
39
+ ---
40
+
41
+ ## Supported languages
42
+
43
+ | Code | Language |
44
+ |------|----------|
45
+ | `hi` | Hindi |
46
+ | `mr` | Marathi |
47
+ | `en` | English |
48
+
49
+ If you pass an unsupported language code, the text comes back unchanged — nothing crashes.
50
+
51
+ ---
52
+
53
+ ## What it handles
54
+
55
+ **Date formats**
56
+
57
+ | Input | Language | Output |
58
+ |-------|----------|--------|
59
+ | `5 अगस्त 2004` | hi | `पाँच अगस्त दो हज़ार चार` |
60
+ | `05/08/2004` | hi | `पाँच अगस्त दो हज़ार चार` |
61
+ | `05-08-2004` | hi | `पाँच अगस्त दो हज़ार चार` |
62
+ | `15 August 1947` | en | `fifteen August nineteen forty seven` |
63
+ | `15/08/1947` | en | `fifteen August nineteen forty seven` |
64
+
65
+ **Standalone numbers**
66
+
67
+ | Input | Language | Output |
68
+ |-------|----------|--------|
69
+ | `73` | hi | `तिहत्तर` |
70
+ | `1997` | hi | `उन्नीस सौ सत्तानवे` |
71
+ | `2024` | en | `two thousand twenty four` |
72
+ | `1905` | en | `nineteen oh five` |
73
+
74
+ **Year handling** (the tricky part)
75
+
76
+ Hindi and Marathi speakers say years in the 1900s differently from how you'd read them literally:
77
+ - `1997` → `उन्नीस सौ सत्तानवे` (not `एक हज़ार नौ सौ सत्तानवे`)
78
+
79
+ English speakers do the same:
80
+ - `1997` → `nineteen ninety seven` (not `one thousand nine hundred ninety seven`)
81
+ - `1905` → `nineteen oh five`
82
+
83
+ The library handles all of these correctly.
84
+
85
+ ---
86
+
87
+ ## API reference
88
+
89
+ ### `preprocess(text, lang)`
90
+
91
+ | Parameter | Type | Description |
92
+ |-----------|------|-------------|
93
+ | `text` | `str` | The raw input text containing digits/dates |
94
+ | `lang` | `str` | Language code: `"hi"`, `"mr"`, or `"en"` |
95
+
96
+ Returns `str` — the same text with all digits replaced by spoken words.
97
+
98
+ Raises `TypeError` if `text` is not a string.
99
+
100
+ ---
101
+
102
+ ## Contributing
103
+
104
+ Adding a new language is straightforward:
105
+
106
+ 1. Create `indic_tts_preprocess/languages/yourlang.py` — look at `hindi.py` as a template
107
+ 2. Add a `preprocess(text)` function and a `num_to_words(n)` function
108
+ 3. Add the new language code in `core.py`
109
+ 4. Add tests in `tests/test_yourlang.py`
110
+ 5. Open a pull request
111
+
112
+ ---
113
+
114
+ ## License
115
+
116
+ MIT
@@ -0,0 +1,25 @@
1
+ """
2
+ indic-tts-preprocess
3
+ --------------------
4
+ Converts numbers and dates in Indic text into spoken words,
5
+ so that open-source TTS models like ai4bharat/indic-parler-tts
6
+ stop hallucinating when they see raw digits.
7
+
8
+ Basic usage:
9
+
10
+ from indic_tts_preprocess import preprocess
11
+
12
+ clean = preprocess("उनका जन्म 5 अगस्त 1997 को हुआ", "hi")
13
+ # -> "उनका जन्म पाँच अगस्त उन्नीस सौ सत्तानवे को हुआ"
14
+
15
+ Supported languages:
16
+ "hi" - Hindi
17
+ "mr" - Marathi
18
+ "en" - English
19
+ """
20
+
21
+ from indic_tts_preprocess.core import preprocess, SUPPORTED_LANGUAGES
22
+
23
+ __version__ = "0.1.1"
24
+ __author__ = "Dhruv Dornal"
25
+ __all__ = ["preprocess", "SUPPORTED_LANGUAGES"]
@@ -0,0 +1,56 @@
1
+ """
2
+ core.py
3
+
4
+ The main entry point. This is the only function most people will ever need to call.
5
+ It figures out which language to use and sends the text to the right preprocessor.
6
+ """
7
+
8
+ from indic_tts_preprocess.languages import hindi, marathi, english
9
+
10
+ # All the language codes we support right now.
11
+ # If someone passes something not in this list, we just return their text unchanged.
12
+ SUPPORTED_LANGUAGES = {"hi", "mr", "en"}
13
+
14
+
15
def preprocess(text: str, lang: str) -> str:
    """
    Clean up text so Indic TTS models can read numbers and dates correctly.

    The models hallucinate badly when they see raw digits like "1997" or "73".
    This function converts them into the spoken form before the text hits the model.

    Parameters
    ----------
    text : str
        The raw input text with digits/dates in it.
    lang : str
        Language code. Use "hi" for Hindi, "mr" for Marathi, "en" for English.
        Surrounding whitespace and letter case are ignored.

    Returns
    -------
    str
        The same text but with all numbers replaced by spoken words.
        If ``lang`` is not one of the supported codes (including when it is
        not a string at all), ``text`` is returned unchanged.

    Raises
    ------
    TypeError
        If ``text`` is not a string.

    Examples
    --------
    >>> from indic_tts_preprocess import preprocess
    >>> preprocess("उनका जन्म 5 अगस्त 1997 को हुआ", "hi")
    'उनका जन्म पाँच अगस्त उन्नीस सौ सत्तानवे को हुआ'

    >>> preprocess("He was born on 15 August 1947", "en")
    'He was born on fifteen August nineteen forty seven'
    """
    if not isinstance(text, str):
        raise TypeError(f"text must be a string, got {type(text).__name__}")

    # A missing or non-string language (e.g. None) is just another
    # "unsupported language": pass the text through instead of letting
    # lang.strip() blow up with AttributeError.
    if not isinstance(lang, str):
        return text

    code = lang.strip().lower()

    if code == "hi":
        return hindi.preprocess(text)
    if code == "mr":
        return marathi.preprocess(text)
    if code == "en":
        return english.preprocess(text)

    # Unknown language - just return the text as-is so nothing breaks downstream
    return text
@@ -0,0 +1,124 @@
1
+ """
2
+ english.py
3
+
4
+ English number/date cleanup for TTS.
5
+ Handles years the way English speakers actually say them out loud.
6
+ """
7
+
8
+ import re
9
+
10
# Words for 0-19, indexed by value. These don't follow any pattern, so they
# are simply listed. Index 0 is blank so round tens/hundreds get no suffix.
ONES = [
    "", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
    "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
]

# Tens words indexed by the tens digit; slots 0 and 1 are unused placeholders.
TENS = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

# Month number -> English month name
MONTHS = {
    1: "January", 2: "February", 3: "March", 4: "April",
    5: "May", 6: "June", 7: "July", 8: "August",
    9: "September", 10: "October", 11: "November", 12: "December",
}

# Regex alternation of all month names (used to match written dates like "5 August 2004")
MONTHS_RE = "|".join(MONTHS.values())

# Scale words, largest first, used for plain cardinal numbers.
_SCALES = (
    (10 ** 12, "trillion"),
    (10 ** 9, "billion"),
    (10 ** 6, "million"),
    (10 ** 3, "thousand"),
)

# The lookarounds stop a date from being carved out of a longer digit run
# (e.g. the "25 August 2004" inside "125 August 2004").
_WRITTEN_DATE_RE = re.compile(
    rf"(?<!\d)(\d{{1,2}})\s+({MONTHS_RE})\s+(\d{{4}})(?!\d)",
    flags=re.IGNORECASE,
)
_NUMERIC_DATE_RE = re.compile(r"(?<!\d)(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})(?!\d)")
_NUMBER_RE = re.compile(r"\d+")


def _under_100(n: int) -> str:
    """Convert a number in 0-99 to English words (0 gives an empty string)."""
    if n < 20:
        return ONES[n]
    tens, ones = divmod(n, 10)
    if ones == 0:
        return TENS[tens]
    return TENS[tens] + " " + ONES[ones]


def _cardinal(n: int) -> str:
    """Spell a positive integer as a plain cardinal, with no year special cases."""
    parts = []
    for value, word in _SCALES:
        if n >= value:
            count, n = divmod(n, value)
            # The group count is itself a cardinal ("nineteen hundred
            # ninety seven thousand" would be wrong), so recurse here
            # rather than through num_to_words().
            parts.append(_cardinal(count) + " " + word)
    if n >= 100:
        hundreds, n = divmod(n, 100)
        parts.append(ONES[hundreds] + " hundred")
    if n > 0:
        parts.append(_under_100(n))
    return " ".join(parts)


def num_to_words(n: int) -> str:
    """
    Turn any whole number into spoken English.

    Years have their own rules because people say them differently:
    - 1700-1999: "seventeen seventy six", "nineteen oh five", "eighteen hundred"
    - 2000-2099: "two thousand", "two thousand twenty four"
    Everything else uses the standard hundred/thousand/million structure.

    Parameters
    ----------
    n : int
        The number to spell out. Negative values are prefixed with "minus".

    Returns
    -------
    str
        The spoken form, words separated by single spaces.
    """
    if n == 0:
        return "zero"
    if n < 0:
        return "minus " + num_to_words(-n)

    # 1700s-1900s: "nineteen ninety two" not "one thousand nine hundred ninety two".
    # The century word is just the number 17/18/19 read on its own.
    if 1700 <= n <= 1999:
        century, second = divmod(n, 100)
        first = ONES[century]
        if second == 0:
            return first + " hundred"
        if second < 10:
            return first + " oh " + ONES[second]  # "nineteen oh five"
        return first + " " + _under_100(second)

    # 2000s: "two thousand", "two thousand twenty four"
    if 2000 <= n <= 2099:
        second = n - 2000
        if second == 0:
            return "two thousand"
        return "two thousand " + _under_100(second)

    return _cardinal(n)


def preprocess(text: str) -> str:
    """
    Takes raw English text and replaces all numbers/dates with spoken English words.

    Three passes, in order:
    1. Written dates such as "5 August 2004" (month name matched case-insensitively
       and kept exactly as typed).
    2. Numeric day/month/year dates such as "05/08/2004" or "05-08-2004".
    3. Every remaining run of digits, read as a number.
    """
    def _written_date(m):
        day, month_name, year = int(m.group(1)), m.group(2), int(m.group(3))
        # num_to_words (not _under_100) so that a day of 0 becomes "zero"
        # instead of silently vanishing.
        return f"{num_to_words(day)} {month_name} {num_to_words(year)}"

    def _numeric_date(m):
        day, month, year = int(m.group(1)), int(m.group(2)), int(m.group(3))
        if 1 <= month <= 12 and 1 <= day <= 31:
            return f"{_under_100(day)} {MONTHS[month]} {num_to_words(year)}"
        # Not a plausible date: leave it for the plain-number pass below.
        return m.group(0)

    text = _WRITTEN_DATE_RE.sub(_written_date, text)
    text = _NUMERIC_DATE_RE.sub(_numeric_date, text)

    # Anything else that's still a number
    return _NUMBER_RE.sub(lambda m: num_to_words(int(m.group(0))), text)
@@ -0,0 +1,121 @@
1
+ """
2
+ hindi.py
3
+
4
+ All the Hindi-specific stuff lives here:
5
+ - Word lists for numbers 0-99
6
+ - Month name mappings
7
+ - The actual function that cleans up Hindi text for TTS
8
+ """
9
+
10
+ import re
11
+
12
# Every number from 0 to 99 written out in Hindi.
# Index position = the number itself. So ONES[5] = "पाँच".
# Index 0 is blank because we never say "zero" in the middle of a date.
ONES = [
    "", "एक", "दो", "तीन", "चार", "पाँच", "छह", "सात", "आठ", "नौ", "दस",
    "ग्यारह", "बारह", "तेरह", "चौदह", "पंद्रह", "सोलह", "सत्रह", "अठारह", "उन्नीस", "बीस",
    "इक्कीस", "बाईस", "तेईस", "चौबीस", "पच्चीस", "छब्बीस", "सत्ताईस", "अट्ठाईस", "उनतीस", "तीस",
    "इकतीस", "बत्तीस", "तैंतीस", "चौंतीस", "पैंतीस", "छत्तीस", "सैंतीस", "अड़तीस", "उनतालीस", "चालीस",
    "इकतालीस", "बयालीस", "तैंतालीस", "चौंतालीस", "पैंतालीस", "छियालीस", "सैंतालीस", "अड़तालीस", "उनचास", "पचास",
    "इक्यावन", "बावन", "तिरपन", "चौवन", "पचपन", "छप्पन", "सत्तावन", "अट्ठावन", "उनसठ", "साठ",
    "इकसठ", "बासठ", "तिरसठ", "चौंसठ", "पैंसठ", "छियासठ", "सड़सठ", "अड़सठ", "उनहत्तर", "सत्तर",
    "इकहत्तर", "बहत्तर", "तिहत्तर", "चौहत्तर", "पचहत्तर", "छिहत्तर", "सतहत्तर", "अठहत्तर", "उनासी", "अस्सी",
    "इक्यासी", "बयासी", "तिरासी", "चौरासी", "पचासी", "छियासी", "सत्तासी", "अट्ठासी", "नवासी", "नब्बे",
    "इक्यानबे", "बानबे", "तिरानबे", "चौरानबे", "पचानबे", "छियानबे", "सत्तानवे", "अट्ठानवे", "निन्यानवे",
]

# Month number -> Hindi month name
MONTHS = {
    1: "जनवरी", 2: "फरवरी", 3: "मार्च", 4: "अप्रैल",
    5: "मई", 6: "जून", 7: "जुलाई", 8: "अगस्त",
    9: "सितंबर", 10: "अक्टूबर", 11: "नवंबर", 12: "दिसंबर",
}

# Flip it so we can go from month name -> number too
MONTH_TO_NUM = {v: k for k, v in MONTHS.items()}

# Indian-system scale words, largest first: crore (10^7), lakh (10^5), thousand.
_SCALES = (
    (10_000_000, "करोड़"),
    (100_000, "लाख"),
    (1_000, "हज़ार"),
)

# Patterns are compiled once at import time. The lookarounds stop a date
# from being carved out of a longer digit run (e.g. "125 अगस्त 2004").
_MONTH_PATTERN = "|".join(MONTH_TO_NUM)
_NAMED_DATE_RE = re.compile(
    rf"(?<!\d)(\d{{1,2}})\s+({_MONTH_PATTERN})[,\s]+(\d{{4}})(?!\d)"
)
_NUMERIC_DATE_RE = re.compile(r"(?<!\d)(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})(?!\d)")
_NUMBER_RE = re.compile(r"\d+")


def _cardinal(n: int) -> str:
    """Spell a positive integer as a plain Hindi cardinal, with no year special cases."""
    parts = []
    for value, word in _SCALES:
        if n >= value:
            count, n = divmod(n, value)
            # The group count is a plain number, never a year, so recurse
            # here rather than through num_to_words().
            parts.append(_cardinal(count) + " " + word)
    if n >= 100:
        hundreds, n = divmod(n, 100)
        parts.append(ONES[hundreds] + " सौ")
    if n > 0:
        parts.append(ONES[n])
    return " ".join(parts)


def num_to_words(n: int) -> str:
    """
    Turn any whole number into how a Hindi speaker would say it out loud.

    Years from 1700 to 1999 get special treatment: Hindi speakers say
    "उन्नीस सौ सत्तानवे" (nineteen hundred ninety-seven) not
    "एक हज़ार नौ सौ सत्तानवे". This matters a lot for birth years,
    historical dates, etc. Everything else is grouped the Indian way
    (thousand / lakh / crore).

    Parameters
    ----------
    n : int
        The number to spell out. Negative values are prefixed with "माइनस".

    Returns
    -------
    str
        The spoken form, words separated by single spaces.
    """
    if n == 0:
        return "शून्य"
    if n < 0:
        return "माइनस " + num_to_words(-n)

    # Years in the 1700s-1900s - say it the natural way. The century word
    # is simply 17/18/19 read as a number.
    if 1700 <= n <= 1999:
        century, remainder = divmod(n, 100)
        spoken = ONES[century] + " सौ"
        if remainder == 0:
            return spoken
        return spoken + " " + ONES[remainder]

    return _cardinal(n)


def preprocess(text: str) -> str:
    """
    Takes raw Hindi text and replaces all numbers/dates with spoken Hindi words.

    Three patterns it handles, in order:
    1. "5 अगस्त 2004" or "5 अगस्त, 2004" -> day in Hindi words + month name + year in words
    2. "05/08/2004" or "05-08-2004" -> same thing but from numeric day/month/year
    3. Any leftover numbers like "73" -> "तिहत्तर"
    """
    def _named_date(m):
        day = int(m.group(1))
        month = MONTH_TO_NUM[m.group(2)]
        year = int(m.group(3))
        # num_to_words (not ONES[day]) so a day of 0 becomes "शून्य"
        # instead of an empty word.
        return f"{num_to_words(day)} {MONTHS[month]} {num_to_words(year)}"

    def _numeric_date(m):
        day, month, year = int(m.group(1)), int(m.group(2)), int(m.group(3))
        if 1 <= month <= 12 and 1 <= day <= 31:
            return f"{ONES[day]} {MONTHS[month]} {num_to_words(year)}"
        return m.group(0)  # not a valid date, leave it for the number pass

    text = _NAMED_DATE_RE.sub(_named_date, text)
    text = _NUMERIC_DATE_RE.sub(_numeric_date, text)

    # Anything else that's still a number
    return _NUMBER_RE.sub(lambda m: num_to_words(int(m.group())), text)
@@ -0,0 +1,112 @@
1
+ """
2
+ marathi.py
3
+
4
+ All the Marathi-specific stuff lives here.
5
+ Same structure as hindi.py but with Marathi words throughout.
6
+ """
7
+
8
+ import re
9
+
10
# Every number from 0 to 99 written out in Marathi.
# Same idea as Hindi - index = the number. ONES[3] = "तीन".
# Index 0 is blank so round hundreds/thousands get no trailing word.
ONES = [
    "", "एक", "दोन", "तीन", "चार", "पाच", "सहा", "सात", "आठ", "नऊ", "दहा",
    "अकरा", "बारा", "तेरा", "चौदा", "पंधरा", "सोळा", "सतरा", "अठरा", "एकोणीस", "वीस",
    "एकवीस", "बावीस", "तेवीस", "चोवीस", "पंचवीस", "सव्वीस", "सत्तावीस", "अठ्ठावीस", "एकोणतीस", "तीस",
    "एकतीस", "बत्तीस", "तेहेतीस", "चौतीस", "पस्तीस", "छत्तीस", "सदतीस", "अडतीस", "एकोणचाळीस", "चाळीस",
    "एकेचाळीस", "बेचाळीस", "त्रेचाळीस", "चव्वेचाळीस", "पंचेचाळीस", "सेहेचाळीस", "सत्तेचाळीस", "अठ्ठेचाळीस", "एकोणपन्नास", "पन्नास",
    "एक्कावन्न", "बावन्न", "त्रेपन्न", "चोपन्न", "पंचावन्न", "छप्पन्न", "सत्तावन्न", "अठ्ठावन्न", "एकोणसाठ", "साठ",
    "एकसष्ट", "बासष्ट", "त्रेसष्ट", "चौसष्ट", "पासष्ट", "सहासष्ट", "सदुसष्ट", "अडुसष्ट", "एकोणसत्तर", "सत्तर",
    "एकाहत्तर", "बहात्तर", "त्र्याहत्तर", "चौऱ्याहत्तर", "पंच्याहत्तर", "शहात्तर", "सत्याहत्तर", "अठ्ठ्याहत्तर", "एकोणऐंशी", "ऐंशी",
    "एक्याऐंशी", "ब्याऐंशी", "त्र्याऐंशी", "चौऱ्याऐंशी", "पंच्याऐंशी", "शहाऐंशी", "सत्याऐंशी", "अठ्ठ्याऐंशी", "नव्याऐंशी", "नव्वद",
    "एक्याण्णव", "बाण्णव", "त्र्याण्णव", "चौऱ्याण्णव", "पंच्याण्णव", "शहाण्णव", "सत्त्याण्णव", "अठ्ठ्याण्णव", "नव्याण्णव",
]

# Month number -> Marathi month name
MONTHS = {
    1: "जानेवारी", 2: "फेब्रुवारी", 3: "मार्च", 4: "एप्रिल",
    5: "मे", 6: "जून", 7: "जुलै", 8: "ऑगस्ट",
    9: "सप्टेंबर", 10: "ऑक्टोबर", 11: "नोव्हेंबर", 12: "डिसेंबर",
}

# Flip it so we can go from month name -> number
MONTH_TO_NUM = {v: k for k, v in MONTHS.items()}

# Indian-system scale words, largest first: crore (10^7), lakh (10^5), thousand.
_SCALES = (
    (10_000_000, "कोटी"),
    (100_000, "लाख"),
    (1_000, "हजार"),
)

# Patterns are compiled once at import time. The lookarounds stop a date
# from being carved out of a longer digit run (e.g. "125 ऑगस्ट 2004").
_MONTH_PATTERN = "|".join(MONTH_TO_NUM)
_NAMED_DATE_RE = re.compile(
    rf"(?<!\d)(\d{{1,2}})\s+({_MONTH_PATTERN})[,\s]+(\d{{4}})(?!\d)"
)
_NUMERIC_DATE_RE = re.compile(r"(?<!\d)(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})(?!\d)")
_NUMBER_RE = re.compile(r"\d+")


def _cardinal(n: int) -> str:
    """Spell a positive integer as a plain Marathi cardinal, with no year special cases."""
    parts = []
    for value, word in _SCALES:
        if n >= value:
            count, n = divmod(n, value)
            # The group count is a plain number, never a year, so recurse
            # here rather than through num_to_words().
            parts.append(_cardinal(count) + " " + word)
    if n >= 100:
        hundreds, n = divmod(n, 100)
        # NOTE(review): an exact 100 is usually spoken as "शंभर" rather than
        # "एक शे" - confirm with a native speaker before changing the output.
        parts.append(ONES[hundreds] + " शे")
    if n > 0:
        parts.append(ONES[n])
    return " ".join(parts)


def num_to_words(n: int) -> str:
    """
    Turn any whole number into how a Marathi speaker would say it out loud.

    Same special case as Hindi for years from 1700 to 1999: "एकोणीस शे ..."
    instead of the long form. Marathi uses "शे" for hundred and "हजार" for
    thousand; larger numbers are grouped the Indian way (lakh / crore).

    Parameters
    ----------
    n : int
        The number to spell out. Negative values are prefixed with "वजा".

    Returns
    -------
    str
        The spoken form, words separated by single spaces.
    """
    if n == 0:
        return "शून्य"
    if n < 0:
        return "वजा " + num_to_words(-n)

    # Years in the 1700s-1900s - say it the natural way. The century word
    # is simply 17/18/19 read as a number.
    if 1700 <= n <= 1999:
        century, remainder = divmod(n, 100)
        spoken = ONES[century] + " शे"
        if remainder == 0:
            return spoken
        return spoken + " " + ONES[remainder]

    return _cardinal(n)


def preprocess(text: str) -> str:
    """
    Takes raw Marathi text and replaces all numbers/dates with spoken Marathi words.

    Three patterns it handles, in order:
    1. "5 ऑगस्ट 2004" or "5 ऑगस्ट, 2004" -> day + month name + year in words
    2. "05/08/2004" or "05-08-2004" -> same thing from numeric day/month/year
    3. Any leftover run of digits, read as a number
    """
    def _named_date(m):
        day = int(m.group(1))
        month = MONTH_TO_NUM[m.group(2)]
        year = int(m.group(3))
        # num_to_words (not ONES[day]) so a day of 0 becomes "शून्य"
        # instead of an empty word.
        return f"{num_to_words(day)} {MONTHS[month]} {num_to_words(year)}"

    def _numeric_date(m):
        day, month, year = int(m.group(1)), int(m.group(2)), int(m.group(3))
        if 1 <= month <= 12 and 1 <= day <= 31:
            return f"{ONES[day]} {MONTHS[month]} {num_to_words(year)}"
        return m.group(0)  # not a valid date, leave it for the number pass

    text = _NAMED_DATE_RE.sub(_named_date, text)
    text = _NUMERIC_DATE_RE.sub(_numeric_date, text)

    # Anything else that's still a number
    return _NUMBER_RE.sub(lambda m: num_to_words(int(m.group())), text)
@@ -0,0 +1,162 @@
1
+ Metadata-Version: 2.4
2
+ Name: indic-tts-preprocess
3
+ Version: 0.1.1
4
+ Summary: Convert numbers and dates in Indic text to spoken words for TTS models
5
+ Author-email: Dhruv Dornal <dhruvdornal2003@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Dhruv
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Keywords: tts,indic,hindi,marathi,nlp,text-to-speech,preprocessing
29
+ Classifier: Development Status :: 3 - Alpha
30
+ Classifier: Intended Audience :: Developers
31
+ Classifier: Intended Audience :: Science/Research
32
+ Classifier: License :: OSI Approved :: MIT License
33
+ Classifier: Programming Language :: Python :: 3
34
+ Classifier: Programming Language :: Python :: 3.8
35
+ Classifier: Programming Language :: Python :: 3.9
36
+ Classifier: Programming Language :: Python :: 3.10
37
+ Classifier: Programming Language :: Python :: 3.11
38
+ Classifier: Programming Language :: Python :: 3.12
39
+ Classifier: Topic :: Text Processing :: Linguistic
40
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
41
+ Classifier: Natural Language :: Hindi
42
+ Requires-Python: >=3.8
43
+ Description-Content-Type: text/markdown
44
+ License-File: LICENSE
45
+ Dynamic: license-file
46
+
47
+ # indic-tts-preprocess
48
+
49
+ A small Python library that fixes one specific but very annoying problem with open-source Indic TTS models: **they cannot read numbers**.
50
+
51
+ Models like [ai4bharat/indic-parler-tts](https://huggingface.co/ai4bharat/indic-parler-tts) hallucinate badly when the input text contains raw digits. Feed them `"1997 में"` and you get garbled audio. Feed them `"उन्नीस सौ सत्तानवे में"` and it works perfectly.
52
+
53
+ This library does that conversion for you — numbers, years, dates — before the text ever reaches the tokenizer.
54
+
55
+ ---
56
+
57
+ ## Install
58
+
59
+ ```bash
60
+ pip install indic-tts-preprocess
61
+ ```
62
+
63
+ No dependencies. Pure Python. Works on Python 3.8 and above.
64
+
65
+ ---
66
+
67
+ ## Quick start
68
+
69
+ ```python
70
+ from indic_tts_preprocess import preprocess
71
+
72
+ # Hindi
73
+ preprocess("उनका जन्म 5 अगस्त 1997 को हुआ", "hi")
74
+ # -> "उनका जन्म पाँच अगस्त उन्नीस सौ सत्तानवे को हुआ"
75
+
76
+ # Marathi
77
+ preprocess("त्यांचा जन्म 15 ऑगस्ट 1947 रोजी झाला", "mr")
78
+ # -> "त्यांचा जन्म पंधरा ऑगस्ट एकोणीस शे सत्तेचाळीस रोजी झाला"
79
+
80
+ # English
81
+ preprocess("He was born on 15 August 1947", "en")
82
+ # -> "He was born on fifteen August nineteen forty seven"
83
+ ```
84
+
85
+ ---
86
+
87
+ ## Supported languages
88
+
89
+ | Code | Language |
90
+ |------|----------|
91
+ | `hi` | Hindi |
92
+ | `mr` | Marathi |
93
+ | `en` | English |
94
+
95
+ If you pass an unsupported language code, the text comes back unchanged — nothing crashes.
96
+
97
+ ---
98
+
99
+ ## What it handles
100
+
101
+ **Date formats**
102
+
103
+ | Input | Language | Output |
104
+ |-------|----------|--------|
105
+ | `5 अगस्त 2004` | hi | `पाँच अगस्त दो हज़ार चार` |
106
+ | `05/08/2004` | hi | `पाँच अगस्त दो हज़ार चार` |
107
+ | `05-08-2004` | hi | `पाँच अगस्त दो हज़ार चार` |
108
+ | `15 August 1947` | en | `fifteen August nineteen forty seven` |
109
+ | `15/08/1947` | en | `fifteen August nineteen forty seven` |
110
+
111
+ **Standalone numbers**
112
+
113
+ | Input | Language | Output |
114
+ |-------|----------|--------|
115
+ | `73` | hi | `तिहत्तर` |
116
+ | `1997` | hi | `उन्नीस सौ सत्तानवे` |
117
+ | `2024` | en | `two thousand twenty four` |
118
+ | `1905` | en | `nineteen oh five` |
119
+
120
+ **Year handling** (the tricky part)
121
+
122
+ Hindi and Marathi speakers say years in the 1900s differently from how you'd read them literally:
123
+ - `1997` → `उन्नीस सौ सत्तानवे` (not `एक हज़ार नौ सौ सत्तानवे`)
124
+
125
+ English speakers do the same:
126
+ - `1997` → `nineteen ninety seven` (not `one thousand nine hundred ninety seven`)
127
+ - `1905` → `nineteen oh five`
128
+
129
+ The library handles all of these correctly.
130
+
131
+ ---
132
+
133
+ ## API reference
134
+
135
+ ### `preprocess(text, lang)`
136
+
137
+ | Parameter | Type | Description |
138
+ |-----------|------|-------------|
139
+ | `text` | `str` | The raw input text containing digits/dates |
140
+ | `lang` | `str` | Language code: `"hi"`, `"mr"`, or `"en"` |
141
+
142
+ Returns `str` — the same text with all digits replaced by spoken words.
143
+
144
+ Raises `TypeError` if `text` is not a string.
145
+
146
+ ---
147
+
148
+ ## Contributing
149
+
150
+ Adding a new language is straightforward:
151
+
152
+ 1. Create `indic_tts_preprocess/languages/yourlang.py` — look at `hindi.py` as a template
153
+ 2. Add a `preprocess(text)` function and a `num_to_words(n)` function
154
+ 3. Add the new language code in `core.py`
155
+ 4. Add tests in `tests/test_yourlang.py`
156
+ 5. Open a pull request
157
+
158
+ ---
159
+
160
+ ## License
161
+
162
+ MIT
@@ -0,0 +1,17 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ indic_tts_preprocess/__init__.py
5
+ indic_tts_preprocess/core.py
6
+ indic_tts_preprocess.egg-info/PKG-INFO
7
+ indic_tts_preprocess.egg-info/SOURCES.txt
8
+ indic_tts_preprocess.egg-info/dependency_links.txt
9
+ indic_tts_preprocess.egg-info/top_level.txt
10
+ indic_tts_preprocess/languages/__init__.py
11
+ indic_tts_preprocess/languages/english.py
12
+ indic_tts_preprocess/languages/hindi.py
13
+ indic_tts_preprocess/languages/marathi.py
14
+ tests/test_core.py
15
+ tests/test_english.py
16
+ tests/test_hindi.py
17
+ tests/test_marathi.py
@@ -0,0 +1 @@
1
+ indic_tts_preprocess
@@ -0,0 +1,35 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "indic-tts-preprocess"
7
+ version = "0.1.1"
8
+ description = "Convert numbers and dates in Indic text to spoken words for TTS models"
9
+ readme = "README.md"
10
+ license = { file = "LICENSE" }
11
+ authors = [
12
+ { name = "Dhruv Dornal", email = "dhruvdornal2003@gmail.com" }
13
+ ]
14
+ keywords = ["tts", "indic", "hindi", "marathi", "nlp", "text-to-speech", "preprocessing"]
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Intended Audience :: Developers",
18
+ "Intended Audience :: Science/Research",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.8",
22
+ "Programming Language :: Python :: 3.9",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Topic :: Text Processing :: Linguistic",
27
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
28
+ "Natural Language :: Hindi",
29
+ ]
30
+ requires-python = ">=3.8"
31
+ dependencies = []
32
+
33
+ [tool.setuptools.packages.find]
34
+ where = ["."]
35
+ include = ["indic_tts_preprocess*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,44 @@
1
+ """
2
+ Tests for the top-level preprocess() function.
3
+ preprocess() is the main entry point that users call.
4
+ """
5
+
6
+ import pytest
7
+ from indic_tts_preprocess import preprocess, SUPPORTED_LANGUAGES
8
+
9
+
10
+ class TestPreprocess:
11
+ def test_hindi_routing(self):
12
+ result = preprocess("1997", "hi")
13
+ assert result == "उन्नीस सौ सत्तानवे"
14
+
15
+ def test_marathi_routing(self):
16
+ result = preprocess("1997", "mr")
17
+ assert result == "एकोणीस शे सत्त्याण्णव"
18
+
19
+ def test_english_routing(self):
20
+ result = preprocess("1997", "en")
21
+ assert result == "nineteen ninety seven"
22
+
23
+ def test_unknown_lang_returns_text_unchanged(self):
24
+ # Should not crash, just return as-is
25
+ result = preprocess("hello 123", "xx")
26
+ assert result == "hello 123"
27
+
28
+ def test_lang_is_case_insensitive(self):
29
+ # "HI" and "Hi" and "hi" should all work
30
+ assert preprocess("5", "HI") == preprocess("5", "hi")
31
+ assert preprocess("5", "Hi") == preprocess("5", "hi")
32
+
33
+ def test_raises_on_non_string_input(self):
34
+ with pytest.raises(TypeError):
35
+ preprocess(12345, "hi")
36
+
37
+ def test_supported_languages_constant(self):
38
+ assert "hi" in SUPPORTED_LANGUAGES
39
+ assert "mr" in SUPPORTED_LANGUAGES
40
+ assert "en" in SUPPORTED_LANGUAGES
41
+
42
+ def test_empty_string(self):
43
+ assert preprocess("", "hi") == ""
44
+ assert preprocess("", "en") == ""
@@ -0,0 +1,60 @@
1
+ """
2
+ Tests for the English preprocessor.
3
+ """
4
+
5
+ import pytest
6
+ from indic_tts_preprocess.languages.english import preprocess, num_to_words
7
+
8
+
9
+ class TestNumToWords:
10
+ def test_zero(self):
11
+ assert num_to_words(0) == "zero"
12
+
13
+ def test_teens(self):
14
+ assert num_to_words(15) == "fifteen"
15
+
16
+ def test_two_digit(self):
17
+ assert num_to_words(42) == "forty two"
18
+
19
+ def test_1900s_year(self):
20
+ assert num_to_words(1997) == "nineteen ninety seven"
21
+
22
+ def test_1900s_oh_year(self):
23
+ # 1905 should be "nineteen oh five"
24
+ assert num_to_words(1905) == "nineteen oh five"
25
+
26
+ def test_1900_exact(self):
27
+ assert num_to_words(1900) == "nineteen hundred"
28
+
29
+ def test_2000(self):
30
+ assert num_to_words(2000) == "two thousand"
31
+
32
+ def test_2000s(self):
33
+ assert num_to_words(2024) == "two thousand twenty four"
34
+
35
+ def test_negative(self):
36
+ assert num_to_words(-10) == "minus ten"
37
+
38
+
39
+ class TestPreprocess:
40
+ def test_written_date(self):
41
+ result = preprocess("15 August 1747")
42
+ assert result == "fifteen August seventeen forty seven"
43
+
44
+ def test_written_date_case_insensitive(self):
45
+ result = preprocess("15 august 1947")
46
+ assert "fifteen" in result
47
+ assert "nineteen forty seven" in result
48
+
49
+ def test_numeric_date(self):
50
+ result = preprocess("15/08/1947")
51
+ assert "fifteen" in result
52
+ assert "August" in result
53
+
54
+ def test_standalone_year(self):
55
+ result = preprocess("The year was 1947")
56
+ assert "nineteen forty seven" in result
57
+
58
+ def test_text_without_numbers_unchanged(self):
59
+ text = "Hello world"
60
+ assert preprocess(text) == text
@@ -0,0 +1,72 @@
1
+ """
2
+ Tests for the Hindi preprocessor.
3
+ Run with: python -m pytest tests/
4
+ """
5
+
6
+ import pytest
7
+ from indic_tts_preprocess.languages.hindi import preprocess, num_to_words
8
+
9
+
10
+ class TestNumToWords:
11
+ def test_zero(self):
12
+ assert num_to_words(0) == "शून्य"
13
+
14
+ def test_single_digit(self):
15
+ assert num_to_words(5) == "पाँच"
16
+
17
+ def test_two_digit(self):
18
+ assert num_to_words(73) == "तिहत्तर"
19
+
20
+ def test_hundred(self):
21
+ assert num_to_words(100) == "एक सौ"
22
+
23
+ def test_1900s_year(self):
24
+ # The special case - should say "उन्नीस सौ" not "एक हज़ार नौ सौ"
25
+ assert num_to_words(1997) == "उन्नीस सौ सत्तानवे"
26
+
27
+ def test_1900_exact(self):
28
+ assert num_to_words(1900) == "उन्नीस सौ"
29
+
30
+ def test_2000s_year(self):
31
+ assert num_to_words(2004) == "दो हज़ार चार"
32
+
33
+ def test_negative(self):
34
+ assert num_to_words(-5) == "माइनस पाँच"
35
+
36
+
37
+ class TestPreprocess:
38
+ def test_named_date(self):
39
+ result = preprocess("5 अगस्त 2004")
40
+ assert result == "पाँच अगस्त दो हज़ार चार"
41
+
42
+ def test_named_date_with_comma(self):
43
+ result = preprocess("5 अगस्त, 2004")
44
+ assert result == "पाँच अगस्त दो हज़ार चार"
45
+
46
+ def test_numeric_date_slash(self):
47
+ result = preprocess("05/08/2004")
48
+ assert result == "पाँच अगस्त दो हज़ार चार"
49
+
50
+ def test_numeric_date_dash(self):
51
+ result = preprocess("05-08-2004")
52
+ assert result == "पाँच अगस्त दो हज़ार चार"
53
+
54
+ def test_standalone_number_in_sentence(self):
55
+ result = preprocess("उसके पास 73 किताबें हैं")
56
+ assert "तिहत्तर" in result
57
+
58
+ def test_1900s_year_in_sentence(self):
59
+ result = preprocess("उनका जन्म 1947 में हुआ")
60
+ assert "उन्नीस सौ सैंतालीस" in result
61
+
62
+ def test_text_without_numbers_unchanged(self):
63
+ text = "यह एक साधारण वाक्य है"
64
+ assert preprocess(text) == text
65
+
66
+ def test_1800s_year_in_sentence(self):
67
+ result = preprocess("उनका जन्म 1803 में हुआ")
68
+ assert "अठारह सौ तीन" in result
69
+
70
+ def test_1700s_year_in_sentence(self):
71
+ result = preprocess("उनका जन्म 1709 में हुआ")
72
+ assert "सत्रह सौ नौ" in result
@@ -0,0 +1,50 @@
1
+ """
2
+ Tests for the Marathi preprocessor.
3
+ """
4
+
5
+ import pytest
6
+ from indic_tts_preprocess.languages.marathi import preprocess, num_to_words
7
+
8
+
9
+ class TestNumToWords:
10
+ def test_zero(self):
11
+ assert num_to_words(0) == "शून्य"
12
+
13
+ def test_single_digit(self):
14
+ assert num_to_words(5) == "पाच"
15
+
16
+ def test_1900s_year(self):
17
+ # Marathi uses "एकोणीस शे" for the 1900s
18
+ assert num_to_words(1997) == "एकोणीस शे सत्त्याण्णव"
19
+
20
+ def test_1900_exact(self):
21
+ assert num_to_words(1900) == "एकोणीस शे"
22
+
23
+ def test_negative(self):
24
+ assert num_to_words(-3) == "वजा तीन"
25
+
26
+
27
+ class TestPreprocess:
28
+ def test_named_date(self):
29
+ result = preprocess("5 ऑगस्ट 2004")
30
+ assert result == "पाच ऑगस्ट दोन हजार चार"
31
+
32
+ def test_numeric_date(self):
33
+ result = preprocess("05/08/2004")
34
+ assert "ऑगस्ट" in result
35
+
36
+ def test_standalone_number(self):
37
+ result = preprocess("त्याच्याकडे 10 पुस्तके आहेत")
38
+ assert "दहा" in result
39
+
40
+ def test_text_without_numbers_unchanged(self):
41
+ text = "हे एक साधे वाक्य आहे"
42
+ assert preprocess(text) == text
43
+
44
+ def test_1800s_year_in_sentence_marathi(self):
45
+ result = preprocess("त्यांचा जन्म 1807 मध्ये झाला")
46
+ assert "अठरा शे सात" in result
47
+
48
+ def test_1700s_year_in_sentence_marathi(self):
49
+ result = preprocess("त्यांचा जन्म 1702 मध्ये झाला")
50
+ assert "सतरा शे दोन" in result