turkificate 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- turkificate-0.1.0/LICENSE +21 -0
- turkificate-0.1.0/PKG-INFO +179 -0
- turkificate-0.1.0/README.md +158 -0
- turkificate-0.1.0/__init__.py +109 -0
- turkificate-0.1.0/base.py +61 -0
- turkificate-0.1.0/data.py +107 -0
- turkificate-0.1.0/normalizers.py +270 -0
- turkificate-0.1.0/numbers.py +178 -0
- turkificate-0.1.0/pipeline.py +119 -0
- turkificate-0.1.0/pyproject.toml +34 -0
- turkificate-0.1.0/setup.cfg +4 -0
- turkificate-0.1.0/test_turkificate.py +85 -0
- turkificate-0.1.0/turkificate.egg-info/PKG-INFO +179 -0
- turkificate-0.1.0/turkificate.egg-info/SOURCES.txt +22 -0
- turkificate-0.1.0/turkificate.egg-info/dependency_links.txt +1 -0
- turkificate-0.1.0/turkificate.egg-info/requires.txt +3 -0
- turkificate-0.1.0/turkificate.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Enes
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: turkificate
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Turkish text normalization library (numbers, dates, times, abbreviations, currency, and more)
|
|
5
|
+
Author: Enes
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/<kullanici-adi>/turkificate
|
|
8
|
+
Project-URL: Repository, https://github.com/<kullanici-adi>/turkificate
|
|
9
|
+
Project-URL: Issues, https://github.com/<kullanici-adi>/turkificate/issues
|
|
10
|
+
Keywords: turkish,nlp,text-normalization,tts,normalization
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Natural Language :: Turkish
|
|
13
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Requires-Python: >=3.9
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# turkificate
|
|
23
|
+
|
|
24
|
+
A Turkish text normalization library. It converts numbers, dates, times,
|
|
25
|
+
abbreviations, currencies, percentages, ordinals and symbols into their written
|
|
26
|
+
Turkish form, following Turkish grammar. Built for TTS pre-processing, search
|
|
27
|
+
indexing and text cleaning.
|
|
28
|
+
|
|
29
|
+
> _"The language is the core of our being."_ — Noam Chomsky
|
|
30
|
+
|
|
31
|
+
No external dependencies — pure standard library.
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install turkificate
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Or from a local checkout:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install -e .
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Quick start
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
import turkificate
|
|
49
|
+
|
|
50
|
+
turkificate.turkificate("Dr. Ahmet 15.03.2024'te %25 indirimle 1.250 TL ödedi.")
|
|
51
|
+
# "doktor Ahmet on beş Mart iki bin yirmi dört'te yüzde yirmi beş
|
|
52
|
+
# indirimle bin iki yüz elli lira ödedi."
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
`turkificate()` is the main function; `normalize()` is kept as an alias for the same call.
|
|
56
|
+
|
|
57
|
+
> The **output** is intentionally Turkish (e.g. `yüz`, `bin`, `Mart`) — that is the
|
|
58
|
+
> whole point of the library. Only the **code, API and docs** are in English.
|
|
59
|
+
|
|
60
|
+
## Selecting concepts
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from turkificate import TurkishNormalizer
|
|
64
|
+
|
|
65
|
+
tn = TurkishNormalizer(features=["numbers", "dates"])
|
|
66
|
+
tn.normalize("Saat 14:30, fiyat 99,90 TL")
|
|
67
|
+
# "Saat 14:30, fiyat doksan dokuz virgül doksan TL" (times & currency untouched)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Normalize everything
|
|
71
|
+
|
|
72
|
+
Pass nothing (the default), or the explicit `"all"` keyword:
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
TurkishNormalizer() # all concepts (default)
|
|
76
|
+
TurkishNormalizer(features="all") # all concepts
|
|
77
|
+
TurkishNormalizer(features=["all"]) # all concepts
|
|
78
|
+
TurkishNormalizer(features=turkificate.ALL) # all concepts
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
List available concepts with `turkificate.available_features()`.
|
|
82
|
+
|
|
83
|
+
| Concept | Description | Example |
|
|
84
|
+
|---|---|---|
|
|
85
|
+
| `numbers` | integer / decimal / signed | `3,5` → üç virgül beş |
|
|
86
|
+
| `dates` | DD.MM.YYYY | `15.03.2024` → on beş Mart iki bin yirmi dört |
|
|
87
|
+
| `times` | HH:MM(:SS) | `14:30` → on dört otuz |
|
|
88
|
+
| `percent` | percent sign | `%50` → yüzde elli |
|
|
89
|
+
| `currency` | currencies | `100 TL` → yüz lira |
|
|
90
|
+
| `ordinals` | ordinal numbers | `5'inci` → beşinci |
|
|
91
|
+
| `units` | units of measure | `42 km` → kırk iki kilometre |
|
|
92
|
+
| `abbreviations` | lexical abbreviations | `Dr.` → doktor |
|
|
93
|
+
| `symbols` | single-character symbols | `&` → ve |
|
|
94
|
+
| `whitespace` | whitespace cleanup | (always the final step) |
|
|
95
|
+
|
|
96
|
+
## Per-concept options
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
tn = TurkishNormalizer(options={
|
|
100
|
+
"times": {"prefix_hour": True}, # 09:05 → "saat dokuz beş"
|
|
101
|
+
"ordinals": {"period_ordinals": True}, # "3. kat" → "üçüncü kat"
|
|
102
|
+
})
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Per-concept helpers
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
turkificate.normalize_numbers("3 elma") # "üç elma"
|
|
109
|
+
turkificate.normalize_dates(...)
|
|
110
|
+
turkificate.normalize_currency(...)
|
|
111
|
+
# normalize_times, normalize_percent, normalize_ordinals,
|
|
112
|
+
# normalize_units, normalize_abbreviations, normalize_symbols
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Direct number engine:
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from turkificate import integer_to_words, integer_to_ordinal, read_number
|
|
119
|
+
integer_to_words(1_000_000) # "bir milyon"
|
|
120
|
+
integer_to_ordinal(4) # "dördüncü"
|
|
121
|
+
read_number("1.234,5") # "bin iki yüz otuz dört virgül beş"
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Adding a new concept
|
|
125
|
+
|
|
126
|
+
Subclass `Normalizer` and register it with `@register`:
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
from turkificate import Normalizer, register
|
|
130
|
+
import re
|
|
131
|
+
|
|
132
|
+
@register
|
|
133
|
+
class EmojiNormalizer(Normalizer):
|
|
134
|
+
name = "emoji"
|
|
135
|
+
|
|
136
|
+
def configure(self, **opts):
|
|
137
|
+
self._re = re.compile(r":\)")
|
|
138
|
+
|
|
139
|
+
def apply(self, text):
|
|
140
|
+
return self._re.sub("gülen yüz", text)
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
It is now usable via `TurkishNormalizer(features=["emoji", ...])` or `"all"`.
|
|
144
|
+
|
|
145
|
+
## Architecture
|
|
146
|
+
|
|
147
|
+
- **Strategy** — each concept is an independent class with a common `Normalizer` interface.
|
|
148
|
+
- **Pipeline (Chain)** — normalizers run in order; number-bearing concepts run before the bare `numbers` concept to avoid double conversion.
|
|
149
|
+
- **Registry + Facade** — concepts are selected by name; `TurkishNormalizer` composes them.
|
|
150
|
+
|
|
151
|
+
Optimization: every regex is compiled once in the constructor; the abbreviation,
|
|
152
|
+
unit and currency dictionaries are compiled into a single alternation regex; the
|
|
153
|
+
number-to-words engine is `lru_cache`-d; and because `apply` is pure, a single
|
|
154
|
+
instance is reused across thousands of calls.
|
|
155
|
+
|
|
156
|
+
## Known limits (roadmap)
|
|
157
|
+
|
|
158
|
+
- The period ordinal form (`3.`) is disabled by default because it clashes with a sentence-final period; enable it with `period_ordinals=True`.
|
|
159
|
+
- The number engine is one-way; the reverse direction (words → digits) is not yet implemented.
|
|
160
|
+
- Context-sensitive suffixes (`5'te` → "beşte") are not handled yet.
|
|
161
|
+
- Roman numerals, phone numbers and fractions (`3/4`) are not supported yet; they are candidates for future concepts.
|
|
162
|
+
|
|
163
|
+
## Development
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
pip install -e ".[dev]"
|
|
167
|
+
pytest
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## Publishing
|
|
171
|
+
|
|
172
|
+
This repo ships a GitHub Actions workflow (`.github/workflows/publish.yml`) that
|
|
173
|
+
publishes to PyPI via Trusted Publishing (no API tokens) when you create a
|
|
174
|
+
GitHub Release. For setup details, see the PyPI documentation on
|
|
175
|
+
trusted publishers.
|
|
176
|
+
|
|
177
|
+
## License
|
|
178
|
+
|
|
179
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# turkificate
|
|
2
|
+
|
|
3
|
+
A Turkish text normalization library. It converts numbers, dates, times,
|
|
4
|
+
abbreviations, currencies, percentages, ordinals and symbols into their written
|
|
5
|
+
Turkish form, following Turkish grammar. Built for TTS pre-processing, search
|
|
6
|
+
indexing and text cleaning.
|
|
7
|
+
|
|
8
|
+
> _"The language is the core of our being."_ — Noam Chomsky
|
|
9
|
+
|
|
10
|
+
No external dependencies — pure standard library.
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install turkificate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
Or from a local checkout:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install -e .
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Quick start
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
import turkificate
|
|
28
|
+
|
|
29
|
+
turkificate.turkificate("Dr. Ahmet 15.03.2024'te %25 indirimle 1.250 TL ödedi.")
|
|
30
|
+
# "doktor Ahmet on beş Mart iki bin yirmi dört'te yüzde yirmi beş
|
|
31
|
+
# indirimle bin iki yüz elli lira ödedi."
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
`turkificate()` is the main function; `normalize()` is kept as an alias for the same call.
|
|
35
|
+
|
|
36
|
+
> The **output** is intentionally Turkish (e.g. `yüz`, `bin`, `Mart`) — that is the
|
|
37
|
+
> whole point of the library. Only the **code, API and docs** are in English.
|
|
38
|
+
|
|
39
|
+
## Selecting concepts
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from turkificate import TurkishNormalizer
|
|
43
|
+
|
|
44
|
+
tn = TurkishNormalizer(features=["numbers", "dates"])
|
|
45
|
+
tn.normalize("Saat 14:30, fiyat 99,90 TL")
|
|
46
|
+
# "Saat 14:30, fiyat doksan dokuz virgül doksan TL" (times & currency untouched)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Normalize everything
|
|
50
|
+
|
|
51
|
+
Pass nothing (the default), or the explicit `"all"` keyword:
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
TurkishNormalizer() # all concepts (default)
|
|
55
|
+
TurkishNormalizer(features="all") # all concepts
|
|
56
|
+
TurkishNormalizer(features=["all"]) # all concepts
|
|
57
|
+
TurkishNormalizer(features=turkificate.ALL) # all concepts
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
List available concepts with `turkificate.available_features()`.
|
|
61
|
+
|
|
62
|
+
| Concept | Description | Example |
|
|
63
|
+
|---|---|---|
|
|
64
|
+
| `numbers` | integer / decimal / signed | `3,5` → üç virgül beş |
|
|
65
|
+
| `dates` | DD.MM.YYYY | `15.03.2024` → on beş Mart iki bin yirmi dört |
|
|
66
|
+
| `times` | HH:MM(:SS) | `14:30` → on dört otuz |
|
|
67
|
+
| `percent` | percent sign | `%50` → yüzde elli |
|
|
68
|
+
| `currency` | currencies | `100 TL` → yüz lira |
|
|
69
|
+
| `ordinals` | ordinal numbers | `5'inci` → beşinci |
|
|
70
|
+
| `units` | units of measure | `42 km` → kırk iki kilometre |
|
|
71
|
+
| `abbreviations` | lexical abbreviations | `Dr.` → doktor |
|
|
72
|
+
| `symbols` | single-character symbols | `&` → ve |
|
|
73
|
+
| `whitespace` | whitespace cleanup | (always the final step) |
|
|
74
|
+
|
|
75
|
+
## Per-concept options
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
tn = TurkishNormalizer(options={
|
|
79
|
+
"times": {"prefix_hour": True}, # 09:05 → "saat dokuz beş"
|
|
80
|
+
"ordinals": {"period_ordinals": True}, # "3. kat" → "üçüncü kat"
|
|
81
|
+
})
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Per-concept helpers
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
turkificate.normalize_numbers("3 elma") # "üç elma"
|
|
88
|
+
turkificate.normalize_dates(...)
|
|
89
|
+
turkificate.normalize_currency(...)
|
|
90
|
+
# normalize_times, normalize_percent, normalize_ordinals,
|
|
91
|
+
# normalize_units, normalize_abbreviations, normalize_symbols
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Direct number engine:
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
from turkificate import integer_to_words, integer_to_ordinal, read_number
|
|
98
|
+
integer_to_words(1_000_000) # "bir milyon"
|
|
99
|
+
integer_to_ordinal(4) # "dördüncü"
|
|
100
|
+
read_number("1.234,5") # "bin iki yüz otuz dört virgül beş"
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Adding a new concept
|
|
104
|
+
|
|
105
|
+
Subclass `Normalizer` and register it with `@register`:
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from turkificate import Normalizer, register
|
|
109
|
+
import re
|
|
110
|
+
|
|
111
|
+
@register
|
|
112
|
+
class EmojiNormalizer(Normalizer):
|
|
113
|
+
name = "emoji"
|
|
114
|
+
|
|
115
|
+
def configure(self, **opts):
|
|
116
|
+
self._re = re.compile(r":\)")
|
|
117
|
+
|
|
118
|
+
def apply(self, text):
|
|
119
|
+
return self._re.sub("gülen yüz", text)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
It is now usable via `TurkishNormalizer(features=["emoji", ...])` or `"all"`.
|
|
123
|
+
|
|
124
|
+
## Architecture
|
|
125
|
+
|
|
126
|
+
- **Strategy** — each concept is an independent class with a common `Normalizer` interface.
|
|
127
|
+
- **Pipeline (Chain)** — normalizers run in order; number-bearing concepts run before the bare `numbers` concept to avoid double conversion.
|
|
128
|
+
- **Registry + Facade** — concepts are selected by name; `TurkishNormalizer` composes them.
|
|
129
|
+
|
|
130
|
+
Optimization: every regex is compiled once in the constructor; the abbreviation,
|
|
131
|
+
unit and currency dictionaries are compiled into a single alternation regex; the
|
|
132
|
+
number-to-words engine is `lru_cache`-d; and because `apply` is pure, a single
|
|
133
|
+
instance is reused across thousands of calls.
|
|
134
|
+
|
|
135
|
+
## Known limits (roadmap)
|
|
136
|
+
|
|
137
|
+
- The period ordinal form (`3.`) is disabled by default because it clashes with a sentence-final period; enable it with `period_ordinals=True`.
|
|
138
|
+
- The number engine is one-way; the reverse direction (words → digits) is not yet implemented.
|
|
139
|
+
- Context-sensitive suffixes (`5'te` → "beşte") are not handled yet.
|
|
140
|
+
- Roman numerals, phone numbers and fractions (`3/4`) are not supported yet; they are candidates for future concepts.
|
|
141
|
+
|
|
142
|
+
## Development
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
pip install -e ".[dev]"
|
|
146
|
+
pytest
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Publishing
|
|
150
|
+
|
|
151
|
+
This repo ships a GitHub Actions workflow (`.github/workflows/publish.yml`) that
|
|
152
|
+
publishes to PyPI via Trusted Publishing (no API tokens) when you create a
|
|
153
|
+
GitHub Release. For setup details, see the PyPI documentation on
|
|
154
|
+
trusted publishers.
|
|
155
|
+
|
|
156
|
+
## License
|
|
157
|
+
|
|
158
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""turkificate — a Turkish text normalization library.
|
|
2
|
+
|
|
3
|
+
Quick start
|
|
4
|
+
-----------
|
|
5
|
+
>>> import turkificate
|
|
6
|
+
>>> turkificate.turkificate("Dr. Ahmet 15.03.2024'te %25 indirimle 1.250 TL ödedi.")
|
|
7
|
+
"doktor Ahmet on beş Mart iki bin yirmi dört'te yüzde yirmi beş indirimle bin iki yüz elli lira ödedi."
|
|
8
|
+
|
|
9
|
+
``turkificate()`` is the main function; ``normalize()`` is a kept alias.
|
|
10
|
+
|
|
11
|
+
Selecting concepts
|
|
12
|
+
-------------------
|
|
13
|
+
>>> tn = turkificate.TurkishNormalizer(features=["numbers", "dates"])
|
|
14
|
+
>>> tn.normalize("Saat 14:30, fiyat 99,90 TL") # times and currency untouched
|
|
15
|
+
'Saat 14:30, fiyat doksan dokuz virgül doksan TL'
|
|
16
|
+
|
|
17
|
+
Normalize everything (default, or explicit "all")
|
|
18
|
+
-------------------------------------------------
|
|
19
|
+
>>> turkificate.TurkishNormalizer(features="all").normalize("%50")
|
|
20
|
+
'yüzde elli'
|
|
21
|
+
|
|
22
|
+
Per-concept helpers
|
|
23
|
+
-------------------
|
|
24
|
+
>>> turkificate.normalize_numbers("3 kişi geldi")
|
|
25
|
+
'üç kişi geldi'
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from .base import Normalizer, available_features, get_registry, register
|
|
29
|
+
from .numbers import (
|
|
30
|
+
integer_to_ordinal,
|
|
31
|
+
integer_to_words,
|
|
32
|
+
number_to_words,
|
|
33
|
+
read_number,
|
|
34
|
+
)
|
|
35
|
+
from .pipeline import ALL, DEFAULT_ORDER, Pipeline, TurkishNormalizer
|
|
36
|
+
|
|
37
|
+
# Package version string exposed to callers as ``turkificate.__version__``.
__version__ = "0.1.0"

# Public API of the package. Every name listed here is either imported at the
# top of this module or bound further down in it.
__all__ = [
    # Main interface
    "TurkishNormalizer",
    "Pipeline",
    "turkificate",
    "normalize",
    "DEFAULT_ORDER",
    "ALL",
    # Number engine
    "integer_to_words",
    "integer_to_ordinal",
    "read_number",
    "number_to_words",
    # Extension
    "Normalizer",
    "register",
    "get_registry",
    "available_features",
    # Per-concept helpers
    "normalize_emails",
    "normalize_urls",
    "normalize_numbers",
    "normalize_dates",
    "normalize_times",
    "normalize_percent",
    "normalize_currency",
    "normalize_ordinals",
    "normalize_units",
    "normalize_abbreviations",
    "normalize_symbols",
]

# Default, reusable instance covering every concept.
# It is constructed once, when the package is imported, and is the instance
# that turkificate() delegates to.
_default = TurkishNormalizer()
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def turkificate(text: str) -> str:
    """Normalize ``text`` with the shared default normalizer and return the result.

    This is the package's main entry point; it delegates to the module-level
    ``_default`` ``TurkishNormalizer`` instance.
    """
    normalized = _default.normalize(text)
    return normalized


# ``normalize`` is the same function object under a more conventional name.
normalize = turkificate
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _make_single(feature):
|
|
85
|
+
"""Build a cached helper that runs a single concept."""
|
|
86
|
+
holder = {}
|
|
87
|
+
|
|
88
|
+
def fn(text: str) -> str:
|
|
89
|
+
tn = holder.get("tn")
|
|
90
|
+
if tn is None:
|
|
91
|
+
tn = holder["tn"] = TurkishNormalizer(features=[feature])
|
|
92
|
+
return tn.normalize(text)
|
|
93
|
+
|
|
94
|
+
fn.__name__ = f"normalize_{feature}"
|
|
95
|
+
fn.__doc__ = f"Normalize only the '{feature}' concept."
|
|
96
|
+
return fn
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# One public helper per concept. Each helper is produced by _make_single(),
# which builds the underlying single-concept TurkishNormalizer on first use.
# The bindings are written out explicitly so that every name listed in
# __all__ is visible as a plain module-level assignment.
normalize_emails = _make_single("emails")
normalize_urls = _make_single("urls")
normalize_numbers = _make_single("numbers")
normalize_dates = _make_single("dates")
normalize_times = _make_single("times")
normalize_percent = _make_single("percent")
normalize_currency = _make_single("currency")
normalize_ordinals = _make_single("ordinals")
normalize_units = _make_single("units")
normalize_abbreviations = _make_single("abbreviations")
normalize_symbols = _make_single("symbols")
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Core abstractions: the Normalizer interface (Strategy) and the Registry.
|
|
2
|
+
|
|
3
|
+
Every normalization concept is an independent subclass of ``Normalizer``. Classes
|
|
4
|
+
register themselves into a central registry via ``@register``, so that
|
|
5
|
+
``TurkishNormalizer`` can select and order concepts by name only
|
|
6
|
+
(e.g. "numbers", "dates").
|
|
7
|
+
|
|
8
|
+
Design principle: each normalizer does all of its setup (regex compilation, etc.)
|
|
9
|
+
in its constructor, while ``apply`` stays completely pure and side-effect free.
|
|
10
|
+
That lets a single instance be reused across thousands of texts.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from abc import ABC, abstractmethod
|
|
14
|
+
|
|
15
|
+
__all__ = ["Normalizer", "register", "get_registry", "available_features"]
|
|
16
|
+
|
|
17
|
+
_REGISTRY: dict[str, type["Normalizer"]] = {}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Normalizer(ABC):
    """Common interface implemented by every normalization concept (Strategy).

    Construction stores the keyword options and immediately passes them to
    ``configure``. ``apply`` performs the text transformation, and calling an
    instance is shorthand for calling ``apply``.
    """

    # Registry key of the concept; also the name used when selecting features.
    name: str = ""

    def __init__(self, **options):
        # Keep the raw options, then let the subclass prepare itself from them.
        self.options = options
        self.configure(**options)

    def configure(self, **options) -> None:
        """Hook for one-time setup such as regex compilation; does nothing by default."""
        return None

    @abstractmethod
    def apply(self, text: str) -> str:
        """Return the transformed ``text``. Implementations must be side-effect free."""

    def __call__(self, text: str) -> str:
        # Instances are callable: normalizer(text) == normalizer.apply(text).
        result = self.apply(text)
        return result

    def __repr__(self) -> str:
        label = self.name
        return f"<Normalizer {label!r}>"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def register(cls: type[Normalizer]) -> type[Normalizer]:
    """Class decorator that adds a Normalizer subclass to the registry.

    The class is stored under its ``name`` attribute and returned unchanged,
    so the decorator can be applied directly above the class definition.

    Raises:
        ValueError: If the class has no non-empty ``name``, or if that name
            is already present in the registry.
    """
    key = getattr(cls, "name", "")
    if not key:
        raise ValueError(f"{cls.__name__} must define a 'name'.")
    if key in _REGISTRY:
        raise ValueError(f"The name '{cls.name}' is already registered.")

    _REGISTRY[key] = cls
    return cls
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def get_registry() -> dict[str, type[Normalizer]]:
    """Return a shallow copy of the name-to-class registry.

    Because a copy is returned, mutating the result does not change the
    registry itself.
    """
    snapshot = _REGISTRY.copy()
    return snapshot
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def available_features() -> list[str]:
    """Return the names of all registered concepts, sorted."""
    names = list(_REGISTRY)
    names.sort()
    return names
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Lexical data: month names, abbreviations, units, symbols, currencies.
|
|
2
|
+
|
|
3
|
+
Keeping this data in one place lets the normalizers build their regexes from it
|
|
4
|
+
and makes extending the library (adding a new abbreviation, etc.) a one-line change.
|
|
5
|
+
|
|
6
|
+
The values are intentionally Turkish — they are the spoken/written forms the
|
|
7
|
+
library emits.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
# 1..12 -> month name
|
|
11
|
+
# Month names in calendar order; enumerate() assigns the keys 1 through 12.
MONTHS = dict(
    enumerate(
        (
            "Ocak", "Şubat", "Mart", "Nisan",
            "Mayıs", "Haziran", "Temmuz", "Ağustos",
            "Eylül", "Ekim", "Kasım", "Aralık",
        ),
        start=1,
    )
)
|
|
16
|
+
|
|
17
|
+
# Lexical abbreviations (matched case-insensitively). Keys are lowercase.
|
|
18
|
+
# Each key is written with its trailing period; each value is the expansion.
ABBREVIATIONS = {
    # Titles and forms of address
    "dr.": "doktor",
    "prof.": "profesör",
    "doç.": "doçent",
    "yrd.": "yardımcı",
    "av.": "avukat",
    "sn.": "sayın",
    # General text abbreviations
    "vs.": "vesaire",
    "vb.": "ve benzeri",
    "vd.": "ve diğerleri",
    "bkz.": "bakınız",
    "örn.": "örneğin",
    "yy.": "yüzyıl",
    "no.": "numara",
    "tel.": "telefon",
    # Address components
    "cad.": "cadde",
    "sok.": "sokak",
    "apt.": "apartman",
    "mah.": "mahalle",
}
|
|
38
|
+
|
|
39
|
+
# Units of measure (expanded only when they follow a number).
|
|
40
|
+
# Longer keys must appear before shorter keys that are a prefix of them (e.g. "km²" before "km", "kWh" before "kW").
|
|
41
|
+
# Unit token -> spoken form. Entry order is significant and must be preserved:
# compound and multi-character units come first, single-letter units last.
UNITS = {
    # Temperature
    "°C": "derece",
    "°F": "fahrenheit",
    # Area and volume (superscript forms)
    "km²": "kilometre kare",
    "m²": "metre kare",
    "m³": "metre küp",
    # Speed
    "km/h": "kilometre bölü saat",
    "km/sa": "kilometre bölü saat",
    "m/s": "metre bölü saniye",
    # Energy, frequency and power
    "kWh": "kilowatt saat",
    "GHz": "gigahertz",
    "MHz": "megahertz",
    "kHz": "kilohertz",
    "Hz": "hertz",
    "kW": "kilowatt",
    "MW": "megawatt",
    # Length, mass and volume (two-letter forms)
    "km": "kilometre",
    "cm": "santimetre",
    "mm": "milimetre",
    "kg": "kilogram",
    "mg": "miligram",
    "gr": "gram",
    "ml": "mililitre",
    "lt": "litre",
    # Data sizes
    # NOTE(review): these keys are lowercase while the keys above are
    # mixed-case; whether matching is case-sensitive is decided by the units
    # normalizer, which is not visible here.
    "kb": "kilobayt",
    "mb": "megabayt",
    "gb": "gigabayt",
    "tb": "terabayt",
    # Time
    "sa": "saat",
    "dk": "dakika",
    # "sn" here is "saniye"; ABBREVIATIONS maps "sn." (with a period) to "sayın".
    "sn": "saniye",
    # Single-letter units
    "m": "metre",
    "g": "gram",
    "l": "litre",
}
|
|
76
|
+
|
|
77
|
+
# Currency symbols and codes -> spoken form.
|
|
78
|
+
# Built from (spoken form, tokens) groups; the resulting key order is
# ₺, tl, try, $, usd, €, eur, £, gbp, ₽.
CURRENCY = {
    token: spoken
    for spoken, tokens in (
        ("lira", ("₺", "tl", "try")),
        ("dolar", ("$", "usd")),
        ("avro", ("€", "eur")),
        ("sterlin", ("£", "gbp")),
        ("ruble", ("₽",)),
    )
    for token in tokens
}
|
|
90
|
+
|
|
91
|
+
# Single-character symbols -> spoken form.
|
|
92
|
+
# Each key is exactly one character; each value is the word emitted for it.
SYMBOLS = {
    "&": "ve",
    "@": "et",
    # "°" is also the first character of the "°C" / "°F" keys in UNITS.
    "°": "derece",
    # Arithmetic operators
    "×": "çarpı",
    "÷": "bölü",
    "+": "artı",
    "=": "eşittir",
    # NOTE(review): "₊" is U+208A SUBSCRIPT PLUS SIGN and maps to the same
    # word as "+"; confirm this key is intended (rather than e.g. "±").
    "₊": "artı",
    # Comparison operators
    "≠": "eşit değil",
    "≤": "küçük eşit",
    "≥": "büyük eşit",
    # Other mathematical symbols
    "√": "karekök",
    "∞": "sonsuz",
    "π": "pi",
}
|