ru-normalizr 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ru_normalizr-0.1.0/LICENSE +21 -0
- ru_normalizr-0.1.0/MANIFEST.in +9 -0
- ru_normalizr-0.1.0/PKG-INFO +228 -0
- ru_normalizr-0.1.0/README.md +199 -0
- ru_normalizr-0.1.0/__init__.py +7 -0
- ru_normalizr-0.1.0/__main__.py +86 -0
- ru_normalizr-0.1.0/_morph.py +10 -0
- ru_normalizr-0.1.0/abbreviation_rules.py +101 -0
- ru_normalizr-0.1.0/abbreviations.py +168 -0
- ru_normalizr-0.1.0/caps.py +167 -0
- ru_normalizr-0.1.0/constants.py +264 -0
- ru_normalizr-0.1.0/dates_time.py +214 -0
- ru_normalizr-0.1.0/dictionaries/latinization/latinization_rules.dic +404 -0
- ru_normalizr-0.1.0/dictionaries/your_dictionary.dic +14 -0
- ru_normalizr-0.1.0/dictionary.py +341 -0
- ru_normalizr-0.1.0/latinization.py +216 -0
- ru_normalizr-0.1.0/normalizer.py +296 -0
- ru_normalizr-0.1.0/numbering.py +87 -0
- ru_normalizr-0.1.0/numerals/__init__.py +50 -0
- ru_normalizr-0.1.0/numerals/_constants.py +727 -0
- ru_normalizr-0.1.0/numerals/_helpers.py +446 -0
- ru_normalizr-0.1.0/numerals/cardinals.py +331 -0
- ru_normalizr-0.1.0/numerals/decimals.py +110 -0
- ru_normalizr-0.1.0/numerals/fractions.py +71 -0
- ru_normalizr-0.1.0/numerals/ordinals.py +186 -0
- ru_normalizr-0.1.0/numerals/symbols.py +29 -0
- ru_normalizr-0.1.0/options.py +144 -0
- ru_normalizr-0.1.0/pipeline.py +30 -0
- ru_normalizr-0.1.0/preprocess_utils.py +268 -0
- ru_normalizr-0.1.0/py.typed +1 -0
- ru_normalizr-0.1.0/pyproject.toml +70 -0
- ru_normalizr-0.1.0/roman_numerals.py +208 -0
- ru_normalizr-0.1.0/ru_normalizr.egg-info/PKG-INFO +228 -0
- ru_normalizr-0.1.0/ru_normalizr.egg-info/SOURCES.txt +65 -0
- ru_normalizr-0.1.0/ru_normalizr.egg-info/dependency_links.txt +1 -0
- ru_normalizr-0.1.0/ru_normalizr.egg-info/entry_points.txt +2 -0
- ru_normalizr-0.1.0/ru_normalizr.egg-info/requires.txt +5 -0
- ru_normalizr-0.1.0/ru_normalizr.egg-info/top_level.txt +1 -0
- ru_normalizr-0.1.0/setup.cfg +4 -0
- ru_normalizr-0.1.0/years.py +342 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 NickZaitsev
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ru-normalizr
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Normalization-only Russian text preprocessing.
|
|
5
|
+
Author: NickZaitsev
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/NickZaitsev/ru-normalizr
|
|
8
|
+
Project-URL: Repository, https://github.com/NickZaitsev/ru-normalizr
|
|
9
|
+
Project-URL: Issues, https://github.com/NickZaitsev/ru-normalizr/issues
|
|
10
|
+
Project-URL: Changelog, https://github.com/NickZaitsev/ru-normalizr/blob/main/CHANGELOG.md
|
|
11
|
+
Keywords: russian,normalization,nlp,text-processing,tts
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Natural Language :: Russian
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: eng_to_ipa==0.0.2
|
|
24
|
+
Requires-Dist: num2words>=0.5.13
|
|
25
|
+
Requires-Dist: pymorphy3>=2.0
|
|
26
|
+
Requires-Dist: pymorphy3-dicts-ru>=2.0
|
|
27
|
+
Requires-Dist: roman>=4.0
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# ru-normalizr
|
|
31
|
+
|
|
32
|
+
Normalization-only Russian text preprocessing extracted into a standalone package.
|
|
33
|
+
|
|
34
|
+
`ru-normalizr` focuses on deterministic Russian text normalization:
|
|
35
|
+
- years, dates, time, decimals, fractions, ordinals, and cardinal numerals
|
|
36
|
+
- abbreviations, initials, Roman numerals, cleanup rules, and glued OCR-like text
|
|
37
|
+
- Latin transliteration via dictionary rules or `eng_to_ipa`
|
|
38
|
+
|
|
39
|
+
Out of scope by design:
|
|
40
|
+
- accentization and stress dictionaries
|
|
41
|
+
- pronunciation and post-phoneme fixes
|
|
42
|
+
- TTS pause hacks and chunking
|
|
43
|
+
- audio, model, or engine integration
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install ru-normalizr
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
`eng_to_ipa` is installed by default, so the IPA backend is available out of the box.
|
|
52
|
+
|
|
53
|
+
## API
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from ru_normalizr import NormalizeOptions, Normalizer, normalize, preprocess_text
|
|
57
|
+
|
|
58
|
+
text = normalize("Глава IV. Встреча в 10:07.")
|
|
59
|
+
prepared = preprocess_text("10кг")
|
|
60
|
+
|
|
61
|
+
tts_normalizer = Normalizer(NormalizeOptions.tts())
|
|
62
|
+
batch = tts_normalizer.normalize_batch(["Глава IV.", "В 1980-е годы было 25 млн."])
|
|
63
|
+
roman_only = tts_normalizer.run_stage("roman", "Глава IV")
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Example outputs
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from ru_normalizr import normalize
|
|
70
|
+
|
|
71
|
+
print(normalize("Глава IV. Встреча в 10:07."))
|
|
72
|
+
# Глава четыре. Встреча в десять, ноль семь.
|
|
73
|
+
|
|
74
|
+
print(normalize("В 1980-е годы было 25 млн. $"))
|
|
75
|
+
# В тысяча девятьсот восьмидесятые годы было двадцать пять миллионов долларов
|
|
76
|
+
|
|
77
|
+
print(normalize("Добавьте 1/4 стакана воды."))
|
|
78
|
+
# Добавьте одну четвертую стакана воды.
|
|
79
|
+
|
|
80
|
+
print(normalize("И. О. Фамилия приехал."))
|
|
81
|
+
# и о фамилия приехал.
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Modes
|
|
85
|
+
|
|
86
|
+
`NormalizeOptions()` uses the conservative `safe` preset by default.
|
|
87
|
+
Use `NormalizeOptions.tts()` when you want the more aggressive TTS-oriented behavior.
|
|
88
|
+
|
|
89
|
+
`safe` is intended for general text where it is more important not to over-normalize:
|
|
90
|
+
- keeps all-caps headings as-is
|
|
91
|
+
- keeps initials like `И. О. Фамилия`
|
|
92
|
+
- keeps letter abbreviations like `ГИБДД`
|
|
93
|
+
- keeps bracketed numeric links like `(1)` and `[1]`
|
|
94
|
+
- keeps Latin transliteration disabled
|
|
95
|
+
|
|
96
|
+
`tts` is intended for speech-oriented pipelines:
|
|
97
|
+
- enables caps normalization and first-word decap
|
|
98
|
+
- removes confident bracketed numeric links
|
|
99
|
+
- expands initials and letter-by-letter abbreviations
|
|
100
|
+
- enables Latin transliteration
|
|
101
|
+
- keeps IPA stress markers disabled by default unless explicitly requested
|
|
102
|
+
|
|
103
|
+
### Configuring options
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from ru_normalizr import NormalizeOptions, normalize
|
|
107
|
+
|
|
108
|
+
options = NormalizeOptions.tts(
|
|
109
|
+
latinization_backend="ipa",
|
|
110
|
+
enable_latinization_stress_marks=False,
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
print(normalize("YouTube в 2024 г.", options))
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
You can also start from the conservative preset and override individual flags:
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
from ru_normalizr import NormalizeOptions, normalize
|
|
120
|
+
|
|
121
|
+
options = NormalizeOptions.safe(
|
|
122
|
+
enable_letter_abbreviation_expansion=True,
|
|
123
|
+
enable_latinization=True,
|
|
124
|
+
latinization_backend="dictionary",
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
print(normalize("USB drive", options))
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Granular abbreviation controls:
|
|
131
|
+
- `enable_contextual_abbreviation_expansion` for contextual abbreviations such as `т. д.`, `т. п.`, `млн.`, `тыс.`
|
|
132
|
+
- `enable_initials_expansion` for patterns such as `И. О. Фамилия`
|
|
133
|
+
- `enable_letter_abbreviation_expansion` for letter-by-letter expansions such as `ГИБДД`, `ООН`, `USB`
|
|
134
|
+
|
|
135
|
+
Latinization controls:
|
|
136
|
+
- `enable_latinization`
|
|
137
|
+
- `latinization_backend="ipa" | "dictionary"`
|
|
138
|
+
- `enable_latinization_stress_marks`
|
|
139
|
+
|
|
140
|
+
When `latinization_backend="ipa"`, stress markers are omitted by default.
|
|
141
|
+
Enable `enable_latinization_stress_marks=True` if you want `+` markers in the output.
|
|
142
|
+
|
|
143
|
+
### Example dictionaries
|
|
144
|
+
|
|
145
|
+
Runtime dictionary assets shipped in the package live under `ru_normalizr/dictionaries/`.
|
|
146
|
+
Latinization rules live under `ru_normalizr/dictionaries/latinization/`.
|
|
147
|
+
The source tree also includes `dictionaries/your_dictionary.dic` as an editable example dictionary for local customization.
|
|
148
|
+
That example file is included in source distributions and excluded from published wheels.
|
|
149
|
+
|
|
150
|
+
### Batch usage
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
from ru_normalizr import NormalizeOptions, Normalizer
|
|
154
|
+
|
|
155
|
+
normalizer = Normalizer(NormalizeOptions.tts())
|
|
156
|
+
texts = ["Глава IV.", "12.03.2025", "Цена 1.5 кг сахара."]
|
|
157
|
+
print(normalizer.normalize_batch(texts))
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Available stage names for expert use:
|
|
161
|
+
- `preprocess`
|
|
162
|
+
- `roman`
|
|
163
|
+
- `years`
|
|
164
|
+
- `dates_time`
|
|
165
|
+
- `numerals`
|
|
166
|
+
- `abbreviations`
|
|
167
|
+
- `dictionary`
|
|
168
|
+
- `latinization`
|
|
169
|
+
- `finalize`
|
|
170
|
+
|
|
171
|
+
Stage order is fixed in the main pipeline. Stage-level calls are for debugging, testing, and focused use, not for arbitrary reordering.
|
|
172
|
+
|
|
173
|
+
## CLI
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
python -m ru_normalizr "Глава IV. Встреча в 10:07."
|
|
177
|
+
echo "В 1980-е годы было 25 млн." | python -m ru_normalizr
|
|
178
|
+
ru-normalizr --mode safe "ГИБДД"
|
|
179
|
+
ru-normalizr --mode tts --file ./sample.txt
|
|
180
|
+
ru-normalizr --mode tts --file ./sample.txt --output ./sample.normalized.txt
|
|
181
|
+
ru-normalizr --mode tts --latinization-backend ipa --with-latin-stress "YouTube в 2024 г."
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
Useful CLI flags:
|
|
185
|
+
- `--mode safe|tts`
|
|
186
|
+
- `--latinization-backend ipa|dictionary`
|
|
187
|
+
- `--with-latin-stress`
|
|
188
|
+
- `--no-latinization`
|
|
189
|
+
- `--no-first-word-decap`
|
|
190
|
+
- `--keep-links`
|
|
191
|
+
|
|
192
|
+
## Development
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
py -3.12 -m pip install -r ./ru_normalizr/requirements-dev.txt
|
|
196
|
+
py -3.12 ./ru_normalizr/scripts/dev.py test
|
|
197
|
+
py -3.12 ./ru_normalizr/scripts/dev.py lint
|
|
198
|
+
py -3.12 ./ru_normalizr/scripts/dev.py build
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## Release Notes
|
|
202
|
+
|
|
203
|
+
- Changelog: `CHANGELOG.md`
|
|
204
|
+
- Versioning policy: `VERSIONING.md`
|
|
205
|
+
- Publish checklist: `PYPI_RELEASE_CHECKLIST.md`
|
|
206
|
+
|
|
207
|
+
## Packaging
|
|
208
|
+
|
|
209
|
+
The package is self-contained inside `ru_normalizr/` and builds as a standalone wheel from that directory:
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
python -m pip wheel --no-deps ./ru_normalizr
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
For repeatable local workflows, use the helper script:
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
py -3.12 ./ru_normalizr/scripts/dev.py clean
|
|
219
|
+
py -3.12 ./ru_normalizr/scripts/dev.py test
|
|
220
|
+
py -3.12 ./ru_normalizr/scripts/dev.py lint
|
|
221
|
+
py -3.12 ./ru_normalizr/scripts/dev.py build
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
The supported public Python imports are from the package root, for example:
|
|
225
|
+
|
|
226
|
+
```python
|
|
227
|
+
from ru_normalizr import NormalizeOptions, Normalizer, normalize, preprocess_text
|
|
228
|
+
```
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
# ru-normalizr
|
|
2
|
+
|
|
3
|
+
Normalization-only Russian text preprocessing extracted into a standalone package.
|
|
4
|
+
|
|
5
|
+
`ru-normalizr` focuses on deterministic Russian text normalization:
|
|
6
|
+
- years, dates, time, decimals, fractions, ordinals, and cardinal numerals
|
|
7
|
+
- abbreviations, initials, Roman numerals, cleanup rules, and glued OCR-like text
|
|
8
|
+
- Latin transliteration via dictionary rules or `eng_to_ipa`
|
|
9
|
+
|
|
10
|
+
Out of scope by design:
|
|
11
|
+
- accentization and stress dictionaries
|
|
12
|
+
- pronunciation and post-phoneme fixes
|
|
13
|
+
- TTS pause hacks and chunking
|
|
14
|
+
- audio, model, or engine integration
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install ru-normalizr
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
`eng_to_ipa` is installed by default, so the IPA backend is available out of the box.
|
|
23
|
+
|
|
24
|
+
## API
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from ru_normalizr import NormalizeOptions, Normalizer, normalize, preprocess_text
|
|
28
|
+
|
|
29
|
+
text = normalize("Глава IV. Встреча в 10:07.")
|
|
30
|
+
prepared = preprocess_text("10кг")
|
|
31
|
+
|
|
32
|
+
tts_normalizer = Normalizer(NormalizeOptions.tts())
|
|
33
|
+
batch = tts_normalizer.normalize_batch(["Глава IV.", "В 1980-е годы было 25 млн."])
|
|
34
|
+
roman_only = tts_normalizer.run_stage("roman", "Глава IV")
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### Example outputs
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from ru_normalizr import normalize
|
|
41
|
+
|
|
42
|
+
print(normalize("Глава IV. Встреча в 10:07."))
|
|
43
|
+
# Глава четыре. Встреча в десять, ноль семь.
|
|
44
|
+
|
|
45
|
+
print(normalize("В 1980-е годы было 25 млн. $"))
|
|
46
|
+
# В тысяча девятьсот восьмидесятые годы было двадцать пять миллионов долларов
|
|
47
|
+
|
|
48
|
+
print(normalize("Добавьте 1/4 стакана воды."))
|
|
49
|
+
# Добавьте одну четвертую стакана воды.
|
|
50
|
+
|
|
51
|
+
print(normalize("И. О. Фамилия приехал."))
|
|
52
|
+
# и о фамилия приехал.
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Modes
|
|
56
|
+
|
|
57
|
+
`NormalizeOptions()` uses the conservative `safe` preset by default.
|
|
58
|
+
Use `NormalizeOptions.tts()` when you want the more aggressive TTS-oriented behavior.
|
|
59
|
+
|
|
60
|
+
`safe` is intended for general text where it is more important not to over-normalize:
|
|
61
|
+
- keeps all-caps headings as-is
|
|
62
|
+
- keeps initials like `И. О. Фамилия`
|
|
63
|
+
- keeps letter abbreviations like `ГИБДД`
|
|
64
|
+
- keeps bracketed numeric links like `(1)` and `[1]`
|
|
65
|
+
- keeps Latin transliteration disabled
|
|
66
|
+
|
|
67
|
+
`tts` is intended for speech-oriented pipelines:
|
|
68
|
+
- enables caps normalization and first-word decap
|
|
69
|
+
- removes confident bracketed numeric links
|
|
70
|
+
- expands initials and letter-by-letter abbreviations
|
|
71
|
+
- enables Latin transliteration
|
|
72
|
+
- keeps IPA stress markers disabled by default unless explicitly requested
|
|
73
|
+
|
|
74
|
+
### Configuring options
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from ru_normalizr import NormalizeOptions, normalize
|
|
78
|
+
|
|
79
|
+
options = NormalizeOptions.tts(
|
|
80
|
+
latinization_backend="ipa",
|
|
81
|
+
enable_latinization_stress_marks=False,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
print(normalize("YouTube в 2024 г.", options))
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
You can also start from the conservative preset and override individual flags:
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from ru_normalizr import NormalizeOptions, normalize
|
|
91
|
+
|
|
92
|
+
options = NormalizeOptions.safe(
|
|
93
|
+
enable_letter_abbreviation_expansion=True,
|
|
94
|
+
enable_latinization=True,
|
|
95
|
+
latinization_backend="dictionary",
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
print(normalize("USB drive", options))
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
Granular abbreviation controls:
|
|
102
|
+
- `enable_contextual_abbreviation_expansion` for contextual abbreviations such as `т. д.`, `т. п.`, `млн.`, `тыс.`
|
|
103
|
+
- `enable_initials_expansion` for patterns such as `И. О. Фамилия`
|
|
104
|
+
- `enable_letter_abbreviation_expansion` for letter-by-letter expansions such as `ГИБДД`, `ООН`, `USB`
|
|
105
|
+
|
|
106
|
+
Latinization controls:
|
|
107
|
+
- `enable_latinization`
|
|
108
|
+
- `latinization_backend="ipa" | "dictionary"`
|
|
109
|
+
- `enable_latinization_stress_marks`
|
|
110
|
+
|
|
111
|
+
When `latinization_backend="ipa"`, stress markers are omitted by default.
|
|
112
|
+
Enable `enable_latinization_stress_marks=True` if you want `+` markers in the output.
|
|
113
|
+
|
|
114
|
+
### Example dictionaries
|
|
115
|
+
|
|
116
|
+
Runtime dictionary assets shipped in the package live under `ru_normalizr/dictionaries/`.
|
|
117
|
+
Latinization rules live under `ru_normalizr/dictionaries/latinization/`.
|
|
118
|
+
The source tree also includes `dictionaries/your_dictionary.dic` as an editable example dictionary for local customization.
|
|
119
|
+
That example file is included in source distributions and excluded from published wheels.
|
|
120
|
+
|
|
121
|
+
### Batch usage
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from ru_normalizr import NormalizeOptions, Normalizer
|
|
125
|
+
|
|
126
|
+
normalizer = Normalizer(NormalizeOptions.tts())
|
|
127
|
+
texts = ["Глава IV.", "12.03.2025", "Цена 1.5 кг сахара."]
|
|
128
|
+
print(normalizer.normalize_batch(texts))
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Available stage names for expert use:
|
|
132
|
+
- `preprocess`
|
|
133
|
+
- `roman`
|
|
134
|
+
- `years`
|
|
135
|
+
- `dates_time`
|
|
136
|
+
- `numerals`
|
|
137
|
+
- `abbreviations`
|
|
138
|
+
- `dictionary`
|
|
139
|
+
- `latinization`
|
|
140
|
+
- `finalize`
|
|
141
|
+
|
|
142
|
+
Stage order is fixed in the main pipeline. Stage-level calls are for debugging, testing, and focused use, not for arbitrary reordering.
|
|
143
|
+
|
|
144
|
+
## CLI
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
python -m ru_normalizr "Глава IV. Встреча в 10:07."
|
|
148
|
+
echo "В 1980-е годы было 25 млн." | python -m ru_normalizr
|
|
149
|
+
ru-normalizr --mode safe "ГИБДД"
|
|
150
|
+
ru-normalizr --mode tts --file ./sample.txt
|
|
151
|
+
ru-normalizr --mode tts --file ./sample.txt --output ./sample.normalized.txt
|
|
152
|
+
ru-normalizr --mode tts --latinization-backend ipa --with-latin-stress "YouTube в 2024 г."
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
Useful CLI flags:
|
|
156
|
+
- `--mode safe|tts`
|
|
157
|
+
- `--latinization-backend ipa|dictionary`
|
|
158
|
+
- `--with-latin-stress`
|
|
159
|
+
- `--no-latinization`
|
|
160
|
+
- `--no-first-word-decap`
|
|
161
|
+
- `--keep-links`
|
|
162
|
+
|
|
163
|
+
## Development
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
py -3.12 -m pip install -r ./ru_normalizr/requirements-dev.txt
|
|
167
|
+
py -3.12 ./ru_normalizr/scripts/dev.py test
|
|
168
|
+
py -3.12 ./ru_normalizr/scripts/dev.py lint
|
|
169
|
+
py -3.12 ./ru_normalizr/scripts/dev.py build
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Release Notes
|
|
173
|
+
|
|
174
|
+
- Changelog: `CHANGELOG.md`
|
|
175
|
+
- Versioning policy: `VERSIONING.md`
|
|
176
|
+
- Publish checklist: `PYPI_RELEASE_CHECKLIST.md`
|
|
177
|
+
|
|
178
|
+
## Packaging
|
|
179
|
+
|
|
180
|
+
The package is self-contained inside `ru_normalizr/` and builds as a standalone wheel from that directory:
|
|
181
|
+
|
|
182
|
+
```bash
|
|
183
|
+
python -m pip wheel --no-deps ./ru_normalizr
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
For repeatable local workflows, use the helper script:
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
py -3.12 ./ru_normalizr/scripts/dev.py clean
|
|
190
|
+
py -3.12 ./ru_normalizr/scripts/dev.py test
|
|
191
|
+
py -3.12 ./ru_normalizr/scripts/dev.py lint
|
|
192
|
+
py -3.12 ./ru_normalizr/scripts/dev.py build
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
The supported public Python imports are from the package root, for example:
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
from ru_normalizr import NormalizeOptions, Normalizer, normalize, preprocess_text
|
|
199
|
+
```
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from .options import NormalizeOptions
|
|
8
|
+
from .pipeline import normalize
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _read_input(args: argparse.Namespace) -> str:
    """Return the text to normalize, chosen from the parsed CLI arguments.

    Sources are tried in priority order: ``--file`` (read as UTF-8), the
    positional ``text`` argument, then piped standard input.

    Args:
        args: Parsed arguments; only ``args.file`` and ``args.text`` are read.

    Returns:
        The raw, un-normalized input text.

    Raises:
        SystemExit: If neither a file nor inline text was given and stdin is
            an interactive terminal (nothing was piped in).
    """
    if args.file:
        return Path(args.file).read_text(encoding="utf-8")
    # Compare against None rather than relying on truthiness so that an
    # explicitly passed empty string ("") is honoured instead of silently
    # falling through to stdin or to the "no input" error below.
    if args.text is not None:
        return args.text
    if not sys.stdin.isatty():
        return sys.stdin.read()
    raise SystemExit("Provide text, --file, or stdin.")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the argument parser for the ``ru-normalizr`` command line.

    Returns:
        A parser exposing the optional positional ``text`` argument, the
        ``--file`` / ``--output`` paths, the ``--check`` switch, the
        ``--mode`` preset selector (default ``"safe"``), the
        ``--latinization-backend`` choice, and four boolean switches.
    """
    cli = argparse.ArgumentParser(
        prog="ru-normalizr",
        description="Normalize Russian text.",
    )
    add = cli.add_argument

    # Input / output selection.
    add("text", nargs="?", help="Inline text to normalize.")
    add("--file", help="Read text from file.")
    add("--output", help="Write normalized text to file instead of stdout.")
    add(
        "--check",
        action="store_true",
        help="Normalize input and print the result.",
    )

    # Preset and transliteration backend.
    add(
        "--mode",
        choices=["safe", "tts"],
        default="safe",
        help="Preset normalization mode.",
    )
    add(
        "--latinization-backend",
        choices=["ipa", "dictionary"],
        help="Backend for Latin transliteration.",
    )

    # Plain on/off switches; registered in this order so --help stays stable.
    switches = (
        ("--no-latinization", "Disable Latin transliteration."),
        ("--no-first-word-decap", "Disable first-word decap."),
        ("--keep-links", "Keep bracketed numeric links."),
        (
            "--with-latin-stress",
            "Keep '+' stress markers when using IPA latinization.",
        ),
    )
    for flag, description in switches:
        add(flag, action="store_true", help=description)

    return cli
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the command-line interface: read input, normalize it, emit it.

    Args:
        argv: Argument list to parse. ``None`` is forwarded to
            ``ArgumentParser.parse_args``, which then reads ``sys.argv``.

    Returns:
        Always ``0``.
    """
    cli_args = build_parser().parse_args(argv)
    source_text = _read_input(cli_args)

    # The "disable" switches force an option to False; when a switch is not
    # given, None is passed instead.
    # NOTE(review): None presumably means "keep the preset's value" --
    # confirm against NormalizeOptions, which is defined in another module.
    decap_override = False if cli_args.no_first_word_decap else None
    links_override = False if cli_args.keep_links else None
    latinization_override = False if cli_args.no_latinization else None

    options = NormalizeOptions(
        mode=cli_args.mode,
        enable_first_word_decap=decap_override,
        remove_links=links_override,
        enable_latinization=latinization_override,
        latinization_backend=cli_args.latinization_backend,
        enable_latinization_stress_marks=cli_args.with_latin_stress,
    )
    normalized = normalize(source_text, options)

    # Whatever the destination, the emitted text always ends with a newline.
    emitted = normalized if normalized.endswith("\n") else normalized + "\n"
    if cli_args.output:
        Path(cli_args.output).write_text(emitted, encoding="utf-8")
    else:
        sys.stdout.write(emitted)
    return 0
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass(frozen=True, slots=True)
|
|
8
|
+
class RegexRule:
|
|
9
|
+
pattern: str
|
|
10
|
+
flags: int = re.IGNORECASE
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def rx(pattern: str, flags: int = re.IGNORECASE) -> RegexRule:
    """Build a :class:`RegexRule`; a short alias that keeps rule tables compact.

    Args:
        pattern: Regular-expression source text.
        flags: ``re`` flags for the rule; case-insensitive by default.

    Returns:
        A ``RegexRule`` carrying ``pattern`` and ``flags`` unchanged.
    """
    rule = RegexRule(pattern, flags)
    return rule
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Abbreviation -> expansion table.
#
# Keys are either RegexRule objects (built with rx(); used as regular
# expressions) or plain strings (compiled as escaped literals with
# non-word-character guards on both sides by _compile_literal_rule).
# ABBREVIATION_PATTERNS is built by iterating this dict, so insertion order
# here is the order of the compiled list. Longer forms are listed before
# the shorter forms they contain (e.g. "см. рис." before "см.",
# "прим. пер." before "прим.").
# NOTE(review): the patterns are presumably applied sequentially in this
# order by the consumer -- confirm before reordering entries.
ABBREVIATION_RULES = {
    rx(r"\bсм\.\s*рис\.(?!\w)"): "смотри рисунок",
    rx(r"\bсм\."): "смотри",
    rx(r"\bнапр\."): "например",
    rx(r"\bт\.?\s*е\."): "то есть",
    rx(r"\bт\.?\s*к\."): "так как",
    rx(r"(?<!\w)н\.?\s*э\.(?!\w)"): "нашей эры",
    rx(r"(?<!\w)нэ\.?(?!\w)"): "нашей эры",
    # The digit-anchored rules below consume the whitespace before the
    # number via ``\s*``, so their replacements end with a space.
    rx(r"\bс\.\s*(?=\d)"): "страница ",
    rx(r"\bp\.\s*(?=\d)"): "страница ",
    rx(r"\bст\.\s*(?=\d)"): "статья ",
    rx(r"\bгл\."): "глава",
    rx(r"\bг-?н\.?\b"): "господин",
    rx(r"\bг-?жа\.?\b"): "госпожа",
    rx(r"\bрис\.\s*(?=\d)"): "рисунок ",
    rx(r"\bим\."): "имени",
    # Literal (plain-string) rules start here; spaced and unspaced spellings
    # are listed as separate keys.
    "снск.": "сноска",
    "прим. пер.": "примечание переводчика",
    "прим.пер.": "примечание переводчика",
    "прим. перев.": "примечание переводчика",
    "прим.перев.": "примечание переводчика",
    "прим. ред.": "примечание редактора",
    "прим.ред.": "примечание редактора",
    rx(r"\bперев\."): "переводчик",
    "англ.": "английский",
    "руск.": "русский",
    "рус.": "русский",
    "немецк.": "немецкий",
    "нем.": "немецкий",
    "франц.": "французский",
    "греч.": "греческий",
    "латин.": "латинский",
    "лат.": "латинский",
    "изд-во": "издательство",
    "ин-т": "институт",
    "инф-ция": "информация",
    "орг-ция": "организация",
    "мин-во": "министерство",
    "гос-во": "государство",
    "гос.": "государственный",
    "междунар.": "международный",
    "обл.": "область",
    "р-н": "район",
    "пос.": "посёлок",
    "пгт": "посёлок городского типа",
    "оз.": "озеро",
    "полит.": "политический",
    "офиц.": "официальный",
    "юридич.": "юридический",
    "ж.-д.": "железнодорожный",
    "с.-х.": "сельскохозяйственный",
    "штаб-кв.": "штаб-квартира",
    "т.д.": "так далее",
    "т. д.": "так далее",
    "т.п.": "тому подобное",
    "т. п.": "тому подобное",
    "табл.": "таблица",
    "стр.": "страница",
    "ред.": "редактор",
    "сост.": "составитель",
    "прил.": "приложение",
    "прим. авт.": "примечание автора",
    "прим.": "примечание",
    "УК РФ": "уголовный кодекс российской федерации",
    "АК РФ": "арбитражный кодекс российской федерации",
    "псевд.": "псевдоним",
    "респ.": "республика",
}
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _compile_literal_rule(abbr: str) -> re.Pattern[str]:
    """Compile a literal abbreviation into a case-insensitive pattern.

    The abbreviation is escaped so it matches verbatim, and is wrapped in
    negative lookarounds so it cannot match when directly preceded or
    followed by a word character.

    Args:
        abbr: The abbreviation text to match literally.

    Returns:
        The compiled pattern.
    """
    not_after_word = r"(?<!\w)"
    not_before_word = r"(?!\w)"
    source = not_after_word + re.escape(abbr) + not_before_word
    return re.compile(source, re.IGNORECASE)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# Compiled form of ABBREVIATION_RULES: a list of (pattern, replacement)
# pairs in the dict's insertion order. RegexRule keys are compiled with
# their own flags; plain-string keys go through _compile_literal_rule.
ABBREVIATION_PATTERNS = [
    (
        _compile_literal_rule(rule)
        if not isinstance(rule, RegexRule)
        else re.compile(rule.pattern, rule.flags),
        expansion,
    )
    for rule, expansion in ABBREVIATION_RULES.items()
]
|