ru-normalizr 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. ru_normalizr-0.1.0/LICENSE +21 -0
  2. ru_normalizr-0.1.0/MANIFEST.in +9 -0
  3. ru_normalizr-0.1.0/PKG-INFO +228 -0
  4. ru_normalizr-0.1.0/README.md +199 -0
  5. ru_normalizr-0.1.0/__init__.py +7 -0
  6. ru_normalizr-0.1.0/__main__.py +86 -0
  7. ru_normalizr-0.1.0/_morph.py +10 -0
  8. ru_normalizr-0.1.0/abbreviation_rules.py +101 -0
  9. ru_normalizr-0.1.0/abbreviations.py +168 -0
  10. ru_normalizr-0.1.0/caps.py +167 -0
  11. ru_normalizr-0.1.0/constants.py +264 -0
  12. ru_normalizr-0.1.0/dates_time.py +214 -0
  13. ru_normalizr-0.1.0/dictionaries/latinization/latinization_rules.dic +404 -0
  14. ru_normalizr-0.1.0/dictionaries/your_dictionary.dic +14 -0
  15. ru_normalizr-0.1.0/dictionary.py +341 -0
  16. ru_normalizr-0.1.0/latinization.py +216 -0
  17. ru_normalizr-0.1.0/normalizer.py +296 -0
  18. ru_normalizr-0.1.0/numbering.py +87 -0
  19. ru_normalizr-0.1.0/numerals/__init__.py +50 -0
  20. ru_normalizr-0.1.0/numerals/_constants.py +727 -0
  21. ru_normalizr-0.1.0/numerals/_helpers.py +446 -0
  22. ru_normalizr-0.1.0/numerals/cardinals.py +331 -0
  23. ru_normalizr-0.1.0/numerals/decimals.py +110 -0
  24. ru_normalizr-0.1.0/numerals/fractions.py +71 -0
  25. ru_normalizr-0.1.0/numerals/ordinals.py +186 -0
  26. ru_normalizr-0.1.0/numerals/symbols.py +29 -0
  27. ru_normalizr-0.1.0/options.py +144 -0
  28. ru_normalizr-0.1.0/pipeline.py +30 -0
  29. ru_normalizr-0.1.0/preprocess_utils.py +268 -0
  30. ru_normalizr-0.1.0/py.typed +1 -0
  31. ru_normalizr-0.1.0/pyproject.toml +70 -0
  32. ru_normalizr-0.1.0/roman_numerals.py +208 -0
  33. ru_normalizr-0.1.0/ru_normalizr.egg-info/PKG-INFO +228 -0
  34. ru_normalizr-0.1.0/ru_normalizr.egg-info/SOURCES.txt +65 -0
  35. ru_normalizr-0.1.0/ru_normalizr.egg-info/dependency_links.txt +1 -0
  36. ru_normalizr-0.1.0/ru_normalizr.egg-info/entry_points.txt +2 -0
  37. ru_normalizr-0.1.0/ru_normalizr.egg-info/requires.txt +5 -0
  38. ru_normalizr-0.1.0/ru_normalizr.egg-info/top_level.txt +1 -0
  39. ru_normalizr-0.1.0/setup.cfg +4 -0
  40. ru_normalizr-0.1.0/years.py +342 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 NickZaitsev
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,9 @@
1
+ include LICENSE
2
+ include README.md
3
+ include pyproject.toml
4
+ recursive-include dictionaries *.dic
5
+ recursive-include numerals *.py
6
+ prune tests
7
+ prune scripts
8
+ prune examples
9
+ global-exclude __pycache__ *.py[cod] *.pkl
@@ -0,0 +1,228 @@
1
+ Metadata-Version: 2.4
2
+ Name: ru-normalizr
3
+ Version: 0.1.0
4
+ Summary: Normalization-only Russian text preprocessing.
5
+ Author: NickZaitsev
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/NickZaitsev/ru-normalizr
8
+ Project-URL: Repository, https://github.com/NickZaitsev/ru-normalizr
9
+ Project-URL: Issues, https://github.com/NickZaitsev/ru-normalizr/issues
10
+ Project-URL: Changelog, https://github.com/NickZaitsev/ru-normalizr/blob/main/CHANGELOG.md
11
+ Keywords: russian,normalization,nlp,text-processing,tts
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Natural Language :: Russian
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Text Processing :: Linguistic
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: eng_to_ipa==0.0.2
24
+ Requires-Dist: num2words>=0.5.13
25
+ Requires-Dist: pymorphy3>=2.0
26
+ Requires-Dist: pymorphy3-dicts-ru>=2.0
27
+ Requires-Dist: roman>=4.0
28
+ Dynamic: license-file
29
+
30
+ # ru-normalizr
31
+
32
+ Normalization-only Russian text preprocessing extracted into a standalone package.
33
+
34
+ `ru-normalizr` focuses on deterministic Russian text normalization:
35
+ - years, dates, time, decimals, fractions, ordinals, and cardinal numerals
36
+ - abbreviations, initials, Roman numerals, cleanup rules, and glued OCR-like text
37
+ - Latin transliteration via dictionary rules or `eng_to_ipa`
38
+
39
+ Out of scope by design:
40
+ - accentization and stress dictionaries
41
+ - pronunciation and post-phoneme fixes
42
+ - TTS pause hacks and chunking
43
+ - audio, model, or engine integration
44
+
45
+ ## Installation
46
+
47
+ ```bash
48
+ pip install ru-normalizr
49
+ ```
50
+
51
+ `eng_to_ipa` is installed by default, so the IPA backend is available out of the box.
52
+
53
+ ## API
54
+
55
+ ```python
56
+ from ru_normalizr import NormalizeOptions, Normalizer, normalize, preprocess_text
57
+
58
+ text = normalize("Глава IV. Встреча в 10:07.")
59
+ prepared = preprocess_text("10кг")
60
+
61
+ tts_normalizer = Normalizer(NormalizeOptions.tts())
62
+ batch = tts_normalizer.normalize_batch(["Глава IV.", "В 1980-е годы было 25 млн."])
63
+ roman_only = tts_normalizer.run_stage("roman", "Глава IV")
64
+ ```
65
+
66
+ ### Example outputs
67
+
68
+ ```python
69
+ from ru_normalizr import normalize
70
+
71
+ print(normalize("Глава IV. Встреча в 10:07."))
72
+ # Глава четыре. Встреча в десять, ноль семь.
73
+
74
+ print(normalize("В 1980-е годы было 25 млн. $"))
75
+ # В тысяча девятьсот восьмидесятые годы было двадцать пять миллионов долларов
76
+
77
+ print(normalize("Добавьте 1/4 стакана воды."))
78
+ # Добавьте одну четвертую стакана воды.
79
+
80
+ print(normalize("И. О. Фамилия приехал."))
81
+ # и о фамилия приехал.
82
+ ```
83
+
84
+ ### Modes
85
+
86
+ `NormalizeOptions()` uses the conservative `safe` preset by default.
87
+ Use `NormalizeOptions.tts()` when you want the more aggressive TTS-oriented behavior.
88
+
89
+ `safe` is intended for general text where it is more important not to over-normalize:
90
+ - keeps all-caps headings as-is
91
+ - keeps initials like `И. О. Фамилия`
92
+ - keeps letter abbreviations like `ГИБДД`
93
+ - keeps bracketed numeric links like `(1)` and `[1]`
94
+ - keeps Latin transliteration disabled
95
+
96
+ `tts` is intended for speech-oriented pipelines:
97
+ - enables caps normalization and first-word decap
98
+ - removes confident bracketed numeric links
99
+ - expands initials and letter-by-letter abbreviations
100
+ - enables Latin transliteration
101
+ - keeps IPA stress markers disabled by default unless explicitly requested
102
+
103
+ ### Configuring options
104
+
105
+ ```python
106
+ from ru_normalizr import NormalizeOptions, normalize
107
+
108
+ options = NormalizeOptions.tts(
109
+ latinization_backend="ipa",
110
+ enable_latinization_stress_marks=False,
111
+ )
112
+
113
+ print(normalize("YouTube в 2024 г.", options))
114
+ ```
115
+
116
+ You can also start from the conservative preset and override individual flags:
117
+
118
+ ```python
119
+ from ru_normalizr import NormalizeOptions, normalize
120
+
121
+ options = NormalizeOptions.safe(
122
+ enable_letter_abbreviation_expansion=True,
123
+ enable_latinization=True,
124
+ latinization_backend="dictionary",
125
+ )
126
+
127
+ print(normalize("USB drive", options))
128
+ ```
129
+
130
+ Granular abbreviation controls:
131
+ - `enable_contextual_abbreviation_expansion` for contextual abbreviations such as `т. д.`, `т. п.`, `млн.`, `тыс.`
132
+ - `enable_initials_expansion` for patterns such as `И. О. Фамилия`
133
+ - `enable_letter_abbreviation_expansion` for letter-by-letter expansions such as `ГИБДД`, `ООН`, `USB`
134
+
135
+ Latinization controls:
136
+ - `enable_latinization`
137
+ - `latinization_backend="ipa" | "dictionary"`
138
+ - `enable_latinization_stress_marks`
139
+
140
+ When `latinization_backend="ipa"`, stress markers are omitted by default.
141
+ Enable `enable_latinization_stress_marks=True` if you want `+` markers in the output.
142
+
143
+ ### Example dictionaries
144
+
145
+ Runtime dictionary assets shipped in the package live under `ru_normalizr/dictionaries/`.
146
+ Latinization rules live under `ru_normalizr/dictionaries/latinization/`.
147
+ The source tree also includes `dictionaries/your_dictionary.dic` as an editable example dictionary for local customization.
148
+ That example file is included in source distributions and excluded from published wheels.
149
+
150
+ ### Batch usage
151
+
152
+ ```python
153
+ from ru_normalizr import Normalizer
154
+
155
+ normalizer = Normalizer(NormalizeOptions.tts())
156
+ texts = ["Глава IV.", "12.03.2025", "Цена 1.5 кг сахара."]
157
+ print(normalizer.normalize_batch(texts))
158
+ ```
159
+
160
+ Available stage names for expert use:
161
+ - `preprocess`
162
+ - `roman`
163
+ - `years`
164
+ - `dates_time`
165
+ - `numerals`
166
+ - `abbreviations`
167
+ - `dictionary`
168
+ - `latinization`
169
+ - `finalize`
170
+
171
+ Stage order is fixed in the main pipeline. Stage-level calls are for debugging, testing, and focused use, not for arbitrary reordering.
172
+
173
+ ## CLI
174
+
175
+ ```bash
176
+ python -m ru_normalizr "Глава IV. Встреча в 10:07."
177
+ echo "В 1980-е годы было 25 млн." | python -m ru_normalizr
178
+ ru-normalizr --mode safe "ГИБДД"
179
+ ru-normalizr --mode tts --file ./sample.txt
180
+ ru-normalizr --mode tts --file ./sample.txt --output ./sample.normalized.txt
181
+ ru-normalizr --mode tts --latinization-backend ipa --with-latin-stress "YouTube в 2024 г."
182
+ ```
183
+
184
+ Useful CLI flags:
185
+ - `--mode safe|tts`
186
+ - `--latinization-backend ipa|dictionary`
187
+ - `--with-latin-stress`
188
+ - `--no-latinization`
189
+ - `--no-first-word-decap`
190
+ - `--keep-links`
191
+
192
+ ## Development
193
+
194
+ ```bash
195
+ py -3.12 -m pip install -r ./ru_normalizr/requirements-dev.txt
196
+ py -3.12 ./ru_normalizr/scripts/dev.py test
197
+ py -3.12 ./ru_normalizr/scripts/dev.py lint
198
+ py -3.12 ./ru_normalizr/scripts/dev.py build
199
+ ```
200
+
201
+ ## Release Notes
202
+
203
+ - Changelog: `CHANGELOG.md`
204
+ - Versioning policy: `VERSIONING.md`
205
+ - Publish checklist: `PYPI_RELEASE_CHECKLIST.md`
206
+
207
+ ## Packaging
208
+
209
+ The package is self-contained inside `ru_normalizr/` and builds as a standalone wheel from that directory:
210
+
211
+ ```bash
212
+ python -m pip wheel --no-deps ./ru_normalizr
213
+ ```
214
+
215
+ For repeatable local workflows, use the helper script:
216
+
217
+ ```bash
218
+ py -3.12 ./ru_normalizr/scripts/dev.py clean
219
+ py -3.12 ./ru_normalizr/scripts/dev.py test
220
+ py -3.12 ./ru_normalizr/scripts/dev.py lint
221
+ py -3.12 ./ru_normalizr/scripts/dev.py build
222
+ ```
223
+
224
+ The supported public Python imports are from the package root, for example:
225
+
226
+ ```python
227
+ from ru_normalizr import NormalizeOptions, Normalizer, normalize, preprocess_text
228
+ ```
@@ -0,0 +1,199 @@
1
+ # ru-normalizr
2
+
3
+ Normalization-only Russian text preprocessing extracted into a standalone package.
4
+
5
+ `ru-normalizr` focuses on deterministic Russian text normalization:
6
+ - years, dates, time, decimals, fractions, ordinals, and cardinal numerals
7
+ - abbreviations, initials, Roman numerals, cleanup rules, and glued OCR-like text
8
+ - Latin transliteration via dictionary rules or `eng_to_ipa`
9
+
10
+ Out of scope by design:
11
+ - accentization and stress dictionaries
12
+ - pronunciation and post-phoneme fixes
13
+ - TTS pause hacks and chunking
14
+ - audio, model, or engine integration
15
+
16
+ ## Installation
17
+
18
+ ```bash
19
+ pip install ru-normalizr
20
+ ```
21
+
22
+ `eng_to_ipa` is installed by default, so the IPA backend is available out of the box.
23
+
24
+ ## API
25
+
26
+ ```python
27
+ from ru_normalizr import NormalizeOptions, Normalizer, normalize, preprocess_text
28
+
29
+ text = normalize("Глава IV. Встреча в 10:07.")
30
+ prepared = preprocess_text("10кг")
31
+
32
+ tts_normalizer = Normalizer(NormalizeOptions.tts())
33
+ batch = tts_normalizer.normalize_batch(["Глава IV.", "В 1980-е годы было 25 млн."])
34
+ roman_only = tts_normalizer.run_stage("roman", "Глава IV")
35
+ ```
36
+
37
+ ### Example outputs
38
+
39
+ ```python
40
+ from ru_normalizr import normalize
41
+
42
+ print(normalize("Глава IV. Встреча в 10:07."))
43
+ # Глава четыре. Встреча в десять, ноль семь.
44
+
45
+ print(normalize("В 1980-е годы было 25 млн. $"))
46
+ # В тысяча девятьсот восьмидесятые годы было двадцать пять миллионов долларов
47
+
48
+ print(normalize("Добавьте 1/4 стакана воды."))
49
+ # Добавьте одну четвертую стакана воды.
50
+
51
+ print(normalize("И. О. Фамилия приехал."))
52
+ # и о фамилия приехал.
53
+ ```
54
+
55
+ ### Modes
56
+
57
+ `NormalizeOptions()` uses the conservative `safe` preset by default.
58
+ Use `NormalizeOptions.tts()` when you want the more aggressive TTS-oriented behavior.
59
+
60
+ `safe` is intended for general text where it is more important not to over-normalize:
61
+ - keeps all-caps headings as-is
62
+ - keeps initials like `И. О. Фамилия`
63
+ - keeps letter abbreviations like `ГИБДД`
64
+ - keeps bracketed numeric links like `(1)` and `[1]`
65
+ - keeps Latin transliteration disabled
66
+
67
+ `tts` is intended for speech-oriented pipelines:
68
+ - enables caps normalization and first-word decap
69
+ - removes confident bracketed numeric links
70
+ - expands initials and letter-by-letter abbreviations
71
+ - enables Latin transliteration
72
+ - keeps IPA stress markers disabled by default unless explicitly requested
73
+
74
+ ### Configuring options
75
+
76
+ ```python
77
+ from ru_normalizr import NormalizeOptions, normalize
78
+
79
+ options = NormalizeOptions.tts(
80
+ latinization_backend="ipa",
81
+ enable_latinization_stress_marks=False,
82
+ )
83
+
84
+ print(normalize("YouTube в 2024 г.", options))
85
+ ```
86
+
87
+ You can also start from the conservative preset and override individual flags:
88
+
89
+ ```python
90
+ from ru_normalizr import NormalizeOptions, normalize
91
+
92
+ options = NormalizeOptions.safe(
93
+ enable_letter_abbreviation_expansion=True,
94
+ enable_latinization=True,
95
+ latinization_backend="dictionary",
96
+ )
97
+
98
+ print(normalize("USB drive", options))
99
+ ```
100
+
101
+ Granular abbreviation controls:
102
+ - `enable_contextual_abbreviation_expansion` for contextual abbreviations such as `т. д.`, `т. п.`, `млн.`, `тыс.`
103
+ - `enable_initials_expansion` for patterns such as `И. О. Фамилия`
104
+ - `enable_letter_abbreviation_expansion` for letter-by-letter expansions such as `ГИБДД`, `ООН`, `USB`
105
+
106
+ Latinization controls:
107
+ - `enable_latinization`
108
+ - `latinization_backend="ipa" | "dictionary"`
109
+ - `enable_latinization_stress_marks`
110
+
111
+ When `latinization_backend="ipa"`, stress markers are omitted by default.
112
+ Enable `enable_latinization_stress_marks=True` if you want `+` markers in the output.
113
+
114
+ ### Example dictionaries
115
+
116
+ Runtime dictionary assets shipped in the package live under `ru_normalizr/dictionaries/`.
117
+ Latinization rules live under `ru_normalizr/dictionaries/latinization/`.
118
+ The source tree also includes `dictionaries/your_dictionary.dic` as an editable example dictionary for local customization.
119
+ That example file is included in source distributions and excluded from published wheels.
120
+
121
+ ### Batch usage
122
+
123
+ ```python
124
+ from ru_normalizr import Normalizer
125
+
126
+ normalizer = Normalizer(NormalizeOptions.tts())
127
+ texts = ["Глава IV.", "12.03.2025", "Цена 1.5 кг сахара."]
128
+ print(normalizer.normalize_batch(texts))
129
+ ```
130
+
131
+ Available stage names for expert use:
132
+ - `preprocess`
133
+ - `roman`
134
+ - `years`
135
+ - `dates_time`
136
+ - `numerals`
137
+ - `abbreviations`
138
+ - `dictionary`
139
+ - `latinization`
140
+ - `finalize`
141
+
142
+ Stage order is fixed in the main pipeline. Stage-level calls are for debugging, testing, and focused use, not for arbitrary reordering.
143
+
144
+ ## CLI
145
+
146
+ ```bash
147
+ python -m ru_normalizr "Глава IV. Встреча в 10:07."
148
+ echo "В 1980-е годы было 25 млн." | python -m ru_normalizr
149
+ ru-normalizr --mode safe "ГИБДД"
150
+ ru-normalizr --mode tts --file ./sample.txt
151
+ ru-normalizr --mode tts --file ./sample.txt --output ./sample.normalized.txt
152
+ ru-normalizr --mode tts --latinization-backend ipa --with-latin-stress "YouTube в 2024 г."
153
+ ```
154
+
155
+ Useful CLI flags:
156
+ - `--mode safe|tts`
157
+ - `--latinization-backend ipa|dictionary`
158
+ - `--with-latin-stress`
159
+ - `--no-latinization`
160
+ - `--no-first-word-decap`
161
+ - `--keep-links`
162
+
163
+ ## Development
164
+
165
+ ```bash
166
+ py -3.12 -m pip install -r ./ru_normalizr/requirements-dev.txt
167
+ py -3.12 ./ru_normalizr/scripts/dev.py test
168
+ py -3.12 ./ru_normalizr/scripts/dev.py lint
169
+ py -3.12 ./ru_normalizr/scripts/dev.py build
170
+ ```
171
+
172
+ ## Release Notes
173
+
174
+ - Changelog: `CHANGELOG.md`
175
+ - Versioning policy: `VERSIONING.md`
176
+ - Publish checklist: `PYPI_RELEASE_CHECKLIST.md`
177
+
178
+ ## Packaging
179
+
180
+ The package is self-contained inside `ru_normalizr/` and builds as a standalone wheel from that directory:
181
+
182
+ ```bash
183
+ python -m pip wheel --no-deps ./ru_normalizr
184
+ ```
185
+
186
+ For repeatable local workflows, use the helper script:
187
+
188
+ ```bash
189
+ py -3.12 ./ru_normalizr/scripts/dev.py clean
190
+ py -3.12 ./ru_normalizr/scripts/dev.py test
191
+ py -3.12 ./ru_normalizr/scripts/dev.py lint
192
+ py -3.12 ./ru_normalizr/scripts/dev.py build
193
+ ```
194
+
195
+ The supported public Python imports are from the package root, for example:
196
+
197
+ ```python
198
+ from ru_normalizr import NormalizeOptions, Normalizer, normalize, preprocess_text
199
+ ```
@@ -0,0 +1,7 @@
1
+ """Russian text normalization library."""
2
+
3
+ from .options import NormalizeOptions
4
+ from .pipeline import Normalizer, normalize, preprocess_text
5
+
6
+ __all__ = ["NormalizeOptions", "Normalizer", "normalize", "preprocess_text"]
7
+ __version__ = "0.1.0"
@@ -0,0 +1,86 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ from .options import NormalizeOptions
8
+ from .pipeline import normalize
9
+
10
+
11
+ def _read_input(args: argparse.Namespace) -> str:
12
+ if args.file:
13
+ return Path(args.file).read_text(encoding="utf-8")
14
+ if args.text:
15
+ return args.text
16
+ if not sys.stdin.isatty():
17
+ return sys.stdin.read()
18
+ raise SystemExit("Provide text, --file, or stdin.")
19
+
20
+
21
+ def build_parser() -> argparse.ArgumentParser:
22
+ parser = argparse.ArgumentParser(
23
+ prog="ru-normalizr", description="Normalize Russian text."
24
+ )
25
+ parser.add_argument("text", nargs="?", help="Inline text to normalize.")
26
+ parser.add_argument("--file", help="Read text from file.")
27
+ parser.add_argument(
28
+ "--output", help="Write normalized text to file instead of stdout."
29
+ )
30
+ parser.add_argument(
31
+ "--check", action="store_true", help="Normalize input and print the result."
32
+ )
33
+ parser.add_argument(
34
+ "--mode",
35
+ choices=["safe", "tts"],
36
+ default="safe",
37
+ help="Preset normalization mode.",
38
+ )
39
+ parser.add_argument(
40
+ "--latinization-backend",
41
+ choices=["ipa", "dictionary"],
42
+ help="Backend for Latin transliteration.",
43
+ )
44
+ parser.add_argument(
45
+ "--no-latinization", action="store_true", help="Disable Latin transliteration."
46
+ )
47
+ parser.add_argument(
48
+ "--no-first-word-decap", action="store_true", help="Disable first-word decap."
49
+ )
50
+ parser.add_argument(
51
+ "--keep-links", action="store_true", help="Keep bracketed numeric links."
52
+ )
53
+ parser.add_argument(
54
+ "--with-latin-stress",
55
+ action="store_true",
56
+ help="Keep '+' stress markers when using IPA latinization.",
57
+ )
58
+ return parser
59
+
60
+
61
+ def main(argv: list[str] | None = None) -> int:
62
+ parser = build_parser()
63
+ args = parser.parse_args(argv)
64
+ text = _read_input(args)
65
+ options = NormalizeOptions(
66
+ mode=args.mode,
67
+ enable_first_word_decap=False if args.no_first_word_decap else None,
68
+ remove_links=False if args.keep_links else None,
69
+ enable_latinization=False if args.no_latinization else None,
70
+ latinization_backend=args.latinization_backend,
71
+ enable_latinization_stress_marks=args.with_latin_stress,
72
+ )
73
+ result = normalize(text, options)
74
+ if args.output:
75
+ output_path = Path(args.output)
76
+ output_text = result if result.endswith("\n") else result + "\n"
77
+ output_path.write_text(output_text, encoding="utf-8")
78
+ else:
79
+ sys.stdout.write(result)
80
+ if not result.endswith("\n"):
81
+ sys.stdout.write("\n")
82
+ return 0
83
+
84
+
85
+ if __name__ == "__main__":
86
+ raise SystemExit(main())
@@ -0,0 +1,10 @@
1
+ from __future__ import annotations
2
+
3
+ import functools
4
+
5
+ import pymorphy3
6
+
7
+
8
+ @functools.lru_cache(maxsize=1)
9
+ def get_morph() -> pymorphy3.MorphAnalyzer:
10
+ return pymorphy3.MorphAnalyzer()
@@ -0,0 +1,101 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from dataclasses import dataclass
5
+
6
+
7
+ @dataclass(frozen=True, slots=True)
8
+ class RegexRule:
9
+ pattern: str
10
+ flags: int = re.IGNORECASE
11
+
12
+
13
+ def rx(pattern: str, flags: int = re.IGNORECASE) -> RegexRule:
14
+ return RegexRule(pattern=pattern, flags=flags)
15
+
16
+
17
+ ABBREVIATION_RULES = {
18
+ rx(r"\bсм\.\s*рис\.(?!\w)"): "смотри рисунок",
19
+ rx(r"\bсм\."): "смотри",
20
+ rx(r"\bнапр\."): "например",
21
+ rx(r"\bт\.?\s*е\."): "то есть",
22
+ rx(r"\bт\.?\s*к\."): "так как",
23
+ rx(r"(?<!\w)н\.?\s*э\.(?!\w)"): "нашей эры",
24
+ rx(r"(?<!\w)нэ\.?(?!\w)"): "нашей эры",
25
+ rx(r"\bс\.\s*(?=\d)"): "страница ",
26
+ rx(r"\bp\.\s*(?=\d)"): "страница ",
27
+ rx(r"\bст\.\s*(?=\d)"): "статья ",
28
+ rx(r"\bгл\."): "глава",
29
+ rx(r"\bг-?н\.?\b"): "господин",
30
+ rx(r"\bг-?жа\.?\b"): "госпожа",
31
+ rx(r"\bрис\.\s*(?=\d)"): "рисунок ",
32
+ rx(r"\bим\."): "имени",
33
+ "снск.": "сноска",
34
+ "прим. пер.": "примечание переводчика",
35
+ "прим.пер.": "примечание переводчика",
36
+ "прим. перев.": "примечание переводчика",
37
+ "прим.перев.": "примечание переводчика",
38
+ "прим. ред.": "примечание редактора",
39
+ "прим.ред.": "примечание редактора",
40
+ rx(r"\bперев\."): "переводчик",
41
+ "англ.": "английский",
42
+ "руск.": "русский",
43
+ "рус.": "русский",
44
+ "немецк.": "немецкий",
45
+ "нем.": "немецкий",
46
+ "франц.": "французский",
47
+ "греч.": "греческий",
48
+ "латин.": "латинский",
49
+ "лат.": "латинский",
50
+ "изд-во": "издательство",
51
+ "ин-т": "институт",
52
+ "инф-ция": "информация",
53
+ "орг-ция": "организация",
54
+ "мин-во": "министерство",
55
+ "гос-во": "государство",
56
+ "гос.": "государственный",
57
+ "междунар.": "международный",
58
+ "обл.": "область",
59
+ "р-н": "район",
60
+ "пос.": "посёлок",
61
+ "пгт": "посёлок городского типа",
62
+ "оз.": "озеро",
63
+ "полит.": "политический",
64
+ "офиц.": "официальный",
65
+ "юридич.": "юридический",
66
+ "ж.-д.": "железнодорожный",
67
+ "с.-х.": "сельскохозяйственный",
68
+ "штаб-кв.": "штаб-квартира",
69
+ "т.д.": "так далее",
70
+ "т. д.": "так далее",
71
+ "т.п.": "тому подобное",
72
+ "т. п.": "тому подобное",
73
+ "табл.": "таблица",
74
+ "стр.": "страница",
75
+ "ред.": "редактор",
76
+ "сост.": "составитель",
77
+ "прил.": "приложение",
78
+ "прим. авт.": "примечание автора",
79
+ "прим.": "примечание",
80
+ "УК РФ": "уголовный кодекс российской федерации",
81
+ "АК РФ": "арбитражный кодекс российской федерации",
82
+ "псевд.": "псевдоним",
83
+ "респ.": "республика",
84
+ }
85
+
86
+
87
+ def _compile_literal_rule(abbr: str) -> re.Pattern[str]:
88
+ return re.compile(rf"(?<!\w){re.escape(abbr)}(?!\w)", re.IGNORECASE)
89
+
90
+
91
+ ABBREVIATION_PATTERNS = [
92
+ (
93
+ (
94
+ re.compile(matcher.pattern, matcher.flags)
95
+ if isinstance(matcher, RegexRule)
96
+ else _compile_literal_rule(matcher)
97
+ ),
98
+ replacement,
99
+ )
100
+ for matcher, replacement in ABBREVIATION_RULES.items()
101
+ ]