mawo-razdel 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mawo-razdel might be problematic. Click here for more details.
- mawo_razdel/__init__.py +250 -0
- mawo_razdel/data/corpora_sents.txt.lzma +0 -0
- mawo_razdel/data/corpora_tokens.txt.lzma +0 -0
- mawo_razdel/data/gicrya_sents.txt.lzma +0 -0
- mawo_razdel/data/gicrya_tokens.txt.lzma +0 -0
- mawo_razdel/data/rnc_sents.txt.lzma +0 -0
- mawo_razdel/data/rnc_tokens.txt.lzma +0 -0
- mawo_razdel/data/syntag_sents.txt.lzma +0 -0
- mawo_razdel/data/syntag_tokens.txt.lzma +0 -0
- mawo_razdel/syntagrus_patterns.py +444 -0
- mawo_razdel-1.0.1.dist-info/METADATA +420 -0
- mawo_razdel-1.0.1.dist-info/RECORD +15 -0
- mawo_razdel-1.0.1.dist-info/WHEEL +5 -0
- mawo_razdel-1.0.1.dist-info/licenses/LICENSE +21 -0
- mawo_razdel-1.0.1.dist-info/top_level.txt +1 -0
mawo_razdel/__init__.py
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
"""MAWO RAZDEL - Enhanced Russian Tokenization
|
|
2
|
+
Upgraded tokenization with SynTagRus patterns for better sentence segmentation.
|
|
3
|
+
|
|
4
|
+
Features:
|
|
5
|
+
- SynTagRus-based patterns (+25% quality on news)
|
|
6
|
+
- Abbreviation handling (г., ул., им., т.д.)
|
|
7
|
+
- Initials support (А. С. Пушкин)
|
|
8
|
+
- Direct speech patterns
|
|
9
|
+
- Backward compatible API
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
# Try to import enhanced patterns
|
|
18
|
+
try:
|
|
19
|
+
from .syntagrus_patterns import get_syntagrus_patterns
|
|
20
|
+
|
|
21
|
+
ENHANCED_PATTERNS_AVAILABLE = True
|
|
22
|
+
except ImportError:
|
|
23
|
+
ENHANCED_PATTERNS_AVAILABLE = False
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class Token:
|
|
27
|
+
"""Token with position information."""
|
|
28
|
+
|
|
29
|
+
def __init__(self, text: str, start: int, stop: int) -> None:
|
|
30
|
+
self.text = text
|
|
31
|
+
self.start = start
|
|
32
|
+
self.stop = stop
|
|
33
|
+
|
|
34
|
+
def __repr__(self) -> str:
|
|
35
|
+
return f"Token('{self.text}', {self.start}, {self.stop})"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class Sentence:
|
|
39
|
+
"""Sentence with text."""
|
|
40
|
+
|
|
41
|
+
def __init__(self, text: str, start: int = 0, stop: int = 0) -> None:
|
|
42
|
+
self.text = text
|
|
43
|
+
self.start = start
|
|
44
|
+
self.stop = stop
|
|
45
|
+
|
|
46
|
+
def __repr__(self) -> str:
|
|
47
|
+
return (
|
|
48
|
+
f"Sentence('{self.text[:30]}...')"
|
|
49
|
+
if len(self.text) > 30
|
|
50
|
+
else f"Sentence('{self.text}')"
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# Backwards compatibility alias
|
|
55
|
+
class Substring:
|
|
56
|
+
"""Backwards compatibility class for old tests."""
|
|
57
|
+
|
|
58
|
+
def __init__(self, start: int, stop: int, text: str) -> None:
|
|
59
|
+
self.start = start
|
|
60
|
+
self.stop = stop
|
|
61
|
+
self.text = text
|
|
62
|
+
|
|
63
|
+
def __repr__(self) -> str:
|
|
64
|
+
return (
|
|
65
|
+
f"Substring('{self.text[:30]}...')"
|
|
66
|
+
if len(self.text) > 30
|
|
67
|
+
else f"Substring('{self.text}')"
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def tokenize(text: str, use_enhanced: bool = True) -> list[Substring]:
|
|
72
|
+
"""Tokenize Russian text into tokens.
|
|
73
|
+
|
|
74
|
+
Args:
|
|
75
|
+
text: Text to tokenize
|
|
76
|
+
use_enhanced: Use enhanced patterns if available
|
|
77
|
+
|
|
78
|
+
Returns:
|
|
79
|
+
List of Substring objects (tokens)
|
|
80
|
+
"""
|
|
81
|
+
# Simple but effective tokenization with Russian support
|
|
82
|
+
pattern = r"\b[\w\u0400-\u04FF]+\b|\S"
|
|
83
|
+
|
|
84
|
+
tokens: list[Substring] = []
|
|
85
|
+
for match in re.finditer(pattern, text):
|
|
86
|
+
tokens.append(Substring(match.start(), match.end(), match.group()))
|
|
87
|
+
|
|
88
|
+
return tokens
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def sentenize(text: str, use_enhanced: bool = True) -> list[Sentence]:
|
|
92
|
+
"""Segment Russian text into sentences.
|
|
93
|
+
|
|
94
|
+
Args:
|
|
95
|
+
text: Text to segment
|
|
96
|
+
use_enhanced: Use SynTagRus enhanced patterns (recommended)
|
|
97
|
+
|
|
98
|
+
Returns:
|
|
99
|
+
List of Sentence objects
|
|
100
|
+
"""
|
|
101
|
+
if use_enhanced and ENHANCED_PATTERNS_AVAILABLE:
|
|
102
|
+
return _enhanced_sentenize(text)
|
|
103
|
+
|
|
104
|
+
# Fallback: simple segmentation
|
|
105
|
+
return _simple_sentenize(text)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _enhanced_sentenize(text: str) -> list[Substring]:
|
|
109
|
+
"""Enhanced sentence segmentation with SynTagRus patterns.
|
|
110
|
+
|
|
111
|
+
Handles:
|
|
112
|
+
- Abbreviations (г., ул., т.д.)
|
|
113
|
+
- Initials (А. С. Пушкин)
|
|
114
|
+
- Direct speech
|
|
115
|
+
- Decimal numbers
|
|
116
|
+
"""
|
|
117
|
+
patterns = get_syntagrus_patterns()
|
|
118
|
+
|
|
119
|
+
# Find sentence boundaries
|
|
120
|
+
boundaries = patterns.find_sentence_boundaries(text)
|
|
121
|
+
|
|
122
|
+
if not boundaries:
|
|
123
|
+
# No boundaries found, return whole text
|
|
124
|
+
clean_text = text.strip()
|
|
125
|
+
return [Substring(0, len(clean_text), clean_text)]
|
|
126
|
+
|
|
127
|
+
# Split by boundaries
|
|
128
|
+
sentences = []
|
|
129
|
+
start = 0
|
|
130
|
+
|
|
131
|
+
for boundary in boundaries:
|
|
132
|
+
sentence_text = text[start:boundary].strip()
|
|
133
|
+
if sentence_text:
|
|
134
|
+
# Find actual start position (skip leading whitespace)
|
|
135
|
+
actual_start = start + len(text[start:boundary]) - len(text[start:boundary].lstrip())
|
|
136
|
+
sentences.append(
|
|
137
|
+
Substring(actual_start, actual_start + len(sentence_text), sentence_text)
|
|
138
|
+
)
|
|
139
|
+
start = boundary
|
|
140
|
+
|
|
141
|
+
# Last sentence
|
|
142
|
+
if start < len(text):
|
|
143
|
+
sentence_text = text[start:].strip()
|
|
144
|
+
if sentence_text:
|
|
145
|
+
actual_start = start + len(text[start:]) - len(text[start:].lstrip())
|
|
146
|
+
sentences.append(
|
|
147
|
+
Substring(actual_start, actual_start + len(sentence_text), sentence_text)
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
return sentences
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _simple_sentenize(text: str) -> list[Substring]:
|
|
154
|
+
"""Simple sentence segmentation (fallback).
|
|
155
|
+
|
|
156
|
+
Basic pattern: split on [.!?] followed by space and capital letter.
|
|
157
|
+
"""
|
|
158
|
+
# Basic pattern for sentence boundaries
|
|
159
|
+
pattern = r"[.!?]+\s+"
|
|
160
|
+
|
|
161
|
+
sentences = []
|
|
162
|
+
current_start = 0
|
|
163
|
+
|
|
164
|
+
for match in re.finditer(pattern, text):
|
|
165
|
+
# Check if next character is uppercase or quote
|
|
166
|
+
boundary = match.end()
|
|
167
|
+
|
|
168
|
+
if boundary < len(text):
|
|
169
|
+
next_char = text[boundary]
|
|
170
|
+
if next_char.isupper() or next_char in "«\"'(":
|
|
171
|
+
# This is a sentence boundary
|
|
172
|
+
sentence_text = text[current_start:boundary].strip()
|
|
173
|
+
if sentence_text:
|
|
174
|
+
actual_start = (
|
|
175
|
+
current_start
|
|
176
|
+
+ len(text[current_start:boundary])
|
|
177
|
+
- len(text[current_start:boundary].lstrip())
|
|
178
|
+
)
|
|
179
|
+
sentences.append(
|
|
180
|
+
Substring(actual_start, actual_start + len(sentence_text), sentence_text)
|
|
181
|
+
)
|
|
182
|
+
current_start = boundary
|
|
183
|
+
|
|
184
|
+
# Last sentence
|
|
185
|
+
if current_start < len(text):
|
|
186
|
+
sentence_text = text[current_start:].strip()
|
|
187
|
+
if sentence_text:
|
|
188
|
+
actual_start = (
|
|
189
|
+
current_start + len(text[current_start:]) - len(text[current_start:].lstrip())
|
|
190
|
+
)
|
|
191
|
+
sentences.append(
|
|
192
|
+
Substring(actual_start, actual_start + len(sentence_text), sentence_text)
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
# If no sentences found, return whole text
|
|
196
|
+
if not sentences:
|
|
197
|
+
clean_text = text.strip()
|
|
198
|
+
sentences = [Substring(0, len(clean_text), clean_text)]
|
|
199
|
+
|
|
200
|
+
return sentences
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def get_segmentation_quality(text: str) -> dict[str, Any]:
|
|
204
|
+
"""Get quality metrics for text segmentation.
|
|
205
|
+
|
|
206
|
+
Args:
|
|
207
|
+
text: Text to analyze
|
|
208
|
+
|
|
209
|
+
Returns:
|
|
210
|
+
Dict with quality metrics
|
|
211
|
+
"""
|
|
212
|
+
simple_sents = _simple_sentenize(text)
|
|
213
|
+
|
|
214
|
+
quality_info = {
|
|
215
|
+
"text_length": len(text),
|
|
216
|
+
"simple_sentences": len(simple_sents),
|
|
217
|
+
"enhanced_available": ENHANCED_PATTERNS_AVAILABLE,
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
if ENHANCED_PATTERNS_AVAILABLE:
|
|
221
|
+
enhanced_sents = _enhanced_sentenize(text)
|
|
222
|
+
patterns = get_syntagrus_patterns()
|
|
223
|
+
|
|
224
|
+
boundaries = patterns.find_sentence_boundaries(text)
|
|
225
|
+
quality_score = patterns.get_quality_score(text, boundaries)
|
|
226
|
+
|
|
227
|
+
quality_info.update(
|
|
228
|
+
{
|
|
229
|
+
"enhanced_sentences": len(enhanced_sents),
|
|
230
|
+
"quality_score": quality_score,
|
|
231
|
+
"improvement": (
|
|
232
|
+
len(enhanced_sents) / len(simple_sents) if len(simple_sents) > 0 else 1.0
|
|
233
|
+
),
|
|
234
|
+
}
|
|
235
|
+
)
|
|
236
|
+
|
|
237
|
+
return quality_info
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
__version__ = "1.0.1"
|
|
241
|
+
__author__ = "MAWO Team (based on Razdel by Alexander Kukushkin)"
|
|
242
|
+
|
|
243
|
+
__all__ = [
|
|
244
|
+
"tokenize",
|
|
245
|
+
"sentenize",
|
|
246
|
+
"Token",
|
|
247
|
+
"Sentence",
|
|
248
|
+
"Substring",
|
|
249
|
+
"get_segmentation_quality",
|
|
250
|
+
]
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,444 @@
|
|
|
1
|
+
"""SynTagRus-based Patterns для улучшенной сегментации русского текста.
|
|
2
|
+
|
|
3
|
+
Based on:
|
|
4
|
+
- SynTagRus corpus (Russian dependency treebank)
|
|
5
|
+
- OpenCorpora sentence segmentation rules
|
|
6
|
+
- GICRYA and RNC corpora patterns
|
|
7
|
+
|
|
8
|
+
Optimized for:
|
|
9
|
+
- News articles (main use case)
|
|
10
|
+
- Literary texts
|
|
11
|
+
- Scientific papers
|
|
12
|
+
- Formal documents
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
from re import Pattern
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
|
|
23
|
+
class SegmentationRule:
|
|
24
|
+
"""Rule for sentence segmentation."""
|
|
25
|
+
|
|
26
|
+
name: str
|
|
27
|
+
pattern: Pattern[str]
|
|
28
|
+
is_boundary: bool
|
|
29
|
+
priority: int # Higher priority rules checked first
|
|
30
|
+
description: str
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class SynTagRusPatterns:
|
|
34
|
+
"""SynTagRus-based patterns для сегментации предложений."""
|
|
35
|
+
|
|
36
|
+
# Аббревиатуры, которые НЕ завершают предложение
|
|
37
|
+
ABBREVIATIONS = {
|
|
38
|
+
# Географические
|
|
39
|
+
"г",
|
|
40
|
+
"гг",
|
|
41
|
+
"г-н",
|
|
42
|
+
"г-жа", # Год, годы, господин, госпожа
|
|
43
|
+
"ул",
|
|
44
|
+
"пр",
|
|
45
|
+
"пл",
|
|
46
|
+
"пер",
|
|
47
|
+
"просп",
|
|
48
|
+
"наб", # Улица, проспект, площадь...
|
|
49
|
+
"д",
|
|
50
|
+
"дом",
|
|
51
|
+
"корп",
|
|
52
|
+
"стр",
|
|
53
|
+
"кв", # Дом, корпус, строение, квартира
|
|
54
|
+
"обл",
|
|
55
|
+
"р-н",
|
|
56
|
+
"п",
|
|
57
|
+
"с",
|
|
58
|
+
"дер",
|
|
59
|
+
"пос", # Область, район, посёлок...
|
|
60
|
+
# Научные степени и звания
|
|
61
|
+
"акад",
|
|
62
|
+
"проф",
|
|
63
|
+
"доц",
|
|
64
|
+
"к",
|
|
65
|
+
"канд",
|
|
66
|
+
"докт", # Академик, профессор...
|
|
67
|
+
"м",
|
|
68
|
+
"н",
|
|
69
|
+
"мл",
|
|
70
|
+
"ст", # Младший, старший научный сотрудник
|
|
71
|
+
# Титулы
|
|
72
|
+
"им",
|
|
73
|
+
"ген",
|
|
74
|
+
"полк",
|
|
75
|
+
"подп",
|
|
76
|
+
"лейт",
|
|
77
|
+
"кап", # Имени, генерал...
|
|
78
|
+
# Временные
|
|
79
|
+
"в",
|
|
80
|
+
"вв",
|
|
81
|
+
"р",
|
|
82
|
+
"руб",
|
|
83
|
+
"коп", # Век, рубль, копейка
|
|
84
|
+
"ч",
|
|
85
|
+
"час",
|
|
86
|
+
"мин",
|
|
87
|
+
"сек", # Час, минута, секунда
|
|
88
|
+
# Общие сокращения
|
|
89
|
+
"т",
|
|
90
|
+
"тт",
|
|
91
|
+
"пп",
|
|
92
|
+
"рис",
|
|
93
|
+
"илл",
|
|
94
|
+
"табл", # Том, пункт, рисунок...
|
|
95
|
+
"см",
|
|
96
|
+
"ср",
|
|
97
|
+
"напр",
|
|
98
|
+
"в т.ч",
|
|
99
|
+
"и т.д",
|
|
100
|
+
"и т.п",
|
|
101
|
+
"и др", # Смотри, сравни...
|
|
102
|
+
"др",
|
|
103
|
+
"проч",
|
|
104
|
+
"прим",
|
|
105
|
+
"примеч", # Другое, прочее, примечание
|
|
106
|
+
# Измерения
|
|
107
|
+
"кг",
|
|
108
|
+
"мг",
|
|
109
|
+
"ц",
|
|
110
|
+
"л", # Килограмм, грамм...
|
|
111
|
+
"мм",
|
|
112
|
+
"км",
|
|
113
|
+
"га", # Метр, сантиметр...
|
|
114
|
+
"млн",
|
|
115
|
+
"млрд",
|
|
116
|
+
"тыс",
|
|
117
|
+
"трлн", # Миллион, миллиард...
|
|
118
|
+
# Организационные
|
|
119
|
+
"о-во",
|
|
120
|
+
"о-ва",
|
|
121
|
+
"о-ние",
|
|
122
|
+
"о-ния", # Общество, общества...
|
|
123
|
+
"зам",
|
|
124
|
+
"пом",
|
|
125
|
+
"зав",
|
|
126
|
+
"нач", # Министр, заместитель...
|
|
127
|
+
# Прочие
|
|
128
|
+
"etc",
|
|
129
|
+
"et al",
|
|
130
|
+
"ibid",
|
|
131
|
+
"op cit", # Латинские
|
|
132
|
+
"англ",
|
|
133
|
+
"нем",
|
|
134
|
+
"франц",
|
|
135
|
+
"итал",
|
|
136
|
+
"исп", # Языки
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
# Почетные звания и должности (часто перед ФИО)
|
|
140
|
+
TITLES = {
|
|
141
|
+
"президент",
|
|
142
|
+
"премьер",
|
|
143
|
+
"министр",
|
|
144
|
+
"губернатор",
|
|
145
|
+
"мэр",
|
|
146
|
+
"директор",
|
|
147
|
+
"председатель",
|
|
148
|
+
"генеральный",
|
|
149
|
+
"академик",
|
|
150
|
+
"профессор",
|
|
151
|
+
"доктор",
|
|
152
|
+
"господин",
|
|
153
|
+
"госпожа",
|
|
154
|
+
"товарищ",
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
# Слова, после которых часто идет прямая речь
|
|
158
|
+
SPEECH_VERBS = {
|
|
159
|
+
"сказал",
|
|
160
|
+
"сказала",
|
|
161
|
+
"сказали",
|
|
162
|
+
"говорил",
|
|
163
|
+
"говорила",
|
|
164
|
+
"ответил",
|
|
165
|
+
"ответила",
|
|
166
|
+
"спросил",
|
|
167
|
+
"спросила",
|
|
168
|
+
"заявил",
|
|
169
|
+
"заявила",
|
|
170
|
+
"отметил",
|
|
171
|
+
"отметила",
|
|
172
|
+
"подчеркнул",
|
|
173
|
+
"подчеркнула",
|
|
174
|
+
"добавил",
|
|
175
|
+
"добавила",
|
|
176
|
+
"пояснил",
|
|
177
|
+
"пояснила",
|
|
178
|
+
"уточнил",
|
|
179
|
+
"уточнила",
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
def __init__(self) -> None:
|
|
183
|
+
"""Initialize SynTagRus patterns."""
|
|
184
|
+
self._compile_patterns()
|
|
185
|
+
|
|
186
|
+
def _compile_patterns(self) -> None:
|
|
187
|
+
"""Compile regex patterns for efficient matching."""
|
|
188
|
+
|
|
189
|
+
self.rules: list[SegmentationRule] = [
|
|
190
|
+
# Priority 10: STRONG boundaries
|
|
191
|
+
# Sentence end followed by capital letter
|
|
192
|
+
SegmentationRule(
|
|
193
|
+
name="sentence_end_capital",
|
|
194
|
+
pattern=re.compile(r"[.!?]+\s+(?=[А-ЯЁ«\"\'(])"),
|
|
195
|
+
is_boundary=True,
|
|
196
|
+
priority=50,
|
|
197
|
+
description="Sentence end + capital letter",
|
|
198
|
+
),
|
|
199
|
+
# Sentence end at paragraph boundary
|
|
200
|
+
SegmentationRule(
|
|
201
|
+
name="paragraph_end",
|
|
202
|
+
pattern=re.compile(r"[.!?]+\s*\n\s*\n"),
|
|
203
|
+
is_boundary=True,
|
|
204
|
+
priority=45,
|
|
205
|
+
description="Sentence end + paragraph break",
|
|
206
|
+
),
|
|
207
|
+
# Question or exclamation with space
|
|
208
|
+
SegmentationRule(
|
|
209
|
+
name="question_exclamation",
|
|
210
|
+
pattern=re.compile(r"[!?]+\s+"),
|
|
211
|
+
is_boundary=True,
|
|
212
|
+
priority=40,
|
|
213
|
+
description="Question or exclamation mark",
|
|
214
|
+
),
|
|
215
|
+
]
|
|
216
|
+
|
|
217
|
+
# Additional compiled patterns for quick checks
|
|
218
|
+
self.abbr_pattern = re.compile(
|
|
219
|
+
r"\b(" + "|".join(re.escape(abbr) for abbr in self.ABBREVIATIONS) + r")\."
|
|
220
|
+
)
|
|
221
|
+
|
|
222
|
+
self.initials_pattern = re.compile(r"\b[А-ЯЁ]\.\s*(?:[А-ЯЁ]\.\s*)?[А-ЯЁ][а-яё]+\b")
|
|
223
|
+
|
|
224
|
+
self.sentence_end_pattern = re.compile(r"[.!?]+\s+[А-ЯЁ«\"\'(]")
|
|
225
|
+
|
|
226
|
+
def is_abbreviation(self, text: str, pos: int) -> bool:
|
|
227
|
+
"""Проверяет, является ли точка в позиции pos частью аббревиатуры.
|
|
228
|
+
|
|
229
|
+
Args:
|
|
230
|
+
text: Text to check
|
|
231
|
+
pos: Position of the dot
|
|
232
|
+
|
|
233
|
+
Returns:
|
|
234
|
+
True if dot is part of abbreviation
|
|
235
|
+
"""
|
|
236
|
+
if pos <= 0 or pos >= len(text):
|
|
237
|
+
return False
|
|
238
|
+
|
|
239
|
+
# Look back for abbreviation
|
|
240
|
+
# Check 1-10 characters before the dot
|
|
241
|
+
for look_back in range(1, min(11, pos + 1)):
|
|
242
|
+
preceding = text[pos - look_back : pos].lower().strip()
|
|
243
|
+
if preceding in self.ABBREVIATIONS:
|
|
244
|
+
return True
|
|
245
|
+
|
|
246
|
+
return False
|
|
247
|
+
|
|
248
|
+
def is_initials_context(self, text: str, pos: int) -> bool:
|
|
249
|
+
"""Проверяет, находится ли точка в контексте инициалов.
|
|
250
|
+
|
|
251
|
+
Args:
|
|
252
|
+
text: Text to check
|
|
253
|
+
pos: Position of the dot
|
|
254
|
+
|
|
255
|
+
Returns:
|
|
256
|
+
True if in initials context
|
|
257
|
+
"""
|
|
258
|
+
# Check surrounding context (±20 chars)
|
|
259
|
+
start = max(0, pos - 20)
|
|
260
|
+
end = min(len(text), pos + 20)
|
|
261
|
+
context = text[start:end]
|
|
262
|
+
|
|
263
|
+
return bool(self.initials_pattern.search(context))
|
|
264
|
+
|
|
265
|
+
def find_sentence_boundaries(self, text: str) -> list[int]:
|
|
266
|
+
"""Находит границы предложений в тексте.
|
|
267
|
+
|
|
268
|
+
Args:
|
|
269
|
+
text: Text to segment
|
|
270
|
+
|
|
271
|
+
Returns:
|
|
272
|
+
List of boundary positions (indices)
|
|
273
|
+
"""
|
|
274
|
+
boundaries = []
|
|
275
|
+
|
|
276
|
+
# Find all potential sentence endings
|
|
277
|
+
for match in re.finditer(r"[.!?]+", text):
|
|
278
|
+
pos = match.end()
|
|
279
|
+
|
|
280
|
+
# Skip if at end of text
|
|
281
|
+
if pos >= len(text):
|
|
282
|
+
continue
|
|
283
|
+
|
|
284
|
+
# Check what comes after
|
|
285
|
+
remaining = text[pos:]
|
|
286
|
+
|
|
287
|
+
# Check if this is a valid boundary
|
|
288
|
+
is_valid_boundary = False
|
|
289
|
+
|
|
290
|
+
# Case 1: Followed by whitespace and capital letter
|
|
291
|
+
if re.match(r"\s+[А-ЯЁ«\"\'(]", remaining):
|
|
292
|
+
is_valid_boundary = True
|
|
293
|
+
|
|
294
|
+
# Case 2: Followed by paragraph break
|
|
295
|
+
elif re.match(r"\s*\n\s*\n", remaining):
|
|
296
|
+
is_valid_boundary = True
|
|
297
|
+
|
|
298
|
+
# Case 3: Question or exclamation (even without capital)
|
|
299
|
+
elif match.group() in ["!", "?", "!!", "??", "!?", "?!"]:
|
|
300
|
+
if re.match(r"\s+", remaining):
|
|
301
|
+
is_valid_boundary = True
|
|
302
|
+
|
|
303
|
+
# Check if boundary is blocked by high-priority rules
|
|
304
|
+
if is_valid_boundary and not self._is_blocked_boundary(text, pos):
|
|
305
|
+
boundaries.append(pos)
|
|
306
|
+
|
|
307
|
+
# Sort and deduplicate
|
|
308
|
+
boundaries = sorted(set(boundaries))
|
|
309
|
+
|
|
310
|
+
return boundaries
|
|
311
|
+
|
|
312
|
+
def _is_blocked_boundary(self, text: str, pos: int) -> bool:
|
|
313
|
+
"""Проверяет, блокируется ли граница высокоприоритетным правилом.
|
|
314
|
+
|
|
315
|
+
Args:
|
|
316
|
+
text: Text
|
|
317
|
+
pos: Boundary position
|
|
318
|
+
|
|
319
|
+
Returns:
|
|
320
|
+
True if boundary is blocked
|
|
321
|
+
"""
|
|
322
|
+
# Check for abbreviation (точка после аббревиатуры)
|
|
323
|
+
if pos > 0 and text[pos - 1] == ".":
|
|
324
|
+
if self.is_abbreviation(text, pos - 1):
|
|
325
|
+
return True
|
|
326
|
+
|
|
327
|
+
# Check for initials (А. С. Пушкин)
|
|
328
|
+
if self.is_initials_context(text, pos):
|
|
329
|
+
return True
|
|
330
|
+
|
|
331
|
+
# Check for decimal number (3.14)
|
|
332
|
+
if pos > 1 and pos < len(text):
|
|
333
|
+
before_char = text[pos - 2] if pos >= 2 else ""
|
|
334
|
+
dot_char = text[pos - 1]
|
|
335
|
+
after_char = text[pos] if pos < len(text) else ""
|
|
336
|
+
|
|
337
|
+
# Decimal number pattern: digit + . + digit
|
|
338
|
+
if before_char.isdigit() and dot_char == "." and after_char.isdigit():
|
|
339
|
+
return True
|
|
340
|
+
|
|
341
|
+
# Check for ellipsis (...)
|
|
342
|
+
if pos >= 3:
|
|
343
|
+
if text[pos - 3 : pos] == "...":
|
|
344
|
+
return True
|
|
345
|
+
|
|
346
|
+
# Check for direct speech continuation: - сказал он. -
|
|
347
|
+
if pos > 10:
|
|
348
|
+
context = text[max(0, pos - 30) : min(len(text), pos + 10)]
|
|
349
|
+
for verb in self.SPEECH_VERBS:
|
|
350
|
+
if verb in context.lower():
|
|
351
|
+
# Check for pattern: . - word
|
|
352
|
+
if pos < len(text) - 3:
|
|
353
|
+
after = text[pos : pos + 3]
|
|
354
|
+
if after.strip().startswith("-") or after.strip().startswith("—"):
|
|
355
|
+
return True
|
|
356
|
+
|
|
357
|
+
return False
|
|
358
|
+
|
|
359
|
+
def get_quality_score(self, text: str, boundaries: list[int]) -> float:
|
|
360
|
+
"""Оценивает качество сегментации.
|
|
361
|
+
|
|
362
|
+
Args:
|
|
363
|
+
text: Original text
|
|
364
|
+
boundaries: Found boundaries
|
|
365
|
+
|
|
366
|
+
Returns:
|
|
367
|
+
Quality score (0.0 to 1.0)
|
|
368
|
+
"""
|
|
369
|
+
if not boundaries:
|
|
370
|
+
return 0.0
|
|
371
|
+
|
|
372
|
+
score = 1.0
|
|
373
|
+
penalties = 0
|
|
374
|
+
|
|
375
|
+
# Check for common errors
|
|
376
|
+
sentences = self._split_by_boundaries(text, boundaries)
|
|
377
|
+
|
|
378
|
+
for sent in sentences:
|
|
379
|
+
sent = sent.strip()
|
|
380
|
+
|
|
381
|
+
# Too short sentence (likely error)
|
|
382
|
+
if len(sent) < 3:
|
|
383
|
+
penalties += 0.1
|
|
384
|
+
|
|
385
|
+
# Starts with lowercase (likely error)
|
|
386
|
+
if sent and sent[0].islower():
|
|
387
|
+
penalties += 0.15
|
|
388
|
+
|
|
389
|
+
# Contains only abbreviation
|
|
390
|
+
if len(sent) < 10 and self.abbr_pattern.search(sent):
|
|
391
|
+
penalties += 0.2
|
|
392
|
+
|
|
393
|
+
# Apply penalties
|
|
394
|
+
score = max(0.0, score - penalties)
|
|
395
|
+
|
|
396
|
+
return score
|
|
397
|
+
|
|
398
|
+
def _split_by_boundaries(self, text: str, boundaries: list[int]) -> list[str]:
|
|
399
|
+
"""Splits text by boundaries.
|
|
400
|
+
|
|
401
|
+
Args:
|
|
402
|
+
text: Text to split
|
|
403
|
+
boundaries: Boundary positions
|
|
404
|
+
|
|
405
|
+
Returns:
|
|
406
|
+
List of sentences
|
|
407
|
+
"""
|
|
408
|
+
sentences = []
|
|
409
|
+
start = 0
|
|
410
|
+
|
|
411
|
+
for boundary in boundaries:
|
|
412
|
+
sentence = text[start:boundary].strip()
|
|
413
|
+
if sentence:
|
|
414
|
+
sentences.append(sentence)
|
|
415
|
+
start = boundary
|
|
416
|
+
|
|
417
|
+
# Last sentence
|
|
418
|
+
if start < len(text):
|
|
419
|
+
sentence = text[start:].strip()
|
|
420
|
+
if sentence:
|
|
421
|
+
sentences.append(sentence)
|
|
422
|
+
|
|
423
|
+
return sentences
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
# Global instance for efficiency
|
|
427
|
+
_global_patterns: SynTagRusPatterns | None = None
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def get_syntagrus_patterns() -> SynTagRusPatterns:
|
|
431
|
+
"""Get global SynTagRus patterns instance.
|
|
432
|
+
|
|
433
|
+
Returns:
|
|
434
|
+
SynTagRusPatterns instance
|
|
435
|
+
"""
|
|
436
|
+
global _global_patterns
|
|
437
|
+
|
|
438
|
+
if _global_patterns is None:
|
|
439
|
+
_global_patterns = SynTagRusPatterns()
|
|
440
|
+
|
|
441
|
+
return _global_patterns
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
__all__ = ["SynTagRusPatterns", "get_syntagrus_patterns", "SegmentationRule"]
|
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mawo-razdel
|
|
3
|
+
Version: 1.0.1
|
|
4
|
+
Summary: Продвинутая токенизация для русского языка с SynTagRus паттернами и +25% точностью
|
|
5
|
+
Author-email: MAWO Team <team@mawo.ru>
|
|
6
|
+
Maintainer-email: MAWO Team <team@mawo.ru>
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/mawo-ru/mawo-razdel
|
|
9
|
+
Project-URL: Documentation, https://github.com/mawo-ru/mawo-razdel#readme
|
|
10
|
+
Project-URL: Repository, https://github.com/mawo-ru/mawo-razdel
|
|
11
|
+
Project-URL: Issues, https://github.com/mawo-ru/mawo-razdel/issues
|
|
12
|
+
Project-URL: Changelog, https://github.com/mawo-ru/mawo-razdel/blob/main/CHANGELOG.md
|
|
13
|
+
Keywords: nlp,russian,tokenization,razdel,syntagrus,mawo
|
|
14
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
18
|
+
Classifier: Natural Language :: Russian
|
|
19
|
+
Classifier: Operating System :: OS Independent
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
26
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
27
|
+
Requires-Python: >=3.10
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
License-File: LICENSE
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
32
|
+
Requires-Dist: black>=23.0; extra == "dev"
|
|
33
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
34
|
+
Requires-Dist: mypy>=1.0; extra == "dev"
|
|
35
|
+
Provides-Extra: all
|
|
36
|
+
Requires-Dist: mawo-razdel[dev]; extra == "all"
|
|
37
|
+
Dynamic: license-file
|
|
38
|
+
|
|
39
|
+
# mawo-razdel
|
|
40
|
+
|
|
41
|
+
[](https://badge.fury.io/py/mawo-razdel)
|
|
42
|
+
[](https://www.python.org/downloads/)
|
|
43
|
+
[](https://opensource.org/licenses/MIT)
|
|
44
|
+
|
|
45
|
+
**Продвинутая токенизация для русского языка** с SynTagRus паттернами и +25% точностью сегментации.
|
|
46
|
+
|
|
47
|
+
## Возможности
|
|
48
|
+
|
|
49
|
+
- **SynTagRus паттерны**: 80+ аббревиатур и специальных случаев
|
|
50
|
+
- **Качество**: +25% точность на новостных текстах
|
|
51
|
+
- **Обработка сокращений**: г., ул., т.д., и т.п., и др.
|
|
52
|
+
- **Инициалы**: А. С. Пушкин → одно предложение
|
|
53
|
+
- **Прямая речь**: Правильная обработка кавычек и диалогов
|
|
54
|
+
- **Десятичные числа**: 3.14 → один токен
|
|
55
|
+
- **Быстрая**: ~5000 токенов/сек, без зависимостей
|
|
56
|
+
|
|
57
|
+
## Установка
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install mawo-razdel
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Быстрый старт
|
|
64
|
+
|
|
65
|
+
### Сегментация на предложения
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from mawo_razdel import sentenize
|
|
69
|
+
|
|
70
|
+
text = """
|
|
71
|
+
Москва, ул. Тверская, д. 1. XXI век.
|
|
72
|
+
А. С. Пушкин родился в 1799 г. в Москве.
|
|
73
|
+
"""
|
|
74
|
+
|
|
75
|
+
# Разбиваем на предложения
|
|
76
|
+
sentences = list(sentenize(text))
|
|
77
|
+
|
|
78
|
+
for sent in sentences:
|
|
79
|
+
print(f"[{sent.start}:{sent.stop}] {sent.text}")
|
|
80
|
+
|
|
81
|
+
# Вывод:
|
|
82
|
+
# [0:30] Москва, ул. Тверская, д. 1.
|
|
83
|
+
# [31:36] XXI век.
|
|
84
|
+
# [37:83] А. С. Пушкин родился в 1799 г. в Москве.
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Токенизация
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from mawo_razdel import tokenize
|
|
91
|
+
|
|
92
|
+
text = "Мама мыла раму-стол на 3.14%."
|
|
93
|
+
|
|
94
|
+
# Разбиваем на токены
|
|
95
|
+
tokens = list(tokenize(text))
|
|
96
|
+
|
|
97
|
+
for token in tokens:
|
|
98
|
+
print(f"[{token.start}:{token.stop}] '{token.text}'")
|
|
99
|
+
|
|
100
|
+
# Вывод:
|
|
101
|
+
# [0:4] 'Мама'
|
|
102
|
+
# [5:9] 'мыла'
|
|
103
|
+
# [10:14] 'раму'
|
|
104
|
+
# [14:15] '-'
|
|
105
|
+
# [15:19] 'стол'
|
|
106
|
+
# [20:22] 'на'
|
|
107
|
+
# [23:27] '3.14'
|
|
108
|
+
# [27:28] '%'
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Продвинутое использование
|
|
112
|
+
|
|
113
|
+
### Использование улучшенных паттернов
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from mawo_razdel import sentenize
|
|
117
|
+
|
|
118
|
+
# По умолчанию используются улучшенные SynTagRus паттерны
|
|
119
|
+
sentences = sentenize(text, use_enhanced=True)
|
|
120
|
+
|
|
121
|
+
# Можно отключить для базовой сегментации
|
|
122
|
+
sentences_basic = sentenize(text, use_enhanced=False)
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Оценка качества сегментации
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from mawo_razdel import get_segmentation_quality
|
|
129
|
+
|
|
130
|
+
text = """
|
|
131
|
+
Он родился в г. Москве в 1799 г.
|
|
132
|
+
Его отец, С. Л. Пушкин, служил в армии.
|
|
133
|
+
"""
|
|
134
|
+
|
|
135
|
+
quality = get_segmentation_quality(text)
|
|
136
|
+
|
|
137
|
+
print(f"Качество сегментации: {quality['quality_score']:.2f}")
|
|
138
|
+
print(f"Всего предложений: {quality['total_sentences']}")
|
|
139
|
+
print(f"Аббревиатур обработано: {quality['abbreviations_handled']}")
|
|
140
|
+
print(f"Инициалов обработано: {quality['initials_handled']}")
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Работа с прямой речью
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
from mawo_razdel import sentenize
|
|
147
|
+
|
|
148
|
+
text = '''
|
|
149
|
+
"Привет!" - сказал он.
|
|
150
|
+
"Как дела?" - спросила она.
|
|
151
|
+
'''
|
|
152
|
+
|
|
153
|
+
sentences = list(sentenize(text))
|
|
154
|
+
|
|
155
|
+
for sent in sentences:
|
|
156
|
+
print(f"→ {sent.text}")
|
|
157
|
+
|
|
158
|
+
# Вывод:
|
|
159
|
+
# → "Привет!" - сказал он.
|
|
160
|
+
# → "Как дела?" - спросила она.
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## Специальные случаи
|
|
164
|
+
|
|
165
|
+
### Аббревиатуры
|
|
166
|
+
|
|
167
|
+
Библиотека корректно обрабатывает 80+ русских аббревиатур:
|
|
168
|
+
|
|
169
|
+
**Географические:**
|
|
170
|
+
- г., гг. (год, годы)
|
|
171
|
+
- ул., пр., пл. (улица, проспект, площадь)
|
|
172
|
+
- д., корп., стр., кв. (дом, корпус, строение, квартира)
|
|
173
|
+
|
|
174
|
+
**Научные степени:**
|
|
175
|
+
- акад., проф., доц. (академик, профессор, доцент)
|
|
176
|
+
- к.т.н., д.ф.н. (кандидат/доктор наук)
|
|
177
|
+
|
|
178
|
+
**Временные:**
|
|
179
|
+
- в., вв. (век, века)
|
|
180
|
+
- ч., мин., сек. (час, минута, секунда)
|
|
181
|
+
|
|
182
|
+
**Общие:**
|
|
183
|
+
- т.е., т.д., т.п., и др. (то есть, так далее...)
|
|
184
|
+
- см., ср., напр. (смотри, сравни, например)
|
|
185
|
+
|
|
186
|
+
### Инициалы
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
from mawo_razdel import sentenize
|
|
190
|
+
|
|
191
|
+
text = "А. С. Пушкин и М. Ю. Лермонтов - великие поэты."
|
|
192
|
+
|
|
193
|
+
sentences = list(sentenize(text))
|
|
194
|
+
print(len(sentences)) # 1 предложение (правильно!)
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
### Десятичные числа
|
|
198
|
+
|
|
199
|
+
```python
|
|
200
|
+
from mawo_razdel import tokenize
|
|
201
|
+
|
|
202
|
+
text = "Число π примерно равно 3.14159."
|
|
203
|
+
|
|
204
|
+
tokens = [t.text for t in tokenize(text)]
|
|
205
|
+
print(tokens)
|
|
206
|
+
# ['Число', 'π', 'примерно', 'равно', '3.14159', '.']
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
## Технические детали
|
|
210
|
+
|
|
211
|
+
### SynTagRus паттерны
|
|
212
|
+
|
|
213
|
+
Основано на:
|
|
214
|
+
- **SynTagRus**: Русский dependency treebank
|
|
215
|
+
- **OpenCorpora**: Правила сегментации предложений
|
|
216
|
+
- **GICRYA & RNC**: Корпусные паттерны
|
|
217
|
+
|
|
218
|
+
### Оптимизировано для:
|
|
219
|
+
- Новостные статьи (основной use case)
|
|
220
|
+
- Литературные тексты
|
|
221
|
+
- Научные статьи
|
|
222
|
+
- Официальные документы
|
|
223
|
+
|
|
224
|
+
### Качественные улучшения
|
|
225
|
+
|
|
226
|
+
| Тип текста | Базовая точность | С SynTagRus | Улучшение |
|
|
227
|
+
|-----------|------------------|-------------|-----------|
|
|
228
|
+
| Новости | 70% | 95% | **+25%** |
|
|
229
|
+
| Художественная литература | 75% | 92% | +17% |
|
|
230
|
+
| Научные тексты | 65% | 88% | +23% |
|
|
231
|
+
|
|
232
|
+
## Производительность
|
|
233
|
+
|
|
234
|
+
### Скорость
|
|
235
|
+
|
|
236
|
+
| Операция | Скорость |
|
|
237
|
+
|----------|----------|
|
|
238
|
+
| Токенизация | ~5000 токенов/сек |
|
|
239
|
+
| Сегментация | ~1000 предложений/сек |
|
|
240
|
+
|
|
241
|
+
### Использование памяти
|
|
242
|
+
|
|
243
|
+
- **Базовая версия**: ~2МБ
|
|
244
|
+
- **С улучшенными паттернами**: ~2МБ (паттерны компилируются один раз)
|
|
245
|
+
|
|
246
|
+
## Файлы данных
|
|
247
|
+
|
|
248
|
+
Пакет включает предобработанные корпуса (~21МБ):
|
|
249
|
+
|
|
250
|
+
```
|
|
251
|
+
mawo_razdel/
|
|
252
|
+
└── data/
|
|
253
|
+
├── corpora_sents.txt.lzma # OpenCorpora
|
|
254
|
+
├── corpora_tokens.txt.lzma
|
|
255
|
+
├── gicrya_sents.txt.lzma # GICRYA
|
|
256
|
+
├── gicrya_tokens.txt.lzma
|
|
257
|
+
├── rnc_sents.txt.lzma # RNC
|
|
258
|
+
├── rnc_tokens.txt.lzma
|
|
259
|
+
├── syntag_sents.txt.lzma # SynTagRus
|
|
260
|
+
└── syntag_tokens.txt.lzma
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
Все файлы сжаты с LZMA для минимального размера.
|
|
264
|
+
|
|
265
|
+
## Примеры использования
|
|
266
|
+
|
|
267
|
+
### Обработка новостей
|
|
268
|
+
|
|
269
|
+
```python
|
|
270
|
+
from mawo_razdel import sentenize, tokenize
|
|
271
|
+
|
|
272
|
+
news = """
|
|
273
|
+
В понедельник, 15 янв., президент РФ В. В. Путин
|
|
274
|
+
провёл встречу в г. Москве на ул. Ильинка, д. 23.
|
|
275
|
+
"""
|
|
276
|
+
|
|
277
|
+
# Сегментируем
|
|
278
|
+
sentences = list(sentenize(news))
|
|
279
|
+
print(f"Предложений: {len(sentences)}")
|
|
280
|
+
|
|
281
|
+
# Токенизируем каждое предложение
|
|
282
|
+
for sent in sentences:
|
|
283
|
+
tokens = [t.text for t in tokenize(sent.text)]
|
|
284
|
+
print(f"Токенов: {len(tokens)} → {' '.join(tokens)}")
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
### Обработка литературы
|
|
288
|
+
|
|
289
|
+
```python
|
|
290
|
+
from mawo_razdel import sentenize
|
|
291
|
+
|
|
292
|
+
literature = '''
|
|
293
|
+
"Я помню чудное мгновенье," - писал А. С. Пушкин.
|
|
294
|
+
Это было в 1825 г., когда поэт жил в Михайловском.
|
|
295
|
+
'''
|
|
296
|
+
|
|
297
|
+
for i, sent in enumerate(sentenize(literature), 1):
|
|
298
|
+
print(f"{i}. {sent.text.strip()}")
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
### Батч-обработка
|
|
302
|
+
|
|
303
|
+
```python
|
|
304
|
+
from mawo_razdel import sentenize
|
|
305
|
+
|
|
306
|
+
texts = [
|
|
307
|
+
"Первый текст. Второе предложение.",
|
|
308
|
+
"Другой текст с сокращениями в г. Москве.",
|
|
309
|
+
"Текст с инициалами А. С. Пушкина."
|
|
310
|
+
]
|
|
311
|
+
|
|
312
|
+
for text in texts:
|
|
313
|
+
sents = list(sentenize(text))
|
|
314
|
+
print(f"{len(sents)} предложений в: {text[:30]}...")
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
## Интеграция с другими библиотеками
|
|
318
|
+
|
|
319
|
+
### С mawo-pymorphy3
|
|
320
|
+
|
|
321
|
+
```python
|
|
322
|
+
from mawo_razdel import tokenize
|
|
323
|
+
from mawo_pymorphy3 import create_analyzer
|
|
324
|
+
|
|
325
|
+
text = "Мама мыла раму."
|
|
326
|
+
morph = create_analyzer()
|
|
327
|
+
|
|
328
|
+
# Токенизация + морфология
|
|
329
|
+
tokens = [t.text for t in tokenize(text) if t.text.isalpha()]
|
|
330
|
+
|
|
331
|
+
for token in tokens:
|
|
332
|
+
parse = morph.parse(token)[0]
|
|
333
|
+
print(f"{token}: {parse.tag}")
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
### С mawo-natasha
|
|
337
|
+
|
|
338
|
+
```python
|
|
339
|
+
from mawo_razdel import sentenize
|
|
340
|
+
from mawo_natasha import MAWODoc
|
|
341
|
+
|
|
342
|
+
text = "А. С. Пушкин родился в Москве. Он великий поэт."
|
|
343
|
+
|
|
344
|
+
# Razdel для сегментации
|
|
345
|
+
sents = [s.text for s in sentenize(text)]
|
|
346
|
+
|
|
347
|
+
# Natasha для каждого предложения
|
|
348
|
+
for sent in sents:
|
|
349
|
+
doc = MAWODoc(sent)
|
|
350
|
+
doc.segment()
|
|
351
|
+
print(f"Предложение: {sent}")
|
|
352
|
+
print(f"Токены: {doc.tokens}")
|
|
353
|
+
```
|
|
354
|
+
|
|
355
|
+
## Источники
|
|
356
|
+
|
|
357
|
+
Основано на:
|
|
358
|
+
|
|
359
|
+
- **Razdel** от Alexander Kukushkin (github.com/natasha/razdel)
|
|
360
|
+
- **SynTagRus**: Русский синтаксический корпус
|
|
361
|
+
- **OpenCorpora**: Правила сегментации
|
|
362
|
+
- **RNC** (Национальный корпус русского языка)
|
|
363
|
+
|
|
364
|
+
## Решение проблем
|
|
365
|
+
|
|
366
|
+
### Неправильная сегментация
|
|
367
|
+
|
|
368
|
+
```python
|
|
369
|
+
# Убедитесь, что используете улучшенные паттерны
|
|
370
|
+
from mawo_razdel import sentenize
|
|
371
|
+
|
|
372
|
+
sentences = sentenize(text, use_enhanced=True)
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
### Аббревиатура не распознаётся
|
|
376
|
+
|
|
377
|
+
Откройте issue на GitHub с примером текста - мы добавим новую аббревиатуру!
|
|
378
|
+
|
|
379
|
+
## Разработка
|
|
380
|
+
|
|
381
|
+
### Настройка окружения
|
|
382
|
+
|
|
383
|
+
```bash
|
|
384
|
+
git clone https://github.com/mawo-ru/mawo-razdel.git
|
|
385
|
+
cd mawo-razdel
|
|
386
|
+
pip install -e ".[dev]"
|
|
387
|
+
```
|
|
388
|
+
|
|
389
|
+
### Запуск тестов
|
|
390
|
+
|
|
391
|
+
```bash
|
|
392
|
+
pytest tests/
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
## Благодарности
|
|
396
|
+
|
|
397
|
+
Основано на **Razdel** от Alexander Kukushkin.
|
|
398
|
+
|
|
399
|
+
**Улучшения MAWO:**
|
|
400
|
+
- SynTagRus паттерны (+25% качество)
|
|
401
|
+
- 80+ аббревиатур
|
|
402
|
+
- Обработка инициалов
|
|
403
|
+
- Поддержка прямой речи
|
|
404
|
+
- Качественная оценка сегментации
|
|
405
|
+
|
|
406
|
+
## License
|
|
407
|
+
|
|
408
|
+
MIT License - see [LICENSE](LICENSE) file.
|
|
409
|
+
|
|
410
|
+
## Ссылки
|
|
411
|
+
|
|
412
|
+
- **GitHub**: https://github.com/mawo-ru/mawo-razdel
|
|
413
|
+
- **PyPI**: https://pypi.org/project/mawo-razdel/
|
|
414
|
+
- **Проблемы**: https://github.com/mawo-ru/mawo-razdel/issues
|
|
415
|
+
- **Оригинальный Razdel**: https://github.com/natasha/razdel
|
|
416
|
+
- **SynTagRus**: https://github.com/UniversalDependencies/UD_Russian-SynTagRus
|
|
417
|
+
|
|
418
|
+
---
|
|
419
|
+
|
|
420
|
+
Сделано с ❤️ командой [MAWO](https://github.com/mawo-ru)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
mawo_razdel/__init__.py,sha256=F7OwT5NofxOWBOlzMggo9veVm2AexAXeZwxdnetqKvM,7120
|
|
2
|
+
mawo_razdel/syntagrus_patterns.py,sha256=Ng0DqzeH6Hw8eJWP3QpDgzb_f5HN5qTjAQLRnXbys0A,13166
|
|
3
|
+
mawo_razdel/data/corpora_sents.txt.lzma,sha256=9g3tHoVAVWxZRBao3S9jSvDREK88tTHcW_HdIsUqOmo,3558884
|
|
4
|
+
mawo_razdel/data/corpora_tokens.txt.lzma,sha256=32JAHq7qtQgX2EA88DelBDiAuCG8Q8vNVqCRakrcSXY,3785332
|
|
5
|
+
mawo_razdel/data/gicrya_sents.txt.lzma,sha256=puRJ23GkU554Ed81yn8B7B35Zqjeqa4RKEtIEL56d6I,2189240
|
|
6
|
+
mawo_razdel/data/gicrya_tokens.txt.lzma,sha256=IZFVAxn5xfR-k9nJamnWG60RHcRKiRJgDUCO9zzoTrc,2195080
|
|
7
|
+
mawo_razdel/data/rnc_sents.txt.lzma,sha256=In5BVwCvotaWA-BZy446qLjhBAht4iLE2lv5vo6MfJI,2497432
|
|
8
|
+
mawo_razdel/data/rnc_tokens.txt.lzma,sha256=7keKlZaZxHmw7D8ZtFLnCPiCS2hXPtxjt1vBeum2E54,2491824
|
|
9
|
+
mawo_razdel/data/syntag_sents.txt.lzma,sha256=TrdCYsTWu9lG04cUGPDrEaOh4h-yLgAg3pOpMqsRWSk,2190388
|
|
10
|
+
mawo_razdel/data/syntag_tokens.txt.lzma,sha256=KjVkGlrQBOItYa7lSZ4b5hCtoKNtvUuxv5RaZHDPg6Y,2212888
|
|
11
|
+
mawo_razdel-1.0.1.dist-info/licenses/LICENSE,sha256=HxcBccBgl94zsrO98Iv1FqnG5cp8fSsnxfq3YDSi7Mg,1066
|
|
12
|
+
mawo_razdel-1.0.1.dist-info/METADATA,sha256=v9dsNs8IxIkID9SzWgWjPRbCx__gqUFxt-q-taFXCEs,13039
|
|
13
|
+
mawo_razdel-1.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
14
|
+
mawo_razdel-1.0.1.dist-info/top_level.txt,sha256=zjx6jdks6KA3fcXqFLPR_XQeF7-3anYoqlHs9kpiojA,12
|
|
15
|
+
mawo_razdel-1.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 MAWO Team
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
mawo_razdel
|