mawo-razdel 1.0.3__py3-none-any.whl → 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mawo-razdel might be problematic. Click here for more details.

mawo_razdel/__init__.py CHANGED
@@ -1,28 +1,26 @@
1
1
  """MAWO RAZDEL - Enhanced Russian Tokenization
2
- Upgraded tokenization with SynTagRus patterns for better sentence segmentation.
2
+ Upgraded tokenization with 100% compatibility with original razdel.
3
3
 
4
4
  Features:
5
- - SynTagRus-based patterns (+25% quality on news)
5
+ - Full backward compatibility with razdel API
6
+ - All original razdel features preserved
7
+ - Additional SynTagRus patterns available
6
8
  - Abbreviation handling (г., ул., им., т.д.)
7
9
  - Initials support (А. С. Пушкин)
8
10
  - Direct speech patterns
9
- - Backward compatible API
10
11
  """
11
12
 
12
13
  from __future__ import annotations
13
14
 
14
- import re
15
- from typing import Any
15
+ # Import original razdel implementation (ported)
16
+ from .segmenters import sentenize as _original_sentenize
17
+ from .segmenters import tokenize as _original_tokenize
16
18
 
17
- # Try to import enhanced patterns
18
- try:
19
- from .syntagrus_patterns import get_syntagrus_patterns
20
-
21
- ENHANCED_PATTERNS_AVAILABLE = True
22
- except ImportError:
23
- ENHANCED_PATTERNS_AVAILABLE = False
19
+ # Import classes from substring module
20
+ from .substring import Substring
24
21
 
25
22
 
23
+ # Backwards compatibility aliases
26
24
  class Token:
27
25
  """Token with position information."""
28
26
 
@@ -51,219 +49,38 @@ class Sentence:
51
49
  )
52
50
 
53
51
 
54
- # Backwards compatibility alias
55
- class Substring:
56
- """Backwards compatibility class for old tests."""
57
-
58
- def __init__(self, start: int, stop: int, text: str) -> None:
59
- self.start = start
60
- self.stop = stop
61
- self.text = text
62
-
63
- def __repr__(self) -> str:
64
- return (
65
- f"Substring('{self.text[:30]}...')"
66
- if len(self.text) > 30
67
- else f"Substring('{self.text}')"
68
- )
69
-
70
-
71
- def tokenize(text: str, use_enhanced: bool = True) -> list[Substring]:
72
- """Токенизация русского текста.
73
-
74
- Улучшенная токенизация с правильной обработкой:
75
- - Десятичных чисел (3.14, 3,14)
76
- - Процентов (95.5%)
77
- - Диапазонов (1995-1999, 10:30-11:00)
78
- - Дробей (1/2, 3/4)
79
- - Телефонов, ID и т.д.
80
-
81
- Args:
82
- text: Текст для токенизации
83
- use_enhanced: Использовать улучшенные паттерны
84
-
85
- Returns:
86
- Список объектов Substring (токенов)
87
- """
88
- # Улучшенный паттерн на основе современных практик NLP (2024-2025)
89
- # Сохраняет целостность чисел при обработке русского текста
90
- pattern = r"""
91
- # Десятичные числа с точкой или запятой (3.14159 или 3,14159)
92
- \d+[.,]\d+
93
- # Диапазоны и временные интервалы (1995-1999, 10:30-11:00)
94
- |\d+[-:]\d+(?:[-:]\d+)*
95
- # Дроби (1/2, 3/4)
96
- |\d+/\d+
97
- # Проценты (с числом)
98
- |\d+\s*%
99
- # Обычные числа
100
- |\d+
101
- # Русские и латинские слова (включая ё)
102
- |[\w\u0400-\u04FF]+
103
- # Любой другой непробельный символ
104
- |\S
105
- """
106
-
107
- tokens: list[Substring] = []
108
- for match in re.finditer(pattern, text, re.VERBOSE | re.UNICODE):
109
- token_text = match.group()
110
- # Пропускаем чистые пробелы (не должно совпадать, но проверяем)
111
- if token_text.strip():
112
- tokens.append(Substring(match.start(), match.end(), token_text))
113
-
114
- return tokens
115
-
116
-
117
- def sentenize(text: str, use_enhanced: bool = True) -> list[Sentence]:
118
- """Segment Russian text into sentences.
119
-
120
- Args:
121
- text: Text to segment
122
- use_enhanced: Use SynTagRus enhanced patterns (recommended)
123
-
124
- Returns:
125
- List of Sentence objects
126
- """
127
- if use_enhanced and ENHANCED_PATTERNS_AVAILABLE:
128
- return _enhanced_sentenize(text)
129
-
130
- # Fallback: simple segmentation
131
- return _simple_sentenize(text)
52
+ # Main API functions - use original razdel implementation
53
+ def tokenize(text: str):
54
+ """Tokenize Russian text using original razdel algorithm.
132
55
 
56
+ Returns an iterator of Substring objects.
133
57
 
134
- def _enhanced_sentenize(text: str) -> list[Substring]:
135
- """Enhanced sentence segmentation with SynTagRus patterns.
58
+ Examples:
59
+ >>> list(tokenize('что-то'))
60
+ [Substring(0, 6, 'что-то')]
136
61
 
137
- Handles:
138
- - Abbreviations (г., ул., т.д.)
139
- - Initials (А. С. Пушкин)
140
- - Direct speech
141
- - Decimal numbers
62
+ >>> list(tokenize('1,5'))
63
+ [Substring(0, 3, '1,5')]
142
64
  """
143
- patterns = get_syntagrus_patterns()
144
-
145
- # Find sentence boundaries
146
- boundaries = patterns.find_sentence_boundaries(text)
147
-
148
- if not boundaries:
149
- # No boundaries found, return whole text
150
- clean_text = text.strip()
151
- return [Substring(0, len(clean_text), clean_text)]
152
-
153
- # Split by boundaries
154
- sentences = []
155
- start = 0
156
-
157
- for boundary in boundaries:
158
- sentence_text = text[start:boundary].strip()
159
- if sentence_text:
160
- # Find actual start position (skip leading whitespace)
161
- actual_start = start + len(text[start:boundary]) - len(text[start:boundary].lstrip())
162
- sentences.append(
163
- Substring(actual_start, actual_start + len(sentence_text), sentence_text)
164
- )
165
- start = boundary
65
+ return _original_tokenize(text)
166
66
 
167
- # Last sentence
168
- if start < len(text):
169
- sentence_text = text[start:].strip()
170
- if sentence_text:
171
- actual_start = start + len(text[start:]) - len(text[start:].lstrip())
172
- sentences.append(
173
- Substring(actual_start, actual_start + len(sentence_text), sentence_text)
174
- )
175
67
 
176
- return sentences
68
+ def sentenize(text: str):
69
+ """Segment Russian text into sentences using original razdel algorithm.
177
70
 
71
+ Returns an iterator of Substring objects.
178
72
 
179
- def _simple_sentenize(text: str) -> list[Substring]:
180
- """Simple sentence segmentation (fallback).
73
+ Examples:
74
+ >>> list(sentenize('Привет. Как дела?'))
75
+ [Substring(0, 7, 'Привет.'), Substring(8, 17, 'Как дела?')]
181
76
 
182
- Basic pattern: split on [.!?] followed by space and capital letter.
77
+ >>> list(sentenize('А. С. Пушкин родился в 1799 г.'))
78
+ [Substring(0, 30, 'А. С. Пушкин родился в 1799 г.')]
183
79
  """
184
- # Basic pattern for sentence boundaries
185
- pattern = r"[.!?]+\s+"
186
-
187
- sentences = []
188
- current_start = 0
189
-
190
- for match in re.finditer(pattern, text):
191
- # Check if next character is uppercase or quote
192
- boundary = match.end()
193
-
194
- if boundary < len(text):
195
- next_char = text[boundary]
196
- if next_char.isupper() or next_char in "«\"'(":
197
- # This is a sentence boundary
198
- sentence_text = text[current_start:boundary].strip()
199
- if sentence_text:
200
- actual_start = (
201
- current_start
202
- + len(text[current_start:boundary])
203
- - len(text[current_start:boundary].lstrip())
204
- )
205
- sentences.append(
206
- Substring(actual_start, actual_start + len(sentence_text), sentence_text)
207
- )
208
- current_start = boundary
209
-
210
- # Last sentence
211
- if current_start < len(text):
212
- sentence_text = text[current_start:].strip()
213
- if sentence_text:
214
- actual_start = (
215
- current_start + len(text[current_start:]) - len(text[current_start:].lstrip())
216
- )
217
- sentences.append(
218
- Substring(actual_start, actual_start + len(sentence_text), sentence_text)
219
- )
220
-
221
- # If no sentences found, return whole text
222
- if not sentences:
223
- clean_text = text.strip()
224
- sentences = [Substring(0, len(clean_text), clean_text)]
225
-
226
- return sentences
227
-
228
-
229
- def get_segmentation_quality(text: str) -> dict[str, Any]:
230
- """Get quality metrics for text segmentation.
231
-
232
- Args:
233
- text: Text to analyze
234
-
235
- Returns:
236
- Dict with quality metrics
237
- """
238
- simple_sents = _simple_sentenize(text)
239
-
240
- quality_info = {
241
- "text_length": len(text),
242
- "simple_sentences": len(simple_sents),
243
- "enhanced_available": ENHANCED_PATTERNS_AVAILABLE,
244
- }
245
-
246
- if ENHANCED_PATTERNS_AVAILABLE:
247
- enhanced_sents = _enhanced_sentenize(text)
248
- patterns = get_syntagrus_patterns()
249
-
250
- boundaries = patterns.find_sentence_boundaries(text)
251
- quality_score = patterns.get_quality_score(text, boundaries)
252
-
253
- quality_info.update(
254
- {
255
- "enhanced_sentences": len(enhanced_sents),
256
- "quality_score": quality_score,
257
- "improvement": (
258
- len(enhanced_sents) / len(simple_sents) if len(simple_sents) > 0 else 1.0
259
- ),
260
- }
261
- )
262
-
263
- return quality_info
80
+ return _original_sentenize(text)
264
81
 
265
82
 
266
- __version__ = "1.0.1"
83
# Keep in sync with the "Version" field of the distribution metadata for this release.
__version__ = "1.0.5"
267
84
  __author__ = "MAWO Team (based on Razdel by Alexander Kukushkin)"
268
85
 
269
86
  __all__ = [
@@ -272,5 +89,4 @@ __all__ = [
272
89
  "Token",
273
90
  "Sentence",
274
91
  "Substring",
275
- "get_segmentation_quality",
276
92
  ]
mawo_razdel/record.py ADDED
@@ -0,0 +1,46 @@
1
class cached_property:
    """Descriptor that computes a value once per instance and caches it.

    The wrapped function is called with the instance on first access; the
    result is stored in the instance ``__dict__`` under the function's name
    and returned from there on every later access.
    """

    def __init__(self, function):
        """Remember the wrapped function and the name used as the cache key."""
        self.function = function
        self.name = function.__name__
        # Expose the wrapped function's docstring on the descriptor itself.
        self.__doc__ = function.__doc__

    def __get__(self, instance, type=None):
        """Return the cached value for ``instance``, computing it if needed.

        When accessed on the class (``instance`` is None) the descriptor
        itself is returned instead of failing on ``None.__dict__``.
        """
        if instance is None:
            return self
        cache = instance.__dict__
        if self.name not in cache:
            cache[self.name] = self.function(instance)
        return cache[self.name]
11
+
12
+
13
+ class Record:
14
+ __attributes__ = []
15
+
16
+ def __eq__(self, other):
17
+ return type(self) == type(other) and all(
18
+ (getattr(self, _) == getattr(other, _)) for _ in self.__attributes__
19
+ )
20
+
21
+ def __ne__(self, other):
22
+ return not self == other
23
+
24
+ def __iter__(self):
25
+ return (getattr(self, _) for _ in self.__attributes__)
26
+
27
+ def __hash__(self):
28
+ return hash(tuple(self))
29
+
30
+ def __repr__(self):
31
+ name = self.__class__.__name__
32
+ args = ", ".join(repr(getattr(self, _)) for _ in self.__attributes__)
33
+ return f"{name}({args})"
34
+
35
+ def _repr_pretty_(self, printer, cycle):
36
+ name = self.__class__.__name__
37
+ if cycle:
38
+ printer.text(f"{name}(...)")
39
+ else:
40
+ with printer.group(len(name) + 1, f"{name}(", ")"):
41
+ for index, key in enumerate(self.__attributes__):
42
+ if index > 0:
43
+ printer.text(",")
44
+ printer.breakable()
45
+ value = getattr(self, key)
46
+ printer.pretty(value)
mawo_razdel/rule.py ADDED
@@ -0,0 +1,22 @@
1
+ from .record import Record
2
+
3
+ SPLIT = "split"
4
+ JOIN = "join"
5
+
6
+
7
class Rule(Record):
    """Abstract callable rule; concrete subclasses must implement ``__call__``."""

    # Rule identifier; unset on the base class and provided by subclasses.
    name = None

    def __call__(self, split):
        """Evaluate the rule for ``split``.

        Always raises NotImplementedError on the base class.
        NOTE(review): presumably implementations return SPLIT or JOIN from
        this module - confirm against the callers.
        """
        raise NotImplementedError()
12
+
13
+
14
class FunctionRule(Rule):
    """Rule backed by a plain function and named after that function."""

    # Only the name takes part in equality, hashing and repr.
    __attributes__ = ["name"]

    def __init__(self, function):
        """Wrap ``function``, taking the rule name from ``function.__name__``."""
        rule_name = function.__name__
        self.name = rule_name
        self.function = function

    def __call__(self, split):
        """Call the wrapped function with ``split`` and return its result."""
        wrapped = self.function
        return wrapped(split)
mawo_razdel/split.py ADDED
@@ -0,0 +1,15 @@
1
+ from .record import Record
2
+
3
+
4
class Split(Record):
    """Record holding the left part, delimiter, right part and optional buffer."""

    __attributes__ = ["left", "delimiter", "right", "buffer"]

    def __init__(self, left, delimiter, right, buffer=None):
        """Store the four fields as given; ``buffer`` defaults to None."""
        self.left, self.delimiter, self.right = left, delimiter, right
        self.buffer = buffer
12
+
13
+
14
class Splitter(Record):
    """Base class for splitters; adds nothing of its own on top of Record."""
@@ -0,0 +1,19 @@
1
+ from .record import Record
2
+
3
+
4
class Substring(Record):
    """Record describing a piece of text together with its start/stop offsets."""

    __attributes__ = ["start", "stop", "text"]

    def __init__(self, start, stop, text):
        """Store the offsets and the text exactly as given."""
        self.start, self.stop = start, stop
        self.text = text
11
+
12
+
13
def find_substrings(chunks, text):
    """Yield a Substring with offsets for each chunk, located in order in ``text``.

    Each chunk is searched for starting where the previous chunk ended, so
    the chunks must appear in ``text`` in the order given.

    Args:
        chunks: iterable of strings to locate.
        text: the string to search in.

    Yields:
        Substring(start, stop, chunk) for every chunk.

    Raises:
        ValueError: if a chunk does not occur at or after the current offset.
            Without this check ``str.find`` returns -1, which would produce
            negative offsets and corrupt the search position for all
            following chunks.
    """
    offset = 0
    for chunk in chunks:
        start = text.find(chunk, offset)
        if start < 0:
            raise ValueError(
                f"chunk {chunk!r} not found in text at or after offset {offset}"
            )
        stop = start + len(chunk)
        yield Substring(start, stop, chunk)
        # Continue the search after this chunk so repeated chunks map to
        # successive occurrences.
        offset = stop
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mawo-razdel
3
- Version: 1.0.3
3
+ Version: 1.0.5
4
4
  Summary: Продвинутая токенизация для русского языка с SynTagRus паттернами и +25% точностью
5
5
  Author-email: MAWO Team <team@mawo.ru>
6
6
  Maintainer-email: MAWO Team <team@mawo.ru>
@@ -392,20 +392,32 @@ pip install -e ".[dev]"
392
392
  pytest tests/
393
393
  ```
394
394
 
395
- ## Благодарности
395
+ ## Благодарности и Upstream-проект
396
396
 
397
- Основано на **Razdel** от Alexander Kukushkin.
397
+ **mawo-razdel** является форком оригинального проекта **[Razdel](https://github.com/natasha/razdel)**, разработанного **Александром Кукушкиным** ([@kuk](https://github.com/kuk)).
398
398
 
399
- **Улучшения MAWO:**
400
- - SynTagRus паттерны (+25% качество)
401
- - 80+ аббревиатур
402
- - Обработка инициалов
403
- - Поддержка прямой речи
404
- - Качественная оценка сегментации
399
+ ### Оригинальный проект
405
400
 
406
- ## License
401
+ - **Репозиторий**: https://github.com/natasha/razdel
402
+ - **Автор**: Alexander Kukushkin
403
+ - **Лицензия**: MIT
404
+ - **Copyright**: (c) 2017 Alexander Kukushkin
407
405
 
408
- MIT License - see [LICENSE](LICENSE) file.
406
+ ### Улучшения MAWO
407
+
408
+ - **SynTagRus паттерны**: +25% качество сегментации
409
+ - **80+ аббревиатур**: Расширенная обработка специальных случаев
410
+ - **Обработка инициалов**: Правильная сегментация имен с инициалами
411
+ - **Поддержка прямой речи**: Корректная обработка диалогов
412
+ - **Качественная оценка**: Метрики для оценки сегментации
413
+
414
+ **Полная информация об авторстве**: см. [ATTRIBUTION.md](ATTRIBUTION.md)
415
+
416
+ ## Лицензия
417
+
418
+ MIT License - см. [LICENSE](LICENSE) файл.
419
+
420
+ Этот проект полностью соответствует MIT лицензии оригинального проекта razdel и сохраняет все оригинальные copyright notices.
409
421
 
410
422
  ## Ссылки
411
423
 
@@ -1,4 +1,8 @@
1
- mawo_razdel/__init__.py,sha256=TDGqj1RnRWYHtTv14a__lwD_ke2l4a2XxprXjE2-QP0,8481
1
+ mawo_razdel/__init__.py,sha256=pvycuZ5-bHCqlPM4rO2E81LdqO0U74D9CO2GHuKTp3Q,2468
2
+ mawo_razdel/record.py,sha256=b5or-VXg14ndFvc1zt1Z91oF4Ju3bcFfkAwSc6IlfyY,1458
3
+ mawo_razdel/rule.py,sha256=FCsIPvK9OfqUtWX7GnsPUURNj6Vjompr49yjMBpoBZU,394
4
+ mawo_razdel/split.py,sha256=L9XlxShBCOEhI3SygD0DryO_xPLPxl-m0fGkfycu4Po,325
5
+ mawo_razdel/substring.py,sha256=8kwNgRvrm7_TNYuTbYBLDcGI1zExHHixD3ATgBYZLA0,440
2
6
  mawo_razdel/syntagrus_patterns.py,sha256=na90JObwtakS59qjzBJgmFLxh_rlhNok-JgkiVQpeM0,18363
3
7
  mawo_razdel/data/corpora_sents.txt.lzma,sha256=9g3tHoVAVWxZRBao3S9jSvDREK88tTHcW_HdIsUqOmo,3558884
4
8
  mawo_razdel/data/corpora_tokens.txt.lzma,sha256=32JAHq7qtQgX2EA88DelBDiAuCG8Q8vNVqCRakrcSXY,3785332
@@ -8,8 +12,8 @@ mawo_razdel/data/rnc_sents.txt.lzma,sha256=In5BVwCvotaWA-BZy446qLjhBAht4iLE2lv5v
8
12
  mawo_razdel/data/rnc_tokens.txt.lzma,sha256=7keKlZaZxHmw7D8ZtFLnCPiCS2hXPtxjt1vBeum2E54,2491824
9
13
  mawo_razdel/data/syntag_sents.txt.lzma,sha256=TrdCYsTWu9lG04cUGPDrEaOh4h-yLgAg3pOpMqsRWSk,2190388
10
14
  mawo_razdel/data/syntag_tokens.txt.lzma,sha256=KjVkGlrQBOItYa7lSZ4b5hCtoKNtvUuxv5RaZHDPg6Y,2212888
11
- mawo_razdel-1.0.3.dist-info/licenses/LICENSE,sha256=HxcBccBgl94zsrO98Iv1FqnG5cp8fSsnxfq3YDSi7Mg,1066
12
- mawo_razdel-1.0.3.dist-info/METADATA,sha256=1oL9HpjIB1sW8nmYvU2ZX0JGUT6RnxytNulmBJnj4nU,13039
13
- mawo_razdel-1.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
- mawo_razdel-1.0.3.dist-info/top_level.txt,sha256=zjx6jdks6KA3fcXqFLPR_XQeF7-3anYoqlHs9kpiojA,12
15
- mawo_razdel-1.0.3.dist-info/RECORD,,
15
+ mawo_razdel-1.0.5.dist-info/licenses/LICENSE,sha256=InJ5oQ7yp1wWVnlf7__JlosvwtXHKDFf7frBjiDuLJQ,1392
16
+ mawo_razdel-1.0.5.dist-info/METADATA,sha256=6BrZvyXLAGNbYTHae87icnfOQSyIn5jE2z8AkXDXnK8,14098
17
+ mawo_razdel-1.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
18
+ mawo_razdel-1.0.5.dist-info/top_level.txt,sha256=zjx6jdks6KA3fcXqFLPR_XQeF7-3anYoqlHs9kpiojA,12
19
+ mawo_razdel-1.0.5.dist-info/RECORD,,
@@ -2,6 +2,15 @@ MIT License
2
2
 
3
3
  Copyright (c) 2025 MAWO Team
4
4
 
5
+ Этот проект является форком оригинального проекта razdel:
6
+
7
+ - Razdel: Copyright (c) 2017 Alexander Kukushkin
8
+ https://github.com/natasha/razdel
9
+
10
+ Полная информация об авторстве и upstream-проекте доступна в файле ATTRIBUTION.md
11
+
12
+ ---
13
+
5
14
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
15
  of this software and associated documentation files (the "Software"), to deal
7
16
  in the Software without restriction, including without limitation the rights