text-corrector 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
text_corrector/core.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import List, Dict, Set, Tuple
|
|
3
|
+
|
|
4
|
+
def tokenize(text: str) -> List[str]:
    """Split *text* into word tokens made of Latin or Cyrillic letters.

    Each maximal run of English or Russian letters (including ``ё``/``Ё``)
    becomes one token, in order of appearance. Any other character acts as
    a separator and is not returned.
    """
    letter_runs = re.finditer(r'[a-zA-Zа-яА-ЯёЁ]+', text)
    return [run.group(0) for run in letter_runs]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def detect_suspects(words: List[str], dictionary_set: Set[str]) -> Tuple[List[int], List[str]]:
    """Find the words that are not present in the dictionary.

    A word is looked up by its lowercase form, so ``dictionary_set`` entries
    that contain uppercase letters never match.

    Returns:
        A pair ``(positions, suspects)``: the indices into *words* of the
        unknown words, and the unknown words themselves (original casing),
        both in order of appearance.
    """
    flagged = [
        (position, token)
        for position, token in enumerate(words)
        if token.lower() not in dictionary_set
    ]
    suspect_indices = [position for position, _ in flagged]
    suspect_words = [token for _, token in flagged]
    return suspect_indices, suspect_words
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def levenshtein(a: str, b: str, max_dist: int = 2) -> int:
    """Compute the case-insensitive Levenshtein distance with an early exit.

    Both strings are lowercased before comparison. The function returns
    ``max_dist + 1`` without finishing the computation when the lengths
    differ by more than *max_dist*, or when every cell of a row of the
    dynamic-programming table already exceeds *max_dist*. Otherwise it
    returns the exact distance, which may itself be larger than *max_dist*.
    """
    source, target = a.lower(), b.lower()
    over_limit = max_dist + 1

    # The distance is never smaller than the difference in lengths.
    if abs(len(source) - len(target)) > max_dist:
        return over_limit

    # Only two rows of the table are kept at any time.
    previous = list(range(len(target) + 1))
    for row_no, src_char in enumerate(source, start=1):
        current = [row_no]
        for col_no, tgt_char in enumerate(target, start=1):
            from_above = previous[col_no] + 1
            from_left = current[-1] + 1
            from_diagonal = previous[col_no - 1] + (src_char != tgt_char)
            current.append(min(from_above, from_left, from_diagonal))

        # Row minima never decrease, so the limit cannot be met any more.
        if min(current) > max_dist:
            return over_limit
        previous = current

    return previous[-1]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_candidates(word: str, dictionary: Dict[str, float], max_dist: int = 2) -> List[Tuple[str, int, float]]:
    """Collect the dictionary entries that are close to *word*.

    Every key of *dictionary* whose distance to the lowercased *word*, as
    reported by ``levenshtein``, is at most *max_dist* is kept.

    Returns:
        A list of ``(entry, distance, frequency)`` tuples in dictionary
        iteration order.
    """
    needle = word.lower()
    needle_len = len(needle)
    matches = []

    for entry, frequency in dictionary.items():
        # Cheap length pre-filter before the full distance computation.
        if abs(needle_len - len(entry)) <= max_dist:
            distance = levenshtein(needle, entry, max_dist)
            if distance <= max_dist:
                matches.append((entry, distance, frequency))

    return matches
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def rank_candidates(candidates: List[Tuple[str, int, float]]) -> List[str]:
    """Order candidates from best to worst and return only their words.

    Candidates are ``(word, distance, frequency)`` tuples. A smaller
    distance wins; ties go to the higher frequency, and remaining ties are
    broken by the word itself in ascending order.
    """
    def ranking_key(candidate):
        return candidate[1], -candidate[2], candidate[0]

    ordered = list(candidates)
    ordered.sort(key=ranking_key)
    return [candidate[0] for candidate in ordered]
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def correct_text(text: str, dictionary: Dict[str, float], max_dist: int = 2) -> str:
    """Return *text* with out-of-dictionary words replaced by their best match.

    The text is split with ``tokenize``; tokens flagged by
    ``detect_suspects`` are looked up with ``get_candidates`` and replaced
    by the first entry returned by ``rank_candidates``. Tokens without any
    candidate are left as they are. Everything between tokens (spaces,
    punctuation, digits, line breaks) is copied through unchanged.

    Casing of a replaced word follows the misspelled token: an all-uppercase
    token of two or more letters yields an uppercase replacement, a token
    starting with an uppercase letter yields a capitalized one, and any
    other token gets the dictionary entry as is.

    Args:
        text: Text to correct.
        dictionary: Known words mapped to their frequency.
        max_dist: Largest edit distance accepted for a replacement.

    Returns:
        The corrected text.
    """
    # Tokenize and detect unknown words.
    words = tokenize(text)
    indices, suspects = detect_suspects(words, set(dictionary))

    # Look up each distinct misspelling only once: the candidate search
    # scans the whole dictionary and depends only on the lowercase form.
    best_by_word = {}
    corrections = {}
    for idx, suspect in zip(indices, suspects):
        key = suspect.lower()
        if key not in best_by_word:
            ranked = rank_candidates(get_candidates(suspect, dictionary, max_dist))
            best_by_word[key] = ranked[0] if ranked else None

        best = best_by_word[key]
        if best is None or not suspect:
            continue

        if len(suspect) > 1 and suspect.isupper():
            best = best.upper()
        elif suspect[0].isupper():
            best = best.capitalize()
        corrections[idx] = best

    # Rebuild the text around the tokens instead of joining them with
    # spaces, so separators survive. Tokens are taken to be in-order,
    # non-overlapping letter runs of `text` (what `tokenize` produces), so
    # searching from the end of the previous token finds exactly the next one.
    pieces = []
    cursor = 0
    for idx, word in enumerate(words):
        start = text.find(word, cursor)
        pieces.append(text[cursor:start])
        pieces.append(corrections.get(idx, word))
        cursor = start + len(word)
    pieces.append(text[cursor:])

    return ''.join(pieces)
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: text-corrector
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Простая библиотека на основе расстояния Левенштейна для исправления опечаток
|
|
5
|
+
License: MIT
|
|
6
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Requires-Python: >=3.8
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
text_corrector/__init__.py,sha256=LweYlRH6XHOF4igHhZdwn41gRiCphmXgvIPgeMrmx9c,139
|
|
2
|
+
text_corrector/core.py,sha256=aLBNXEbAN8VrAxnAHcBRgqySI_j97bmgzAgfRNzi8VM,3497
|
|
3
|
+
text_corrector-0.1.0.dist-info/METADATA,sha256=-l4DrnPPrg6OUZxB7_SmWxFnupNenJWJgtkEMaAqJ90,394
|
|
4
|
+
text_corrector-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
5
|
+
text_corrector-0.1.0.dist-info/RECORD,,
|