text-corrector 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
# Package entry point: re-exports the public helpers implemented in
# the sibling ``core`` module so callers can import them from the package root.
from .core import correct_text, tokenize, get_candidates

# Package version string.
__version__ = "0.1.0"
# Names exported by ``from <package> import *``.
__all__ = ["correct_text", "tokenize", "get_candidates"]
text_corrector/core.py ADDED
@@ -0,0 +1,105 @@
1
+ import re
2
+ from typing import List, Dict, Set, Tuple
3
+
4
def tokenize(text: str) -> List[str]:
    """Split ``text`` into words made of English or Russian letters.

    Every maximal run of Latin (a-z, A-Z) or Cyrillic (а-я, А-Я, ё, Ё)
    letters becomes one token. All other characters (digits, punctuation,
    whitespace) act as separators and are not returned.

    Args:
        text: The input string to split.

    Returns:
        The words in their order of appearance, with original casing.
    """
    word_pattern = re.compile(r'[a-zA-Zа-яА-ЯёЁ]+')
    return word_pattern.findall(text)
7
+
8
+
9
def detect_suspects(words: List[str], dictionary_set: Set[str]) -> Tuple[List[int], List[str]]:
    """Find the words that are missing from the dictionary.

    The lookup uses the lowercased form of each word, while the returned
    words keep their original casing.

    Args:
        words: Tokens to check.
        dictionary_set: Known words to test membership against.

    Returns:
        A pair ``(positions, unknown_words)``: the indices into ``words``
        of the unknown tokens and the tokens themselves, in matching order.
    """
    flagged = [
        (position, token)
        for position, token in enumerate(words)
        if token.lower() not in dictionary_set
    ]
    positions = [position for position, _ in flagged]
    unknown_words = [token for _, token in flagged]
    return positions, unknown_words
18
+
19
+
20
def levenshtein(a: str, b: str, max_dist: int = 2) -> int:
    """Compute the case-insensitive Levenshtein distance with early exit.

    Both strings are lowercased before comparison. The computation is
    abandoned as soon as the result is known to exceed ``max_dist``.

    Args:
        a: First string.
        b: Second string.
        max_dist: Largest distance the caller is interested in.

    Returns:
        The edit distance when it does not exceed ``max_dist``. Any value
        greater than ``max_dist`` only means "farther than ``max_dist``":
        it is ``max_dist + 1`` when the computation is cut short, and the
        value of the final table cell otherwise.
    """
    source, target = a.lower(), b.lower()
    too_far = max_dist + 1

    # The distance is never smaller than the difference in lengths.
    if abs(len(source) - len(target)) > max_dist:
        return too_far

    # Only the previous row of the DP table is needed to build the next one.
    above = list(range(len(target) + 1))

    for row_no, source_char in enumerate(source, start=1):
        current = [row_no]

        for col_no, target_char in enumerate(target, start=1):
            step_cost = 0 if source_char == target_char else 1
            current.append(
                min(
                    above[col_no] + 1,
                    current[col_no - 1] + 1,
                    above[col_no - 1] + step_cost,
                )
            )

        # If every cell of this row is already past the limit, stop early.
        if min(current) > max_dist:
            return too_far

        above = current

    return above[-1]
55
+
56
+
57
def get_candidates(word: str, dictionary: Dict[str, float], max_dist: int = 2) -> List[Tuple[str, int, float]]:
    """Search the dictionary for words similar to the given one.

    Args:
        word: The word to look up; it is lowercased before comparison.
        dictionary: Mapping of known words to a numeric value that is
            carried through unchanged into the result.
        max_dist: Largest edit distance that still counts as a match.

    Returns:
        A list of ``(dictionary_word, distance, value)`` tuples, in
        dictionary iteration order, for every entry whose distance to
        ``word`` does not exceed ``max_dist``.
    """
    needle = word.lower()
    needle_len = len(needle)
    matches = []

    for entry, value in dictionary.items():
        # Cheap length pre-filter: skip the distance computation when the
        # lengths alone already rule the entry out.
        if abs(needle_len - len(entry)) <= max_dist:
            distance = levenshtein(needle, entry, max_dist)
            if distance <= max_dist:
                matches.append((entry, distance, value))

    return matches
71
+
72
+
73
def rank_candidates(candidates: List[Tuple[str, int, float]]) -> List[str]:
    """Order candidates by distance, then by frequency, and return the words.

    Candidates are sorted by ascending distance (second item), then by
    descending frequency (third item), then alphabetically by word
    (first item) to break remaining ties.

    Args:
        candidates: ``(word, distance, frequency)`` tuples.

    Returns:
        The candidate words, best match first.
    """
    def sort_key(entry):
        return entry[1], -entry[2], entry[0]

    ordered = list(candidates)
    ordered.sort(key=sort_key)
    return [entry[0] for entry in ordered]
77
+
78
+
79
def correct_text(text: str, dictionary: Dict[str, float], max_dist: int = 2) -> str:
    """Return ``text`` with unknown words replaced by their best match.

    The text is tokenized, every token missing from ``dictionary`` is
    looked up among the dictionary words within ``max_dist`` edits, and
    the top-ranked candidate replaces it. Tokens without any candidate
    are left unchanged.

    The result is built by joining the tokens with single spaces, so
    characters that ``tokenize`` does not return (punctuation, digits,
    original whitespace) are not preserved.

    Args:
        text: The input text.
        dictionary: Mapping of known words to their frequency.
        max_dist: Largest edit distance allowed for a replacement.

    Returns:
        The corrected words joined by single spaces.
    """
    # Tokenization
    words = tokenize(text)

    # Detection
    dictionary_set = set(dictionary)
    indices, suspects = detect_suspects(words, dictionary_set)

    # Lookup and replacement. The candidate search depends only on the
    # lowercased suspect, so the best match is computed once per distinct
    # misspelling instead of rescanning the dictionary for every occurrence.
    best_match = {}  # lowercased suspect -> best dictionary word, or None
    result_words = list(words)

    for idx, suspect in zip(indices, suspects):
        key = suspect.lower()
        if key not in best_match:
            ranked = rank_candidates(get_candidates(suspect, dictionary, max_dist))
            best_match[key] = ranked[0] if ranked else None

        best = best_match[key]
        if best is None:
            continue

        # Carry over a leading capital from the original token.
        if suspect and suspect[0].isupper():
            best = best.capitalize()
        result_words[idx] = best

    return ' '.join(result_words)
@@ -0,0 +1,9 @@
1
+ Metadata-Version: 2.4
2
+ Name: text-corrector
3
+ Version: 0.1.0
4
+ Summary: Простая библиотека на основе расстояния Левенштейна для исправления опечаток
5
+ License: MIT
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: Operating System :: OS Independent
8
+ Classifier: Programming Language :: Python :: 3
9
+ Requires-Python: >=3.8
@@ -0,0 +1,5 @@
1
+ text_corrector/__init__.py,sha256=LweYlRH6XHOF4igHhZdwn41gRiCphmXgvIPgeMrmx9c,139
2
+ text_corrector/core.py,sha256=aLBNXEbAN8VrAxnAHcBRgqySI_j97bmgzAgfRNzi8VM,3497
3
+ text_corrector-0.1.0.dist-info/METADATA,sha256=-l4DrnPPrg6OUZxB7_SmWxFnupNenJWJgtkEMaAqJ90,394
4
+ text_corrector-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
5
+ text_corrector-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any