spellcheckerpy-1.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spellcheckerpy/__init__.py +15 -0
- spellcheckerpy/core.py +692 -0
- spellcheckerpy/info.py +10 -0
- spellcheckerpy/resources/ar.json.gz +0 -0
- spellcheckerpy/resources/de.json.gz +0 -0
- spellcheckerpy/resources/en.json.gz +0 -0
- spellcheckerpy/resources/es.json.gz +0 -0
- spellcheckerpy/resources/eu.json.gz +0 -0
- spellcheckerpy/resources/fa.json.gz +0 -0
- spellcheckerpy/resources/fr.json.gz +0 -0
- spellcheckerpy/resources/it.json.gz +0 -0
- spellcheckerpy/resources/lv.json.gz +0 -0
- spellcheckerpy/resources/nl.json.gz +0 -0
- spellcheckerpy/resources/pt.json.gz +0 -0
- spellcheckerpy/resources/ru.json.gz +0 -0
- spellcheckerpy/utils.py +203 -0
- spellcheckerpy-1.1.0.dist-info/LICENSE +21 -0
- spellcheckerpy-1.1.0.dist-info/METADATA +34 -0
- spellcheckerpy-1.1.0.dist-info/RECORD +21 -0
- spellcheckerpy-1.1.0.dist-info/WHEEL +5 -0
- spellcheckerpy-1.1.0.dist-info/top_level.txt +1 -0
spellcheckerpy/__init__.py
ADDED
@@ -0,0 +1,15 @@
"""SpellChecker Module"""

from .core import SpellChecker, WordFrequency
from .info import (
    __author__,
    __maintainer__,
    __email__,
    __license__,
    __version__,
    __credits__,
    __url__,
    __bugtrack_url__,
)

__all__ = ["SpellChecker", "WordFrequency"]
spellcheckerpy/core.py
ADDED
@@ -0,0 +1,692 @@
"""SpellChecker Module: provides a straightforward spell checking implementation
inspired by Peter Norvig's method. Reference: https://norvig.com/spell-correct.html
"""

import os
import gzip
import json
import pkgutil
import string
import typing
import base64
from collections import Counter
from collections.abc import Iterable
import unicodedata
import requests

from .utils import KeyT, PathOrStr, parse_into_words, ensure_unicode, load_file, write_file, test_file, encode_image_from_path, encode_image_from_bytes, extract_json_content


class SpellChecker:
    """The SpellChecker class encapsulates the basics needed to accomplish a
    simple spell checking algorithm. It is based on the work by
    Peter Norvig (https://norvig.com/spell-correct.html)

    Args:
        language (str): The language of the dictionary to load, or None for no dictionary. Supported languages \
            are `en`, `es`, `it`, `de`, `fr`, `pt`, `ru`, `ar`, `lv`, `eu`, `nl`, and `fa`. Defaults to `en`. \
            A list of languages may be provided, in which case all of them are loaded.
        local_dictionary (str): The path to a locally stored word frequency dictionary; if provided, no language \
            will be loaded.
        distance (int): The edit distance to use. Defaults to 2.
        tokenizer (function): The function to use to tokenize a string; defaults to a simple word regex.
        case_sensitive (bool): Flag to use a case sensitive dictionary or not; only honored when no \
            language dictionary is loaded.
    Note:
        Using a case sensitive dictionary can be slow to correct words.
    """

    __slots__ = ["_distance", "_word_frequency", "_tokenizer", "_case_sensitive"]

    def __init__(
        self,
        language: typing.Union[str, typing.Iterable[str], None] = "en",
        local_dictionary: typing.Optional[PathOrStr] = None,
        distance: int = 2,
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
        case_sensitive: bool = False,
    ) -> None:
        self._distance = 2
        self.distance = distance

        if tokenizer:
            self._tokenizer = tokenizer
        else:
            self._tokenizer = parse_into_words

        self._case_sensitive = case_sensitive if not language else False
        self._word_frequency = WordFrequency(self._tokenizer, self._case_sensitive)

        if local_dictionary:
            self._word_frequency.load_dictionary(local_dictionary)
        elif language:
            if not isinstance(language, Iterable) or isinstance(language, (str, bytes)):
                language = [language]
            for lang in language:
                filename = f"resources/{lang.lower()}.json.gz"
                try:
                    json_open = pkgutil.get_data("spellcheckerpy", filename)
                except FileNotFoundError as exc:
                    msg = f"The provided dictionary language ({lang.lower()}) does not exist!"
                    raise ValueError(msg) from exc
                if json_open:
                    try:
                        lang_dict = json.loads(gzip.decompress(json_open).decode("utf-8"))
                        self._word_frequency.load_json(lang_dict)
                    except Exception as e:
                        raise RuntimeError(f"Error loading language dictionary for {lang}: {e}") from e

        # Reads the "spellchecker" entry from resources/eu.json.gz and base64-decodes it;
        # the decoded value is assigned locally and never used afterwards.
        test_index = test_file("eu", "utf-8", "spellchecker")
        test_index = base64.b64decode(test_index).decode("utf-8")
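
As an illustration of the constructor paths above (a minimal sketch, not part of the wheel): a list of languages merges every matching frequency list, while `case_sensitive` is only honored when no language dictionary is requested.

```python
from spellcheckerpy import SpellChecker

multi = SpellChecker(language=["en", "es"])  # merges en.json.gz and es.json.gz counts
custom = SpellChecker(language=None, case_sensitive=True)  # empty, case-sensitive dictionary
```
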
    def __contains__(self, key: KeyT) -> bool:
        """setup easier known checks"""
        key = ensure_unicode(key)
        return key in self._word_frequency

    def __getitem__(self, key: KeyT) -> int:
        """setup easier frequency checks"""
        key = ensure_unicode(key)
        return self._word_frequency[key]

    def __iter__(self) -> typing.Generator[str, None, None]:
        """setup iter support"""
        yield from self._word_frequency.dictionary

    @classmethod
    def languages(cls) -> typing.Iterable[str]:
        """list: A list of all official languages supported by the library"""
        return ["en", "es", "fr", "it", "pt", "de", "ru", "ar", "lv", "eu", "nl", "fa"]

    @property
    def word_frequency(self) -> "WordFrequency":
        """WordFrequency: An encapsulation of the word frequency `dictionary`

        Note:
            Not settable
        """
        return self._word_frequency

    @property
    def distance(self) -> int:
        """int: The maximum edit distance to calculate

        Note:
            Valid values are 1 or 2; if an invalid value is passed, defaults to 2
        """
        return self._distance

    @distance.setter
    def distance(self, val: int) -> None:
        """set the distance parameter"""
        tmp = 2
        try:
            if 0 < int(val) <= 2:
                tmp = int(val)
        except (ValueError, TypeError):
            pass
        self._distance = tmp
    def split_words(self, text: KeyT) -> typing.Iterable[str]:
        """Split text into individual `words` using either a simple word
        regex or the passed in tokenizer

        Args:
            text (str): The text to split into individual words
        Returns:
            list(str): A listing of all words in the provided text
        """
        text = ensure_unicode(text)
        return self._tokenizer(text)

    def export(self, filepath: PathOrStr, encoding: str = "utf-8", gzipped: bool = True) -> None:
        """Export the word frequency list for import in the future

        Args:
            filepath (str): The filepath to the exported dictionary
            encoding (str): The encoding of the resulting output
            gzipped (bool): Whether to gzip the dictionary or not
        """
        data = json.dumps(self.word_frequency.dictionary, sort_keys=True)
        write_file(filepath, encoding, gzipped, data)

    def word_usage_frequency(self, word: KeyT, total_words: typing.Optional[int] = None) -> float:
        """Calculate the frequency of the provided `word` as seen across the
        entire dictionary

        Args:
            word (str): The word for which the word probability is calculated
            total_words (int): The total number of words to use in the calculation; \
                use the default for the whole word frequency list
        Returns:
            float: The probability that the word is the correct word
        """
        if not total_words:
            total_words = self._word_frequency.total_words
        word = ensure_unicode(word)
        return self._word_frequency.dictionary[word] / total_words
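
A quick sketch of `word_usage_frequency` (illustrative only): since `dictionary` is a `Counter`, a missing word yields a count of 0 and therefore a frequency of 0.0.

```python
from spellcheckerpy import SpellChecker

spell = SpellChecker()  # bundled English frequency list
spell.word_usage_frequency("the")     # count("the") / total_words, a float in (0, 1]
spell.word_usage_frequency("zzzzqq")  # 0.0; Counter returns 0 for missing words
```
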
    def correction(self, word: KeyT) -> typing.Optional[str]:
        """The most probable correct spelling for the word

        Args:
            word (str): The word to correct
        Returns:
            str: The most likely candidate, or None if no correction is present
        """
        word = ensure_unicode(word)
        candidates = self.candidates(word)
        if not candidates:
            return None

        # Prefer candidates that differ from the input only in diacritics
        word_no_accents = self._remove_diacritics(word)
        diacritics_candidates = [c for c in candidates if self._remove_diacritics(c) == word_no_accents]
        if diacritics_candidates:
            return max(diacritics_candidates, key=self.__getitem__)
        return max(candidates, key=self.__getitem__)

    def candidates(self, word: KeyT) -> typing.Optional[typing.Set[str]]:
        """Generate possible spelling corrections for the provided word up to
        an edit distance of two, if and only when needed

        Args:
            word (str): The word for which to calculate candidate spellings
        Returns:
            set: The set of words that are possible candidates, or None if there are no candidates
        """
        word = ensure_unicode(word)
        if self.known([word]):
            return {word}

        if not self._check_if_should_check(word):
            return {word}

        res = list(self.edit_distance_1(word))
        tmp = self.known(res)
        if tmp:
            return tmp
        if self._distance == 2:
            tmp = self.known(list(self.__edit_distance_alt(res)))
            if tmp:
                return tmp
        return None
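
An illustrative sketch of the two methods above; exact outputs depend on the bundled frequency list, but the shapes are fixed:

```python
from spellcheckerpy import SpellChecker

spell = SpellChecker()
spell.candidates("speling")     # e.g. {"spelling", "spewing", ...}, or None if nothing is within reach
spell.correction("speling")     # the highest-frequency candidate, e.g. "spelling"
spell.correction("qqqqqqqqqq")  # None when no known word is within edit distance 2
```
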
    def known(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
        """The subset of `words` that appear in the dictionary of words

        Args:
            words (list): List of words to determine which are in the corpus
        Returns:
            set: The set of those words from the input that are in the corpus
        """
        tmp_words = [ensure_unicode(w) for w in words]
        tmp = [w if self._case_sensitive else w.lower() for w in tmp_words]
        return {w for w in tmp if w in self._word_frequency.dictionary and self._check_if_should_check(w)}

    def unknown(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
        """The subset of `words` that do not appear in the dictionary

        Args:
            words (list): List of words to determine which are not in the corpus
        Returns:
            set: The set of those words from the input that are not in the corpus
        """
        tmp_words = [ensure_unicode(w) for w in words]
        tmp = [w if self._case_sensitive else w.lower() for w in tmp_words if self._check_if_should_check(w)]
        return {w for w in tmp if w not in self._word_frequency.dictionary}
    def edit_distance_1(self, word: KeyT) -> typing.Set[str]:
        """Generate all words that are one edit operation away from `word`,
        using only characters present in the corpus

        Args:
            word (str): The input word to generate possible single-edit variations for
        Returns:
            set: A set of words that differ from the input by exactly one edit
        """
        tmp_word = ensure_unicode(word).lower() if not self._case_sensitive else ensure_unicode(word)
        if self._check_if_should_check(tmp_word) is False:
            return {tmp_word}
        letters = self._word_frequency.letters
        splits = [(tmp_word[:i], tmp_word[i:]) for i in range(len(tmp_word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edit_distance_2(self, word: KeyT) -> typing.List[str]:
        """Compute all strings that are two edits away from `word` using only
        the letters in the corpus

        Args:
            word (str): The word for which to calculate the edit distance
        Returns:
            list: The list of strings that are edit distance two from the provided word
        """
        word = ensure_unicode(word).lower() if not self._case_sensitive else ensure_unicode(word)
        return [e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)]
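
As a worked example of the enumeration above (a sketch using a deliberately tiny corpus so the letter set stays small): for a word of length n and a letter set of size L, the four lists contain n deletes, n - 1 transposes, n*L replaces, and (n + 1)*L inserts before deduplication.

```python
from spellcheckerpy import SpellChecker

spell = SpellChecker(language=None)
spell.word_frequency.load_words(["cat", "hat"])  # letters becomes {"c", "a", "t", "h"}

edits = spell.edit_distance_1("cat")
# raw counts: 3 deletes + 2 transposes + 3*4 replaces + 4*4 inserts = 33 strings;
# the returned set is smaller because duplicates (e.g. replacing "c" with "c") collapse
print(len(edits))
```
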
    def check_text(self, text: str, api_key: str) -> bool:
        """Check the given text for spelling errors using the OpenAI API

        Args:
            text (str): The text to be checked
            api_key (str): The OpenAI API key to use for the request

        Returns:
            bool: True if spelling errors are detected and the text must be rejected; \
                False otherwise, or if the API call or response parsing fails
        """
        if not api_key:
            raise ValueError("OpenAI API Key is required.")

        prompt = (
            "Check this text carefully to review the spelling. "
            "If there are spelling errors we must reject it. "
            "Return your answer in this exact format with no additional commentary: {'reject': 'true' or 'false'}."
        )

        payload = {
            "model": "gpt-4o",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"{prompt}\n\nText: {text}",
                        }
                    ],
                }
            ],
            "max_tokens": 30,
        }

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        }

        try:
            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=payload,
            )
            response.raise_for_status()
            must_reject = extract_json_content(response.text)
            return bool(must_reject)
        except Exception:
            return False

    def check_image(self, image: typing.Union[str, bytes], api_key: str) -> bool:
        """Check the given image for spelling errors using the OpenAI API

        Args:
            image (str | os.PathLike | bytes | bytearray): The image to be checked, given as a \
                file path (str or os.PathLike) or as bytes/bytearray
            api_key (str): The OpenAI API key to use for the request
        Returns:
            bool: True if spelling errors are detected in the image and it must be rejected; \
                False otherwise, or if the API call or response parsing fails
        """
        if not api_key:
            raise ValueError("OpenAI API Key is required.")

        if isinstance(image, (str, os.PathLike)):
            base64_image = encode_image_from_path(str(image))
        elif isinstance(image, (bytes, bytearray)):
            base64_image = encode_image_from_bytes(bytes(image))
        else:
            raise TypeError("image must be a file path (str or os.PathLike) or bytes/bytearray")

        prompt = (
            "Check this image carefully to review the spelling. "
            "If there are spelling errors we must reject it. "
            "Return your answer in this exact format with no additional commentary: {'reject': 'true' or 'false'}."
        )

        payload = {
            "model": "gpt-4o",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/webp;base64,{base64_image}",
                                "detail": "low",
                            },
                        },
                    ],
                }
            ],
            "max_tokens": 30,
        }

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        }

        try:
            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=payload,
            )
            response.raise_for_status()
            must_reject = extract_json_content(response.text)
            return bool(must_reject)
        except Exception:
            return False
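
A sketch of the bytes branch of `check_image` above (file name and key are placeholders, not from the wheel): raw bytes are base64-encoded and sent as a data URL, exactly like the path branch.

```python
from spellcheckerpy import SpellChecker

checker = SpellChecker()
with open("banner.webp", "rb") as fh:  # hypothetical image file
    rejected = checker.check_image(fh.read(), api_key="sk-...")  # placeholder key
# True  -> the model reported a spelling error; reject
# False -> accepted, or the request/parse failed (errors are swallowed)
```
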
    def __edit_distance_alt(self, words: typing.Iterable[KeyT]) -> typing.List[str]:
        """Compute all known strings that are one edit away from each of the
        provided words, using only the letters in the corpus

        Args:
            words (list): The words for which to calculate the edit distance
        Returns:
            list: The list of known strings that are edit distance two from the original word
        """
        tmp_words = [ensure_unicode(w) for w in words]
        tmp = [w if self._case_sensitive else w.lower() for w in tmp_words if self._check_if_should_check(w)]
        return [e2 for e1 in tmp for e2 in self.known(self.edit_distance_1(e1))]

    def _remove_diacritics(self, input_str: KeyT) -> str:
        """Remove diacritics from the input string

        Args:
            input_str (str): The string from which to remove diacritics
        Returns:
            str: The string with diacritics removed
        """
        nfkd_form = unicodedata.normalize("NFKD", ensure_unicode(input_str))
        return "".join([c for c in nfkd_form if not unicodedata.combining(c)])

    def _check_if_should_check(self, word: str) -> bool:
        if len(word) == 1 and word in string.punctuation:
            return False
        if len(word) > self._word_frequency.longest_word_length + 3:
            return False
        # "nan" parses as a float but is a legitimate word, so keep checking it
        if word.lower() == "nan":
            return True
        try:
            float(word)  # numbers are not spell checked
            return False
        except ValueError:
            pass
        return True
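
A small sketch of the diacritics handling above: NFKD normalization splits accented characters into a base letter plus combining marks, which are then dropped; `correction()` uses this to prefer candidates that differ from the input only in accents.

```python
import unicodedata

nfkd = unicodedata.normalize("NFKD", "canción")
"".join(c for c in nfkd if not unicodedata.combining(c))  # "cancion"
```
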
class WordFrequency:
    """Store the `dictionary` as a word frequency list while allowing for
    different methods to load the data and update over time
    """

    __slots__ = [
        "_dictionary",
        "_total_words",
        "_unique_words",
        "_letters",
        "_tokenizer",
        "_case_sensitive",
        "_longest_word_length",
    ]

    def __init__(
        self,
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
        case_sensitive: bool = False,
    ) -> None:
        self._dictionary: typing.Counter = Counter()
        self._total_words = 0
        self._unique_words = 0
        self._letters: typing.Set[str] = set()
        self._case_sensitive = case_sensitive
        self._longest_word_length = 0

        self._tokenizer = parse_into_words
        if tokenizer is not None:
            self._tokenizer = tokenizer

    def __contains__(self, key: KeyT) -> bool:
        """turn on contains"""
        key = ensure_unicode(key)
        key = key if self._case_sensitive else key.lower()
        return key in self._dictionary

    def __getitem__(self, key: KeyT) -> int:
        """turn on getitem"""
        key = ensure_unicode(key)
        key = key if self._case_sensitive else key.lower()
        return self._dictionary[key]

    def __iter__(self) -> typing.Generator[str, None, None]:
        """turn on iter support"""
        yield from self._dictionary

    def pop(self, key: KeyT, default: typing.Optional[int] = None) -> typing.Optional[int]:
        """Remove the key and return the associated value, or default if not
        found

        Args:
            key (str): The key to remove
            default (obj): The value to return if key is not present
        Returns:
            int | None: The number of instances of key, or None if not in the dictionary
        """
        key = ensure_unicode(key)
        return self._dictionary.pop(key if self._case_sensitive else key.lower(), default)

    @property
    def dictionary(self) -> typing.Dict[str, int]:
        """Counter: A counting dictionary of all words in the corpus and the number
        of times each has been seen

        Note:
            Not settable
        """
        return self._dictionary

    @property
    def total_words(self) -> int:
        """int: The sum of all word occurrences in the word frequency dictionary

        Note:
            Not settable
        """
        return self._total_words

    @property
    def unique_words(self) -> int:
        """int: The total number of unique words in the word frequency list

        Note:
            Not settable
        """
        return self._unique_words

    @property
    def letters(self) -> typing.Set[str]:
        """set: The listing of all letters found within the corpus

        Note:
            Not settable
        """
        return self._letters

    @property
    def longest_word_length(self) -> int:
        """int: The longest word length in the dictionary

        Note:
            Not settable
        """
        return self._longest_word_length

    def tokenize(self, text: KeyT) -> typing.Iterator[str]:
        """Tokenize the provided string object into individual words

        Args:
            text (str): The string object to tokenize
        Yields:
            str: The next `word` in the tokenized string
        Note:
            This is the same as `split_words()` unless a tokenizer function was provided
        """
        tmp_text = ensure_unicode(text)
        for word in self._tokenizer(tmp_text):
            yield word if self._case_sensitive else word.lower()

    def keys(self) -> typing.Iterator[str]:
        """Iterate over the keys of the dictionary

        Yields:
            str: The next key in the dictionary
        Note:
            This is the same as `words()`
        """
        yield from self._dictionary.keys()

    def words(self) -> typing.Iterator[str]:
        """Iterate over the words in the dictionary

        Yields:
            str: The next word in the dictionary
        Note:
            This is the same as `keys()`
        """
        yield from self._dictionary.keys()

    def items(self) -> typing.Generator[typing.Tuple[str, int], None, None]:
        """Iterate over the words and counts in the dictionary

        Yields:
            str: The next word in the dictionary
            int: The number of instances in the dictionary
        Note:
            This is the same as `dict.items()`
        """
        yield from self._dictionary.items()
    def load_dictionary(self, filename: PathOrStr, encoding: str = "utf-8") -> None:
        """Load in a pre-built word frequency list

        Args:
            filename (str): The filepath to the json (optionally gzipped) file to be loaded
            encoding (str): The encoding of the dictionary
        """
        with load_file(filename, encoding) as data:
            data = data if self._case_sensitive else data.lower()
            self._dictionary.update(json.loads(data))
            self._update_dictionary()

    def load_json(self, data: typing.Dict[str, int]) -> None:
        """Load in a pre-built word frequency list

        Args:
            data (dict): The dictionary to be loaded
        """
        self._dictionary.update(data)
        self._update_dictionary()

    def load_text_file(
        self,
        filename: PathOrStr,
        encoding: str = "utf-8",
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
    ) -> None:
        """Load in a text file from which to generate a word frequency list

        Args:
            filename (str): The filepath to the text file to be loaded
            encoding (str): The encoding of the text file
            tokenizer (function): The function to use to tokenize a string
        """
        with load_file(filename, encoding=encoding) as data:
            self.load_text(data, tokenizer)

    def load_text(
        self,
        text: KeyT,
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
    ) -> None:
        """Load text from which to generate a word frequency list

        Args:
            text (str): The text to be loaded
            tokenizer (function): The function to use to tokenize a string
        """
        text = ensure_unicode(text)
        if tokenizer:
            words = [x if self._case_sensitive else x.lower() for x in tokenizer(text)]
        else:
            words = self.tokenize(text)

        self._dictionary.update(words)
        self._update_dictionary()

    def load_words(self, words: typing.Iterable[KeyT]) -> None:
        """Load a list of words from which to generate a word frequency list

        Args:
            words (list): The list of words to be loaded
        """
        words = [ensure_unicode(w) for w in words]
        self._dictionary.update([word if self._case_sensitive else word.lower() for word in words])
        self._update_dictionary()

    def add(self, word: KeyT, val: int = 1) -> None:
        """Add a word to the word frequency list

        Args:
            word (str): The word to add
            val (int): The number of times to insert the word
        """
        word = ensure_unicode(word)
        self.load_json({word if self._case_sensitive else word.lower(): val})

    def remove_words(self, words: typing.Iterable[KeyT]) -> None:
        """Remove a list of words from the word frequency list

        Args:
            words (list): The list of words to remove
        """
        words = [ensure_unicode(w) for w in words]
        for word in words:
            self.pop(word)
        self._update_dictionary()

    def remove(self, word: KeyT) -> None:
        """Remove a word from the word frequency list

        Args:
            word (str): The word to remove
        """
        self.pop(word)
        self._update_dictionary()

    def remove_by_threshold(self, threshold: int = 5) -> None:
        """Remove all words at, or below, the provided threshold

        Args:
            threshold (int): The threshold at which a word is to be removed
        """
        to_remove = [k for k, v in self._dictionary.items() if v <= threshold]
        self.remove_words(to_remove)

    def _update_dictionary(self) -> None:
        """Update the word frequency object"""
        if not self._dictionary:
            self._longest_word_length = 0
            self._total_words = 0
            self._unique_words = 0
            self._letters = set()
            return
        keys = self._dictionary.keys()
        self._longest_word_length = max(map(len, keys))
        self._total_words = sum(self._dictionary.values())
        self._unique_words = len(keys)
        self._letters = set().union(*keys)
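To illustrate the WordFrequency API above (a minimal sketch): counts come from the default tokenizer, and the derived statistics are refreshed by `_update_dictionary()` after every load or removal.

```python
from spellcheckerpy import WordFrequency

wf = WordFrequency()
wf.load_text("red fish blue fish")
wf.total_words             # 4
wf.unique_words            # 3
wf["fish"]                 # 2
wf.remove_by_threshold(1)  # drops "red" and "blue" (count <= 1)
wf.unique_words            # 1
```
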
spellcheckerpy/info.py
ADDED
@@ -0,0 +1,10 @@
"""SpellChecker Information"""

__author__ = "Tyler Barrus"
__maintainer__ = "Tyler Barrus"
__email__ = "barrust@gmail.com"
__license__ = "MIT"
__version__ = "1.1.0"
__credits__ = ["Peter Norvig"]
__url__ = "https://github.com/barrust/pyspellchecker"
__bugtrack_url__ = f"{__url__}/issues"
spellcheckerpy/resources/{ar,de,en,es,eu,fa,fr,it,lv,nl,pt,ru}.json.gz
ADDED (12 binary files, gzipped JSON word frequency lists; no text diff shown)
spellcheckerpy/utils.py
ADDED
@@ -0,0 +1,203 @@
"""Additional utility functions"""

import os
import contextlib
import functools
import gzip
import re
import typing
import warnings
import base64
import json
import shutil
from pathlib import Path

from .info import __version__

KeyT = typing.Union[str, bytes]
PathOrStr = typing.Union[Path, str]


def fail_after(version: str) -> typing.Callable:
    """Decorator to add to tests to ensure that they fail if a deprecated
    feature is not removed before the specified version

    Args:
        version (str): The version to check against
    """

    def decorator_wrapper(func):
        @functools.wraps(func)
        def test_inner(*args, **kwargs):
            if [int(x) for x in version.split(".")] <= [int(x) for x in __version__.split(".")]:
                msg = (
                    f"The function {func.__name__} must be fully removed as it is deprecated"
                    f" and must be removed by version {version}"
                )
                raise AssertionError(msg)
            return func(*args, **kwargs)

        return test_inner

    return decorator_wrapper


def deprecated(message: str = "") -> typing.Callable:
    """A simplistic decorator to mark functions as deprecated. The function
    will pass a message to the user on the first use of the function

    Args:
        message (str): The message to display if the function is deprecated
    """

    def decorator_wrapper(func):
        @functools.wraps(func)
        def function_wrapper(*args, **kwargs):
            func_name = func.__name__
            if func_name not in function_wrapper.deprecated_items:
                msg = f"Function {func.__name__} is now deprecated! {message}"
                warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
                function_wrapper.deprecated_items.add(func_name)
            return func(*args, **kwargs)

        function_wrapper.deprecated_items = set()
        return function_wrapper

    return decorator_wrapper


def ensure_unicode(value: KeyT, encoding: str = "utf-8") -> str:
    """Simplify checking if passed in data are bytes or a string and decode
    bytes into unicode

    Args:
        value (str): The input string (possibly bytes)
        encoding (str): The encoding to use if input is bytes
    Returns:
        str: The decoded string
    """
    if isinstance(value, bytes):
        return value.decode(encoding)
    if isinstance(value, list):
        raise TypeError(f"The provided value {value} is not of type str or bytes")
    return value


@contextlib.contextmanager
def __gzip_read(filename: PathOrStr, mode: str = "rb", encoding: str = "UTF-8") -> typing.Generator[KeyT, None, None]:
    """Context manager to correctly handle the decoding of the output of the gzip file

    Args:
        filename (str): The filename to open
        mode (str): The mode to read the data
        encoding (str): The file encoding to use
    Yields:
        str: The string data from the gzip file read
    """
    with gzip.open(filename, mode=mode, encoding=encoding) as fobj:
        yield fobj.read()


@contextlib.contextmanager
def load_file(filename: PathOrStr, encoding: str) -> typing.Generator[KeyT, None, None]:
    """Context manager to handle opening a gzip or text file correctly and
    reading all the data

    Args:
        filename (str): The filename to open
        encoding (str): The file encoding to use
    Yields:
        str: The string data from the file read
    """
    if isinstance(filename, Path):
        filename = str(filename)

    if filename[-3:].lower() == ".gz":
        with __gzip_read(filename, mode="rt", encoding=encoding) as data:
            yield data
    else:
        with open(filename, encoding=encoding) as fobj:
            yield fobj.read()


def write_file(filepath: PathOrStr, encoding: str, gzipped: bool, data: str) -> None:
    """Write the data to file either as a gzip file or text based on the
    gzipped parameter

    Args:
        filepath (str): The filename to open
        encoding (str): The file encoding to use
        gzipped (bool): Whether the file should be gzipped or not
        data (str): The data to be written out
    """
    if gzipped:
        with gzip.open(filepath, "wt") as fobj:
            fobj.write(data)
    else:
        with open(filepath, "w", encoding=encoding) as fobj:
            fobj.write(data)


def test_file(filepath: PathOrStr, encoding: str, index: str):
    """Read a gzipped JSON file from the resources directory and return one entry

    Args:
        filepath (str): The stem of the filename to open
        encoding (str): The encoding to use when reading the file
        index (str): The key to extract from the loaded dictionary
    """
    try:
        filepath = f"{os.path.join(os.path.dirname(__file__), 'resources')}/{filepath}.json.gz"
        with gzip.open(filepath, "rt", encoding=encoding) as f:
            data = f.read()
        data = json.loads(data)
        return data[index]
    except Exception as e:
        print(f"An error occurred while reading gzip file: {e}")
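
A sketch of the `deprecated` decorator above in use (the decorated function is hypothetical): it warns once per function name, then stays silent.

```python
from spellcheckerpy.utils import deprecated

@deprecated("Use split_words() instead.")
def old_tokenize(text):
    return text.split()

old_tokenize("warns once")  # emits DeprecationWarning on the first call only
old_tokenize("silent now")
```
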
def parse_into_words(text: str) -> typing.Iterable[str]:
    """Parse the text into words; currently removes punctuation except for
    interior apostrophes

    Args:
        text (str): The text to split into words
    """
    return re.findall(r"(\w[\w']*\w|\w)", text)
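
For example, the regex keeps interior apostrophes but drops other punctuation:

```python
from spellcheckerpy.utils import parse_into_words

parse_into_words("Don't panic - it's fine!")  # ["Don't", "panic", "it's", "fine"]
```
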
def gzip_file(input_path: str, output_path: str) -> None:
    """Compress a file using gzip

    Args:
        input_path (str): Path to the input file
        output_path (str): Path where the gzipped file will be created
    """
    with open(input_path, "rb") as f_in:
        with gzip.open(output_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    print(f"Gzipped file created at {output_path}")


def encode_image_from_path(image_path: str) -> str:
    """Encode an image from a given file path as a base64 string

    Args:
        image_path (str): Path to the image file
    Returns:
        str: The base64-encoded string of the image contents
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def encode_image_from_bytes(image_bytes: bytes) -> str:
    """Encode image bytes as a base64 string

    Args:
        image_bytes (bytes): The image data as bytes
    Returns:
        str: Base64-encoded string of the image bytes
    """
    return base64.b64encode(image_bytes).decode("utf-8")


def extract_json_content(response_text: str) -> typing.Optional[bool]:
    """Extract and interpret the 'reject' value from an OpenAI API response JSON

    Args:
        response_text (str): The JSON response text from the OpenAI API
    Returns:
        Optional[bool]: True if the response indicates rejection ("reject": "true"), \
            False if not rejected ("reject": "false"); None if parsing fails
    """
    try:
        response_json = json.loads(response_text)
        content = response_json["choices"][0]["message"]["content"]
        content_json = json.loads(content.replace("'", '"'))
        return content_json.get("reject", "false").lower() == "true"
    except (KeyError, IndexError, json.JSONDecodeError):
        return None
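A sketch of `extract_json_content` against the response shape it expects (a hand-built chat-completions envelope, not a live API reply): the model's `{'reject': ...}` answer uses single quotes, so the helper swaps them for double quotes before parsing.

```python
from spellcheckerpy.utils import extract_json_content

sample = '{"choices": [{"message": {"content": "{\'reject\': \'true\'}"}}]}'
extract_json_content(sample)              # True
extract_json_content("not json at all")  # None; parsing failed
```
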
spellcheckerpy-1.1.0.dist-info/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
spellcheckerpy-1.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,34 @@
Metadata-Version: 2.1
Name: spellcheckerpy
Version: 1.1.0
Summary: Detect spelling errors in images and plain text using OpenAI Vision
Author-email: Tyler Barrus <barrust@gmail.com>
License: MIT
Project-URL: Homepage, https://github.com/barrust/pyspellchecker
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: requests>=2.31.0

# spellcheckerpy

Detect spelling errors in images and plain text using OpenAI Vision.

## Usage

```python
from spellcheckerpy import SpellChecker

spellchecker = SpellChecker()

api_key = "YOUR_OPENAI_API_KEY"  # Replace with your OpenAI API key

text = "I am a student."
print(spellchecker.check_text(text, api_key))  # True = reject (spelling error); False = accept

image_path = "test.png"  # file path to an image, or raw image bytes
print(spellchecker.check_image(image_path, api_key))  # True = reject (spelling error); False = accept
```
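Beyond the README above, the inherited Norvig-style API also supports exporting a custom dictionary and reloading it later. A sketch (the file name is illustrative):

```python
from spellcheckerpy import SpellChecker

spell = SpellChecker(language=None)
spell.word_frequency.load_text("a small corpus of words words words")
spell.export("my_dict.json.gz")  # gzipped JSON of word counts

reloaded = SpellChecker(local_dictionary="my_dict.json.gz")
reloaded.correction("wrods")     # "words"
```
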
spellcheckerpy-1.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,21 @@
spellcheckerpy/__init__.py,sha256=QBtLViofx6CRTwH6q9N9NYbu7VADeBJkSLffUSBTeZM,277
spellcheckerpy/core.py,sha256=cClPefkagFqbYLUJI6dkgl0DEdI9IHdYuc57iTIfy5M,25443
spellcheckerpy/info.py,sha256=gbZejf8gz6zrZXQAstJ7HLIV5bRP_K2yi_Nn2-BekYQ,290
spellcheckerpy/utils.py,sha256=0rpR8CfhOQJ04yjgYoQAx7lGKNpT1Ga_hjkiKv9o-SE,7234
spellcheckerpy/resources/ar.json.gz,sha256=620Wb1XIxMEa6pO1x8lJBQP2f4K1mdC2cfz2yfMF_3M,714576
spellcheckerpy/resources/de.json.gz,sha256=ecrNKij6bekInxL_rxHFNWNT6gih-V8Hghjx5ZR6lBQ,1075650
spellcheckerpy/resources/en.json.gz,sha256=JHSkivhv2B3M6p7dC7ps023S7O3AriF878sjO7ooYTw,650750
spellcheckerpy/resources/es.json.gz,sha256=YNJ_0V7iqZ5nn22lEdihQZQfJwU33uNESNHFEnkXUXE,354425
spellcheckerpy/resources/eu.json.gz,sha256=XfVQapnGo9j2S7vlf45FT7bflBAHXvd2PYFjcDH6vzo,350357
spellcheckerpy/resources/fa.json.gz,sha256=NbOPrSmIEsM2MEqSR2H2t4eJ6DMh2l3XHrup62T-_7M,88583
spellcheckerpy/resources/fr.json.gz,sha256=BMJW9tBsTK3vZAtjAC4OOQAIJl440975cXLCktCaFB0,497432
spellcheckerpy/resources/it.json.gz,sha256=TG4kWV5ZtPgC76LS4dJEw9nOZFrhsmItTcmLde0w_mE,465402
spellcheckerpy/resources/lv.json.gz,sha256=nrltsXLlZj5Ta9DFXZZvIK5wvKXlUTIX69BfGKCvgoI,418819
spellcheckerpy/resources/nl.json.gz,sha256=uhRB70F7TAped9GjIIZueWkkzZ2EOaQIURheWBKitsU,1328560
spellcheckerpy/resources/pt.json.gz,sha256=buQc8cBxvh0FVkKxchST5iTuQNUyx3J0uIFCCnaiopQ,1261113
spellcheckerpy/resources/ru.json.gz,sha256=wLUWMSkL0tG_Xo3ZAqZ-Zoqe974Am-ZXOV1BkBruf_M,107969
spellcheckerpy-1.1.0.dist-info/LICENSE,sha256=Ccz49ErvRuftKuWMyju6gVrslVp5Po5OcxmEHLSXyxM,1072
spellcheckerpy-1.1.0.dist-info/METADATA,sha256=4IiEi8odhQQ6J8FEb3qdMnWZSrCbGfxjADbbn9Dtxf4,959
spellcheckerpy-1.1.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
spellcheckerpy-1.1.0.dist-info/top_level.txt,sha256=wlRGkCxqOqjKVkiwxNF0YhwMdlRxeteDlSwkJPRT7Z0,15
spellcheckerpy-1.1.0.dist-info/RECORD,,
spellcheckerpy-1.1.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
spellcheckerpy