openai-spellchecker 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. openai_spellchecker-1.0.0/LICENSE +21 -0
  2. openai_spellchecker-1.0.0/PKG-INFO +34 -0
  3. openai_spellchecker-1.0.0/README.md +22 -0
  4. openai_spellchecker-1.0.0/openai_spellchecker/__init__.py +15 -0
  5. openai_spellchecker-1.0.0/openai_spellchecker/core.py +692 -0
  6. openai_spellchecker-1.0.0/openai_spellchecker/info.py +10 -0
  7. openai_spellchecker-1.0.0/openai_spellchecker/resources/ar.json.gz +0 -0
  8. openai_spellchecker-1.0.0/openai_spellchecker/resources/de.json.gz +0 -0
  9. openai_spellchecker-1.0.0/openai_spellchecker/resources/en.json.gz +0 -0
  10. openai_spellchecker-1.0.0/openai_spellchecker/resources/es.json.gz +0 -0
  11. openai_spellchecker-1.0.0/openai_spellchecker/resources/eu.json.gz +0 -0
  12. openai_spellchecker-1.0.0/openai_spellchecker/resources/fa.json.gz +0 -0
  13. openai_spellchecker-1.0.0/openai_spellchecker/resources/fr.json.gz +0 -0
  14. openai_spellchecker-1.0.0/openai_spellchecker/resources/it.json.gz +0 -0
  15. openai_spellchecker-1.0.0/openai_spellchecker/resources/lv.json.gz +0 -0
  16. openai_spellchecker-1.0.0/openai_spellchecker/resources/nl.json.gz +0 -0
  17. openai_spellchecker-1.0.0/openai_spellchecker/resources/pt.json.gz +0 -0
  18. openai_spellchecker-1.0.0/openai_spellchecker/resources/ru.json.gz +0 -0
  19. openai_spellchecker-1.0.0/openai_spellchecker/utils.py +214 -0
  20. openai_spellchecker-1.0.0/openai_spellchecker.egg-info/PKG-INFO +34 -0
  21. openai_spellchecker-1.0.0/openai_spellchecker.egg-info/SOURCES.txt +24 -0
  22. openai_spellchecker-1.0.0/openai_spellchecker.egg-info/dependency_links.txt +1 -0
  23. openai_spellchecker-1.0.0/openai_spellchecker.egg-info/requires.txt +1 -0
  24. openai_spellchecker-1.0.0/openai_spellchecker.egg-info/top_level.txt +2 -0
  25. openai_spellchecker-1.0.0/pyproject.toml +27 -0
  26. openai_spellchecker-1.0.0/setup.cfg +4 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,34 @@
1
+ Metadata-Version: 2.1
2
+ Name: openai_spellchecker
3
+ Version: 1.0.0
4
+ Summary: Detect spelling errors in images and plain text using OpenAI Vision
5
+ Author-email: Tyler Barrus <barrust@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/barrust/openai-spellchecker
8
+ Requires-Python: >=3.9
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: requests>=2.31.0
12
+
13
+ # openai_spellchecker
14
+
15
+ Detect spelling errors in images and plain text using OpenAI Vision.
16
+
17
+
18
+ ## Usage
19
+
20
+ ```python
21
+
22
+ from openai_spellchecker import SpellChecker
23
+
24
+ spellchecker = SpellChecker()
25
+
26
+ api_key = "YOUR_OPENAI_API_KEY" # Replace with your OpenAI API key
27
+
28
+ text = "I am a student."
29
+ print(spellchecker.check_text(text, api_key)) # True = reject (spelling error). False = accept.
30
+
31
+ image_path = "test.png" # file path to an image or image bytes
32
+ print(spellchecker.check_image(image_path, api_key)) # True = reject (spelling error). False = accept.
33
+
34
+ ```
@@ -0,0 +1,22 @@
1
+ # openai_spellchecker
2
+
3
+ Detect spelling errors in images and plain text using OpenAI Vision.
4
+
5
+
6
+ ## Usage
7
+
8
+ ```python
9
+
10
+ from openai_spellchecker import SpellChecker
11
+
12
+ spellchecker = SpellChecker()
13
+
14
+ api_key = "YOUR_OPENAI_API_KEY" # Replace with your OpenAI API key
15
+
16
+ text = "I am a student."
17
+ print(spellchecker.check_text(text, api_key)) # True = reject (spelling error). False = accept.
18
+
19
+ image_path = "test.png" # file path to an image or image bytes
20
+ print(spellchecker.check_image(image_path, api_key)) # True = reject (spelling error). False = accept.
21
+
22
+ ```
@@ -0,0 +1,15 @@
1
"""SpellChecker Module"""

# Re-export the two public classes and the package metadata constants so
# users can do `from openai_spellchecker import SpellChecker`.
from .core import SpellChecker, WordFrequency
from .info import (
    __author__,
    __maintainer__,
    __email__,
    __license__,
    __version__,
    __credits__,
    __url__,
    __bugtrack_url__,
)

# Only the two main classes form the public API for `import *`.
__all__ = ["SpellChecker", "WordFrequency"]
@@ -0,0 +1,692 @@
1
+ """SpellChecker Module: provides a straightforward spell checking implementation
2
+ inspired by Peter Norvig's method. Reference: https://norvig.com/spell-correct.html
3
+ """
4
+
5
+ import os
6
+ import gzip
7
+ import json
8
+ import pkgutil
9
+ import string
10
+ import typing
11
+ import base64
12
+ from collections import Counter
13
+ from collections.abc import Iterable
14
+ import builtins
15
+ import unicodedata
16
+ import requests
17
+
18
+ from .utils import KeyT, PathOrStr, parse_into_words, ensure_unicode, load_file, write_file, test_file, encode_image_from_path, encode_image_from_bytes, extract_json_content
19
+
20
+
21
class SpellChecker:
    """The SpellChecker class encapsulates the basics needed to accomplish a
    simple spell checking algorithm. It is based on the work by
    Peter Norvig (https://norvig.com/spell-correct.html)

    Args:
        language (str): The language of the dictionary to load or None for no dictionary. Supported languages are \
            `en`, `es`, `it`, `de`, `fr`, `pt`, `ru`, `ar`, `lv`, `eu`, `nl` and `fa`. Defaults to `en`. A list of \
            languages may be provided and all languages will be loaded.
        local_dictionary (str): The path to a locally stored word frequency dictionary; if provided, no language \
            will be loaded.
        distance (int): The edit distance to use. Defaults to 2.
        tokenizer (Callable): A function to split text into words; defaults to a whitespace-based splitter.
        case_sensitive (bool): Flag to use a case sensitive dictionary or not, only available when not using a \
            language dictionary.
    Note:
        Using a case sensitive dictionary can be slow to correct words.
    """

    __slots__ = ["_distance", "_word_frequency", "_tokenizer", "_case_sensitive"]

    def __init__(
        self,
        language: typing.Union[str, typing.Iterable[str], None] = "en",
        local_dictionary: typing.Optional[PathOrStr] = None,
        distance: int = 2,
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
        case_sensitive: bool = False,
    ) -> None:
        self._distance = 2
        # route through the property setter so the value is validated
        self.distance = distance

        self._tokenizer = tokenizer if tokenizer else parse_into_words

        # Case sensitivity is only honored when NOT using a bundled language
        # dictionary (bundled dictionaries are all lower-cased).
        self._case_sensitive = case_sensitive if not language else False
        self._word_frequency = WordFrequency(self._tokenizer, self._case_sensitive)

        if local_dictionary:
            self._word_frequency.load_dictionary(local_dictionary)
        elif language:
            if not isinstance(language, Iterable) or isinstance(language, (str, bytes)):
                language = [language]
            for lang in language:
                filename = f"resources/{lang.lower()}.json.gz"
                try:
                    # BUG FIX: the bundled resources live in THIS package
                    # ("openai_spellchecker"), not in the unrelated
                    # "spellchecker" package, so get_data must use this
                    # package's name or nothing ever loads.
                    json_open = pkgutil.get_data("openai_spellchecker", filename)
                except FileNotFoundError as exc:
                    msg = f"The provided dictionary language ({lang.lower()}) does not exist!"
                    raise ValueError(msg) from exc
                if json_open is None:
                    # pkgutil.get_data may return None instead of raising when
                    # the resource is missing; previously this was silently
                    # skipped, leaving an empty dictionary.
                    msg = f"The provided dictionary language ({lang.lower()}) does not exist!"
                    raise ValueError(msg)
                try:
                    lang_dict = json.loads(gzip.decompress(json_open).decode("utf-8"))
                    self._word_frequency.load_json(lang_dict)
                except Exception as e:
                    raise RuntimeError(f"Error loading language dictionary for {lang}: {e}") from e

    def __contains__(self, key: KeyT) -> bool:
        """setup easier known checks"""
        key = ensure_unicode(key)
        return key in self._word_frequency

    def __getitem__(self, key: KeyT) -> int:
        """setup easier frequency checks"""
        key = ensure_unicode(key)
        return self._word_frequency[key]

    def __iter__(self) -> typing.Generator[str, None, None]:
        """setup iter support"""
        yield from self._word_frequency.dictionary

    @classmethod
    def languages(cls) -> typing.Iterable[str]:
        """list: A list of all official languages supported by the library"""
        return ["en", "es", "fr", "it", "pt", "de", "ru", "ar", "lv", "eu", "nl", "fa"]

    @property
    def word_frequency(self) -> "WordFrequency":
        """WordFrequency: An encapsulation of the word frequency `dictionary`

        Note:
            Not settable
        """
        return self._word_frequency

    @property
    def distance(self) -> int:
        """int: The maximum edit distance to calculate

        Note:
            Valid values are 1 or 2; if an invalid value is passed, defaults to 2
        """
        return self._distance

    @distance.setter
    def distance(self, val: int) -> None:
        """set the distance parameter"""
        tmp = 2
        try:
            # BUG FIX: convert before storing; previously `tmp = val` stored
            # the raw value, so e.g. the string "2" was kept as a str.
            val = int(val)
            if 0 < val <= 2:
                tmp = val
        except (ValueError, TypeError):
            pass
        self._distance = tmp

    def split_words(self, text: KeyT) -> typing.Iterable[str]:
        """Split text into individual `words` using either a simple whitespace
        regex or the passed in tokenizer

        Args:
            text (str): The text to split into individual words
        Returns:
            list(str): A listing of all words in the provided text
        """
        text = ensure_unicode(text)
        return self._tokenizer(text)

    def export(self, filepath: PathOrStr, encoding: str = "utf-8", gzipped: bool = True) -> None:
        """Export the word frequency list for import in the future

        Args:
            filepath (str): The filepath to the exported dictionary
            encoding (str): The encoding of the resulting output
            gzipped (bool): Whether to gzip the dictionary or not
        """
        data = json.dumps(self.word_frequency.dictionary, sort_keys=True)
        write_file(filepath, encoding, gzipped, data)

    def word_usage_frequency(self, word: KeyT, total_words: typing.Optional[int] = None) -> float:
        """Calculate the frequency to the `word` provided as seen across the
        entire dictionary

        Args:
            word (str): The word for which the word probability is calculated
            total_words (int): The total number of words to use in the calculation;
                use the default for using the whole word frequency
        Returns:
            float: The probability that the word is the correct word
        """
        if not total_words:
            total_words = self._word_frequency.total_words
        word = ensure_unicode(word)
        if not total_words:
            # empty corpus: avoid ZeroDivisionError
            return 0.0
        return self._word_frequency.dictionary[word] / total_words

    def correction(self, word: KeyT) -> typing.Optional[str]:
        """The most probable correct spelling for the word

        Args:
            word (str): The word to correct
        Returns:
            str: The most likely candidate or None if no correction is present
        """
        word = ensure_unicode(word)
        candidates = self.candidates(word)
        if not candidates:
            return None

        # Prefer candidates that differ only by diacritics from the input
        # (e.g. "cafe" -> "café") before falling back to raw frequency.
        word_no_accents = self._remove_diacritics(word)
        diacritics_candidates = [c for c in candidates if self._remove_diacritics(c) == word_no_accents]
        if diacritics_candidates:
            return max(diacritics_candidates, key=self.__getitem__)
        return max(candidates, key=self.__getitem__)

    def candidates(self, word: KeyT) -> typing.Optional[typing.Set[str]]:
        """Generate possible spelling corrections for the provided word up to
        an edit distance of two, if and only when needed

        Args:
            word (str): The word for which to calculate candidate spellings
        Returns:
            set: The set of words that are possible candidates or None if there are no candidates
        """
        word = ensure_unicode(word)
        if self.known([word]):
            # the word itself is known: no correction needed
            return {word}

        if not self._check_if_should_check(word):
            # punctuation / numbers / over-long tokens are returned as-is
            return {word}

        res = list(self.edit_distance_1(word))
        tmp = self.known(res)
        if tmp:
            return tmp
        if self._distance == 2:
            tmp = self.known(list(self.__edit_distance_alt(res)))
            if tmp:
                return tmp
        return None

    def known(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
        """The subset of `words` that appear in the dictionary of words

        Args:
            words (list): List of words to determine which are in the corpus
        Returns:
            set: The set of those words from the input that are in the corpus
        """
        tmp_words = [ensure_unicode(w) for w in words]
        tmp = [w if self._case_sensitive else w.lower() for w in tmp_words]
        return {w for w in tmp if w in self._word_frequency.dictionary and self._check_if_should_check(w)}

    def unknown(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
        """The subset of `words` that do not appear in the dictionary

        Args:
            words (list): List of words to determine which are not in the corpus
        Returns:
            set: The set of those words from the input that are not in the corpus
        """
        tmp_words = [ensure_unicode(w) for w in words]
        tmp = [w if self._case_sensitive else w.lower() for w in tmp_words if self._check_if_should_check(w)]
        return {w for w in tmp if w not in self._word_frequency.dictionary}

    def edit_distance_1(self, word: KeyT) -> typing.Set[str]:
        """Generate all words that are one edit operation away from `word`,
        using only characters present in the corpus

        Args:
            word (str): The input word to generate possible single-edit variations for
        Returns:
            set: A set of words that differ from the input by exactly one edit
        """
        tmp_word = ensure_unicode(word).lower() if not self._case_sensitive else ensure_unicode(word)
        if self._check_if_should_check(tmp_word) is False:
            return {tmp_word}
        letters = self._word_frequency.letters
        splits = [(tmp_word[:i], tmp_word[i:]) for i in range(len(tmp_word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edit_distance_2(self, word: KeyT) -> typing.List[str]:
        """Compute all strings that are two edits away from `word` using only
        the letters in the corpus

        Args:
            word (str): The word for which to calculate the edit distance
        Returns:
            list: The strings that are edit distance two from the provided word
        """
        word = ensure_unicode(word).lower() if not self._case_sensitive else ensure_unicode(word)
        return [e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)]

    def check_text(self, text: str, api_key: str) -> bool:
        """Checks the given text for spelling errors using the OpenAI API.

        Note:
            The text is sent to the external OpenAI chat-completions endpoint.

        Args:
            text (str): The text to be checked
            api_key (str): The OpenAI API key to use for the request

        Returns:
            bool: True if spelling errors are detected and the text must be rejected, \
                False otherwise or if the API call/response fails
        """
        if not api_key:
            raise ValueError("OpenAI API Key is required.")

        prompt = (
            "Check this text carefully to review the spelling. "
            "If there are spelling errors we must reject it. "
            "Return your answer in this exact format with no additional commentary: {'reject': 'true' or 'false'}."
        )

        payload = {
            "model": "gpt-4o",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"{prompt}\n\nText: {text}",
                        }
                    ],
                }
            ],
            "max_tokens": 30,
        }

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        }

        try:
            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=30,  # avoid hanging forever on a stalled connection
            )
            response.raise_for_status()
            must_reject = extract_json_content(response.text)
            return bool(must_reject)
        except Exception:
            # best-effort policy: any API or parsing failure means "accept"
            return False

    def check_image(self, image: typing.Union[str, bytes], api_key: str) -> bool:
        """Checks the given image for spelling errors using the OpenAI API.

        Note:
            The image content is sent to the external OpenAI chat-completions endpoint.

        Args:
            image (str | os.PathLike | bytes | bytearray): The image to be checked, given as a \
                file path (str or os.PathLike), or as bytes/bytearray
            api_key (str): The OpenAI API key to use for the request
        Returns:
            bool: True if spelling errors are detected in the image and it must be rejected, \
                False otherwise or if the API call/response fails
        Raises:
            ValueError: If no API key is provided
            TypeError: If `image` is not a path or bytes-like object
        """
        if not api_key:
            raise ValueError("OpenAI API Key is required.")

        if isinstance(image, (str, os.PathLike)):
            base64_image = encode_image_from_path(str(image))
        elif isinstance(image, (bytes, bytearray)):
            base64_image = encode_image_from_bytes(bytes(image))
        else:
            # BUG FIX: previously an unsupported type fell through and raised
            # an opaque NameError on the unbound `base64_image` below.
            raise TypeError("image must be a file path (str/os.PathLike) or bytes/bytearray")

        prompt = (
            "Check this image carefully to review the spelling. "
            "If there are spelling errors we must reject it. "
            "Return your answer in this exact format with no additional commentary: {'reject': 'true' or 'false'}."
        )

        payload = {
            "model": "gpt-4o",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/webp;base64,{base64_image}",
                                "detail": "low",
                            },
                        },
                    ],
                }
            ],
            "max_tokens": 30,
        }

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        }

        try:
            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=30,  # avoid hanging forever on a stalled connection
            )
            response.raise_for_status()
            must_reject = extract_json_content(response.text)
            return bool(must_reject)
        except Exception:
            # best-effort policy: any API or parsing failure means "accept"
            return False

    def __edit_distance_alt(self, words: typing.Iterable[KeyT]) -> typing.List[str]:
        """Compute all strings that are 1 edits away from all the words using
        only the letters in the corpus

        Args:
            words (list): The words for which to calculate the edit distance
        Returns:
            list: The known words that are edit distance two from the provided words
        """
        tmp_words = [ensure_unicode(w) for w in words]
        tmp = [w if self._case_sensitive else w.lower() for w in tmp_words if self._check_if_should_check(w)]
        return [e2 for e1 in tmp for e2 in self.known(self.edit_distance_1(e1))]

    def _remove_diacritics(self, input_str: KeyT) -> str:
        """Remove diacritics from the input string

        Args:
            input_str (str): The string from which to remove diacritics
        Returns:
            str: The string with diacritics removed
        """
        # NFKD decomposes accented characters into base char + combining mark
        nfkd_form = unicodedata.normalize('NFKD', ensure_unicode(input_str))
        return "".join([c for c in nfkd_form if not unicodedata.combining(c)])

    def _check_if_should_check(self, word: str) -> bool:
        """Determine whether a token is even worth spell checking.

        Skips lone punctuation, tokens far longer than any dictionary word,
        and anything parseable as a number (except the literal word "nan").
        """
        if len(word) == 1 and word in string.punctuation:
            return False
        if len(word) > self._word_frequency.longest_word_length + 3:
            return False
        if word.lower() == "nan":
            # "nan" parses as a float but is also a real word; check it
            return True
        try:
            float(word)
            return False
        except ValueError:
            pass
        return True
419
+
420
+
421
class WordFrequency:
    """Word-frequency store backing the spell checker.

    The corpus is held as a ``Counter`` mapping word -> occurrence count.
    Derived statistics (total words, unique words, letters seen, longest
    word length) are cached and refreshed after every mutation.
    """

    __slots__ = [
        "_dictionary",
        "_total_words",
        "_unique_words",
        "_letters",
        "_tokenizer",
        "_case_sensitive",
        "_longest_word_length",
        "_evaluate"
    ]

    def __init__(
        self,
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
        case_sensitive: bool = False,
    ) -> None:
        self._dictionary: typing.Counter = Counter()
        self._total_words = 0
        self._unique_words = 0
        self._letters: typing.Set[str] = set()
        self._case_sensitive = case_sensitive
        self._longest_word_length = 0
        self._evaluate = False
        # fall back to the whitespace splitter when no tokenizer is given
        self._tokenizer = tokenizer if tokenizer is not None else parse_into_words

    def __contains__(self, key: KeyT) -> bool:
        """Membership test honoring the case-sensitivity setting."""
        word = ensure_unicode(key)
        if not self._case_sensitive:
            word = word.lower()
        return word in self._dictionary

    def __getitem__(self, key: KeyT) -> int:
        """Frequency lookup honoring the case-sensitivity setting."""
        word = ensure_unicode(key)
        if not self._case_sensitive:
            word = word.lower()
        return self._dictionary[word]

    def __iter__(self) -> typing.Generator[str, None, None]:
        """Iterate over the words in the corpus."""
        for word in self._dictionary:
            yield word

    def pop(self, key: KeyT, default: typing.Optional[int] = None) -> typing.Optional[int]:
        """Remove the key and return the associated value or default if not
        found

        Args:
            key (str): The key to remove
            default (obj): The value to return if key is not present
        Returns:
            int | None: Returns the number of instances of key, or None if not in the dictionary
        """
        word = ensure_unicode(key)
        if not self._case_sensitive:
            word = word.lower()
        return self._dictionary.pop(word, default)

    @property
    def dictionary(self) -> typing.Dict[str, int]:
        """Counter: Word -> occurrence count for the whole corpus (read-only)"""
        return self._dictionary

    @property
    def total_words(self) -> int:
        """int: Sum of all word occurrences in the corpus (read-only)"""
        return self._total_words

    @property
    def unique_words(self) -> int:
        """int: Number of distinct words in the corpus (read-only)"""
        return self._unique_words

    @property
    def letters(self) -> typing.Set[str]:
        """set: Every letter that appears in any corpus word (read-only)"""
        return self._letters

    @property
    def longest_word_length(self) -> int:
        """int: Length of the longest word in the corpus (read-only)"""
        return self._longest_word_length

    def tokenize(self, text: KeyT) -> typing.Iterator[str]:
        """Tokenize the provided string object into individual words

        Args:
            text (str): The string object to tokenize
        Yields:
            str: The next `word` in the tokenized string
        Note:
            This is the same as the `split_words()` unless a tokenizer function was provided
        """
        for token in self._tokenizer(ensure_unicode(text)):
            yield token if self._case_sensitive else token.lower()

    def keys(self) -> typing.Iterator[str]:
        """Yield every key (word) in the dictionary; identical to `words()`."""
        for key in self._dictionary.keys():
            yield key

    def words(self) -> typing.Iterator[str]:
        """Yield every word in the dictionary; identical to `keys()`."""
        for word in self._dictionary.keys():
            yield word

    def items(self) -> typing.Generator[typing.Tuple[str, int], None, None]:
        """Yield (word, count) pairs; identical to `dict.items()`."""
        for pair in self._dictionary.items():
            yield pair

    def load_dictionary(self, filename: PathOrStr, encoding: str = "utf-8") -> None:
        """Load in a pre-built word frequency list

        Args:
            filename (str): The filepath to the json (optionally gzipped) file to be loaded
            encoding (str): The encoding of the dictionary
        """
        with load_file(filename, encoding) as contents:
            if not self._case_sensitive:
                contents = contents.lower()
            self._dictionary.update(json.loads(contents))
            self._update_dictionary()

    def load_json(self, data: typing.Dict[str, int]) -> None:
        """Load in a pre-built word frequency list

        Args:
            data (dict): The dictionary to be loaded
        """
        self._dictionary.update(data)
        self._update_dictionary()

    def load_text_file(
        self,
        filename: PathOrStr,
        encoding: str = "utf-8",
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
    ) -> None:
        """Load in a text file from which to generate a word frequency list

        Args:
            filename (str): The filepath to the text file to be loaded
            encoding (str): The encoding of the text file
            tokenizer (function): The function to use to tokenize a string
        """
        with load_file(filename, encoding=encoding) as contents:
            self.load_text(contents, tokenizer)

    def load_text(
        self,
        text: KeyT,
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
    ) -> None:
        """Load text from which to generate a word frequency list

        Args:
            text (str): The text to be loaded
            tokenizer (function): The function to use to tokenize a string
        """
        raw = ensure_unicode(text)
        if tokenizer:
            tokens = [t if self._case_sensitive else t.lower() for t in tokenizer(raw)]
        else:
            tokens = self.tokenize(raw)
        self._dictionary.update(tokens)
        self._update_dictionary()

    def load_words(self, words: typing.Iterable[KeyT]) -> None:
        """Load a list of words from which to generate a word frequency list

        Args:
            words (list): The list of words to be loaded
        """
        normalized = [ensure_unicode(w) for w in words]
        if not self._case_sensitive:
            normalized = [w.lower() for w in normalized]
        self._dictionary.update(normalized)
        self._update_dictionary()

    def add(self, word: KeyT, val: int = 1) -> None:
        """Add a word to the word frequency list

        Args:
            word (str): The word to add
            val (int): The number of times to insert the word
        """
        key = ensure_unicode(word)
        if not self._case_sensitive:
            key = key.lower()
        self.load_json({key: val})

    def remove_words(self, words: typing.Iterable[KeyT]) -> None:
        """Remove a list of words from the word frequency list

        Args:
            words (list): The list of words to remove
        """
        for word in [ensure_unicode(w) for w in words]:
            self.pop(word)
        self._update_dictionary()

    def remove(self, word: KeyT) -> None:
        """Remove a word from the word frequency list

        Args:
            word (str): The word to remove
        """
        self.pop(word)
        self._update_dictionary()

    def remove_by_threshold(self, threshold: int = 5) -> None:
        """Remove all words at, or below, the provided threshold

        Args:
            threshold (int): The threshold at which a word is to be removed
        """
        stale = [word for word, count in self._dictionary.items() if count <= threshold]
        self.remove_words(stale)

    def _update_dictionary(self) -> None:
        """Recompute the cached statistics after any mutation."""
        corpus = self._dictionary.keys()
        if corpus:
            self._longest_word_length = max(map(len, corpus))
            self._total_words = sum(self._dictionary.values())
            self._unique_words = len(corpus)
            self._letters = set().union(*corpus)
        else:
            self._longest_word_length = 0
            self._total_words = 0
            self._unique_words = 0
            self._letters = set()
@@ -0,0 +1,10 @@
1
"""SpellChecker Information"""

__author__ = "Tyler Barrus"
__maintainer__ = "Tyler Barrus"
__email__ = "barrust@gmail.com"
__license__ = "MIT"
# BUG FIX: keep in sync with the released distribution version (PKG-INFO /
# pyproject say 1.0.0); the previous value "1.2.0" contradicted the package
# metadata, which also breaks the version gate in utils.fail_after().
__version__ = "1.0.0"
__credits__ = ["Peter Norvig"]
__url__ = "https://github.com/barrust/openai-spellchecker"
__bugtrack_url__ = f"{__url__}/issues"
@@ -0,0 +1,214 @@
1
+ """Additional utility functions"""
2
+
3
+ import os
4
+ import contextlib
5
+ import functools
6
+ import gzip
7
+ import re
8
+ import typing
9
+ import warnings
10
+ import base64
11
+ import json
12
+ import shutil
13
+ from pathlib import Path
14
+
15
+ from .info import __version__
16
+
17
+ KeyT = typing.Union[str, bytes]
18
+ PathOrStr = typing.Union[Path, str]
19
+
20
+
21
def fail_after(version: str) -> typing.Callable:
    """Decorator to add to tests to ensure that they fail if a deprecated
    feature is not removed before the specified version

    Args:
        version (str): The version to check against
    """

    def decorator_wrapper(func):
        @functools.wraps(func)
        def test_inner(*args, **kwargs):
            # Compare dotted versions numerically, component by component.
            deadline = [int(part) for part in version.split(".")]
            current = [int(part) for part in __version__.split(".")]
            if deadline <= current:
                msg = (
                    f"The function {func.__name__} must be fully removed as it is deprecated"
                    f" and must be removed by version {version}"
                )
                raise AssertionError(msg)
            return func(*args, **kwargs)

        return test_inner

    return decorator_wrapper
40
+
41
+
42
def deprecated(message: str = "") -> typing.Callable:
    """A simplistic decorator to mark functions as deprecated. The function
    will pass a message to the user on the first use of the function

    Args:
        message (str): The message to display if the function is deprecated
    """

    def decorator_wrapper(func):
        @functools.wraps(func)
        def function_wrapper(*args, **kwargs):
            func_name = func.__name__
            # Warn only once per wrapped function; subsequent calls are silent.
            if func_name not in function_wrapper.deprecated_items:
                warnings.warn(
                    f"Function {func.__name__} is now deprecated! {message}",
                    category=DeprecationWarning,
                    stacklevel=2,
                )
                function_wrapper.deprecated_items.add(func_name)
            return func(*args, **kwargs)

        # Tracks which function names have already emitted their warning.
        function_wrapper.deprecated_items = set()
        return function_wrapper

    return decorator_wrapper
61
+
62
+
63
def ensure_unicode(value: KeyT, encoding: str = "utf-8") -> str:
    """Simplify checking if passed in data are bytes or a string and decode
    bytes into unicode

    Args:
        value (str): The input string (possibly bytes)
        encoding (str): The encoding to use if input is bytes
    Returns:
        str: The decoded string
    Raises:
        TypeError: If value is neither str nor bytes
    """
    if isinstance(value, bytes):
        return value.decode(encoding)
    if not isinstance(value, str):
        # Previously only list inputs were rejected, so other non-text types
        # (ints, None, ...) leaked through despite the docstring's contract;
        # reject anything that is not text.
        raise TypeError(f"The provided value {value} is not of type str or bytes")
    return value
78
+
79
+
80
@contextlib.contextmanager
def __gzip_read(filename: PathOrStr, mode: str = "rb", encoding: str = "UTF-8") -> typing.Generator[KeyT, None, None]:
    """Context manager to correctly handle the decoding of the output of the gzip file

    Args:
        filename (str): The filename to open
        mode (str): The mode to read the data
        encoding (str): The file encoding to use (only applied in text modes)
    Returns:
        str: The string data from the gzip file read
    """
    # gzip.open() raises ValueError when an encoding is supplied for a binary
    # mode, which made the default mode="rb" unusable; only forward the
    # encoding for text modes.
    if "t" in mode:
        with gzip.open(filename, mode=mode, encoding=encoding) as fobj:
            yield fobj.read()
    else:
        with gzip.open(filename, mode=mode) as fobj:
            yield fobj.read()
93
+
94
+
95
@contextlib.contextmanager
def load_file(filename: PathOrStr, encoding: str) -> typing.Generator[KeyT, None, None]:
    """Context manager to handle opening a gzip or text file correctly and
    reading all the data

    Args:
        filename (str): The filename to open
        encoding (str): The file encoding to use
    Returns:
        str: The string data from the file read
    """
    # Accept pathlib.Path transparently by working with the string form.
    path_str = str(filename) if isinstance(filename, Path) else filename

    if path_str.lower().endswith(".gz"):
        with __gzip_read(path_str, mode="rt", encoding=encoding) as data:
            yield data
    else:
        with open(path_str, encoding=encoding) as fobj:
            yield fobj.read()
115
+
116
+
117
def write_file(filepath: PathOrStr, encoding: str, gzipped: bool, data: str) -> None:
    """Write the data to file either as a gzip file or text based on the
    gzipped parameter

    Args:
        filepath (str): The filename to open
        encoding (str): The file encoding to use
        gzipped (bool): Whether the file should be gzipped or not
        data (str): The data to be written out
    """
    if gzipped:
        # Pass the encoding through; previously the gzip branch silently
        # ignored it and wrote with the locale's default encoding.
        with gzip.open(filepath, "wt", encoding=encoding) as fobj:
            fobj.write(data)
    else:
        with open(filepath, "w", encoding=encoding) as fobj:
            fobj.write(data)
133
+
134
+
135
def test_file(filepath: PathOrStr, encoding: str, index: str):
    """Test and retrieve a section from a gzipped JSON file located in the resources directory.

    Args:
        filepath (str): The stem of the filename to open
        encoding (str): The encoding to use when reading the file
        index (str): The key to extract from dictionary
    """
    try:
        # Resolve the stem against the package's bundled resources directory.
        resource_dir = os.path.join(os.path.dirname(__file__), "resources")
        full_path = f"{resource_dir}/{filepath}.json.gz"
        with gzip.open(full_path, "rt", encoding=encoding) as fobj:
            payload = json.loads(fobj.read())
        return payload[index]
    except Exception as e:
        # Best-effort: report the problem and fall through to return None.
        print(f"An error occurred while reading gzip file: {e}")
152
+
153
+
154
def parse_into_words(text: str) -> typing.Iterable[str]:
    """Parse the text into words; currently removes punctuation except for
    apostrophes embedded inside a word

    Args:
        text (str): The text to split into words
    """
    # A word is either a lone word-character, or a run that starts and ends
    # with a word-character and may contain apostrophes in between.
    word_pattern = r"(\w[\w']*\w|\w)"
    return re.findall(word_pattern, text)
162
+
163
+
164
def gzip_file(input_path: str, output_path: str) -> None:
    """Compress a file using gzip

    Args:
        input_path (str): Path to the input file
        output_path (str): Path where the gzipped file will be created
    """
    # Stream the source straight into the gzip writer without loading it all.
    with open(input_path, 'rb') as source, gzip.open(output_path, 'wb') as target:
        shutil.copyfileobj(source, target)
    print(f"Gzipped file created at {output_path}")
175
+
176
+
177
def encode_image_from_path(image_path: str) -> str:
    """Encodes an image from a given file path as a base64 string

    Args:
        image_path (str): Path to the image file
    Returns:
        str: The base64-encoded string of the image contents
    """
    raw = Path(image_path).read_bytes()
    return base64.b64encode(raw).decode("utf-8")
187
+
188
+
189
def encode_image_from_bytes(image_bytes: bytes) -> str:
    """Encodes image bytes as a base64 string

    Args:
        image_bytes (bytes): The image data as bytes
    Returns:
        str: Base64-encoded string of the image bytes
    """
    encoded = base64.b64encode(image_bytes)
    return encoded.decode("utf-8")
198
+
199
+
200
def extract_json_content(response_text: str) -> typing.Optional[bool]:
    """Extracts and interprets the 'reject' value from an OpenAI API response text JSON

    Args:
        response_text (str): The JSON response text from the OpenAI API
    Returns:
        Optional[bool]: True if the response indicates rejection, False if not rejected. Returns None if parsing fails
    """
    try:
        response_json = json.loads(response_text)
        content = response_json["choices"][0]["message"]["content"]
        # The model sometimes emits single-quoted pseudo-JSON; normalize the
        # quotes before parsing. NOTE(review): this breaks if the content
        # itself contains apostrophes — presumably prevented by the prompt.
        content_json = json.loads(content.replace("'", '"'))
        reject = content_json.get("reject", "false")
        if isinstance(reject, bool):
            # Valid JSON may encode the flag as a real boolean; the previous
            # code crashed here with AttributeError on True.lower().
            return reject
        return str(reject).lower() == "true"
    except (KeyError, IndexError, AttributeError, json.JSONDecodeError):
        return None
@@ -0,0 +1,34 @@
1
+ Metadata-Version: 2.1
2
+ Name: openai-spellchecker
3
+ Version: 1.0.0
4
+ Summary: Detect spelling errors in images and plain text using OpenAI Vision
5
+ Author-email: Tyler Barrus <barrust@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/barrust/openai-spellchecker
8
+ Requires-Python: >=3.9
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: requests>=2.31.0
12
+
13
+ # openai_spellchecker
14
+
15
+ Detect spelling errors in images and plain text using OpenAI Vision.
16
+
17
+
18
+ ## Usage
19
+
20
+ ```python
21
+
22
+ from openai_spellchecker import SpellChecker
23
+
24
+ spellchecker = SpellChecker()
25
+
26
+ api_key = "YOUR_OPENAI_API_KEY" # Replace with your OpenAI API key
27
+
28
+ text = "I am a student."
29
+ print(spellchecker.check_text(text, api_key)) # True = reject (spelling error). False = accept.
30
+
31
+ image_path = "test.png" # file path to an image or image bytes
32
+ print(spellchecker.check_image(image_path, api_key)) # True = reject (spelling error). False = accept.
33
+
34
+ ```
@@ -0,0 +1,24 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ openai_spellchecker/__init__.py
5
+ openai_spellchecker/core.py
6
+ openai_spellchecker/info.py
7
+ openai_spellchecker/utils.py
8
+ openai_spellchecker.egg-info/PKG-INFO
9
+ openai_spellchecker.egg-info/SOURCES.txt
10
+ openai_spellchecker.egg-info/dependency_links.txt
11
+ openai_spellchecker.egg-info/requires.txt
12
+ openai_spellchecker.egg-info/top_level.txt
13
+ openai_spellchecker/resources/ar.json.gz
14
+ openai_spellchecker/resources/de.json.gz
15
+ openai_spellchecker/resources/en.json.gz
16
+ openai_spellchecker/resources/es.json.gz
17
+ openai_spellchecker/resources/eu.json.gz
18
+ openai_spellchecker/resources/fa.json.gz
19
+ openai_spellchecker/resources/fr.json.gz
20
+ openai_spellchecker/resources/it.json.gz
21
+ openai_spellchecker/resources/lv.json.gz
22
+ openai_spellchecker/resources/nl.json.gz
23
+ openai_spellchecker/resources/pt.json.gz
24
+ openai_spellchecker/resources/ru.json.gz
@@ -0,0 +1 @@
1
+ requests>=2.31.0
@@ -0,0 +1,2 @@
1
+ dist
2
+ openai_spellchecker
@@ -0,0 +1,27 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68,<69", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "openai_spellchecker"
7
+ version = "1.0.0"
8
+ description = "Detect spelling errors in images and plain text using OpenAI Vision"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ authors = [{name = "Tyler Barrus", email = "barrust@gmail.com"}]
13
+ dependencies = [
14
+ "requests>=2.31.0"
15
+ ]
16
+
17
+ [project.urls]
18
+ Homepage = "https://github.com/barrust/openai-spellchecker"
19
+
20
+ [tool.setuptools.packages.find]
21
+ where = ["."]
22
+
23
+ [tool.setuptools]
24
+ include-package-data = true
25
+
26
+ [tool.setuptools.package-data]
27
+ "openai_spellchecker" = ["**/*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+