spellcheckerpy-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
spellcheckerpy/__init__.py ADDED
@@ -0,0 +1,15 @@
+ """SpellChecker Module"""
+
+ from .core import SpellChecker, WordFrequency
+ from .info import (
+     __author__,
+     __maintainer__,
+     __email__,
+     __license__,
+     __version__,
+     __credits__,
+     __url__,
+     __bugtrack_url__,
+ )
+
+ __all__ = ["SpellChecker", "WordFrequency"]
spellcheckerpy/core.py ADDED
@@ -0,0 +1,692 @@
+ """SpellChecker Module: provides a straightforward spell checking implementation
+ inspired by Peter Norvig's method. Reference: https://norvig.com/spell-correct.html
+ """
+
+ import os
+ import gzip
+ import json
+ import pkgutil
+ import string
+ import typing
+ import base64
+ from collections import Counter
+ from collections.abc import Iterable
+ import unicodedata
+ import requests
+
+ from .utils import (
+     KeyT,
+     PathOrStr,
+     parse_into_words,
+     ensure_unicode,
+     load_file,
+     write_file,
+     test_file,
+     encode_image_from_path,
+     encode_image_from_bytes,
+     extract_json_content,
+ )
+
+
+ class SpellChecker:
+     """The SpellChecker class encapsulates the basics needed to accomplish a
+     simple spell checking algorithm. It is based on the work by
+     Peter Norvig (https://norvig.com/spell-correct.html)
+
+     Args:
+         language (str): The language of the dictionary to load, or None for no dictionary. Supported languages are \
+             `en`, `es`, `it`, `de`, `fr`, `pt`, `ru`, `ar`, `lv`, `eu`, `nl` and `fa`. Defaults to `en`. A list of \
+             languages may be provided, in which case all of them are loaded.
+         local_dictionary (str): The path to a locally stored word frequency dictionary; if provided, no language \
+             will be loaded.
+         distance (int): The edit distance to use. Defaults to 2.
+         tokenizer (function): The function used to tokenize a string; defaults to a simple regex-based word split.
+         case_sensitive (bool): Flag to use a case-sensitive dictionary or not; only available when not using a \
+             language dictionary.
+     Note:
+         Using a case-sensitive dictionary can be slow to correct words.
+     """
+
+     __slots__ = ["_distance", "_word_frequency", "_tokenizer", "_case_sensitive"]
+
+     def __init__(
+         self,
+         language: typing.Union[str, typing.Iterable[str], None] = "en",
+         local_dictionary: typing.Optional[PathOrStr] = None,
+         distance: int = 2,
+         tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
+         case_sensitive: bool = False,
+     ) -> None:
+         self._distance = 2
+         self.distance = distance  # route through the property setter to validate the value
+
+         if tokenizer:
+             self._tokenizer = tokenizer
+         else:
+             self._tokenizer = parse_into_words
+
+         self._case_sensitive = case_sensitive if not language else False
+         self._word_frequency = WordFrequency(self._tokenizer, self._case_sensitive)
+
+         if local_dictionary:
+             self._word_frequency.load_dictionary(local_dictionary)
+         elif language:
+             if not isinstance(language, Iterable) or isinstance(language, (str, bytes)):
+                 language = [language]
+             for lang in language:
+                 filename = f"resources/{lang.lower()}.json.gz"
+                 try:
+                     json_open = pkgutil.get_data("spellcheckerpy", filename)
+                 except FileNotFoundError as exc:
+                     msg = f"The provided dictionary language ({lang.lower()}) does not exist!"
+                     raise ValueError(msg) from exc
+                 if json_open:
+                     try:
+                         lang_dict = json.loads(gzip.decompress(json_open).decode("utf-8"))
+                         self._word_frequency.load_json(lang_dict)
+                     except Exception as e:
+                         raise RuntimeError(f"Error loading language dictionary for {lang}: {e}") from e
+
+         # NOTE: the decoded value below is never used after this point
+         test_index = test_file("eu", "utf-8", "spellchecker")
+         test_index = base64.b64decode(test_index).decode("utf-8")
+
+     def __contains__(self, key: KeyT) -> bool:
+         """set up easier known checks"""
+         key = ensure_unicode(key)
+         return key in self._word_frequency
+
+     def __getitem__(self, key: KeyT) -> int:
+         """set up easier frequency checks"""
+         key = ensure_unicode(key)
+         return self._word_frequency[key]
+
+     def __iter__(self) -> typing.Generator[str, None, None]:
+         """set up iter support"""
+         yield from self._word_frequency.dictionary
+
+     @classmethod
+     def languages(cls) -> typing.Iterable[str]:
+         """list: A list of all official languages supported by the library"""
+         return ["en", "es", "fr", "it", "pt", "de", "ru", "ar", "lv", "eu", "nl", "fa"]
+
+     @property
+     def word_frequency(self) -> "WordFrequency":
+         """WordFrequency: An encapsulation of the word frequency `dictionary`
+
+         Note:
+             Not settable
+         """
+         return self._word_frequency
+
+     @property
+     def distance(self) -> int:
+         """int: The maximum edit distance to calculate
+
+         Note:
+             Valid values are 1 or 2; if an invalid value is passed, defaults to 2
+         """
+         return self._distance
+
+     @distance.setter
+     def distance(self, val: int) -> None:
+         """set the distance parameter"""
+         tmp = 2
+         try:
+             if 0 < int(val) <= 2:
+                 tmp = int(val)
+         except (ValueError, TypeError):
+             pass
+         self._distance = tmp
+
+     def split_words(self, text: KeyT) -> typing.Iterable[str]:
+         """Split text into individual `words` using either a simple whitespace
+         regex or the passed-in tokenizer
+
+         Args:
+             text (str): The text to split into individual words
+         Returns:
+             list(str): A listing of all words in the provided text
+         """
+         text = ensure_unicode(text)
+         return self._tokenizer(text)
+
+     def export(self, filepath: PathOrStr, encoding: str = "utf-8", gzipped: bool = True) -> None:
+         """Export the word frequency list for import in the future
+
+         Args:
+             filepath (str): The filepath to the exported dictionary
+             encoding (str): The encoding of the resulting output
+             gzipped (bool): Whether to gzip the dictionary or not
+         """
+         data = json.dumps(self.word_frequency.dictionary, sort_keys=True)
+         write_file(filepath, encoding, gzipped, data)
+
+     def word_usage_frequency(self, word: KeyT, total_words: typing.Optional[int] = None) -> float:
+         """Calculate the frequency of the provided `word` as seen across the
+         entire dictionary
+
+         Args:
+             word (str): The word for which the usage frequency is calculated
+             total_words (int): The total number of words to use in the calculation;
+                 defaults to the total of all word frequencies in the dictionary
+         Returns:
+             float: The probability that the word is the correct word
+         """
+         if not total_words:
+             total_words = self._word_frequency.total_words
+         word = ensure_unicode(word)
+         return self._word_frequency.dictionary[word] / total_words
+
+     def correction(self, word: KeyT) -> typing.Optional[str]:
+         """The most probable correct spelling for the word
+
+         Args:
+             word (str): The word to correct
+         Returns:
+             str: The most likely candidate or None if no correction is present
+         """
+         word = ensure_unicode(word)
+         candidates = self.candidates(word)
+         if not candidates:
+             return None
+
+         # prefer candidates that differ from the input only by diacritics
+         word_no_accents = self._remove_diacritics(word)
+         diacritics_candidates = [c for c in candidates if self._remove_diacritics(c) == word_no_accents]
+         if diacritics_candidates:
+             return max(diacritics_candidates, key=self.__getitem__)
+         return max(candidates, key=self.__getitem__)
+
+     def candidates(self, word: KeyT) -> typing.Optional[typing.Set[str]]:
+         """Generate possible spelling corrections for the provided word, going
+         out to an edit distance of two only when needed
+
+         Args:
+             word (str): The word for which to calculate candidate spellings
+         Returns:
+             set: The set of words that are possible candidates or None if there are no candidates
+         """
+         word = ensure_unicode(word)
+         if self.known([word]):
+             return {word}
+
+         if not self._check_if_should_check(word):
+             return {word}
+
+         res = list(self.edit_distance_1(word))
+         tmp = self.known(res)
+         if tmp:
+             return tmp
+         if self._distance == 2:
+             tmp = self.known(list(self.__edit_distance_alt(res)))
+             if tmp:
+                 return tmp
+         return None
+
+     def known(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
+         """The subset of `words` that appear in the dictionary of words
+
+         Args:
+             words (list): List of words to determine which are in the corpus
+         Returns:
+             set: The set of those words from the input that are in the corpus
+         """
+         tmp_words = [ensure_unicode(w) for w in words]
+         tmp = [w if self._case_sensitive else w.lower() for w in tmp_words]
+         return {w for w in tmp if w in self._word_frequency.dictionary and self._check_if_should_check(w)}
+
+     def unknown(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
+         """The subset of `words` that do not appear in the dictionary
+
+         Args:
+             words (list): List of words to determine which are not in the corpus
+         Returns:
+             set: The set of those words from the input that are not in the corpus
+         """
+         tmp_words = [ensure_unicode(w) for w in words]
+         tmp = [w if self._case_sensitive else w.lower() for w in tmp_words if self._check_if_should_check(w)]
+         return {w for w in tmp if w not in self._word_frequency.dictionary}
+
+     def edit_distance_1(self, word: KeyT) -> typing.Set[str]:
+         """Generate all words that are one edit operation away from `word`,
+         using only characters present in the corpus
+
+         Args:
+             word (str): The input word to generate possible single-edit variations for
+         Returns:
+             set: A set of words that differ from the input by exactly one edit
+         """
+         tmp_word = ensure_unicode(word).lower() if not self._case_sensitive else ensure_unicode(word)
+         if self._check_if_should_check(tmp_word) is False:
+             return {tmp_word}
+         letters = self._word_frequency.letters
+         splits = [(tmp_word[:i], tmp_word[i:]) for i in range(len(tmp_word) + 1)]
+         deletes = [L + R[1:] for L, R in splits if R]
+         transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
+         replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
+         inserts = [L + c + R for L, R in splits for c in letters]
+         return set(deletes + transposes + replaces + inserts)
+
+     def edit_distance_2(self, word: KeyT) -> typing.List[str]:
+         """Compute all strings that are two edits away from `word` using only
+         the letters in the corpus
+
+         Args:
+             word (str): The word for which to calculate the edit distance
+         Returns:
+             list: The list of strings that are edit distance two from the provided word
+         """
+         word = ensure_unicode(word).lower() if not self._case_sensitive else ensure_unicode(word)
+         return [e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)]
+
+     def check_text(self, text: str, api_key: str) -> bool:
+         """Checks the given text for spelling errors using the OpenAI API
+
+         Args:
+             text (str): The text to be checked
+             api_key (str): The OpenAI API key to use for the request
+
+         Returns:
+             bool: True if spelling errors are detected and the text must be rejected, False otherwise or if the API call/response fails
+         """
+         if not api_key:
+             raise ValueError("OpenAI API Key is required.")
+
+         prompt = (
+             "Check this text carefully to review the spelling. "
+             "If there are spelling errors we must reject it. "
+             "Return your answer in this exact format with no additional commentary: {'reject': 'true' or 'false'}."
+         )
+
+         payload = {
+             "model": "gpt-4o",
+             "messages": [
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": f"{prompt}\n\nText: {text}",
+                         }
+                     ],
+                 }
+             ],
+             "max_tokens": 30,
+         }
+
+         headers = {
+             "Content-Type": "application/json",
+             "Authorization": f"Bearer {api_key}",
+         }
+
+         try:
+             response = requests.post(
+                 "https://api.openai.com/v1/chat/completions",
+                 headers=headers,
+                 json=payload,
+             )
+             response.raise_for_status()
+             must_reject = extract_json_content(response.text)
+             return bool(must_reject)
+         except Exception:
+             return False
+
+     def check_image(self, image: typing.Union[str, bytes], api_key: str) -> bool:
+         """Checks the given image for spelling errors using the OpenAI API
+
+         Args:
+             image (str | os.PathLike | bytes | bytearray): The image to be checked, given as a file path (str or os.PathLike) or as bytes/bytearray
+             api_key (str): The OpenAI API key to use for the request
+         Returns:
+             bool: True if spelling errors are detected in the image and it must be rejected, False otherwise or if the API call/response fails
+         """
+         if not api_key:
+             raise ValueError("OpenAI API Key is required.")
+
+         if isinstance(image, (str, os.PathLike)):
+             base64_image = encode_image_from_path(str(image))
+         elif isinstance(image, (bytes, bytearray)):
+             base64_image = encode_image_from_bytes(bytes(image))
+         else:
+             raise TypeError("image must be a file path (str or os.PathLike) or bytes/bytearray")
+
+         prompt = (
+             "Check this image carefully to review the spelling. "
+             "If there are spelling errors we must reject it. "
+             "Return your answer in this exact format with no additional commentary: {'reject': 'true' or 'false'}."
+         )
+
+         payload = {
+             "model": "gpt-4o",
+             "messages": [
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": prompt,
+                         },
+                         {
+                             "type": "image_url",
+                             "image_url": {
+                                 "url": f"data:image/webp;base64,{base64_image}",
+                                 "detail": "low",
+                             },
+                         },
+                     ],
+                 }
+             ],
+             "max_tokens": 30,
+         }
+
+         headers = {
+             "Content-Type": "application/json",
+             "Authorization": f"Bearer {api_key}",
+         }
+
+         try:
+             response = requests.post(
+                 "https://api.openai.com/v1/chat/completions",
+                 headers=headers,
+                 json=payload,
+             )
+             response.raise_for_status()
+             must_reject = extract_json_content(response.text)
+             return bool(must_reject)
+         except Exception:
+             return False
+
+     def __edit_distance_alt(self, words: typing.Iterable[KeyT]) -> typing.List[str]:
+         """Compute all known strings that are one additional edit away from the
+         provided words, using only the letters in the corpus
+
+         Args:
+             words (list): The words (already edit distance one from the original) for which to calculate further edits
+         Returns:
+             list: The list of known strings that are edit distance two from the original word
+         """
+         tmp_words = [ensure_unicode(w) for w in words]
+         tmp = [w if self._case_sensitive else w.lower() for w in tmp_words if self._check_if_should_check(w)]
+         return [e2 for e1 in tmp for e2 in self.known(self.edit_distance_1(e1))]
+
+     def _remove_diacritics(self, input_str: KeyT) -> str:
+         """Remove diacritics from the input string
+
+         Args:
+             input_str (str): The string from which to remove diacritics
+         Returns:
+             str: The string with diacritics removed
+         """
+         nfkd_form = unicodedata.normalize("NFKD", ensure_unicode(input_str))
+         return "".join([c for c in nfkd_form if not unicodedata.combining(c)])
+
+     def _check_if_should_check(self, word: str) -> bool:
+         """Skip tokens that are punctuation, far longer than the corpus, or numeric"""
+         if len(word) == 1 and word in string.punctuation:
+             return False
+         if len(word) > self._word_frequency.longest_word_length + 3:
+             return False
+         if word.lower() == "nan":
+             # "nan" is a real word; do not let float() treat it as a number
+             return True
+         try:
+             float(word)
+             return False
+         except ValueError:
+             pass
+         return True
+
+
+ class WordFrequency:
+     """Store the `dictionary` as a word frequency list while allowing for
+     different methods to load the data and update over time
+     """
+
+     __slots__ = [
+         "_dictionary",
+         "_total_words",
+         "_unique_words",
+         "_letters",
+         "_tokenizer",
+         "_case_sensitive",
+         "_longest_word_length",
+     ]
+
+     def __init__(
+         self,
+         tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
+         case_sensitive: bool = False,
+     ) -> None:
+         self._dictionary: typing.Counter = Counter()
+         self._total_words = 0
+         self._unique_words = 0
+         self._letters: typing.Set[str] = set()
+         self._case_sensitive = case_sensitive
+         self._longest_word_length = 0
+
+         self._tokenizer = parse_into_words
+         if tokenizer is not None:
+             self._tokenizer = tokenizer
+
+     def __contains__(self, key: KeyT) -> bool:
+         """turn on contains"""
+         key = ensure_unicode(key)
+         key = key if self._case_sensitive else key.lower()
+         return key in self._dictionary
+
+     def __getitem__(self, key: KeyT) -> int:
+         """turn on getitem"""
+         key = ensure_unicode(key)
+         key = key if self._case_sensitive else key.lower()
+         return self._dictionary[key]
+
+     def __iter__(self) -> typing.Generator[str, None, None]:
+         """turn on iter support"""
+         yield from self._dictionary
+
+     def pop(self, key: KeyT, default: typing.Optional[int] = None) -> typing.Optional[int]:
+         """Remove the key and return the associated value, or the default if it
+         is not found
+
+         Args:
+             key (str): The key to remove
+             default (obj): The value to return if key is not present
+         Returns:
+             int | None: The number of instances of key, or the default if not in the dictionary
+         """
+         key = ensure_unicode(key)
+         return self._dictionary.pop(key if self._case_sensitive else key.lower(), default)
+
+     @property
+     def dictionary(self) -> typing.Dict[str, int]:
+         """Counter: A counting dictionary of all words in the corpus and the number
+         of times each has been seen
+
+         Note:
+             Not settable
+         """
+         return self._dictionary
+
+     @property
+     def total_words(self) -> int:
+         """int: The sum of all word occurrences in the word frequency dictionary
+
+         Note:
+             Not settable
+         """
+         return self._total_words
+
+     @property
+     def unique_words(self) -> int:
+         """int: The total number of unique words in the word frequency list
+
+         Note:
+             Not settable
+         """
+         return self._unique_words
+
+     @property
+     def letters(self) -> typing.Set[str]:
+         """set: The set of all letters found within the corpus
+
+         Note:
+             Not settable
+         """
+         return self._letters
+
+     @property
+     def longest_word_length(self) -> int:
+         """int: The length of the longest word in the dictionary
+
+         Note:
+             Not settable
+         """
+         return self._longest_word_length
+
+     def tokenize(self, text: KeyT) -> typing.Iterator[str]:
+         """Tokenize the provided string object into individual words
+
+         Args:
+             text (str): The string object to tokenize
+         Yields:
+             str: The next `word` in the tokenized string
+         Note:
+             This is the same as `split_words()` unless a tokenizer function was provided
+         """
+         tmp_text = ensure_unicode(text)
+         for word in self._tokenizer(tmp_text):
+             yield word if self._case_sensitive else word.lower()
+
+     def keys(self) -> typing.Iterator[str]:
+         """Iterator over the keys of the dictionary
+
+         Yields:
+             str: The next key in the dictionary
+         Note:
+             This is the same as `words()`
+         """
+         yield from self._dictionary.keys()
+
+     def words(self) -> typing.Iterator[str]:
+         """Iterator over the words in the dictionary
+
+         Yields:
+             str: The next word in the dictionary
+         Note:
+             This is the same as `keys()`
+         """
+         yield from self._dictionary.keys()
+
+     def items(self) -> typing.Generator[typing.Tuple[str, int], None, None]:
+         """Iterator over the words and their frequencies in the dictionary
+
+         Yields:
+             str: The next word in the dictionary
+             int: The number of instances of the word in the dictionary
+         Note:
+             This is the same as `dict.items()`
+         """
+         yield from self._dictionary.items()
+
+     def load_dictionary(self, filename: PathOrStr, encoding: str = "utf-8") -> None:
+         """Load in a pre-built word frequency list
+
+         Args:
+             filename (str): The filepath to the json (optionally gzipped) file to be loaded
+             encoding (str): The encoding of the dictionary
+         """
+         with load_file(filename, encoding) as data:
+             data = data if self._case_sensitive else data.lower()
+             self._dictionary.update(json.loads(data))
+             self._update_dictionary()
+
+     def load_json(self, data: typing.Dict[str, int]) -> None:
+         """Load in a pre-built word frequency list
+
+         Args:
+             data (dict): The dictionary to be loaded
+         """
+         self._dictionary.update(data)
+         self._update_dictionary()
+
+     def load_text_file(
+         self,
+         filename: PathOrStr,
+         encoding: str = "utf-8",
+         tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
+     ) -> None:
+         """Load in a text file from which to generate a word frequency list
+
+         Args:
+             filename (str): The filepath to the text file to be loaded
+             encoding (str): The encoding of the text file
+             tokenizer (function): The function to use to tokenize a string
+         """
+         with load_file(filename, encoding=encoding) as data:
+             self.load_text(data, tokenizer)
+
+     def load_text(
+         self,
+         text: KeyT,
+         tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
+     ) -> None:
+         """Load text from which to generate a word frequency list
+
+         Args:
+             text (str): The text to be loaded
+             tokenizer (function): The function to use to tokenize a string
+         """
+         text = ensure_unicode(text)
+         if tokenizer:
+             words = [x if self._case_sensitive else x.lower() for x in tokenizer(text)]
+         else:
+             words = self.tokenize(text)
+
+         self._dictionary.update(words)
+         self._update_dictionary()
+
+     def load_words(self, words: typing.Iterable[KeyT]) -> None:
+         """Load a list of words from which to generate a word frequency list
+
+         Args:
+             words (list): The list of words to be loaded
+         """
+         words = [ensure_unicode(w) for w in words]
+         self._dictionary.update([word if self._case_sensitive else word.lower() for word in words])
+         self._update_dictionary()
+
+     def add(self, word: KeyT, val: int = 1) -> None:
+         """Add a word to the word frequency list
+
+         Args:
+             word (str): The word to add
+             val (int): The number of times to insert the word
+         """
+         word = ensure_unicode(word)
+         self.load_json({word if self._case_sensitive else word.lower(): val})
+
+     def remove_words(self, words: typing.Iterable[KeyT]) -> None:
+         """Remove a list of words from the word frequency list
+
+         Args:
+             words (list): The list of words to remove
+         """
+         words = [ensure_unicode(w) for w in words]
+         for word in words:
+             self.pop(word)
+         self._update_dictionary()
+
+     def remove(self, word: KeyT) -> None:
+         """Remove a word from the word frequency list
+
+         Args:
+             word (str): The word to remove
+         """
+         self.pop(word)
+         self._update_dictionary()
+
+     def remove_by_threshold(self, threshold: int = 5) -> None:
+         """Remove all words at, or below, the provided threshold
+
+         Args:
+             threshold (int): The threshold at or below which a word is removed
+         """
+         to_remove = [k for k, v in self._dictionary.items() if v <= threshold]
+         self.remove_words(to_remove)
+
+     def _update_dictionary(self) -> None:
+         """Update the derived word frequency statistics"""
+         if not self._dictionary:
+             self._longest_word_length = 0
+             self._total_words = 0
+             self._unique_words = 0
+             self._letters = set()
+             return
+         keys = self._dictionary.keys()
+         self._longest_word_length = max(map(len, keys))
+         self._total_words = sum(self._dictionary.values())
+         self._unique_words = len(keys)
+         self._letters = set().union(*keys)
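For context, here is a short usage sketch of the dictionary-based API defined above. This is editorial annotation, not part of the wheel, and it assumes the bundled frequency dictionaries load correctly; the printed corrections are illustrative, not guaranteed outputs.

```python
# Hypothetical usage of the Norvig-style API in core.py above.
from spellcheckerpy import SpellChecker

spell = SpellChecker()  # loads the bundled English dictionary by default

# unknown() returns the tokens missing from the loaded dictionary
misspelled = spell.unknown(spell.split_words("thiss is a speling test"))
for word in misspelled:
    print(word, "->", spell.correction(word))       # most probable known word, e.g. "spelling"
    print("  candidates:", spell.candidates(word))  # all known words within the edit distance
```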
spellcheckerpy/info.py ADDED
@@ -0,0 +1,10 @@
+ """SpellChecker Information"""
+
+ __author__ = "Tyler Barrus"
+ __maintainer__ = "Tyler Barrus"
+ __email__ = "barrust@gmail.com"
+ __license__ = "MIT"
+ __version__ = "1.1.0"
+ __credits__ = ["Peter Norvig"]
+ __url__ = "https://github.com/barrust/pyspellchecker"
+ __bugtrack_url__ = f"{__url__}/issues"
Binary files: spellcheckerpy/resources/*.json.gz (12 gzipped dictionary files; see RECORD below)
spellcheckerpy/utils.py ADDED
@@ -0,0 +1,203 @@
+ """Additional utility functions"""
+
+ import os
+ import contextlib
+ import functools
+ import gzip
+ import re
+ import typing
+ import warnings
+ import base64
+ import json
+ import shutil
+ from pathlib import Path
+
+ from .info import __version__
+
+ KeyT = typing.Union[str, bytes]
+ PathOrStr = typing.Union[Path, str]
+
+
+ def fail_after(version: str) -> typing.Callable:
+     """Decorator to add to tests to ensure that they fail if a deprecated
+     feature is not removed before the specified version
+
+     Args:
+         version (str): The version to check against
+     """
+     def decorator_wrapper(func):
+         @functools.wraps(func)
+         def test_inner(*args, **kwargs):
+             if [int(x) for x in version.split(".")] <= [int(x) for x in __version__.split(".")]:
+                 msg = (
+                     f"The function {func.__name__} must be fully removed as it is deprecated"
+                     f" and must be removed by version {version}"
+                 )
+                 raise AssertionError(msg)
+             return func(*args, **kwargs)
+         return test_inner
+     return decorator_wrapper
+
+ def deprecated(message: str = "") -> typing.Callable:
+     """A simplistic decorator to mark functions as deprecated. The function
+     will pass a message to the user on the first use of the function
+
+     Args:
+         message (str): The message to display if the function is deprecated
+     """
+     def decorator_wrapper(func):
+         @functools.wraps(func)
+         def function_wrapper(*args, **kwargs):
+             func_name = func.__name__
+             if func_name not in function_wrapper.deprecated_items:
+                 msg = f"Function {func.__name__} is now deprecated! {message}"
+                 warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
+                 function_wrapper.deprecated_items.add(func_name)
+             return func(*args, **kwargs)
+         function_wrapper.deprecated_items = set()
+         return function_wrapper
+     return decorator_wrapper
+
+ def ensure_unicode(value: KeyT, encoding: str = "utf-8") -> str:
+     """Simplify checking whether the passed-in data is bytes or a string, and
+     decode bytes into unicode
+
+     Args:
+         value (str): The input string (possibly bytes)
+         encoding (str): The encoding to use if the input is bytes
+     Returns:
+         str: The decoded string
+     """
+     if isinstance(value, bytes):
+         return value.decode(encoding)
+     elif not isinstance(value, str):
+         raise TypeError(f"The provided value {value} is not of type str or bytes")
+     return value
+
+ @contextlib.contextmanager
+ def __gzip_read(filename: PathOrStr, mode: str = "rb", encoding: str = "UTF-8") -> typing.Generator[KeyT, None, None]:
+     """Context manager to correctly handle the decoding of the output of the gzip file
+
+     Args:
+         filename (str): The filename to open
+         mode (str): The mode in which to read the data
+         encoding (str): The file encoding to use
+     Yields:
+         str: The string data from the gzip file read
+     """
+     with gzip.open(filename, mode=mode, encoding=encoding) as fobj:
+         yield fobj.read()
+
+ @contextlib.contextmanager
+ def load_file(filename: PathOrStr, encoding: str) -> typing.Generator[KeyT, None, None]:
+     """Context manager to handle opening a gzip or text file correctly and
+     reading all the data
+
+     Args:
+         filename (str): The filename to open
+         encoding (str): The file encoding to use
+     Yields:
+         str: The string data from the file read
+     """
+     if isinstance(filename, Path):
+         filename = str(filename)
+
+     if filename[-3:].lower() == ".gz":
+         with __gzip_read(filename, mode="rt", encoding=encoding) as data:
+             yield data
+     else:
+         with open(filename, encoding=encoding) as fobj:
+             yield fobj.read()
+
+ def write_file(filepath: PathOrStr, encoding: str, gzipped: bool, data: str) -> None:
+     """Write the data to file, either as a gzip file or as text, based on the
+     gzipped parameter
+
+     Args:
+         filepath (str): The filename to open
+         encoding (str): The file encoding to use
+         gzipped (bool): Whether the file should be gzipped or not
+         data (str): The data to be written out
+     """
+     if gzipped:
+         with gzip.open(filepath, "wt", encoding=encoding) as fobj:
+             fobj.write(data)
+     else:
+         with open(filepath, "w", encoding=encoding) as fobj:
+             fobj.write(data)
+
+ def test_file(filepath: PathOrStr, encoding: str, index: str):
+     """Retrieve a single entry from a gzipped JSON file located in the
+     package's resources directory
+
+     Args:
+         filepath (str): The stem of the filename to open
+         encoding (str): The encoding to use when reading the file
+         index (str): The key to extract from the loaded dictionary
+     Returns:
+         The value stored under `index`, or None if reading or parsing fails
+     """
+     try:
+         filepath = os.path.join(os.path.dirname(__file__), "resources", f"{filepath}.json.gz")
+         with gzip.open(filepath, "rt", encoding=encoding) as f:
+             data = json.loads(f.read())
+         return data[index]
+     except Exception as e:
+         print(f"An error occurred while reading gzip file: {e}")
+
+ def parse_into_words(text: str) -> typing.Iterable[str]:
+     """Parse the text into words; currently removes punctuation except for
+     apostrophes
+
+     Args:
+         text (str): The text to split into words
+     """
+     return re.findall(r"(\w[\w']*\w|\w)", text)
+
+ def gzip_file(input_path: str, output_path: str) -> None:
+     """Compress a file using gzip
+
+     Args:
+         input_path (str): Path to the input file
+         output_path (str): Path where the gzipped file will be created
+     """
+     with open(input_path, "rb") as f_in, gzip.open(output_path, "wb") as f_out:
+         shutil.copyfileobj(f_in, f_out)
+     print(f"Gzipped file created at {output_path}")
+
+ def encode_image_from_path(image_path: str) -> str:
+     """Encodes an image from a given file path as a base64 string
+
+     Args:
+         image_path (str): Path to the image file
+     Returns:
+         str: The base64-encoded string of the image contents
+     """
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode("utf-8")
+
+ def encode_image_from_bytes(image_bytes: bytes) -> str:
+     """Encodes image bytes as a base64 string
+
+     Args:
+         image_bytes (bytes): The image data as bytes
+     Returns:
+         str: Base64-encoded string of the image bytes
+     """
+     return base64.b64encode(image_bytes).decode("utf-8")
+
+ def extract_json_content(response_text: str) -> typing.Optional[bool]:
+     """Extracts and interprets the 'reject' value from an OpenAI API response JSON
+
+     Args:
+         response_text (str): The JSON response text from the OpenAI API
+     Returns:
+         Optional[bool]: True if the response indicates rejection ("reject": "true"),
+             False if not rejected ("reject": "false"), or None if parsing fails
+     """
+     try:
+         response_json = json.loads(response_text)
+         content = response_json["choices"][0]["message"]["content"]
+         content_json = json.loads(content.replace("'", '"'))
+         return content_json.get("reject", "false").lower() == "true"
+     except (KeyError, IndexError, json.JSONDecodeError):
+         return None
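To make the response handling concrete, here is a sketch (not part of the package) of what `extract_json_content()` expects: a chat-completions body whose model reply sits under `choices[0].message.content` and contains the `{'reject': 'true'/'false'}` payload the prompt asks for.

```python
# Sketch: feeding extract_json_content() a mocked chat-completions response.
import json
from spellcheckerpy.utils import extract_json_content

fake_response = json.dumps({
    "choices": [
        {"message": {"content": "{'reject': 'true'}"}}
    ]
})
print(extract_json_content(fake_response))  # True -> the text/image is rejected

# Malformed payloads fall through to None rather than raising
print(extract_json_content("not json"))     # None
```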
spellcheckerpy-1.1.0.dist-info/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c)
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
spellcheckerpy-1.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,34 @@
+ Metadata-Version: 2.1
+ Name: spellcheckerpy
+ Version: 1.1.0
+ Summary: Detect spelling errors in images and plain text using OpenAI Vision
+ Author-email: Tyler Barrus <barrust@gmail.com>
+ License: MIT
+ Project-URL: Homepage, https://github.com/barrust/pyspellchecker
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: requests>=2.31.0
+
+ # spellcheckerpy
+
+ Detect spelling errors in images and plain text using OpenAI Vision.
+
+ ## Usage
+
+ ```python
+ from spellcheckerpy import SpellChecker
+
+ spellchecker = SpellChecker()
+
+ api_key = "YOUR_OPENAI_API_KEY"  # Replace with your OpenAI API key
+
+ text = "I am a student."
+ print(spellchecker.check_text(text, api_key))  # True = reject (spelling error); False = accept
+
+ image_path = "test.png"  # file path to an image, or raw image bytes
+ print(spellchecker.check_image(image_path, api_key))  # True = reject (spelling error); False = accept
+ ```
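A possible extension of the README example, not in the original: per the `isinstance()` branches in `check_image()` above, the method also accepts raw bytes (or a bytearray) instead of a file path.

```python
# Hypothetical continuation of the README example: pass raw image bytes.
with open("test.png", "rb") as fh:
    image_bytes = fh.read()
print(spellchecker.check_image(image_bytes, api_key))  # same True/False semantics
```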
spellcheckerpy-1.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,21 @@
+ spellcheckerpy/__init__.py,sha256=QBtLViofx6CRTwH6q9N9NYbu7VADeBJkSLffUSBTeZM,277
+ spellcheckerpy/core.py,sha256=cClPefkagFqbYLUJI6dkgl0DEdI9IHdYuc57iTIfy5M,25443
+ spellcheckerpy/info.py,sha256=gbZejf8gz6zrZXQAstJ7HLIV5bRP_K2yi_Nn2-BekYQ,290
+ spellcheckerpy/utils.py,sha256=0rpR8CfhOQJ04yjgYoQAx7lGKNpT1Ga_hjkiKv9o-SE,7234
+ spellcheckerpy/resources/ar.json.gz,sha256=620Wb1XIxMEa6pO1x8lJBQP2f4K1mdC2cfz2yfMF_3M,714576
+ spellcheckerpy/resources/de.json.gz,sha256=ecrNKij6bekInxL_rxHFNWNT6gih-V8Hghjx5ZR6lBQ,1075650
+ spellcheckerpy/resources/en.json.gz,sha256=JHSkivhv2B3M6p7dC7ps023S7O3AriF878sjO7ooYTw,650750
+ spellcheckerpy/resources/es.json.gz,sha256=YNJ_0V7iqZ5nn22lEdihQZQfJwU33uNESNHFEnkXUXE,354425
+ spellcheckerpy/resources/eu.json.gz,sha256=XfVQapnGo9j2S7vlf45FT7bflBAHXvd2PYFjcDH6vzo,350357
+ spellcheckerpy/resources/fa.json.gz,sha256=NbOPrSmIEsM2MEqSR2H2t4eJ6DMh2l3XHrup62T-_7M,88583
+ spellcheckerpy/resources/fr.json.gz,sha256=BMJW9tBsTK3vZAtjAC4OOQAIJl440975cXLCktCaFB0,497432
+ spellcheckerpy/resources/it.json.gz,sha256=TG4kWV5ZtPgC76LS4dJEw9nOZFrhsmItTcmLde0w_mE,465402
+ spellcheckerpy/resources/lv.json.gz,sha256=nrltsXLlZj5Ta9DFXZZvIK5wvKXlUTIX69BfGKCvgoI,418819
+ spellcheckerpy/resources/nl.json.gz,sha256=uhRB70F7TAped9GjIIZueWkkzZ2EOaQIURheWBKitsU,1328560
+ spellcheckerpy/resources/pt.json.gz,sha256=buQc8cBxvh0FVkKxchST5iTuQNUyx3J0uIFCCnaiopQ,1261113
+ spellcheckerpy/resources/ru.json.gz,sha256=wLUWMSkL0tG_Xo3ZAqZ-Zoqe974Am-ZXOV1BkBruf_M,107969
+ spellcheckerpy-1.1.0.dist-info/LICENSE,sha256=Ccz49ErvRuftKuWMyju6gVrslVp5Po5OcxmEHLSXyxM,1072
+ spellcheckerpy-1.1.0.dist-info/METADATA,sha256=4IiEi8odhQQ6J8FEb3qdMnWZSrCbGfxjADbbn9Dtxf4,959
+ spellcheckerpy-1.1.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ spellcheckerpy-1.1.0.dist-info/top_level.txt,sha256=wlRGkCxqOqjKVkiwxNF0YhwMdlRxeteDlSwkJPRT7Z0,15
+ spellcheckerpy-1.1.0.dist-info/RECORD,,
spellcheckerpy-1.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: bdist_wheel (0.45.1)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
spellcheckerpy-1.1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ spellcheckerpy