sonatoki 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Configs.py ADDED
@@ -0,0 +1,80 @@
+ # STL
+ from copy import deepcopy
+ from typing import List, Type, TypedDict
+
+ # PDM
+ from typing_extensions import NotRequired
+
+ # LOCAL
+ from sonatoki.Filters import (
+     Filter,
+     NimiPu,
+     Numerics,
+     Syllabic,
+     NimiLinku,
+     NimiPuAle,
+     Alphabetic,
+     ProperName,
+     Phonotactic,
+     NimiLinkuAle,
+     Punctuations,
+ )
+ from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
+ from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
+ from sonatoki.Tokenizers import Tokenizer, WordTokenizerTok
+ from sonatoki.Preprocessors import (
+     URLs,
+     Preprocessor,
+     DiscordEmotes,
+     DiscordSpecial,
+     DiscordChannels,
+     DiscordMentions,
+ )
+
+
+ class IloConfig(TypedDict):
+     preprocessors: List[Type[Preprocessor]]
+     word_tokenizer: Type[Tokenizer]
+     cleaners: List[Type[Cleaner]]
+     ignoring_filters: List[Type[Filter]]
+     scoring_filters: List[Type[Filter]]
+     scorer: Type[Scorer]
+     passing_score: Number
+
+
+ BaseConfig: IloConfig = {
+     "preprocessors": [URLs],
+     "cleaners": [ConsecutiveDuplicates],
+     "ignoring_filters": [Numerics, Punctuations],
+     "scoring_filters": [],
+     "scorer": PassFail,
+     "passing_score": 0.8,
+     "word_tokenizer": WordTokenizerTok,
+ }
+
+
+ PrefConfig: IloConfig = deepcopy(BaseConfig)
+ PrefConfig["scoring_filters"].extend([NimiLinku, Syllabic, ProperName, Alphabetic])
+ PrefConfig["scorer"] = SoftScaling
+
+
+ LazyConfig: IloConfig = deepcopy(BaseConfig)
+ LazyConfig["scoring_filters"].extend([Alphabetic, ProperName])
+ LazyConfig["scorer"] = SoftPassFail
+
+ DiscordConfig: IloConfig = deepcopy(PrefConfig)
+ DiscordConfig["preprocessors"].extend(
+     [DiscordEmotes, DiscordMentions, DiscordChannels, DiscordSpecial]
+ )
+ TelegramConfig: IloConfig = deepcopy(PrefConfig)
+ ForumConfig: IloConfig = deepcopy(PrefConfig)
+
+ __all__ = [
+     "IloConfig",
+     "BaseConfig",
+     "PrefConfig",
+     "LazyConfig",
+     "DiscordConfig",
+     "TelegramConfig",
+     "ForumConfig",
+ ]
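For orientation, a minimal sketch of how these configs are intended to be consumed, assuming the IloConfig keys continue to mirror Ilo.__init__'s keyword arguments (they do in the ilo.py changes later in this diff); the example sentences and expected results are illustrative only.

# Minimal sketch (not from the diff): build an Ilo from a bundled config.
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)  # assumes IloConfig keys match Ilo.__init__ kwargs
print(ilo.is_toki_pona("mi olin e sina"))   # expected True for a Toki Pona sentence
print(ilo.is_toki_pona("this is english"))  # expected False, subject to the scorer and passing_score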
sonatoki/Filters.py CHANGED
@@ -17,6 +17,7 @@ from sonatoki.constants import (
      NIMI_LINKU_SET,
      NIMI_PU_ALE_SET,
      NIMI_LINKU_ALE_SET,
+     NIMI_LINKU_SANDBOX_SET,
  )

  re.DEFAULT_VERSION = re.VERSION1
@@ -87,6 +88,10 @@ class NimiLinkuAle(SetFilter):
      tokens = NIMI_LINKU_ALE_SET


+ class NimiLinkuSandbox(SetFilter):
+     tokens = NIMI_LINKU_SANDBOX_SET
+
+
  class Phonotactic(RegexFilter):
      """Determines if a given token is phonotactically valid Toki Pona (or `n`).
      Excludes both consecutive nasals and the illegal syllables:
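As a hedged sketch of how the new filter could be wired in, one might extend a bundled config the same way Configs.py (above) derives its own variants; the SandboxConfig name is hypothetical and not part of the package.

# Hypothetical: score sandbox words by adding the new filter to a copy of PrefConfig.
from copy import deepcopy

from sonatoki.Configs import IloConfig, PrefConfig
from sonatoki.Filters import NimiLinkuSandbox

SandboxConfig: IloConfig = deepcopy(PrefConfig)            # hypothetical config name
SandboxConfig["scoring_filters"].append(NimiLinkuSandbox)  # mirrors how Configs.py extends scoring_filters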
sonatoki/Preprocessors.py CHANGED
@@ -13,7 +13,7 @@ There are currently two distinct types of Preprocessor:
  - ArrowQuote

  Order does not generally matter, but if there were two overlapping containers such as in the string "|| spoiler ` monospace || `", order would matter.
- As such, each Preprocessor exposes a .precedence attribute which is optionally usable for ordering them. Lower precedence means it should be applied first.
+ It is up to the user to order them appropriately.
  """

  # STL
@@ -27,8 +27,6 @@ re.DEFAULT_VERSION = re.VERSION1


  class Preprocessor(ABC):
-     precedence: int = 0
-
      @classmethod # order matters
      @abstractmethod
      def process(cls, msg: str) -> str:
@@ -104,7 +102,6 @@ class DoubleQuotes(RegexPreprocessor):
  class Backticks(RegexPreprocessor):
      """Remove paired backticks and their contents `like this`"""

-     precedence = -10
      pattern = re.compile(r"`[^`]+`", flags=re.S)


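With the precedence attribute removed, ordering is now expressed purely by position: Ilo applies preprocessors in list order (see sonatoki/ilo.py later in this diff), and a bare script can do the same. A small sketch; the particular ordering and message are illustrative.

# Sketch: order preprocessors explicitly now that .precedence is gone.
# Earlier entries run first, so put the container that should win (e.g. Backticks) first.
from sonatoki.Preprocessors import Backticks, ArrowQuote, URLs

msg = "|| spoiler ` monospace || `"
for preproc in (Backticks, ArrowQuote, URLs):  # applied left to right
    msg = preproc.process(msg)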
sonatoki/Tokenizers.py CHANGED
@@ -1,10 +1,10 @@
  # STL
- from typing import List, Callable
+ from abc import ABC, abstractmethod
+ from typing import List

  # PDM
  import regex as re
-
- # TODO: Entire module should be reworked to match the class scheme of the rest of the module, imo
+ from typing_extensions import override

  try:
      # PDM
@@ -17,42 +17,60 @@ except ImportError as e:

  LANGUAGE = "english" # for NLTK

- SENT_DELIMS_RE = re.compile(r"""(.*?[.?!;:])|(.+?$)""")
- SENT_DELIMS_TOK = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")
- # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
- # TODO: do the typography characters matter?
- # NOTE: | / and , are *not* sentence delimiters for my purpose

- WORD_DELIMS_RE = re.compile(r"""\s+|(?=[.?!;:'"-])""")
- WORD_DELIMS_TOK = re.compile(r"([\p{Punctuation}\p{posix_punct}]+|\s+)")
+ class Tokenizer(ABC):
+     @classmethod
+     @abstractmethod
+     def tokenize(cls, s: str) -> List[str]: ...

- Tokenizer = Callable[[str], List[str]]

+ class NoOpTokenizer(Tokenizer):
+     """This is a special case that you do not want or need."""

- if not isinstance(nltk, ImportError):
+     @classmethod
+     @override
+     def tokenize(cls, s: str) -> List[str]:
+         return [s]
+
+
+ class RegexTokenizer(Tokenizer):
+     pattern: "re.Pattern[str]"

-     def sent_tokenize_nltk(s: str) -> List[str]:
-         return __sent_tokenize_nltk(text=s, language=LANGUAGE)
+     @classmethod
+     @override
+     def tokenize(cls, s: str) -> List[str]:
+         return [clean for word in re.split(cls.pattern, s) if (clean := word.strip())]

-     def word_tokenize_nltk(s: str) -> List[str]:
-         return __word_tokenize_nltk(text=s, language=LANGUAGE)

+ class WordTokenizerTok(RegexTokenizer):
+     pattern = re.compile(r"""([\p{Punctuation}\p{posix_punct}]+|\s+)""")
+     # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
+     # TODO: do the typography characters matter?
+     # NOTE: | / and , are *not* sentence delimiters for my purpose

- def sent_tokenize_re(s: str) -> List[str]:
-     return [
-         clean
-         for sent in re.findall(SENT_DELIMS_RE, s)
-         if (clean := sent[0].strip() or sent[1].strip())
-     ]

+ class SentTokenizerTok(RegexTokenizer):
+     pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")

- def word_tokenize_re(s: str) -> List[str]:
-     return [clean for word in re.split(WORD_DELIMS_RE, s) if (clean := word.strip())]

+ class WordTokenizerRe(RegexTokenizer):
+     pattern = re.compile(r"""(?<=[.?!;:'"-])""")

- def sent_tokenize_tok(s: str) -> List[str]:
-     return [clean for sent in re.split(SENT_DELIMS_TOK, s) if (clean := sent.strip())]

+ class SentTokenizerRe(RegexTokenizer):
+     pattern = re.compile(r"""(.*?[.?!;:])|(.+?$)""")
+
+
+ if not isinstance(nltk, ImportError):

- def word_tokenize_tok(s: str) -> List[str]:
-     return [clean for word in re.split(WORD_DELIMS_TOK, s) if (clean := word.strip())]
+     class WordTokenizerNLTK(Tokenizer):
+         @classmethod
+         @override
+         def tokenize(cls, s: str) -> List[str]:
+             return __word_tokenize_nltk(text=s, language=LANGUAGE)
+
+     class SentTokenizerNLTK(Tokenizer):
+         @classmethod
+         @override
+         def tokenize(cls, s: str) -> List[str]:
+             return __sent_tokenize_nltk(text=s, language=LANGUAGE)
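To illustrate the new class-based tokenizer API, a short sketch: tokenize is a classmethod, and a custom splitter only needs a pattern on a RegexTokenizer subclass. The WhitespaceTokenizer name, the sample strings, and the expected outputs are illustrative, not part of the package.

# Sketch of the new Tokenizer API; WhitespaceTokenizer is a hypothetical example subclass.
import regex as re

from sonatoki.Tokenizers import RegexTokenizer, WordTokenizerTok

print(WordTokenizerTok.tokenize("mi jan. sina seme?"))
# expected roughly: ["mi", "jan", ".", "sina", "seme", "?"]

class WhitespaceTokenizer(RegexTokenizer):
    """Split on whitespace only, keeping punctuation attached to words."""
    pattern = re.compile(r"\s+")

print(WhitespaceTokenizer.tokenize("toki pona li pona!"))
# expected roughly: ["toki", "pona", "li", "pona!"]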
sonatoki/constants.py CHANGED
@@ -4,6 +4,7 @@ from typing import Dict, List
  from pathlib import Path

  LINKU = Path(__file__).resolve().parent / Path("linku.json")
+ SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")

  VOWELS = "aeiou"
  CONSONANTS = "jklmnpstw"
@@ -29,10 +30,16 @@ with open(LINKU) as f:
      ]
      NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]

+ with open(SANDBOX) as f:
+     r: Dict[str, Dict[str, str]] = json.loads(f.read())
+     NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in r.values()]
+
+
  NIMI_PU_SET = set(NIMI_PU)
  NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
  NIMI_LINKU_SET = set(NIMI_LINKU)
  NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
+ NIMI_LINKU_SANDBOX_SET = set(NIMI_LINKU_SANDBOX)
  ALLOWABLES_SET = set(ALLOWABLES)

  __all__ = [
@@ -54,4 +61,7 @@ __all__ = [
      #
      "NIMI_LINKU_ALE",
      "NIMI_LINKU_ALE_SET",
+     #
+     "NIMI_LINKU_SANDBOX",
+     "NIMI_LINKU_SANDBOX_SET",
  ]
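A small sanity-check sketch for the new constants; the list contents depend entirely on the bundled sandbox.json, so the membership result is not guaranteed.

# Sketch: the sandbox word list and set are plain module-level constants.
from sonatoki.constants import NIMI_LINKU_SANDBOX, NIMI_LINKU_SANDBOX_SET

print(len(NIMI_LINKU_SANDBOX))              # number of sandbox words shipped in sandbox.json
print("linluwi" in NIMI_LINKU_SANDBOX_SET)  # membership check; this particular word is illustrative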
sonatoki/ilo.py CHANGED
@@ -14,13 +14,13 @@ LOG = logging.getLogger(__name__)

  class Ilo:
      __preprocessors: List[Type[Preprocessor]]
+     __word_tokenizer: Type[Tokenizer]
      __cleaners: List[Type[Cleaner]]
      __ignoring_filters: List[Type[Filter]]
      __scoring_filters: List[Type[Filter]]
      __scorer: Type[Scorer]
-     __tokenize: Tokenizer
      __passing_score: Number
-     logging_threshold: Number = 1.0
+     logging_threshold: Number = -1

      def __init__(
          self,
@@ -29,61 +29,62 @@ class Ilo:
          ignoring_filters: List[Type[Filter]],
          scoring_filters: List[Type[Filter]],
          scorer: Type[Scorer],
-         tokenizer: Tokenizer, # NOTE: no wrapper needed?
          passing_score: Number,
+         word_tokenizer: Type[Tokenizer],
      ):
          super().__init__()
          # avoid keeping a ref to user's list just in case
          self.__preprocessors = [*preprocessors]
+         self.__word_tokenizer = word_tokenizer
          self.__cleaners = [*cleaners]
          self.__ignoring_filters = [*ignoring_filters]
          self.__scoring_filters = [*scoring_filters]
          self.__scorer = scorer
-         self.__tokenize = tokenizer
          self.__passing_score = passing_score

-     def __preprocess(self, msg: str) -> str:
+     def preprocess(self, msg: str) -> str:
          for p in self.__preprocessors:
              msg = p.process(msg)
          return msg

-     def __clean_token(self, token: str) -> str:
+     def word_tokenize(self, msg: str) -> List[str]:
+         """It is *highly* recommended that you run `ilo.preprocess` first."""
+         return self.__word_tokenizer.tokenize(msg)
+
+     def clean_token(self, token: str) -> str:
          for c in self.__cleaners:
              token = c.clean(token)
          return token

-     def __clean_tokens(self, tokens: List[str]) -> List[str]:
-         # NOTE: tested, making a new list with a for loop *is* faster than
-         # - list comps
-         # - generator comps
-         # - in-place replacement/removal
-         # - in place replacement with result of generator comp
+     def clean_tokens(self, tokens: List[str]) -> List[str]:
+         # NOTE: tested, making a new list with a for loop *is* faster than:
+         # list comp, generator comp, in-place replacement
          cleaned_tokens: List[str] = list()
          for token in tokens:
-             cleaned_token = self.__clean_token(token)
+             cleaned_token = self.clean_token(token)
              if not cleaned_token:
                  # TODO: warn user?
                  continue
              cleaned_tokens.append(cleaned_token)
          return cleaned_tokens

-     def __filter_token(self, token: str) -> bool:
+     def _filter_token(self, token: str) -> bool:
          for f in self.__ignoring_filters:
              if f.filter(token):
                  return True
          return False

-     def __filter_tokens(self, tokens: List[str]) -> List[str]:
+     def filter_tokens(self, tokens: List[str]) -> List[str]:
          filtered_tokens: List[str] = []
          for token in tokens:
-             if self.__filter_token(token):
+             if self._filter_token(token):
                  continue
              # the ignoring filter is true if the token matches
              # the user wants to ignore these so keep non-matching tokens
              filtered_tokens.append(token)
          return filtered_tokens

-     def __score_tokens(self, tokens: List[str]) -> float:
+     def score_tokens(self, tokens: List[str]) -> float:
          return self.__scorer.score(tokens, self.__scoring_filters)

      def _is_toki_pona(
@@ -95,26 +96,25 @@ class Ilo:
          - Filtered message (list[str])
          - Cleaned message (list[str])
          - Score (float)
-         - Result (bool)
-         """
-         preprocessed = self.__preprocess(message)
-         tokenized = self.__tokenize(preprocessed)
-         filtered = self.__filter_tokens(tokenized)
-         cleaned = self.__clean_tokens(filtered)
-         score = self.__score_tokens(cleaned)
+         - Result (bool)"""
+         preprocessed = self.preprocess(message)
+         tokenized = self.word_tokenize(preprocessed)
+         filtered = self.filter_tokens(tokenized)
+         cleaned = self.clean_tokens(filtered)
+         score = self.score_tokens(cleaned)
          result = score >= self.__passing_score

-         # NOTE: this method may break if above funcs start sharing a list
          if score <= self.logging_threshold:
-             LOG.debug("Msg: %.2f %s", score, repr(message))
-             LOG.debug("Preproc: %s", repr(preprocessed))
-             LOG.debug("Tokenized: %s", tokenized)
-             LOG.debug("Filtered: %s", filtered)
-             LOG.debug("Cleaned: %s", cleaned)
+             LOG.debug("msg: %.2f %s", score, repr(message))
+             LOG.debug("preproc: %s", repr(preprocessed))
+             LOG.debug("tokenized: %s", tokenized)
+             LOG.debug("filtered: %s", filtered)
+             LOG.debug("cleaned: %s", cleaned)
          # TODO: Move to each function? Loses ability to control when logging occurs by threshold

          return preprocessed, tokenized, filtered, cleaned, score, result

      def is_toki_pona(self, message: str) -> bool:
+         """Determines whether a single statement is or is not Toki Pona."""
          *_, result = self._is_toki_pona(message)
          return result
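Since the per-stage methods are now public, the pipeline can also be driven step by step. A sketch under the assumption that a bundled config unpacks into Ilo as shown earlier; the message is illustrative.

# Sketch: run the now-public stages individually, in the same order _is_toki_pona uses:
# preprocess -> word_tokenize -> filter_tokens -> clean_tokens -> score_tokens.
from sonatoki.ilo import Ilo
from sonatoki.Configs import LazyConfig

ilo = Ilo(**LazyConfig)
msg = ilo.preprocess("toki! o lukin e ni: https://example.com")  # illustrative message
tokens = ilo.word_tokenize(msg)
tokens = ilo.filter_tokens(tokens)
tokens = ilo.clean_tokens(tokens)
score = ilo.score_tokens(tokens)
print(score, score >= LazyConfig["passing_score"])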