sonatoki 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Configs.py ADDED
@@ -0,0 +1,80 @@
+ # STL
+ from copy import deepcopy
+ from typing import List, Type, TypedDict
+
+ # PDM
+ from typing_extensions import NotRequired
+
+ # LOCAL
+ from sonatoki.Filters import (
+     Filter,
+     NimiPu,
+     Numerics,
+     Syllabic,
+     NimiLinku,
+     NimiPuAle,
+     Alphabetic,
+     ProperName,
+     Phonotactic,
+     NimiLinkuAle,
+     Punctuations,
+ )
+ from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
+ from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
+ from sonatoki.Tokenizers import Tokenizer, WordTokenizerTok
+ from sonatoki.Preprocessors import (
+     URLs,
+     Preprocessor,
+     DiscordEmotes,
+     DiscordSpecial,
+     DiscordChannels,
+     DiscordMentions,
+ )
+
+
+ class IloConfig(TypedDict):
+     preprocessors: List[Type[Preprocessor]]
+     word_tokenizer: Type[Tokenizer]
+     cleaners: List[Type[Cleaner]]
+     ignoring_filters: List[Type[Filter]]
+     scoring_filters: List[Type[Filter]]
+     scorer: Type[Scorer]
+     passing_score: Number
+
+
+ BaseConfig: IloConfig = {
+     "preprocessors": [URLs],
+     "cleaners": [ConsecutiveDuplicates],
+     "ignoring_filters": [Numerics, Punctuations],
+     "scoring_filters": [],
+     "scorer": PassFail,
+     "passing_score": 0.8,
+     "word_tokenizer": WordTokenizerTok,
+ }
+
+
+ PrefConfig: IloConfig = deepcopy(BaseConfig)
+ PrefConfig["scoring_filters"].extend([NimiLinku, Syllabic, ProperName, Alphabetic])
+ PrefConfig["scorer"] = SoftScaling
+
+
+ LazyConfig: IloConfig = deepcopy(BaseConfig)
+ LazyConfig["scoring_filters"].extend([Alphabetic, ProperName])
+ LazyConfig["scorer"] = SoftPassFail
+
+ DiscordConfig: IloConfig = deepcopy(PrefConfig)
+ DiscordConfig["preprocessors"].extend(
+     [DiscordEmotes, DiscordMentions, DiscordChannels, DiscordSpecial]
+ )
+ TelegramConfig: IloConfig = deepcopy(PrefConfig)
+ ForumConfig: IloConfig = deepcopy(PrefConfig)
+
+ __all__ = [
+     "IloConfig",
+     "BaseConfig",
+     "PrefConfig",
+     "LazyConfig",
+     "DiscordConfig",
+     "TelegramConfig",
+     "ForumConfig",
+ ]
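Since IloConfig is a plain TypedDict whose keys match the keyword arguments of the Ilo constructor (see sonatoki/ilo.py below), a config can presumably be splatted straight into it. A minimal usage sketch, not taken from the package's own docs:

    # sketch: construct an Ilo from a bundled config and check a message
    from sonatoki.ilo import Ilo
    from sonatoki.Configs import PrefConfig

    ilo = Ilo(**PrefConfig)
    ilo.is_toki_pona("mi olin e sina")  # should score above PrefConfig's passing_score of 0.8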
sonatoki/Filters.py CHANGED
@@ -17,6 +17,7 @@ from sonatoki.constants import (
      NIMI_LINKU_SET,
      NIMI_PU_ALE_SET,
      NIMI_LINKU_ALE_SET,
+     NIMI_LINKU_SANDBOX_SET,
  )

  re.DEFAULT_VERSION = re.VERSION1
@@ -87,6 +88,10 @@ class NimiLinkuAle(SetFilter):
      tokens = NIMI_LINKU_ALE_SET


+ class NimiLinkuSandbox(SetFilter):
+     tokens = NIMI_LINKU_SANDBOX_SET
+
+
  class Phonotactic(RegexFilter):
      """Determines if a given token is phonotactically valid Toki Pona (or `n`).
      Excludes both consecutive nasals and the illegal syllables:
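The new sandbox filter should behave like the other SetFilter subclasses: a classmethod membership test over its word set, i.e. the same filter(token) -> bool interface the scorers call. A hedged sketch:

    # sketch: sandbox words pass NimiLinkuSandbox but not necessarily NimiLinku
    from sonatoki.Filters import NimiLinku, NimiLinkuSandbox

    NimiLinku.filter("toki")            # "toki" is in the main Linku word list
    NimiLinkuSandbox.filter("linluwi")  # True only if "linluwi" appears in sandbox.json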
sonatoki/Preprocessors.py CHANGED
@@ -13,7 +13,7 @@ There are currently two distinct types of Preprocessor:
  - ArrowQuote

  Order does not generally matter, but if there were two overlapping containers such as in the string "|| spoiler ` monospace || `", order would matter.
- As such, each Preprocessor exposes a .precedence attribute which is optionally usable for ordering them. Lower precedence means it should be applied first.
+ It is up to the user to order them appropriately.
  """

  # STL
@@ -27,8 +27,6 @@ re.DEFAULT_VERSION = re.VERSION1


  class Preprocessor(ABC):
-     precedence: int = 0
-
      @classmethod # order matters
      @abstractmethod
      def process(cls, msg: str) -> str:
@@ -104,7 +102,6 @@ class DoubleQuotes(RegexPreprocessor):
  class Backticks(RegexPreprocessor):
      """Remove paired backticks and their contents `like this`"""

-     precedence = -10
      pattern = re.compile(r"`[^`]+`", flags=re.S)


@@ -121,6 +118,9 @@ class ArrowQuote(RegexPreprocessor):


  __all__ = [
+     "DiscordChannels",
+     "DiscordMentions",
+     "DiscordSpecial",
      "DiscordEmotes",
      "SingleQuotes",
      "DoubleQuotes",
sonatoki/Scorers.py CHANGED
@@ -16,6 +16,13 @@ Number = Union[int, float]
  Weights = Dict[str, Number]


+ def sigmoid(n: int) -> Number:
+     return 1 / (1 + math.exp(-(0.30 * (n - 1))))
+     # n-1 makes sigmoid(1) == 0.5
+     # 0.30 softens scaling in favor of short input
+     # return n / (1+abs(n)) # too weak in 0.7+
+
+
  class Scorer(ABC):
      @classmethod
      @abstractmethod
@@ -27,7 +34,7 @@ class PassFail(Scorer):
      """The token passes any filter or fails all of them, scoring 1 or 0 respectively."""

      @classmethod
-     def __score(cls, token: str, filters: List[Type[Filter]]) -> Number:
+     def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
          for f in filters:
              if f.filter(token):
                  score = 1
@@ -47,10 +54,27 @@ class PassFail(Scorer):
          total_score = 0
          len_tokens = len(tokens)
          for token in tokens:
-             total_score += cls.__score(token, filters)
+             total_score += cls.score_token(token, filters)
          return total_score / len_tokens if len_tokens else 0


+ class SoftPassFail(PassFail):
+     @classmethod
+     @override
+     def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+         if not tokens:
+             return 1
+
+         total_score = 0
+         len_tokens = len(tokens)
+         for token in tokens:
+             total_score += cls.score_token(token, filters)
+
+         percentage = total_score / len_tokens if len_tokens else 0
+         percentage **= sigmoid(len_tokens)
+         return percentage
+
+
  class Scaling(Scorer):
      """
      The sooner a token matches a filter, the higher its score.
@@ -91,13 +115,6 @@ class SoftScaling(Scaling):
      For example, a single token scoring 0.64 will now score 0.8.
      """

-     @staticmethod
-     def sigmoid(n: int) -> Number:
-         return 1 / (1 + math.exp(-(0.30 * (n - 1))))
-         # n-1 makes sigmoid(1) == 0.5
-         # 0.30 softens scaling in favor of short input
-         # return n / (1+abs(n)) # too weak in 0.7+
-
      @classmethod
      @override
      def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
@@ -113,11 +130,11 @@ class SoftScaling(Scaling):
              total_score += cls.score_token(token, filters, len_filters)

          percentage = total_score / max_score if max_score else 0
-         percentage **= cls.sigmoid(len_tokens)
+         percentage **= sigmoid(len_tokens)
          return percentage


  class Logarithmic(Scorer): ...


- __all__ = ["PassFail", "Scaling", "SoftScaling"]
+ __all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]
sonatoki/Tokenizers.py CHANGED
@@ -1,8 +1,10 @@
  # STL
- from typing import List, Callable
+ from abc import ABC, abstractmethod
+ from typing import List

  # PDM
  import regex as re
+ from typing_extensions import override

  try:
      # PDM
@@ -15,50 +17,60 @@ except ImportError as e:

  LANGUAGE = "english" # for NLTK

- SENT_DELIMS_RE = r"""(.*?[.?!;:])|(.+?$)"""
- SENT_DELIMS_RE = re.compile(SENT_DELIMS_RE)

- SENT_DELIMS_TOK = r"""(.*?[.?!;:-])|(.+?$)"""
- SENT_DELIMS_TOK = re.compile(SENT_DELIMS_TOK)
+ class Tokenizer(ABC):
+     @classmethod
+     @abstractmethod
+     def tokenize(cls, s: str) -> List[str]: ...


- WORD_DELIMS_RE = r"""\s+|(?=[.?!;:'"-])"""
- WORD_DELIMS_RE = re.compile(WORD_DELIMS_RE)
+ class NoOpTokenizer(Tokenizer):
+     """This is a special case that you do not want or need."""

- WORD_DELIMS_TOK = r"([\p{Punctuation}\p{posix_punct}]+|\s+)"
- WORD_DELIMS_TOK = re.compile(WORD_DELIMS_TOK)
+     @classmethod
+     @override
+     def tokenize(cls, s: str) -> List[str]:
+         return [s]

- Tokenizer = Callable[[str], List[str]]

+ class RegexTokenizer(Tokenizer):
+     pattern: "re.Pattern[str]"
+
+     @classmethod
+     @override
+     def tokenize(cls, s: str) -> List[str]:
+         return [clean for word in re.split(cls.pattern, s) if (clean := word.strip())]

- if not isinstance(nltk, ImportError):

-     def sent_tokenize_nltk(s: str) -> List[str]:
-         return __sent_tokenize_nltk(text=s, language=LANGUAGE)
+ class WordTokenizerTok(RegexTokenizer):
+     pattern = re.compile(r"""([\p{Punctuation}\p{posix_punct}]+|\s+)""")
+     # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
+     # TODO: do the typography characters matter?
+     # NOTE: | / and , are *not* sentence delimiters for my purpose

-     def word_tokenize_nltk(s: str) -> List[str]:
-         return __word_tokenize_nltk(text=s, language=LANGUAGE)

+ class SentTokenizerTok(RegexTokenizer):
+     pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")

-     def sent_tokenize_re(s: str) -> List[str]:
-         return [
-             clean
-             for sent in re.findall(SENT_DELIMS_RE, s)
-             if (clean := sent[0].strip() or sent[1].strip())
-         ]

+ class WordTokenizerRe(RegexTokenizer):
+     pattern = re.compile(r"""(?<=[.?!;:'"-])""")

-     def word_tokenize_re(s: str) -> List[str]:
-         return [clean for word in re.split(WORD_DELIMS_RE, s) if (clean := word.strip())]

+ class SentTokenizerRe(RegexTokenizer):
+     pattern = re.compile(r"""(.*?[.?!;:])|(.+?$)""")

-     def sent_tokenize_tok(s: str) -> List[str]:
-         return [
-             clean
-             for sent in re.findall(SENT_DELIMS_TOK, s)
-             if (clean := sent[0].strip() or sent[1].strip())
-         ]

+ if not isinstance(nltk, ImportError):

-     def word_tokenize_tok(s: str) -> List[str]:
-         return [clean for word in re.split(WORD_DELIMS_TOK, s) if (clean := word.strip())]
+     class WordTokenizerNLTK(Tokenizer):
+         @classmethod
+         @override
+         def tokenize(cls, s: str) -> List[str]:
+             return __word_tokenize_nltk(text=s, language=LANGUAGE)
+
+     class SentTokenizerNLTK(Tokenizer):
+         @classmethod
+         @override
+         def tokenize(cls, s: str) -> List[str]:
+             return __sent_tokenize_nltk(text=s, language=LANGUAGE)
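The regex-based word tokenizer splits on runs of punctuation or whitespace; because the pattern is a capturing group, re.split keeps the punctuation runs as their own tokens and the comprehension drops the whitespace-only pieces. A sketch of the expected behaviour:

    from sonatoki.Tokenizers import WordTokenizerTok

    WordTokenizerTok.tokenize("toki! sina pona ala pona?")
    # presumably: ['toki', '!', 'sina', 'pona', 'ala', 'pona', '?']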
sonatoki/constants.py CHANGED
@@ -4,6 +4,7 @@ from typing import Dict, List
  from pathlib import Path

  LINKU = Path(__file__).resolve().parent / Path("linku.json")
+ SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")

  VOWELS = "aeiou"
  CONSONANTS = "jklmnpstw"
@@ -29,10 +30,16 @@ with open(LINKU) as f:
      ]
      NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]

+ with open(SANDBOX) as f:
+     r: Dict[str, Dict[str, str]] = json.loads(f.read())
+     NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in r.values()]
+
+
  NIMI_PU_SET = set(NIMI_PU)
  NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
  NIMI_LINKU_SET = set(NIMI_LINKU)
  NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
+ NIMI_LINKU_SANDBOX_SET = set(NIMI_LINKU_SANDBOX)
  ALLOWABLES_SET = set(ALLOWABLES)

  __all__ = [
@@ -54,4 +61,7 @@ __all__ = [
      #
      "NIMI_LINKU_ALE",
      "NIMI_LINKU_ALE_SET",
+     #
+     "NIMI_LINKU_SANDBOX",
+     "NIMI_LINKU_SANDBOX_SET",
  ]
sonatoki/ilo.py CHANGED
@@ -14,13 +14,13 @@ LOG = logging.getLogger(__name__)

  class Ilo:
      __preprocessors: List[Type[Preprocessor]]
+     __word_tokenizer: Type[Tokenizer]
      __cleaners: List[Type[Cleaner]]
      __ignoring_filters: List[Type[Filter]]
      __scoring_filters: List[Type[Filter]]
      __scorer: Type[Scorer]
-     __tokenize: Tokenizer
      __passing_score: Number
-     logging_threshold: Number = 1.0
+     logging_threshold: Number = -1

      def __init__(
          self,
@@ -29,61 +29,62 @@ class Ilo:
          ignoring_filters: List[Type[Filter]],
          scoring_filters: List[Type[Filter]],
          scorer: Type[Scorer],
-         tokenizer: Tokenizer, # NOTE: no wrapper needed?
          passing_score: Number,
+         word_tokenizer: Type[Tokenizer],
      ):
          super().__init__()
          # avoid keeping a ref to user's list just in case
          self.__preprocessors = [*preprocessors]
+         self.__word_tokenizer = word_tokenizer
          self.__cleaners = [*cleaners]
          self.__ignoring_filters = [*ignoring_filters]
          self.__scoring_filters = [*scoring_filters]
          self.__scorer = scorer
-         self.__tokenize = tokenizer
          self.__passing_score = passing_score

-     def __preprocess(self, msg: str) -> str:
+     def preprocess(self, msg: str) -> str:
          for p in self.__preprocessors:
              msg = p.process(msg)
          return msg

-     def __clean_token(self, token: str) -> str:
+     def word_tokenize(self, msg: str) -> List[str]:
+         """It is *highly* recommended that you run `ilo.preprocess` first."""
+         return self.__word_tokenizer.tokenize(msg)
+
+     def clean_token(self, token: str) -> str:
          for c in self.__cleaners:
              token = c.clean(token)
          return token

-     def __clean_tokens(self, tokens: List[str]) -> List[str]:
-         # NOTE: tested, making a new list with a for loop *is* faster than
-         # - list comps
-         # - generator comps
-         # - in-place replacement/removal
-         # - in place replacement with result of generator comp
+     def clean_tokens(self, tokens: List[str]) -> List[str]:
+         # NOTE: tested, making a new list with a for loop *is* faster than:
+         # list comp, generator comp, in-place replacement
          cleaned_tokens: List[str] = list()
          for token in tokens:
-             cleaned_token = self.__clean_token(token)
+             cleaned_token = self.clean_token(token)
              if not cleaned_token:
                  # TODO: warn user?
                  continue
              cleaned_tokens.append(cleaned_token)
          return cleaned_tokens

-     def __filter_token(self, token: str) -> bool:
+     def _filter_token(self, token: str) -> bool:
          for f in self.__ignoring_filters:
              if f.filter(token):
                  return True
          return False

-     def __filter_tokens(self, tokens: List[str]) -> List[str]:
+     def filter_tokens(self, tokens: List[str]) -> List[str]:
          filtered_tokens: List[str] = []
          for token in tokens:
-             if self.__filter_token(token):
+             if self._filter_token(token):
                  continue
              # the ignoring filter is true if the token matches
              # the user wants to ignore these so keep non-matching tokens
              filtered_tokens.append(token)
          return filtered_tokens

-     def __score_tokens(self, tokens: List[str]) -> float:
+     def score_tokens(self, tokens: List[str]) -> float:
          return self.__scorer.score(tokens, self.__scoring_filters)

      def _is_toki_pona(
@@ -95,26 +96,25 @@ class Ilo:
          - Filtered message (list[str])
          - Cleaned message (list[str])
          - Score (float)
-         - Result (bool)
-         """
-         preprocessed = self.__preprocess(message)
-         tokenized = self.__tokenize(preprocessed)
-         filtered = self.__filter_tokens(tokenized)
-         cleaned = self.__clean_tokens(filtered)
-         score = self.__score_tokens(cleaned)
+         - Result (bool)"""
+         preprocessed = self.preprocess(message)
+         tokenized = self.word_tokenize(preprocessed)
+         filtered = self.filter_tokens(tokenized)
+         cleaned = self.clean_tokens(filtered)
+         score = self.score_tokens(cleaned)
          result = score >= self.__passing_score

-         # NOTE: this method may break if above funcs start sharing a list
          if score <= self.logging_threshold:
-             LOG.debug("Msg: %.2f %s", score, repr(message))
-             LOG.debug("Preproc: %s", repr(preprocessed))
-             LOG.debug("Tokenized: %s", tokenized)
-             LOG.debug("Filtered: %s", filtered)
-             LOG.debug("Cleaned: %s", cleaned)
+             LOG.debug("msg: %.2f %s", score, repr(message))
+             LOG.debug("preproc: %s", repr(preprocessed))
+             LOG.debug("tokenized: %s", tokenized)
+             LOG.debug("filtered: %s", filtered)
+             LOG.debug("cleaned: %s", cleaned)
          # TODO: Move to each function? Loses ability to control when logging occurs by threshold

          return preprocessed, tokenized, filtered, cleaned, score, result

      def is_toki_pona(self, message: str) -> bool:
+         """Determines whether a single statement is or is not Toki Pona."""
          *_, result = self._is_toki_pona(message)
          return result
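Since the pipeline stages are public methods now, they can also be driven step by step; a sketch under the same PrefConfig assumption as above:

    # sketch: run the stages of _is_toki_pona by hand
    from sonatoki.ilo import Ilo
    from sonatoki.Configs import PrefConfig

    ilo = Ilo(**PrefConfig)
    msg = ilo.preprocess("toki! o lukin e ni: https://example.com")
    tokens = ilo.word_tokenize(msg)      # run after preprocess, as the docstring recommends
    tokens = ilo.filter_tokens(tokens)   # drop tokens matched by the ignoring filters
    tokens = ilo.clean_tokens(tokens)
    print(ilo.score_tokens(tokens))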