sonatoki 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Configs.py ADDED
@@ -0,0 +1,80 @@
+ # STL
+ from copy import deepcopy
+ from typing import List, Type, TypedDict
+
+ # PDM
+ from typing_extensions import NotRequired
+
+ # LOCAL
+ from sonatoki.Filters import (
+     Filter,
+     NimiPu,
+     Numeric,
+     Syllabic,
+     NimiLinku,
+     NimiPuAle,
+     Alphabetic,
+     ProperName,
+     Phonotactic,
+     Punctuation,
+     NimiLinkuAle,
+ )
+ from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
+ from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
+ from sonatoki.Tokenizers import Tokenizer, WordTokenizerTok
+ from sonatoki.Preprocessors import (
+     URLs,
+     Preprocessor,
+     DiscordEmotes,
+     DiscordSpecial,
+     DiscordChannels,
+     DiscordMentions,
+ )
+
+
+ class IloConfig(TypedDict):
+     preprocessors: List[Type[Preprocessor]]
+     word_tokenizer: Type[Tokenizer]
+     cleaners: List[Type[Cleaner]]
+     ignoring_filters: List[Type[Filter]]
+     scoring_filters: List[Type[Filter]]
+     scorer: Type[Scorer]
+     passing_score: Number
+
+
+ BaseConfig: IloConfig = {
+     "preprocessors": [URLs],
+     "cleaners": [ConsecutiveDuplicates],
+     "ignoring_filters": [Numeric, Punctuation],
+     "scoring_filters": [],
+     "scorer": PassFail,
+     "passing_score": 0.8,
+     "word_tokenizer": WordTokenizerTok,
+ }
+
+
+ PrefConfig: IloConfig = deepcopy(BaseConfig)
+ PrefConfig["scoring_filters"].extend([NimiLinku, Syllabic, ProperName, Alphabetic])
+ PrefConfig["scorer"] = SoftScaling
+
+
+ LazyConfig: IloConfig = deepcopy(BaseConfig)
+ LazyConfig["scoring_filters"].extend([Alphabetic, ProperName])
+ LazyConfig["scorer"] = SoftPassFail
+
+ DiscordConfig: IloConfig = deepcopy(PrefConfig)
+ DiscordConfig["preprocessors"].extend(
+     [DiscordEmotes, DiscordMentions, DiscordChannels, DiscordSpecial]
+ )
+ TelegramConfig: IloConfig = deepcopy(PrefConfig)
+ ForumConfig: IloConfig = deepcopy(PrefConfig)
+
+ __all__ = [
+     "IloConfig",
+     "BaseConfig",
+     "PrefConfig",
+     "LazyConfig",
+     "DiscordConfig",
+     "TelegramConfig",
+     "ForumConfig",
+ ]
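
The new Configs module bundles every argument of Ilo's constructor into a TypedDict, so a config can be unpacked straight into Ilo. A minimal sketch, assuming the IloConfig keys continue to match Ilo.__init__'s keyword arguments as shown later in this diff; the NimiLinkuSandbox tweak is an illustrative customization, not part of the package:

# sketch: constructing an Ilo from the prebuilt configs
from copy import deepcopy

from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig, DiscordConfig
from sonatoki.Filters import NimiLinkuSandbox

ilo = Ilo(**PrefConfig)  # unpack a prebuilt config directly

my_config = deepcopy(DiscordConfig)  # copy, then tweak before constructing
my_config["scoring_filters"].append(NimiLinkuSandbox)
my_ilo = Ilo(**my_config)
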
sonatoki/Filters.py CHANGED
@@ -17,6 +17,7 @@ from sonatoki.constants import (
      NIMI_LINKU_SET,
      NIMI_PU_ALE_SET,
      NIMI_LINKU_ALE_SET,
+     NIMI_LINKU_SANDBOX_SET,
  )
  
  re.DEFAULT_VERSION = re.VERSION1
@@ -87,6 +88,10 @@ class NimiLinkuAle(SetFilter):
      tokens = NIMI_LINKU_ALE_SET
  
  
+ class NimiLinkuSandbox(SetFilter):
+     tokens = NIMI_LINKU_SANDBOX_SET
+
+
  class Phonotactic(RegexFilter):
      """Determines if a given token is phonotactically valid Toki Pona (or `n`).
      Excludes both consecutive nasals and the illegal syllables:
@@ -126,7 +131,7 @@ class Alphabetic(Filter):
          return set(token.lower()).issubset(ALPHABET_SET)
  
  
- class Numerics(Filter):
+ class Numeric(Filter):
      """Determine if a given token is entirely numeric.
      Covers all numeric symbols in Unicode.
  
@@ -142,7 +147,7 @@ class Numerics(Filter):
          return msg.isnumeric()
  
  
- class Punctuations(RegexFilter):
+ class Punctuation(RegexFilter):
      pattern = re.compile(r"[\p{Punctuation}\p{posix_punct}]+")
  
  
@@ -154,6 +159,6 @@ __all__ = [
      "Syllabic",
      "Alphabetic",
      "ProperName",
-     "Punctuations",
-     "Numerics",
+     "Punctuation",
+     "Numeric",
  ]
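
Besides the new NimiLinkuSandbox filter, note the renames Numerics -> Numeric and Punctuations -> Punctuation; imports of the old names will break. A short sketch, assuming the classmethod Filter.filter(token) -> bool interface that ilo.py relies on elsewhere in this diff:

# sketch: the renamed and new filters
from sonatoki.Filters import Numeric, Punctuation, NimiLinkuSandbox

Numeric.filter("123")            # True: token is entirely numeric
Punctuation.filter("?!")         # True: token is only punctuation
NimiLinkuSandbox.filter("toki")  # membership test against the Linku sandbox word list
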
sonatoki/Preprocessors.py CHANGED
@@ -13,7 +13,7 @@ There are currently two distinct types of Preprocessor:
  - ArrowQuote
  
  Order does not generally matter, but if there were two overlapping containers such as in the string "|| spoiler ` monospace || `", order would matter.
- As such, each Preprocessor exposes a .precedence attribute which is optionally usable for ordering them. Lower precedence means it should be applied first.
+ It is up to the user to order them appropriately.
  """
  
  # STL
@@ -27,8 +27,6 @@ re.DEFAULT_VERSION = re.VERSION1
  
  
  class Preprocessor(ABC):
-     precedence: int = 0
-
      @classmethod # order matters
      @abstractmethod
      def process(cls, msg: str) -> str:
@@ -64,6 +62,13 @@ class URLs(RegexPreprocessor):
      pattern = re.compile(r"https?:\/\/\S+")
  
  
+ class Reference(RegexPreprocessor):
+     """Remove text contained in double brackets.
+     Often used to fetch articles on Wikipedia, or Magic the Gathering cards."""
+
+     pattern = re.compile(r"\[\[.+\]\]")
+
+
  class DiscordEmotes(RegexPreprocessor):
      """Remove text-formatted Discord emotes `<flags:name:id>`"""
  
@@ -82,6 +87,13 @@ class DiscordSpecial(RegexPreprocessor):
      pattern = re.compile(r"<id:[a-zA-Z0-9_]{4,}>")
  
  
+ class AngleBracketObject(RegexPreprocessor):
+     """A generalized version of the Discord-specific angle bracket objects.
+     Removes any contiguous (not broken by whitespace) text in angle brackets."""
+
+     pattern = re.compile(r"<[^<>\s]+>")
+
+
  """
  The following classes are Containers.
  
@@ -94,24 +106,23 @@ would likely be using a language other than Toki Pona.
  
  
  class SingleQuotes(RegexPreprocessor):
-     pattern = re.compile(r"'[^']+'", flags=re.S) # . matches newline
+     pattern = re.compile(r"'[^']+'", flags=re.DOTALL)
  
  
  class DoubleQuotes(RegexPreprocessor):
-     pattern = re.compile(r'"[^"]+"', flags=re.S)
+     pattern = re.compile(r'"[^"]+"', flags=re.DOTALL)
  
  
  class Backticks(RegexPreprocessor):
      """Remove paired backticks and their contents `like this`"""
  
-     precedence = -10
-     pattern = re.compile(r"`[^`]+`", flags=re.S)
+     pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)
  
  
  class Spoilers(RegexPreprocessor):
      """Remove paired double bars and their contents `||like this||`"""
  
-     pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.S)
+     pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.DOTALL)
  
  
  class ArrowQuote(RegexPreprocessor):
@@ -120,7 +131,22 @@ class ArrowQuote(RegexPreprocessor):
      pattern = re.compile(r"^>\ .+$", re.MULTILINE)
  
  
+ class AllQuotes(RegexPreprocessor):
+     pattern = re.compile(
+         "|".join(
+             [
+                 SingleQuotes.pattern.pattern,
+                 DoubleQuotes.pattern.pattern,
+                 Backticks.pattern.pattern,
+                 ArrowQuote.pattern.pattern,
+             ]
+         ),
+         flags=re.MULTILINE | re.DOTALL,
+     )
+
+
  __all__ = [
+     "AngleBracketObject",
      "DiscordChannels",
      "DiscordMentions",
      "DiscordSpecial",
@@ -128,7 +154,9 @@ __all__ = [
      "SingleQuotes",
      "DoubleQuotes",
      "ArrowQuote",
+     "AllQuotes",
      "Backticks",
+     "Reference",
      "Spoilers",
      "URLs",
  ]
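
With the precedence attribute gone, ordering preprocessors is now the caller's responsibility. A sketch of the new Reference, AngleBracketObject, and AllQuotes preprocessors, assuming only the Preprocessor.process(msg) -> str classmethod defined above; the sample message is illustrative:

# sketch: applying the new preprocessors in caller-chosen order
from sonatoki.Preprocessors import AngleBracketObject, Reference, AllQuotes

msg = 'sina toki e <@1234> anu [[wiki page]] anu "quoted text"?'
for p in (AngleBracketObject, Reference, AllQuotes):
    msg = p.process(msg)
# the angle-bracket object, double-bracket reference, and quoted span are removed
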
sonatoki/Tokenizers.py CHANGED
@@ -1,10 +1,10 @@
  # STL
- from typing import List, Callable
+ from abc import ABC, abstractmethod
+ from typing import List
  
  # PDM
  import regex as re
-
- # TODO: Entire module should be reworked to match the class scheme of the rest of the module, imo
+ from typing_extensions import override
  
  try:
      # PDM
@@ -17,42 +17,60 @@ except ImportError as e:
  
  LANGUAGE = "english" # for NLTK
  
- SENT_DELIMS_RE = re.compile(r"""(.*?[.?!;:])|(.+?$)""")
- SENT_DELIMS_TOK = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")
- # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
- # TODO: do the typography characters matter?
- # NOTE: | / and , are *not* sentence delimiters for my purpose
  
- WORD_DELIMS_RE = re.compile(r"""\s+|(?=[.?!;:'"-])""")
- WORD_DELIMS_TOK = re.compile(r"([\p{Punctuation}\p{posix_punct}]+|\s+)")
+ class Tokenizer(ABC):
+     @classmethod
+     @abstractmethod
+     def tokenize(cls, s: str) -> List[str]: ...
  
- Tokenizer = Callable[[str], List[str]]
  
+ class NoOpTokenizer(Tokenizer):
+     """This is a special case that you do not want or need."""
  
- if not isinstance(nltk, ImportError):
+     @classmethod
+     @override
+     def tokenize(cls, s: str) -> List[str]:
+         return [s]
+
+
+ class RegexTokenizer(Tokenizer):
+     pattern: "re.Pattern[str]"
  
-     def sent_tokenize_nltk(s: str) -> List[str]:
-         return __sent_tokenize_nltk(text=s, language=LANGUAGE)
+     @classmethod
+     @override
+     def tokenize(cls, s: str) -> List[str]:
+         return [clean for word in re.split(cls.pattern, s) if (clean := word.strip())]
  
-     def word_tokenize_nltk(s: str) -> List[str]:
-         return __word_tokenize_nltk(text=s, language=LANGUAGE)
  
+ class WordTokenizerTok(RegexTokenizer):
+     pattern = re.compile(r"""([\p{Punctuation}\p{posix_punct}]+|\s+)""")
+     # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
+     # TODO: do the typography characters matter?
+     # NOTE: | / and , are *not* sentence delimiters for my purpose
  
- def sent_tokenize_re(s: str) -> List[str]:
-     return [
-         clean
-         for sent in re.findall(SENT_DELIMS_RE, s)
-         if (clean := sent[0].strip() or sent[1].strip())
-     ]
  
+ class SentTokenizerTok(RegexTokenizer):
+     pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")
  
- def word_tokenize_re(s: str) -> List[str]:
-     return [clean for word in re.split(WORD_DELIMS_RE, s) if (clean := word.strip())]
  
+ class WordTokenizerRe(RegexTokenizer):
+     pattern = re.compile(r"""(?<=[.?!;:'"-])""")
  
- def sent_tokenize_tok(s: str) -> List[str]:
-     return [clean for sent in re.split(SENT_DELIMS_TOK, s) if (clean := sent.strip())]
  
+ class SentTokenizerRe(RegexTokenizer):
+     pattern = re.compile(r"""(.*?[.?!;:])|(.+?$)""")
+
+
+ if not isinstance(nltk, ImportError):
  
- def word_tokenize_tok(s: str) -> List[str]:
-     return [clean for word in re.split(WORD_DELIMS_TOK, s) if (clean := word.strip())]
+     class WordTokenizerNLTK(Tokenizer):
+         @classmethod
+         @override
+         def tokenize(cls, s: str) -> List[str]:
+             return __word_tokenize_nltk(text=s, language=LANGUAGE)
+
+     class SentTokenizerNLTK(Tokenizer):
+         @classmethod
+         @override
+         def tokenize(cls, s: str) -> List[str]:
+             return __sent_tokenize_nltk(text=s, language=LANGUAGE)
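
Tokenizers are now classes with a tokenize classmethod rather than bare functions, so callers of word_tokenize_tok and friends must switch to the corresponding class. A sketch; the expected output is inferred from the regex above rather than taken from the package's tests:

# sketch: the class-based word tokenizer
from sonatoki.Tokenizers import WordTokenizerTok

WordTokenizerTok.tokenize("toki, pona!")
# -> ["toki", ",", "pona", "!"]  (punctuation runs and whitespace both delimit)
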
sonatoki/constants.py CHANGED
@@ -4,6 +4,7 @@ from typing import Dict, List
  from pathlib import Path
  
  LINKU = Path(__file__).resolve().parent / Path("linku.json")
+ SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")
  
  VOWELS = "aeiou"
  CONSONANTS = "jklmnpstw"
@@ -29,10 +30,16 @@ with open(LINKU) as f:
      ]
      NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]
  
+ with open(SANDBOX) as f:
+     r: Dict[str, Dict[str, str]] = json.loads(f.read())
+     NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in r.values()]
+
+
  NIMI_PU_SET = set(NIMI_PU)
  NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
  NIMI_LINKU_SET = set(NIMI_LINKU)
  NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
+ NIMI_LINKU_SANDBOX_SET = set(NIMI_LINKU_SANDBOX)
  ALLOWABLES_SET = set(ALLOWABLES)
  
  __all__ = [
@@ -54,4 +61,7 @@ __all__ = [
      #
      "NIMI_LINKU_ALE",
      "NIMI_LINKU_ALE_SET",
+     #
+     "NIMI_LINKU_SANDBOX",
+     "NIMI_LINKU_SANDBOX_SET",
  ]
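
constants.py now also loads sandbox.json and exposes NIMI_LINKU_SANDBOX alongside the existing word lists. A sketch, assuming sandbox.json ships in the wheel next to linku.json; the helper below is hypothetical:

# sketch: combining the standard and sandbox vocabularies
from sonatoki.constants import NIMI_LINKU_SET, NIMI_LINKU_SANDBOX_SET

def is_known_word(token: str) -> bool:
    # hypothetical helper: accept both Linku and sandbox vocabulary
    return token in NIMI_LINKU_SET or token in NIMI_LINKU_SANDBOX_SET
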
sonatoki/ilo.py CHANGED
@@ -14,13 +14,13 @@ LOG = logging.getLogger(__name__)
  
  class Ilo:
      __preprocessors: List[Type[Preprocessor]]
+     __word_tokenizer: Type[Tokenizer]
      __cleaners: List[Type[Cleaner]]
      __ignoring_filters: List[Type[Filter]]
      __scoring_filters: List[Type[Filter]]
      __scorer: Type[Scorer]
-     __tokenize: Tokenizer
      __passing_score: Number
-     logging_threshold: Number = 1.0
+     logging_threshold: Number = -1
  
      def __init__(
          self,
@@ -29,61 +29,62 @@ class Ilo:
          ignoring_filters: List[Type[Filter]],
          scoring_filters: List[Type[Filter]],
          scorer: Type[Scorer],
-         tokenizer: Tokenizer, # NOTE: no wrapper needed?
          passing_score: Number,
+         word_tokenizer: Type[Tokenizer],
      ):
          super().__init__()
          # avoid keeping a ref to user's list just in case
          self.__preprocessors = [*preprocessors]
+         self.__word_tokenizer = word_tokenizer
          self.__cleaners = [*cleaners]
          self.__ignoring_filters = [*ignoring_filters]
          self.__scoring_filters = [*scoring_filters]
          self.__scorer = scorer
-         self.__tokenize = tokenizer
          self.__passing_score = passing_score
  
-     def __preprocess(self, msg: str) -> str:
+     def preprocess(self, msg: str) -> str:
          for p in self.__preprocessors:
              msg = p.process(msg)
          return msg
  
-     def __clean_token(self, token: str) -> str:
+     def word_tokenize(self, msg: str) -> List[str]:
+         """It is *highly* recommended that you run `ilo.preprocess` first."""
+         return self.__word_tokenizer.tokenize(msg)
+
+     def clean_token(self, token: str) -> str:
          for c in self.__cleaners:
              token = c.clean(token)
          return token
  
-     def __clean_tokens(self, tokens: List[str]) -> List[str]:
-         # NOTE: tested, making a new list with a for loop *is* faster than
-         # - list comps
-         # - generator comps
-         # - in-place replacement/removal
-         # - in place replacement with result of generator comp
+     def clean_tokens(self, tokens: List[str]) -> List[str]:
+         # NOTE: tested, making a new list with a for loop *is* faster than:
+         # list comp, generator comp, in-place replacement
          cleaned_tokens: List[str] = list()
          for token in tokens:
-             cleaned_token = self.__clean_token(token)
+             cleaned_token = self.clean_token(token)
              if not cleaned_token:
                  # TODO: warn user?
                  continue
             cleaned_tokens.append(cleaned_token)
          return cleaned_tokens
  
-     def __filter_token(self, token: str) -> bool:
+     def _filter_token(self, token: str) -> bool:
          for f in self.__ignoring_filters:
              if f.filter(token):
                  return True
          return False
  
-     def __filter_tokens(self, tokens: List[str]) -> List[str]:
+     def filter_tokens(self, tokens: List[str]) -> List[str]:
          filtered_tokens: List[str] = []
          for token in tokens:
-             if self.__filter_token(token):
+             if self._filter_token(token):
                  continue
              # the ignoring filter is true if the token matches
              # the user wants to ignore these so keep non-matching tokens
              filtered_tokens.append(token)
          return filtered_tokens
  
-     def __score_tokens(self, tokens: List[str]) -> float:
+     def score_tokens(self, tokens: List[str]) -> float:
          return self.__scorer.score(tokens, self.__scoring_filters)
  
      def _is_toki_pona(
@@ -95,26 +96,25 @@ class Ilo:
          - Filtered message (list[str])
          - Cleaned message (list[str])
          - Score (float)
-         - Result (bool)
-         """
-         preprocessed = self.__preprocess(message)
-         tokenized = self.__tokenize(preprocessed)
-         filtered = self.__filter_tokens(tokenized)
-         cleaned = self.__clean_tokens(filtered)
-         score = self.__score_tokens(cleaned)
+         - Result (bool)"""
+         preprocessed = self.preprocess(message)
+         tokenized = self.word_tokenize(preprocessed)
+         filtered = self.filter_tokens(tokenized)
+         cleaned = self.clean_tokens(filtered)
+         score = self.score_tokens(cleaned)
          result = score >= self.__passing_score
  
-         # NOTE: this method may break if above funcs start sharing a list
          if score <= self.logging_threshold:
-             LOG.debug("Msg: %.2f %s", score, repr(message))
-             LOG.debug("Preproc: %s", repr(preprocessed))
-             LOG.debug("Tokenized: %s", tokenized)
-             LOG.debug("Filtered: %s", filtered)
-             LOG.debug("Cleaned: %s", cleaned)
+             LOG.debug("msg: %.2f %s", score, repr(message))
+             LOG.debug("preproc: %s", repr(preprocessed))
+             LOG.debug("tokenized: %s", tokenized)
+             LOG.debug("filtered: %s", filtered)
+             LOG.debug("cleaned: %s", cleaned)
          # TODO: Move to each function? Loses ability to control when logging occurs by threshold
  
          return preprocessed, tokenized, filtered, cleaned, score, result
  
      def is_toki_pona(self, message: str) -> bool:
+         """Determines whether a single statement is or is not Toki Pona."""
          *_, result = self._is_toki_pona(message)
          return result
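
The pipeline stages are now public methods (preprocess, word_tokenize, filter_tokens, clean_tokens, score_tokens), and the tokenizer is passed as a class via word_tokenizer. A sketch, assuming Ilo is built from one of the configs added in this release:

# sketch: the now-public pipeline methods
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)
ilo.is_toki_pona("mi olin e sina")  # full pipeline, returns a bool

# the individual stages can also be driven by hand, in the order _is_toki_pona uses
text = ilo.preprocess("o lukin e ni: https://example.com")
tokens = ilo.word_tokenize(text)  # run preprocess first, as the docstring advises
cleaned = ilo.clean_tokens(ilo.filter_tokens(tokens))
score = ilo.score_tokens(cleaned)
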