sonatoki 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {sonatoki-0.1.1 → sonatoki-0.1.3}/PKG-INFO +30 -24
  2. {sonatoki-0.1.1 → sonatoki-0.1.3}/README.md +29 -23
  3. {sonatoki-0.1.1 → sonatoki-0.1.3}/pyproject.toml +1 -1
  4. sonatoki-0.1.3/src/sonatoki/Configs.py +80 -0
  5. {sonatoki-0.1.1 → sonatoki-0.1.3}/src/sonatoki/Filters.py +5 -0
  6. {sonatoki-0.1.1 → sonatoki-0.1.3}/src/sonatoki/Preprocessors.py +4 -4
  7. {sonatoki-0.1.1 → sonatoki-0.1.3}/src/sonatoki/Scorers.py +28 -11
  8. sonatoki-0.1.3/src/sonatoki/Tokenizers.py +76 -0
  9. {sonatoki-0.1.1 → sonatoki-0.1.3}/src/sonatoki/constants.py +10 -0
  10. {sonatoki-0.1.1 → sonatoki-0.1.3}/src/sonatoki/ilo.py +30 -30
  11. sonatoki-0.1.3/src/sonatoki/linku.json +1 -0
  12. sonatoki-0.1.3/src/sonatoki/sandbox.json +1 -0
  13. sonatoki-0.1.3/tests/test_ilo.py +158 -0
  14. {sonatoki-0.1.1 → sonatoki-0.1.3}/tests/test_tokenize.py +11 -11
  15. sonatoki-0.1.3/tests/tokenize_cases/tokenize_sentences_tok.yml +37 -0
  16. {sonatoki-0.1.1 → sonatoki-0.1.3}/tests/tokenize_cases/tokenize_words.yml +0 -4
  17. {sonatoki-0.1.1 → sonatoki-0.1.3}/tests/tokenize_cases/tokenize_words_tok.yml +12 -0
  18. sonatoki-0.1.1/src/sonatoki/Tokenizers.py +0 -64
  19. sonatoki-0.1.1/src/sonatoki/linku.json +0 -1
  20. sonatoki-0.1.1/tests/test_ilo.py +0 -73
  21. sonatoki-0.1.1/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -20
  22. {sonatoki-0.1.1 → sonatoki-0.1.3}/LICENSE +0 -0
  23. {sonatoki-0.1.1 → sonatoki-0.1.3}/src/sonatoki/Cleaners.py +0 -0
  24. {sonatoki-0.1.1 → sonatoki-0.1.3}/src/sonatoki/__init__.py +0 -0
  25. {sonatoki-0.1.1 → sonatoki-0.1.3}/src/sonatoki/__main__.py +0 -0
  26. {sonatoki-0.1.1 → sonatoki-0.1.3}/tests/__init__.py +0 -0
  27. {sonatoki-0.1.1 → sonatoki-0.1.3}/tests/test_cleaners.py +0 -0
  28. {sonatoki-0.1.1 → sonatoki-0.1.3}/tests/test_filters.py +0 -0
  29. {sonatoki-0.1.1 → sonatoki-0.1.3}/tests/test_preprocessors.py +0 -0
  30. {sonatoki-0.1.1 → sonatoki-0.1.3}/tests/test_scorers.py +0 -0
  31. {sonatoki-0.1.1 → sonatoki-0.1.3}/tests/test_utils.py +0 -0
  32. {sonatoki-0.1.1 → sonatoki-0.1.3}/tests/tokenize_cases/tokenize_sentences.yml +0 -0
{sonatoki-0.1.1 → sonatoki-0.1.3}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sonatoki
- Version: 0.1.1
+ Version: 0.1.3
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
  License: AGPL-3.0-or-later
@@ -20,9 +20,9 @@ This library, "Language Knowledge," helps you identify whether a message is in T

  I wrote it with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool will be rewritten to use this library shortly.

- If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in toki pona," and this applies to essentially any language.
+ If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language, and this question applies to Toki Pona too.

- This project "solves" that complex problem by offering a highly configurable and incredibly lazy parser
+ This project "solves" that complex problem by offering a highly configurable parser, so you can tune it to your preferences and goals.

  ## Quick Start

@@ -36,28 +36,11 @@ pdm add sonatoki
  Then get started with a script along these lines:

  ```py
- from sonatoki.Filters import (
- Numerics,
- Syllabic,
- NimiLinku,
- Alphabetic,
- ProperName,
- Punctuations,
- )
- from sonatoki.Scorers import SoftScaling
- from sonatoki.Cleaners import ConsecutiveDuplicates
- from sonatoki.Tokenizers import word_tokenize_tok
- from sonatoki.Preprocessors import URLs, DiscordEmotes
+ from sonatoki.ilo import Ilo
+ from sonatoki.Configs import PrefConfig

  def main():
- ilo = Ilo(
- preprocessors=[URLs, DiscordEmotes],
- ignoring_filters=[Numerics, Punctuations],
- scoring_filters=[NimiLinku, Syllabic, ProperName, Alphabetic],
- cleaners=[ConsecutiveDuplicates],
- scorer=SoftScaling,
- tokenizer=word_tokenize_tok,
- )
+ ilo = Ilo(**PrefConfig)
  ilo.is_toki_pona("imagine how is touch the sky") # False
  ilo.is_toki_pona("o pilin insa e ni: sina pilin e sewi") # True
  ilo.is_toki_pona("I Think I Can Evade Detection") # False
@@ -66,7 +49,30 @@ if __name__ == "__main__":
  main()
  ```

- `Ilo` is highly configurable by design, so I recommend exploring the `Preprocessors`, `Filters`, and `Scorers` modules. The `Cleaners` module only contains one cleaner, which I recommend using. The `Tokenizers` module contains several other word tokenizers, but their performance will be worse than the dedicated Toki Pona tokenizer `word_tokenize_tok`.
+ Or if you'd prefer to configure on your own:
+
+ ```py
+ from copy import deepcopy
+ from sonatoki.ilo import Ilo
+ from sonatoki.Configs import BaseConfig
+ from sonatoki.Filters import NimiPuAle, Phonotactic, ProperName
+ from sonatoki.Scorers import SoftPassFail
+
+ def main():
+ config = deepcopy(BaseConfig)
+ config["scoring_filters"].extend([NimiPuAle, Phonotactic, ProperName])
+ config["scorer"] = SoftPassFail
+
+ ilo = Ilo(**config)
+ ilo.is_toki_pona("mu mu!") # True
+ ilo.is_toki_pona("mi namako e moku mi") # True
+ ilo.is_toki_pona("ma wulin") # False
+
+ if __name__ == "__main__":
+ main()
+ ```
+
+ `Ilo` is highly configurable by necessity, so I recommend looking through the premade configs in `Configs` as well as the individual `Preprocessors`, `Filters`, and `Scorers`. The `Cleaners` module only contains one cleaner, which I recommend always using. Similarly, the `Tokenizers` module contains several other word tokenizers, but their performance will be worse than the dedicated Toki Pona tokenizer `WordTokenizerTok`.

  ## Development

{sonatoki-0.1.1 → sonatoki-0.1.3}/README.md

@@ -6,9 +6,9 @@ This library, "Language Knowledge," helps you identify whether a message is in T

  I wrote it with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool will be rewritten to use this library shortly.

- If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in toki pona," and this applies to essentially any language.
+ If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language, and this question applies to Toki Pona too.

- This project "solves" that complex problem by offering a highly configurable and incredibly lazy parser
+ This project "solves" that complex problem by offering a highly configurable parser, so you can tune it to your preferences and goals.

  ## Quick Start

@@ -22,28 +22,11 @@ pdm add sonatoki
  Then get started with a script along these lines:

  ```py
- from sonatoki.Filters import (
- Numerics,
- Syllabic,
- NimiLinku,
- Alphabetic,
- ProperName,
- Punctuations,
- )
- from sonatoki.Scorers import SoftScaling
- from sonatoki.Cleaners import ConsecutiveDuplicates
- from sonatoki.Tokenizers import word_tokenize_tok
- from sonatoki.Preprocessors import URLs, DiscordEmotes
+ from sonatoki.ilo import Ilo
+ from sonatoki.Configs import PrefConfig

  def main():
- ilo = Ilo(
- preprocessors=[URLs, DiscordEmotes],
- ignoring_filters=[Numerics, Punctuations],
- scoring_filters=[NimiLinku, Syllabic, ProperName, Alphabetic],
- cleaners=[ConsecutiveDuplicates],
- scorer=SoftScaling,
- tokenizer=word_tokenize_tok,
- )
+ ilo = Ilo(**PrefConfig)
  ilo.is_toki_pona("imagine how is touch the sky") # False
  ilo.is_toki_pona("o pilin insa e ni: sina pilin e sewi") # True
  ilo.is_toki_pona("I Think I Can Evade Detection") # False
@@ -52,7 +35,30 @@ if __name__ == "__main__":
  main()
  ```

- `Ilo` is highly configurable by design, so I recommend exploring the `Preprocessors`, `Filters`, and `Scorers` modules. The `Cleaners` module only contains one cleaner, which I recommend using. The `Tokenizers` module contains several other word tokenizers, but their performance will be worse than the dedicated Toki Pona tokenizer `word_tokenize_tok`.
+ Or if you'd prefer to configure on your own:
+
+ ```py
+ from copy import deepcopy
+ from sonatoki.ilo import Ilo
+ from sonatoki.Configs import BaseConfig
+ from sonatoki.Filters import NimiPuAle, Phonotactic, ProperName
+ from sonatoki.Scorers import SoftPassFail
+
+ def main():
+ config = deepcopy(BaseConfig)
+ config["scoring_filters"].extend([NimiPuAle, Phonotactic, ProperName])
+ config["scorer"] = SoftPassFail
+
+ ilo = Ilo(**config)
+ ilo.is_toki_pona("mu mu!") # True
+ ilo.is_toki_pona("mi namako e moku mi") # True
+ ilo.is_toki_pona("ma wulin") # False
+
+ if __name__ == "__main__":
+ main()
+ ```
+
+ `Ilo` is highly configurable by necessity, so I recommend looking through the premade configs in `Configs` as well as the individual `Preprocessors`, `Filters`, and `Scorers`. The `Cleaners` module only contains one cleaner, which I recommend always using. Similarly, the `Tokenizers` module contains several other word tokenizers, but their performance will be worse than the dedicated Toki Pona tokenizer `WordTokenizerTok`.

  ## Development

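The premade configs referenced in the updated README (see `Configs.py` below) slot into the same quick-start pattern. A minimal sketch, assuming the 0.1.3 API shown in this diff; the example message and the expected result are not from the package and are hedged accordingly:

```py
from sonatoki.ilo import Ilo
from sonatoki.Configs import DiscordConfig, LazyConfig

# DiscordConfig is PrefConfig plus the Discord-specific preprocessors;
# LazyConfig scores only Alphabetic/ProperName tokens with SoftPassFail.
discord_ilo = Ilo(**DiscordConfig)
lazy_ilo = Ilo(**LazyConfig)

# Confirmed by the README for PrefConfig, which DiscordConfig extends:
print(discord_ilo.is_toki_pona("o pilin insa e ni: sina pilin e sewi"))  # True

# Hypothetical Discord-style message: the emote markup should be stripped
# by DiscordEmotes before scoring, so this should also pass.
print(discord_ilo.is_toki_pona("<:wave:123456789> toki a! sina pona"))
```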
{sonatoki-0.1.1 → sonatoki-0.1.3}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "sonatoki"
- version = "0.1.1"
+ version = "0.1.3"
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
  authors = [
  { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
sonatoki-0.1.3/src/sonatoki/Configs.py

@@ -0,0 +1,80 @@
+ # STL
+ from copy import deepcopy
+ from typing import List, Type, TypedDict
+
+ # PDM
+ from typing_extensions import NotRequired
+
+ # LOCAL
+ from sonatoki.Filters import (
+ Filter,
+ NimiPu,
+ Numerics,
+ Syllabic,
+ NimiLinku,
+ NimiPuAle,
+ Alphabetic,
+ ProperName,
+ Phonotactic,
+ NimiLinkuAle,
+ Punctuations,
+ )
+ from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
+ from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
+ from sonatoki.Tokenizers import Tokenizer, WordTokenizerTok
+ from sonatoki.Preprocessors import (
+ URLs,
+ Preprocessor,
+ DiscordEmotes,
+ DiscordSpecial,
+ DiscordChannels,
+ DiscordMentions,
+ )
+
+
+ class IloConfig(TypedDict):
+ preprocessors: List[Type[Preprocessor]]
+ word_tokenizer: Type[Tokenizer]
+ cleaners: List[Type[Cleaner]]
+ ignoring_filters: List[Type[Filter]]
+ scoring_filters: List[Type[Filter]]
+ scorer: Type[Scorer]
+ passing_score: Number
+
+
+ BaseConfig: IloConfig = {
+ "preprocessors": [URLs],
+ "cleaners": [ConsecutiveDuplicates],
+ "ignoring_filters": [Numerics, Punctuations],
+ "scoring_filters": [],
+ "scorer": PassFail,
+ "passing_score": 0.8,
+ "word_tokenizer": WordTokenizerTok,
+ }
+
+
+ PrefConfig: IloConfig = deepcopy(BaseConfig)
+ PrefConfig["scoring_filters"].extend([NimiLinku, Syllabic, ProperName, Alphabetic])
+ PrefConfig["scorer"] = SoftScaling
+
+
+ LazyConfig: IloConfig = deepcopy(BaseConfig)
+ LazyConfig["scoring_filters"].extend([Alphabetic, ProperName])
+ LazyConfig["scorer"] = SoftPassFail
+
+ DiscordConfig: IloConfig = deepcopy(PrefConfig)
+ DiscordConfig["preprocessors"].extend(
+ [DiscordEmotes, DiscordMentions, DiscordChannels, DiscordSpecial]
+ )
+ TelegramConfig: IloConfig = deepcopy(PrefConfig)
+ ForumConfig: IloConfig = deepcopy(PrefConfig)
+
+ __all__ = [
+ "IloConfig",
+ "BaseConfig",
+ "PrefConfig",
+ "LazyConfig",
+ "DiscordConfig",
+ "TelegramConfig",
+ "ForumConfig",
+ ]
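A sketch of how a custom `IloConfig` might be assembled from these pieces, following the same deepcopy-and-extend pattern the presets use. The `my_config` name and the 0.7 threshold are illustrative, not from the package:

```py
from copy import deepcopy

from sonatoki.ilo import Ilo
from sonatoki.Configs import IloConfig, PrefConfig
from sonatoki.Filters import NimiLinkuSandbox

# Start from the preferred preset; deepcopy avoids mutating the shared
# PrefConfig dict, whose lists are extended in place at import time.
my_config: IloConfig = deepcopy(PrefConfig)
my_config["scoring_filters"].append(NimiLinkuSandbox)  # also accept sandbox words
my_config["passing_score"] = 0.7  # hypothetical looser threshold (preset is 0.8)

ilo = Ilo(**my_config)
```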
{sonatoki-0.1.1 → sonatoki-0.1.3}/src/sonatoki/Filters.py

@@ -17,6 +17,7 @@ from sonatoki.constants import (
  NIMI_LINKU_SET,
  NIMI_PU_ALE_SET,
  NIMI_LINKU_ALE_SET,
+ NIMI_LINKU_SANDBOX_SET,
  )

  re.DEFAULT_VERSION = re.VERSION1
@@ -87,6 +88,10 @@ class NimiLinkuAle(SetFilter):
  tokens = NIMI_LINKU_ALE_SET


+ class NimiLinkuSandbox(SetFilter):
+ tokens = NIMI_LINKU_SANDBOX_SET
+
+
  class Phonotactic(RegexFilter):
  """Determines if a given token is phonotactically valid Toki Pona (or `n`).
  Excludes both consecutive nasals and the illegal syllables:
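Filters are classmethod predicates (`Filter.filter(token) -> bool`, as the scorers and `ilo.py` call them), so the new sandbox filter can also be used directly. A small sketch; the sandbox result depends on the bundled `sandbox.json` word list:

```py
from sonatoki.Filters import NimiLinku, NimiLinkuSandbox

print(NimiLinku.filter("toki"))         # True: "toki" is in the Linku word list
print(NimiLinku.filter("xyzzy"))        # False: not a Toki Pona word
print(NimiLinkuSandbox.filter("toki"))  # depends on sandbox.json contents
```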
{sonatoki-0.1.1 → sonatoki-0.1.3}/src/sonatoki/Preprocessors.py

@@ -13,7 +13,7 @@ There are currently two distinct types of Preprocessor:
  - ArrowQuote

  Order does not generally matter, but if there were two overlapping containers such as in the string "|| spoiler ` monospace || `", order would matter.
- As such, each Preprocessor exposes a .precedence attribute which is optionally usable for ordering them. Lower precedence means it should be applied first.
+ It is up to the user to order them appropriately.
  """

  # STL
@@ -27,8 +27,6 @@ re.DEFAULT_VERSION = re.VERSION1


  class Preprocessor(ABC):
- precedence: int = 0
-
  @classmethod # order matters
  @abstractmethod
  def process(cls, msg: str) -> str:
@@ -104,7 +102,6 @@ class DoubleQuotes(RegexPreprocessor):
  class Backticks(RegexPreprocessor):
  """Remove paired backticks and their contents `like this`"""

- precedence = -10
  pattern = re.compile(r"`[^`]+`", flags=re.S)


@@ -121,6 +118,9 @@ class ArrowQuote(RegexPreprocessor):


  __all__ = [
+ "DiscordChannels",
+ "DiscordMentions",
+ "DiscordSpecial",
  "DiscordEmotes",
  "SingleQuotes",
  "DoubleQuotes",
{sonatoki-0.1.1 → sonatoki-0.1.3}/src/sonatoki/Scorers.py

@@ -16,6 +16,13 @@ Number = Union[int, float]
  Weights = Dict[str, Number]


+ def sigmoid(n: int) -> Number:
+ return 1 / (1 + math.exp(-(0.30 * (n - 1))))
+ # n-1 makes sigmoid(1) == 0.5
+ # 0.30 softens scaling in favor of short input
+ # return n / (1+abs(n)) # too weak in 0.7+
+
+
  class Scorer(ABC):
  @classmethod
  @abstractmethod
@@ -27,7 +34,7 @@ class PassFail(Scorer):
  """The token passes any filter or fails all of them, scoring 1 or 0 respectively."""

  @classmethod
- def __score(cls, token: str, filters: List[Type[Filter]]) -> Number:
+ def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
  for f in filters:
  if f.filter(token):
  score = 1
@@ -47,10 +54,27 @@ class PassFail(Scorer):
  total_score = 0
  len_tokens = len(tokens)
  for token in tokens:
- total_score += cls.__score(token, filters)
+ total_score += cls.score_token(token, filters)
  return total_score / len_tokens if len_tokens else 0


+ class SoftPassFail(PassFail):
+ @classmethod
+ @override
+ def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+ if not tokens:
+ return 1
+
+ total_score = 0
+ len_tokens = len(tokens)
+ for token in tokens:
+ total_score += cls.score_token(token, filters)
+
+ percentage = total_score / len_tokens if len_tokens else 0
+ percentage **= sigmoid(len_tokens)
+ return percentage
+
+
  class Scaling(Scorer):
  """
  The sooner a token matches a filter, the higher its score.
@@ -91,13 +115,6 @@ class SoftScaling(Scaling):
  For example, a single token scoring 0.64 will now score 0.8.
  """

- @staticmethod
- def sigmoid(n: int) -> Number:
- return 1 / (1 + math.exp(-(0.30 * (n - 1))))
- # n-1 makes sigmoid(1) == 0.5
- # 0.30 softens scaling in favor of short input
- # return n / (1+abs(n)) # too weak in 0.7+
-
  @classmethod
  @override
  def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
@@ -113,11 +130,11 @@ class SoftScaling(Scaling):
  total_score += cls.score_token(token, filters, len_filters)

  percentage = total_score / max_score if max_score else 0
- percentage **= cls.sigmoid(len_tokens)
+ percentage **= sigmoid(len_tokens)
  return percentage


  class Logarithmic(Scorer): ...


- __all__ = ["PassFail", "Scaling", "SoftScaling"]
+ __all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]
sonatoki-0.1.3/src/sonatoki/Tokenizers.py

@@ -0,0 +1,76 @@
+ # STL
+ from abc import ABC, abstractmethod
+ from typing import List
+
+ # PDM
+ import regex as re
+ from typing_extensions import override
+
+ try:
+ # PDM
+ import nltk
+ from nltk.tokenize import sent_tokenize as __sent_tokenize_nltk
+ from nltk.tokenize import word_tokenize as __word_tokenize_nltk
+ except ImportError as e:
+ nltk = e
+
+
+ LANGUAGE = "english" # for NLTK
+
+
+ class Tokenizer(ABC):
+ @classmethod
+ @abstractmethod
+ def tokenize(cls, s: str) -> List[str]: ...
+
+
+ class NoOpTokenizer(Tokenizer):
+ """This is a special case that you do not want or need."""
+
+ @classmethod
+ @override
+ def tokenize(cls, s: str) -> List[str]:
+ return [s]
+
+
+ class RegexTokenizer(Tokenizer):
+ pattern: "re.Pattern[str]"
+
+ @classmethod
+ @override
+ def tokenize(cls, s: str) -> List[str]:
+ return [clean for word in re.split(cls.pattern, s) if (clean := word.strip())]
+
+
+ class WordTokenizerTok(RegexTokenizer):
+ pattern = re.compile(r"""([\p{Punctuation}\p{posix_punct}]+|\s+)""")
+ # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
+ # TODO: do the typography characters matter?
+ # NOTE: | / and , are *not* sentence delimiters for my purpose
+
+
+ class SentTokenizerTok(RegexTokenizer):
+ pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")
+
+
+ class WordTokenizerRe(RegexTokenizer):
+ pattern = re.compile(r"""(?<=[.?!;:'"-])""")
+
+
+ class SentTokenizerRe(RegexTokenizer):
+ pattern = re.compile(r"""(.*?[.?!;:])|(.+?$)""")
+
+
+ if not isinstance(nltk, ImportError):
+
+ class WordTokenizerNLTK(Tokenizer):
+ @classmethod
+ @override
+ def tokenize(cls, s: str) -> List[str]:
+ return __word_tokenize_nltk(text=s, language=LANGUAGE)
+
+ class SentTokenizerNLTK(Tokenizer):
+ @classmethod
+ @override
+ def tokenize(cls, s: str) -> List[str]:
+ return __sent_tokenize_nltk(text=s, language=LANGUAGE)
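The tokenizers are now classes with a `tokenize` classmethod rather than the bare functions of 0.1.1. A quick sketch of the regex-based word tokenizer; the shown output is what the split pattern should produce (punctuation runs kept as their own tokens), not a recorded result:

```py
from sonatoki.Tokenizers import WordTokenizerTok

tokens = WordTokenizerTok.tokenize("o pilin insa e ni: sina pilin e sewi!")
# Splits on punctuation runs and whitespace, keeping the punctuation, so
# this should yield something like:
# ['o', 'pilin', 'insa', 'e', 'ni', ':', 'sina', 'pilin', 'e', 'sewi', '!']
print(tokens)
```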
{sonatoki-0.1.1 → sonatoki-0.1.3}/src/sonatoki/constants.py

@@ -4,6 +4,7 @@ from typing import Dict, List
  from pathlib import Path

  LINKU = Path(__file__).resolve().parent / Path("linku.json")
+ SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")

  VOWELS = "aeiou"
  CONSONANTS = "jklmnpstw"
@@ -29,10 +30,16 @@ with open(LINKU) as f:
  ]
  NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]

+ with open(SANDBOX) as f:
+ r: Dict[str, Dict[str, str]] = json.loads(f.read())
+ NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in r.values()]
+
+
  NIMI_PU_SET = set(NIMI_PU)
  NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
  NIMI_LINKU_SET = set(NIMI_LINKU)
  NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
+ NIMI_LINKU_SANDBOX_SET = set(NIMI_LINKU_SANDBOX)
  ALLOWABLES_SET = set(ALLOWABLES)

  __all__ = [
@@ -54,4 +61,7 @@ __all__ = [
  #
  "NIMI_LINKU_ALE",
  "NIMI_LINKU_ALE_SET",
+ #
+ "NIMI_LINKU_SANDBOX",
+ "NIMI_LINKU_SANDBOX_SET",
  ]
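The word lists are plain module-level constants built from the bundled JSON files, so they can be inspected directly. A small sketch; the sandbox set's size depends on the bundled `sandbox.json`:

```py
from sonatoki.constants import NIMI_PU_SET, NIMI_LINKU_SET, NIMI_LINKU_SANDBOX_SET

print("toki" in NIMI_PU_SET)        # True: a core pu word
print("toki" in NIMI_LINKU_SET)     # True: also in the Linku list
print(len(NIMI_LINKU_SANDBOX_SET))  # size of the new sandbox word list
```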
{sonatoki-0.1.1 → sonatoki-0.1.3}/src/sonatoki/ilo.py

@@ -14,13 +14,13 @@ LOG = logging.getLogger(__name__)

  class Ilo:
  __preprocessors: List[Type[Preprocessor]]
+ __word_tokenizer: Type[Tokenizer]
  __cleaners: List[Type[Cleaner]]
  __ignoring_filters: List[Type[Filter]]
  __scoring_filters: List[Type[Filter]]
  __scorer: Type[Scorer]
- __tokenize: Tokenizer
  __passing_score: Number
- logging_threshold: Number = 1.0
+ logging_threshold: Number = -1

  def __init__(
  self,
@@ -29,61 +29,62 @@ class Ilo:
  ignoring_filters: List[Type[Filter]],
  scoring_filters: List[Type[Filter]],
  scorer: Type[Scorer],
- tokenizer: Tokenizer, # NOTE: no wrapper needed?
  passing_score: Number,
+ word_tokenizer: Type[Tokenizer],
  ):
  super().__init__()
  # avoid keeping a ref to user's list just in case
  self.__preprocessors = [*preprocessors]
+ self.__word_tokenizer = word_tokenizer
  self.__cleaners = [*cleaners]
  self.__ignoring_filters = [*ignoring_filters]
  self.__scoring_filters = [*scoring_filters]
  self.__scorer = scorer
- self.__tokenize = tokenizer
  self.__passing_score = passing_score

- def __preprocess(self, msg: str) -> str:
+ def preprocess(self, msg: str) -> str:
  for p in self.__preprocessors:
  msg = p.process(msg)
  return msg

- def __clean_token(self, token: str) -> str:
+ def word_tokenize(self, msg: str) -> List[str]:
+ """It is *highly* recommended that you run `ilo.preprocess` first."""
+ return self.__word_tokenizer.tokenize(msg)
+
+ def clean_token(self, token: str) -> str:
  for c in self.__cleaners:
  token = c.clean(token)
  return token

- def __clean_tokens(self, tokens: List[str]) -> List[str]:
- # NOTE: tested, making a new list with a for loop *is* faster than
- # - list comps
- # - generator comps
- # - in-place replacement/removal
- # - in place replacement with result of generator comp
+ def clean_tokens(self, tokens: List[str]) -> List[str]:
+ # NOTE: tested, making a new list with a for loop *is* faster than:
+ # list comp, generator comp, in-place replacement
  cleaned_tokens: List[str] = list()
  for token in tokens:
- cleaned_token = self.__clean_token(token)
+ cleaned_token = self.clean_token(token)
  if not cleaned_token:
  # TODO: warn user?
  continue
  cleaned_tokens.append(cleaned_token)
  return cleaned_tokens

- def __filter_token(self, token: str) -> bool:
+ def _filter_token(self, token: str) -> bool:
  for f in self.__ignoring_filters:
  if f.filter(token):
  return True
  return False

- def __filter_tokens(self, tokens: List[str]) -> List[str]:
+ def filter_tokens(self, tokens: List[str]) -> List[str]:
  filtered_tokens: List[str] = []
  for token in tokens:
- if self.__filter_token(token):
+ if self._filter_token(token):
  continue
  # the ignoring filter is true if the token matches
  # the user wants to ignore these so keep non-matching tokens
  filtered_tokens.append(token)
  return filtered_tokens

- def __score_tokens(self, tokens: List[str]) -> float:
+ def score_tokens(self, tokens: List[str]) -> float:
  return self.__scorer.score(tokens, self.__scoring_filters)

  def _is_toki_pona(
@@ -95,26 +96,25 @@ class Ilo:
  - Filtered message (list[str])
  - Cleaned message (list[str])
  - Score (float)
- - Result (bool)
- """
- preprocessed = self.__preprocess(message)
- tokenized = self.__tokenize(preprocessed)
- filtered = self.__filter_tokens(tokenized)
- cleaned = self.__clean_tokens(filtered)
- score = self.__score_tokens(cleaned)
+ - Result (bool)"""
+ preprocessed = self.preprocess(message)
+ tokenized = self.word_tokenize(preprocessed)
+ filtered = self.filter_tokens(tokenized)
+ cleaned = self.clean_tokens(filtered)
+ score = self.score_tokens(cleaned)
  result = score >= self.__passing_score

- # NOTE: this method may break if above funcs start sharing a list
  if score <= self.logging_threshold:
- LOG.debug("Msg: %.2f %s", score, repr(message))
- LOG.debug("Preproc: %s", repr(preprocessed))
- LOG.debug("Tokenized: %s", tokenized)
- LOG.debug("Filtered: %s", filtered)
- LOG.debug("Cleaned: %s", cleaned)
+ LOG.debug("msg: %.2f %s", score, repr(message))
+ LOG.debug("preproc: %s", repr(preprocessed))
+ LOG.debug("tokenized: %s", tokenized)
+ LOG.debug("filtered: %s", filtered)
+ LOG.debug("cleaned: %s", cleaned)
  # TODO: Move to each function? Loses ability to control when logging occurs by threshold

  return preprocessed, tokenized, filtered, cleaned, score, result

  def is_toki_pona(self, message: str) -> bool:
+ """Determines whether a single statement is or is not Toki Pona."""
  *_, result = self._is_toki_pona(message)
  return result
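Since the pipeline stages are now public methods, intermediate results can be inspected without going through `is_toki_pona`. A minimal sketch against the 0.1.3 `Ilo` shown above; the scores themselves depend on the chosen config:

```py
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)

# _is_toki_pona returns every intermediate stage:
# (preprocessed, tokenized, filtered, cleaned, score, result)
preprocessed, tokenized, filtered, cleaned, score, result = ilo._is_toki_pona(
    "o pilin insa e ni: sina pilin e sewi"
)
print(score, result)  # result should be True per the README

# Or call the now-public stages in the same order the pipeline uses:
msg = ilo.preprocess("sina pona :)")
tokens = ilo.word_tokenize(msg)  # recommended only after ilo.preprocess
print(ilo.score_tokens(ilo.clean_tokens(ilo.filter_tokens(tokens))))
```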