sonatoki 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Scorers.py CHANGED
@@ -1,5 +1,6 @@
1
1
  # STL
2
2
  import math
3
+ import logging
3
4
  from abc import ABC, abstractmethod
4
5
  from typing import Dict, List, Type, Union
5
6
 
@@ -9,24 +10,13 @@ from typing_extensions import override
9
10
  # LOCAL
10
11
  from sonatoki.Filters import Filter
11
12
 
13
+ LOG = logging.getLogger(__name__)
14
+
12
15
  Number = Union[int, float]
13
16
  Weights = Dict[str, Number]
14
17
 
15
18
 
16
19
  class Scorer(ABC):
17
- weights: Weights
18
-
19
- # @classmethod
20
- # def __score(cls, token: str, filters: List[Type[Filter]]) -> Tuple[int, Number]:
21
- # for filter in filters:
22
- # if not filter.filter(token):
23
- # continue
24
- # # NOTE: We assume the filters are ordered by their score
25
- # # Thus the first match is also the highest scoring
26
- # return filter.counts, cls.weights[filter.__name__]
27
- # # TODO: override weight if count is 0?
28
- # return 1, 0
29
-
30
20
  @classmethod
31
21
  @abstractmethod
32
22
  def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
@@ -40,7 +30,12 @@ class PassFail(Scorer):
40
30
  def __score(cls, token: str, filters: List[Type[Filter]]) -> Number:
41
31
  for f in filters:
42
32
  if f.filter(token):
43
- return 1
33
+ score = 1
34
+ LOG.debug(
35
+ "%12s.%s('%s') = %.2f", cls.__name__, f.__name__, token, score
36
+ )
37
+ return score
38
+ LOG.debug("%12s('%s') = 0.00", cls.__name__, token)
44
39
  return 0
45
40
 
46
41
  @classmethod
@@ -67,7 +62,12 @@ class Scaling(Scorer):
67
62
  def score_token(cls, token: str, filters: List[Type[Filter]], scale: int):
68
63
  for i, f in enumerate(filters):
69
64
  if f.filter(token):
70
- return scale - i
65
+ score = scale - i
66
+ LOG.debug(
67
+ "%12s.%s('%s') = %.2f", cls.__name__, f.__name__, token, score
68
+ )
69
+ return score
70
+ LOG.debug("%12s('%s') = 0.00", cls.__name__, token)
71
71
  return 0
72
72
 
73
73
  @classmethod
@@ -95,7 +95,7 @@ class SoftScaling(Scaling):
95
95
  def sigmoid(n: int) -> Number:
96
96
  return 1 / (1 + math.exp(-(0.30 * (n - 1))))
97
97
  # n-1 makes sigmoid(1) == 0.5
98
- # 0.30 softens scaling against input
98
+ # 0.30 softens scaling in favor of short input
99
99
  # return n / (1+abs(n)) # too weak in 0.7+
100
100
 
101
101
  @classmethod
sonatoki/ilo.py CHANGED
@@ -1,5 +1,6 @@
1
1
  # STL
2
- from typing import List, Type
2
+ import logging
3
+ from typing import List, Type, Tuple
3
4
 
4
5
  # LOCAL
5
6
  from sonatoki.Filters import Filter
@@ -8,6 +9,8 @@ from sonatoki.Cleaners import Cleaner
8
9
  from sonatoki.Tokenizers import Tokenizer
9
10
  from sonatoki.Preprocessors import Preprocessor
10
11
 
12
+ LOG = logging.getLogger(__name__)
13
+
11
14
 
12
15
  class Ilo:
13
16
  __preprocessors: List[Type[Preprocessor]]
@@ -17,7 +20,7 @@ class Ilo:
17
20
  __scorer: Type[Scorer]
18
21
  __tokenize: Tokenizer
19
22
  __passing_score: Number
20
- debug: bool = False
23
+ logging_threshold: Number = 1.0
21
24
 
22
25
  def __init__(
23
26
  self,
@@ -83,19 +86,35 @@ class Ilo:
83
86
  def __score_tokens(self, tokens: List[str]) -> float:
84
87
  return self.__scorer.score(tokens, self.__scoring_filters)
85
88
 
86
- def is_toki_pona(self, message: str) -> bool:
89
+ def _is_toki_pona(
90
+ self, message: str
91
+ ) -> Tuple[str, List[str], List[str], List[str], Number, bool]:
92
+ """Returns all components of the processing algorithm:
93
+ - Preprocessed message (str)
94
+ - Tokenized message (list[str])
95
+ - Filtered message (list[str])
96
+ - Cleaned message (list[str])
97
+ - Score (float)
98
+ - Result (bool)
99
+ """
87
100
  preprocessed = self.__preprocess(message)
88
101
  tokenized = self.__tokenize(preprocessed)
89
102
  filtered = self.__filter_tokens(tokenized)
90
103
  cleaned = self.__clean_tokens(filtered)
91
104
  score = self.__score_tokens(cleaned)
105
+ result = score >= self.__passing_score
92
106
 
93
- if self.debug:
94
- print("msg: %.2f %s" % (score, repr(message)))
95
- print("Preproc: %s" % repr(preprocessed))
96
- print("Tokenized: %s" % tokenized)
97
- print("Filtered: %s" % filtered)
98
- print("Cleaned: %s" % cleaned)
99
- print()
107
+ # NOTE: this method may break if above funcs start sharing a list
108
+ if score <= self.logging_threshold:
109
+ LOG.debug("Msg: %.2f %s", score, repr(message))
110
+ LOG.debug("Preproc: %s", repr(preprocessed))
111
+ LOG.debug("Tokenized: %s", tokenized)
112
+ LOG.debug("Filtered: %s", filtered)
113
+ LOG.debug("Cleaned: %s", cleaned)
114
+ # TODO: Move to each function? Loses ability to control when logging occurs by threshold
100
115
 
101
- return score >= self.__passing_score
116
+ return preprocessed, tokenized, filtered, cleaned, score, result
117
+
118
+ def is_toki_pona(self, message: str) -> bool:
119
+ *_, result = self._is_toki_pona(message)
120
+ return result
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sonatoki
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
5
5
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
6
6
  License: AGPL-3.0-or-later
@@ -44,7 +44,7 @@ from sonatoki.Filters import (
44
44
  ProperName,
45
45
  Punctuations,
46
46
  )
47
- from sonatoki.Scorers import Scaling
47
+ from sonatoki.Scorers import SoftScaling
48
48
  from sonatoki.Cleaners import ConsecutiveDuplicates
49
49
  from sonatoki.Tokenizers import word_tokenize_tok
50
50
  from sonatoki.Preprocessors import URLs, DiscordEmotes
@@ -55,22 +55,23 @@ def main():
55
55
  ignoring_filters=[Numerics, Punctuations],
56
56
  scoring_filters=[NimiLinku, Syllabic, ProperName, Alphabetic],
57
57
  cleaners=[ConsecutiveDuplicates],
58
- scorer=Scaling,
58
+ scorer=SoftScaling,
59
59
  tokenizer=word_tokenize_tok,
60
60
  )
61
61
  ilo.is_toki_pona("imagine how is touch the sky") # False
62
62
  ilo.is_toki_pona("o pilin insa e ni: sina pilin e sewi") # True
63
+ ilo.is_toki_pona("I Think I Can Evade Detection") # False
63
64
 
64
65
  if __name__ == "__main__":
65
66
  main()
66
67
  ```
67
68
 
68
- `Ilo` is highly configurable by design, so I recommend exploring the `Preprocessors`, `Filters`, and `Scorers` modules. The `Cleaners` module only contains one cleaner, which I highly recommend. The `Tokenizers` module contains several other word tokenizers, but their performance will be worse than the
69
+ `Ilo` is highly configurable by design, so I recommend exploring the `Preprocessors`, `Filters`, and `Scorers` modules. The `Cleaners` module only contains one cleaner, which I recommend using. The `Tokenizers` module contains several other word tokenizers, but their performance will be worse than the dedicated Toki Pona tokenizer `word_tokenize_tok`.
69
70
 
70
71
  ## Development
71
72
 
72
73
  1. Install [pdm](https://github.com/pdm-project/pdm)
73
- 1. `pdm sync --dev`
74
+ 1. `pdm install --dev`
74
75
  1. Open any file you like!
75
76
 
76
77
  ## FAQ
@@ -81,4 +82,26 @@ The intent is to show our methodology to the Unicode Consortium, particularly to
81
82
 
82
83
  After our proposal has been examined and a result given by the committee, I will translate this file and library into Toki Pona, with a note left behind for those who do not understand it.
83
84
 
84
- ### Why aren't any of the specific
85
+ ### What's the deal with the tokenizers?
86
+
87
+ The Toki Pona tokenizer `word_tokenize_tok` is very specific in always separating writing characters from punctuation, and leaving contiguous punctuation as contiguous — this is a level of precision that NLTK's English tokenizer does not want for several reasons, such as that English words can have "punctuation" characters in them.
88
+
89
+ Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet, so a more aggressive tokenizer is highly desirable.
90
+
91
+ The other tokenizers are provided as a comparison case more than anything. I do not recommend their use.
92
+
93
+ ### Aren't there a lot of false positives?
94
+
95
+ Yes. It's up to you to use this tool responsibly on input you've done your best to clean and, better yet, to apply stronger filters before weaker ones. For now though, here's a list of relevant false positives:
96
+
97
+ - `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially inflating the scores.
98
+ - `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet.
99
+
100
+ ### Don't some of the cleaners/filters conflict?
101
+
102
+ Yes. Some do, such as the following:
103
+
104
+ - `ConsecutiveDuplicates` may errantly change a word's validity. For example, "manna" is phonotactically invalid in Toki Pona, but would become "mana" which is valid.
105
+ - `ConsecutiveDuplicates` will not work correctly with syllabaries (writing systems where each character represents a consonant-vowel pair rather than a single sound).
106
+
107
+ You'll notice a _lot_ of these are troubles regarding the application of Latin-alphabet filters to non-Latin text. Working on it!
@@ -1,14 +1,14 @@
1
- sonatoki-0.1.0.dist-info/METADATA,sha256=EQaB5tsicEQ4wYn5curehbhkzGF0qHqC1bnUbOVDCu0,3332
2
- sonatoki-0.1.0.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
3
- sonatoki-0.1.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
1
+ sonatoki-0.1.1.dist-info/METADATA,sha256=gUp7IAG1ZmBarZnwyxHt6RmP8GX0tahsaG1qkp047GM,5020
2
+ sonatoki-0.1.1.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
3
+ sonatoki-0.1.1.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
4
4
  sonatoki/Cleaners.py,sha256=gTZ9dSsnvKVUtxM_ECSZ-_2heh--nD5A9dCQR1ATb1c,1160
5
5
  sonatoki/Filters.py,sha256=yzhYF79GX03cOwlR_-B8SPMQPZv4UpAPytH0fQwBE70,4093
6
6
  sonatoki/Preprocessors.py,sha256=G2up2jKKSrHQtTQWYNWH_fkjgroL45ZeajVn1KUECt8,3431
7
- sonatoki/Scorers.py,sha256=X1vo-eIPbtl0IC5suIX6hu-4VG7NSzR90rkrLpep8WY,3690
7
+ sonatoki/Scorers.py,sha256=twuFGqcIg6UTaeBVb1SmKenldC3hj1s97m6zyM_HXjg,3678
8
8
  sonatoki/Tokenizers.py,sha256=epOG3jZHI3MSO_L_6Z3zsSkexDEMLVzA2ARg6EnPMO0,1628
9
9
  sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  sonatoki/__main__.py,sha256=6xc-wIrrFo9wTyn4zRQNAmqwmJBtVvCMwV-CrM-hueA,82
11
11
  sonatoki/constants.py,sha256=h5rbCfu9YF76BsjQYud5d2wq1HODY05zOaw0Ir1cwjo,1320
12
- sonatoki/ilo.py,sha256=Uu0zipAF-L-5Wxw_EBB7-EMc40PM4WBa59Atq0zmYYE,3482
12
+ sonatoki/ilo.py,sha256=h3TYoqrjHxMGhB8ZJLVijVzy1AVCeWJk5x0q-bs4JMc,4278
13
13
  sonatoki/linku.json,sha256=MdFuFRIHniPDUVxKEKuUg1KyzPVgcCj4ZeyvburCwD0,270928
14
- sonatoki-0.1.0.dist-info/RECORD,,
14
+ sonatoki-0.1.1.dist-info/RECORD,,