sonatoki 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {sonatoki-0.1.1 → sonatoki-0.1.2}/PKG-INFO +1 -1
  2. {sonatoki-0.1.1 → sonatoki-0.1.2}/pyproject.toml +1 -1
  3. {sonatoki-0.1.1 → sonatoki-0.1.2}/src/sonatoki/Preprocessors.py +3 -0
  4. {sonatoki-0.1.1 → sonatoki-0.1.2}/src/sonatoki/Scorers.py +28 -11
  5. {sonatoki-0.1.1 → sonatoki-0.1.2}/src/sonatoki/Tokenizers.py +10 -16
  6. sonatoki-0.1.2/tests/test_ilo.py +185 -0
  7. sonatoki-0.1.2/tests/tokenize_cases/tokenize_sentences_tok.yml +37 -0
  8. {sonatoki-0.1.1 → sonatoki-0.1.2}/tests/tokenize_cases/tokenize_words.yml +0 -4
  9. {sonatoki-0.1.1 → sonatoki-0.1.2}/tests/tokenize_cases/tokenize_words_tok.yml +12 -0
  10. sonatoki-0.1.1/tests/test_ilo.py +0 -73
  11. sonatoki-0.1.1/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -20
  12. {sonatoki-0.1.1 → sonatoki-0.1.2}/LICENSE +0 -0
  13. {sonatoki-0.1.1 → sonatoki-0.1.2}/README.md +0 -0
  14. {sonatoki-0.1.1 → sonatoki-0.1.2}/src/sonatoki/Cleaners.py +0 -0
  15. {sonatoki-0.1.1 → sonatoki-0.1.2}/src/sonatoki/Filters.py +0 -0
  16. {sonatoki-0.1.1 → sonatoki-0.1.2}/src/sonatoki/__init__.py +0 -0
  17. {sonatoki-0.1.1 → sonatoki-0.1.2}/src/sonatoki/__main__.py +0 -0
  18. {sonatoki-0.1.1 → sonatoki-0.1.2}/src/sonatoki/constants.py +0 -0
  19. {sonatoki-0.1.1 → sonatoki-0.1.2}/src/sonatoki/ilo.py +0 -0
  20. {sonatoki-0.1.1 → sonatoki-0.1.2}/src/sonatoki/linku.json +0 -0
  21. {sonatoki-0.1.1 → sonatoki-0.1.2}/tests/__init__.py +0 -0
  22. {sonatoki-0.1.1 → sonatoki-0.1.2}/tests/test_cleaners.py +0 -0
  23. {sonatoki-0.1.1 → sonatoki-0.1.2}/tests/test_filters.py +0 -0
  24. {sonatoki-0.1.1 → sonatoki-0.1.2}/tests/test_preprocessors.py +0 -0
  25. {sonatoki-0.1.1 → sonatoki-0.1.2}/tests/test_scorers.py +0 -0
  26. {sonatoki-0.1.1 → sonatoki-0.1.2}/tests/test_tokenize.py +0 -0
  27. {sonatoki-0.1.1 → sonatoki-0.1.2}/tests/test_utils.py +0 -0
  28. {sonatoki-0.1.1 → sonatoki-0.1.2}/tests/tokenize_cases/tokenize_sentences.yml +0 -0

{sonatoki-0.1.1 → sonatoki-0.1.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.1.1
+Version: 0.1.2
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later

{sonatoki-0.1.1 → sonatoki-0.1.2}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sonatoki"
-version = "0.1.1"
+version = "0.1.2"
 description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
 authors = [
     { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },

{sonatoki-0.1.1 → sonatoki-0.1.2}/src/sonatoki/Preprocessors.py
@@ -121,6 +121,9 @@ class ArrowQuote(RegexPreprocessor):


 __all__ = [
+    "DiscordChannels",
+    "DiscordMentions",
+    "DiscordSpecial",
     "DiscordEmotes",
     "SingleQuotes",
     "DoubleQuotes",

{sonatoki-0.1.1 → sonatoki-0.1.2}/src/sonatoki/Scorers.py
@@ -16,6 +16,13 @@ Number = Union[int, float]
 Weights = Dict[str, Number]


+def sigmoid(n: int) -> Number:
+    return 1 / (1 + math.exp(-(0.30 * (n - 1))))
+    # n-1 makes sigmoid(1) == 0.5
+    # 0.30 softens scaling in favor of short input
+    # return n / (1+abs(n)) # too weak in 0.7+
+
+
 class Scorer(ABC):
     @classmethod
     @abstractmethod
@@ -27,7 +34,7 @@ class PassFail(Scorer):
     """The token passes any filter or fails all of them, scoring 1 or 0 respectively."""

     @classmethod
-    def __score(cls, token: str, filters: List[Type[Filter]]) -> Number:
+    def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
         for f in filters:
             if f.filter(token):
                 score = 1
@@ -47,10 +54,27 @@ class PassFail(Scorer):
         total_score = 0
         len_tokens = len(tokens)
         for token in tokens:
-            total_score += cls.__score(token, filters)
+            total_score += cls.score_token(token, filters)
         return total_score / len_tokens if len_tokens else 0


+class SoftPassFail(PassFail):
+    @classmethod
+    @override
+    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+        if not tokens:
+            return 1
+
+        total_score = 0
+        len_tokens = len(tokens)
+        for token in tokens:
+            total_score += cls.score_token(token, filters)
+
+        percentage = total_score / len_tokens if len_tokens else 0
+        percentage **= sigmoid(len_tokens)
+        return percentage
+
+
 class Scaling(Scorer):
     """
     The sooner a token matches a filter, the higher its score.
@@ -91,13 +115,6 @@ class SoftScaling(Scaling):
     For example, a single token scoring 0.64 will now score 0.8.
     """

-    @staticmethod
-    def sigmoid(n: int) -> Number:
-        return 1 / (1 + math.exp(-(0.30 * (n - 1))))
-        # n-1 makes sigmoid(1) == 0.5
-        # 0.30 softens scaling in favor of short input
-        # return n / (1+abs(n)) # too weak in 0.7+
-
     @classmethod
     @override
     def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
@@ -113,11 +130,11 @@ class SoftScaling(Scaling):
             total_score += cls.score_token(token, filters, len_filters)

         percentage = total_score / max_score if max_score else 0
-        percentage **= cls.sigmoid(len_tokens)
+        percentage **= sigmoid(len_tokens)
         return percentage


 class Logarithmic(Scorer): ...


-__all__ = ["PassFail", "Scaling", "SoftScaling"]
+__all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]

{sonatoki-0.1.1 → sonatoki-0.1.2}/src/sonatoki/Tokenizers.py
@@ -4,6 +4,8 @@ from typing import List, Callable
 # PDM
 import regex as re

+# TODO: Entire module should be reworked to match the class scheme of the rest of the module, imo
+
 try:
     # PDM
     import nltk
@@ -15,18 +17,14 @@ except ImportError as e:

 LANGUAGE = "english"  # for NLTK

-SENT_DELIMS_RE = r"""(.*?[.?!;:])|(.+?$)"""
-SENT_DELIMS_RE = re.compile(SENT_DELIMS_RE)
-
-SENT_DELIMS_TOK = r"""(.*?[.?!;:-])|(.+?$)"""
-SENT_DELIMS_TOK = re.compile(SENT_DELIMS_TOK)
-
-
-WORD_DELIMS_RE = r"""\s+|(?=[.?!;:'"-])"""
-WORD_DELIMS_RE = re.compile(WORD_DELIMS_RE)
+SENT_DELIMS_RE = re.compile(r"""(.*?[.?!;:])|(.+?$)""")
+SENT_DELIMS_TOK = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")
+# TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
+# TODO: do the typography characters matter?
+# NOTE: | / and , are *not* sentence delimiters for my purpose

-WORD_DELIMS_TOK = r"([\p{Punctuation}\p{posix_punct}]+|\s+)"
-WORD_DELIMS_TOK = re.compile(WORD_DELIMS_TOK)
+WORD_DELIMS_RE = re.compile(r"""\s+|(?=[.?!;:'"-])""")
+WORD_DELIMS_TOK = re.compile(r"([\p{Punctuation}\p{posix_punct}]+|\s+)")

 Tokenizer = Callable[[str], List[str]]
@@ -53,11 +51,7 @@ def word_tokenize_re(s: str) -> List[str]:


 def sent_tokenize_tok(s: str) -> List[str]:
-    return [
-        clean
-        for sent in re.findall(SENT_DELIMS_TOK, s)
-        if (clean := sent[0].strip() or sent[1].strip())
-    ]
+    return [clean for sent in re.split(SENT_DELIMS_TOK, s) if (clean := sent.strip())]


 def word_tokenize_tok(s: str) -> List[str]:
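
The sentence tokenizer thus changes from collecting `findall` groups to splitting on a zero-width lookbehind, which keeps each delimiter attached to the sentence it ends. A minimal sketch of the new behavior, assuming the third-party `regex` module (stdlib `re` rejects the variable-length lookbehind):

import regex as re

SENT_DELIMS_TOK = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")

def sent_tokenize_tok(s: str) -> list:
    # split at the position after any sentence-ending character, then drop whitespace-only pieces
    return [clean for sent in re.split(SENT_DELIMS_TOK, s) if (clean := sent.strip())]

print(sent_tokenize_tok("mi mu. mi wawa."))
# ['mi mu.', 'mi wawa.']
print(sent_tokenize_tok("toki li tan kulupu Kuko li ni: 'o ike ala!'"))
# ['toki li tan kulupu Kuko li ni:', "'", 'o ike ala!', "'"]

The second output matches the "quotes" case in the new tokenize_sentences_tok.yml below, where stray quote marks become their own "sentences".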

sonatoki-0.1.2/tests/test_ilo.py (new)
@@ -0,0 +1,185 @@
+# PDM
+import pytest
+
+# LOCAL
+from sonatoki.ilo import Ilo
+from sonatoki.Filters import (
+    Numerics,
+    Syllabic,
+    NimiLinku,
+    Alphabetic,
+    ProperName,
+    Punctuations,
+)
+from sonatoki.Scorers import SoftScaling, SoftPassFail
+from sonatoki.Cleaners import ConsecutiveDuplicates
+from sonatoki.Tokenizers import word_tokenize_tok
+from sonatoki.Preprocessors import URLs
+
+
+@pytest.fixture
+def ilo():
+    ilo = Ilo(
+        preprocessors=[URLs],
+        ignoring_filters=[Numerics, Punctuations],
+        scoring_filters=[NimiLinku, Syllabic, ProperName, Alphabetic],
+        cleaners=[ConsecutiveDuplicates],
+        scorer=SoftScaling,
+        tokenizer=word_tokenize_tok,
+        passing_score=0.8,
+    )
+    # ilo.logging_threshold = 0.8
+    return ilo
+
+
+@pytest.fixture()
+def lazy_ilo():
+    ilo = Ilo(
+        preprocessors=[URLs],
+        ignoring_filters=[Numerics, Punctuations],
+        scoring_filters=[Alphabetic, ProperName],
+        cleaners=[ConsecutiveDuplicates],
+        scorer=SoftPassFail,
+        tokenizer=word_tokenize_tok,
+        passing_score=0.8,
+    )
+    # ilo.logging_threshold = 0.8
+    return ilo
+
+
+ALL_VALID = [
+    "mi unpa e mama sina",
+    "mama sina li lon seme? mi wile toki tawa ona",
+    "sina sike pakala",
+    " sina seme e mi ?",
+    "AAAAAAAAAAA",
+    "muuuu MUUU muUuUuU",
+    "wawa mute. " * 10,
+]
+
+IGNORABLES = [
+    "",
+    " ",
+    "2+2=5",
+    "kiwen moli 42",
+    "https://mun.la/sona",
+    "https://example.com/",
+    "mi wile e ni: <https://example.com> li pona",
+    "lipu https://example.com li kama pona",
+    "...",
+    " ⟨·⟩, a",
+    "·····",
+]
+
+SYLLABIC_MATCHES = [
+    "ni li tenpo penpo",
+    "sipisi",
+    "walawa malama walama malama mupi",
+    "mi sona ala e nimi sunopatikuna",
+    "kalama wuwojiti li pana e sona",
+    "jan Awaja en jan Alasali en jan Akesinu li pona",  # syllables match before names here
+]
+
+ALPHABETIC_MATCHES = [
+    "mi mtue o kama sona",
+    "mi mute o kma son",
+    "ni li tptpt",
+    "mi wile pana lon sptp",
+    "tmo tawa mi li pona mute la mi kepeken ona lon tenpo mute",
+    "mi pakla lon nimi pi mute lili, taso ale li pona tan ni: mi toki mute",
+]
+
+NAME_MATCHES = [
+    "musi Homestuck li ike tawa mi",
+    "ilo Google li sona ala e nimi Emoticon la mi wile utala e ona",
+    "toki Kanse li lon",
+    "toki Lojban li nasa e lawa mi",
+]
+
+SOME_INVALID = ["kulupu xerox li ike", "mi tawa ma ohio"]
+
+
+EXCESSIVE_SYLLABICS = [
+    "manama manama namana namana majani makala",
+]
+
+EXCESSIVE_ALPHABETICS = [
+    "21st",  # candidate for xfails?
+    "tok",
+    "mut",
+    "mtue",
+    "I wait, I sulk, as a tool I make stoops to ineptness.",
+    "aaa i non-saw usa's most multiple element-set. it's as asinine as in `e`-less speak",
+    "mi pakla ln tepo mtue ls mi kn ala tok poan aun seem",
+    "so, to atone like papa—an awesome anon (no-name) sin man—i ate an asinine lemon-limelike tomato jalapeno isotope. 'nonsense!' amen. note to Oman: take mine katana to imitate a ninja in pantomime. atomise one nuke? 'insane misuse!' same. likewise, Susan, awaken a pepino melon in a linen pipeline. (penile) emanate semen. joke: manipulate a tame toneme to elope online tonite",
+]
+
+EXCESSIVE_NAMES = [
+    "I Want To Evade The Filter",
+    "If You Do This The Bot Can't See You",
+    "This Is A Statement In Perfect Toki Pona, I Guarantee",
+]
+
+NON_MATCHES = [
+    "bong",
+    "super bruh moment 64",
+    "homestuck",
+    "homestuck Homestuck",
+]
+
+XFAILS = [
+    "lete li ike x.x",  # emoticon should not be a problem
+]
+
+
+@pytest.mark.parametrize(
+    "text",
+    ALL_VALID
+    + SYLLABIC_MATCHES
+    + ALPHABETIC_MATCHES
+    + NAME_MATCHES
+    + SOME_INVALID
+    + IGNORABLES,
+)
+def test_known_good(ilo: Ilo, lazy_ilo: Ilo, text: str):
+    assert ilo.is_toki_pona(text), text
+
+
+@pytest.mark.parametrize(
+    "text", EXCESSIVE_SYLLABICS + EXCESSIVE_ALPHABETICS + EXCESSIVE_NAMES + NON_MATCHES
+)
+def test_known_bad(ilo: Ilo, text: str):
+    assert not ilo.is_toki_pona(text), text
+
+
+@pytest.mark.parametrize(
+    "text",
+    ALL_VALID
+    + SYLLABIC_MATCHES
+    + ALPHABETIC_MATCHES
+    + NAME_MATCHES
+    + SOME_INVALID
+    + IGNORABLES,
+)
+def test_known_good_lazy(lazy_ilo: Ilo, text: str):
+    assert lazy_ilo.is_toki_pona(text), text
+    # assumption: lazy ilo should pass anything the more strict ilo does
+
+
+@pytest.mark.parametrize("text", NON_MATCHES)
+def test_known_bad_lazy(lazy_ilo: Ilo, text: str):
+    assert not lazy_ilo.is_toki_pona(text), text
+
+
+@pytest.mark.parametrize(
+    "text", EXCESSIVE_SYLLABICS + EXCESSIVE_ALPHABETICS + EXCESSIVE_NAMES
+)
+def test_weakness_of_lazy(lazy_ilo: Ilo, text: str):
+    # NOTE: This is demonstrative, not preferential
+    assert lazy_ilo.is_toki_pona(text), text
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("text", XFAILS)
+def test_known_xfails(ilo: Ilo, text: str):
+    assert ilo.is_toki_pona(text)
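
The two fixtures above double as construction examples. Outside pytest, the same setup reads as below (a sketch using the `ilo` fixture's arguments, with expectations taken from ALL_VALID and NON_MATCHES):

from sonatoki.ilo import Ilo
from sonatoki.Filters import (
    Numerics,
    Syllabic,
    NimiLinku,
    Alphabetic,
    ProperName,
    Punctuations,
)
from sonatoki.Scorers import SoftScaling
from sonatoki.Cleaners import ConsecutiveDuplicates
from sonatoki.Tokenizers import word_tokenize_tok
from sonatoki.Preprocessors import URLs

ilo = Ilo(
    preprocessors=[URLs],
    ignoring_filters=[Numerics, Punctuations],
    scoring_filters=[NimiLinku, Syllabic, ProperName, Alphabetic],
    cleaners=[ConsecutiveDuplicates],
    scorer=SoftScaling,
    tokenizer=word_tokenize_tok,
    passing_score=0.8,
)
assert ilo.is_toki_pona("mi unpa e mama sina")       # from ALL_VALID
assert not ilo.is_toki_pona("super bruh moment 64")  # from NON_MATCHES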

sonatoki-0.1.2/tests/tokenize_cases/tokenize_sentences_tok.yml (new)
@@ -0,0 +1,37 @@
+---
+- name: "basic1"
+  input: "mu. mu."
+  output:
+    - "mu."
+    - "mu."
+- name: "basic2"
+  input: "mu! mu!"
+  output:
+    - "mu!"
+    - "mu!"
+- name: "basic3"
+  input: "mu? mu?"
+  output:
+    - "mu?"
+    - "mu?"
+- name: "basic4"
+  input: "mi mu. mi wawa."
+  output:
+    - "mi mu."
+    - "mi wawa."
+- name: "dash"
+  input: "mi sona ala e ni- sina seme a"
+  output:
+    - "mi sona ala e ni-"
+    - "sina seme a"
+- name: "comma"
+  input: "mi mu tawa sina, mi wawa e sina."
+  output:
+    - "mi mu tawa sina, mi wawa e sina."
+- name: "quotes"
+  input: "toki li tan kulupu Kuko li ni: 'o ike ala!'"
+  output: # expected; we split on right of all sentence-ending puncts
+    - "toki li tan kulupu Kuko li ni:"
+    - "'"
+    - "o ike ala!"
+    - "'"

{sonatoki-0.1.1 → sonatoki-0.1.2}/tests/tokenize_cases/tokenize_words.yml
@@ -1,19 +1,15 @@
 ---
 - name: "basic"
   input: "mi mu mute tawa sina."
-  should_be_equal: true
 - name: "spoilers"
   input: "||ni li toki len.||"
-  should_be_equal: true
   xfail: true # lookbehind for . breaks it
 - name: "quotes"
   input: "toki li tan kulupu Kuko li ni: 'o ike ala!'"
-  should_be_equal: true
   xfail: true
 - name: periods every word
   input: "mi.unpa.e.mama.sina"
   xfail: true # lookbehind for . breaks it
 - name: "url"
   input: "https://mun.la/sona/"
-  should_be_equal: true
   xfail: true # i have no idea how to emulate the : behavior

{sonatoki-0.1.1 → sonatoki-0.1.2}/tests/tokenize_cases/tokenize_words_tok.yml
@@ -73,6 +73,18 @@
     - "are"
     - "boring"
     - "'"
+- name: periods every word
+  input: "mi.unpa.e.mama.sina"
+  output:
+    - "mi"
+    - "."
+    - "unpa"
+    - "."
+    - "e"
+    - "."
+    - "mama"
+    - "."
+    - "sina"
 - name: "discovered case 1"
   input: "***__U T A L A__ __M U N__***"
   output:

sonatoki-0.1.1/tests/test_ilo.py (deleted)
@@ -1,73 +0,0 @@
-# LOCAL
-from sonatoki.ilo import Ilo
-from sonatoki.Filters import (
-    Numerics,
-    Syllabic,
-    NimiLinku,
-    Alphabetic,
-    ProperName,
-    Punctuations,
-)
-from sonatoki.Scorers import Scaling, SoftScaling
-from sonatoki.Cleaners import ConsecutiveDuplicates
-from sonatoki.Tokenizers import word_tokenize_tok
-from sonatoki.Preprocessors import (
-    URLs,
-    DiscordEmotes,
-    DiscordSpecial,
-    DiscordChannels,
-    DiscordMentions,
-)
-
-
-def test_constructor():
-    ilo = Ilo(
-        preprocessors=[
-            URLs,
-            DiscordEmotes,
-            DiscordMentions,
-            DiscordChannels,
-            DiscordSpecial,
-        ],
-        ignoring_filters=[Numerics, Punctuations],
-        scoring_filters=[NimiLinku, Syllabic, ProperName, Alphabetic],
-        cleaners=[ConsecutiveDuplicates],
-        scorer=SoftScaling,
-        tokenizer=word_tokenize_tok,
-        passing_score=0.8,
-    )
-    # ilo._logging_threshold = 0.8
-    assert ilo.is_toki_pona("mi unpa e mama sina")
-    # toki pona
-    assert ilo.is_toki_pona("mama sina li lon seme? mi wile toki tawa ona")
-    assert ilo.is_toki_pona("sina sike pakala")
-    # names
-    assert ilo.is_toki_pona("musi Homestuck li ike tawa mi")
-    # typoes
-    assert ilo.is_toki_pona("mi mtue o kama sona")
-    assert ilo.is_toki_pona("mi mute o kma son")
-    # phonotactically valid
-    assert ilo.is_toki_pona("ni li tenpo penpo")
-    # alphabetically valid
-    assert ilo.is_toki_pona("ni li tptpt")
-    # a single
-    assert ilo.is_toki_pona("sipisi")
-
-    # soft scaling with syllablic filter at 2/4 will pass up to 5 syllablic words
-    assert ilo.is_toki_pona("walawa malama walama malama mupi")
-    # but fail 6 or more
-    assert not ilo.is_toki_pona("manama manama namana namana majani makala")
-
-    # TODO: should soft scaling save an alphabetically valid single word?
-    assert not ilo.is_toki_pona("tok")
-    assert not ilo.is_toki_pona("mtue")
-
-    # just english
-    assert not ilo.is_toki_pona("bong")
-    assert not ilo.is_toki_pona("super bruh moment 64")
-    # all names
-    assert not ilo.is_toki_pona("I Want To Evade The Filter")
-    # all alphabetic
-    assert not ilo.is_toki_pona(
-        "aaa i non-saw usa's most multiple element-set. it's as asinine as in `e`-less speak"
-    )

sonatoki-0.1.1/tests/tokenize_cases/tokenize_sentences_tok.yml (deleted)
@@ -1,20 +0,0 @@
----
-- name: "basic"
-  input: "mi mu. mi wawa."
-  output:
-    - "mi mu."
-    - "mi wawa."
-  should_be_equal: true
-- name: "dash"
-  input: "mi sona ala e ni- sina seme a"
-  output:
-    - "mi sona ala e ni-"
-    - "sina seme a"
-  should_be_equal: true
-- name: "quotes"
-  input: "toki li tan kulupu Kuko li ni: 'o ike ala!'"
-  output:
-    - "toki li tan kulupu Kuko li ni:"
-    - "'o ike ala!'"
-  should_be_equal: true
-  xfail: true