sonatoki 0.7.0__tar.gz → 0.8.0__tar.gz

This diff reflects the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (32)
  1. {sonatoki-0.7.0 → sonatoki-0.8.0}/PKG-INFO +1 -1
  2. {sonatoki-0.7.0 → sonatoki-0.8.0}/pyproject.toml +1 -1
  3. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/Configs.py +2 -2
  4. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/Filters.py +25 -2
  5. {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_filters.py +3 -3
  6. {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_ilo.py +4 -1
  7. {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_scorers.py +2 -2
  8. {sonatoki-0.7.0 → sonatoki-0.8.0}/LICENSE +0 -0
  9. {sonatoki-0.7.0 → sonatoki-0.8.0}/README.md +0 -0
  10. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/Cleaners.py +0 -0
  11. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/Preprocessors.py +0 -0
  12. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/Scorers.py +0 -0
  13. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/Tokenizers.py +0 -0
  14. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/__init__.py +0 -0
  15. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/__main__.py +0 -0
  16. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/alphabetic.txt +0 -0
  17. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/constants.py +0 -0
  18. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/ilo.py +0 -0
  19. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/linku.json +0 -0
  20. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/py.typed +0 -0
  21. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/sandbox.json +0 -0
  22. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/syllabic.txt +0 -0
  23. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/types.py +0 -0
  24. {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/utils.py +0 -0
  25. {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/__init__.py +0 -0
  26. {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_cleaners.py +0 -0
  27. {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_preprocessors.py +0 -0
  28. {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_properties.py +0 -0
  29. {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_tokenize.py +0 -0
  30. {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_utils.py +0 -0
  31. {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
  32. {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0

{sonatoki-0.7.0 → sonatoki-0.8.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sonatoki
- Version: 0.7.0
+ Version: 0.8.0
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
  License: AGPL-3.0-or-later

{sonatoki-0.7.0 → sonatoki-0.8.0}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "sonatoki"
- version = "0.7.0"
+ version = "0.8.0"
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
  authors = [
      { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },

{sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/Configs.py
@@ -11,12 +11,12 @@ from sonatoki.Filters import (
      And,
      Not,
      Filter,
+     PuName,
      Numeric,
      NimiUCSUR,
      Alphabetic,
      NimiKuLili,
      NimiKuSuli,
-     ProperName,
      Punctuation,
      LongSyllabic,
      Miscellaneous,
@@ -131,7 +131,7 @@ LazyConfig: IloConfig = {
      "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
      "cleaners": [ConsecutiveDuplicates],
      "ignoring_filters": [Numeric, Punctuation],
-     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
+     "scoring_filters": [Alphabetic, NimiUCSUR, PuName, Miscellaneous],
      "scorer": SoftPassFail,
      "passing_score": 0.8,
      "word_tokenizer": WordTokenizerRe, # mimics old tokenizer

{sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/Filters.py
@@ -141,8 +141,27 @@ class FalsePosAlphabetic(MemberFilter):


  class ProperName(Filter):
-     """Determines if a given token is a valid name (also called a loan word).
-     When Toki Pona is written with the Latin alphabet, names are generally
+     """Determine if a given token is a valid name based on a reasonable weakening of
+     the rules given in Toki Pona: The Language of Good. A token matches if it has a capital
+     letter at its start and is **not** fully capitalized.
+
+     This corrects an issue with PuName, where scripts lacking a case distinction are
+     errantly counted"""
+
+     @classmethod
+     @override
+     @cache(maxsize=None)
+     def filter(cls, token: str) -> bool:
+         first_capitalized = token[0].isupper()
+         all_caps = token.isupper()
+
+         return first_capitalized and not all_caps
+
+
+ class PuName(Filter):
+     """Determine if a given token is a valid name (also called a loan word) based on
+     the rules given in Toki Pona: The Language of Good.
+     When Toki Pona is written with the Latin alphabet, names are
      capitalized at their start. This filter identifies those tokens.

      Note that this alone cannot determine if a token is a valid name,
@@ -156,6 +175,9 @@ class ProperName(Filter):
      @override
      @cache(maxsize=None)
      def filter(cls, token: str) -> bool:
+         # first_capitalized = token[0].isupper()
+         # rest_capitalized = token[1:] == token[1:].upper()
+         # return first_capitalized and not rest_capitalized
          return token == token.capitalize()
          # TODO: If the token is in a script which doesn't have a case distinction,
          # this will errantly match.
@@ -445,6 +467,7 @@ __all__ = [
      "Or",
      "Phonotactic",
      "ProperName",
+     "PuName",
      "Punctuation",
      "Syllabic",
  ]
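
Taken together, the Filters.py hunks above leave the package with two name filters: ProperName now requires a leading capital and rejects fully capitalized tokens, while the new PuName keeps the stricter token == token.capitalize() rule from the book. A standalone sketch of the two predicates, written as plain functions rather than the cached Filter classmethods, with illustrative tokens:

    # ProperName in 0.8.0: leading capital, but not fully capitalized.
    def proper_name(token: str) -> bool:
        return token[0].isupper() and not token.isupper()

    # PuName: the rule ProperName used before 0.8.0.
    def pu_name(token: str) -> bool:
        return token == token.capitalize()

    assert proper_name("Kekan") and pu_name("Kekan")        # ordinary name: both match
    assert proper_name("SQLite") and not pu_name("SQLite")  # mixed case: only the weakened rule matches
    assert not proper_name("SQL") and not pu_name("SQL")    # fully capitalized: neither matches
    assert not proper_name("21st") and pu_name("21st")      # caseless first character: only PuName matches

The last case is why "21st" moves from EXCESSIVE_ALPHABETICS to EXCESSIVE_ENGLISH in tests/test_ilo.py below, with a note that the ProperName change fixed the earlier false positive.
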

{sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_filters.py
@@ -11,12 +11,12 @@ from sonatoki.Filters import (
      And,
      Not,
      NimiPu,
+     PuName,
      Numeric,
      Syllabic,
      Alphabetic,
      NimiKuLili,
      NimiKuSuli,
-     ProperName,
      Phonotactic,
      Punctuation,
      AlphabeticRe,
@@ -144,7 +144,7 @@ def test_AlphabeticRe(s: str):

  @given(st.from_regex(PROPER_NAME_RE, fullmatch=True))
  def test_ProperName(s: str):
-     res = ProperName.filter(s)
+     res = PuName.filter(s)
      assert res, repr(s)

@@ -247,7 +247,7 @@ def test_OrFilter_IsipinEpiku(s: str):
  @given(st.sampled_from(list(words_by_tag("book", "pu"))))
  def test_AndFilter(s: str):
      s = s.capitalize()
-     f = And(ProperName, NimiPu)
+     f = And(PuName, NimiPu)
      assert f.filter(s)


{sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_ilo.py
@@ -94,6 +94,9 @@ NAME_MATCHES = [
      "toki Kanse li lon",
      "toki Lojban li nasa e lawa mi",
      "ilo Firefox",
+     "ilo FaceBook li nasa",
+     "mi kepeken ilo MySQL",
+     "poki li nasin SQLite",
      "mi musi Space Station 13",
      "jan Tepo en jan Salo en jan Lakuse en pipi Kewapi en soweli Eweke en mi li musi",
  ]
@@ -135,7 +138,6 @@ EXCESSIVE_SYLLABICS = [
  ]

  EXCESSIVE_ALPHABETICS = [
-     "21st", # candidate for xfails?
      "wen i tok usin onli notes in toki pona i look silli. ",
      "I wait, I sulk, as a tool I make stoops to ineptness.",
      "aaa i non-saw usa's most multiple element-set. it's as asinine as in `e`-less speak",
@@ -161,6 +163,7 @@ EXCESSIVE_ENGLISH = [
      "i'm online all the time",
      "How to Cut a Kiwi",
      "a e i o u",
+     "21st", # previous false positive; fixed by ProperName change
  ]

  NON_MATCHES = [

{sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_scorers.py
@@ -10,10 +10,10 @@ from hypothesis import given, example
  from sonatoki.Filters import (
      Filter,
      NimiPu,
+     PuName,
      Numeric,
      Syllabic,
      Alphabetic,
-     ProperName,
      Phonotactic,
      NimiLinkuCore,
      PunctuationRe,
@@ -31,7 +31,7 @@ FILTERS = [
      NimiLinkuCore,
      NimiLinkuCommon,
      Alphabetic,
-     ProperName,
+     PuName,
      Phonotactic,
      PunctuationRe,
  ]