sonatoki 0.8.0__tar.gz → 0.8.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {sonatoki-0.8.0 → sonatoki-0.8.2}/PKG-INFO +1 -1
  2. {sonatoki-0.8.0 → sonatoki-0.8.2}/pyproject.toml +1 -1
  3. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/Preprocessors.py +4 -1
  4. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/constants.py +3 -2
  5. {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_cleaners.py +2 -2
  6. {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_filters.py +14 -14
  7. {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_preprocessors.py +30 -23
  8. {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_utils.py +3 -3
  9. {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/tokenize_cases/tokenize_sentences_tok.yml +2 -1
  10. {sonatoki-0.8.0 → sonatoki-0.8.2}/LICENSE +0 -0
  11. {sonatoki-0.8.0 → sonatoki-0.8.2}/README.md +0 -0
  12. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/Cleaners.py +0 -0
  13. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/Configs.py +0 -0
  14. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/Filters.py +0 -0
  15. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/Scorers.py +0 -0
  16. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/Tokenizers.py +0 -0
  17. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/__init__.py +0 -0
  18. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/__main__.py +0 -0
  19. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/alphabetic.txt +0 -0
  20. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/ilo.py +0 -0
  21. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/linku.json +0 -0
  22. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/py.typed +0 -0
  23. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/sandbox.json +0 -0
  24. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/syllabic.txt +0 -0
  25. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/types.py +0 -0
  26. {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/utils.py +0 -0
  27. {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/__init__.py +0 -0
  28. {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_ilo.py +0 -0
  29. {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_properties.py +0 -0
  30. {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_scorers.py +0 -0
  31. {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_tokenize.py +0 -0
  32. {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sonatoki
3
- Version: 0.8.0
3
+ Version: 0.8.2
4
4
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
5
5
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
6
6
  License: AGPL-3.0-or-later
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "sonatoki"
3
- version = "0.8.0"
3
+ version = "0.8.2"
4
4
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
5
5
  authors = [
6
6
  { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
@@ -149,7 +149,10 @@ class Codeblock(RegexPreprocessor):
149
149
  Subset of what would be removed by Backticks, but may be preferable.
150
150
  """
151
151
 
152
- pattern = re.compile(r"```\n(?:(?!```).*?)?```", flags=re.DOTALL)
152
+ pattern = re.compile(
153
+ r"```.+?```",
154
+ flags=re.DOTALL,
155
+ )
153
156
 
154
157
 
155
158
  class Spoilers(RegexPreprocessor):
@@ -503,8 +503,9 @@ ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
503
503
  ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
504
504
  # combined bc the result could be simpler
505
505
 
506
- SENTENCE_PUNCT = """.?!:;()[-]·•…"""
507
- # NOTE: quotes were previously included, but in TP they are *not* reliably sentence boundaries
506
+ SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"""
507
+ # single quotes are word boundaries if not intra-word, but double quotes are sentence
508
+ # boundaries
508
509
 
509
510
  INTRA_WORD_PUNCT = """-'’"""
510
511
 
@@ -12,7 +12,7 @@ from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates, ConsecutiveDupli
12
12
  from .test_utils import PROPER_NAME_RE
13
13
 
14
14
 
15
- @given(st.from_regex(ConsecutiveDuplicatesRe.pattern.pattern))
15
+ @given(st.from_regex(ConsecutiveDuplicatesRe.pattern))
16
16
  @example("tooooki a")
17
17
  @example("muuuuuu")
18
18
  @example("nnn")
@@ -25,7 +25,7 @@ def test_ConsecutiveDuplicatesRe(s: str):
25
25
  assert a.lower() != b.lower(), (s, res)
26
26
 
27
27
 
28
- @given(st.from_regex(ConsecutiveDuplicatesRe.pattern.pattern))
28
+ @given(st.from_regex(ConsecutiveDuplicatesRe.pattern))
29
29
  @example("Aaa")
30
30
  @example("aAa")
31
31
  @example("aaA")
@@ -90,7 +90,7 @@ def test_NimiLinkuSandbox(s: str):
90
90
  assert res, repr(s)
91
91
 
92
92
 
93
- @given(st.from_regex(Phonotactic.pattern.pattern, fullmatch=True))
93
+ @given(st.from_regex(Phonotactic.pattern, fullmatch=True))
94
94
  @example("kijetesantakalu")
95
95
  @example("n")
96
96
  def test_Phonotactic(s: str):
@@ -98,28 +98,28 @@ def test_Phonotactic(s: str):
98
98
  assert res, repr(s)
99
99
 
100
100
 
101
- @given(st.from_regex(Phonotactic.pattern.pattern, fullmatch=True))
101
+ @given(st.from_regex(Phonotactic.pattern, fullmatch=True))
102
102
  def test_LongPhonotactic(s: str):
103
103
  len_ok = len(s) >= LongPhonotactic.length
104
104
  res = LongPhonotactic.filter(s)
105
105
  assert res == len_ok, repr(s) # will match given fullmatch
106
106
 
107
107
 
108
- @given(st.from_regex(Syllabic.pattern.pattern, fullmatch=True))
108
+ @given(st.from_regex(Syllabic.pattern, fullmatch=True))
109
109
  @example("wuwojitiwunwonjintinmanna")
110
110
  def test_Syllabic(s: str):
111
111
  res = Syllabic.filter(s)
112
112
  assert res, repr(s)
113
113
 
114
114
 
115
- @given(st.from_regex(Syllabic.pattern.pattern, fullmatch=True))
115
+ @given(st.from_regex(Syllabic.pattern, fullmatch=True))
116
116
  def test_LongSyllabic(s: str):
117
117
  len_ok = len(s) >= LongSyllabic.length
118
118
  res = LongSyllabic.filter(s)
119
119
  assert res == len_ok
120
120
 
121
121
 
122
- @given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
122
+ @given(st.from_regex(AlphabeticRe.pattern, fullmatch=True))
123
123
  @example("muems")
124
124
  @example("mpptp")
125
125
  @example("tptpt")
@@ -129,14 +129,14 @@ def test_Alphabetic(s: str):
129
129
  assert res_fn == res_re, repr(s)
130
130
 
131
131
 
132
- @given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
132
+ @given(st.from_regex(AlphabeticRe.pattern, fullmatch=True))
133
133
  def test_LongAlphabetic(s: str):
134
134
  len_ok = len(s) >= LongAlphabetic.length
135
135
  res = LongAlphabetic.filter(s)
136
136
  assert res == len_ok
137
137
 
138
138
 
139
- @given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
139
+ @given(st.from_regex(AlphabeticRe.pattern, fullmatch=True))
140
140
  def test_AlphabeticRe(s: str):
141
141
  res_re = AlphabeticRe.filter(s)
142
142
  assert res_re, repr(s)
@@ -148,7 +148,7 @@ def test_ProperName(s: str):
148
148
  assert res, repr(s)
149
149
 
150
150
 
151
- @given(st.from_regex(PunctuationRe.pattern.pattern, fullmatch=True))
151
+ @given(st.from_regex(PunctuationRe.pattern, fullmatch=True))
152
152
  @example("[]")
153
153
  @example(r"\\")
154
154
  @example(r"\"")
@@ -161,14 +161,14 @@ def test_PunctuationRe1(s: str):
161
161
  assert res, repr(s)
162
162
 
163
163
 
164
- @given(st.from_regex(PunctuationRe.pattern.pattern, fullmatch=True))
164
+ @given(st.from_regex(PunctuationRe.pattern, fullmatch=True))
165
165
  def test_PunctuationRe(s: str):
166
166
  res_re = PunctuationRe.filter(s)
167
167
  res_re1 = PunctuationRe1.filter(s)
168
168
  assert res_re == res_re1, repr(s)
169
169
 
170
170
 
171
- @given(st.from_regex(PunctuationRe.pattern.pattern, fullmatch=True))
171
+ @given(st.from_regex(PunctuationRe.pattern, fullmatch=True))
172
172
  @example("\U000f1990") # UCSUR char
173
173
  def test_Punctuation(s: str):
174
174
  res_fn = Punctuation.filter(s)
@@ -185,7 +185,7 @@ def test_Numeric(s: str):
185
185
 
186
186
 
187
187
  @given(
188
- st.from_regex(PunctuationRe.pattern.pattern, fullmatch=True)
188
+ st.from_regex(PunctuationRe.pattern, fullmatch=True)
189
189
  | st.from_regex(r"\d+", fullmatch=True),
190
190
  )
191
191
  def test_OrFilter(s: str):
@@ -259,8 +259,8 @@ def test_NotFilter(s: str):
259
259
 
260
260
  @given(
261
261
  st.sampled_from(list(FALSE_POS_SYLLABIC))
262
- | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
263
- | st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True)
262
+ | st.from_regex(Syllabic.pattern, fullmatch=True)
263
+ | st.from_regex(AlphabeticRe.pattern, fullmatch=True)
264
264
  )
265
265
  def test_AndNotFilter(s: str):
266
266
  AndNotFilter = And(Syllabic, Not(FalsePosSyllabic))
@@ -309,7 +309,7 @@ def test_AddTokensToMemberFilterNegative(s: str):
309
309
  | words_by_tag("usage_category", "sandbox")
310
310
  ),
311
311
  )
312
- | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
312
+ | st.from_regex(Syllabic.pattern, fullmatch=True)
313
313
  )
314
314
  def test_SubTokensFromMemberFilter(s: str):
315
315
  NimiAlaFilter = NimiLinkuCore(sub=NimiPu.tokens)
@@ -22,7 +22,7 @@ from sonatoki.Preprocessors import (
22
22
  )
23
23
 
24
24
 
25
- @given(st.from_regex(URLs.pattern.pattern, fullmatch=True))
25
+ @given(st.from_regex(URLs.pattern, fullmatch=True))
26
26
  @example("https://google.com")
27
27
  @example("https://mun.la")
28
28
  @example("https://discord.gg/")
@@ -32,7 +32,7 @@ def test_URLs(s: str):
32
32
  assert URLs.process(s).strip() == ""
33
33
 
34
34
 
35
- @given(st.from_regex(Spoilers.pattern.pattern, fullmatch=True))
35
+ @given(st.from_regex(Spoilers.pattern, fullmatch=True))
36
36
  @example("|| | ||")
37
37
  @example("|| content\n\n\ncontent ||")
38
38
  @example("||\n||")
@@ -42,14 +42,15 @@ def test_Spoilers(s: str):
42
42
  assert res == "", (repr(s), repr(res))
43
43
 
44
44
 
45
- @given(st.from_regex(Backticks.pattern.pattern, fullmatch=True))
45
+ @given(st.from_regex(Backticks.pattern, fullmatch=True))
46
46
  @example("` ` ` `")
47
47
  def test_Backticks(s: str):
48
48
  res = Backticks.process(s).strip()
49
49
  assert res == "", (repr(s), repr(res))
50
50
 
51
51
 
52
- @given(st.from_regex(Codeblock.pattern.pattern, fullmatch=True))
52
+ @given(st.from_regex(r"```(?:(?!`).+?)```", fullmatch=True))
53
+ @example("""```0```""")
53
54
  @example(
54
55
  """```
55
56
  ```"""
@@ -63,12 +64,18 @@ blocky message
63
64
  second blocky message
64
65
  ```"""
65
66
  )
67
+ @example(
68
+ """```oisandm123-_mu
69
+ arbitrary content
70
+ ```"""
71
+ )
72
+ @example("""```mu```""")
66
73
  def test_Codeblock(s: str):
67
74
  res = Codeblock.process(s).strip()
68
75
  assert res == "", (repr(s), repr(res))
69
76
 
70
77
 
71
- @given(st.from_regex(ArrowQuote.pattern.pattern, fullmatch=True))
78
+ @given(st.from_regex(ArrowQuote.pattern, fullmatch=True))
72
79
  @example("> base")
73
80
  @example("> newline\n> newline")
74
81
  def test_ArrowQuote(s: str):
@@ -76,7 +83,7 @@ def test_ArrowQuote(s: str):
76
83
  assert res == "", (repr(s), repr(res))
77
84
 
78
85
 
79
- @given(st.from_regex(DoubleQuotes.pattern.pattern, fullmatch=True))
86
+ @given(st.from_regex(DoubleQuotes.pattern, fullmatch=True))
80
87
  @example('" "" "')
81
88
  @example('" "\n" "')
82
89
  @example('" \n "')
@@ -85,7 +92,7 @@ def test_DoubleQuotes(s: str):
85
92
  assert res == "", (repr(s), repr(res))
86
93
 
87
94
 
88
- @given(st.from_regex(SingleQuotes.pattern.pattern, fullmatch=True))
95
+ @given(st.from_regex(SingleQuotes.pattern, fullmatch=True))
89
96
  @example("' '' '")
90
97
  @example("' '\n' '")
91
98
  @example("' \n '")
@@ -94,7 +101,7 @@ def test_SingleQuotes(s: str):
94
101
  assert res == "", (repr(s), repr(res))
95
102
 
96
103
 
97
- @given(st.from_regex(DiscordEmotes.pattern.pattern, fullmatch=True))
104
+ @given(st.from_regex(DiscordEmotes.pattern, fullmatch=True))
98
105
  @example("<a:example:123123>")
99
106
  @example("<:example:123123>")
100
107
  def test_DiscordEmotes(s: str):
@@ -102,7 +109,7 @@ def test_DiscordEmotes(s: str):
102
109
  assert res == "", (repr(s), repr(res))
103
110
 
104
111
 
105
- @given(st.from_regex(DiscordMentions.pattern.pattern, fullmatch=True))
112
+ @given(st.from_regex(DiscordMentions.pattern, fullmatch=True))
106
113
  @example("<@497549183847497739>")
107
114
  @example("<@!457890000>")
108
115
  @example("<@&18398198981985>")
@@ -111,7 +118,7 @@ def test_DiscordMentions(s: str):
111
118
  assert res == "", (repr(s), repr(res))
112
119
 
113
120
 
114
- @given(st.from_regex(DiscordChannels.pattern.pattern, fullmatch=True))
121
+ @given(st.from_regex(DiscordChannels.pattern, fullmatch=True))
115
122
  @example("<#19858915>")
116
123
  @example("<#18591912589812985>")
117
124
  def test_DiscordChannels(s: str):
@@ -119,7 +126,7 @@ def test_DiscordChannels(s: str):
119
126
  assert res == "", (repr(s), repr(res))
120
127
 
121
128
 
122
- @given(st.from_regex(DiscordSpecial.pattern.pattern, fullmatch=True))
129
+ @given(st.from_regex(DiscordSpecial.pattern, fullmatch=True))
123
130
  @example("<id:guide>")
124
131
  @example("<id:browse>")
125
132
  def test_DiscordSpecial(s: str):
@@ -128,11 +135,11 @@ def test_DiscordSpecial(s: str):
128
135
 
129
136
 
130
137
  @given(
131
- st.from_regex(DiscordEmotes.pattern.pattern, fullmatch=True)
132
- | st.from_regex(DiscordMentions.pattern.pattern, fullmatch=True)
133
- | st.from_regex(DiscordChannels.pattern.pattern, fullmatch=True)
134
- | st.from_regex(DiscordSpecial.pattern.pattern, fullmatch=True)
135
- | st.from_regex(AngleBracketObject.pattern.pattern, fullmatch=True)
138
+ st.from_regex(DiscordEmotes.pattern, fullmatch=True)
139
+ | st.from_regex(DiscordMentions.pattern, fullmatch=True)
140
+ | st.from_regex(DiscordChannels.pattern, fullmatch=True)
141
+ | st.from_regex(DiscordSpecial.pattern, fullmatch=True)
142
+ | st.from_regex(AngleBracketObject.pattern, fullmatch=True)
136
143
  )
137
144
  @example("<https://example.com>")
138
145
  @example("<#123124125125>")
@@ -142,11 +149,11 @@ def test_AngleBracketObject(s: str):
142
149
 
143
150
 
144
151
  @given(
145
- st.from_regex(SingleQuotes.pattern.pattern, fullmatch=True)
146
- | st.from_regex(DoubleQuotes.pattern.pattern, fullmatch=True)
147
- | st.from_regex(Backticks.pattern.pattern, fullmatch=True)
148
- | st.from_regex(ArrowQuote.pattern.pattern, fullmatch=True)
149
- | st.from_regex(AllQuotes.pattern.pattern, fullmatch=True)
152
+ st.from_regex(SingleQuotes.pattern, fullmatch=True)
153
+ | st.from_regex(DoubleQuotes.pattern, fullmatch=True)
154
+ | st.from_regex(Backticks.pattern, fullmatch=True)
155
+ | st.from_regex(ArrowQuote.pattern, fullmatch=True)
156
+ | st.from_regex(AllQuotes.pattern, fullmatch=True)
150
157
  )
151
158
  @example("> bruh")
152
159
  @example("`bruh`")
@@ -155,7 +162,7 @@ def test_AllQuotes(s: str):
155
162
  assert res == "", (repr(s), repr(res))
156
163
 
157
164
 
158
- @given(st.from_regex(Reference.pattern.pattern, fullmatch=True))
165
+ @given(st.from_regex(Reference.pattern, fullmatch=True))
159
166
  @example("[[Brainstorm]]")
160
167
  @example("[[Phatic Phrases]]")
161
168
  @example("[[Yahoo!]]")
@@ -164,7 +171,7 @@ def test_Reference(s: str):
164
171
  assert res == "", (repr(s), repr(res))
165
172
 
166
173
 
167
- @given(st.from_regex(ColonEmotes.pattern.pattern, fullmatch=True))
174
+ @given(st.from_regex(ColonEmotes.pattern, fullmatch=True))
168
175
  @example(":owe::owe:")
169
176
  @example(":suffering:")
170
177
  @example(":presid65despair:")
@@ -9,10 +9,10 @@ PROPER_NAME_RE = r"[A-Z][a-z]*"
9
9
 
10
10
  token_strategy = (
11
11
  st.sampled_from(list(words_by_usage(60)))
12
- | st.from_regex(Phonotactic.pattern.pattern, fullmatch=True)
13
- | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
12
+ | st.from_regex(Phonotactic.pattern, fullmatch=True)
13
+ | st.from_regex(Syllabic.pattern, fullmatch=True)
14
14
  | st.from_regex(PROPER_NAME_RE, fullmatch=True)
15
- | st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True)
15
+ | st.from_regex(AlphabeticRe.pattern, fullmatch=True)
16
16
  )
17
17
 
18
18
 
@@ -56,7 +56,8 @@
56
56
  input: 'ona li toki e ni: "mama sina"'
57
57
  output:
58
58
  - "ona li toki e ni:"
59
- - '"mama sina"'
59
+ - '"'
60
+ - 'mama sina"'
60
61
  - name: "discovered case 1"
61
62
  input: "ona li ken lukin e sitelen [_ike_nanpa_lete_ike]. ni li pona kin."
62
63
  output:
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes