sonatoki 0.1.3.tar.gz → 0.1.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {sonatoki-0.1.3 → sonatoki-0.1.4}/PKG-INFO +1 -1
  2. {sonatoki-0.1.3 → sonatoki-0.1.4}/pyproject.toml +1 -1
  3. {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/Configs.py +3 -3
  4. {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/Filters.py +4 -4
  5. {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/Preprocessors.py +35 -4
  6. {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/test_filters.py +6 -6
  7. {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/test_preprocessors.py +40 -0
  8. {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/test_scorers.py +8 -6
  9. {sonatoki-0.1.3 → sonatoki-0.1.4}/LICENSE +0 -0
  10. {sonatoki-0.1.3 → sonatoki-0.1.4}/README.md +0 -0
  11. {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/Cleaners.py +0 -0
  12. {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/Scorers.py +0 -0
  13. {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/Tokenizers.py +0 -0
  14. {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/__init__.py +0 -0
  15. {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/__main__.py +0 -0
  16. {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/constants.py +0 -0
  17. {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/ilo.py +0 -0
  18. {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/linku.json +0 -0
  19. {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/sandbox.json +0 -0
  20. {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/__init__.py +0 -0
  21. {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/test_cleaners.py +0 -0
  22. {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/test_ilo.py +0 -0
  23. {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/test_tokenize.py +0 -0
  24. {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/test_utils.py +0 -0
  25. {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/tokenize_cases/tokenize_sentences.yml +0 -0
  26. {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
  27. {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/tokenize_cases/tokenize_words.yml +0 -0
  28. {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
--- sonatoki-0.1.3/PKG-INFO
+++ sonatoki-0.1.4/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.1.3
+Version: 0.1.4
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
--- sonatoki-0.1.3/pyproject.toml
+++ sonatoki-0.1.4/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sonatoki"
-version = "0.1.3"
+version = "0.1.4"
 description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
 authors = [
     { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
--- sonatoki-0.1.3/src/sonatoki/Configs.py
+++ sonatoki-0.1.4/src/sonatoki/Configs.py
@@ -9,15 +9,15 @@ from typing_extensions import NotRequired
 from sonatoki.Filters import (
     Filter,
     NimiPu,
-    Numerics,
+    Numeric,
     Syllabic,
     NimiLinku,
     NimiPuAle,
     Alphabetic,
     ProperName,
     Phonotactic,
+    Punctuation,
     NimiLinkuAle,
-    Punctuations,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
@@ -45,7 +45,7 @@ class IloConfig(TypedDict):
 BaseConfig: IloConfig = {
     "preprocessors": [URLs],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numerics, Punctuations],
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [],
     "scorer": PassFail,
     "passing_score": 0.8,
--- sonatoki-0.1.3/src/sonatoki/Filters.py
+++ sonatoki-0.1.4/src/sonatoki/Filters.py
@@ -131,7 +131,7 @@ class Alphabetic(Filter):
         return set(token.lower()).issubset(ALPHABET_SET)


-class Numerics(Filter):
+class Numeric(Filter):
     """Determine if a given token is entirely numeric.
     Covers all numeric symbols in Unicode.

@@ -147,7 +147,7 @@ class Numerics(Filter):
         return msg.isnumeric()


-class Punctuations(RegexFilter):
+class Punctuation(RegexFilter):
     pattern = re.compile(r"[\p{Punctuation}\p{posix_punct}]+")


@@ -159,6 +159,6 @@ __all__ = [
     "Syllabic",
     "Alphabetic",
     "ProperName",
-    "Punctuations",
-    "Numerics",
+    "Punctuation",
+    "Numeric",
 ]
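The renamed classes behave exactly as before: Numeric wraps str.isnumeric(), and Punctuation matches runs of Unicode or POSIX punctuation. Judging by the classmethod-style Numeric.filter(s) calls in the tests below, a quick sanity check might look like this (assuming RegexFilter performs a full match, as the tests' use of re.fullmatch suggests):

    from sonatoki.Filters import Numeric, Punctuation

    assert Numeric.filter("99990000")       # entirely numeric symbols
    assert not Numeric.filter("mute")       # letters are not numeric
    assert Punctuation.filter("...!?")      # punctuation only
    assert not Punctuation.filter("toki.")  # mixed content fails a full match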
--- sonatoki-0.1.3/src/sonatoki/Preprocessors.py
+++ sonatoki-0.1.4/src/sonatoki/Preprocessors.py
@@ -62,6 +62,13 @@ class URLs(RegexPreprocessor):
     pattern = re.compile(r"https?:\/\/\S+")


+class Reference(RegexPreprocessor):
+    """Remove text contained in double brackets.
+    Often used to fetch articles on Wikipedia, or Magic the Gathering cards."""
+
+    pattern = re.compile(r"\[\[.+\]\]")
+
+
 class DiscordEmotes(RegexPreprocessor):
     """Remove text-formatted Discord emotes `<flags:name:id>`"""

@@ -80,6 +87,13 @@ class DiscordSpecial(RegexPreprocessor):
     pattern = re.compile(r"<id:[a-zA-Z0-9_]{4,}>")


+class AngleBracketObject(RegexPreprocessor):
+    """A generalized version of the Discord-specific angle bracket objects.
+    Removes any contiguous (not broken by whitespace) text in angle brackets."""
+
+    pattern = re.compile(r"<[^<>\s]+>")
+
+
 """
 The following classes are Containers.

@@ -92,23 +106,23 @@ would likely be using a language other than Toki Pona.


 class SingleQuotes(RegexPreprocessor):
-    pattern = re.compile(r"'[^']+'", flags=re.S)  # . matches newline
+    pattern = re.compile(r"'[^']+'", flags=re.DOTALL)


 class DoubleQuotes(RegexPreprocessor):
-    pattern = re.compile(r'"[^"]+"', flags=re.S)
+    pattern = re.compile(r'"[^"]+"', flags=re.DOTALL)


 class Backticks(RegexPreprocessor):
     """Remove paired backticks and their contents `like this`"""

-    pattern = re.compile(r"`[^`]+`", flags=re.S)
+    pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)


 class Spoilers(RegexPreprocessor):
     """Remove paired double bars and their contents `||like this||`"""

-    pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.S)
+    pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.DOTALL)


 class ArrowQuote(RegexPreprocessor):
@@ -117,7 +131,22 @@ class ArrowQuote(RegexPreprocessor):
     pattern = re.compile(r"^>\ .+$", re.MULTILINE)


+class AllQuotes(RegexPreprocessor):
+    pattern = re.compile(
+        "|".join(
+            [
+                SingleQuotes.pattern.pattern,
+                DoubleQuotes.pattern.pattern,
+                Backticks.pattern.pattern,
+                ArrowQuote.pattern.pattern,
+            ]
+        ),
+        flags=re.MULTILINE | re.DOTALL,
+    )
+
+
 __all__ = [
+    "AngleBracketObject",
     "DiscordChannels",
     "DiscordMentions",
     "DiscordSpecial",
@@ -125,7 +154,9 @@ __all__ = [
     "SingleQuotes",
     "DoubleQuotes",
     "ArrowQuote",
+    "AllQuotes",
     "Backticks",
+    "Reference",
     "Spoilers",
     "URLs",
 ]
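Taken together, the new preprocessors cover three common kinds of non-Toki-Pona noise: wiki-style references, angle-bracketed objects, and quoted spans. A rough before/after sketch, assuming process() substitutes matches away, as the res == "" assertions in the tests below imply:

    from sonatoki.Preprocessors import AllQuotes, AngleBracketObject, Reference

    msg = 'o lukin e ni: [[Phatic Phrases]] <https://example.com> "not toki pona"'
    msg = Reference.process(msg)           # drops [[Phatic Phrases]]
    msg = AngleBracketObject.process(msg)  # drops <https://example.com>
    msg = AllQuotes.process(msg)           # drops "not toki pona"
    print(msg.strip())                     # -> o lukin e ni: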
--- sonatoki-0.1.3/tests/test_filters.py
+++ sonatoki-0.1.4/tests/test_filters.py
@@ -9,13 +9,13 @@ from hypothesis import HealthCheck, given, assume, example, settings
 # LOCAL
 from sonatoki.Filters import (
     NimiPu,
-    Numerics,
+    Numeric,
     Syllabic,
     NimiLinku,
     Alphabetic,
     ProperName,
     Phonotactic,
-    Punctuations,
+    Punctuation,
 )
 from sonatoki.Cleaners import ConsecutiveDuplicates
 from sonatoki.constants import NIMI_PU, NIMI_LINKU
@@ -90,9 +90,9 @@ def test_ProperName(s: str):
 @example("「 」")
 @example(string.punctuation)
 @settings(suppress_health_check=[HealthCheck.filter_too_much])  # FIXME
-def test_Punctuations(s: str):
-    _ = assume(re.fullmatch(Punctuations.pattern.pattern, s))
-    res = Punctuations.filter(s)
+def test_Punctuation(s: str):
+    _ = assume(re.fullmatch(Punctuation.pattern.pattern, s))
+    res = Punctuation.filter(s)
     assert res, repr(s)


@@ -100,5 +100,5 @@ def test_Punctuations(s: str):
 @example("124125")
 @example("99990000")
 def test_Numeric(s: str):
-    res = Numerics.filter(s)
+    res = Numeric.filter(s)
     assert res, repr(s)
--- sonatoki-0.1.3/tests/test_preprocessors.py
+++ sonatoki-0.1.4/tests/test_preprocessors.py
@@ -6,7 +6,9 @@ from hypothesis import given, example
 from sonatoki.Preprocessors import (
     URLs,
     Spoilers,
+    AllQuotes,
     Backticks,
+    Reference,
     ArrowQuote,
     DoubleQuotes,
     SingleQuotes,
@@ -14,6 +16,7 @@ from sonatoki.Preprocessors import (
     DiscordSpecial,
     DiscordChannels,
     DiscordMentions,
+    AngleBracketObject,
 )


@@ -101,3 +104,40 @@ def test_DiscordChannels(s: str):
 def test_DiscordSpecial(s: str):
     res = DiscordSpecial.process(s).strip()
     assert res == "", (repr(s), repr(res))
+
+
+@given(
+    st.from_regex(DiscordEmotes.pattern.pattern, fullmatch=True)
+    | st.from_regex(DiscordMentions.pattern.pattern, fullmatch=True)
+    | st.from_regex(DiscordChannels.pattern.pattern, fullmatch=True)
+    | st.from_regex(DiscordSpecial.pattern.pattern, fullmatch=True)
+    | st.from_regex(AngleBracketObject.pattern.pattern, fullmatch=True)
+)
+@example("<https://example.com>")
+@example("<#123124125125>")
+def test_AngleBracketObject(s: str):
+    res = AngleBracketObject.process(s).strip()
+    assert res == "", (repr(s), repr(res))
+
+
+@given(
+    st.from_regex(SingleQuotes.pattern.pattern, fullmatch=True)
+    | st.from_regex(DoubleQuotes.pattern.pattern, fullmatch=True)
+    | st.from_regex(Backticks.pattern.pattern, fullmatch=True)
+    | st.from_regex(ArrowQuote.pattern.pattern, fullmatch=True)
+    | st.from_regex(AllQuotes.pattern.pattern, fullmatch=True)
+)
+@example("> bruh")
+@example("`bruh`")
+def test_AllQuotes(s: str):
+    res = AllQuotes.process(s).strip()
+    assert res == "", (repr(s), repr(res))
+
+
+@given(st.from_regex(Reference.pattern.pattern, fullmatch=True))
+@example("[[Brainstorm]]")
+@example("[[Phatic Phrases]]")
+@example("[[Yahoo!]]")
+def test_Reference(s: str):
+    res = Reference.process(s).strip()
+    assert res == "", (repr(s), repr(res))
--- sonatoki-0.1.3/tests/test_scorers.py
+++ sonatoki-0.1.4/tests/test_scorers.py
@@ -4,38 +4,39 @@ from typing import List, Type
 # PDM
 import pytest
 import hypothesis.strategies as st
-from hypothesis import given
+from hypothesis import given, example

 # LOCAL
 from sonatoki.Filters import (
     Filter,
     NimiPu,
-    Numerics,
+    Numeric,
     Syllabic,
     NimiLinku,
     Alphabetic,
     ProperName,
     Phonotactic,
-    Punctuations,
+    Punctuation,
 )
-from sonatoki.Scorers import Scorer, Scaling, PassFail, SoftScaling
+from sonatoki.Scorers import Scorer, Scaling, PassFail, SoftScaling, SoftPassFail

 # FILESYSTEM
 from .test_utils import token_strategy

 FILTERS = [
     NimiPu,
-    Numerics,
+    Numeric,
     Syllabic,
     NimiLinku,
     Alphabetic,
     ProperName,
     Phonotactic,
-    Punctuations,
+    Punctuation,
 ]

 SCORERS = [
     PassFail,
+    SoftPassFail,
     Scaling,
     SoftScaling,
 ]
@@ -46,6 +47,7 @@ SCORERS = [
     st.lists(st.sampled_from(FILTERS), min_size=1, unique=True),
     st.lists(token_strategy, min_size=0, max_size=10),
 )
+@example(st.sampled_from(FILTERS), [])
 def test_score_bounds(scorer: Scorer, filters: List[Type[Filter]], text: List[str]):
     score = scorer.score(text, filters)
     assert 0 <= score <= 1, (score, filters, text)
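The new SoftPassFail scorer simply joins the parametrized SCORERS suite; its scoring curve is not shown in this diff, but the contract the test enforces is that score(tokens, filters) returns a value in [0, 1]. A hedged usage sketch built only from names visible above:

    from sonatoki.Filters import NimiLinku, Syllabic
    from sonatoki.Scorers import SoftPassFail

    tokens = ["mi", "olin", "e", "sina"]
    score = SoftPassFail.score(tokens, [NimiLinku, Syllabic])
    assert 0 <= score <= 1  # the invariant test_score_bounds checks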