sonatoki-0.1.3-py3-none-any.whl → sonatoki-0.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Configs.py CHANGED
@@ -9,15 +9,15 @@ from typing_extensions import NotRequired
9
9
  from sonatoki.Filters import (
10
10
  Filter,
11
11
  NimiPu,
12
- Numerics,
12
+ Numeric,
13
13
  Syllabic,
14
14
  NimiLinku,
15
15
  NimiPuAle,
16
16
  Alphabetic,
17
17
  ProperName,
18
18
  Phonotactic,
19
+ Punctuation,
19
20
  NimiLinkuAle,
20
- Punctuations,
21
21
  )
22
22
  from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
23
23
  from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
@@ -45,7 +45,7 @@ class IloConfig(TypedDict):
45
45
  BaseConfig: IloConfig = {
46
46
  "preprocessors": [URLs],
47
47
  "cleaners": [ConsecutiveDuplicates],
48
- "ignoring_filters": [Numerics, Punctuations],
48
+ "ignoring_filters": [Numeric, Punctuation],
49
49
  "scoring_filters": [],
50
50
  "scorer": PassFail,
51
51
  "passing_score": 0.8,
sonatoki/Filters.py CHANGED
@@ -131,7 +131,7 @@ class Alphabetic(Filter):
131
131
  return set(token.lower()).issubset(ALPHABET_SET)
132
132
 
133
133
 
134
- class Numerics(Filter):
134
+ class Numeric(Filter):
135
135
  """Determine if a given token is entirely numeric.
136
136
  Covers all numeric symbols in Unicode.
137
137
 
@@ -147,7 +147,7 @@ class Numerics(Filter):
147
147
  return msg.isnumeric()
148
148
 
149
149
 
150
- class Punctuations(RegexFilter):
150
+ class Punctuation(RegexFilter):
151
151
  pattern = re.compile(r"[\p{Punctuation}\p{posix_punct}]+")
152
152
 
153
153
 
@@ -159,6 +159,6 @@ __all__ = [
159
159
  "Syllabic",
160
160
  "Alphabetic",
161
161
  "ProperName",
162
- "Punctuations",
163
- "Numerics",
162
+ "Punctuation",
163
+ "Numeric",
164
164
  ]
sonatoki/Preprocessors.py CHANGED
@@ -62,6 +62,13 @@ class URLs(RegexPreprocessor):
62
62
  pattern = re.compile(r"https?:\/\/\S+")
63
63
 
64
64
 
65
+ class Reference(RegexPreprocessor):
66
+ """Remove text contained in double brackets.
67
+ Often used to fetch articles on Wikipedia, or Magic the Gathering cards."""
68
+
69
+ pattern = re.compile(r"\[\[.+\]\]")
70
+
71
+
65
72
  class DiscordEmotes(RegexPreprocessor):
66
73
  """Remove text-formatted Discord emotes `<flags:name:id>`"""
67
74
 
@@ -80,6 +87,13 @@ class DiscordSpecial(RegexPreprocessor):
80
87
  pattern = re.compile(r"<id:[a-zA-Z0-9_]{4,}>")
81
88
 
82
89
 
90
+ class AngleBracketObject(RegexPreprocessor):
91
+ """A generalized version of the Discord-specific angle bracket objects.
92
+ Removes any contiguous (not broken by whitespace) text in angle brackets."""
93
+
94
+ pattern = re.compile(r"<[^<>\s]+>")
95
+
96
+
83
97
  """
84
98
  The following classes are Containers.
85
99
 
@@ -92,23 +106,23 @@ would likely be using a language other than Toki Pona.
92
106
 
93
107
 
94
108
  class SingleQuotes(RegexPreprocessor):
95
- pattern = re.compile(r"'[^']+'", flags=re.S) # . matches newline
109
+ pattern = re.compile(r"'[^']+'", flags=re.DOTALL)
96
110
 
97
111
 
98
112
  class DoubleQuotes(RegexPreprocessor):
99
- pattern = re.compile(r'"[^"]+"', flags=re.S)
113
+ pattern = re.compile(r'"[^"]+"', flags=re.DOTALL)
100
114
 
101
115
 
102
116
  class Backticks(RegexPreprocessor):
103
117
  """Remove paired backticks and their contents `like this`"""
104
118
 
105
- pattern = re.compile(r"`[^`]+`", flags=re.S)
119
+ pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)
106
120
 
107
121
 
108
122
  class Spoilers(RegexPreprocessor):
109
123
  """Remove paired double bars and their contents `||like this||`"""
110
124
 
111
- pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.S)
125
+ pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.DOTALL)
112
126
 
113
127
 
114
128
  class ArrowQuote(RegexPreprocessor):
@@ -117,7 +131,22 @@ class ArrowQuote(RegexPreprocessor):
117
131
  pattern = re.compile(r"^>\ .+$", re.MULTILINE)
118
132
 
119
133
 
134
+ class AllQuotes(RegexPreprocessor):
135
+ pattern = re.compile(
136
+ "|".join(
137
+ [
138
+ SingleQuotes.pattern.pattern,
139
+ DoubleQuotes.pattern.pattern,
140
+ Backticks.pattern.pattern,
141
+ ArrowQuote.pattern.pattern,
142
+ ]
143
+ ),
144
+ flags=re.MULTILINE | re.DOTALL,
145
+ )
146
+
147
+
120
148
  __all__ = [
149
+ "AngleBracketObject",
121
150
  "DiscordChannels",
122
151
  "DiscordMentions",
123
152
  "DiscordSpecial",
@@ -125,7 +154,9 @@ __all__ = [
125
154
  "SingleQuotes",
126
155
  "DoubleQuotes",
127
156
  "ArrowQuote",
157
+ "AllQuotes",
128
158
  "Backticks",
159
+ "Reference",
129
160
  "Spoilers",
130
161
  "URLs",
131
162
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sonatoki
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
5
5
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
6
6
  License: AGPL-3.0-or-later
@@ -1,10 +1,10 @@
1
- sonatoki-0.1.3.dist-info/METADATA,sha256=ivcjgCnmdW1Typsn01RgqHX4PePGx6r4U_Ms5h5ksYo,5225
2
- sonatoki-0.1.3.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
3
- sonatoki-0.1.3.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
1
+ sonatoki-0.1.4.dist-info/METADATA,sha256=cK_EyYXPeY4rm9Plcre-i_DbPJZD06572cYQEIUQ804,5225
2
+ sonatoki-0.1.4.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
3
+ sonatoki-0.1.4.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
4
4
  sonatoki/Cleaners.py,sha256=gTZ9dSsnvKVUtxM_ECSZ-_2heh--nD5A9dCQR1ATb1c,1160
5
- sonatoki/Configs.py,sha256=yzTbjEyWS7sysaQ_9mIQMmO-acpEgQBzv4foP_J5x1o,1939
6
- sonatoki/Filters.py,sha256=RgbOXLat30WdsJW4y1DlMNttmGfVtLM6T7cD_qK-ASo,4194
7
- sonatoki/Preprocessors.py,sha256=uJ8-Y51gcgu5Wrri9BiP1F1YT-cYiLeWhrquFbYi9AI,3347
5
+ sonatoki/Configs.py,sha256=iY6Lyn1rMi7iF0M62yx0ET4pEb35-QAd1FS0tkyUfSc,1935
6
+ sonatoki/Filters.py,sha256=dL3XgH62OrVVvc8b6dtR5-JZmErVF4bl7ultAoHHqpo,4190
7
+ sonatoki/Preprocessors.py,sha256=h2sX6nJIIOPotwHL0476VQe4KxERlD_F6nrvxDyuaTs,4205
8
8
  sonatoki/Scorers.py,sha256=V293DBiupBiujzuc4yMrKOAiuNTLltIsiCzIAlLeokA,4129
9
9
  sonatoki/Tokenizers.py,sha256=fvqxpubs2F63va2RzZKZQhZbFnVaC_9haXIA9Mqznis,1942
10
10
  sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -13,4 +13,4 @@ sonatoki/constants.py,sha256=m0Z4At6MfbqZRio2glT3J3zT9x_itcWZBT_G82mpaVc,1647
13
13
  sonatoki/ilo.py,sha256=oN14iYFKxgjFjjOslgqBrMaIgpnvS5gO6MscbS0JS5A,4343
14
14
  sonatoki/linku.json,sha256=B5KNdhyM5UEfMciROgh1ECHr3i-ASBeMvwrkzNJX47c,271013
15
15
  sonatoki/sandbox.json,sha256=hx6LRsfvmmTtqXcXIyCsfSaGK3DZ-GCdbM8xhZQBHoA,77650
16
- sonatoki-0.1.3.dist-info/RECORD,,
16
+ sonatoki-0.1.4.dist-info/RECORD,,