sonatoki 0.8.2__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Configs.py +5 -4
- sonatoki/Preprocessors.py +41 -1
- sonatoki/constants.py +0 -2
- {sonatoki-0.8.2.dist-info → sonatoki-0.8.4.dist-info}/METADATA +3 -2
- {sonatoki-0.8.2.dist-info → sonatoki-0.8.4.dist-info}/RECORD +7 -7
- {sonatoki-0.8.2.dist-info → sonatoki-0.8.4.dist-info}/WHEEL +0 -0
- {sonatoki-0.8.2.dist-info → sonatoki-0.8.4.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
```diff
@@ -33,6 +33,7 @@ from sonatoki.Scorers import Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
 from sonatoki.Preprocessors import (
+    RECOMMENDED_PREPROCESSORS,
     URLs,
     Emoji,
     Codeblock,
@@ -94,7 +95,7 @@ BaseConfig: IloConfig = {
 
 
 PrefConfig: IloConfig = {
-    "preprocessors":
+    "preprocessors": RECOMMENDED_PREPROCESSORS,
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
@@ -109,7 +110,7 @@ PrefConfig: IloConfig = {
 }
 
 CorpusConfig: IloConfig = {
-    "preprocessors":
+    "preprocessors": RECOMMENDED_PREPROCESSORS,
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
@@ -128,7 +129,7 @@ CorpusConfig: IloConfig = {
 }
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
-    "preprocessors":
+    "preprocessors": RECOMMENDED_PREPROCESSORS,
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [Alphabetic, NimiUCSUR, PuName, Miscellaneous],
@@ -138,7 +139,7 @@ LazyConfig: IloConfig = {
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
-    "preprocessors":
+    "preprocessors": RECOMMENDED_PREPROCESSORS,
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
```
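For context, all of the configs touched above now share the new `RECOMMENDED_PREPROCESSORS` list instead of their previous hand-written preprocessor sets. A minimal usage sketch, assuming the `Ilo` entry point and `is_toki_pona` method shown in the package README; the sample sentence is illustrative.

```python
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

# PrefConfig now carries RECOMMENDED_PREPROCESSORS, so markdown links and
# emails are stripped from the input before scoring.
ilo = Ilo(**PrefConfig)
print(ilo.is_toki_pona("mi olin e sina"))  # expected: True
```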
sonatoki/Preprocessors.py
CHANGED
```diff
@@ -19,6 +19,7 @@ It is up to the user to order them appropriately.
 # STL
 import re
 from abc import ABC, abstractmethod
+from typing import List, Type
 
 # PDM
 import emoji
@@ -74,6 +75,27 @@ class URLs(RegexPreprocessor):
     pattern = re.compile(r"https?:\/\/\S+")
 
 
+class MarkdownURLs(RegexPreprocessor):
+    """Remove URLs in markdown format, replacing them with their corresponding
+    text."""
+
+    pattern = re.compile(r"\[(.+?)\]\(https?:\/\/\S+\)")
+    replace = r"\1"
+
+
+class Emails(RegexPreprocessor):
+    """Attempt to remove emails, for a particularly strong definition of
+    "email".
+
+    https://www.regular-expressions.info/email.html
+    """
+
+    pattern = re.compile(
+        r"\b[a-zA-Z0-9._%+-]{2,}@[a-zA-Z0-9.-]{2,}\.[a-zA-Z]{2,24}\b",
+        flags=re.IGNORECASE,
+    )
+
+
 class Reference(RegexPreprocessor):
     """Remove text contained in double brackets.
 
```
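The `Emails` pattern above is intentionally strict: the local part and the domain each need at least two characters, and the TLD needs two to twenty-four letters. A quick sketch exercising the pattern exactly as it appears in this diff; the sample strings and the empty replacement string are illustrative only, since the preprocessor's own replacement behaviour is not shown here.

```python
import re

# Pattern copied verbatim from the Emails preprocessor in this diff.
EMAIL = re.compile(
    r"\b[a-zA-Z0-9._%+-]{2,}@[a-zA-Z0-9.-]{2,}\.[a-zA-Z]{2,24}\b",
    flags=re.IGNORECASE,
)

# An ordinary address is matched and stripped out.
print(EMAIL.sub("", "o toki tawa jan@example.com a"))  # -> "o toki tawa  a"

# Single-character parts fail the {2,} quantifiers, so nothing is removed.
print(EMAIL.sub("", "x@y.z"))  # -> "x@y.z"
```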
```diff
@@ -209,6 +231,21 @@ class ZeroWidths(RegexPreprocessor):
     pattern = re.compile("[\\U0000200C-\\U0000200D]")
 
 
+RECOMMENDED_PREPROCESSORS: List[Type[Preprocessor]] = [
+    # These are sorted by the "strength" of their definition, which would be roughly
+    # "How confidently have we matched this object?"
+    # Additionally, MarkdownURLs must come before URLs, and Emoji must come last due to
+    # its ability to appear in all of the others.
+    Codeblock,
+    AngleBracketObject,
+    Reference,
+    MarkdownURLs,
+    URLs,
+    Emails,
+    Emoji,
+]
+
+
 __all__ = [
     "AllQuotes",
     "AngleBracketObject",
```
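The ordering note in the comment above matters in practice: if the bare `URLs` pattern ran first, its greedy `\S+` would consume the URL inside a markdown link along with the closing parenthesis, leaving a dangling `[text](` fragment that `MarkdownURLs` can no longer match. A small sketch using the two patterns from this diff; the sample string is made up.

```python
import re

MARKDOWN_URL = re.compile(r"\[(.+?)\]\(https?:\/\/\S+\)")  # MarkdownURLs pattern
URL = re.compile(r"https?:\/\/\S+")                        # URLs pattern

sample = "sina ken lukin e [lipu](https://example.com)"

# MarkdownURLs first: the link text survives.
print(URL.sub("", MARKDOWN_URL.sub(r"\1", sample)))
# -> "sina ken lukin e lipu"

# URLs first: the greedy \S+ also eats the closing ")", stranding "[lipu](".
print(MARKDOWN_URL.sub(r"\1", URL.sub("", sample)))
# -> "sina ken lukin e [lipu]("
```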
```diff
@@ -219,10 +256,13 @@ __all__ = [
     "DiscordMentions",
     "DiscordSpecial",
     "DoubleQuotes",
+    "Emails",
+    "Emoji",
+    "MarkdownURLs",
+    "RECOMMENDED_PREPROCESSORS",
     "Reference",
     "SingleQuotes",
     "Spoilers",
     "URLs",
     "ZeroWidths",
-    "Emoji",
 ]
```
sonatoki/constants.py
CHANGED
```diff
@@ -518,8 +518,6 @@ ALPHABETICS = Path(__file__).resolve().parent / Path("alphabetic.txt")
 VOWELS = "aeiou"
 CONSONANTS = "jklmnpstw"
 ALPHABET = VOWELS + CONSONANTS
-
-LANGUAGE = "english" # for NLTK
 """Commonly occurring strings which are some kind of valid Toki Pona or
 external token."""
 ALLOWABLES = {
```
{sonatoki-0.8.2.dist-info → sonatoki-0.8.4.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.8.2
+Version: 0.8.4
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
@@ -97,8 +97,9 @@ I originally intended to translate this file and library into Toki Pona once Uni
 ### What's the deal with the tokenizers?
 
 The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` attempts to tokenize statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
-This design is highly undesirable for NLTK's English tokenizer because
+This design is highly undesirable for NLTK's English tokenizer because words in languages other than Toki Pona can have punctuation characters in or around them which are part of the word.
 Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
+However, this tokenizer doesn't ignore intra-word punctuation entirely. Instead, exactly one of `-` or `'` is allowed at a time, so long as both of its neighbors are writing characters. This increases the accuracy of the tokenizer significantly, and makes identifying Toki Pona sentences among arbitrary ones similarly more accurate.
 
 The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.
 
```
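A rough illustration of the intra-word punctuation rule described in the added README paragraph, written as a standalone regex rather than sonatoki's actual `WordTokenizer`; the pattern and the sample strings are assumptions for demonstration only.

```python
import re

# Hypothetical word pattern: letters, optionally joined by a single '-' or "'"
# that has a letter on both sides, as the README paragraph describes.
WORD = re.compile(r"[a-z]+(?:['-][a-z]+)*", re.IGNORECASE)

print(WORD.findall("o'clock"))      # ["o'clock"]        one quote between letters stays in the word
print(WORD.findall("well--known"))  # ['well', 'known']  a doubled hyphen splits the word
print(WORD.findall("toki -"))       # ['toki']           a hyphen without a letter after it is dropped
```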
{sonatoki-0.8.2.dist-info → sonatoki-0.8.4.dist-info}/RECORD
CHANGED
```diff
@@ -1,16 +1,16 @@
-sonatoki-0.8.
-sonatoki-0.8.
-sonatoki-0.8.
+sonatoki-0.8.4.dist-info/METADATA,sha256=Nui-Em5-CwsiOt5mkyhF5bb6WM9VQ6sp9UlENnH5Udw,6893
+sonatoki-0.8.4.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
+sonatoki-0.8.4.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
 sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
-sonatoki/Configs.py,sha256
+sonatoki/Configs.py,sha256=-R-rTPUJfuSintpvC4UnOF1B9B93-Ooh_jmkZwhKvtk,4669
 sonatoki/Filters.py,sha256=rBEJrY_R6koFpoYl4yfo_9UR-i21HbvlUF0ORg1g0WE,13411
-sonatoki/Preprocessors.py,sha256=
+sonatoki/Preprocessors.py,sha256=RmzkvPVo6Kdx1rZ5HeR9cTtx6oxpp2iLKrOMCUEqIrM,7107
 sonatoki/Scorers.py,sha256=aCU3p9rD4QOy-uu851FGGw-ARqUCG_l4V_z5rtRL420,5236
 sonatoki/Tokenizers.py,sha256=8lpC70bzXOpHyhVr5bmqpYKmdmQvJdf7X5-Icc9RRCw,5040
 sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/__main__.py,sha256=6n4kUF80APl6a0jV46h_ncHNuQbrLpZ_nAmiNAakiag,5673
 sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
-sonatoki/constants.py,sha256=
+sonatoki/constants.py,sha256=Or9VEk19N8Bef14ZRgcP-r0F9qyVByv4CImqpoJ8I6Q,19324
 sonatoki/ilo.py,sha256=Dsn0yagkwjqpAQoCj6mkZ6NqWeanRF2lxNDNoqjWGLo,5993
 sonatoki/linku.json,sha256=d72Dvht-a4gBmdqLLI8mElvo83zSpbxDmxJj05hOudM,295413
 sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -18,4 +18,4 @@ sonatoki/sandbox.json,sha256=44csrQDaVtV-n8OyewabX1J9MmUFCsPct5C8E5Xuc58,140197
 sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
 sonatoki/types.py,sha256=zoVJeaDLOPstREiHtoD9pv-AOCsJq2C4_GG3nTYd114,1267
 sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
-sonatoki-0.8.
+sonatoki-0.8.4.dist-info/RECORD,,
```
{sonatoki-0.8.2.dist-info → sonatoki-0.8.4.dist-info}/WHEEL
File without changes
{sonatoki-0.8.2.dist-info → sonatoki-0.8.4.dist-info}/licenses/LICENSE
File without changes