sonatoki 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Configs.py +5 -4
- sonatoki/Preprocessors.py +26 -1
- sonatoki/constants.py +3 -4
- {sonatoki-0.8.1.dist-info → sonatoki-0.8.3.dist-info}/METADATA +3 -2
- {sonatoki-0.8.1.dist-info → sonatoki-0.8.3.dist-info}/RECORD +7 -7
- {sonatoki-0.8.1.dist-info → sonatoki-0.8.3.dist-info}/WHEEL +0 -0
- {sonatoki-0.8.1.dist-info → sonatoki-0.8.3.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
@@ -33,6 +33,7 @@ from sonatoki.Scorers import Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
 from sonatoki.Preprocessors import (
+    RECOMMENDED_PREPROCESSORS,
     URLs,
     Emoji,
     Codeblock,
@@ -94,7 +95,7 @@ BaseConfig: IloConfig = {
 
 
 PrefConfig: IloConfig = {
-    "preprocessors":
+    "preprocessors": RECOMMENDED_PREPROCESSORS,
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
@@ -109,7 +110,7 @@ PrefConfig: IloConfig = {
 }
 
 CorpusConfig: IloConfig = {
-    "preprocessors":
+    "preprocessors": RECOMMENDED_PREPROCESSORS,
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
@@ -128,7 +129,7 @@ CorpusConfig: IloConfig = {
 }
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
-    "preprocessors":
+    "preprocessors": RECOMMENDED_PREPROCESSORS,
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [Alphabetic, NimiUCSUR, PuName, Miscellaneous],
@@ -138,7 +139,7 @@ LazyConfig: IloConfig = {
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
-    "preprocessors":
+    "preprocessors": RECOMMENDED_PREPROCESSORS,
    "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
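All four configs above now pull their preprocessor stack from the shared `RECOMMENDED_PREPROCESSORS` constant instead of listing preprocessors inline. A minimal usage sketch, following the `Ilo(**PrefConfig)` construction shown in the project README (the sample sentences are illustrative):

```python
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

# PrefConfig's "preprocessors" entry is now RECOMMENDED_PREPROCESSORS, so
# markdown links, references, codeblocks, URLs, and emoji are stripped
# before any filtering or scoring happens.
ilo = Ilo(**PrefConfig)
print(ilo.is_toki_pona("mi olin e sina!"))  # True
print(ilo.is_toki_pona("imagine how is touch the sky"))  # False
```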
sonatoki/Preprocessors.py
CHANGED
@@ -19,6 +19,7 @@ It is up to the user to order them appropriately.
 # STL
 import re
 from abc import ABC, abstractmethod
+from typing import List, Type
 
 # PDM
 import emoji
@@ -74,6 +75,14 @@ class URLs(RegexPreprocessor):
     pattern = re.compile(r"https?:\/\/\S+")
 
 
+class MarkdownURLs(RegexPreprocessor):
+    """Remove URLs in markdown format, replacing them with their corresponding
+    text."""
+
+    pattern = re.compile(r"\[(.+?)\]\(https?:\/\/\S+\)")
+    replace = r"\1"
+
+
 class Reference(RegexPreprocessor):
     """Remove text contained in double brackets.
 
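Where the plain `URLs` preprocessor deletes a URL outright, `MarkdownURLs` preserves the link's anchor text. A standalone sketch of the substitution, reusing the exact pattern and replacement from the class above (assuming, as with the other `RegexPreprocessor` subclasses, that processing amounts to `pattern.sub(replace, text)`):

```python
import re

# Same pattern and replacement as MarkdownURLs above.
pattern = re.compile(r"\[(.+?)\]\(https?:\/\/\S+\)")
replace = r"\1"

text = "o lukin e [lipu pona](https://example.com/lipu) ni"
print(pattern.sub(replace, text))  # -> "o lukin e lipu pona ni"
```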
@@ -209,6 +218,20 @@ class ZeroWidths(RegexPreprocessor):
     pattern = re.compile("[\\U0000200C-\\U0000200D]")
 
 
+RECOMMENDED_PREPROCESSORS: List[Type[Preprocessor]] = [
+    # These are sorted by the "strength" of their definition, which would be roughly
+    # "How confidently have we matched this object?"
+    # Additionally, MarkdownURLs must come before URLs, and Emoji must come last due to
+    # its ability to appear in all of the others.
+    Codeblock,
+    AngleBracketObject,
+    Reference,
+    MarkdownURLs,
+    URLs,
+    Emoji,
+]
+
+
 __all__ = [
     "AllQuotes",
     "AngleBracketObject",
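The ordering constraints in the comments are load-bearing: if `URLs` ran first, it would consume the URL inside a markdown link and leave the surrounding `[text](...)` syntax unmatched, and `Emoji` goes last because emoji can appear inside every other construct. A sketch of applying the list in order, using the per-class `process` interface implied by the module docstring ("It is up to the user to order them appropriately"); the sample string is illustrative:

```python
from sonatoki.Preprocessors import RECOMMENDED_PREPROCESSORS

text = "o [lukin](https://example.com) e lipu ni 👀"
for preprocessor in RECOMMENDED_PREPROCESSORS:
    text = preprocessor.process(text)
print(text)  # roughly "o lukin e lipu ni": link syntax, URL, and emoji removed
```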
@@ -219,10 +242,12 @@ __all__ = [
     "DiscordMentions",
     "DiscordSpecial",
     "DoubleQuotes",
+    "Emoji",
+    "MarkdownURLs",
+    "RECOMMENDED_PREPROCESSORS",
     "Reference",
     "SingleQuotes",
     "Spoilers",
     "URLs",
     "ZeroWidths",
-    "Emoji",
 ]
sonatoki/constants.py
CHANGED
@@ -503,8 +503,9 @@ ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
 ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
 # combined bc the result could be simpler
 
-SENTENCE_PUNCT = """.?!:;()[-]
-#
+SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"""
+# single quotes are word boundaries if not intra-word, but double quotes are sentence
+# boundaries
 
 INTRA_WORD_PUNCT = """-'’"""
 
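The new comment encodes the boundary model directly: double quotes end sentences, while `-` and `'` may glue a word together. A quick illustration against the constants above (pure membership checks, no library code):

```python
SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"""
INTRA_WORD_PUNCT = """-'’"""

assert '"' in SENTENCE_PUNCT    # double quotes are sentence boundaries
assert "'" in INTRA_WORD_PUNCT  # single quotes may join a word instead
assert "-" in INTRA_WORD_PUNCT  # as may hyphens, e.g. "sitelen-pona"
```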
@@ -517,8 +518,6 @@ ALPHABETICS = Path(__file__).resolve().parent / Path("alphabetic.txt")
 VOWELS = "aeiou"
 CONSONANTS = "jklmnpstw"
 ALPHABET = VOWELS + CONSONANTS
-
-LANGUAGE = "english"  # for NLTK
 """Commonly occurring strings which are some kind of valid Toki Pona or
 external token."""
 ALLOWABLES = {
{sonatoki-0.8.1.dist-info → sonatoki-0.8.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.8.1
+Version: 0.8.3
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
@@ -97,8 +97,9 @@ I originally intended to translate this file and library into Toki Pona once Uni
 ### What's the deal with the tokenizers?
 
 The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` attempts to tokenize statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
-This design is highly undesirable for NLTK's English tokenizer because
+This design is highly undesirable for NLTK's English tokenizer because words in languages other than Toki Pona can have punctuation characters in or around them which are part of the word.
 Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
+However, this tokenizer doesn't ignore intra-word punctuation entirely. Instead, exactly one of `-` or `'` is allowed at a time, so long as both of its neighbors are writing characters. This increases the accuracy of the tokenizer significantly, and makes identifying Toki Pona sentences among arbitrary ones similarly more accurate.
 
 The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.
 
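A sketch of the tokenization behavior the README describes (the `tokenize` classmethod on `sonatoki.Tokenizers.WordTokenizer` is the interface named above; the exact output shown is illustrative):

```python
from sonatoki.Tokenizers import WordTokenizer

# One "-" or "'" survives inside a word when both neighbors are writing
# characters; punctuation runs become their own complete tokens.
print(WordTokenizer.tokenize("sitelen-pona li pona, anu seme?"))
# e.g. ['sitelen-pona', 'li', 'pona', ',', 'anu', 'seme', '?']
```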
{sonatoki-0.8.1.dist-info → sonatoki-0.8.3.dist-info}/RECORD
CHANGED
@@ -1,16 +1,16 @@
-sonatoki-0.8.
-sonatoki-0.8.
-sonatoki-0.8.
+sonatoki-0.8.3.dist-info/METADATA,sha256=W35cZGS_DWg3Q-mTRfFCVKPWWJOe50U3Uy4dzyIxRaw,6893
+sonatoki-0.8.3.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
+sonatoki-0.8.3.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
 sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
-sonatoki/Configs.py,sha256
+sonatoki/Configs.py,sha256=-R-rTPUJfuSintpvC4UnOF1B9B93-Ooh_jmkZwhKvtk,4669
 sonatoki/Filters.py,sha256=rBEJrY_R6koFpoYl4yfo_9UR-i21HbvlUF0ORg1g0WE,13411
-sonatoki/Preprocessors.py,sha256=
+sonatoki/Preprocessors.py,sha256=AcvYKr7oT9eumsOiXPM8EBo9TagEaFIYIMC8L2YwPVk,6766
 sonatoki/Scorers.py,sha256=aCU3p9rD4QOy-uu851FGGw-ARqUCG_l4V_z5rtRL420,5236
 sonatoki/Tokenizers.py,sha256=8lpC70bzXOpHyhVr5bmqpYKmdmQvJdf7X5-Icc9RRCw,5040
 sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/__main__.py,sha256=6n4kUF80APl6a0jV46h_ncHNuQbrLpZ_nAmiNAakiag,5673
 sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
-sonatoki/constants.py,sha256=
+sonatoki/constants.py,sha256=Or9VEk19N8Bef14ZRgcP-r0F9qyVByv4CImqpoJ8I6Q,19324
 sonatoki/ilo.py,sha256=Dsn0yagkwjqpAQoCj6mkZ6NqWeanRF2lxNDNoqjWGLo,5993
 sonatoki/linku.json,sha256=d72Dvht-a4gBmdqLLI8mElvo83zSpbxDmxJj05hOudM,295413
 sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -18,4 +18,4 @@ sonatoki/sandbox.json,sha256=44csrQDaVtV-n8OyewabX1J9MmUFCsPct5C8E5Xuc58,140197
 sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
 sonatoki/types.py,sha256=zoVJeaDLOPstREiHtoD9pv-AOCsJq2C4_GG3nTYd114,1267
 sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
-sonatoki-0.8.
+sonatoki-0.8.3.dist-info/RECORD,,
{sonatoki-0.8.1.dist-info → sonatoki-0.8.3.dist-info}/WHEEL
File without changes

{sonatoki-0.8.1.dist-info → sonatoki-0.8.3.dist-info}/licenses/LICENSE
File without changes