sonatoki 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Configs.py +36 -1
- sonatoki/Preprocessors.py +6 -0
- sonatoki/constants.py +6 -3
- sonatoki/linku.json +1 -1
- sonatoki/sandbox.json +1 -1
- {sonatoki-0.5.1.dist-info → sonatoki-0.5.3.dist-info}/METADATA +17 -16
- {sonatoki-0.5.1.dist-info → sonatoki-0.5.3.dist-info}/RECORD +9 -9
- {sonatoki-0.5.1.dist-info → sonatoki-0.5.3.dist-info}/WHEEL +1 -1
- {sonatoki-0.5.1.dist-info → sonatoki-0.5.3.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# STL
|
2
2
|
from copy import deepcopy
|
3
|
-
from typing import List, Type, TypedDict
|
3
|
+
from typing import Set, List, Type, TypedDict, cast
|
4
4
|
|
5
5
|
# PDM
|
6
6
|
from typing_extensions import NotRequired
|
@@ -18,6 +18,7 @@ from sonatoki.Filters import (
|
|
18
18
|
NimiKuLili,
|
19
19
|
NimiKuSuli,
|
20
20
|
ProperName,
|
21
|
+
Phonotactic,
|
21
22
|
Punctuation,
|
22
23
|
LongSyllabic,
|
23
24
|
Miscellaneous,
|
@@ -102,6 +103,40 @@ CorpusConfig: IloConfig = {
|
|
102
103
|
"scorer": SoftScaling,
|
103
104
|
"passing_score": 0.8,
|
104
105
|
}
|
106
|
+
|
107
|
+
# TODO: create a mechanism to omit tokens from a filter with more granularity
|
108
|
+
__corpus_tokens_dict: Set[str] = cast(
|
109
|
+
Set[str],
|
110
|
+
CorpusConfig["scoring_filters"][
|
111
|
+
0
|
112
|
+
].tokens, # pyright: ignore[reportAttributeAccessIssue]
|
113
|
+
)
|
114
|
+
__corpus_tokens_dict -= {
|
115
|
+
# Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
|
116
|
+
# In this case, all of these appear more often in English by a factor of at least 10.
|
117
|
+
"aka", # also known as
|
118
|
+
"an", # article
|
119
|
+
"api", # API
|
120
|
+
"i", # 1st person
|
121
|
+
"kana", # japanese script
|
122
|
+
"me", # 1st person
|
123
|
+
"ne", # "no" in several languages
|
124
|
+
"nu", # "new", now in dutch
|
125
|
+
"se", # spanish particle, "see"
|
126
|
+
"take", # acquire, perhaps forcefully or without permission
|
127
|
+
"ten", # 10
|
128
|
+
"to", # to, too
|
129
|
+
"u", # no u
|
130
|
+
"we", # 1st person plural
|
131
|
+
"wi", # wii and discussions of syllables
|
132
|
+
"sole", # singular, of shoe
|
133
|
+
# unexplored candidates for removal
|
134
|
+
# "omen", # ominous
|
135
|
+
# "papa", # father
|
136
|
+
# "lo", # "lo" and "loo"
|
137
|
+
# "ewe", # sheep
|
138
|
+
# "pa", # father- eh?
|
139
|
+
}
|
105
140
|
"""Mimics the previous implementation of ilo pi toki pona taso."""
|
106
141
|
LazyConfig: IloConfig = {
|
107
142
|
"preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
|
sonatoki/Preprocessors.py
CHANGED
@@ -90,6 +90,12 @@ class DiscordEmotes(RegexPreprocessor):
|
|
90
90
|
pattern = re.compile(r"<a?:[a-zA-Z0-9_]{2,}:[0-9]{2,}>")
|
91
91
|
|
92
92
|
|
93
|
+
class ColonEmotes(RegexPreprocessor):
|
94
|
+
"""Remove colon-marked emotes `:name:`"""
|
95
|
+
|
96
|
+
pattern = re.compile(r":[a-zA-Z0-9_]{2,}:")
|
97
|
+
|
98
|
+
|
93
99
|
class DiscordMentions(RegexPreprocessor):
|
94
100
|
pattern = re.compile(r"<@[\!\&]?[0-9]{2,}>")
|
95
101
|
|
sonatoki/constants.py
CHANGED
@@ -553,7 +553,7 @@ FALSE_POS_SYLLABIC = {
|
|
553
553
|
"in",
|
554
554
|
"no",
|
555
555
|
"some",
|
556
|
-
# "papa",
|
556
|
+
# "papa", # now in sandbox
|
557
557
|
"on",
|
558
558
|
"me",
|
559
559
|
"ipa",
|
@@ -591,7 +591,7 @@ FALSE_POS_SYLLABIC = {
|
|
591
591
|
"oposite",
|
592
592
|
"anime",
|
593
593
|
"potato",
|
594
|
-
|
594
|
+
"japan",
|
595
595
|
"nose",
|
596
596
|
"kilo",
|
597
597
|
"alone",
|
@@ -629,17 +629,20 @@ FALSE_POS_SYLLABIC = {
|
|
629
629
|
"awaken",
|
630
630
|
"eliminate",
|
631
631
|
"elite",
|
632
|
-
"misuse",
|
633
632
|
"emanate",
|
634
633
|
"iluminate",
|
635
634
|
"imense",
|
636
635
|
"imitate",
|
636
|
+
"injoke",
|
637
637
|
"insane",
|
638
638
|
"insolate",
|
639
639
|
"insulate",
|
640
640
|
"intense",
|
641
641
|
"lemon",
|
642
642
|
"manipulate",
|
643
|
+
"misuse",
|
644
|
+
"ne", # "no" in many other languages
|
645
|
+
"wana",
|
643
646
|
}
|
644
647
|
|
645
648
|
FALSE_POS_ALPHABETIC: Set[str] = {
|