sonatoki 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- sonatoki/Configs.py +22 -1
- sonatoki/Preprocessors.py +6 -0
- sonatoki/constants.py +6 -3
- sonatoki/linku.json +1 -1
- sonatoki/sandbox.json +1 -1
- {sonatoki-0.5.1.dist-info → sonatoki-0.5.2.dist-info}/METADATA +17 -16
- {sonatoki-0.5.1.dist-info → sonatoki-0.5.2.dist-info}/RECORD +9 -9
- {sonatoki-0.5.1.dist-info → sonatoki-0.5.2.dist-info}/WHEEL +1 -1
- {sonatoki-0.5.1.dist-info → sonatoki-0.5.2.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# STL
|
2
2
|
from copy import deepcopy
|
3
|
-
from typing import List, Type, TypedDict
|
3
|
+
from typing import Set, List, Type, TypedDict, cast
|
4
4
|
|
5
5
|
# PDM
|
6
6
|
from typing_extensions import NotRequired
|
@@ -18,6 +18,7 @@ from sonatoki.Filters import (
|
|
18
18
|
NimiKuLili,
|
19
19
|
NimiKuSuli,
|
20
20
|
ProperName,
|
21
|
+
Phonotactic,
|
21
22
|
Punctuation,
|
22
23
|
LongSyllabic,
|
23
24
|
Miscellaneous,
|
@@ -102,6 +103,26 @@ CorpusConfig: IloConfig = {
|
|
102
103
|
"scorer": SoftScaling,
|
103
104
|
"passing_score": 0.8,
|
104
105
|
}
|
106
|
+
|
107
|
+
# TODO: create a mechanism to omit tokens from a filter with more granularity
|
108
|
+
__corpus_tokens_dict: Set[str] = cast(
|
109
|
+
Set[str],
|
110
|
+
CorpusConfig["scoring_filters"][
|
111
|
+
0
|
112
|
+
].tokens, # pyright: ignore[reportAttributeAccessIssue]
|
113
|
+
)
|
114
|
+
__corpus_tokens_dict -= {
|
115
|
+
"an",
|
116
|
+
"i",
|
117
|
+
"me",
|
118
|
+
"ne",
|
119
|
+
"se",
|
120
|
+
"take",
|
121
|
+
"ten",
|
122
|
+
"to",
|
123
|
+
"u",
|
124
|
+
"we",
|
125
|
+
}
|
105
126
|
"""Mimics the previous implementation of ilo pi toki pona taso."""
|
106
127
|
LazyConfig: IloConfig = {
|
107
128
|
"preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
|
sonatoki/Preprocessors.py
CHANGED
@@ -90,6 +90,12 @@ class DiscordEmotes(RegexPreprocessor):
|
|
90
90
|
pattern = re.compile(r"<a?:[a-zA-Z0-9_]{2,}:[0-9]{2,}>")
|
91
91
|
|
92
92
|
|
93
|
+
class ColonEmotes(RegexPreprocessor):
|
94
|
+
"""Remove colon-marked emotes `:name:`"""
|
95
|
+
|
96
|
+
pattern = re.compile(r":[a-zA-Z0-9_]{2,}:")
|
97
|
+
|
98
|
+
|
93
99
|
class DiscordMentions(RegexPreprocessor):
|
94
100
|
pattern = re.compile(r"<@[\!\&]?[0-9]{2,}>")
|
95
101
|
|
sonatoki/constants.py
CHANGED
@@ -553,7 +553,7 @@ FALSE_POS_SYLLABIC = {
|
|
553
553
|
"in",
|
554
554
|
"no",
|
555
555
|
"some",
|
556
|
-
# "papa",
|
556
|
+
# "papa", # now in sandbox
|
557
557
|
"on",
|
558
558
|
"me",
|
559
559
|
"ipa",
|
@@ -591,7 +591,7 @@ FALSE_POS_SYLLABIC = {
|
|
591
591
|
"oposite",
|
592
592
|
"anime",
|
593
593
|
"potato",
|
594
|
-
|
594
|
+
"japan",
|
595
595
|
"nose",
|
596
596
|
"kilo",
|
597
597
|
"alone",
|
@@ -629,17 +629,20 @@ FALSE_POS_SYLLABIC = {
|
|
629
629
|
"awaken",
|
630
630
|
"eliminate",
|
631
631
|
"elite",
|
632
|
-
"misuse",
|
633
632
|
"emanate",
|
634
633
|
"iluminate",
|
635
634
|
"imense",
|
636
635
|
"imitate",
|
636
|
+
"injoke",
|
637
637
|
"insane",
|
638
638
|
"insolate",
|
639
639
|
"insulate",
|
640
640
|
"intense",
|
641
641
|
"lemon",
|
642
642
|
"manipulate",
|
643
|
+
"misuse",
|
644
|
+
"ne", # "no" in many other languages
|
645
|
+
"wana",
|
643
646
|
}
|
644
647
|
|
645
648
|
FALSE_POS_ALPHABETIC: Set[str] = {
|