sonatoki 0.6.2__py3-none-any.whl → 0.6.3__py3-none-any.whl
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- sonatoki/Configs.py +31 -39
- sonatoki/Filters.py +16 -1
- {sonatoki-0.6.2.dist-info → sonatoki-0.6.3.dist-info}/METADATA +1 -1
- {sonatoki-0.6.2.dist-info → sonatoki-0.6.3.dist-info}/RECORD +6 -6
- {sonatoki-0.6.2.dist-info → sonatoki-0.6.3.dist-info}/WHEEL +0 -0
- {sonatoki-0.6.2.dist-info → sonatoki-0.6.3.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# STL
|
2
2
|
from copy import deepcopy
|
3
|
-
from typing import Set, List, Type, TypedDict, cast
|
3
|
+
from typing import List, Type, TypedDict
|
4
4
|
|
5
5
|
# PDM
|
6
6
|
from typing_extensions import NotRequired
|
@@ -12,13 +12,11 @@ from sonatoki.Filters import (
|
|
12
12
|
Not,
|
13
13
|
Filter,
|
14
14
|
Numeric,
|
15
|
-
Syllabic,
|
16
15
|
NimiUCSUR,
|
17
16
|
Alphabetic,
|
18
17
|
NimiKuLili,
|
19
18
|
NimiKuSuli,
|
20
19
|
ProperName,
|
21
|
-
Phonotactic,
|
22
20
|
Punctuation,
|
23
21
|
LongSyllabic,
|
24
22
|
Miscellaneous,
|
@@ -44,6 +42,34 @@ from sonatoki.Preprocessors import (
|
|
44
42
|
AngleBracketObject,
|
45
43
|
)
|
46
44
|
|
45
|
+
__DICT_PHONOMATCHES = {
|
46
|
+
# Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
|
47
|
+
# In this case, all of these appear more often in English by a factor of at least 10.
|
48
|
+
"aka", # also known as
|
49
|
+
"an", # article
|
50
|
+
"api", # API
|
51
|
+
"i", # 1st person
|
52
|
+
"kana", # japanese script
|
53
|
+
"me", # 1st person singular, english
|
54
|
+
"ne", # "no" in several languages
|
55
|
+
"nu", # "new" in english, "now" in dutch
|
56
|
+
"se", # spanish particle, english "see"
|
57
|
+
"take", # acquire, perhaps forcefully or without permission
|
58
|
+
"ten", # 10
|
59
|
+
"to", # to, too
|
60
|
+
"je", # 1st person pronoun, french
|
61
|
+
"u", # no u
|
62
|
+
"we", # 1st person plural, english
|
63
|
+
"wi", # wii and discussions of syllables
|
64
|
+
"sole", # singular, of shoe
|
65
|
+
# unexplored candidates for removal
|
66
|
+
# "omen", # ominous
|
67
|
+
# "papa", # father
|
68
|
+
# "lo", # "lo" and "loo"
|
69
|
+
# "ewe", # sheep
|
70
|
+
# "pa", # father- eh?
|
71
|
+
}
|
72
|
+
|
47
73
|
|
48
74
|
class IloConfig(TypedDict):
|
49
75
|
preprocessors: List[Type[Preprocessor]]
|
@@ -92,8 +118,8 @@ CorpusConfig: IloConfig = {
|
|
92
118
|
NimiLinkuCore,
|
93
119
|
NimiLinkuCommon,
|
94
120
|
NimiLinkuUncommon,
|
95
|
-
NimiLinkuObscure,
|
96
|
-
NimiLinkuSandbox,
|
121
|
+
NimiLinkuObscure(sub=__DICT_PHONOMATCHES),
|
122
|
+
NimiLinkuSandbox(sub=__DICT_PHONOMATCHES),
|
97
123
|
NimiUCSUR,
|
98
124
|
Miscellaneous,
|
99
125
|
),
|
@@ -104,40 +130,6 @@ CorpusConfig: IloConfig = {
|
|
104
130
|
"scorer": SoftScaling,
|
105
131
|
"passing_score": 0.8,
|
106
132
|
}
|
107
|
-
|
108
|
-
# TODO: create a mechanism to omit tokens from a filter with more granularity
|
109
|
-
__corpus_tokens_dict: Set[str] = cast(
|
110
|
-
Set[str],
|
111
|
-
CorpusConfig["scoring_filters"][
|
112
|
-
0
|
113
|
-
].tokens, # pyright: ignore[reportAttributeAccessIssue]
|
114
|
-
)
|
115
|
-
__corpus_tokens_dict -= {
|
116
|
-
# Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
|
117
|
-
# In this case, all of these appear more often in English by a factor of at least 10.
|
118
|
-
"aka", # also known as
|
119
|
-
"an", # article
|
120
|
-
"api", # API
|
121
|
-
"i", # 1st person
|
122
|
-
"kana", # japanese script
|
123
|
-
"me", # 1st person
|
124
|
-
"ne", # "no" in several languages
|
125
|
-
"nu", # "new", now in dutch
|
126
|
-
"se", # spanish particle, "see"
|
127
|
-
"take", # acquire, perhaps forcefully or without permission
|
128
|
-
"ten", # 10
|
129
|
-
"to", # to, too
|
130
|
-
"u", # no u
|
131
|
-
"we", # 1st person plural
|
132
|
-
"wi", # wii and discussions of syllables
|
133
|
-
"sole", # singular, of shoe
|
134
|
-
# unexplored candidates for removal
|
135
|
-
# "omen", # ominous
|
136
|
-
# "papa", # father
|
137
|
-
# "lo", # "lo" and "loo"
|
138
|
-
# "ewe", # sheep
|
139
|
-
# "pa", # father- eh?
|
140
|
-
}
|
141
133
|
"""Mimics the previous implementation of ilo pi toki pona taso."""
|
142
134
|
LazyConfig: IloConfig = {
|
143
135
|
"preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
|
sonatoki/Filters.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
# STL
|
2
2
|
import re
|
3
3
|
from abc import ABC, abstractmethod
|
4
|
-
from
|
4
|
+
from copy import deepcopy
|
5
|
+
from typing import Set, List, Type, Optional
|
5
6
|
from functools import lru_cache as cache # cache comes in 3.9
|
6
7
|
|
7
8
|
# PDM
|
@@ -101,6 +102,20 @@ class MemberFilter(Filter):
|
|
101
102
|
def filter(cls, token: str) -> bool:
|
102
103
|
return token.lower() in cls.tokens
|
103
104
|
|
105
|
+
def __new__(
|
106
|
+
cls, add: Optional[Set[str]] = None, sub: Optional[Set[str]] = None
|
107
|
+
) -> Type[Filter]:
|
108
|
+
parent_tokens = deepcopy(cls.tokens)
|
109
|
+
if add:
|
110
|
+
parent_tokens = parent_tokens.union(add)
|
111
|
+
if sub:
|
112
|
+
parent_tokens -= sub
|
113
|
+
|
114
|
+
class AnonMemberFilter(MemberFilter):
|
115
|
+
tokens = parent_tokens
|
116
|
+
|
117
|
+
return AnonMemberFilter
|
118
|
+
|
104
119
|
|
105
120
|
class SubsetFilter(Filter):
|
106
121
|
tokens: Set[str]
|
@@ -1,9 +1,9 @@
|
|
1
|
-
sonatoki-0.6.
|
2
|
-
sonatoki-0.6.
|
3
|
-
sonatoki-0.6.
|
1
|
+
sonatoki-0.6.3.dist-info/METADATA,sha256=AWtjziHObR8LdeB-QwIXaqWe-k8YQj9C0yDpa1_Y0Q0,6517
|
2
|
+
sonatoki-0.6.3.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
|
3
|
+
sonatoki-0.6.3.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
|
4
4
|
sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
|
5
|
-
sonatoki/Configs.py,sha256=
|
6
|
-
sonatoki/Filters.py,sha256=
|
5
|
+
sonatoki/Configs.py,sha256=l0OTEpbq6_IcNburV5pPTzRxsQ7_UCIugGv02adT8R8,5550
|
6
|
+
sonatoki/Filters.py,sha256=3daBdOagJtkb4Qx6p5F2cCUd21FfMIY62UDWrR6Jj2Q,12131
|
7
7
|
sonatoki/Preprocessors.py,sha256=nN6xL6mvVAnWZjSNW8CaeLm8x4kK3dCoB-1WYqi0ANU,5763
|
8
8
|
sonatoki/Scorers.py,sha256=LRQLgXKTU2VqhkMHFPVxyVt83DXf85_zrpDGk4ThU24,3811
|
9
9
|
sonatoki/Tokenizers.py,sha256=8lpC70bzXOpHyhVr5bmqpYKmdmQvJdf7X5-Icc9RRCw,5040
|
@@ -17,4 +17,4 @@ sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
sonatoki/sandbox.json,sha256=44csrQDaVtV-n8OyewabX1J9MmUFCsPct5C8E5Xuc58,140197
|
18
18
|
sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
|
19
19
|
sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
|
20
|
-
sonatoki-0.6.
|
20
|
+
sonatoki-0.6.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|