sonatoki-0.6.1-py3-none-any.whl → sonatoki-0.6.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Configs.py CHANGED
@@ -1,6 +1,6 @@
  # STL
  from copy import deepcopy
- from typing import Set, List, Type, TypedDict, cast
+ from typing import List, Type, TypedDict

  # PDM
  from typing_extensions import NotRequired
@@ -12,13 +12,11 @@ from sonatoki.Filters import (
      Not,
      Filter,
      Numeric,
-     Syllabic,
      NimiUCSUR,
      Alphabetic,
      NimiKuLili,
      NimiKuSuli,
      ProperName,
-     Phonotactic,
      Punctuation,
      LongSyllabic,
      Miscellaneous,
@@ -44,6 +42,34 @@ from sonatoki.Preprocessors import (
      AngleBracketObject,
  )

+ __DICT_PHONOMATCHES = {
+     # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
+     # In this case, all of these appear more often in English by a factor of at least 10.
+     "aka",  # also known as
+     "an",  # article
+     "api",  # API
+     "i",  # 1st person
+     "kana",  # japanese script
+     "me",  # 1st person singular, english
+     "ne",  # "no" in several languages
+     "nu",  # "new" in english, "now" in dutch
+     "se",  # spanish particle, english "see"
+     "take",  # acquire, perhaps forcefully or without permission
+     "ten",  # 10
+     "to",  # to, too
+     "je",  # 1st person pronoun, french
+     "u",  # no u
+     "we",  # 1st person plural, english
+     "wi",  # wii and discussions of syllables
+     "sole",  # singular, of shoe
+     # unexplored candidates for removal
+     # "omen",  # ominous
+     # "papa",  # father
+     # "lo",  # "lo" and "loo"
+     # "ewe",  # sheep
+     # "pa",  # father- eh?
+ }
+

  class IloConfig(TypedDict):
      preprocessors: List[Type[Preprocessor]]
@@ -92,8 +118,8 @@ CorpusConfig: IloConfig = {
              NimiLinkuCore,
              NimiLinkuCommon,
              NimiLinkuUncommon,
-             NimiLinkuObscure,
-             NimiLinkuSandbox,
+             NimiLinkuObscure(sub=__DICT_PHONOMATCHES),
+             NimiLinkuSandbox(sub=__DICT_PHONOMATCHES),
              NimiUCSUR,
              Miscellaneous,
          ),
@@ -104,40 +130,6 @@ CorpusConfig: IloConfig = {
      "scorer": SoftScaling,
      "passing_score": 0.8,
  }
- 
- # TODO: create a mechanism to omit tokens from a filter with more granularity
- __corpus_tokens_dict: Set[str] = cast(
-     Set[str],
-     CorpusConfig["scoring_filters"][
-         0
-     ].tokens,  # pyright: ignore[reportAttributeAccessIssue]
- )
- __corpus_tokens_dict -= {
-     # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
-     # In this case, all of these appear more often in English by a factor of at least 10.
-     "aka",  # also known as
-     "an",  # article
-     "api",  # API
-     "i",  # 1st person
-     "kana",  # japanese script
-     "me",  # 1st person
-     "ne",  # "no" in several languages
-     "nu",  # "new", now in dutch
-     "se",  # spanish particle, "see"
-     "take",  # acquire, perhaps forcefully or without permission
-     "ten",  # 10
-     "to",  # to, too
-     "u",  # no u
-     "we",  # 1st person plural
-     "wi",  # wii and discussions of syllables
-     "sole",  # singular, of shoe
-     # unexplored candidates for removal
-     # "omen",  # ominous
-     # "papa",  # father
-     # "lo",  # "lo" and "loo"
-     # "ewe",  # sheep
-     # "pa",  # father- eh?
- }
  """Mimics the previous implementation of ilo pi toki pona taso."""
  LazyConfig: IloConfig = {
      "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
sonatoki/Filters.py CHANGED
@@ -1,7 +1,8 @@
  # STL
  import re
  from abc import ABC, abstractmethod
- from typing import Set, List, Type
+ from copy import deepcopy
+ from typing import Set, List, Type, Optional
  from functools import lru_cache as cache  # cache comes in 3.9

  # PDM
@@ -101,6 +102,20 @@ class MemberFilter(Filter):
      def filter(cls, token: str) -> bool:
          return token.lower() in cls.tokens

+     def __new__(
+         cls, add: Optional[Set[str]] = None, sub: Optional[Set[str]] = None
+     ) -> Type[Filter]:
+         parent_tokens = deepcopy(cls.tokens)
+         if add:
+             parent_tokens = parent_tokens.union(add)
+         if sub:
+             parent_tokens -= sub
+
+         class AnonMemberFilter(MemberFilter):
+             tokens = parent_tokens
+
+         return AnonMemberFilter
+

  class SubsetFilter(Filter):
      tokens: Set[str]
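
The `__new__` override above gives every `MemberFilter` subclass a constructor-like syntax that returns a new filter class (not an instance) whose token set is the parent's with `add`/`sub` applied, leaving the parent's wordlist untouched. A self-contained sketch of the pattern with toy classes and a made-up wordlist (not sonatoki's real base classes or data):

```python
from copy import deepcopy
from typing import Optional, Set, Type


class Filter:
    """Toy stand-in for sonatoki's Filter base class."""

    @classmethod
    def filter(cls, token: str) -> bool:
        raise NotImplementedError


class MemberFilter(Filter):
    """Membership test against a class-level wordlist, as in the hunk above."""

    tokens: Set[str] = set()

    @classmethod
    def filter(cls, token: str) -> bool:
        return token.lower() in cls.tokens

    def __new__(
        cls, add: Optional[Set[str]] = None, sub: Optional[Set[str]] = None
    ) -> Type[Filter]:
        # Copy the parent's wordlist, then apply additions and removals.
        parent_tokens = deepcopy(cls.tokens)
        if add:
            parent_tokens = parent_tokens.union(add)
        if sub:
            parent_tokens -= sub

        # Return a derived *class*, so downstream code that stores and calls
        # filter types keeps working unchanged.
        class AnonMemberFilter(MemberFilter):
            tokens = parent_tokens

        return AnonMemberFilter


# Hypothetical wordlist, for illustration only.
class DemoWords(MemberFilter):
    tokens = {"api", "kana", "kijetesantakalu"}


Trimmed = DemoWords(sub={"api", "kana"})  # "calling" the class derives a new one
print(Trimmed.filter("kijetesantakalu"))  # True
print(Trimmed.filter("api"))              # False: subtracted from the copy
print(DemoWords.filter("api"))            # True: the original class is untouched
```

Because `__new__` returns a class rather than an instance, `__init__` is never invoked, and the deep copy keeps the derived filter from leaking changes back into the parent, which is exactly the hazard of the removed `__corpus_tokens_dict` mutation in Configs.py.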
sonatoki/constants.py CHANGED
@@ -501,7 +501,7 @@ ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
  SENTENCE_PUNCT = """.?!:;()[-]·•…"""
  # NOTE: quotes were previously included, but in TP they are *not* reliably sentence boundaries

- INTRA_WORD_PUNCT = """-'"""
+ INTRA_WORD_PUNCT = """-'’"""


  LINKU = Path(__file__).resolve().parent / Path("linku.json")
sonatoki-0.6.1.dist-info/METADATA → sonatoki-0.6.3.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sonatoki
- Version: 0.6.1
+ Version: 0.6.3
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
  License: AGPL-3.0-or-later
sonatoki-0.6.1.dist-info/RECORD → sonatoki-0.6.3.dist-info/RECORD RENAMED
@@ -1,20 +1,20 @@
- sonatoki-0.6.1.dist-info/METADATA,sha256=lHSEJsJlmARsWbMBZujSafV4R5JdgXhguqdR3YLUg_c,6517
- sonatoki-0.6.1.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
- sonatoki-0.6.1.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
+ sonatoki-0.6.3.dist-info/METADATA,sha256=AWtjziHObR8LdeB-QwIXaqWe-k8YQj9C0yDpa1_Y0Q0,6517
+ sonatoki-0.6.3.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
+ sonatoki-0.6.3.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
  sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
- sonatoki/Configs.py,sha256=RD6YUYW45pwIFx8ebJgGs5PhIhL9sjn_VqIg4zf3VUE,5697
- sonatoki/Filters.py,sha256=nVSmw5M4sEYA_8KI1fI53rMHkd9KO6yWbKfdxxExxN8,11700
+ sonatoki/Configs.py,sha256=l0OTEpbq6_IcNburV5pPTzRxsQ7_UCIugGv02adT8R8,5550
+ sonatoki/Filters.py,sha256=3daBdOagJtkb4Qx6p5F2cCUd21FfMIY62UDWrR6Jj2Q,12131
  sonatoki/Preprocessors.py,sha256=nN6xL6mvVAnWZjSNW8CaeLm8x4kK3dCoB-1WYqi0ANU,5763
  sonatoki/Scorers.py,sha256=LRQLgXKTU2VqhkMHFPVxyVt83DXf85_zrpDGk4ThU24,3811
  sonatoki/Tokenizers.py,sha256=8lpC70bzXOpHyhVr5bmqpYKmdmQvJdf7X5-Icc9RRCw,5040
  sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sonatoki/__main__.py,sha256=6n4kUF80APl6a0jV46h_ncHNuQbrLpZ_nAmiNAakiag,5673
  sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
- sonatoki/constants.py,sha256=FDZYYfst_hYPJ8Fhc9KDceFgOwOci4hUIPN9SnCBzFw,19209
+ sonatoki/constants.py,sha256=mPbU-X9PNzelOHVZn-8ZqR_ewKYNjDA6lj2XQpnuoRw,19212
  sonatoki/ilo.py,sha256=PWZa202Q4h7IjnLxmfgT93iAPJL7dqJbA97L9kQDPiA,5658
  sonatoki/linku.json,sha256=d72Dvht-a4gBmdqLLI8mElvo83zSpbxDmxJj05hOudM,295413
  sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sonatoki/sandbox.json,sha256=44csrQDaVtV-n8OyewabX1J9MmUFCsPct5C8E5Xuc58,140197
  sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
  sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
- sonatoki-0.6.1.dist-info/RECORD,,
+ sonatoki-0.6.3.dist-info/RECORD,,