sonatoki 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Configs.py CHANGED
@@ -11,12 +11,12 @@ from sonatoki.Filters import (
11
11
  And,
12
12
  Not,
13
13
  Filter,
14
+ PuName,
14
15
  Numeric,
15
16
  NimiUCSUR,
16
17
  Alphabetic,
17
18
  NimiKuLili,
18
19
  NimiKuSuli,
19
- ProperName,
20
20
  Punctuation,
21
21
  LongSyllabic,
22
22
  Miscellaneous,
@@ -131,7 +131,7 @@ LazyConfig: IloConfig = {
131
131
  "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
132
132
  "cleaners": [ConsecutiveDuplicates],
133
133
  "ignoring_filters": [Numeric, Punctuation],
134
- "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
134
+ "scoring_filters": [Alphabetic, NimiUCSUR, PuName, Miscellaneous],
135
135
  "scorer": SoftPassFail,
136
136
  "passing_score": 0.8,
137
137
  "word_tokenizer": WordTokenizerRe, # mimics old tokenizer
sonatoki/Filters.py CHANGED
@@ -141,8 +141,27 @@ class FalsePosAlphabetic(MemberFilter):
141
141
 
142
142
 
143
143
  class ProperName(Filter):
144
- """Determines if a given token is a valid name (also called a loan word).
145
- When Toki Pona is written with the Latin alphabet, names are generally
144
+ """Determine if a given token is a valid name based on a reasonable weakening of
145
+ the rules given in Toki Pona: The Language of Good. A token matches if it has a capital
146
+ letter at its start and is **not** fully capitalized.
147
+
148
+ This corrects an issue with PuName, where scripts lacking a case distinction are
149
+ errantly counted"""
150
+
151
+ @classmethod
152
+ @override
153
+ @cache(maxsize=None)
154
+ def filter(cls, token: str) -> bool:
155
+ first_capitalized = token[0].isupper()
156
+ all_caps = token.isupper()
157
+
158
+ return first_capitalized and not all_caps
159
+
160
+
161
+ class PuName(Filter):
162
+ """Determine if a given token is a valid name (also called a loan word) based on
163
+ the rules given in Toki Pona: The Language of Good.
164
+ When Toki Pona is written with the Latin alphabet, names are
146
165
  capitalized at their start. This filter identifies those tokens.
147
166
 
148
167
  Note that this alone cannot determine if a token is a valid name,
@@ -156,6 +175,9 @@ class ProperName(Filter):
156
175
  @override
157
176
  @cache(maxsize=None)
158
177
  def filter(cls, token: str) -> bool:
178
+ # first_capitalized = token[0].isupper()
179
+ # rest_capitalized = token[1:] == token[1:].upper()
180
+ # return first_capitalized and not rest_capitalized
159
181
  return token == token.capitalize()
160
182
  # TODO: If the token is in a script which doesn't have a case distinction,
161
183
  # this will errantly match.
@@ -445,6 +467,7 @@ __all__ = [
445
467
  "Or",
446
468
  "Phonotactic",
447
469
  "ProperName",
470
+ "PuName",
448
471
  "Punctuation",
449
472
  "Syllabic",
450
473
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sonatoki
3
- Version: 0.7.0
3
+ Version: 0.8.0
4
4
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
5
5
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
6
6
  License: AGPL-3.0-or-later
@@ -1,9 +1,9 @@
1
- sonatoki-0.7.0.dist-info/METADATA,sha256=s6w7_WaARQijvFIFIWtg8hL2WzAkj19N7-DsKgfhi3s,6517
2
- sonatoki-0.7.0.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
3
- sonatoki-0.7.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
1
+ sonatoki-0.8.0.dist-info/METADATA,sha256=52aTVq7ljGFzYm1Pdh9tKaRN3IVfXruRJZbwIiAPi9w,6517
2
+ sonatoki-0.8.0.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
3
+ sonatoki-0.8.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
4
4
  sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
5
- sonatoki/Configs.py,sha256=rIvrkYjeJeCuWwJIjvmJX6keRZcUJ0pt7h7KdYT5IFI,4766
6
- sonatoki/Filters.py,sha256=cJ5skX9yeqd4HvjzPxIAswigRWvO0ZV2nepQksFedtk,12575
5
+ sonatoki/Configs.py,sha256=h6-igZbhbYoYA0gJLrd3YCa5annTqacsAMGB1dX3v9A,4758
6
+ sonatoki/Filters.py,sha256=rBEJrY_R6koFpoYl4yfo_9UR-i21HbvlUF0ORg1g0WE,13411
7
7
  sonatoki/Preprocessors.py,sha256=nY0_cmF4aEmGZxXc7ZEvhvf2BZO6GnrMUC8IqDwu47A,6034
8
8
  sonatoki/Scorers.py,sha256=aCU3p9rD4QOy-uu851FGGw-ARqUCG_l4V_z5rtRL420,5236
9
9
  sonatoki/Tokenizers.py,sha256=8lpC70bzXOpHyhVr5bmqpYKmdmQvJdf7X5-Icc9RRCw,5040
@@ -18,4 +18,4 @@ sonatoki/sandbox.json,sha256=44csrQDaVtV-n8OyewabX1J9MmUFCsPct5C8E5Xuc58,140197
18
18
  sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
19
19
  sonatoki/types.py,sha256=zoVJeaDLOPstREiHtoD9pv-AOCsJq2C4_GG3nTYd114,1267
20
20
  sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
21
- sonatoki-0.7.0.dist-info/RECORD,,
21
+ sonatoki-0.8.0.dist-info/RECORD,,