sonatoki 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Configs.py CHANGED
@@ -81,6 +81,7 @@ class IloConfig(TypedDict):
     scoring_filters: List[Type[Filter]]
     scorer: Type[Scorer]
     passing_score: Number
+    empty_passes: bool
     word_tokenizer: NotRequired[Type[Tokenizer]]
     sent_tokenizer: NotRequired[Type[Tokenizer]]
 
@@ -94,6 +95,7 @@ BaseConfig: IloConfig = {
     "scoring_filters": [],
     "scorer": PassFail,
     "passing_score": 0.8,
+    "empty_passes": True,
 }
 
 
@@ -110,6 +112,7 @@ PrefConfig: IloConfig = {
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
+    "empty_passes": True,
 }
 
 CorpusConfig: IloConfig = {
@@ -132,6 +135,7 @@ CorpusConfig: IloConfig = {
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
+    "empty_passes": True,  # my client doesn't fail empty sentences; it just omits them
 }
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
@@ -142,6 +146,7 @@ LazyConfig: IloConfig = {
     "scorer": SoftPassFail,
     "passing_score": 0.8,
     "word_tokenizer": WordTokenizerRe,  # mimics old tokenizer
+    "empty_passes": True,
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
@@ -162,6 +167,7 @@ IsipinEpikuConfig: IloConfig = {
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
+    "empty_passes": True,
 }
 
 
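The new `empty_passes` key defaults to `True` in every bundled config, preserving the old behavior in which a message with no scorable tokens still passes. A minimal sketch of opting out, assuming an `IloConfig` is unpacked into the `Ilo` constructor as the project README suggests:

    from copy import deepcopy

    from sonatoki.Configs import CorpusConfig
    from sonatoki.ilo import Ilo

    # Copy a bundled config and disable the new flag so that messages
    # with no scorable tokens score 0 instead of passing vacuously.
    config = deepcopy(CorpusConfig)
    config["empty_passes"] = False

    ilo = Ilo(**config)
    print(ilo.is_toki_pona(""))  # False, since 0 < passing_score
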
sonatoki/constants.py CHANGED
@@ -723,8 +723,6 @@ ALL_UCSUR = NIMI_UCSUR + find_unicode_chars(UCSUR_PUNCT_RANGES)
 UCSUR_MINUS_CARTOUCHE = set(ALL_UCSUR).difference(
     {UCSUR_CARTOUCHE_LEFT, UCSUR_CARTOUCHE_RIGHT}
 )
-print(UCSUR_MINUS_CARTOUCHE)
-
 # NIMI_PU_UCSUR_RANGES = ["\\U000F1900-\\U000F1977"]
 # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]
 
sonatoki/ilo.py CHANGED
@@ -20,6 +20,7 @@ class Ilo:
     __scorer: Type[Scorer]
     __sentence_scorer: Type[SentenceScorer]
     __passing_score: Number
+    __empty_passes: bool
 
     def __init__(
         self,
@@ -29,6 +30,7 @@ class Ilo:
         scoring_filters: List[Type[Filter]],
         scorer: Type[Scorer],
         passing_score: Number,
+        empty_passes: bool = True,
         sentence_scorer: Type[SentenceScorer] = SentNoOp,
         word_tokenizer: Type[Tokenizer] = WordTokenizer,
         sent_tokenizer: Type[Tokenizer] = SentTokenizer,
@@ -44,6 +46,7 @@ class Ilo:
         self.__scorer = scorer
         self.__sentence_scorer = sentence_scorer
         self.__passing_score = passing_score
+        self.__empty_passes = empty_passes
 
     def preprocess(self, msg: str) -> str:
         for p in self.__preprocessors:
@@ -108,6 +111,10 @@ class Ilo:
         filtered = self.filter_tokens(tokenized)
         cleaned = self.clean_tokens(filtered)
         score = self.score_tokens(cleaned)
+        if not self.__empty_passes and not cleaned:
+            # NOTE: filtered will already be empty
+            # but clean_tokens can *technically* omit tokens too
+            score = 0
 
         scorecard: Scorecard = {
             "text": message,
@@ -119,10 +126,15 @@ class Ilo:
 
         return scorecard
 
+    def make_scorecard(self, message: str) -> Scorecard:
+        """Preprocess a message, then create and return a `Scorecard` for that
+        message."""
+        message = self.preprocess(message)
+        return self._is_toki_pona(message)
+
     def is_toki_pona(self, message: str) -> bool:
         """Determines whether a text is or is not Toki Pona."""
-        message = self.preprocess(message)
-        scorecard = self._is_toki_pona(message)
+        scorecard = self.make_scorecard(message)
         return scorecard["score"] >= self.__passing_score
 
     def _are_toki_pona(self, message: str) -> List[Scorecard]:
@@ -139,6 +151,12 @@ class Ilo:
         scorecards = self.score_sentences(scorecards)
         return scorecards
 
+    def make_scorecards(self, message: str) -> List[Scorecard]:
+        """Preprocess a message, then create and return a `Scorecard` for each
+        sentence in that message."""
+        message = self.preprocess(message)
+        return self._are_toki_pona(message)
+
     def are_toki_pona(self, message: str) -> List[bool]:
         """Splits a statement into sentences, then determines if each is or is not Toki Pona.
         NOTE: You will need to decide how to score the result. Examples:
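The new `make_scorecard` and `make_scorecards` methods expose the full `Scorecard` (including the `"text"` and `"score"` fields seen above) rather than only the boolean that `is_toki_pona` derives from it. A brief usage sketch, assuming `PrefConfig` as the configuration:

    from sonatoki.Configs import PrefConfig
    from sonatoki.ilo import Ilo

    ilo = Ilo(**PrefConfig)

    # Preprocess and score a whole message in one call.
    card = ilo.make_scorecard("mi olin e sina!")
    print(card["score"])  # is_toki_pona compares this to passing_score

    # Or score each sentence of a message separately.
    for card in ilo.make_scorecards("mi moku. sina lape."):
        print(card["text"], card["score"])
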
sonatoki-0.10.0.dist-info/METADATA → sonatoki-0.11.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.10.0
+Version: 0.11.0
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
@@ -99,9 +99,10 @@ I originally intended to translate this file and library into Toki Pona once Uni
 The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` attempts to tokenize statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
 This design is highly undesirable for NLTK's English tokenizer because words in languages other than Toki Pona can have punctuation characters in or around them which are part of the word.
 Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
-However, this tokenizer doesn't ignore intra-word punctuation entirely. Instead, exactly one of `-` or `'` is allowed at a time, so long as both of its neighbors are writing characters. This increases the accuracy of the tokenizer significantly, and makes identifying Toki Pona sentences among arbitrary ones similarly more accurate.
+However, this tokenizer doesn't ignore intra-word punctuation entirely. Instead, exactly one of `-`, `'`, or `.` is allowed at a time, so long as both of its neighbors are writing characters. This increases the accuracy of the tokenizer significantly, which increases the scoring accuracy of the library, ultimately making it easier to correctly identify Toki Pona sentences among arbitrary ones.
+The reason for this is that, while Toki Pona itself does not have intra-word punctuation, Toki Pona may be written in mixed language environments; a more accurate tokenizer is less likely to produce sentence fragments, and accurately captured sentences are more likely to be identified as the actual language they are written in.
 
-The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.
+Another note: The intent of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.
 
 ### Aren't there a lot of false positives?
 
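The intra-word punctuation rule described above can be tried directly, assuming `WordTokenizer.tokenize` takes a string and returns a list of token strings (the exact splits in the comments are illustrative):

    from sonatoki.Tokenizers import WordTokenizer

    # One `-`, `'`, or `.` flanked by writing characters stays in the word,
    # keeping non-Toki-Pona words like these intact:
    print(WordTokenizer.tokenize("don't semi-final"))  # ["don't", "semi-final"]

    # Punctuation not flanked by writing characters still becomes
    # its own complete non-candidate token:
    print(WordTokenizer.tokenize("toki a!!"))  # ["toki", "a", "!!"]
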
sonatoki-0.10.0.dist-info/RECORD → sonatoki-0.11.0.dist-info/RECORD RENAMED
@@ -1,9 +1,9 @@
-sonatoki-0.10.0.dist-info/METADATA,sha256=FS4LM5QUcxvHUY5Zq1IyT85MRAtJiq_sNWArztUI8D8,6894
-sonatoki-0.10.0.dist-info/WHEEL,sha256=thaaA2w1JzcGC48WYufAs8nrYZjJm8LqNfnXFOFyCC4,90
-sonatoki-0.10.0.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
-sonatoki-0.10.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
+sonatoki-0.11.0.dist-info/METADATA,sha256=LXxje9dMJHy-2LGPS0nSRre5jWq4qvGU2bLEpJsUbpo,7303
+sonatoki-0.11.0.dist-info/WHEEL,sha256=thaaA2w1JzcGC48WYufAs8nrYZjJm8LqNfnXFOFyCC4,90
+sonatoki-0.11.0.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
+sonatoki-0.11.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
 sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
-sonatoki/Configs.py,sha256=6TY-G1nZFGv5EcElatWvI5MagwVCo92D5TTl7s2PX_s,4899
+sonatoki/Configs.py,sha256=cQizs-wqgtM9T9F4kkUsDHpIXN91p4FAgSMefDl333s,5114
 sonatoki/Filters.py,sha256=8HAtR6_Rk6GPboaS_MHwSjZBJxYnAA8kYbRPI0eR6sM,14823
 sonatoki/Preprocessors.py,sha256=RmzkvPVo6Kdx1rZ5HeR9cTtx6oxpp2iLKrOMCUEqIrM,7107
 sonatoki/Scorers.py,sha256=zkdWc0hbtCX1HPdhI2tu2mL4Z5_S5sv7T83MefE4Yik,7756
@@ -11,12 +11,12 @@ sonatoki/Tokenizers.py,sha256=yAHqVF7G-bH5i7nsvYH-dMV2qjeKvLW2W2F-fgyUnR4,6783
 sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/__main__.py,sha256=394ldEB4tFpw1UJLV4S4jJ55NfyLgH8rE7o3VWJoGik,6650
 sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
-sonatoki/constants.py,sha256=0MWyk7a6Hq8hVxV-WJ6LEkuq8LnJy-qKUah6I3PbRns,20602
-sonatoki/ilo.py,sha256=Dsn0yagkwjqpAQoCj6mkZ6NqWeanRF2lxNDNoqjWGLo,5993
+sonatoki/constants.py,sha256=KhE385XkF-64bGBxkQNisu7vODsjpOfmrL8bnyQP_1k,20572
+sonatoki/ilo.py,sha256=MWoONZaYh8h92ZrMlG-MkNktFyqHX8Jb5zOD57800KI,6755
 sonatoki/linku.json,sha256=U5KVxFJSageQydXXDsQCT8X_QoNAK2OaZhJmbu0eoZo,299939
 sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/sandbox.json,sha256=QAviQZ7_nwstUr1ejKegxiIoYmBL2YJIoiZovDYNFL8,147485
 sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
 sonatoki/types.py,sha256=VjYSGAzsbR_d3mg8n-VHg__7LyXpmGdEIMDsbPHyxFw,1265
 sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
-sonatoki-0.10.0.dist-info/RECORD,,
+sonatoki-0.11.0.dist-info/RECORD,,