sonatoki 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Configs.py +6 -0
- sonatoki/ilo.py +20 -2
- {sonatoki-0.10.1.dist-info → sonatoki-0.11.0.dist-info}/METADATA +4 -3
- {sonatoki-0.10.1.dist-info → sonatoki-0.11.0.dist-info}/RECORD +7 -7
- {sonatoki-0.10.1.dist-info → sonatoki-0.11.0.dist-info}/WHEEL +0 -0
- {sonatoki-0.10.1.dist-info → sonatoki-0.11.0.dist-info}/entry_points.txt +0 -0
- {sonatoki-0.10.1.dist-info → sonatoki-0.11.0.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
@@ -81,6 +81,7 @@ class IloConfig(TypedDict):
|
|
81
81
|
scoring_filters: List[Type[Filter]]
|
82
82
|
scorer: Type[Scorer]
|
83
83
|
passing_score: Number
|
84
|
+
empty_passes: bool
|
84
85
|
word_tokenizer: NotRequired[Type[Tokenizer]]
|
85
86
|
sent_tokenizer: NotRequired[Type[Tokenizer]]
|
86
87
|
|
@@ -94,6 +95,7 @@ BaseConfig: IloConfig = {
|
|
94
95
|
"scoring_filters": [],
|
95
96
|
"scorer": PassFail,
|
96
97
|
"passing_score": 0.8,
|
98
|
+
"empty_passes": True,
|
97
99
|
}
|
98
100
|
|
99
101
|
|
@@ -110,6 +112,7 @@ PrefConfig: IloConfig = {
|
|
110
112
|
],
|
111
113
|
"scorer": SoftScaling,
|
112
114
|
"passing_score": 0.8,
|
115
|
+
"empty_passes": True,
|
113
116
|
}
|
114
117
|
|
115
118
|
CorpusConfig: IloConfig = {
|
@@ -132,6 +135,7 @@ CorpusConfig: IloConfig = {
|
|
132
135
|
],
|
133
136
|
"scorer": SoftScaling,
|
134
137
|
"passing_score": 0.8,
|
138
|
+
"empty_passes": True, # my client doesn't fail empty sentences; it just omits them
|
135
139
|
}
|
136
140
|
"""Mimics the previous implementation of ilo pi toki pona taso."""
|
137
141
|
LazyConfig: IloConfig = {
|
@@ -142,6 +146,7 @@ LazyConfig: IloConfig = {
|
|
142
146
|
"scorer": SoftPassFail,
|
143
147
|
"passing_score": 0.8,
|
144
148
|
"word_tokenizer": WordTokenizerRe, # mimics old tokenizer
|
149
|
+
"empty_passes": True,
|
145
150
|
}
|
146
151
|
"""This is extremely silly."""
|
147
152
|
IsipinEpikuConfig: IloConfig = {
|
@@ -162,6 +167,7 @@ IsipinEpikuConfig: IloConfig = {
|
|
162
167
|
],
|
163
168
|
"scorer": SoftScaling,
|
164
169
|
"passing_score": 0.8,
|
170
|
+
"empty_passes": True,
|
165
171
|
}
|
166
172
|
|
167
173
|
|
sonatoki/ilo.py
CHANGED
@@ -20,6 +20,7 @@ class Ilo:
|
|
20
20
|
__scorer: Type[Scorer]
|
21
21
|
__sentence_scorer: Type[SentenceScorer]
|
22
22
|
__passing_score: Number
|
23
|
+
__empty_passes: bool
|
23
24
|
|
24
25
|
def __init__(
|
25
26
|
self,
|
@@ -29,6 +30,7 @@ class Ilo:
|
|
29
30
|
scoring_filters: List[Type[Filter]],
|
30
31
|
scorer: Type[Scorer],
|
31
32
|
passing_score: Number,
|
33
|
+
empty_passes: bool = True,
|
32
34
|
sentence_scorer: Type[SentenceScorer] = SentNoOp,
|
33
35
|
word_tokenizer: Type[Tokenizer] = WordTokenizer,
|
34
36
|
sent_tokenizer: Type[Tokenizer] = SentTokenizer,
|
@@ -44,6 +46,7 @@ class Ilo:
|
|
44
46
|
self.__scorer = scorer
|
45
47
|
self.__sentence_scorer = sentence_scorer
|
46
48
|
self.__passing_score = passing_score
|
49
|
+
self.__empty_passes = empty_passes
|
47
50
|
|
48
51
|
def preprocess(self, msg: str) -> str:
|
49
52
|
for p in self.__preprocessors:
|
@@ -108,6 +111,10 @@ class Ilo:
|
|
108
111
|
filtered = self.filter_tokens(tokenized)
|
109
112
|
cleaned = self.clean_tokens(filtered)
|
110
113
|
score = self.score_tokens(cleaned)
|
114
|
+
if not self.__empty_passes and not cleaned:
|
115
|
+
# NOTE: filtered will already be empty
|
116
|
+
# but clean_tokens can *technically* omit tokens too
|
117
|
+
score = 0
|
111
118
|
|
112
119
|
scorecard: Scorecard = {
|
113
120
|
"text": message,
|
@@ -119,10 +126,15 @@ class Ilo:
|
|
119
126
|
|
120
127
|
return scorecard
|
121
128
|
|
129
|
+
def make_scorecard(self, message: str) -> Scorecard:
|
130
|
+
"""Preprocess a message, then create and return a `Scorecard` for that
|
131
|
+
message."""
|
132
|
+
message = self.preprocess(message)
|
133
|
+
return self._is_toki_pona(message)
|
134
|
+
|
122
135
|
def is_toki_pona(self, message: str) -> bool:
|
123
136
|
"""Determines whether a text is or is not Toki Pona."""
|
124
|
-
|
125
|
-
scorecard = self._is_toki_pona(message)
|
137
|
+
scorecard = self.make_scorecard(message)
|
126
138
|
return scorecard["score"] >= self.__passing_score
|
127
139
|
|
128
140
|
def _are_toki_pona(self, message: str) -> List[Scorecard]:
|
@@ -139,6 +151,12 @@ class Ilo:
|
|
139
151
|
scorecards = self.score_sentences(scorecards)
|
140
152
|
return scorecards
|
141
153
|
|
154
|
+
def make_scorecards(self, message: str) -> List[Scorecard]:
|
155
|
+
"""Preprocess a message, then create and return a `Scorecard` for each
|
156
|
+
sentence in that message."""
|
157
|
+
message = self.preprocess(message)
|
158
|
+
return self._are_toki_pona(message)
|
159
|
+
|
142
160
|
def are_toki_pona(self, message: str) -> List[bool]:
|
143
161
|
"""Splits a statement into sentences, then determines if each is or is not Toki Pona.
|
144
162
|
NOTE: You will need to decide how to score the result. Examples:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sonatoki
|
3
|
-
Version: 0.10.1
|
3
|
+
Version: 0.11.0
|
4
4
|
Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
|
5
5
|
Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
|
6
6
|
License: AGPL-3.0-or-later
|
@@ -99,9 +99,10 @@ I originally intended to translate this file and library into Toki Pona once Uni
|
|
99
99
|
The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` attempts to tokenize statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
|
100
100
|
This design is highly undesirable for NLTK's English tokenizer because words in languages other than Toki Pona can have punctuation characters in or around them which are part of the word.
|
101
101
|
Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
|
102
|
-
However, this tokenizer doesn't ignore intra-word punctuation entirely. Instead, exactly one of
|
102
|
+
However, this tokenizer doesn't ignore intra-word punctuation entirely. Instead, exactly one of `-`, `'`, or `.` is allowed at a time, so long as both of its neighbors are writing characters. This increases the accuracy of the tokenizer significantly, which increases the scoring accuracy of the library, ultimately making it easier to correctly identify Toki Pona sentences among arbitrary ones.
|
103
|
+
The reason for this is that, while Toki Pona itself does not have intra-word punctuation, Toki Pona may be written in mixed language environments; a more accurate tokenizer is less likely to produce sentence fragments, and accurately captured sentences are more likely to be identified as the actual language they are written in.
|
103
104
|
|
104
|
-
The
|
105
|
+
Another note: The intent of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.
|
105
106
|
|
106
107
|
### Aren't there a lot of false positives?
|
107
108
|
|
@@ -1,9 +1,9 @@
|
|
1
|
-
sonatoki-0.
|
2
|
-
sonatoki-0.
|
3
|
-
sonatoki-0.
|
4
|
-
sonatoki-0.
|
1
|
+
sonatoki-0.11.0.dist-info/METADATA,sha256=LXxje9dMJHy-2LGPS0nSRre5jWq4qvGU2bLEpJsUbpo,7303
|
2
|
+
sonatoki-0.11.0.dist-info/WHEEL,sha256=thaaA2w1JzcGC48WYufAs8nrYZjJm8LqNfnXFOFyCC4,90
|
3
|
+
sonatoki-0.11.0.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
|
4
|
+
sonatoki-0.11.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
|
5
5
|
sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
|
6
|
-
sonatoki/Configs.py,sha256=
|
6
|
+
sonatoki/Configs.py,sha256=cQizs-wqgtM9T9F4kkUsDHpIXN91p4FAgSMefDl333s,5114
|
7
7
|
sonatoki/Filters.py,sha256=8HAtR6_Rk6GPboaS_MHwSjZBJxYnAA8kYbRPI0eR6sM,14823
|
8
8
|
sonatoki/Preprocessors.py,sha256=RmzkvPVo6Kdx1rZ5HeR9cTtx6oxpp2iLKrOMCUEqIrM,7107
|
9
9
|
sonatoki/Scorers.py,sha256=zkdWc0hbtCX1HPdhI2tu2mL4Z5_S5sv7T83MefE4Yik,7756
|
@@ -12,11 +12,11 @@ sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
12
|
sonatoki/__main__.py,sha256=394ldEB4tFpw1UJLV4S4jJ55NfyLgH8rE7o3VWJoGik,6650
|
13
13
|
sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
|
14
14
|
sonatoki/constants.py,sha256=KhE385XkF-64bGBxkQNisu7vODsjpOfmrL8bnyQP_1k,20572
|
15
|
-
sonatoki/ilo.py,sha256=
|
15
|
+
sonatoki/ilo.py,sha256=MWoONZaYh8h92ZrMlG-MkNktFyqHX8Jb5zOD57800KI,6755
|
16
16
|
sonatoki/linku.json,sha256=U5KVxFJSageQydXXDsQCT8X_QoNAK2OaZhJmbu0eoZo,299939
|
17
17
|
sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
18
|
sonatoki/sandbox.json,sha256=QAviQZ7_nwstUr1ejKegxiIoYmBL2YJIoiZovDYNFL8,147485
|
19
19
|
sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
|
20
20
|
sonatoki/types.py,sha256=VjYSGAzsbR_d3mg8n-VHg__7LyXpmGdEIMDsbPHyxFw,1265
|
21
21
|
sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
|
22
|
-
sonatoki-0.
|
22
|
+
sonatoki-0.11.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|