sonatoki-0.8.2-py3-none-any.whl → sonatoki-0.8.4-py3-none-any.whl

sonatoki/Configs.py CHANGED
@@ -33,6 +33,7 @@ from sonatoki.Scorers import Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
 from sonatoki.Preprocessors import (
+    RECOMMENDED_PREPROCESSORS,
     URLs,
     Emoji,
     Codeblock,
@@ -94,7 +95,7 @@ BaseConfig: IloConfig = {
 
 
 PrefConfig: IloConfig = {
-    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
+    "preprocessors": RECOMMENDED_PREPROCESSORS,
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
@@ -109,7 +110,7 @@ PrefConfig: IloConfig = {
 }
 
 CorpusConfig: IloConfig = {
-    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
+    "preprocessors": RECOMMENDED_PREPROCESSORS,
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
@@ -128,7 +129,7 @@ CorpusConfig: IloConfig = {
 }
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
-    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
+    "preprocessors": RECOMMENDED_PREPROCESSORS,
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [Alphabetic, NimiUCSUR, PuName, Miscellaneous],
@@ -138,7 +139,7 @@ LazyConfig: IloConfig = {
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
-    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
+    "preprocessors": RECOMMENDED_PREPROCESSORS,
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
sonatoki/Preprocessors.py CHANGED
@@ -19,6 +19,7 @@ It is up to the user to order them appropriately.
 # STL
 import re
 from abc import ABC, abstractmethod
+from typing import List, Type
 
 # PDM
 import emoji
@@ -74,6 +75,27 @@ class URLs(RegexPreprocessor):
     pattern = re.compile(r"https?:\/\/\S+")
 
 
+class MarkdownURLs(RegexPreprocessor):
+    """Remove URLs in markdown format, replacing them with their corresponding
+    text."""
+
+    pattern = re.compile(r"\[(.+?)\]\(https?:\/\/\S+\)")
+    replace = r"\1"
+
+
+class Emails(RegexPreprocessor):
+    """Attempt to remove emails, for a particularly strong definition of
+    "email".
+
+    https://www.regular-expressions.info/email.html
+    """
+
+    pattern = re.compile(
+        r"\b[a-zA-Z0-9._%+-]{2,}@[a-zA-Z0-9.-]{2,}\.[a-zA-Z]{2,24}\b",
+        flags=re.IGNORECASE,
+    )
+
+
 class Reference(RegexPreprocessor):
     """Remove text contained in double brackets.
 
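
Both new preprocessors are plain regex substitutions over the RegexPreprocessor base, so their behavior can be checked standalone with re using the exact patterns from this hunk. The sample strings are illustrative, and the Emails replacement string is assumed to be empty since the base class default is not shown here:

import re

# MarkdownURLs keeps the link text and drops the URL (replace = r"\1").
md_urls = re.compile(r"\[(.+?)\]\(https?:\/\/\S+\)")
print(md_urls.sub(r"\1", "o lukin e [lipu ni](https://example.com/lipu)."))
# -> o lukin e lipu ni.

# Emails strips anything matching the strict email shape above.
emails = re.compile(
    r"\b[a-zA-Z0-9._%+-]{2,}@[a-zA-Z0-9.-]{2,}\.[a-zA-Z]{2,24}\b",
    flags=re.IGNORECASE,
)
print(emails.sub("", "o toki tawa jan@example.com a!"))
# -> o toki tawa  a!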
@@ -209,6 +231,21 @@ class ZeroWidths(RegexPreprocessor):
     pattern = re.compile("[\\U0000200C-\\U0000200D]")
 
 
+RECOMMENDED_PREPROCESSORS: List[Type[Preprocessor]] = [
+    # These are sorted by the "strength" of their definition, which would be roughly
+    # "How confidently have we matched this object?"
+    # Additionally, MarkdownURLs must come before URLs, and Emoji must come last due to
+    # its ability to appear in all of the others.
+    Codeblock,
+    AngleBracketObject,
+    Reference,
+    MarkdownURLs,
+    URLs,
+    Emails,
+    Emoji,
+]
+
+
 __all__ = [
     "AllQuotes",
     "AngleBracketObject",
@@ -219,10 +256,13 @@ __all__ = [
     "DiscordMentions",
     "DiscordSpecial",
     "DoubleQuotes",
+    "Emails",
+    "Emoji",
+    "MarkdownURLs",
+    "RECOMMENDED_PREPROCESSORS",
     "Reference",
     "SingleQuotes",
     "Spoilers",
     "URLs",
     "ZeroWidths",
-    "Emoji",
 ]
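
The ordering constraint in the list's comment is observable with the raw patterns: if URLs ran before MarkdownURLs, it would consume a markdown link's URL through the closing parenthesis and leave broken markup that MarkdownURLs can no longer match. A standalone re sketch:

import re

md_urls = re.compile(r"\[(.+?)\]\(https?:\/\/\S+\)")
urls = re.compile(r"https?:\/\/\S+")
text = "o lukin e [lipu ni](https://example.com/lipu)"

# Recommended order: MarkdownURLs before URLs preserves the link text.
print(urls.sub(" ", md_urls.sub(r"\1", text)))  # o lukin e lipu ni

# Reversed order: URLs eats through the closing ")", stranding "[lipu ni](".
print(md_urls.sub(r"\1", urls.sub(" ", text)))  # o lukin e [lipu ni](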
sonatoki/constants.py CHANGED
@@ -518,8 +518,6 @@ ALPHABETICS = Path(__file__).resolve().parent / Path("alphabetic.txt")
 VOWELS = "aeiou"
 CONSONANTS = "jklmnpstw"
 ALPHABET = VOWELS + CONSONANTS
-
-LANGUAGE = "english"  # for NLTK
 """Commonly occurring strings which are some kind of valid Toki Pona or
 external token."""
 ALLOWABLES = {
{sonatoki-0.8.2.dist-info → sonatoki-0.8.4.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.8.2
+Version: 0.8.4
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
@@ -97,8 +97,9 @@ I originally intended to translate this file and library into Toki Pona once Uni
 ### What's the deal with the tokenizers?
 
 The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` attempts to tokenize statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
-This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them such as `'` or `-`.
+This design is highly undesirable for NLTK's English tokenizer because words in languages other than Toki Pona can have punctuation characters in or around them which are part of the word.
 Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
+However, this tokenizer doesn't ignore intra-word punctuation entirely. Instead, exactly one of `-` or `'` is allowed at a time, so long as both of its neighbors are writing characters. This increases the accuracy of the tokenizer significantly, and makes identifying Toki Pona sentences among arbitrary ones similarly more accurate.
 
 The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.
 
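The intra-word punctuation rule added to the README can be approximated with a single regex; this is an illustration of the rule as described, not the tokenizer's actual implementation:

import re

# Approximation: runs of letters, where any single interior "-" or "'"
# must have a letter on both sides; "--" or a trailing "'" breaks the word.
word = re.compile(r"[a-z]+(?:[-'][a-z]+)*", re.IGNORECASE)

print(word.findall("isn't a sitelen--pona test"))
# -> ["isn't", 'a', 'sitelen', 'pona', 'test']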
{sonatoki-0.8.2.dist-info → sonatoki-0.8.4.dist-info}/RECORD CHANGED
@@ -1,16 +1,16 @@
-sonatoki-0.8.2.dist-info/METADATA,sha256=-B-LR4O8O16t7Ond150qs5Il9j08wWnRa76q3sjjszA,6517
-sonatoki-0.8.2.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
-sonatoki-0.8.2.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
+sonatoki-0.8.4.dist-info/METADATA,sha256=Nui-Em5-CwsiOt5mkyhF5bb6WM9VQ6sp9UlENnH5Udw,6893
+sonatoki-0.8.4.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
+sonatoki-0.8.4.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
 sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
-sonatoki/Configs.py,sha256=h6-igZbhbYoYA0gJLrd3YCa5annTqacsAMGB1dX3v9A,4758
+sonatoki/Configs.py,sha256=-R-rTPUJfuSintpvC4UnOF1B9B93-Ooh_jmkZwhKvtk,4669
 sonatoki/Filters.py,sha256=rBEJrY_R6koFpoYl4yfo_9UR-i21HbvlUF0ORg1g0WE,13411
-sonatoki/Preprocessors.py,sha256=5xKBifsaHMm_fg8nQq4IdyLBGKe8SuWXg67-O5tl1qM,6043
+sonatoki/Preprocessors.py,sha256=RmzkvPVo6Kdx1rZ5HeR9cTtx6oxpp2iLKrOMCUEqIrM,7107
 sonatoki/Scorers.py,sha256=aCU3p9rD4QOy-uu851FGGw-ARqUCG_l4V_z5rtRL420,5236
 sonatoki/Tokenizers.py,sha256=8lpC70bzXOpHyhVr5bmqpYKmdmQvJdf7X5-Icc9RRCw,5040
 sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/__main__.py,sha256=6n4kUF80APl6a0jV46h_ncHNuQbrLpZ_nAmiNAakiag,5673
 sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
-sonatoki/constants.py,sha256=y1ZyuvCC1geTWBEnHiOHPWBxisf5McMnJn4LzGd7RFI,19358
+sonatoki/constants.py,sha256=Or9VEk19N8Bef14ZRgcP-r0F9qyVByv4CImqpoJ8I6Q,19324
 sonatoki/ilo.py,sha256=Dsn0yagkwjqpAQoCj6mkZ6NqWeanRF2lxNDNoqjWGLo,5993
 sonatoki/linku.json,sha256=d72Dvht-a4gBmdqLLI8mElvo83zSpbxDmxJj05hOudM,295413
 sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -18,4 +18,4 @@ sonatoki/sandbox.json,sha256=44csrQDaVtV-n8OyewabX1J9MmUFCsPct5C8E5Xuc58,140197
 sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
 sonatoki/types.py,sha256=zoVJeaDLOPstREiHtoD9pv-AOCsJq2C4_GG3nTYd114,1267
 sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
-sonatoki-0.8.2.dist-info/RECORD,,
+sonatoki-0.8.4.dist-info/RECORD,,