sonatoki-0.8.2-py3-none-any.whl → sonatoki-0.8.4-py3-none-any.whl

sonatoki/Configs.py CHANGED
@@ -33,6 +33,7 @@ from sonatoki.Scorers import Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
 from sonatoki.Preprocessors import (
+    RECOMMENDED_PREPROCESSORS,
     URLs,
     Emoji,
     Codeblock,
@@ -94,7 +95,7 @@ BaseConfig: IloConfig = {
 
 
 PrefConfig: IloConfig = {
-    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
+    "preprocessors": RECOMMENDED_PREPROCESSORS,
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
@@ -109,7 +110,7 @@ PrefConfig: IloConfig = {
 }
 
 CorpusConfig: IloConfig = {
-    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
+    "preprocessors": RECOMMENDED_PREPROCESSORS,
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
@@ -128,7 +129,7 @@ CorpusConfig: IloConfig = {
 }
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
-    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
+    "preprocessors": RECOMMENDED_PREPROCESSORS,
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [Alphabetic, NimiUCSUR, PuName, Miscellaneous],
@@ -138,7 +139,7 @@ LazyConfig: IloConfig = {
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
-    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
+    "preprocessors": RECOMMENDED_PREPROCESSORS,
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
sonatoki/Preprocessors.py CHANGED
@@ -19,6 +19,7 @@ It is up to the user to order them appropriately.
 # STL
 import re
 from abc import ABC, abstractmethod
+from typing import List, Type
 
 # PDM
 import emoji
@@ -74,6 +75,27 @@ class URLs(RegexPreprocessor):
     pattern = re.compile(r"https?:\/\/\S+")
 
 
+class MarkdownURLs(RegexPreprocessor):
+    """Remove URLs in markdown format, replacing them with their corresponding
+    text."""
+
+    pattern = re.compile(r"\[(.+?)\]\(https?:\/\/\S+\)")
+    replace = r"\1"
+
+
+class Emails(RegexPreprocessor):
+    """Attempt to remove emails, for a particularly strong definition of
+    "email".
+
+    https://www.regular-expressions.info/email.html
+    """
+
+    pattern = re.compile(
+        r"\b[a-zA-Z0-9._%+-]{2,}@[a-zA-Z0-9.-]{2,}\.[a-zA-Z]{2,24}\b",
+        flags=re.IGNORECASE,
+    )
+
+
 class Reference(RegexPreprocessor):
     """Remove text contained in double brackets.
 
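
Both new preprocessors are plain regex substitutions over the RegexPreprocessor base, so their behavior can be checked standalone with re using the exact patterns from this hunk. The sample strings are illustrative, and the Emails replacement string is assumed to be empty since the base class default is not shown here:

import re

# MarkdownURLs keeps the link text and drops the URL (replace = r"\1").
md_urls = re.compile(r"\[(.+?)\]\(https?:\/\/\S+\)")
print(md_urls.sub(r"\1", "o lukin e [lipu ni](https://example.com/lipu)."))
# -> o lukin e lipu ni.

# Emails strips anything matching the strict email shape above.
emails = re.compile(
    r"\b[a-zA-Z0-9._%+-]{2,}@[a-zA-Z0-9.-]{2,}\.[a-zA-Z]{2,24}\b",
    flags=re.IGNORECASE,
)
print(emails.sub("", "o toki tawa jan@example.com a!"))
# -> o toki tawa  a!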
@@ -209,6 +231,21 @@ class ZeroWidths(RegexPreprocessor):
     pattern = re.compile("[\\U0000200C-\\U0000200D]")
 
 
+RECOMMENDED_PREPROCESSORS: List[Type[Preprocessor]] = [
+    # These are sorted by the "strength" of their definition, which would be roughly
+    # "How confidently have we matched this object?"
+    # Additionally, MarkdownURLs must come before URLs, and Emoji must come last due to
+    # its ability to appear in all of the others.
+    Codeblock,
+    AngleBracketObject,
+    Reference,
+    MarkdownURLs,
+    URLs,
+    Emails,
+    Emoji,
+]
+
+
 __all__ = [
     "AllQuotes",
     "AngleBracketObject",
@@ -219,10 +256,13 @@ __all__ = [
     "DiscordMentions",
     "DiscordSpecial",
     "DoubleQuotes",
+    "Emails",
+    "Emoji",
+    "MarkdownURLs",
+    "RECOMMENDED_PREPROCESSORS",
     "Reference",
     "SingleQuotes",
     "Spoilers",
     "URLs",
     "ZeroWidths",
-    "Emoji",
 ]
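
The ordering constraint in the list's comment is observable with the raw patterns: if URLs ran before MarkdownURLs, it would consume a markdown link's URL through the closing parenthesis and leave broken markup that MarkdownURLs can no longer match. A standalone re sketch:

import re

md_urls = re.compile(r"\[(.+?)\]\(https?:\/\/\S+\)")
urls = re.compile(r"https?:\/\/\S+")
text = "o lukin e [lipu ni](https://example.com/lipu)"

# Recommended order: MarkdownURLs before URLs preserves the link text.
print(urls.sub(" ", md_urls.sub(r"\1", text)))  # o lukin e lipu ni

# Reversed order: URLs eats through the closing ")", stranding "[lipu ni](".
print(md_urls.sub(r"\1", urls.sub(" ", text)))  # o lukin e [lipu ni](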
sonatoki/constants.py CHANGED
@@ -518,8 +518,6 @@ ALPHABETICS = Path(__file__).resolve().parent / Path("alphabetic.txt")
 VOWELS = "aeiou"
 CONSONANTS = "jklmnpstw"
 ALPHABET = VOWELS + CONSONANTS
-
-LANGUAGE = "english"  # for NLTK
 """Commonly occurring strings which are some kind of valid Toki Pona or
 external token."""
 ALLOWABLES = {
{sonatoki-0.8.2.dist-info → sonatoki-0.8.4.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.8.2
+Version: 0.8.4
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
@@ -97,8 +97,9 @@ I originally intended to translate this file and library into Toki Pona once Uni
 ### What's the deal with the tokenizers?
 
 The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` attempts to tokenize statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
-This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them such as `'` or `-`.
+This design is highly undesirable for NLTK's English tokenizer because words in languages other than Toki Pona can have punctuation characters in or around them which are part of the word.
 Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
+However, this tokenizer doesn't ignore intra-word punctuation entirely. Instead, exactly one of `-` or `'` is allowed at a time, so long as both of its neighbors are writing characters. This increases the accuracy of the tokenizer significantly, and makes identifying Toki Pona sentences among arbitrary ones similarly more accurate.
 
 The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.
 
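The intra-word punctuation rule added to the README can be approximated with a single regex; this is an illustration of the rule as described, not the tokenizer's actual implementation:

import re

# Approximation: runs of letters, where any single interior "-" or "'"
# must have a letter on both sides; "--" or a trailing "'" breaks the word.
word = re.compile(r"[a-z]+(?:[-'][a-z]+)*", re.IGNORECASE)

print(word.findall("isn't a sitelen--pona test"))
# -> ["isn't", 'a', 'sitelen', 'pona', 'test']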
{sonatoki-0.8.2.dist-info → sonatoki-0.8.4.dist-info}/RECORD CHANGED
@@ -1,16 +1,16 @@
-sonatoki-0.8.2.dist-info/METADATA,sha256=-B-LR4O8O16t7Ond150qs5Il9j08wWnRa76q3sjjszA,6517
-sonatoki-0.8.2.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
-sonatoki-0.8.2.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
+sonatoki-0.8.4.dist-info/METADATA,sha256=Nui-Em5-CwsiOt5mkyhF5bb6WM9VQ6sp9UlENnH5Udw,6893
+sonatoki-0.8.4.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
+sonatoki-0.8.4.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
 sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
-sonatoki/Configs.py,sha256=h6-igZbhbYoYA0gJLrd3YCa5annTqacsAMGB1dX3v9A,4758
+sonatoki/Configs.py,sha256=-R-rTPUJfuSintpvC4UnOF1B9B93-Ooh_jmkZwhKvtk,4669
 sonatoki/Filters.py,sha256=rBEJrY_R6koFpoYl4yfo_9UR-i21HbvlUF0ORg1g0WE,13411
-sonatoki/Preprocessors.py,sha256=5xKBifsaHMm_fg8nQq4IdyLBGKe8SuWXg67-O5tl1qM,6043
+sonatoki/Preprocessors.py,sha256=RmzkvPVo6Kdx1rZ5HeR9cTtx6oxpp2iLKrOMCUEqIrM,7107
 sonatoki/Scorers.py,sha256=aCU3p9rD4QOy-uu851FGGw-ARqUCG_l4V_z5rtRL420,5236
 sonatoki/Tokenizers.py,sha256=8lpC70bzXOpHyhVr5bmqpYKmdmQvJdf7X5-Icc9RRCw,5040
 sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/__main__.py,sha256=6n4kUF80APl6a0jV46h_ncHNuQbrLpZ_nAmiNAakiag,5673
 sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
-sonatoki/constants.py,sha256=y1ZyuvCC1geTWBEnHiOHPWBxisf5McMnJn4LzGd7RFI,19358
+sonatoki/constants.py,sha256=Or9VEk19N8Bef14ZRgcP-r0F9qyVByv4CImqpoJ8I6Q,19324
 sonatoki/ilo.py,sha256=Dsn0yagkwjqpAQoCj6mkZ6NqWeanRF2lxNDNoqjWGLo,5993
 sonatoki/linku.json,sha256=d72Dvht-a4gBmdqLLI8mElvo83zSpbxDmxJj05hOudM,295413
 sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -18,4 +18,4 @@ sonatoki/sandbox.json,sha256=44csrQDaVtV-n8OyewabX1J9MmUFCsPct5C8E5Xuc58,140197
 sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
 sonatoki/types.py,sha256=zoVJeaDLOPstREiHtoD9pv-AOCsJq2C4_GG3nTYd114,1267
 sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
-sonatoki-0.8.2.dist-info/RECORD,,
+sonatoki-0.8.4.dist-info/RECORD,,