sonatoki 0.1.3.tar.gz → 0.1.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {sonatoki-0.1.3 → sonatoki-0.1.5}/PKG-INFO +1 -1
  2. {sonatoki-0.1.3 → sonatoki-0.1.5}/pyproject.toml +1 -1
  3. {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Configs.py +3 -3
  4. {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Filters.py +20 -7
  5. {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Preprocessors.py +48 -6
  6. {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Scorers.py +2 -14
  7. {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Tokenizers.py +22 -7
  8. sonatoki-0.1.5/src/sonatoki/constants.py +83 -0
  9. {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/ilo.py +0 -12
  10. {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_filters.py +10 -11
  11. {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_ilo.py +0 -1
  12. {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_preprocessors.py +40 -0
  13. {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_scorers.py +8 -6
  14. {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_sentences_tok.yml +18 -0
  15. sonatoki-0.1.3/src/sonatoki/constants.py +0 -67
  16. {sonatoki-0.1.3 → sonatoki-0.1.5}/LICENSE +0 -0
  17. {sonatoki-0.1.3 → sonatoki-0.1.5}/README.md +0 -0
  18. {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Cleaners.py +0 -0
  19. {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/__init__.py +0 -0
  20. {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/__main__.py +0 -0
  21. {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/linku.json +0 -0
  22. {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/sandbox.json +0 -0
  23. {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/__init__.py +0 -0
  24. {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_cleaners.py +0 -0
  25. {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_tokenize.py +0 -0
  26. {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_utils.py +0 -0
  27. {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_sentences.yml +0 -0
  28. {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_words.yml +0 -0
  29. {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.1.3 → sonatoki-0.1.5}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sonatoki
- Version: 0.1.3
+ Version: 0.1.5
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
  License: AGPL-3.0-or-later
{sonatoki-0.1.3 → sonatoki-0.1.5}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "sonatoki"
- version = "0.1.3"
+ version = "0.1.5"
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
  authors = [
  { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Configs.py
@@ -9,15 +9,15 @@ from typing_extensions import NotRequired
  from sonatoki.Filters import (
  Filter,
  NimiPu,
- Numerics,
+ Numeric,
  Syllabic,
  NimiLinku,
  NimiPuAle,
  Alphabetic,
  ProperName,
  Phonotactic,
+ Punctuation,
  NimiLinkuAle,
- Punctuations,
  )
  from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
  from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
@@ -45,7 +45,7 @@ class IloConfig(TypedDict):
  BaseConfig: IloConfig = {
  "preprocessors": [URLs],
  "cleaners": [ConsecutiveDuplicates],
- "ignoring_filters": [Numerics, Punctuations],
+ "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [],
  "scorer": PassFail,
  "passing_score": 0.8,
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Filters.py
@@ -1,10 +1,11 @@
  # STL
+ import re
  from abc import ABC, abstractmethod
  from typing import Set
  from functools import lru_cache as cache # cache comes in 3.9

  # PDM
- import regex as re
+ import regex
  from typing_extensions import override

  # LOCAL
@@ -13,14 +14,16 @@ from sonatoki.constants import (
  CONSONANTS,
  NIMI_PU_SET,
  ALPHABET_SET,
+ UNICODE_PUNCT,
  ALLOWABLES_SET,
  NIMI_LINKU_SET,
  NIMI_PU_ALE_SET,
  NIMI_LINKU_ALE_SET,
+ PRUNED_POSIX_PUNCT,
  NIMI_LINKU_SANDBOX_SET,
  )

- re.DEFAULT_VERSION = re.VERSION1
+ regex.DEFAULT_VERSION = regex.VERSION1


  class Filter(ABC):
@@ -41,6 +44,16 @@ class RegexFilter(Filter):
  return not not re.fullmatch(cls.pattern, token)


+ class Regex1Filter(Filter):
+ pattern: "regex.Pattern[str]"
+
+ @classmethod
+ @override
+ @cache(maxsize=None)
+ def filter(cls, token: str) -> bool:
+ return not not regex.fullmatch(cls.pattern, token)
+
+
  class SetFilter(Filter):
  tokens: Set[str]

@@ -131,7 +144,7 @@ class Alphabetic(Filter):
  return set(token.lower()).issubset(ALPHABET_SET)


- class Numerics(Filter):
+ class Numeric(Filter):
  """Determine if a given token is entirely numeric.
  Covers all numeric symbols in Unicode.

@@ -147,8 +160,8 @@ class Numerics(Filter):
  return msg.isnumeric()


- class Punctuations(RegexFilter):
- pattern = re.compile(r"[\p{Punctuation}\p{posix_punct}]+")
+ class Punctuation(RegexFilter):
+ pattern = re.compile(rf"[{PRUNED_POSIX_PUNCT}{UNICODE_PUNCT}]+")


  __all__ = [
@@ -159,6 +172,6 @@ __all__ = [
  "Syllabic",
  "Alphabetic",
  "ProperName",
- "Punctuations",
- "Numerics",
+ "Punctuation",
+ "Numeric",
  ]
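
A quick sketch of the renamed filters in use; the expected results follow the cases in tests/test_filters.py, and the sample tokens are illustrative:

# Illustrative checks; filter() returns True when a token matches the class.
from sonatoki.Filters import Numeric, Punctuation, NimiPu

print(Numeric.filter("124125"))    # True: token is entirely numeric
print(Punctuation.filter("⟨·⟩"))   # True: punctuation characters only
print(Punctuation.filter("toki"))  # False
print(NimiPu.filter("toki"))       # True: a word from pu
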
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Preprocessors.py
@@ -17,13 +17,14 @@ It is up to the user to order them appropriately.
  """

  # STL
+ import re
  from abc import ABC, abstractmethod

  # PDM
- import regex as re
+ import regex
  from typing_extensions import override

- re.DEFAULT_VERSION = re.VERSION1
+ regex.DEFAULT_VERSION = regex.VERSION1


  class Preprocessor(ABC):
@@ -43,6 +44,16 @@ class RegexPreprocessor(Preprocessor):
  return re.sub(cls.pattern, cls.replace, msg)


+ class Regex1Preprocessor(Preprocessor):
+ pattern: "regex.Pattern[str]"
+ replace: str = " "
+
+ @classmethod
+ @override
+ def process(cls, msg: str) -> str:
+ return regex.sub(cls.pattern, cls.replace, msg)
+
+
  """
  The following classes are Ignorables.

@@ -62,6 +73,13 @@ class URLs(RegexPreprocessor):
  pattern = re.compile(r"https?:\/\/\S+")


+ class Reference(RegexPreprocessor):
+ """Remove text contained in double brackets.
+ Often used to fetch articles on Wikipedia, or Magic the Gathering cards."""
+
+ pattern = re.compile(r"\[\[.+\]\]")
+
+
  class DiscordEmotes(RegexPreprocessor):
  """Remove text-formatted Discord emotes `<flags:name:id>`"""

@@ -80,6 +98,13 @@ class DiscordSpecial(RegexPreprocessor):
  pattern = re.compile(r"<id:[a-zA-Z0-9_]{4,}>")


+ class AngleBracketObject(RegexPreprocessor):
+ """A generalized version of the Discord-specific angle bracket objects.
+ Removes any contiguous (not broken by whitespace) text in angle brackets."""
+
+ pattern = re.compile(r"<[^<>\s]+>")
+
+
  """
  The following classes are Containers.

@@ -92,23 +117,23 @@ would likely be using a language other than Toki Pona.


  class SingleQuotes(RegexPreprocessor):
- pattern = re.compile(r"'[^']+'", flags=re.S) # . matches newline
+ pattern = re.compile(r"'[^']+'", flags=re.DOTALL)


  class DoubleQuotes(RegexPreprocessor):
- pattern = re.compile(r'"[^"]+"', flags=re.S)
+ pattern = re.compile(r'"[^"]+"', flags=re.DOTALL)


  class Backticks(RegexPreprocessor):
  """Remove paired backticks and their contents `like this`"""

- pattern = re.compile(r"`[^`]+`", flags=re.S)
+ pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)


  class Spoilers(RegexPreprocessor):
  """Remove paired double bars and their contents `||like this||`"""

- pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.S)
+ pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.DOTALL)


  class ArrowQuote(RegexPreprocessor):
@@ -117,7 +142,22 @@ class ArrowQuote(RegexPreprocessor):
  pattern = re.compile(r"^>\ .+$", re.MULTILINE)


+ class AllQuotes(RegexPreprocessor):
+ pattern = re.compile(
+ "|".join(
+ [
+ SingleQuotes.pattern.pattern,
+ DoubleQuotes.pattern.pattern,
+ Backticks.pattern.pattern,
+ ArrowQuote.pattern.pattern,
+ ]
+ ),
+ flags=re.MULTILINE | re.DOTALL,
+ )
+
+
  __all__ = [
+ "AngleBracketObject",
  "DiscordChannels",
  "DiscordMentions",
  "DiscordSpecial",
@@ -125,7 +165,9 @@ __all__ = [
  "SingleQuotes",
  "DoubleQuotes",
  "ArrowQuote",
+ "AllQuotes",
  "Backticks",
+ "Reference",
  "Spoilers",
  "URLs",
  ]
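
A sketch of the new preprocessors in action; process() substitutes each match with a space, as in RegexPreprocessor above, and the sample strings are illustrative:

from sonatoki.Preprocessors import Reference, AngleBracketObject, AllQuotes

print(Reference.process("o lukin e [[Phatic Phrases]] a"))              # the [[...]] span is removed
print(AngleBracketObject.process("sina lon <#123124125125> anu seme"))  # the <...> object is removed
print(AllQuotes.process('ona li toki e ni: "quoted text"'))             # the quoted span is removed
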
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Scorers.py
@@ -10,8 +10,6 @@ from typing_extensions import override
  # LOCAL
  from sonatoki.Filters import Filter

- LOG = logging.getLogger(__name__)
-
  Number = Union[int, float]
  Weights = Dict[str, Number]

@@ -37,12 +35,7 @@ class PassFail(Scorer):
  def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
  for f in filters:
  if f.filter(token):
- score = 1
- LOG.debug(
- "%12s.%s('%s') = %.2f", cls.__name__, f.__name__, token, score
- )
- return score
- LOG.debug("%12s('%s') = 0.00", cls.__name__, token)
+ return 1
  return 0

  @classmethod
@@ -86,12 +79,7 @@ class Scaling(Scorer):
  def score_token(cls, token: str, filters: List[Type[Filter]], scale: int):
  for i, f in enumerate(filters):
  if f.filter(token):
- score = scale - i
- LOG.debug(
- "%12s.%s('%s') = %.2f", cls.__name__, f.__name__, token, score
- )
- return score
- LOG.debug("%12s('%s') = 0.00", cls.__name__, token)
+ return scale - i
  return 0

  @classmethod
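
For reference, a sketch of calling a scorer directly; score(tokens, filters) returns a value in [0, 1], as asserted by test_score_bounds below. The tokens and filter choices here are illustrative:

from sonatoki.Filters import NimiLinku, Syllabic
from sonatoki.Scorers import PassFail, SoftScaling

tokens = ["mi", "olin", "e", "sina"]
print(PassFail.score(tokens, [NimiLinku]))               # 1.0 when every token passes a filter
print(SoftScaling.score(tokens, [NimiLinku, Syllabic]))  # ordered filters: earlier matches score higher
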
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Tokenizers.py
@@ -1,11 +1,15 @@
  # STL
+ import re
  from abc import ABC, abstractmethod
  from typing import List

  # PDM
- import regex as re
+ import regex
  from typing_extensions import override

+ # LOCAL
+ from sonatoki.constants import UNICODE_PUNCT, PRUNED_POSIX_PUNCT
+
  try:
  # PDM
  import nltk
@@ -15,7 +19,7 @@ except ImportError as e:
  nltk = e


- LANGUAGE = "english" # for NLTK
+ regex.DEFAULT_VERSION = regex.VERSION1


  class Tokenizer(ABC):
@@ -42,15 +46,26 @@ class RegexTokenizer(Tokenizer):
  return [clean for word in re.split(cls.pattern, s) if (clean := word.strip())]


+ class Regex1Tokenizer(Tokenizer):
+ pattern: "regex.Pattern[str]"
+
+ @classmethod
+ @override
+ def tokenize(cls, s: str) -> List[str]:
+ return [
+ clean for word in regex.split(cls.pattern, s) if (clean := word.strip())
+ ]
+
+
  class WordTokenizerTok(RegexTokenizer):
- pattern = re.compile(r"""([\p{Punctuation}\p{posix_punct}]+|\s+)""")
- # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
- # TODO: do the typography characters matter?
- # NOTE: | / and , are *not* sentence delimiters for my purpose
+ pattern = re.compile(rf"""([{PRUNED_POSIX_PUNCT}{UNICODE_PUNCT}]+|\s+)""")


  class SentTokenizerTok(RegexTokenizer):
- pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")
+ pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-])|$""", flags=re.MULTILINE)
+ # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
+ # TODO: do the typography characters matter?
+ # NOTE: | / and , are *not* sentence delimiters for my purpose


  class WordTokenizerRe(RegexTokenizer):
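
A sketch of the tok-flavoured tokenizers; the sentence-splitting behaviour follows the new cases added to tokenize_sentences_tok.yml further down, and the word-level output is indicative:

from sonatoki.Tokenizers import SentTokenizerTok, WordTokenizerTok

print(SentTokenizerTok.tokenize("sina lon seme?\nmi wile lon poka"))
# expected per the test data: ["sina lon seme?", "mi wile lon poka"]
print(WordTokenizerTok.tokenize("mi mu. mi wawa."))
# words and punctuation runs become separate tokens, e.g. ["mi", "mu", ".", "mi", "wawa", "."]
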
sonatoki-0.1.5/src/sonatoki/constants.py
@@ -0,0 +1,83 @@
+ # STL
+ import json
+ from typing import Dict, List
+ from pathlib import Path
+
+ LINKU = Path(__file__).resolve().parent / Path("linku.json")
+ SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")
+
+ VOWELS = "aeiou"
+ CONSONANTS = "jklmnpstw"
+ ALPHABET = VOWELS + CONSONANTS
+ ALPHABET_SET = set(ALPHABET)
+
+ LANGUAGE = "english" # for NLTK
+
+ # `\p{posix_punct}` character class
+ POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
+ PRUNED_POSIX_PUNCT = r"""$+<=>^`|~""" # only those that are not in UNICODE_PUNCT
+
+ # `\p{Punctuation}` character class
+ UNICODE_PUNCT = r"""!"#%&'()*,-./:;?@\[\\\]_{}¡§«¶·»¿;·՚՛՜՝՞՟։֊־׀׃׆׳״؉؊،؍؛؝؞؟٪٫٬٭۔܀܁܂܃܄܅܆܇܈܉܊܋܌܍߷߸߹࠰࠱࠲࠳࠴࠵࠶࠷࠸࠹࠺࠻࠼࠽࠾࡞।॥॰৽੶૰౷಄෴๏๚๛༄༅༆༇༈༉༊་༌།༎༏༐༑༒༔༺༻༼༽྅࿐࿑࿒࿓࿔࿙࿚၊။၌၍၎၏჻፠፡።፣፤፥፦፧፨᐀᙮᚛᚜᛫᛬᛭᜵᜶។៕៖៘៙៚᠀᠁᠂᠃᠄᠅᠆᠇᠈᠉᠊᥄᥅᨞᨟᪠᪡᪢᪣᪤᪥᪦᪨᪩᪪᪫᪬᪭᭚᭛᭜᭝᭞᭟᭠᭽᭾᯼᯽᯾᯿᰻᰼᰽᰾᰿᱾᱿᳀᳁᳂᳃᳄᳅᳆᳇᳓‐‑‒–—―‖‗‘’‚‛“”„‟†‡•‣․‥…‧‰‱′″‴‵‶‷‸‹›※‼‽‾‿⁀⁁⁂⁃⁅⁆⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁔⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞⁽⁾₍₎⌈⌉⌊⌋〈〉❨❩❪❫❬❭❮❯❰❱❲❳❴❵⟅⟆⟦⟧⟨⟩⟪⟫⟬⟭⟮⟯⦃⦄⦅⦆⦇⦈⦉⦊⦋⦌⦍⦎⦏⦐⦑⦒⦓⦔⦕⦖⦗⦘⧘⧙⧚⧛⧼⧽⳹⳺⳻⳼⳾⳿⵰⸀⸁⸂⸃⸄⸅⸆⸇⸈⸉⸊⸋⸌⸍⸎⸏⸐⸑⸒⸓⸔⸕⸖⸗⸘⸙⸚⸛⸜⸝⸞⸟⸠⸡⸢⸣⸤⸥⸦⸧⸨⸩⸪⸫⸬⸭⸮⸰⸱⸲⸳⸴⸵⸶⸷⸸⸹⸺⸻⸼⸽⸾⸿⹀⹁⹂⹃⹄⹅⹆⹇⹈⹉⹊⹋⹌⹍⹎⹏⹒⹓⹔⹕⹖⹗⹘⹙⹚⹛⹜⹝、。〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〽゠・꓾꓿꘍꘎꘏꙳꙾꛲꛳꛴꛵꛶꛷꡴꡵꡶꡷꣎꣏꣸꣹꣺꣼꤮꤯꥟꧁꧂꧃꧄꧅꧆꧇꧈꧉꧊꧋꧌꧍꧞꧟꩜꩝꩞꩟꫞꫟꫰꫱꯫﴾﴿︐︑︒︓︔︕︖︗︘︙︰︱︲︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄﹅﹆﹇﹈﹉﹊﹋﹌﹍﹎﹏﹐﹑﹒﹔﹕﹖﹗﹘﹙﹚﹛﹜﹝﹞﹟﹠﹡﹣﹨﹪﹫!"#%&'()*,-./:;?@[\]_{}⦅⦆。「」、・𐄀𐄁𐄂𐎟𐏐𐕯𐡗𐤟𐤿𐩐𐩑𐩒𐩓𐩔𐩕𐩖𐩗𐩘𐩿𐫰𐫱𐫲𐫳𐫴𐫵𐫶𐬹𐬺𐬻𐬼𐬽𐬾𐬿𐮙𐮚𐮛𐮜𐺭𐽕𐽖𐽗𐽘𐽙𐾆𐾇𐾈𐾉𑁇𑁈𑁉𑁊𑁋𑁌𑁍𑂻𑂼𑂾𑂿𑃀𑃁𑅀𑅁𑅂𑅃𑅴𑅵𑇅𑇆𑇇𑇈𑇍𑇛𑇝𑇞𑇟𑈸𑈹𑈺𑈻𑈼𑈽𑊩𑑋𑑌𑑍𑑎𑑏𑑚𑑛𑑝𑓆𑗁𑗂𑗃𑗄𑗅𑗆𑗇𑗈𑗉𑗊𑗋𑗌𑗍𑗎𑗏𑗐𑗑𑗒𑗓𑗔𑗕𑗖𑗗𑙁𑙂𑙃𑙠𑙡𑙢𑙣𑙤𑙥𑙦𑙧𑙨𑙩𑙪𑙫𑙬𑚹𑜼𑜽𑜾𑠻𑥄𑥅𑥆𑧢𑨿𑩀𑩁𑩂𑩃𑩄𑩅𑩆𑪚𑪛𑪜𑪞𑪟𑪠𑪡𑪢𑬀𑬁𑬂𑬃𑬄𑬅𑬆𑬇𑬈𑬉𑱁𑱂𑱃𑱄𑱅𑱰𑱱𑻷𑻸𑽃𑽄𑽅𑽆𑽇𑽈𑽉𑽊𑽋𑽌𑽍𑽎𑽏𑿿𒑰𒑱𒑲𒑳𒑴𒿱𒿲𖩮𖩯𖫵𖬷𖬸𖬹𖬺𖬻𖭄𖺗𖺘𖺙𖺚𖿢𛲟𝪇𝪈𝪉𝪊𝪋𞥞𞥟"""
+ # NOTE: This list diverges slightly from the raw list, since []\ must be escaped
+ # The [] need to be escaped to avoid prematurely closing the regex character class
+ # The \ needs to be escaped to be considered as a raw \
+
+ # https://www.compart.com/en/unicode/category
+ # https://unicode.org/Public/UNIDATA/UnicodeData.txt
+
+
+ """Commonly occurring strings which are some kind of valid Toki Pona or external token"""
+ ALLOWABLES = {
+ "cw", # Content Warning
+ "x", # ala
+ "y", # anu
+ "kxk", # ken ala ken
+ "wxw", # wile ala wile
+ }
+
+
+ with open(LINKU) as f:
+ r: Dict[str, Dict[str, str]] = json.loads(f.read())
+ NIMI_PU: List[str] = [d["word"] for d in r.values() if d["book"] == "pu"]
+ NIMI_PU_ALE: List[str] = NIMI_PU + ["namako", "kin", "oko"]
+ NIMI_LINKU: List[str] = [
+ d["word"] for d in r.values() if d["usage_category"] in ["core", "common"]
+ ]
+ NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]
+
+ with open(SANDBOX) as f:
+ r: Dict[str, Dict[str, str]] = json.loads(f.read())
+ NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in r.values()]
+
+
+ NIMI_PU_SET = set(NIMI_PU)
+ NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
+ NIMI_LINKU_SET = set(NIMI_LINKU)
+ NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
+ NIMI_LINKU_SANDBOX_SET = set(NIMI_LINKU_SANDBOX)
+ ALLOWABLES_SET = set(ALLOWABLES)
+
+ __all__ = [
+ "VOWELS",
+ #
+ "CONSONANTS",
+ #
+ "ALPHABET",
+ "ALPHABET_SET",
+ #
+ "NIMI_PU",
+ "NIMI_PU_SET",
+ #
+ "NIMI_PU_ALE",
+ "NIMI_PU_ALE_SET",
+ #
+ "NIMI_LINKU",
+ "NIMI_LINKU_SET",
+ #
+ "NIMI_LINKU_ALE",
+ "NIMI_LINKU_ALE_SET",
+ #
+ "NIMI_LINKU_SANDBOX",
+ "NIMI_LINKU_SANDBOX_SET",
+ ]
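
A sketch of how the two punctuation constants above are combined elsewhere in this release (the Punctuation filter and WordTokenizerTok build the same character class); PRUNED_POSIX_PUNCT only keeps characters absent from UNICODE_PUNCT, so the two concatenate cleanly:

import re
from sonatoki.constants import PRUNED_POSIX_PUNCT, UNICODE_PUNCT

# Same character class as Filters.Punctuation in this release.
PUNCT_RE = re.compile(rf"[{PRUNED_POSIX_PUNCT}{UNICODE_PUNCT}]+")
print(bool(PUNCT_RE.fullmatch("...!?")))  # True: punctuation only
print(bool(PUNCT_RE.fullmatch("toki")))   # False
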
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/ilo.py
@@ -1,5 +1,4 @@
  # STL
- import logging
  from typing import List, Type, Tuple

  # LOCAL
@@ -9,8 +8,6 @@ from sonatoki.Cleaners import Cleaner
  from sonatoki.Tokenizers import Tokenizer
  from sonatoki.Preprocessors import Preprocessor

- LOG = logging.getLogger(__name__)
-

  class Ilo:
  __preprocessors: List[Type[Preprocessor]]
@@ -20,7 +17,6 @@ class Ilo:
  __scoring_filters: List[Type[Filter]]
  __scorer: Type[Scorer]
  __passing_score: Number
- logging_threshold: Number = -1

  def __init__(
  self,
@@ -104,14 +100,6 @@ class Ilo:
  score = self.score_tokens(cleaned)
  result = score >= self.__passing_score

- if score <= self.logging_threshold:
- LOG.debug("msg: %.2f %s", score, repr(message))
- LOG.debug("preproc: %s", repr(preprocessed))
- LOG.debug("tokenized: %s", tokenized)
- LOG.debug("filtered: %s", filtered)
- LOG.debug("cleaned: %s", cleaned)
- # TODO: Move to each function? Loses ability to control when logging occurs by threshold
-
  return preprocessed, tokenized, filtered, cleaned, score, result

  def is_toki_pona(self, message: str) -> bool:
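
A minimal usage sketch; Ilo(**PrefConfig) mirrors the fixture in tests/test_ilo.py, and is_toki_pona is the public entry point shown above. The sample sentences and expected outcomes are illustrative:

from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)
print(ilo.is_toki_pona("mi olin e sina"))   # expected True: well-formed toki pona
print(ilo.is_toki_pona("this is english"))  # expected False under the default passing score
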
{sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_filters.py
@@ -9,13 +9,13 @@ from hypothesis import HealthCheck, given, assume, example, settings
  # LOCAL
  from sonatoki.Filters import (
  NimiPu,
- Numerics,
+ Numeric,
  Syllabic,
  NimiLinku,
  Alphabetic,
  ProperName,
  Phonotactic,
- Punctuations,
+ Punctuation,
  )
  from sonatoki.Cleaners import ConsecutiveDuplicates
  from sonatoki.constants import NIMI_PU, NIMI_LINKU
@@ -82,17 +82,16 @@ def test_ProperName(s: str):
  assert res, repr(s)


- # I use `regex`'s Unicode property feature, which Hypothesis doesn't understand
- # So I have to provide a different regex tha doesn't technically match
- @given(st.from_regex(r"[^\w\s]+", fullmatch=True))
+ @given(st.from_regex(Punctuation.pattern.pattern, fullmatch=True))
+ @example("[]")
+ @example(r"\\")
+ @example(r"\"")
  @example("⟨·⟩")
  @example("…")
- @example("「 」")
+ @example("「」") # ` `
  @example(string.punctuation)
- @settings(suppress_health_check=[HealthCheck.filter_too_much]) # FIXME
- def test_Punctuations(s: str):
- _ = assume(re.fullmatch(Punctuations.pattern.pattern, s))
- res = Punctuations.filter(s)
+ def test_Punctuation(s: str):
+ res = Punctuation.filter(s)
  assert res, repr(s)


@@ -100,5 +99,5 @@ def test_Punctuations(s: str):
  @example("124125")
  @example("99990000")
  def test_Numeric(s: str):
- res = Numerics.filter(s)
+ res = Numeric.filter(s)
  assert res, repr(s)
{sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_ilo.py
@@ -9,7 +9,6 @@ from sonatoki.Configs import LazyConfig, PrefConfig
  @pytest.fixture
  def ilo():
  ilo = Ilo(**PrefConfig)
- # ilo.logging_threshold = 0.8
  return ilo


{sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_preprocessors.py
@@ -6,7 +6,9 @@ from hypothesis import given, example
  from sonatoki.Preprocessors import (
  URLs,
  Spoilers,
+ AllQuotes,
  Backticks,
+ Reference,
  ArrowQuote,
  DoubleQuotes,
  SingleQuotes,
@@ -14,6 +16,7 @@ from sonatoki.Preprocessors import (
  DiscordSpecial,
  DiscordChannels,
  DiscordMentions,
+ AngleBracketObject,
  )


@@ -101,3 +104,40 @@ def test_DiscordChannels(s: str):
  def test_DiscordSpecial(s: str):
  res = DiscordSpecial.process(s).strip()
  assert res == "", (repr(s), repr(res))
+
+
+ @given(
+ st.from_regex(DiscordEmotes.pattern.pattern, fullmatch=True)
+ | st.from_regex(DiscordMentions.pattern.pattern, fullmatch=True)
+ | st.from_regex(DiscordChannels.pattern.pattern, fullmatch=True)
+ | st.from_regex(DiscordSpecial.pattern.pattern, fullmatch=True)
+ | st.from_regex(AngleBracketObject.pattern.pattern, fullmatch=True)
+ )
+ @example("<https://example.com>")
+ @example("<#123124125125>")
+ def test_AngleBracketObject(s: str):
+ res = AngleBracketObject.process(s).strip()
+ assert res == "", (repr(s), repr(res))
+
+
+ @given(
+ st.from_regex(SingleQuotes.pattern.pattern, fullmatch=True)
+ | st.from_regex(DoubleQuotes.pattern.pattern, fullmatch=True)
+ | st.from_regex(Backticks.pattern.pattern, fullmatch=True)
+ | st.from_regex(ArrowQuote.pattern.pattern, fullmatch=True)
+ | st.from_regex(AllQuotes.pattern.pattern, fullmatch=True)
+ )
+ @example("> bruh")
+ @example("`bruh`")
+ def test_AllQuotes(s: str):
+ res = AllQuotes.process(s).strip()
+ assert res == "", (repr(s), repr(res))
+
+
+ @given(st.from_regex(Reference.pattern.pattern, fullmatch=True))
+ @example("[[Brainstorm]]")
+ @example("[[Phatic Phrases]]")
+ @example("[[Yahoo!]]")
+ def test_Reference(s: str):
+ res = Reference.process(s).strip()
+ assert res == "", (repr(s), repr(res))
{sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_scorers.py
@@ -4,38 +4,39 @@ from typing import List, Type
  # PDM
  import pytest
  import hypothesis.strategies as st
- from hypothesis import given
+ from hypothesis import given, example

  # LOCAL
  from sonatoki.Filters import (
  Filter,
  NimiPu,
- Numerics,
+ Numeric,
  Syllabic,
  NimiLinku,
  Alphabetic,
  ProperName,
  Phonotactic,
- Punctuations,
+ Punctuation,
  )
- from sonatoki.Scorers import Scorer, Scaling, PassFail, SoftScaling
+ from sonatoki.Scorers import Scorer, Scaling, PassFail, SoftScaling, SoftPassFail

  # FILESYSTEM
  from .test_utils import token_strategy

  FILTERS = [
  NimiPu,
- Numerics,
+ Numeric,
  Syllabic,
  NimiLinku,
  Alphabetic,
  ProperName,
  Phonotactic,
- Punctuations,
+ Punctuation,
  ]

  SCORERS = [
  PassFail,
+ SoftPassFail,
  Scaling,
  SoftScaling,
  ]
@@ -46,6 +47,7 @@ SCORERS = [
  st.lists(st.sampled_from(FILTERS), min_size=1, unique=True),
  st.lists(token_strategy, min_size=0, max_size=10),
  )
+ @example(st.sampled_from(FILTERS), [])
  def test_score_bounds(scorer: Scorer, filters: List[Type[Filter]], text: List[str]):
  score = scorer.score(text, filters)
  assert 0 <= score <= 1, (score, filters, text)
{sonatoki-0.1.3 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_sentences_tok.yml
@@ -19,6 +19,24 @@
  output:
  - "mi mu."
  - "mi wawa."
+ - name: "empty"
+ input: ""
+ output: []
+ - name: "whitespace"
+ input: " \n "
+ output: []
+ - name: "newline basic"
+ input: "sina lon seme?\nmi wile lon poka...\n"
+ output:
+ - "sina lon seme?"
+ - "mi wile lon poka."
+ - "."
+ - "."
+ - name: "newline alone"
+ input: "sina lon seme\nmi wile lon poka"
+ output:
+ - "sina lon seme"
+ - "mi wile lon poka"
  - name: "dash"
  input: "mi sona ala e ni- sina seme a"
  output:
sonatoki-0.1.3/src/sonatoki/constants.py
@@ -1,67 +0,0 @@
- # STL
- import json
- from typing import Dict, List
- from pathlib import Path
-
- LINKU = Path(__file__).resolve().parent / Path("linku.json")
- SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")
-
- VOWELS = "aeiou"
- CONSONANTS = "jklmnpstw"
- ALPHABET = VOWELS + CONSONANTS
- ALPHABET_SET = set(ALPHABET)
-
- """Commonly occurring strings which are some kind of valid Toki Pona or external token"""
- ALLOWABLES = {
- "cw", # Content Warning
- "x", # ala
- "y", # anu
- "kxk", # ken ala ken
- "wxw", # wile ala wile
- }
-
-
- with open(LINKU) as f:
- r: Dict[str, Dict[str, str]] = json.loads(f.read())
- NIMI_PU: List[str] = [d["word"] for d in r.values() if d["book"] == "pu"]
- NIMI_PU_ALE: List[str] = NIMI_PU + ["namako", "kin", "oko"]
- NIMI_LINKU: List[str] = [
- d["word"] for d in r.values() if d["usage_category"] in ["core", "common"]
- ]
- NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]
-
- with open(SANDBOX) as f:
- r: Dict[str, Dict[str, str]] = json.loads(f.read())
- NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in r.values()]
-
-
- NIMI_PU_SET = set(NIMI_PU)
- NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
- NIMI_LINKU_SET = set(NIMI_LINKU)
- NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
- NIMI_LINKU_SANDBOX_SET = set(NIMI_LINKU_SANDBOX)
- ALLOWABLES_SET = set(ALLOWABLES)
-
- __all__ = [
- "VOWELS",
- #
- "CONSONANTS",
- #
- "ALPHABET",
- "ALPHABET_SET",
- #
- "NIMI_PU",
- "NIMI_PU_SET",
- #
- "NIMI_PU_ALE",
- "NIMI_PU_ALE_SET",
- #
- "NIMI_LINKU",
- "NIMI_LINKU_SET",
- #
- "NIMI_LINKU_ALE",
- "NIMI_LINKU_ALE_SET",
- #
- "NIMI_LINKU_SANDBOX",
- "NIMI_LINKU_SANDBOX_SET",
- ]