sonatoki 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- sonatoki/Filters.py +16 -3
- sonatoki/Preprocessors.py +13 -2
- sonatoki/Scorers.py +2 -14
- sonatoki/Tokenizers.py +22 -7
- sonatoki/constants.py +16 -0
- sonatoki/ilo.py +0 -12
- {sonatoki-0.1.4.dist-info → sonatoki-0.1.5.dist-info}/METADATA +1 -1
- sonatoki-0.1.5.dist-info/RECORD +16 -0
- sonatoki-0.1.4.dist-info/RECORD +0 -16
- {sonatoki-0.1.4.dist-info → sonatoki-0.1.5.dist-info}/WHEEL +0 -0
- {sonatoki-0.1.4.dist-info → sonatoki-0.1.5.dist-info}/licenses/LICENSE +0 -0
sonatoki/Filters.py
CHANGED
@@ -1,10 +1,11 @@
 # STL
+import re
 from abc import ABC, abstractmethod
 from typing import Set
 from functools import lru_cache as cache  # cache comes in 3.9

 # PDM
-import regex
+import regex
 from typing_extensions import override

 # LOCAL
@@ -13,14 +14,16 @@ from sonatoki.constants import (
     CONSONANTS,
     NIMI_PU_SET,
     ALPHABET_SET,
+    UNICODE_PUNCT,
     ALLOWABLES_SET,
     NIMI_LINKU_SET,
     NIMI_PU_ALE_SET,
     NIMI_LINKU_ALE_SET,
+    PRUNED_POSIX_PUNCT,
     NIMI_LINKU_SANDBOX_SET,
 )

-
+regex.DEFAULT_VERSION = regex.VERSION1


 class Filter(ABC):
@@ -41,6 +44,16 @@ class RegexFilter(Filter):
         return not not re.fullmatch(cls.pattern, token)


+class Regex1Filter(Filter):
+    pattern: "regex.Pattern[str]"
+
+    @classmethod
+    @override
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        return not not regex.fullmatch(cls.pattern, token)
+
+
 class SetFilter(Filter):
     tokens: Set[str]

@@ -148,7 +161,7 @@ class Numeric(Filter):


 class Punctuation(RegexFilter):
-    pattern = re.compile(
+    pattern = re.compile(rf"[{PRUNED_POSIX_PUNCT}{UNICODE_PUNCT}]+")


 __all__ = [
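The new Regex1Filter parallels RegexFilter but matches with the third-party regex engine, which this release pins to VERSION1 semantics at import time. A minimal sketch of how it could be subclassed; the UnicodePunctuation class and its pattern are illustrative, not part of sonatoki:

# Hypothetical subclass of the new Regex1Filter; sonatoki ships no filter by this name.
import regex

from sonatoki.Filters import Regex1Filter


class UnicodePunctuation(Regex1Filter):
    # `\p{Punctuation}` is the regex-module property class that the new
    # UNICODE_PUNCT constant in constants.py was derived from.
    pattern = regex.compile(r"[\p{Punctuation}]+")


print(UnicodePunctuation.filter("‽…"))   # True: the token is all punctuation
print(UnicodePunctuation.filter("toki"))  # False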
sonatoki/Preprocessors.py
CHANGED
@@ -17,13 +17,14 @@ It is up to the user to order them appropriately.
 """

 # STL
+import re
 from abc import ABC, abstractmethod

 # PDM
-import regex
+import regex
 from typing_extensions import override

-
+regex.DEFAULT_VERSION = regex.VERSION1


 class Preprocessor(ABC):
@@ -43,6 +44,16 @@ class RegexPreprocessor(Preprocessor):
         return re.sub(cls.pattern, cls.replace, msg)


+class Regex1Preprocessor(Preprocessor):
+    pattern: "regex.Pattern[str]"
+    replace: str = " "
+
+    @classmethod
+    @override
+    def process(cls, msg: str) -> str:
+        return regex.sub(cls.pattern, cls.replace, msg)
+
+
 """
 The following classes are Ignorables.

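Regex1Preprocessor gives substitution the same VERSION1 treatment. A small sketch of a subclass; ZeroWidths is hypothetical, not one of sonatoki's preprocessors:

# Hypothetical Regex1Preprocessor subclass (not shipped by sonatoki).
import regex

from sonatoki.Preprocessors import Regex1Preprocessor


class ZeroWidths(Regex1Preprocessor):
    # remove zero-width characters that would otherwise glue tokens together
    pattern = regex.compile(r"[\u200b-\u200d\ufeff]")
    replace = ""


print(ZeroWidths.process("toki\u200bpona"))  # "tokipona"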
sonatoki/Scorers.py
CHANGED
@@ -10,8 +10,6 @@ from typing_extensions import override
 # LOCAL
 from sonatoki.Filters import Filter

-LOG = logging.getLogger(__name__)
-
 Number = Union[int, float]
 Weights = Dict[str, Number]

@@ -37,12 +35,7 @@ class PassFail(Scorer):
     def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
         for f in filters:
             if f.filter(token):
-                score = 1
-                LOG.debug(
-                    "%12s.%s('%s') = %.2f", cls.__name__, f.__name__, token, score
-                )
-                return score
-        LOG.debug("%12s('%s') = 0.00", cls.__name__, token)
+                return 1
         return 0

     @classmethod
@@ -86,12 +79,7 @@ class Scaling(Scorer):
     def score_token(cls, token: str, filters: List[Type[Filter]], scale: int):
         for i, f in enumerate(filters):
             if f.filter(token):
-                score = scale - i
-                LOG.debug(
-                    "%12s.%s('%s') = %.2f", cls.__name__, f.__name__, token, score
-                )
-                return score
-        LOG.debug("%12s('%s') = 0.00", cls.__name__, token)
+                return scale - i
         return 0

     @classmethod
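With the debug logging gone, the scorers reduce to their return values: PassFail.score_token yields 1 on the first filter match, while Scaling.score_token yields scale - i, so filters earlier in the list are worth more. A usage sketch, assuming Numeric and Punctuation keep the classmethod filter interface shown in Filters.py:

from sonatoki.Filters import Numeric, Punctuation
from sonatoki.Scorers import Scaling

filters = [Numeric, Punctuation]
print(Scaling.score_token("42", filters, scale=2))    # 2: matched filters[0]
print(Scaling.score_token("!!", filters, scale=2))    # 1: matched filters[1]
print(Scaling.score_token("toki", filters, scale=2))  # 0: matched nothing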
sonatoki/Tokenizers.py
CHANGED
@@ -1,11 +1,15 @@
 # STL
+import re
 from abc import ABC, abstractmethod
 from typing import List

 # PDM
-import regex
+import regex
 from typing_extensions import override

+# LOCAL
+from sonatoki.constants import UNICODE_PUNCT, PRUNED_POSIX_PUNCT
+
 try:
     # PDM
     import nltk
@@ -15,7 +19,7 @@ except ImportError as e:
     nltk = e


-
+regex.DEFAULT_VERSION = regex.VERSION1


 class Tokenizer(ABC):
@@ -42,15 +46,26 @@ class RegexTokenizer(Tokenizer):
         return [clean for word in re.split(cls.pattern, s) if (clean := word.strip())]


+class Regex1Tokenizer(Tokenizer):
+    pattern: "regex.Pattern[str]"
+
+    @classmethod
+    @override
+    def tokenize(cls, s: str) -> List[str]:
+        return [
+            clean for word in regex.split(cls.pattern, s) if (clean := word.strip())
+        ]
+
+
 class WordTokenizerTok(RegexTokenizer):
-    pattern = re.compile(
-    # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
-    # TODO: do the typography characters matter?
-    # NOTE: | / and , are *not* sentence delimiters for my purpose
+    pattern = re.compile(rf"""([{PRUNED_POSIX_PUNCT}{UNICODE_PUNCT}]+|\s+)""")


 class SentTokenizerTok(RegexTokenizer):
-    pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""", flags=re.MULTILINE)
+    pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-])|$""", flags=re.MULTILINE)
+    # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
+    # TODO: do the typography characters matter?
+    # NOTE: | / and , are *not* sentence delimiters for my purpose


 class WordTokenizerRe(RegexTokenizer):
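Because WordTokenizerTok's new pattern is a capturing group, re.split keeps the punctuation runs as tokens, and the strip-filter in RegexTokenizer.tokenize drops only the whitespace runs. A quick sketch, assuming the classmethod tokenize interface shown above:

from sonatoki.Tokenizers import WordTokenizerTok

# punctuation runs survive as their own tokens; whitespace runs are dropped
print(WordTokenizerTok.tokenize("toki, pona!"))  # ['toki', ',', 'pona', '!']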
sonatoki/constants.py
CHANGED
@@ -11,6 +11,22 @@ CONSONANTS = "jklmnpstw"
 ALPHABET = VOWELS + CONSONANTS
 ALPHABET_SET = set(ALPHABET)

+LANGUAGE = "english"  # for NLTK
+
+# `\p{posix_punct}` character class
+POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
+PRUNED_POSIX_PUNCT = r"""$+<=>^`|~"""  # only those that are not in UNICODE_PUNCT
+
+# `\p{Punctuation}` character class
+UNICODE_PUNCT = r"""!"#%&'()*,-./:;?@\[\\\]_{}¡§«¶·»¿;·՚՛՜՝՞՟։֊־׀׃׆׳״؉؊،؍؛؝؞؟٪٫٬٭۔܀܁܂܃܄܅܆܇܈܉܊܋܌܍߷߸߹࠰࠱࠲࠳࠴࠵࠶࠷࠸࠹࠺࠻࠼࠽࠾࡞।॥॰৽੶૰౷಄෴๏๚๛༄༅༆༇༈༉༊་༌།༎༏༐༑༒༔༺༻༼༽྅࿐࿑࿒࿓࿔࿙࿚၊။၌၍၎၏჻፠፡።፣፤፥፦፧፨᐀᙮᚛᚜᛫᛬᛭᜵᜶។៕៖៘៙៚᠀᠁᠂᠃᠄᠅᠆᠇᠈᠉᠊᥄᥅᨞᨟᪠᪡᪢᪣᪤᪥᪦᪨᪩᪪᪫᪬᪭᭚᭛᭜᭝᭞᭟᭠᭽᭾᯼᯽᯾᯿᰻᰼᰽᰾᰿᱾᱿᳀᳁᳂᳃᳄᳅᳆᳇᳓‐‑‒–—―‖‗‘’‚‛“”„‟†‡•‣․‥…‧‰‱′″‴‵‶‷‸‹›※‼‽‾‿⁀⁁⁂⁃⁅⁆⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁔⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞⁽⁾₍₎⌈⌉⌊⌋〈〉❨❩❪❫❬❭❮❯❰❱❲❳❴❵⟅⟆⟦⟧⟨⟩⟪⟫⟬⟭⟮⟯⦃⦄⦅⦆⦇⦈⦉⦊⦋⦌⦍⦎⦏⦐⦑⦒⦓⦔⦕⦖⦗⦘⧘⧙⧚⧛⧼⧽⳹⳺⳻⳼⳾⳿⵰⸀⸁⸂⸃⸄⸅⸆⸇⸈⸉⸊⸋⸌⸍⸎⸏⸐⸑⸒⸓⸔⸕⸖⸗⸘⸙⸚⸛⸜⸝⸞⸟⸠⸡⸢⸣⸤⸥⸦⸧⸨⸩⸪⸫⸬⸭⸮⸰⸱⸲⸳⸴⸵⸶⸷⸸⸹⸺⸻⸼⸽⸾⸿⹀⹁⹂⹃⹄⹅⹆⹇⹈⹉⹊⹋⹌⹍⹎⹏⹒⹓⹔⹕⹖⹗⹘⹙⹚⹛⹜⹝、。〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〽゠・꓾꓿꘍꘎꘏꙳꙾꛲꛳꛴꛵꛶꛷꡴꡵꡶꡷꣎꣏꣸꣹꣺꣼꤮꤯꥟꧁꧂꧃꧄꧅꧆꧇꧈꧉꧊꧋꧌꧍꧞꧟꩜꩝꩞꩟꫞꫟꫰꫱꯫﴾﴿︐︑︒︓︔︕︖︗︘︙︰︱︲︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄﹅﹆﹇﹈﹉﹊﹋﹌﹍﹎﹏﹐﹑﹒﹔﹕﹖﹗﹘﹙﹚﹛﹜﹝﹞﹟﹠﹡﹣﹨﹪﹫！＂＃％＆＇（）＊，－．／：；？＠［＼］＿｛｝⦅⦆。「」、・𐄀𐄁𐄂𐎟𐏐𐕯𐡗𐤟𐤿𐩐𐩑𐩒𐩓𐩔𐩕𐩖𐩗𐩘𐩿𐫰𐫱𐫲𐫳𐫴𐫵𐫶𐬹𐬺𐬻𐬼𐬽𐬾𐬿𐮙𐮚𐮛𐮜𐺭𐽕𐽖𐽗𐽘𐽙𐾆𐾇𐾈𐾉𑁇𑁈𑁉𑁊𑁋𑁌𑁍𑂻𑂼𑂾𑂿𑃀𑃁𑅀𑅁𑅂𑅃𑅴𑅵𑇅𑇆𑇇𑇈𑇍𑇛𑇝𑇞𑇟𑈸𑈹𑈺𑈻𑈼𑈽𑊩𑑋𑑌𑑍𑑎𑑏𑑚𑑛𑑝𑓆𑗁𑗂𑗃𑗄𑗅𑗆𑗇𑗈𑗉𑗊𑗋𑗌𑗍𑗎𑗏𑗐𑗑𑗒𑗓𑗔𑗕𑗖𑗗𑙁𑙂𑙃𑙠𑙡𑙢𑙣𑙤𑙥𑙦𑙧𑙨𑙩𑙪𑙫𑙬𑚹𑜼𑜽𑜾𑠻𑥄𑥅𑥆𑧢𑨿𑩀𑩁𑩂𑩃𑩄𑩅𑩆𑪚𑪛𑪜𑪞𑪟𑪠𑪡𑪢𑬀𑬁𑬂𑬃𑬄𑬅𑬆𑬇𑬈𑬉𑱁𑱂𑱃𑱄𑱅𑱰𑱱𑻷𑻸𑽃𑽄𑽅𑽆𑽇𑽈𑽉𑽊𑽋𑽌𑽍𑽎𑽏𑿿𒑰𒑱𒑲𒑳𒑴𒿱𒿲𖩮𖩯𖫵𖬷𖬸𖬹𖬺𖬻𖭄𖺗𖺘𖺙𖺚𖿢𛲟𝪇𝪈𝪉𝪊𝪋𞥞𞥟"""
+# NOTE: This list diverges slightly from the raw list, since []\ must be escaped
+# The [] need to be escaped to avoid prematurely closing the regex character class
+# The \ needs to be escaped to be considered as a raw \
+
+# https://www.compart.com/en/unicode/category
+# https://unicode.org/Public/UNIDATA/UnicodeData.txt
+
+
 """Commonly occurring strings which are some kind of valid Toki Pona or external token"""
 ALLOWABLES = {
     "cw",  # Content Warning
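PRUNED_POSIX_PUNCT keeps only the POSIX punctuation absent from UNICODE_PUNCT, so the two constants concatenate into a single re character class without duplicates. A self-check sketch of that invariant, assuming both constants import as added above:

import re

from sonatoki.constants import UNICODE_PUNCT, PRUNED_POSIX_PUNCT

# the two halves must be disjoint for the combined class to be duplicate-free
assert not set(PRUNED_POSIX_PUNCT) & set(UNICODE_PUNCT)

pattern = re.compile(rf"[{PRUNED_POSIX_PUNCT}{UNICODE_PUNCT}]+")
assert pattern.fullmatch("$+<=>")  # pruned POSIX characters
assert pattern.fullmatch("¡·…")    # Unicode punctuation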
sonatoki/ilo.py
CHANGED
@@ -1,5 +1,4 @@
 # STL
-import logging
 from typing import List, Type, Tuple

 # LOCAL
@@ -9,8 +8,6 @@ from sonatoki.Cleaners import Cleaner
 from sonatoki.Tokenizers import Tokenizer
 from sonatoki.Preprocessors import Preprocessor

-LOG = logging.getLogger(__name__)
-

 class Ilo:
     __preprocessors: List[Type[Preprocessor]]
@@ -20,7 +17,6 @@ class Ilo:
     __scoring_filters: List[Type[Filter]]
     __scorer: Type[Scorer]
     __passing_score: Number
-    logging_threshold: Number = -1

     def __init__(
         self,
@@ -104,14 +100,6 @@
         score = self.score_tokens(cleaned)
         result = score >= self.__passing_score

-        if score <= self.logging_threshold:
-            LOG.debug("msg: %.2f %s", score, repr(message))
-            LOG.debug("preproc: %s", repr(preprocessed))
-            LOG.debug("tokenized: %s", tokenized)
-            LOG.debug("filtered: %s", filtered)
-            LOG.debug("cleaned: %s", cleaned)
-        # TODO: Move to each function? Loses ability to control when logging occurs by threshold
-
         return preprocessed, tokenized, filtered, cleaned, score, result

     def is_toki_pona(self, message: str) -> bool:
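With the logging_threshold hook removed, the library no longer logs on the caller's behalf; equivalent diagnostics can live caller-side with the standard logging module. A generic sketch, independent of sonatoki's internals, with illustrative values:

import logging

logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger("my_app")

score, threshold = 0.40, 0.80  # hypothetical score and threshold
if score <= threshold:
    LOG.debug("score below threshold: %.2f", score)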
sonatoki-0.1.5.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+sonatoki-0.1.5.dist-info/METADATA,sha256=wJBa9CKSni9dcfQGQZp_-FSfANJHmTfZdYSCMH0Wolg,5225
+sonatoki-0.1.5.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
+sonatoki-0.1.5.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
+sonatoki/Cleaners.py,sha256=gTZ9dSsnvKVUtxM_ECSZ-_2heh--nD5A9dCQR1ATb1c,1160
+sonatoki/Configs.py,sha256=iY6Lyn1rMi7iF0M62yx0ET4pEb35-QAd1FS0tkyUfSc,1935
+sonatoki/Filters.py,sha256=xanTOKxasW_2OpQXsnk5pzbM1FmG8y46pakuTfMz9Hw,4470
+sonatoki/Preprocessors.py,sha256=FqBcHirsXV_91mj99ju9AnbsHaCFctSnuz_vca9ckSY,4441
+sonatoki/Scorers.py,sha256=w5p4qPzpEhR-xHaOXzqulN01OKtLcSPsTrgKyMhfAaQ,3658
+sonatoki/Tokenizers.py,sha256=Wqyf36d1OST7sVrANlLixDIOYWSctD5rVg1_MlaPrCw,2310
+sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sonatoki/__main__.py,sha256=6xc-wIrrFo9wTyn4zRQNAmqwmJBtVvCMwV-CrM-hueA,82
+sonatoki/constants.py,sha256=LZVI0322kTsqwrGQIJfgSl6IIZS-ste1rd9isRzcNqM,4883
+sonatoki/ilo.py,sha256=yyLgNPI0Hmb4f1BzX6IRHr11FPChfL2xDR_9odlr8_8,3849
+sonatoki/linku.json,sha256=B5KNdhyM5UEfMciROgh1ECHr3i-ASBeMvwrkzNJX47c,271013
+sonatoki/sandbox.json,sha256=hx6LRsfvmmTtqXcXIyCsfSaGK3DZ-GCdbM8xhZQBHoA,77650
+sonatoki-0.1.5.dist-info/RECORD,,
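Each RECORD row is path,sha256=<digest>,size, where the digest is the urlsafe-base64 SHA-256 of the file with padding stripped, per the wheel RECORD format. A sketch of how one entry could be verified:

import base64
import hashlib


def record_hash(data: bytes) -> str:
    digest = hashlib.sha256(data).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()


# the empty sonatoki/__init__.py (size 0) yields the digest recorded above
print(record_hash(b""))  # sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU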
sonatoki-0.1.4.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
-sonatoki-0.1.4.dist-info/METADATA,sha256=cK_EyYXPeY4rm9Plcre-i_DbPJZD06572cYQEIUQ804,5225
-sonatoki-0.1.4.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
-sonatoki-0.1.4.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
-sonatoki/Cleaners.py,sha256=gTZ9dSsnvKVUtxM_ECSZ-_2heh--nD5A9dCQR1ATb1c,1160
-sonatoki/Configs.py,sha256=iY6Lyn1rMi7iF0M62yx0ET4pEb35-QAd1FS0tkyUfSc,1935
-sonatoki/Filters.py,sha256=dL3XgH62OrVVvc8b6dtR5-JZmErVF4bl7ultAoHHqpo,4190
-sonatoki/Preprocessors.py,sha256=h2sX6nJIIOPotwHL0476VQe4KxERlD_F6nrvxDyuaTs,4205
-sonatoki/Scorers.py,sha256=V293DBiupBiujzuc4yMrKOAiuNTLltIsiCzIAlLeokA,4129
-sonatoki/Tokenizers.py,sha256=fvqxpubs2F63va2RzZKZQhZbFnVaC_9haXIA9Mqznis,1942
-sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sonatoki/__main__.py,sha256=6xc-wIrrFo9wTyn4zRQNAmqwmJBtVvCMwV-CrM-hueA,82
-sonatoki/constants.py,sha256=m0Z4At6MfbqZRio2glT3J3zT9x_itcWZBT_G82mpaVc,1647
-sonatoki/ilo.py,sha256=oN14iYFKxgjFjjOslgqBrMaIgpnvS5gO6MscbS0JS5A,4343
-sonatoki/linku.json,sha256=B5KNdhyM5UEfMciROgh1ECHr3i-ASBeMvwrkzNJX47c,271013
-sonatoki/sandbox.json,sha256=hx6LRsfvmmTtqXcXIyCsfSaGK3DZ-GCdbM8xhZQBHoA,77650
-sonatoki-0.1.4.dist-info/RECORD,,
{sonatoki-0.1.4.dist-info → sonatoki-0.1.5.dist-info}/WHEEL
File without changes

{sonatoki-0.1.4.dist-info → sonatoki-0.1.5.dist-info}/licenses/LICENSE
File without changes