sonatoki 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- sonatoki/Preprocessors.py +3 -0
- sonatoki/Scorers.py +28 -11
- sonatoki/Tokenizers.py +10 -16
- {sonatoki-0.1.1.dist-info → sonatoki-0.1.2.dist-info}/METADATA +1 -1
- {sonatoki-0.1.1.dist-info → sonatoki-0.1.2.dist-info}/RECORD +7 -7
- {sonatoki-0.1.1.dist-info → sonatoki-0.1.2.dist-info}/WHEEL +0 -0
- {sonatoki-0.1.1.dist-info → sonatoki-0.1.2.dist-info}/licenses/LICENSE +0 -0
sonatoki/Preprocessors.py
CHANGED
sonatoki/Scorers.py
CHANGED
@@ -16,6 +16,13 @@ Number = Union[int, float]
|
|
16
16
|
Weights = Dict[str, Number]
|
17
17
|
|
18
18
|
|
19
|
+
def sigmoid(n: int) -> Number:
|
20
|
+
return 1 / (1 + math.exp(-(0.30 * (n - 1))))
|
21
|
+
# n-1 makes sigmoid(1) == 0.5
|
22
|
+
# 0.30 softens scaling in favor of short input
|
23
|
+
# return n / (1+abs(n)) # too weak in 0.7+
|
24
|
+
|
25
|
+
|
19
26
|
class Scorer(ABC):
|
20
27
|
@classmethod
|
21
28
|
@abstractmethod
|
@@ -27,7 +34,7 @@ class PassFail(Scorer):
|
|
27
34
|
"""The token passes any filter or fails all of them, scoring 1 or 0 respectively."""
|
28
35
|
|
29
36
|
@classmethod
|
30
|
-
def
|
37
|
+
def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
|
31
38
|
for f in filters:
|
32
39
|
if f.filter(token):
|
33
40
|
score = 1
|
@@ -47,10 +54,27 @@ class PassFail(Scorer):
|
|
47
54
|
total_score = 0
|
48
55
|
len_tokens = len(tokens)
|
49
56
|
for token in tokens:
|
50
|
-
total_score += cls.
|
57
|
+
total_score += cls.score_token(token, filters)
|
51
58
|
return total_score / len_tokens if len_tokens else 0
|
52
59
|
|
53
60
|
|
61
|
+
class SoftPassFail(PassFail):
|
62
|
+
@classmethod
|
63
|
+
@override
|
64
|
+
def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
|
65
|
+
if not tokens:
|
66
|
+
return 1
|
67
|
+
|
68
|
+
total_score = 0
|
69
|
+
len_tokens = len(tokens)
|
70
|
+
for token in tokens:
|
71
|
+
total_score += cls.score_token(token, filters)
|
72
|
+
|
73
|
+
percentage = total_score / len_tokens if len_tokens else 0
|
74
|
+
percentage **= sigmoid(len_tokens)
|
75
|
+
return percentage
|
76
|
+
|
77
|
+
|
54
78
|
class Scaling(Scorer):
|
55
79
|
"""
|
56
80
|
The sooner a token matches a filter, the higher its score.
|
@@ -91,13 +115,6 @@ class SoftScaling(Scaling):
|
|
91
115
|
For example, a single token scoring 0.64 will now score 0.8.
|
92
116
|
"""
|
93
117
|
|
94
|
-
@staticmethod
|
95
|
-
def sigmoid(n: int) -> Number:
|
96
|
-
return 1 / (1 + math.exp(-(0.30 * (n - 1))))
|
97
|
-
# n-1 makes sigmoid(1) == 0.5
|
98
|
-
# 0.30 softens scaling in favor of short input
|
99
|
-
# return n / (1+abs(n)) # too weak in 0.7+
|
100
|
-
|
101
118
|
@classmethod
|
102
119
|
@override
|
103
120
|
def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
|
@@ -113,11 +130,11 @@ class SoftScaling(Scaling):
|
|
113
130
|
total_score += cls.score_token(token, filters, len_filters)
|
114
131
|
|
115
132
|
percentage = total_score / max_score if max_score else 0
|
116
|
-
percentage **= cls.sigmoid(len_tokens)
|
133
|
+
percentage **= sigmoid(len_tokens)
|
117
134
|
return percentage
|
118
135
|
|
119
136
|
|
120
137
|
class Logarithmic(Scorer): ...
|
121
138
|
|
122
139
|
|
123
|
-
__all__ = ["PassFail", "Scaling", "SoftScaling"]
|
140
|
+
__all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]
|
sonatoki/Tokenizers.py
CHANGED
@@ -4,6 +4,8 @@ from typing import List, Callable
|
|
4
4
|
# PDM
|
5
5
|
import regex as re
|
6
6
|
|
7
|
+
# TODO: Entire module should be reworked to match the class scheme of the rest of the module, imo
|
8
|
+
|
7
9
|
try:
|
8
10
|
# PDM
|
9
11
|
import nltk
|
@@ -15,18 +17,14 @@ except ImportError as e:
|
|
15
17
|
|
16
18
|
LANGUAGE = "english" # for NLTK
|
17
19
|
|
18
|
-
SENT_DELIMS_RE = r"""(.*?[.?!;:])|(.+?$)"""
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
WORD_DELIMS_RE = r"""\s+|(?=[.?!;:'"-])"""
|
26
|
-
WORD_DELIMS_RE = re.compile(WORD_DELIMS_RE)
|
20
|
+
SENT_DELIMS_RE = re.compile(r"""(.*?[.?!;:])|(.+?$)""")
|
21
|
+
SENT_DELIMS_TOK = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")
|
22
|
+
# TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
|
23
|
+
# TODO: do the typography characters matter?
|
24
|
+
# NOTE: | / and , are *not* sentence delimiters for my purpose
|
27
25
|
|
28
|
-
|
29
|
-
WORD_DELIMS_TOK = re.compile(
|
26
|
+
WORD_DELIMS_RE = re.compile(r"""\s+|(?=[.?!;:'"-])""")
|
27
|
+
WORD_DELIMS_TOK = re.compile(r"([\p{Punctuation}\p{posix_punct}]+|\s+)")
|
30
28
|
|
31
29
|
Tokenizer = Callable[[str], List[str]]
|
32
30
|
|
@@ -53,11 +51,7 @@ def word_tokenize_re(s: str) -> List[str]:
|
|
53
51
|
|
54
52
|
|
55
53
|
def sent_tokenize_tok(s: str) -> List[str]:
|
56
|
-
return [
|
57
|
-
clean
|
58
|
-
for sent in re.findall(SENT_DELIMS_TOK, s)
|
59
|
-
if (clean := sent[0].strip() or sent[1].strip())
|
60
|
-
]
|
54
|
+
return [clean for sent in re.split(SENT_DELIMS_TOK, s) if (clean := sent.strip())]
|
61
55
|
|
62
56
|
|
63
57
|
def word_tokenize_tok(s: str) -> List[str]:
|
@@ -1,14 +1,14 @@
|
|
1
|
-
sonatoki-0.1.
|
2
|
-
sonatoki-0.1.
|
3
|
-
sonatoki-0.1.
|
1
|
+
sonatoki-0.1.2.dist-info/METADATA,sha256=3ccNKi5ODyxbkBvihKJ-XyXEDF5mzO0AzB0myhCMNMM,5020
|
2
|
+
sonatoki-0.1.2.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
|
3
|
+
sonatoki-0.1.2.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
|
4
4
|
sonatoki/Cleaners.py,sha256=gTZ9dSsnvKVUtxM_ECSZ-_2heh--nD5A9dCQR1ATb1c,1160
|
5
5
|
sonatoki/Filters.py,sha256=yzhYF79GX03cOwlR_-B8SPMQPZv4UpAPytH0fQwBE70,4093
|
6
|
-
sonatoki/Preprocessors.py,sha256=
|
7
|
-
sonatoki/Scorers.py,sha256=
|
8
|
-
sonatoki/Tokenizers.py,sha256=
|
6
|
+
sonatoki/Preprocessors.py,sha256=OhjKcH_nlDznT1Y-ssdU1xH6_xbNrPDQEh8D7H3UTcs,3499
|
7
|
+
sonatoki/Scorers.py,sha256=V293DBiupBiujzuc4yMrKOAiuNTLltIsiCzIAlLeokA,4129
|
8
|
+
sonatoki/Tokenizers.py,sha256=lFvYS9SXij6gIk-CatKyFQtbRV0ska9FAgynEObxd-A,1741
|
9
9
|
sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
10
|
sonatoki/__main__.py,sha256=6xc-wIrrFo9wTyn4zRQNAmqwmJBtVvCMwV-CrM-hueA,82
|
11
11
|
sonatoki/constants.py,sha256=h5rbCfu9YF76BsjQYud5d2wq1HODY05zOaw0Ir1cwjo,1320
|
12
12
|
sonatoki/ilo.py,sha256=h3TYoqrjHxMGhB8ZJLVijVzy1AVCeWJk5x0q-bs4JMc,4278
|
13
13
|
sonatoki/linku.json,sha256=MdFuFRIHniPDUVxKEKuUg1KyzPVgcCj4ZeyvburCwD0,270928
|
14
|
-
sonatoki-0.1.
|
14
|
+
sonatoki-0.1.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|