sonatoki 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Preprocessors.py CHANGED
@@ -121,6 +121,9 @@ class ArrowQuote(RegexPreprocessor):
 
 
 __all__ = [
+    "DiscordChannels",
+    "DiscordMentions",
+    "DiscordSpecial",
     "DiscordEmotes",
     "SingleQuotes",
     "DoubleQuotes",
sonatoki/Scorers.py CHANGED
@@ -16,6 +16,13 @@ Number = Union[int, float]
 Weights = Dict[str, Number]
 
 
+def sigmoid(n: int) -> Number:
+    return 1 / (1 + math.exp(-(0.30 * (n - 1))))
+    # n-1 makes sigmoid(1) == 0.5
+    # 0.30 softens scaling in favor of short input
+    # return n / (1+abs(n))  # too weak in 0.7+
+
+
 class Scorer(ABC):
     @classmethod
     @abstractmethod
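
For intuition, the now module-level helper maps token count to an exponent in [0.5, 1); a quick check of the formula above, values rounded to three places:

    >>> [round(sigmoid(n), 3) for n in (1, 2, 5, 10, 20)]
    [0.5, 0.574, 0.769, 0.937, 0.997]

Since the scorers raise a percentage in [0, 1] to this power, smaller exponents (shorter inputs) get the larger boost, and the adjustment fades toward a no-op as input length grows.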
@@ -27,7 +34,7 @@ class PassFail(Scorer):
     """The token passes any filter or fails all of them, scoring 1 or 0 respectively."""
 
     @classmethod
-    def __score(cls, token: str, filters: List[Type[Filter]]) -> Number:
+    def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
         for f in filters:
             if f.filter(token):
                 score = 1
@@ -47,10 +54,27 @@ class PassFail(Scorer):
         total_score = 0
         len_tokens = len(tokens)
         for token in tokens:
-            total_score += cls.__score(token, filters)
+            total_score += cls.score_token(token, filters)
         return total_score / len_tokens if len_tokens else 0
 
 
+class SoftPassFail(PassFail):
+    @classmethod
+    @override
+    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+        if not tokens:
+            return 1
+
+        total_score = 0
+        len_tokens = len(tokens)
+        for token in tokens:
+            total_score += cls.score_token(token, filters)
+
+        percentage = total_score / len_tokens if len_tokens else 0
+        percentage **= sigmoid(len_tokens)
+        return percentage
+
+
 class Scaling(Scorer):
     """
     The sooner a token matches a filter, the higher its score.
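
The new SoftPassFail applies the same sigmoid correction to PassFail's plain average. A worked example, with hypothetical tokens and filters under which three of four tokens pass:

    PassFail.score(tokens, filters)      # -> 3/4 = 0.75
    SoftPassFail.score(tokens, filters)  # -> 0.75 ** sigmoid(4) ≈ 0.75 ** 0.711 ≈ 0.815

The exponent softens the penalty on short, mostly-valid inputs.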
@@ -91,13 +115,6 @@ class SoftScaling(Scaling):
     For example, a single token scoring 0.64 will now score 0.8.
     """
 
-    @staticmethod
-    def sigmoid(n: int) -> Number:
-        return 1 / (1 + math.exp(-(0.30 * (n - 1))))
-        # n-1 makes sigmoid(1) == 0.5
-        # 0.30 softens scaling in favor of short input
-        # return n / (1+abs(n))  # too weak in 0.7+
-
     @classmethod
     @override
     def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
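
The docstring's figure follows directly from the now-shared helper: sigmoid(1) == 0.5, and 0.64 ** 0.5 == 0.8.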
@@ -113,11 +130,11 @@ class SoftScaling(Scaling):
             total_score += cls.score_token(token, filters, len_filters)
 
         percentage = total_score / max_score if max_score else 0
-        percentage **= cls.sigmoid(len_tokens)
+        percentage **= sigmoid(len_tokens)
         return percentage
 
 
 class Logarithmic(Scorer): ...
 
 
-__all__ = ["PassFail", "Scaling", "SoftScaling"]
+__all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]
sonatoki/Tokenizers.py CHANGED
@@ -4,6 +4,8 @@ from typing import List, Callable
 # PDM
 import regex as re
 
+# TODO: Entire module should be reworked to match the class scheme of the rest of the module, imo
+
 try:
     # PDM
     import nltk
@@ -15,18 +17,14 @@ except ImportError as e:
 
 LANGUAGE = "english" # for NLTK
 
-SENT_DELIMS_RE = r"""(.*?[.?!;:])|(.+?$)"""
-SENT_DELIMS_RE = re.compile(SENT_DELIMS_RE)
-
-SENT_DELIMS_TOK = r"""(.*?[.?!;:-])|(.+?$)"""
-SENT_DELIMS_TOK = re.compile(SENT_DELIMS_TOK)
-
-
-WORD_DELIMS_RE = r"""\s+|(?=[.?!;:'"-])"""
-WORD_DELIMS_RE = re.compile(WORD_DELIMS_RE)
+SENT_DELIMS_RE = re.compile(r"""(.*?[.?!;:])|(.+?$)""")
+SENT_DELIMS_TOK = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")
+# TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
+# TODO: do the typography characters matter?
+# NOTE: | / and , are *not* sentence delimiters for my purpose
 
-WORD_DELIMS_TOK = r"([\p{Punctuation}\p{posix_punct}]+|\s+)"
-WORD_DELIMS_TOK = re.compile(WORD_DELIMS_TOK)
+WORD_DELIMS_RE = re.compile(r"""\s+|(?=[.?!;:'"-])""")
+WORD_DELIMS_TOK = re.compile(r"([\p{Punctuation}\p{posix_punct}]+|\s+)")
 
 Tokenizer = Callable[[str], List[str]]
 
@@ -53,11 +51,7 @@ def word_tokenize_re(s: str) -> List[str]:
 
 
 def sent_tokenize_tok(s: str) -> List[str]:
-    return [
-        clean
-        for sent in re.findall(SENT_DELIMS_TOK, s)
-        if (clean := sent[0].strip() or sent[1].strip())
-    ]
+    return [clean for sent in re.split(SENT_DELIMS_TOK, s) if (clean := sent.strip())]
 
 
 def word_tokenize_tok(s: str) -> List[str]:
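
The rewritten SENT_DELIMS_TOK is zero-width (a lookbehind), so re.split cuts after each delimiter while leaving it attached to its sentence, where the old findall approach recovered sentences from capture groups. A minimal check of sent_tokenize_tok as defined above, assuming the regex module splits on zero-width matches the way stdlib re has since Python 3.7:

    >>> sent_tokenize_tok("toki! sina pona. mi tawa")
    ['toki!', 'sina pona.', 'mi tawa']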
sonatoki-0.1.1.dist-info/METADATA → sonatoki-0.1.2.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.1.1
+Version: 0.1.2
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
sonatoki-0.1.1.dist-info/RECORD → sonatoki-0.1.2.dist-info/RECORD RENAMED
@@ -1,14 +1,14 @@
-sonatoki-0.1.1.dist-info/METADATA,sha256=gUp7IAG1ZmBarZnwyxHt6RmP8GX0tahsaG1qkp047GM,5020
-sonatoki-0.1.1.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
-sonatoki-0.1.1.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
+sonatoki-0.1.2.dist-info/METADATA,sha256=3ccNKi5ODyxbkBvihKJ-XyXEDF5mzO0AzB0myhCMNMM,5020
+sonatoki-0.1.2.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
+sonatoki-0.1.2.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
 sonatoki/Cleaners.py,sha256=gTZ9dSsnvKVUtxM_ECSZ-_2heh--nD5A9dCQR1ATb1c,1160
 sonatoki/Filters.py,sha256=yzhYF79GX03cOwlR_-B8SPMQPZv4UpAPytH0fQwBE70,4093
-sonatoki/Preprocessors.py,sha256=G2up2jKKSrHQtTQWYNWH_fkjgroL45ZeajVn1KUECt8,3431
-sonatoki/Scorers.py,sha256=twuFGqcIg6UTaeBVb1SmKenldC3hj1s97m6zyM_HXjg,3678
-sonatoki/Tokenizers.py,sha256=epOG3jZHI3MSO_L_6Z3zsSkexDEMLVzA2ARg6EnPMO0,1628
+sonatoki/Preprocessors.py,sha256=OhjKcH_nlDznT1Y-ssdU1xH6_xbNrPDQEh8D7H3UTcs,3499
+sonatoki/Scorers.py,sha256=V293DBiupBiujzuc4yMrKOAiuNTLltIsiCzIAlLeokA,4129
+sonatoki/Tokenizers.py,sha256=lFvYS9SXij6gIk-CatKyFQtbRV0ska9FAgynEObxd-A,1741
 sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/__main__.py,sha256=6xc-wIrrFo9wTyn4zRQNAmqwmJBtVvCMwV-CrM-hueA,82
 sonatoki/constants.py,sha256=h5rbCfu9YF76BsjQYud5d2wq1HODY05zOaw0Ir1cwjo,1320
 sonatoki/ilo.py,sha256=h3TYoqrjHxMGhB8ZJLVijVzy1AVCeWJk5x0q-bs4JMc,4278
 sonatoki/linku.json,sha256=MdFuFRIHniPDUVxKEKuUg1KyzPVgcCj4ZeyvburCwD0,270928
-sonatoki-0.1.1.dist-info/RECORD,,
+sonatoki-0.1.2.dist-info/RECORD,,