sonatoki 0.11.0__tar.gz → 0.11.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.11.0 → sonatoki-0.11.2}/PKG-INFO +1 -1
- {sonatoki-0.11.0 → sonatoki-0.11.2}/pyproject.toml +1 -1
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/Cleaners.py +11 -9
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/Tokenizers.py +12 -4
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/constants.py +4 -1
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/utils.py +2 -7
- {sonatoki-0.11.0 → sonatoki-0.11.2}/tests/test_cleaners.py +2 -2
- {sonatoki-0.11.0 → sonatoki-0.11.2}/tests/tokenize_cases/tokenize_sentences_tok.yml +23 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/LICENSE +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/README.md +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/Configs.py +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/Filters.py +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/Preprocessors.py +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/alphabetic.txt +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/syllabic.txt +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/types.py +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/tests/__init__.py +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/tests/test_filters.py +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/tests/test_ilo.py +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/tests/test_properties.py +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/tests/test_scorers.py +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/tests/test_tokenize.py +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/tests/test_utils.py +0 -0
- {sonatoki-0.11.0 → sonatoki-0.11.2}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/Cleaners.py

```diff
@@ -1,6 +1,7 @@
 # STL
 import re
 from abc import ABC, abstractmethod
+from sys import intern
 
 # PDM
 from typing_extensions import override
```
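The new `sys.intern` calls thread through every place Cleaners.py and Tokenizers.py produce a token. Presumably the motivation is that a toki pona corpus repeats a small vocabulary constantly, so interning makes equal tokens share a single string object, saving memory and letting equality checks short-circuit on identity. A minimal sketch of the effect (standard CPython behavior, not sonatoki code):

```python
from sys import intern

# equal strings built independently are usually distinct objects...
a = "".join(["to", "ki"])
b = "toki"
print(a == b, a is b)          # True False (in CPython)

# ...but intern() maps equal strings onto one shared object, so later
# comparisons can succeed on the cheap pointer-identity check
print(intern(a) is intern(b))  # True
```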
```diff
@@ -21,7 +22,7 @@ class RegexCleaner(Cleaner):
     @classmethod
     @override
     def clean(cls, token: str) -> str:
-        return re.sub(cls.pattern, cls.replace, token)
+        return intern(re.sub(cls.pattern, cls.replace, token))
 
 
 class ConsecutiveDuplicates(Cleaner):
```
```diff
@@ -44,29 +45,30 @@ class ConsecutiveDuplicates(Cleaner):
             return token
 
         output = token[0]
-
         last_output = output.lower()  # ignore case in comparison
         for i in range(1, len(token)):
-            cur_char = token[i]
-            if cur_char.lower() == last_output:
+            cur_char = intern(token[i])
+            lower_cur_char = intern(cur_char.lower())
+            if lower_cur_char == last_output:
                 continue
-            output += cur_char  # preserve case of string
-            last_output = cur_char.lower()
+            output += cur_char  # preserve case of string
+            last_output = lower_cur_char
+        output = intern(output)
         return output
 
 
 class ConsecutiveDuplicatesRe(RegexCleaner):
     """Reference implementation for `ConsecutiveDuplicates`."""
 
-    pattern = re.compile(r"(.)\1+", flags=re.IGNORECASE)
-    replace = r"\1"
+    pattern: "re.Pattern[str]" = re.compile(r"(.)\1+", flags=re.IGNORECASE)
+    replace: str = r"\1"
 
 
 class Lowercase(Cleaner):
     @classmethod
     @override
     def clean(cls, token: str) -> str:
-        return token.lower()
+        return intern(token.lower())
 
 
 __all__ = [
```
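For reference, the behavior the rewritten loop preserves: runs of the same letter collapse case-insensitively, and the case of the first character in each run survives. Expected outputs per the code above:

```python
from sonatoki.Cleaners import ConsecutiveDuplicates, ConsecutiveDuplicatesRe

print(ConsecutiveDuplicates.clean("mooooo"))   # mo
print(ConsecutiveDuplicates.clean("NiIiI"))    # Ni  (first character's case is kept)
print(ConsecutiveDuplicatesRe.clean("NiIiI"))  # Ni  (regex reference agrees)
```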
{sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/Tokenizers.py

```diff
@@ -1,6 +1,7 @@
 # STL
 import re
 from abc import ABC, abstractmethod
+from sys import intern
 from typing import Set, List
 
 # PDM
```
```diff
@@ -40,7 +41,11 @@ class RegexTokenizer(Tokenizer):
     @classmethod
     @override
     def tokenize(cls, s: str) -> List[str]:
-        return [clean for word in re.split(cls.pattern, s) if (clean := word.strip())]
+        return [
+            intern(clean)
+            for word in re.split(cls.pattern, s)
+            if (clean := word.strip())
+        ]
 
 
 class Regex1Tokenizer(Tokenizer):
```
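The reshaped comprehension is behavior-preserving apart from the `intern` call: the walrus binding `(clean := word.strip())` simultaneously filters out empty or whitespace-only fragments and binds the stripped form that gets interned. A standalone sketch with a hypothetical pattern (the real `cls.pattern` is defined per subclass and is not part of this diff):

```python
import re
from sys import intern

pattern = re.compile(r"([^a-zA-Z0-9]+)")  # hypothetical stand-in for cls.pattern

# the capturing group makes re.split keep delimiters; strip() plus the walrus
# drops whitespace-only pieces and binds the cleaned token in one step
tokens = [
    intern(clean)
    for word in re.split(pattern, "toki  pona!")
    if (clean := word.strip())
]
print(tokens)  # ['toki', 'pona', '!']
```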
```diff
@@ -50,7 +55,9 @@ class Regex1Tokenizer(Tokenizer):
     @override
     def tokenize(cls, s: str) -> List[str]:
         return [
-            clean for word in regex.split(cls.pattern, s) if (clean := word.strip())
+            intern(clean)
+            for word in regex.split(cls.pattern, s)
+            if (clean := word.strip())
         ]
 
 
```
```diff
@@ -65,7 +72,8 @@ class WordTokenizer(SetTokenizer):
     @classmethod
     def add_token(cls, s: str, tokens: List[str], last_match: int, i: int):
         if i > last_match:
-            tokens.append(s[last_match:i])
+            token = intern(s[last_match:i])
+            tokens.append(token)
 
     @classmethod
     def to_tokens(cls, s: str) -> List[str]:
```
```diff
@@ -91,7 +99,7 @@ class WordTokenizer(SetTokenizer):
 
             if NimiUCSUR.filter(s[i]):
                 cls.add_token(s, tokens, last_match, i)
-                tokens.append(s[i])
+                cls.add_token(s, tokens, i, i + 1)
                 i += 1
                 last_match = i
                 continue
```
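The second `add_token` call replaces what was previously a bare append, so UCSUR characters (sitelen pona lives in the UCSUR block starting at U+F1900) now flow through the same interning path as every other token. A simplified paraphrase of the surrounding loop, not the real `WordTokenizer` code:

```python
from typing import Callable, List

def split_ucsur(s: str, is_ucsur: Callable[[str], bool]) -> List[str]:
    tokens: List[str] = []
    last_match = i = 0
    while i < len(s):
        if is_ucsur(s[i]):
            # flush the pending span before the UCSUR char...
            if i > last_match:
                tokens.append(s[last_match:i])
            # ...then emit the char itself as a single-character token
            tokens.append(s[i])
            last_match = i = i + 1
            continue
        i += 1
    if last_match < len(s):
        tokens.append(s[last_match:])
    return tokens

print(split_ucsur("ab\U000F1900c", lambda c: ord(c) >= 0xF1900))
# ['ab', '\U000f1900', 'c']
```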
{sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/constants.py

```diff
@@ -498,7 +498,10 @@ ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
 
 
 UNICODE_WHITESPACE_RANGES = [
-    "\\U00000020",
+    "\\U00000009",  # tab
+    "\\U0000000A",  # line feed
+    "\\U0000000D",  # carriage return
+    "\\U00000020",  # space
     "\\U000000a0",
     "\\U00001680",
     "\\U00002000-\\U0000200a",
```
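The three added entries cover the ASCII control whitespace the list previously lacked. These strings are literal backslash-U escape sequences meant to be embedded in a regex character class, which Python's `re` decodes itself (`\UXXXXXXXX` escapes are supported since Python 3.3). A minimal sketch of how such a list would be used; the joining step here is an assumption, not code taken from sonatoki:

```python
import re

UNICODE_WHITESPACE_RANGES = ["\\U00000009", "\\U0000000A", "\\U0000000D", "\\U00000020"]

# join the escape strings into one character class; re decodes the \U escapes
whitespace = re.compile(f"[{''.join(UNICODE_WHITESPACE_RANGES)}]+")
print(whitespace.split("toki\tpona li\r\npona"))  # ['toki', 'pona', 'li', 'pona']
```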
{sonatoki-0.11.0 → sonatoki-0.11.2}/src/sonatoki/utils.py

```diff
@@ -1,6 +1,6 @@
 # STL
 import itertools
-from typing import Set, List, TypeVar, Iterable
+from typing import Set, List, Tuple, TypeVar, Iterable
 
 # LOCAL
 from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
```
```diff
@@ -72,12 +72,7 @@ def find_unicode_chars(ranges: List[str]) -> str:
     return "".join(result)
 
 
-def overlapping_pairs(iterable: Iterable[T]) -> Iterable[T]:
-    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
-    return overlapping_ntuples(iterable, n=2)
-
-
-def overlapping_ntuples(iterable: Iterable[T], n: int) -> Iterable[T]:
+def overlapping_ntuples(iterable: Iterable[T], n: int) -> Iterable[Tuple[T, ...]]:
     teed = itertools.tee(iterable, n)
     for i in range(1, n):
         for j in range(i):
```
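So the trivial `overlapping_pairs` wrapper is dropped in favor of calling `overlapping_ntuples(iterable, 2)` directly, and the return annotation is corrected to `Iterable[Tuple[T, ...]]` (hence the new `Tuple` import). The hunk cuts off before the body finishes; what remains is the standard `itertools.tee` sliding window, where the i-th copy of the iterator is advanced i steps and `zip` truncates to the shortest. A sketch of that idiom under the corrected signature:

```python
import itertools
from typing import Iterable, Tuple, TypeVar

T = TypeVar("T")

def overlapping_ntuples(iterable: Iterable[T], n: int) -> Iterable[Tuple[T, ...]]:
    teed = itertools.tee(iterable, n)
    for i in range(1, n):
        for j in range(i):
            next(teed[i], None)  # advance the i-th copy i steps
    return zip(*teed)            # zip stops at the shortest copy

print(list(overlapping_ntuples("toki", 2)))
# [('t', 'o'), ('o', 'k'), ('k', 'i')]
```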
{sonatoki-0.11.0 → sonatoki-0.11.2}/tests/test_cleaners.py

```diff
@@ -5,7 +5,7 @@ import hypothesis.strategies as st
 from hypothesis import given, assume, example
 
 # LOCAL
-from sonatoki.utils import overlapping_pairs
+from sonatoki.utils import overlapping_ntuples
 from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates, ConsecutiveDuplicatesRe
 
 # FILESYSTEM
```
```diff
@@ -21,7 +21,7 @@ from .test_utils import PROPER_NAME_RE
 def test_ConsecutiveDuplicatesRe(s: str):
     _ = assume("\n" not in s)
     res = ConsecutiveDuplicatesRe.clean(s)
-    for a, b in overlapping_pairs(res):
+    for a, b in overlapping_ntuples(res, 2):
         assert a.lower() != b.lower(), (s, res)
 
 
```
{sonatoki-0.11.0 → sonatoki-0.11.2}/tests/tokenize_cases/tokenize_sentences_tok.yml

```diff
@@ -94,6 +94,29 @@
   output:
     - "isn't that game-breaking?"
     - "i think so"
+- name: "intraword punctuation 3"
+  input: "e.g.\n- monsuta\n- monsi\n- ma"
+  output:
+    - "e.g."
+    - "-"
+    - "monsuta"
+    - "-"
+    - "monsi"
+    - "-"
+    - "ma"
+- name: "multiline with fake intraword"
+  input: >
+    toki!
+    sitelen pini ni li tu ala e toki.
+    ni kin.
+    taso ni li pini e toki anu seme:
+    pini la ni li toki sin.
+  output:
+    - "toki!"
+    - "sitelen pini ni li tu ala e toki."
+    - "ni kin."
+    - "taso ni li pini e toki anu seme:"
+    - "pini la ni li toki sin."
 - name: "fake intraword punct 1"
   input: "!.h"
   output:
```
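These cases feed sonatoki's sentence-tokenizer tests; the loader lives in tests/test_tokenize.py, which is unchanged in this release and not shown here. The sketch below is only a guess at the harness's general shape: the `SentTokenizer` name and the loading code are assumptions, not taken from this diff.

```python
import pathlib

import pytest
import yaml

# load every named case from the YAML fixture shown above
CASES = yaml.safe_load(
    pathlib.Path("tests/tokenize_cases/tokenize_sentences_tok.yml").read_text()
)

@pytest.mark.parametrize("case", CASES, ids=lambda c: c["name"])
def test_sentence_tokenization(case):
    from sonatoki.Tokenizers import SentTokenizer  # assumed tokenizer name
    assert SentTokenizer.tokenize(case["input"]) == case["output"]
```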