sonatoki 0.8.3__tar.gz → 0.8.4__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {sonatoki-0.8.3 → sonatoki-0.8.4}/PKG-INFO +1 -1
- {sonatoki-0.8.3 → sonatoki-0.8.4}/pyproject.toml +1 -1
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/Preprocessors.py +15 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/test_preprocessors.py +10 -1
- {sonatoki-0.8.3 → sonatoki-0.8.4}/LICENSE +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/README.md +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/Configs.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/Filters.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/alphabetic.txt +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/constants.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/syllabic.txt +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/types.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/__init__.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/test_cleaners.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/test_filters.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/test_ilo.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/test_properties.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/test_scorers.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/test_tokenize.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/test_utils.py +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
- {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
@@ -83,6 +83,19 @@ class MarkdownURLs(RegexPreprocessor):
|
|
83
83
|
replace = r"\1"
|
84
84
|
|
85
85
|
|
86
|
+
class Emails(RegexPreprocessor):
|
87
|
+
"""Attempt to remove emails, for a particularly strong definition of
|
88
|
+
"email".
|
89
|
+
|
90
|
+
https://www.regular-expressions.info/email.html
|
91
|
+
"""
|
92
|
+
|
93
|
+
pattern = re.compile(
|
94
|
+
r"\b[a-zA-Z0-9._%+-]{2,}@[a-zA-Z0-9.-]{2,}\.[a-zA-Z]{2,24}\b",
|
95
|
+
flags=re.IGNORECASE,
|
96
|
+
)
|
97
|
+
|
98
|
+
|
86
99
|
class Reference(RegexPreprocessor):
|
87
100
|
"""Remove text contained in double brackets.
|
88
101
|
|
@@ -228,6 +241,7 @@ RECOMMENDED_PREPROCESSORS: List[Type[Preprocessor]] = [
|
|
228
241
|
Reference,
|
229
242
|
MarkdownURLs,
|
230
243
|
URLs,
|
244
|
+
Emails,
|
231
245
|
Emoji,
|
232
246
|
]
|
233
247
|
|
@@ -242,6 +256,7 @@ __all__ = [
|
|
242
256
|
"DiscordMentions",
|
243
257
|
"DiscordSpecial",
|
244
258
|
"DoubleQuotes",
|
259
|
+
"Emails",
|
245
260
|
"Emoji",
|
246
261
|
"MarkdownURLs",
|
247
262
|
"RECOMMENDED_PREPROCESSORS",
|
@@ -24,6 +24,7 @@ from sonatoki.Preprocessors import (
|
|
24
24
|
DiscordMentions,
|
25
25
|
AngleBracketObject,
|
26
26
|
)
|
27
|
+
from src.sonatoki.Preprocessors import Emails
|
27
28
|
|
28
29
|
|
29
30
|
def extract_bracket_content(markdown_text: str) -> Optional[str]:
|
@@ -31,7 +32,7 @@ def extract_bracket_content(markdown_text: str) -> Optional[str]:
|
|
31
32
|
if start == -1:
|
32
33
|
return None
|
33
34
|
|
34
|
-
end = markdown_text.
|
35
|
+
end = markdown_text.find("](")
|
35
36
|
if end == -1 or end <= start:
|
36
37
|
return None
|
37
38
|
|
@@ -54,11 +55,19 @@ def test_URLs(s: str):
|
|
54
55
|
@example("[[] silly mode activated](https://discord.gg/)")
|
55
56
|
@example("[https://example.com/](http://example.com)")
|
56
57
|
@example("[192.168.0.255](http://localhost:80)")
|
58
|
+
@example("[text](https://bad.worse]/)")
|
57
59
|
def test_MarkdownURLs(s: str):
|
58
60
|
bracket_content = extract_bracket_content(s)
|
59
61
|
assert MarkdownURLs.process(s) == bracket_content
|
60
62
|
|
61
63
|
|
64
|
+
@given(st.from_regex(Emails.pattern, fullmatch=True))
|
65
|
+
@example("mun@pona.la")
|
66
|
+
@example("tokipona@alinome.com")
|
67
|
+
def test_Emails(s: str):
|
68
|
+
assert Emails.process(s).strip() == ""
|
69
|
+
|
70
|
+
|
62
71
|
@given(st.from_regex(Spoilers.pattern, fullmatch=True))
|
63
72
|
@example("|| | ||")
|
64
73
|
@example("|| content\n\n\ncontent ||")
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|