sonatoki 0.8.3__tar.gz → 0.8.4__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (32)
  1. {sonatoki-0.8.3 → sonatoki-0.8.4}/PKG-INFO +1 -1
  2. {sonatoki-0.8.3 → sonatoki-0.8.4}/pyproject.toml +1 -1
  3. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/Preprocessors.py +15 -0
  4. {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/test_preprocessors.py +10 -1
  5. {sonatoki-0.8.3 → sonatoki-0.8.4}/LICENSE +0 -0
  6. {sonatoki-0.8.3 → sonatoki-0.8.4}/README.md +0 -0
  7. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/Cleaners.py +0 -0
  8. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/Configs.py +0 -0
  9. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/Filters.py +0 -0
  10. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/Scorers.py +0 -0
  11. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/Tokenizers.py +0 -0
  12. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/__init__.py +0 -0
  13. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/__main__.py +0 -0
  14. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/alphabetic.txt +0 -0
  15. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/constants.py +0 -0
  16. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/ilo.py +0 -0
  17. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/linku.json +0 -0
  18. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/py.typed +0 -0
  19. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/sandbox.json +0 -0
  20. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/syllabic.txt +0 -0
  21. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/types.py +0 -0
  22. {sonatoki-0.8.3 → sonatoki-0.8.4}/src/sonatoki/utils.py +0 -0
  23. {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/__init__.py +0 -0
  24. {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/test_cleaners.py +0 -0
  25. {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/test_filters.py +0 -0
  26. {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/test_ilo.py +0 -0
  27. {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/test_properties.py +0 -0
  28. {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/test_scorers.py +0 -0
  29. {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/test_tokenize.py +0 -0
  30. {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/test_utils.py +0 -0
  31. {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
  32. {sonatoki-0.8.3 → sonatoki-0.8.4}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sonatoki
3
- Version: 0.8.3
3
+ Version: 0.8.4
4
4
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
5
5
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
6
6
  License: AGPL-3.0-or-later
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "sonatoki"
3
- version = "0.8.3"
3
+ version = "0.8.4"
4
4
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
5
5
  authors = [
6
6
  { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
@@ -83,6 +83,19 @@ class MarkdownURLs(RegexPreprocessor):
83
83
  replace = r"\1"
84
84
 
85
85
 
86
+ class Emails(RegexPreprocessor):
87
+ """Attempt to remove emails, for a particularly strong definition of
88
+ "email".
89
+
90
+ https://www.regular-expressions.info/email.html
91
+ """
92
+
93
+ pattern = re.compile(
94
+ r"\b[a-zA-Z0-9._%+-]{2,}@[a-zA-Z0-9.-]{2,}\.[a-zA-Z]{2,24}\b",
95
+ flags=re.IGNORECASE,
96
+ )
97
+
98
+
86
99
  class Reference(RegexPreprocessor):
87
100
  """Remove text contained in double brackets.
88
101
 
@@ -228,6 +241,7 @@ RECOMMENDED_PREPROCESSORS: List[Type[Preprocessor]] = [
228
241
  Reference,
229
242
  MarkdownURLs,
230
243
  URLs,
244
+ Emails,
231
245
  Emoji,
232
246
  ]
233
247
 
@@ -242,6 +256,7 @@ __all__ = [
242
256
  "DiscordMentions",
243
257
  "DiscordSpecial",
244
258
  "DoubleQuotes",
259
+ "Emails",
245
260
  "Emoji",
246
261
  "MarkdownURLs",
247
262
  "RECOMMENDED_PREPROCESSORS",
@@ -24,6 +24,7 @@ from sonatoki.Preprocessors import (
24
24
  DiscordMentions,
25
25
  AngleBracketObject,
26
26
  )
27
+ from src.sonatoki.Preprocessors import Emails
27
28
 
28
29
 
29
30
  def extract_bracket_content(markdown_text: str) -> Optional[str]:
@@ -31,7 +32,7 @@ def extract_bracket_content(markdown_text: str) -> Optional[str]:
31
32
  if start == -1:
32
33
  return None
33
34
 
34
- end = markdown_text.rfind("]")
35
+ end = markdown_text.find("](")
35
36
  if end == -1 or end <= start:
36
37
  return None
37
38
 
@@ -54,11 +55,19 @@ def test_URLs(s: str):
54
55
  @example("[[] silly mode activated](https://discord.gg/)")
55
56
  @example("[https://example.com/](http://example.com)")
56
57
  @example("[192.168.0.255](http://localhost:80)")
58
+ @example("[text](https://bad.worse]/)")
57
59
  def test_MarkdownURLs(s: str):
58
60
  bracket_content = extract_bracket_content(s)
59
61
  assert MarkdownURLs.process(s) == bracket_content
60
62
 
61
63
 
64
+ @given(st.from_regex(Emails.pattern, fullmatch=True))
65
+ @example("mun@pona.la")
66
+ @example("tokipona@alinome.com")
67
+ def test_Emails(s: str):
68
+ assert Emails.process(s).strip() == ""
69
+
70
+
62
71
  @given(st.from_regex(Spoilers.pattern, fullmatch=True))
63
72
  @example("|| | ||")
64
73
  @example("|| content\n\n\ncontent ||")
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes