sonatoki 0.5.2__tar.gz → 0.5.3__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (31)
  1. {sonatoki-0.5.2 → sonatoki-0.5.3}/PKG-INFO +1 -1
  2. {sonatoki-0.5.2 → sonatoki-0.5.3}/pyproject.toml +1 -1
  3. {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/Configs.py +24 -10
  4. {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/test_properties.py +4 -2
  5. {sonatoki-0.5.2 → sonatoki-0.5.3}/LICENSE +0 -0
  6. {sonatoki-0.5.2 → sonatoki-0.5.3}/README.md +0 -0
  7. {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/Cleaners.py +0 -0
  8. {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/Filters.py +0 -0
  9. {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/Preprocessors.py +0 -0
  10. {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/Scorers.py +0 -0
  11. {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/Tokenizers.py +0 -0
  12. {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/__init__.py +0 -0
  13. {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/__main__.py +0 -0
  14. {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/alphabetic.txt +0 -0
  15. {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/constants.py +0 -0
  16. {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/ilo.py +0 -0
  17. {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/linku.json +0 -0
  18. {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/py.typed +0 -0
  19. {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/sandbox.json +0 -0
  20. {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/syllabic.txt +0 -0
  21. {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/utils.py +0 -0
  22. {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/__init__.py +0 -0
  23. {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/test_cleaners.py +0 -0
  24. {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/test_filters.py +0 -0
  25. {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/test_ilo.py +0 -0
  26. {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/test_preprocessors.py +0 -0
  27. {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/test_scorers.py +0 -0
  28. {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/test_tokenize.py +0 -0
  29. {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/test_utils.py +0 -0
  30. {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
  31. {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sonatoki
3
- Version: 0.5.2
3
+ Version: 0.5.3
4
4
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
5
5
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
6
6
  License: AGPL-3.0-or-later
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "sonatoki"
3
- version = "0.5.2"
3
+ version = "0.5.3"
4
4
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
5
5
  authors = [
6
6
  { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
@@ -112,16 +112,30 @@ __corpus_tokens_dict: Set[str] = cast(
112
112
  ].tokens, # pyright: ignore[reportAttributeAccessIssue]
113
113
  )
114
114
  __corpus_tokens_dict -= {
115
- "an",
116
- "i",
117
- "me",
118
- "ne",
119
- "se",
120
- "take",
121
- "ten",
122
- "to",
123
- "u",
124
- "we",
115
+ # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
116
+ # In this case, all of these appear more often in English by a factor of at least 10.
117
+ "aka", # also known as
118
+ "an", # article
119
+ "api", # API
120
+ "i", # 1st person
121
+ "kana", # japanese script
122
+ "me", # 1st person
123
+ "ne", # "no" in several languages
124
+ "nu", # "new", now in dutch
125
+ "se", # spanish particle, "see"
126
+ "take", # acquire, perhaps forcefully or without permission
127
+ "ten", # 10
128
+ "to", # to, too
129
+ "u", # no u
130
+ "we", # 1st person plural
131
+ "wi", # wii and discussions of syllables
132
+ "sole", # singular, of shoe
133
+ # unexplored candidates for removal
134
+ # "omen", # ominous
135
+ # "papa", # father
136
+ # "lo", # "lo" and "loo"
137
+ # "ewe", # sheep
138
+ # "pa", # father- eh?
125
139
  }
126
140
  """Mimics the previous implementation of ilo pi toki pona taso."""
127
141
  LazyConfig: IloConfig = {
@@ -1,6 +1,6 @@
1
1
  # PDM
2
2
  import hypothesis.strategies as st
3
- from hypothesis import given
3
+ from hypothesis import given, assume
4
4
 
5
5
  # LOCAL
6
6
  from sonatoki.Filters import (
@@ -54,11 +54,13 @@ def test_ku_filters_non_overlap(s: str):
54
54
  | NIMI_LINKU_COMMON
55
55
  | NIMI_LINKU_UNCOMMON
56
56
  | NIMI_LINKU_OBSCURE
57
- | NIMI_LINKU_SANDBOX - {"su"}
57
+ | NIMI_LINKU_SANDBOX
58
58
  )
59
59
  )
60
60
  )
61
61
  def test_linku_filters_non_overlap(s: str):
62
+ _ = assume(s != "su")
63
+
62
64
  s = Lowercase.clean(s)
63
65
  s = ConsecutiveDuplicates.clean(s)
64
66
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes