sonatoki 0.9.0__tar.gz → 0.9.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {sonatoki-0.9.0 → sonatoki-0.9.2}/PKG-INFO +1 -1
  2. {sonatoki-0.9.0 → sonatoki-0.9.2}/pyproject.toml +1 -1
  3. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/Scorers.py +13 -8
  4. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/Tokenizers.py +4 -0
  5. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/constants.py +9 -3
  6. sonatoki-0.9.2/src/sonatoki/linku.json +1 -0
  7. sonatoki-0.9.2/src/sonatoki/sandbox.json +1 -0
  8. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/types.py +1 -1
  9. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/test_ilo.py +4 -1
  10. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/tokenize_cases/tokenize_words_tok.yml +51 -12
  11. sonatoki-0.9.0/src/sonatoki/linku.json +0 -1
  12. sonatoki-0.9.0/src/sonatoki/sandbox.json +0 -1
  13. {sonatoki-0.9.0 → sonatoki-0.9.2}/LICENSE +0 -0
  14. {sonatoki-0.9.0 → sonatoki-0.9.2}/README.md +0 -0
  15. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/Cleaners.py +0 -0
  16. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/Configs.py +3 -3
  17. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/Filters.py +0 -0
  18. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/Preprocessors.py +0 -0
  19. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/__init__.py +0 -0
  20. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/__main__.py +0 -0
  21. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/alphabetic.txt +0 -0
  22. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/ilo.py +0 -0
  23. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/py.typed +0 -0
  24. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/syllabic.txt +0 -0
  25. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/utils.py +0 -0
  26. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/__init__.py +0 -0
  27. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/test_cleaners.py +0 -0
  28. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/test_filters.py +0 -0
  29. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/test_preprocessors.py +0 -0
  30. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/test_properties.py +0 -0
  31. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/test_scorers.py +0 -0
  32. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/test_tokenize.py +0 -0
  33. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/test_utils.py +0 -0
  34. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
{sonatoki-0.9.0 → sonatoki-0.9.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.9.0
+Version: 0.9.2
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
{sonatoki-0.9.0 → sonatoki-0.9.2}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sonatoki"
-version = "0.9.0"
+version = "0.9.2"
 description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
 authors = [
     { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
{sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/Scorers.py
@@ -113,19 +113,24 @@ class Scaling(Scorer):
 
 
 class Voting(Scaling):
-    """Derives from `Scaling` in assigning scores from 0 to 1 based on how soon
-    a filter matches, with the first filter scoring a 1. However, after all
-    scores are derived, each token scoring 0 is given a is given an opportunity
-    to score based on its nearest 3 neighbors.
-
-    If created with a Filter, tokens must also pass that filter to be
-    considered for voting.
+    """Derives from `Scaling` in assigning scores from 0 to 1 based on the
+    first matching filter out of the list of filters. However, after all scores
+    are derived, each token scoring less than the threshold is assigned the
+    average score of its nearest 3 neighbors. The default threshold is 0.
+
+    If there are 3 or fewer tokens, this scorer is identical to the
+    Scaling scorer.
+
+    If the Voting scorer is created with a Filter, tokens must also
+    match that filter to be considered for voting. For example, the
+    following Voting filter would only check words with a score of 0.3
+    or less that still match the Syllabic filter: `Voting(Syllabic, 0.3)`
     """
 
     prereq: Type[Filter] = Pass
     threshold: int = 0
 
-    def __new__(cls, filter: Type[Filter], threshold_: int):
+    def __new__(cls, filter: Type[Filter], threshold_: int = 0) -> Type[Scorer]:
         class AnonVoting(Voting):
             prereq = filter
             threshold = threshold_
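The rewritten docstring pins down the voting pass: tokens scoring at or below the threshold inherit the average score of their nearest 3 neighbors, and short inputs fall back to plain Scaling. A minimal sketch of that behavior follows (an illustration only, not the package's actual code; the prereq filter check is omitted, and the exact neighbor-selection rule is an assumption):

from statistics import fmean
from typing import List


def vote(scores: List[float], threshold: float = 0.0) -> List[float]:
    # With 3 or fewer tokens there are not enough neighbors to vote,
    # so this is identical to the Scaling scorer, per the docstring.
    if len(scores) <= 3:
        return list(scores)
    voted = list(scores)
    for i, score in enumerate(scores):
        if score > threshold:
            continue  # only tokens at or below the threshold are re-scored
        # "Nearest 3 neighbors" is read here as the three indices closest
        # to i; averages are taken over the original pre-voting scores.
        nearest = sorted(
            (j for j in range(len(scores)) if j != i),
            key=lambda j: abs(j - i),
        )[:3]
        voted[i] = fmean(scores[j] for j in nearest)
    return voted

Per the new `__new__` signature, `Voting(Syllabic, 0.3)` builds an anonymous subclass with `prereq = Syllabic` and `threshold = 0.3`, and the threshold now defaults to 0 when omitted.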
{sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/Tokenizers.py
@@ -104,6 +104,10 @@ class WordTokenizer(SetTokenizer):
                 # we skipped, but there wasn't another writing character
                 cls.add_token(s, tokens, last_match, i - 1)
                 last_match = i - 1
+                # there may be punctuation though
+                # TODO: this is duplicated
+                while i < slen and cls.is_delimiter(s[i]):
+                    i += 1
 
         cls.add_token(s, tokens, last_match, i)
 
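The added loop consumes any run of delimiters that follows an emitted token instead of re-entering the main scan mid-run. Assuming `WordTokenizer.tokenize` is the public entry point (the `tokenize_words_tok.yml` test cases exercise it), the effect can be observed directly; exact output depends on the installed version:

from sonatoki.Tokenizers import WordTokenizer

# A word followed by a run of punctuation should yield the word and the
# punctuation run as clean tokens, with no empty or duplicated token
# where the run ends.
print(WordTokenizer.tokenize("toki!!! pona"))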
{sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/constants.py
@@ -7,7 +7,7 @@ from pathlib import Path
 from sonatoki.types import LinkuWord, LinkuUsageDate
 from sonatoki.utils import find_unicode_chars, find_unicode_ranges
 
-LATEST_DATE = "2023-09"
+LATEST_DATE = "2024-09"
 # hardcoding this seems bad, but it means the parser is stable w.r.t. Linku!
 
 
@@ -507,7 +507,7 @@ SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"
 # single quotes are word boundaries if not intra-word, but double quotes are sentence
 # boundaries
 
-INTRA_WORD_PUNCT = """-'"""
+INTRA_WORD_PUNCT = """-'’."""
 
 
 LINKU = Path(__file__).resolve().parent / Path("linku.json")
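Adding `’` and `.` to `INTRA_WORD_PUNCT` keeps curly apostrophes and word-internal periods attached to their word during tokenization; the switch from the fragment `"isn"` to the whole word `"isnt"` in `FALSE_POS_ALPHABETIC` in the next hunk follows from the same change (the old entry even carried a `# TODO: tokenizer....` note). A hedged example, again assuming `WordTokenizer.tokenize` as the entry point:

from sonatoki.Tokenizers import WordTokenizer

# "isn’t" is now expected to survive as one token rather than splitting
# at the curly apostrophe into "isn" and "t", so the alphabetic false
# positive list can name the full word "isnt".
print(WordTokenizer.tokenize("that isn’t toki pona"))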
@@ -668,10 +668,11 @@ FALSE_POS_ALPHABETIC: Set[str] = {
     "not",
     "link",
     "wait",
+    "just",
     "lol",
     "new",
     "also",
-    "isn",  # TODO: tokenizer....
+    "isnt",
     "mean",
     "means",
     "it",
@@ -681,6 +682,7 @@ FALSE_POS_ALPHABETIC: Set[str] = {
     "new",
     "wel",
     "makes",
+    "unles",
 }
 
 UCSUR_RANGES = [
@@ -722,6 +724,10 @@ def words_by_usage(
 
     result: Set[str] = set()
     for word in data.values():
+        if usage == 0:
+            result.add(word["word"])
+            continue
+
         usages = word["usage"]
         if date in usages and usages[date] >= usage:
             result.add(word["word"])
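The new branch gives `usage == 0` a precise meaning: every word in the Linku data is included, even words with no usage statistics for the requested date. Previously such words were silently dropped, because the `date in usages` lookup failed for them. A sketch of the resulting contract (assuming the `words_by_usage(usage, date)` signature implied by the hunk):

from sonatoki.constants import words_by_usage

all_words = words_by_usage(0)  # every word, no per-date lookup at all
common = words_by_usage(80)    # words at or above 80% usage
# Any positive threshold selects a subset of the zero-threshold set.
assert common <= all_words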