sonatoki 0.11.1__tar.gz → 0.11.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {sonatoki-0.11.1 → sonatoki-0.11.3}/PKG-INFO +1 -1
  2. {sonatoki-0.11.1 → sonatoki-0.11.3}/pyproject.toml +1 -1
  3. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/Cleaners.py +11 -9
  4. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/Tokenizers.py +12 -4
  5. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/constants.py +1 -1
  6. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/utils.py +2 -7
  7. {sonatoki-0.11.1 → sonatoki-0.11.3}/tests/test_cleaners.py +2 -2
  8. {sonatoki-0.11.1 → sonatoki-0.11.3}/tests/tokenize_cases/tokenize_sentences_tok.yml +5 -0
  9. {sonatoki-0.11.1 → sonatoki-0.11.3}/tests/tokenize_cases/tokenize_words_tok.yml +10 -7
  10. {sonatoki-0.11.1 → sonatoki-0.11.3}/LICENSE +0 -0
  11. {sonatoki-0.11.1 → sonatoki-0.11.3}/README.md +0 -0
  12. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/Configs.py +0 -0
  13. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/Filters.py +0 -0
  14. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/Preprocessors.py +0 -0
  15. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/Scorers.py +0 -0
  16. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/__init__.py +0 -0
  17. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/__main__.py +0 -0
  18. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/alphabetic.txt +0 -0
  19. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/ilo.py +0 -0
  20. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/linku.json +0 -0
  21. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/py.typed +0 -0
  22. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/sandbox.json +0 -0
  23. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/syllabic.txt +0 -0
  24. {sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/types.py +0 -0
  25. {sonatoki-0.11.1 → sonatoki-0.11.3}/tests/__init__.py +0 -0
  26. {sonatoki-0.11.1 → sonatoki-0.11.3}/tests/test_filters.py +0 -0
  27. {sonatoki-0.11.1 → sonatoki-0.11.3}/tests/test_ilo.py +0 -0
  28. {sonatoki-0.11.1 → sonatoki-0.11.3}/tests/test_preprocessors.py +0 -0
  29. {sonatoki-0.11.1 → sonatoki-0.11.3}/tests/test_properties.py +0 -0
  30. {sonatoki-0.11.1 → sonatoki-0.11.3}/tests/test_scorers.py +0 -0
  31. {sonatoki-0.11.1 → sonatoki-0.11.3}/tests/test_tokenize.py +0 -0
  32. {sonatoki-0.11.1 → sonatoki-0.11.3}/tests/test_utils.py +0 -0
{sonatoki-0.11.1 → sonatoki-0.11.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.11.1
+Version: 0.11.3
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later

{sonatoki-0.11.1 → sonatoki-0.11.3}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sonatoki"
-version = "0.11.1"
+version = "0.11.3"
 description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
 authors = [
     { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },

{sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/Cleaners.py
@@ -1,6 +1,7 @@
 # STL
 import re
 from abc import ABC, abstractmethod
+from sys import intern
 
 # PDM
 from typing_extensions import override
@@ -21,7 +22,7 @@ class RegexCleaner(Cleaner):
     @classmethod
     @override
     def clean(cls, token: str) -> str:
-        return re.sub(cls.pattern, cls.replace, token)
+        return intern(re.sub(cls.pattern, cls.replace, token))
 
 
 class ConsecutiveDuplicates(Cleaner):
@@ -44,29 +45,30 @@ class ConsecutiveDuplicates(Cleaner):
             return token
 
         output = token[0]
-
         last_output = output.lower()  # ignore case in comparison
        for i in range(1, len(token)):
-            cur_char = token[i].lower()
-            if cur_char == last_output:
+            cur_char = intern(token[i])
+            lower_cur_char = intern(cur_char.lower())
+            if lower_cur_char == last_output:
                 continue
-            output += token[i]  # preserve case of string
-            last_output = cur_char
+            output += cur_char  # preserve case of string
+            last_output = lower_cur_char
+        output = intern(output)
         return output
 
 
 class ConsecutiveDuplicatesRe(RegexCleaner):
     """Reference implementation for `ConsecutiveDuplicates`."""
 
-    pattern = re.compile(r"(.)\1+", flags=re.IGNORECASE)
-    replace = r"\1"
+    pattern: "re.Pattern[str]" = re.compile(r"(.)\1+", flags=re.IGNORECASE)
+    replace: str = r"\1"
 
 
 class Lowercase(Cleaner):
     @classmethod
     @override
     def clean(cls, token: str) -> str:
-        return token.lower()
+        return intern(token.lower())
 
 
 __all__ = [

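For context on the interning above: sys.intern returns a canonical copy of a string, so equal cleaner outputs share a single object and later comparisons can be answered by identity. A minimal sketch of the effect (illustration only, not part of the package):

    from sys import intern

    a = intern("".join(["to", "ki"]))  # a dynamically built string
    b = intern("toki")                 # a literal with the same content
    assert a is b   # interned copies of equal strings are the same object
    assert a == b   # equality is unchanged, but can short-circuit on identity
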
{sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/Tokenizers.py
@@ -1,6 +1,7 @@
 # STL
 import re
 from abc import ABC, abstractmethod
+from sys import intern
 from typing import Set, List
 
 # PDM
@@ -40,7 +41,11 @@ class RegexTokenizer(Tokenizer):
     @classmethod
     @override
     def tokenize(cls, s: str) -> List[str]:
-        return [clean for word in re.split(cls.pattern, s) if (clean := word.strip())]
+        return [
+            intern(clean)
+            for word in re.split(cls.pattern, s)
+            if (clean := word.strip())
+        ]
 
 
 class Regex1Tokenizer(Tokenizer):
@@ -50,7 +55,9 @@ class Regex1Tokenizer(Tokenizer):
     @override
     def tokenize(cls, s: str) -> List[str]:
         return [
-            clean for word in regex.split(cls.pattern, s) if (clean := word.strip())
+            intern(clean)
+            for word in regex.split(cls.pattern, s)
+            if (clean := word.strip())
         ]
 
 
@@ -65,7 +72,8 @@ class WordTokenizer(SetTokenizer):
     @classmethod
     def add_token(cls, s: str, tokens: List[str], last_match: int, i: int):
         if i > last_match:
-            tokens.append(s[last_match:i])
+            token = intern(s[last_match:i])
+            tokens.append(token)
 
     @classmethod
     def to_tokens(cls, s: str) -> List[str]:
@@ -91,7 +99,7 @@ class WordTokenizer(SetTokenizer):
 
             if NimiUCSUR.filter(s[i]):
                 cls.add_token(s, tokens, last_match, i)
-                tokens.append(s[i])
+                cls.add_token(s, tokens, i, i + 1)
                 i += 1
                 last_match = i
                 continue

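The last hunk routes single UCSUR characters through add_token instead of appending the raw character, so they pass through the same intern call as every other token. For a one-character span the slice add_token receives is equivalent to the indexed character, as this trivial check illustrates:

    s = "toki"
    i = 0
    assert s[i : i + 1] == s[i]  # a one-character slice equals the indexed character
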
{sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/constants.py
@@ -538,7 +538,7 @@ QUOTATIVE_PUNCT = """"«»‹›“”‟„⹂「」『』"""
 UCSUR_SENTENCE_PUNCT = """󱦜󱦝"""
 ALL_SENTENCE_PUNCT = BASIC_SENTENCE_PUNCT + UCSUR_SENTENCE_PUNCT
 
-INTRA_WORD_PUNCT = """-'’."""
+INTRA_WORD_PUNCT = """-'’._"""
 
 
 LINKU = Path(__file__).resolve().parent / Path("linku.json")

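Adding the underscore to INTRA_WORD_PUNCT means snake_case identifiers are no longer split apart by the word tokenizer. A usage sketch consistent with the new "underscore" test case further down (the classmethod call is assumed from the tokenizer interface shown above):

    from sonatoki.Tokenizers import WordTokenizer

    tokens = WordTokenizer.tokenize("look at this variable: leaf_node_right")
    # expected, per the new test case:
    # ["look", "at", "this", "variable", ":", "leaf_node_right"]
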
{sonatoki-0.11.1 → sonatoki-0.11.3}/src/sonatoki/utils.py
@@ -1,6 +1,6 @@
 # STL
 import itertools
-from typing import Set, List, TypeVar, Iterable
+from typing import Set, List, Tuple, TypeVar, Iterable
 
 # LOCAL
 from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
@@ -72,12 +72,7 @@ def find_unicode_chars(ranges: List[str]) -> str:
     return "".join(result)
 
 
-def overlapping_pairs(iterable: Iterable[T]) -> Iterable[T]:
-    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
-    return overlapping_ntuples(iterable, n=2)
-
-
-def overlapping_ntuples(iterable: Iterable[T], n: int) -> Iterable[T]:
+def overlapping_ntuples(iterable: Iterable[T], n: int) -> Iterable[Tuple[T, ...]]:
     teed = itertools.tee(iterable, n)
     for i in range(1, n):
         for j in range(i):

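overlapping_pairs was only a thin wrapper around overlapping_ntuples, so callers now pass n=2 directly; the return annotation is also corrected to Iterable[Tuple[T, ...]]. Expected behavior, following the removed wrapper's docstring ("s -> (s0,s1), (s1,s2), (s2, s3), ..."):

    from sonatoki.utils import overlapping_ntuples

    pairs = list(overlapping_ntuples("abcd", 2))
    # [("a", "b"), ("b", "c"), ("c", "d")]
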
{sonatoki-0.11.1 → sonatoki-0.11.3}/tests/test_cleaners.py
@@ -5,7 +5,7 @@ import hypothesis.strategies as st
 from hypothesis import given, assume, example
 
 # LOCAL
-from sonatoki.utils import overlapping_pairs
+from sonatoki.utils import overlapping_ntuples
 from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates, ConsecutiveDuplicatesRe
 
 # FILESYSTEM
@@ -21,7 +21,7 @@ from .test_utils import PROPER_NAME_RE
 def test_ConsecutiveDuplicatesRe(s: str):
     _ = assume("\n" not in s)
     res = ConsecutiveDuplicatesRe.clean(s)
-    for a, b in overlapping_pairs(res):
+    for a, b in overlapping_ntuples(res, 2):
         assert a.lower() != b.lower(), (s, res)
 
 

{sonatoki-0.11.1 → sonatoki-0.11.3}/tests/tokenize_cases/tokenize_sentences_tok.yml
@@ -104,6 +104,11 @@
     - "monsi"
     - "-"
     - "ma"
+- name: "intraword punctuation 4"
+  input: "look at this variable: leaf_node_right"
+  output:
+    - "look at this variable:"
+    - "leaf_node_right"
 - name: "multiline with fake intraword"
   input: >
     toki!

{sonatoki-0.11.1 → sonatoki-0.11.3}/tests/tokenize_cases/tokenize_words_tok.yml
@@ -187,13 +187,7 @@
     - "e"
     - "sitelen"
     - "[_"
-    - "ike"
-    - "_"
-    - "nanpa"
-    - "_"
-    - "lete"
-    - "_"
-    - "ike"
+    - "ike_nanpa_lete_ike"
     - "]."
     - "ni"
     - "li"
@@ -345,6 +339,15 @@
   input: "whom's't'd've'n't"
   output:
     - "whom's't'd've'n't"
+- name: "underscore"
+  input: "look at this variable: leaf_node_right"
+  output:
+    - "look"
+    - "at"
+    - "this"
+    - "variable"
+    - ":"
+    - "leaf_node_right"
 - name: "just periods"
   input: "..."
   output: