sonatoki 0.9.1__py3-none-any.whl → 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Tokenizers.py CHANGED
@@ -104,6 +104,10 @@ class WordTokenizer(SetTokenizer):
104
104
  # we skipped, but there wasn't another writing character
105
105
  cls.add_token(s, tokens, last_match, i - 1)
106
106
  last_match = i - 1
107
+ # there may be punctuation though
108
+ # TODO: this is duplicated
109
+ while i < slen and cls.is_delimiter(s[i]):
110
+ i += 1
107
111
 
108
112
  cls.add_token(s, tokens, last_match, i)
109
113
 
sonatoki/constants.py CHANGED
@@ -507,7 +507,7 @@ SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"
507
507
  # single quotes are word boundaries if not intra-word, but double quotes are sentence
508
508
  # boundaries
509
509
 
510
- INTRA_WORD_PUNCT = """-'"""
510
+ INTRA_WORD_PUNCT = """-'’."""
511
511
 
512
512
 
513
513
  LINKU = Path(__file__).resolve().parent / Path("linku.json")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sonatoki
3
- Version: 0.9.1
3
+ Version: 0.9.2
4
4
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
5
5
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
6
6
  License: AGPL-3.0-or-later
@@ -1,17 +1,17 @@
1
- sonatoki-0.9.1.dist-info/METADATA,sha256=LtQdicBtNTIcKfboAe5C7kPMLs_J46H8msRAcjY5gUw,6893
2
- sonatoki-0.9.1.dist-info/WHEEL,sha256=thaaA2w1JzcGC48WYufAs8nrYZjJm8LqNfnXFOFyCC4,90
3
- sonatoki-0.9.1.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
4
- sonatoki-0.9.1.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
1
+ sonatoki-0.9.2.dist-info/METADATA,sha256=nTqR-hm823FWnDVMJCgoWwmhSU4RaE2fdayXQcixd4o,6893
2
+ sonatoki-0.9.2.dist-info/WHEEL,sha256=thaaA2w1JzcGC48WYufAs8nrYZjJm8LqNfnXFOFyCC4,90
3
+ sonatoki-0.9.2.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
4
+ sonatoki-0.9.2.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
5
5
  sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
6
6
  sonatoki/Configs.py,sha256=6TY-G1nZFGv5EcElatWvI5MagwVCo92D5TTl7s2PX_s,4899
7
7
  sonatoki/Filters.py,sha256=8HAtR6_Rk6GPboaS_MHwSjZBJxYnAA8kYbRPI0eR6sM,14823
8
8
  sonatoki/Preprocessors.py,sha256=RmzkvPVo6Kdx1rZ5HeR9cTtx6oxpp2iLKrOMCUEqIrM,7107
9
9
  sonatoki/Scorers.py,sha256=zkdWc0hbtCX1HPdhI2tu2mL4Z5_S5sv7T83MefE4Yik,7756
10
- sonatoki/Tokenizers.py,sha256=8lpC70bzXOpHyhVr5bmqpYKmdmQvJdf7X5-Icc9RRCw,5040
10
+ sonatoki/Tokenizers.py,sha256=cfWWZCfvn2tNJChDrofHrORZExp17g0rPmH5ydWgTQY,5219
11
11
  sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  sonatoki/__main__.py,sha256=QIWRLYS1jb7OBUBK5s8kYoeiMv6MLBlt_I7H7tIVjpU,5745
13
13
  sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
14
- sonatoki/constants.py,sha256=I7njbNlIlTnOeowApVYV2l6uQLl3N5vAdS0regd11lI,19516
14
+ sonatoki/constants.py,sha256=BrU45haroW-ya3qmFsVk7fdTUGyoYVw1MdVVnpiWjt8,19517
15
15
  sonatoki/ilo.py,sha256=Dsn0yagkwjqpAQoCj6mkZ6NqWeanRF2lxNDNoqjWGLo,5993
16
16
  sonatoki/linku.json,sha256=U5KVxFJSageQydXXDsQCT8X_QoNAK2OaZhJmbu0eoZo,299939
17
17
  sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -19,4 +19,4 @@ sonatoki/sandbox.json,sha256=QAviQZ7_nwstUr1ejKegxiIoYmBL2YJIoiZovDYNFL8,147485
19
19
  sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
20
20
  sonatoki/types.py,sha256=VjYSGAzsbR_d3mg8n-VHg__7LyXpmGdEIMDsbPHyxFw,1265
21
21
  sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
22
- sonatoki-0.9.1.dist-info/RECORD,,
22
+ sonatoki-0.9.2.dist-info/RECORD,,