sonatoki 0.8.1__tar.gz → 0.8.2__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
Files changed (32)
  1. {sonatoki-0.8.1 → sonatoki-0.8.2}/PKG-INFO +1 -1
  2. {sonatoki-0.8.1 → sonatoki-0.8.2}/pyproject.toml +1 -1
  3. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/constants.py +3 -2
  4. {sonatoki-0.8.1 → sonatoki-0.8.2}/tests/test_preprocessors.py +2 -1
  5. {sonatoki-0.8.1 → sonatoki-0.8.2}/tests/tokenize_cases/tokenize_sentences_tok.yml +2 -1
  6. {sonatoki-0.8.1 → sonatoki-0.8.2}/LICENSE +0 -0
  7. {sonatoki-0.8.1 → sonatoki-0.8.2}/README.md +0 -0
  8. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/Cleaners.py +0 -0
  9. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/Configs.py +0 -0
  10. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/Filters.py +0 -0
  11. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/Preprocessors.py +0 -0
  12. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/Scorers.py +0 -0
  13. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/Tokenizers.py +0 -0
  14. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/__init__.py +0 -0
  15. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/__main__.py +0 -0
  16. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/alphabetic.txt +0 -0
  17. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/ilo.py +0 -0
  18. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/linku.json +0 -0
  19. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/py.typed +0 -0
  20. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/sandbox.json +0 -0
  21. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/syllabic.txt +0 -0
  22. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/types.py +0 -0
  23. {sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/utils.py +0 -0
  24. {sonatoki-0.8.1 → sonatoki-0.8.2}/tests/__init__.py +0 -0
  25. {sonatoki-0.8.1 → sonatoki-0.8.2}/tests/test_cleaners.py +0 -0
  26. {sonatoki-0.8.1 → sonatoki-0.8.2}/tests/test_filters.py +0 -0
  27. {sonatoki-0.8.1 → sonatoki-0.8.2}/tests/test_ilo.py +0 -0
  28. {sonatoki-0.8.1 → sonatoki-0.8.2}/tests/test_properties.py +0 -0
  29. {sonatoki-0.8.1 → sonatoki-0.8.2}/tests/test_scorers.py +0 -0
  30. {sonatoki-0.8.1 → sonatoki-0.8.2}/tests/test_tokenize.py +0 -0
  31. {sonatoki-0.8.1 → sonatoki-0.8.2}/tests/test_utils.py +0 -0
  32. {sonatoki-0.8.1 → sonatoki-0.8.2}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sonatoki
3
- Version: 0.8.1
3
+ Version: 0.8.2
4
4
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
5
5
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
6
6
  License: AGPL-3.0-or-later
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "sonatoki"
3
- version = "0.8.1"
3
+ version = "0.8.2"
4
4
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
5
5
  authors = [
6
6
  { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
@@ -503,8 +503,9 @@ ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
503
503
  ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
504
504
  # combined bc the result could be simpler
505
505
 
506
- SENTENCE_PUNCT = """.?!:;()[-]·•…"""
507
- # NOTE: quotes were previously included, but in TP they are *not* reliably sentence boundaries
506
+ SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"""
507
+ # single quotes are word boundaries if not intra-word, but double quotes are sentence
508
+ # boundaries
508
509
 
509
510
  INTRA_WORD_PUNCT = """-'’"""
510
511
 
@@ -49,7 +49,8 @@ def test_Backticks(s: str):
49
49
  assert res == "", (repr(s), repr(res))
50
50
 
51
51
 
52
- @given(st.from_regex(Codeblock.pattern, fullmatch=True))
52
+ @given(st.from_regex(r"```(?:(?!`).+?)```", fullmatch=True))
53
+ @example("""```0```""")
53
54
  @example(
54
55
  """```
55
56
  ```"""
@@ -56,7 +56,8 @@
56
56
  input: 'ona li toki e ni: "mama sina"'
57
57
  output:
58
58
  - "ona li toki e ni:"
59
- - '"mama sina"'
59
+ - '"'
60
+ - 'mama sina"'
60
61
  - name: "discovered case 1"
61
62
  input: "ona li ken lukin e sitelen [_ike_nanpa_lete_ike]. ni li pona kin."
62
63
  output:
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes