sonatoki 0.5.1__tar.gz → 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {sonatoki-0.5.1 → sonatoki-0.5.2}/PKG-INFO +17 -16
  2. {sonatoki-0.5.1 → sonatoki-0.5.2}/README.md +16 -15
  3. {sonatoki-0.5.1 → sonatoki-0.5.2}/pyproject.toml +1 -1
  4. {sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/Configs.py +22 -1
  5. {sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/Preprocessors.py +6 -0
  6. {sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/constants.py +6 -3
  7. sonatoki-0.5.2/src/sonatoki/linku.json +1 -0
  8. sonatoki-0.5.2/src/sonatoki/sandbox.json +1 -0
  9. {sonatoki-0.5.1 → sonatoki-0.5.2}/tests/test_filters.py +23 -0
  10. {sonatoki-0.5.1 → sonatoki-0.5.2}/tests/test_ilo.py +15 -8
  11. {sonatoki-0.5.1 → sonatoki-0.5.2}/tests/test_preprocessors.py +10 -0
  12. {sonatoki-0.5.1 → sonatoki-0.5.2}/tests/test_properties.py +1 -1
  13. sonatoki-0.5.1/src/sonatoki/linku.json +0 -1
  14. sonatoki-0.5.1/src/sonatoki/sandbox.json +0 -1
  15. {sonatoki-0.5.1 → sonatoki-0.5.2}/LICENSE +0 -0
  16. {sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/Cleaners.py +0 -0
  17. {sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/Filters.py +0 -0
  18. {sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/Scorers.py +0 -0
  19. {sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/Tokenizers.py +0 -0
  20. {sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/__init__.py +0 -0
  21. {sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/__main__.py +0 -0
  22. {sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/alphabetic.txt +0 -0
  23. {sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/ilo.py +0 -0
  24. {sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/py.typed +0 -0
  25. {sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/syllabic.txt +0 -0
  26. {sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/utils.py +0 -0
  27. {sonatoki-0.5.1 → sonatoki-0.5.2}/tests/__init__.py +0 -0
  28. {sonatoki-0.5.1 → sonatoki-0.5.2}/tests/test_cleaners.py +0 -0
  29. {sonatoki-0.5.1 → sonatoki-0.5.2}/tests/test_scorers.py +0 -0
  30. {sonatoki-0.5.1 → sonatoki-0.5.2}/tests/test_tokenize.py +0 -0
  31. {sonatoki-0.5.1 → sonatoki-0.5.2}/tests/test_utils.py +0 -0
  32. {sonatoki-0.5.1 → sonatoki-0.5.2}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
  33. {sonatoki-0.5.1 → sonatoki-0.5.2}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.5.1 → sonatoki-0.5.2}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sonatoki
- Version: 0.5.1
+ Version: 0.5.2
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
  License: AGPL-3.0-or-later

@@ -26,9 +26,9 @@ This library, "Language Knowledge," helps you identify whether a message is in T
  I wrote this library with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool now uses this library to great success!

- If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language. This complexity applies to Toki Pona too.
+ If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, topic, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language. This complexity applies to Toki Pona too.

- So, this project "solves" that complex problem by offering an opinionated tokenizer and a configurable parser, allowing you to tune its output to your preferences and goals. [Even silly ones.](https://sona.pona.la/wiki/isipin_epiku).
+ So, this project "solves" that complex problem by offering an opinionated tokenizer and a configurable parser, allowing you to tune its output to your preferences and goals. [Even silly ones.](https://sona.pona.la/wiki/isipin_epiku)

  ## Quick Start

@@ -61,12 +61,12 @@ Or if you'd prefer to configure on your own:
  from copy import deepcopy
  from sonatoki.ilo import Ilo
  from sonatoki.Configs import BaseConfig
- from sonatoki.Filters import NimiLinkuCore, Phonotactic, ProperName
+ from sonatoki.Filters import NimiLinkuCore, NimiLinkuCommon, Phonotactic, ProperName, Or
  from sonatoki.Scorers import SoftPassFail

  def main():
      config = deepcopy(BaseConfig)
-     config["scoring_filters"].extend([NimiLinkuCore, Phonotactic, ProperName])
+     config["scoring_filters"].extend([Or(NimiLinkuCore, NimiLinkuCommon), Phonotactic, ProperName])
      config["scorer"] = SoftPassFail

      ilo = Ilo(**config)

@@ -78,7 +78,7 @@ if __name__ == "__main__":
      main()
  ```

- `Ilo` is highly configurable by necessity, so I recommend looking through the premade configs in `Configs` as well as the individual `Preprocessors`, `Filters`, and `Scorers`. The `Cleaners` module only contains one cleaner, which I recommend always using. Similarly, the `Tokenizers` module contains several other word tokenizers, but their performance will be worse than the dedicated Toki Pona tokenizer `WordTokenizerTok`.
+ `Ilo` is highly configurable by necessity, so I recommend looking through the premade configs in `Configs` as well as the individual `Preprocessors`, `Filters`, and `Scorers`. In `Cleaners`, all you need is `ConsecutiveDuplicates`. In `Tokenizers`, the preferred tokenizers `WordTokenizer` and `SentTokenizer` are already the default in `Ilo`.

  ## Development

@@ -92,25 +92,26 @@ if __name__ == "__main__":
  The intent is to show our methodology to the Unicode Consortium, particularly to the Script Encoding Working Group (previously the Script Ad Hoc Group). As far as we're aware, zero members of the committee know Toki Pona, which unfortunately means we fall back on English.

- After our proposal has been examined and a result given by the committee, I will translate this file and library into Toki Pona, with a note left behind for those who do not understand it.
+ I originally intended to translate this file and library into Toki Pona once Unicode had reviewed our proposal, but this library has picked up some interest outside of the Toki Pona community, so this library and README will remain accessible to them.

  ### What's the deal with the tokenizers?

- The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` has the goal of tokenizing statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
- This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them.
- But Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
+ The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` attempts to tokenize statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
+ This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them such as `'` or `-`.
+ Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.

  The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.

  ### Aren't there a lot of false positives?

- Yes, depending on the filter you choose and how you apply it.
- It's up to you to use this tool responsibly on input you've done your best to clean, such as by using stronger filters before weaker ones.
- For now though, here's a list of relevant false positives:
+ For any individual filter, yes. Here are some examples:

- - `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially increasing scores.
- - `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet. For example, "I'm well" would match as _three_ words: "i", "m", "well".
- - `NimiPu` and other sets containing `a`, `mute`, `open`, and others will unavoidably match those words in English text too.
+ - `ProperName` will errantly match text in languages without a capital/lowercase distinction
+ - `Alphabetic` matches words so long as they are only made of letters in Toki Pona's alphabet, which is 14 letters of the Latin alphabet.
+ - `Syllabic` and `Phonetic`, despite imposing more structure than `Alphabetic`, will match a surprising amount of English words. For example, every word in "an awesome joke!" matches.
+ - `NimiPu` and `NimiLinkuCore` will match `a`, `mute`, `open` regardless of the surrounding language.
+
+ This is point of `Ilo` and the `Scorers`: None of these filters would _individually_ be able to correctly identify a Toki Pona statement, but all of them working together with some tuning are able to achieve a surprisingly high accuracy.

  ### Don't some of the cleaners/filters conflict?
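Note on the false-positive discussion above: the filter and scorer names it mentions (`Alphabetic`, `NimiLinkuCore`, the `Scorers`, `passing_score`) are real, but sonatoki's internals are not part of this diff. The snippet below is only a stand-in sketch of the idea that individually weak filters have to be combined and tuned by a scorer; every function and constant in it is hypothetical, not the library's code.

```python
# Self-contained sketch: individually weak filters + a toy scorer.
from typing import Callable, List

TP_ALPHABET = set("aeijklmnopstuw")  # Toki Pona's 14 Latin letters
TP_WORDS = {"toki", "pona", "mi", "li", "a", "mute", "open"}  # tiny stand-in word set

def alphabetic(token: str) -> bool:
    """Stand-in for the idea behind `Alphabetic`: only Toki Pona's letters."""
    return bool(token) and set(token.lower()) <= TP_ALPHABET

def known_word(token: str) -> bool:
    """Stand-in for a word-set filter in the spirit of `NimiLinkuCore`."""
    return token.lower() in TP_WORDS

FILTERS: List[Callable[[str], bool]] = [known_word, alphabetic]

def naive_score(tokens: List[str]) -> float:
    """Toy scorer: fraction of tokens accepted by at least one filter."""
    return sum(any(f(t) for f in FILTERS) for t in tokens) / max(len(tokens), 1)

print(naive_score(["toki", "pona", "li", "pona"]))  # 1.0 - genuine Toki Pona
print(naive_score(["an", "awesome", "joke"]))       # also 1.0 - every word is "alphabetic"
```

A naive per-token count rates "an awesome joke" as highly as genuine Toki Pona, which is exactly the failure mode that the configurable `Scorers` and a tuned `passing_score` exist to avoid.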
 
{sonatoki-0.5.1 → sonatoki-0.5.2}/README.md

@@ -13,9 +13,9 @@ This library, "Language Knowledge," helps you identify whether a message is in T
  I wrote this library with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool now uses this library to great success!

- If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language. This complexity applies to Toki Pona too.
+ If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, topic, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language. This complexity applies to Toki Pona too.

- So, this project "solves" that complex problem by offering an opinionated tokenizer and a configurable parser, allowing you to tune its output to your preferences and goals. [Even silly ones.](https://sona.pona.la/wiki/isipin_epiku).
+ So, this project "solves" that complex problem by offering an opinionated tokenizer and a configurable parser, allowing you to tune its output to your preferences and goals. [Even silly ones.](https://sona.pona.la/wiki/isipin_epiku)

  ## Quick Start

@@ -48,12 +48,12 @@ Or if you'd prefer to configure on your own:
  from copy import deepcopy
  from sonatoki.ilo import Ilo
  from sonatoki.Configs import BaseConfig
- from sonatoki.Filters import NimiLinkuCore, Phonotactic, ProperName
+ from sonatoki.Filters import NimiLinkuCore, NimiLinkuCommon, Phonotactic, ProperName, Or
  from sonatoki.Scorers import SoftPassFail

  def main():
      config = deepcopy(BaseConfig)
-     config["scoring_filters"].extend([NimiLinkuCore, Phonotactic, ProperName])
+     config["scoring_filters"].extend([Or(NimiLinkuCore, NimiLinkuCommon), Phonotactic, ProperName])
      config["scorer"] = SoftPassFail

      ilo = Ilo(**config)

@@ -65,7 +65,7 @@ if __name__ == "__main__":
      main()
  ```

- `Ilo` is highly configurable by necessity, so I recommend looking through the premade configs in `Configs` as well as the individual `Preprocessors`, `Filters`, and `Scorers`. The `Cleaners` module only contains one cleaner, which I recommend always using. Similarly, the `Tokenizers` module contains several other word tokenizers, but their performance will be worse than the dedicated Toki Pona tokenizer `WordTokenizerTok`.
+ `Ilo` is highly configurable by necessity, so I recommend looking through the premade configs in `Configs` as well as the individual `Preprocessors`, `Filters`, and `Scorers`. In `Cleaners`, all you need is `ConsecutiveDuplicates`. In `Tokenizers`, the preferred tokenizers `WordTokenizer` and `SentTokenizer` are already the default in `Ilo`.

  ## Development

@@ -79,25 +79,26 @@ if __name__ == "__main__":
  The intent is to show our methodology to the Unicode Consortium, particularly to the Script Encoding Working Group (previously the Script Ad Hoc Group). As far as we're aware, zero members of the committee know Toki Pona, which unfortunately means we fall back on English.

- After our proposal has been examined and a result given by the committee, I will translate this file and library into Toki Pona, with a note left behind for those who do not understand it.
+ I originally intended to translate this file and library into Toki Pona once Unicode had reviewed our proposal, but this library has picked up some interest outside of the Toki Pona community, so this library and README will remain accessible to them.

  ### What's the deal with the tokenizers?

- The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` has the goal of tokenizing statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
- This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them.
- But Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
+ The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` attempts to tokenize statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
+ This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them such as `'` or `-`.
+ Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.

  The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.

  ### Aren't there a lot of false positives?

- Yes, depending on the filter you choose and how you apply it.
- It's up to you to use this tool responsibly on input you've done your best to clean, such as by using stronger filters before weaker ones.
- For now though, here's a list of relevant false positives:
+ For any individual filter, yes. Here are some examples:

- - `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially increasing scores.
- - `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet. For example, "I'm well" would match as _three_ words: "i", "m", "well".
- - `NimiPu` and other sets containing `a`, `mute`, `open`, and others will unavoidably match those words in English text too.
+ - `ProperName` will errantly match text in languages without a capital/lowercase distinction
+ - `Alphabetic` matches words so long as they are only made of letters in Toki Pona's alphabet, which is 14 letters of the Latin alphabet.
+ - `Syllabic` and `Phonetic`, despite imposing more structure than `Alphabetic`, will match a surprising amount of English words. For example, every word in "an awesome joke!" matches.
+ - `NimiPu` and `NimiLinkuCore` will match `a`, `mute`, `open` regardless of the surrounding language.
+
+ This is point of `Ilo` and the `Scorers`: None of these filters would _individually_ be able to correctly identify a Toki Pona statement, but all of them working together with some tuning are able to achieve a surprisingly high accuracy.

  ### Don't some of the cleaners/filters conflict?

{sonatoki-0.5.1 → sonatoki-0.5.2}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "sonatoki"
- version = "0.5.1"
+ version = "0.5.2"
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
  authors = [
      { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
{sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/Configs.py

@@ -1,6 +1,6 @@
  # STL
  from copy import deepcopy
- from typing import List, Type, TypedDict
+ from typing import Set, List, Type, TypedDict, cast

  # PDM
  from typing_extensions import NotRequired

@@ -18,6 +18,7 @@ from sonatoki.Filters import (
      NimiKuLili,
      NimiKuSuli,
      ProperName,
+     Phonotactic,
      Punctuation,
      LongSyllabic,
      Miscellaneous,

@@ -102,6 +103,26 @@ CorpusConfig: IloConfig = {
      "scorer": SoftScaling,
      "passing_score": 0.8,
  }
+
+ # TODO: create a mechanism to omit tokens from a filter with more granularity
+ __corpus_tokens_dict: Set[str] = cast(
+     Set[str],
+     CorpusConfig["scoring_filters"][
+         0
+     ].tokens,  # pyright: ignore[reportAttributeAccessIssue]
+ )
+ __corpus_tokens_dict -= {
+     "an",
+     "i",
+     "me",
+     "ne",
+     "se",
+     "take",
+     "ten",
+     "to",
+     "u",
+     "we",
+ }
  """Mimics the previous implementation of ilo pi toki pona taso."""
  LazyConfig: IloConfig = {
      "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
{sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/Preprocessors.py

@@ -90,6 +90,12 @@ class DiscordEmotes(RegexPreprocessor):
      pattern = re.compile(r"<a?:[a-zA-Z0-9_]{2,}:[0-9]{2,}>")


+ class ColonEmotes(RegexPreprocessor):
+     """Remove colon-marked emotes `:name:`"""
+
+     pattern = re.compile(r":[a-zA-Z0-9_]{2,}:")
+
+
  class DiscordMentions(RegexPreprocessor):
      pattern = re.compile(r"<@[\!\&]?[0-9]{2,}>")
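`ColonEmotes` inherits from `RegexPreprocessor` like `DiscordEmotes`, so its pattern is presumably stripped from the input by substitution. A quick sanity check of the pattern itself with plain `re`, independent of sonatoki's classes:

```python
import re

# Pattern copied verbatim from the ColonEmotes hunk above.
COLON_EMOTE = re.compile(r":[a-zA-Z0-9_]{2,}:")

text = "toki a :wave: :tada: mi kama!"
# Substituting matches away shows what text would remain for the
# tokenizer and filters to examine.
print(COLON_EMOTE.sub("", text))  # -> "toki a   mi kama!"
```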
 
{sonatoki-0.5.1 → sonatoki-0.5.2}/src/sonatoki/constants.py

@@ -553,7 +553,7 @@ FALSE_POS_SYLLABIC = {
      "in",
      "no",
      "some",
-     # "papa",
+     # "papa",  # now in sandbox
      "on",
      "me",
      "ipa",

@@ -591,7 +591,7 @@ FALSE_POS_SYLLABIC = {
      "oposite",
      "anime",
      "potato",
-     # "japan",
+     "japan",
      "nose",
      "kilo",
      "alone",

@@ -629,17 +629,20 @@ FALSE_POS_SYLLABIC = {
      "awaken",
      "eliminate",
      "elite",
-     "misuse",
      "emanate",
      "iluminate",
      "imense",
      "imitate",
+     "injoke",
      "insane",
      "insolate",
      "insulate",
      "intense",
      "lemon",
      "manipulate",
+     "misuse",
+     "ne",  # "no" in many other languages
+     "wana",
  }

  FALSE_POS_ALPHABETIC: Set[str] = {
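For context on the new `FALSE_POS_SYLLABIC` entries: these are English (or other-language) words that happen to fit Toki Pona's (C)V(n) syllable shape, so a purely syllabic check accepts them. A rough demonstration using an approximate syllable regex; this is my approximation, not sonatoki's `Syllabic` filter, which presumably also enforces rules such as the forbidden sequences "ji", "ti", "wu", and "wo".

```python
import re

# Rough approximation of Toki Pona's (C)V(n) syllable shape, enough to show
# why the newly added entries slip through a syllable-structure check.
ROUGH_SYLLABLES = re.compile(r"(?:[jklmnpstw]?[aeiou]n?)+", re.IGNORECASE)

for word in ["japan", "ne", "wana", "injoke", "misuse", "hello"]:
    print(f"{word}: {bool(ROUGH_SYLLABLES.fullmatch(word))}")
# "japan" (ja-pan), "ne", "wana" (wa-na), "injoke" (in-jo-ke), and "misuse"
# (mi-su-se) all parse as Toki-Pona-like syllables; "hello" does not ("h", "ll").
```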