sonatoki 0.3.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {sonatoki-0.3.0 → sonatoki-0.3.2}/PKG-INFO +28 -17
  2. {sonatoki-0.3.0 → sonatoki-0.3.2}/README.md +27 -16
  3. {sonatoki-0.3.0 → sonatoki-0.3.2}/pyproject.toml +1 -3
  4. {sonatoki-0.3.0 → sonatoki-0.3.2}/src/sonatoki/Configs.py +41 -30
  5. {sonatoki-0.3.0 → sonatoki-0.3.2}/src/sonatoki/Filters.py +121 -24
  6. {sonatoki-0.3.0 → sonatoki-0.3.2}/src/sonatoki/constants.py +74 -38
  7. {sonatoki-0.3.0 → sonatoki-0.3.2}/src/sonatoki/utils.py +14 -1
  8. sonatoki-0.3.2/tests/test_filters.py +242 -0
  9. {sonatoki-0.3.0 → sonatoki-0.3.2}/tests/test_ilo.py +36 -11
  10. sonatoki-0.3.2/tests/test_properties.py +78 -0
  11. {sonatoki-0.3.0 → sonatoki-0.3.2}/tests/test_scorers.py +4 -2
  12. {sonatoki-0.3.0 → sonatoki-0.3.2}/tests/test_utils.py +2 -2
  13. sonatoki-0.3.0/tests/test_filters.py +0 -127
  14. {sonatoki-0.3.0 → sonatoki-0.3.2}/LICENSE +0 -0
  15. {sonatoki-0.3.0 → sonatoki-0.3.2}/src/sonatoki/Cleaners.py +0 -0
  16. {sonatoki-0.3.0 → sonatoki-0.3.2}/src/sonatoki/Preprocessors.py +0 -0
  17. {sonatoki-0.3.0 → sonatoki-0.3.2}/src/sonatoki/Scorers.py +0 -0
  18. {sonatoki-0.3.0 → sonatoki-0.3.2}/src/sonatoki/Tokenizers.py +0 -0
  19. {sonatoki-0.3.0 → sonatoki-0.3.2}/src/sonatoki/__init__.py +0 -0
  20. {sonatoki-0.3.0 → sonatoki-0.3.2}/src/sonatoki/__main__.py +0 -0
  21. {sonatoki-0.3.0 → sonatoki-0.3.2}/src/sonatoki/ilo.py +0 -0
  22. {sonatoki-0.3.0 → sonatoki-0.3.2}/src/sonatoki/linku.json +0 -0
  23. {sonatoki-0.3.0 → sonatoki-0.3.2}/src/sonatoki/py.typed +0 -0
  24. {sonatoki-0.3.0 → sonatoki-0.3.2}/src/sonatoki/sandbox.json +0 -0
  25. {sonatoki-0.3.0 → sonatoki-0.3.2}/tests/__init__.py +0 -0
  26. {sonatoki-0.3.0 → sonatoki-0.3.2}/tests/test_cleaners.py +0 -0
  27. {sonatoki-0.3.0 → sonatoki-0.3.2}/tests/test_preprocessors.py +0 -0
  28. {sonatoki-0.3.0 → sonatoki-0.3.2}/tests/test_tokenize.py +0 -0
  29. {sonatoki-0.3.0 → sonatoki-0.3.2}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
  30. {sonatoki-0.3.0 → sonatoki-0.3.2}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.3.0 → sonatoki-0.3.2}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sonatoki
- Version: 0.3.0
+ Version: 0.3.2
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
  License: AGPL-3.0-or-later
@@ -12,15 +12,22 @@ Description-Content-Type: text/markdown
 
  # sona toki
 
+ <div align="center">
+
+ ![Test workflow for this library](https://github.com/gregdan3/sona-toki/workflows/Tests/badge.svg)
+ [![Version number for this library](https://img.shields.io/pypi/v/sonatoki?logo=python&logoColor=%23cccccc)](https://pypi.org/project/sonatoki)
+
+ </div>
+
  ## What is **sona toki**?
 
- This library, "Language Knowledge," helps you identify whether a message is in Toki Pona. No grammar checking, yet, which means this more checks whether a given message has enough Toki Pona words.
+ This library, "Language Knowledge," helps you identify whether a message is in Toki Pona. It does so by determining whether a large enough number of words in a statement are "in Toki Pona". No grammar checking, yet.
 
- I wrote it with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool will be rewritten to use this library shortly.
+ I wrote this library with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool now uses this library to great success!
 
- If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language, and this question applies to Toki Pona too.
+ If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language. This complexity applies to Toki Pona too.
 
- This project "solves" that complex problem by offering a highly configurable parser, so you can tune it to your preferences and goals.
+ So, this project "solves" that complex problem by offering an opinionated tokenizer and a configurable parser, allowing you to tune its output to your preferences and goals. [Even silly ones.](https://sona.pona.la/wiki/isipin_epiku).
 
  ## Quick Start
 
@@ -53,12 +60,12 @@ Or if you'd prefer to configure on your own:
  from copy import deepcopy
  from sonatoki.ilo import Ilo
  from sonatoki.Configs import BaseConfig
- from sonatoki.Filters import NimiPuAle, Phonotactic, ProperName
+ from sonatoki.Filters import NimiLinkuCore, Phonotactic, ProperName
  from sonatoki.Scorers import SoftPassFail
 
  def main():
      config = deepcopy(BaseConfig)
-     config["scoring_filters"].extend([NimiPuAle, Phonotactic, ProperName])
+     config["scoring_filters"].extend([NimiLinkuCore, Phonotactic, ProperName])
      config["scorer"] = SoftPassFail
 
      ilo = Ilo(**config)
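
For quick reference, the Quick Start snippet reads as follows once the rename is applied. This is only the "+" side of the hunk assembled into one block; everything after the `Ilo(**config)` line is outside the hunk and not shown here.

```python
from copy import deepcopy
from sonatoki.ilo import Ilo
from sonatoki.Configs import BaseConfig
from sonatoki.Filters import NimiLinkuCore, Phonotactic, ProperName
from sonatoki.Scorers import SoftPassFail

def main():
    config = deepcopy(BaseConfig)
    # NimiLinkuCore takes over for NimiPuAle, which no longer exists in 0.3.2
    config["scoring_filters"].extend([NimiLinkuCore, Phonotactic, ProperName])
    config["scorer"] = SoftPassFail

    ilo = Ilo(**config)
```
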
@@ -88,24 +95,28 @@ After our proposal has been examined and a result given by the committee, I will
 
  ### What's the deal with the tokenizers?
 
- The Toki Pona tokenizer `word_tokenize_tok` is very specific in always separating writing characters from punctuation, and leaving contiguous punctuation as contiguous- this is a level of precision that NLTK's English tokenizer does not want for several reasons, such as that English words can have "punctuation" characters in them.
-
- Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet, so a more aggressive tokenizer is highly desirable.
+ The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` has the goal of tokenizing statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
+ This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them.
+ But Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
 
- The other tokenizers are provided as a comparison case more than anything. I do not recommend their use.
+ The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.
 
  ### Aren't there a lot of false positives?
 
- Yes. It's up to you to use this tool responsibly on input you've done your best to clean, and better, use stronger filters before weaker ones. For now though, here's a list of relevant false positives:
+ Yes, depending on the filter you choose and how you apply it.
+ It's up to you to use this tool responsibly on input you've done your best to clean, such as by using stronger filters before weaker ones.
+ For now though, here's a list of relevant false positives:
 
- - `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially inflating the scores.
- - `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet.
+ - `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially increasing scores.
+ - `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet. For example, "I'm well" would match as _three_ words: "i", "m", "well".
+ - `NimiPu` and other sets containing `a`, `mute`, `open`, and others will unavoidably match those words in English text too.
 
  ### Don't some of the cleaners/filters conflict?
 
- Yes. Some do so
+ Yes, though not terribly much.
 
  - `ConsecutiveDuplicates` may errantly change a word's validity. For example, "manna" is phonotactically invalid in Toki Pona, but would become "mana" which is valid.
- - `ConsecutiveDuplicates` will not work correctly with syllabaries (alphabets, but representing a pair of consonant and vowel).
+ - `ConsecutiveDuplicates` will not work correctly with syllabaries, though this should not change the validity of the analyzed word unless you attempt to dictionary match these words.
+ - If you build your own `MemberFilter` with words that have capital letters or consecutive duplicates, they will never match unless you use `prep_dictionary`.
 
- You'll notice a _lot_ of these are troubles regarding the application of latin alphabet filters to non-latin text. Working on it!
+ You'll notice these are mostly casued by applying latin alphabet filters to non-latin text. Working on it!
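
The last bullet above introduces `prep_dictionary` without showing it in use. Below is a minimal sketch of a custom `MemberFilter`, under the assumptions that `prep_dictionary` accepts any iterable of strings (this diff only shows it being called on the constant word lists in Filters.py) and that `MemberFilter` is importable from `sonatoki.Filters` even though it does not appear in the visible part of `__all__`. `MY_WORDS` and `NimiMi` are hypothetical names.

```python
from sonatoki.utils import prep_dictionary
from sonatoki.Filters import MemberFilter

# Hypothetical custom word list: per the FAQ above, entries with capital
# letters or doubled letters would never match incoming tokens unless they
# are normalized by prep_dictionary first.
MY_WORDS = {"Mamma", "wawaa", "kijetesantakalu"}


class NimiMi(MemberFilter):  # hypothetical filter name
    tokens = prep_dictionary(MY_WORDS)
```
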
{sonatoki-0.3.0 → sonatoki-0.3.2}/README.md

@@ -1,14 +1,21 @@
  # sona toki
 
+ <div align="center">
+
+ ![Test workflow for this library](https://github.com/gregdan3/sona-toki/workflows/Tests/badge.svg)
+ [![Version number for this library](https://img.shields.io/pypi/v/sonatoki?logo=python&logoColor=%23cccccc)](https://pypi.org/project/sonatoki)
+
+ </div>
+
  ## What is **sona toki**?
 
- This library, "Language Knowledge," helps you identify whether a message is in Toki Pona. No grammar checking, yet, which means this more checks whether a given message has enough Toki Pona words.
+ This library, "Language Knowledge," helps you identify whether a message is in Toki Pona. It does so by determining whether a large enough number of words in a statement are "in Toki Pona". No grammar checking, yet.
 
- I wrote it with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool will be rewritten to use this library shortly.
+ I wrote this library with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool now uses this library to great success!
 
- If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language, and this question applies to Toki Pona too.
+ If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language. This complexity applies to Toki Pona too.
 
- This project "solves" that complex problem by offering a highly configurable parser, so you can tune it to your preferences and goals.
+ So, this project "solves" that complex problem by offering an opinionated tokenizer and a configurable parser, allowing you to tune its output to your preferences and goals. [Even silly ones.](https://sona.pona.la/wiki/isipin_epiku).
 
  ## Quick Start
 
@@ -41,12 +48,12 @@ Or if you'd prefer to configure on your own:
  from copy import deepcopy
  from sonatoki.ilo import Ilo
  from sonatoki.Configs import BaseConfig
- from sonatoki.Filters import NimiPuAle, Phonotactic, ProperName
+ from sonatoki.Filters import NimiLinkuCore, Phonotactic, ProperName
  from sonatoki.Scorers import SoftPassFail
 
  def main():
      config = deepcopy(BaseConfig)
-     config["scoring_filters"].extend([NimiPuAle, Phonotactic, ProperName])
+     config["scoring_filters"].extend([NimiLinkuCore, Phonotactic, ProperName])
      config["scorer"] = SoftPassFail
 
      ilo = Ilo(**config)
@@ -76,24 +83,28 @@ After our proposal has been examined and a result given by the committee, I will
 
  ### What's the deal with the tokenizers?
 
- The Toki Pona tokenizer `word_tokenize_tok` is very specific in always separating writing characters from punctuation, and leaving contiguous punctuation as contiguous- this is a level of precision that NLTK's English tokenizer does not want for several reasons, such as that English words can have "punctuation" characters in them.
-
- Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet, so a more aggressive tokenizer is highly desirable.
+ The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` has the goal of tokenizing statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
+ This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them.
+ But Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
 
- The other tokenizers are provided as a comparison case more than anything. I do not recommend their use.
+ The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.
 
  ### Aren't there a lot of false positives?
 
- Yes. It's up to you to use this tool responsibly on input you've done your best to clean, and better, use stronger filters before weaker ones. For now though, here's a list of relevant false positives:
+ Yes, depending on the filter you choose and how you apply it.
+ It's up to you to use this tool responsibly on input you've done your best to clean, such as by using stronger filters before weaker ones.
+ For now though, here's a list of relevant false positives:
 
- - `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially inflating the scores.
- - `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet.
+ - `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially increasing scores.
+ - `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet. For example, "I'm well" would match as _three_ words: "i", "m", "well".
+ - `NimiPu` and other sets containing `a`, `mute`, `open`, and others will unavoidably match those words in English text too.
 
  ### Don't some of the cleaners/filters conflict?
 
- Yes. Some do so
+ Yes, though not terribly much.
 
  - `ConsecutiveDuplicates` may errantly change a word's validity. For example, "manna" is phonotactically invalid in Toki Pona, but would become "mana" which is valid.
- - `ConsecutiveDuplicates` will not work correctly with syllabaries (alphabets, but representing a pair of consonant and vowel).
+ - `ConsecutiveDuplicates` will not work correctly with syllabaries, though this should not change the validity of the analyzed word unless you attempt to dictionary match these words.
+ - If you build your own `MemberFilter` with words that have capital letters or consecutive duplicates, they will never match unless you use `prep_dictionary`.
 
- You'll notice a _lot_ of these are troubles regarding the application of latin alphabet filters to non-latin text. Working on it!
+ You'll notice these are mostly casued by applying latin alphabet filters to non-latin text. Working on it!
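
The `Alphabetic` bullet above is easiest to see with the fragments it names. Below is a small sketch comparing it against the new `LongAlphabetic` added in the Filters.py changes further down; the expected results assume the classmethod `filter(token)` interface those classes expose and `LongAlphabetic`'s minimum length of 3.

```python
from sonatoki.Filters import Alphabetic, LongAlphabetic

# "I'm well" tokenizes into "i", "m", "well"; every fragment uses only letters
# of Toki Pona's alphabet, so plain Alphabetic accepts all three.
for token in ("i", "m", "well"):
    print(token, Alphabetic.filter(token), LongAlphabetic.filter(token))
# Expected: LongAlphabetic rejects "i" and "m" for being shorter than three
# characters, but still accepts "well".
```
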
{sonatoki-0.3.0 → sonatoki-0.3.2}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "sonatoki"
- version = "0.3.0"
+ version = "0.3.2"
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
  authors = [
      { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
@@ -16,8 +16,6 @@ readme = "README.md"
  [project.license]
  text = "AGPL-3.0-or-later"
 
- [project.optional-dependencies]
-
  [build-system]
  requires = [
      "pdm-backend",
{sonatoki-0.3.0 → sonatoki-0.3.2}/src/sonatoki/Configs.py

@@ -1,36 +1,36 @@
  # STL
  from copy import deepcopy
- from typing import List, Type, Union, TypedDict
+ from typing import List, Type, TypedDict
 
  # LOCAL
  from sonatoki.Filters import (
      Filter,
-     NimiPu,
      Numeric,
-     OrFilter,
      Syllabic,
-     NimiLinku,
-     NimiPuAle,
      NimiUCSUR,
      Alphabetic,
      ProperName,
-     Phonotactic,
      Punctuation,
-     NimiLinkuAle,
+     LongSyllabic,
+     Miscellaneous,
+     NimiLinkuCore,
+     LongAlphabetic,
+     LongProperName,
+     OrMemberFilter,
+     NimiLinkuCommon,
+     NimiLinkuObscure,
      NimiLinkuSandbox,
      EnglishIgnorables,
+     NimiLinkuUncommon,
  )
  from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
  from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
  from sonatoki.Tokenizers import Tokenizer, WordTokenizer
  from sonatoki.Preprocessors import (
      URLs,
+     Backticks,
      Reference,
      Preprocessor,
-     DiscordEmotes,
-     DiscordSpecial,
-     DiscordChannels,
-     DiscordMentions,
      AngleBracketObject,
  )
 
@@ -59,14 +59,14 @@ BaseConfig: IloConfig = {
 
 
  PrefConfig: IloConfig = {
-     "preprocessors": [URLs, Reference],
+     "preprocessors": [Backticks, URLs, Reference],
      "cleaners": [ConsecutiveDuplicates],
-     "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+     "ignoring_filters": [Numeric, Punctuation],
      "scoring_filters": [
-         OrFilter(NimiLinku, NimiUCSUR),
-         Syllabic,
-         ProperName,
-         Alphabetic,
+         OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+         LongSyllabic,
+         LongProperName,
+         LongAlphabetic,
      ],
      "scorer": SoftScaling,
      "passing_score": 0.8,
@@ -74,14 +74,22 @@ PrefConfig: IloConfig = {
  }
 
  CorpusConfig: IloConfig = {
-     "preprocessors": [URLs, AngleBracketObject, Reference],
+     "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
      "cleaners": [ConsecutiveDuplicates],
-     "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+     "ignoring_filters": [Numeric, Punctuation],
      "scoring_filters": [
-         OrFilter(NimiLinkuSandbox, NimiUCSUR),
-         Syllabic,
-         ProperName,
-         Alphabetic,
+         OrMemberFilter(
+             NimiLinkuCore,
+             NimiLinkuCommon,
+             NimiLinkuUncommon,
+             NimiLinkuObscure,
+             NimiLinkuSandbox,
+             NimiUCSUR,
+             Miscellaneous,
+         ),
+         LongSyllabic,
+         LongProperName,
+         LongAlphabetic,
      ],
      "scorer": SoftScaling,
      "passing_score": 0.8,
@@ -89,25 +97,28 @@ CorpusConfig: IloConfig = {
  }
 
 
+ """
+ Mimics the previous implementation of ilo pi toki pona taso
+ """
  LazyConfig: IloConfig = {
-     "preprocessors": [URLs],
+     "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
      "cleaners": [ConsecutiveDuplicates],
      "ignoring_filters": [Numeric, Punctuation],
-     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName],
+     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
      "scorer": SoftPassFail,
      "passing_score": 0.8,
      "word_tokenizer": WordTokenizer,
  }
 
 
  DiscordConfig: IloConfig = {
-     "preprocessors": [URLs, AngleBracketObject, Reference],
+     "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
      "cleaners": [ConsecutiveDuplicates],
      "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
      "scoring_filters": [
-         OrFilter(NimiLinku, NimiUCSUR),
-         Syllabic,
-         ProperName,
-         Alphabetic,
+         OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
+         LongSyllabic,
+         LongProperName,
+         LongAlphabetic,
      ],
      "scorer": SoftScaling,
      "passing_score": 0.8,
{sonatoki-0.3.0 → sonatoki-0.3.2}/src/sonatoki/Filters.py

@@ -9,6 +9,7 @@ import regex
  from typing_extensions import override
 
  # LOCAL
+ from sonatoki.utils import prep_dictionary
  from sonatoki.constants import (
      VOWELS,
      NIMI_PU,
@@ -17,13 +18,17 @@ from sonatoki.constants import (
      ALLOWABLES,
      CONSONANTS,
      IGNORABLES,
-     NIMI_LINKU,
      NIMI_UCSUR,
-     NIMI_LINKU_LILI,
+     NIMI_KU_LILI,
+     NIMI_KU_SULI,
+     NIMI_LINKU_CORE,
      ALL_PUNCT_RANGES,
      NIMI_PU_SYNONYMS,
+     NIMI_LINKU_COMMON,
+     NIMI_LINKU_OBSCURE,
      NIMI_LINKU_SANDBOX,
      UCSUR_PUNCT_RANGES,
+     NIMI_LINKU_UNCOMMON,
  )
 
  regex.DEFAULT_VERSION = regex.VERSION1
@@ -37,6 +42,33 @@ class Filter(ABC):
          raise NotImplementedError
 
 
+ class MinLen(Filter):
+     """
+     Meta filter meant to be inherited by another filter to add a length requirement.
+     Multiple-inherit with `MinLen` as the first argument so `super()` resolves correctly.
+     You may also construct any other filter with a minimum length filter like so:
+
+     ```
+     MinLen(Alphabetic, 3)
+     ```
+     """
+
+     length = 0
+
+     @classmethod
+     @cache(maxsize=None)
+     def filter(cls, token: str) -> bool:
+         if len(token) < cls.length:
+             return False
+         return super().filter(token)
+
+     def __new__(cls, filter: Type[Filter], length_: int) -> Type[Filter]:
+         class MinLenFilter(MinLen, Filter):
+             length = length_
+
+         return MinLenFilter
+
+
  class RegexFilter(Filter):
      pattern: "re.Pattern[str]"
 
@@ -78,11 +110,16 @@ class SubsetFilter(Filter):
 
 
  class Miscellaneous(MemberFilter):
-     tokens = set(ALLOWABLES)
+     tokens = prep_dictionary(ALLOWABLES)
 
 
  class EnglishIgnorables(MemberFilter):
-     tokens = set(IGNORABLES)
+     """NOTE: Not recommended for use.
+     It is better to use a Long* filter such as LongSyllabic than to use this filter.
+     This filter hides words from scoring rather than scoring them poorly,
+     which is more of a benefit than a loss for a word you would like to omit."""
+
+     tokens = prep_dictionary(IGNORABLES)
 
 
  class ProperName(Filter):
@@ -104,28 +141,48 @@ class ProperName(Filter):
      # this will errantly match.
 
 
+ class LongProperName(MinLen, ProperName):
+     length = 2  # reject "names" of length 1
+
+
  class NimiPu(MemberFilter):
-     tokens = set(NIMI_PU)
+     tokens = prep_dictionary(NIMI_PU)
+
+
+ class NimiPuSynonyms(MemberFilter):
+     tokens = prep_dictionary(NIMI_PU_SYNONYMS)
+
+
+ class NimiKuSuli(MemberFilter):
+     tokens = prep_dictionary(NIMI_KU_SULI)
+
 
+ class NimiKuLili(MemberFilter):
+     tokens = prep_dictionary(NIMI_KU_LILI)
 
- class NimiPuAle(MemberFilter):
-     tokens = set(NIMI_PU + NIMI_PU_SYNONYMS)
 
+ class NimiLinkuCore(MemberFilter):
+     tokens = prep_dictionary(NIMI_LINKU_CORE)
 
- class NimiLinku(MemberFilter):
-     tokens = set(NIMI_LINKU)
 
+ class NimiLinkuCommon(MemberFilter):
+     tokens = prep_dictionary(NIMI_LINKU_COMMON)
 
- class NimiLinkuAle(MemberFilter):
-     tokens = set(NIMI_LINKU + NIMI_LINKU_LILI)
+
+ class NimiLinkuUncommon(MemberFilter):
+     tokens = prep_dictionary(NIMI_LINKU_UNCOMMON)
+
+
+ class NimiLinkuObscure(MemberFilter):
+     tokens = prep_dictionary(NIMI_LINKU_OBSCURE)
 
 
  class NimiLinkuSandbox(MemberFilter):
-     tokens = set(NIMI_LINKU + NIMI_LINKU_LILI + NIMI_LINKU_SANDBOX)
+     tokens = prep_dictionary(NIMI_LINKU_SANDBOX)
 
 
  class NimiUCSUR(MemberFilter):
-     tokens = set(NIMI_UCSUR)
+     tokens = prep_dictionary(NIMI_UCSUR)
 
 
  class Phonotactic(RegexFilter):
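
With 0.3.0's `NimiPuAle`, `NimiLinku`, and `NimiLinkuAle` gone, each word list is now its own filter, and each is an independent membership check. A small sketch follows; the expected values assume the usual pu and ku categorization of these words (the word lists themselves live in `constants.py`, which is not shown in this excerpt).

```python
from sonatoki.Filters import NimiPu, NimiKuSuli

print(NimiPu.filter("toki"))     # expected True: "toki" is a pu word
print(NimiKuSuli.filter("kin"))  # expected True: "kin" is a nimi ku suli
print(NimiPu.filter("kipisi"))   # expected False: "kipisi" is not in pu
```
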
@@ -145,6 +202,10 @@ class Phonotactic(RegexFilter):
      )
 
 
+ class LongPhonotactic(MinLen, Phonotactic):
+     length = 3
+
+
  class Syllabic(RegexFilter):
      """Determines if a given token is syllabically valid Toki Pona (or `n`).
      Words must have correctly ordered vowels and consonants, but the phonotactic
@@ -158,6 +219,10 @@ class Syllabic(RegexFilter):
      )
 
 
+ class LongSyllabic(MinLen, Syllabic):
+     length = 3
+
+
  class Alphabetic(SubsetFilter):
      tokens = set(ALPHABET)
 
@@ -166,9 +231,8 @@ class AlphabeticRe(RegexFilter):
      pattern = re.compile(rf"[{ALPHABET}]+", flags=re.IGNORECASE)
 
 
- class TwoOrMoreAlphabetic(Filter):
-     # TODO: alphabetic implementation that ignores single characters
-     pass
+ class LongAlphabetic(MinLen, Alphabetic):
+     length = 3
 
 
  class Numeric(Filter):
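
`LongPhonotactic`, `LongSyllabic`, and `LongAlphabetic` above all follow the same two-line pattern: inherit `MinLen` first so its length check runs before the wrapped filter's own check, then set `length`. A sketch of a user-defined variant built the same way is below (the `MinLen(Alphabetic, 3)` constructor form from the docstring is another option, not repeated here); `StricterSyllabic` is a hypothetical name.

```python
from sonatoki.Filters import MinLen, Syllabic

# Same pattern as the library's own Long* filters, with a stricter cutoff.
# MinLen must come first in the bases so its filter() runs before Syllabic's.
class StricterSyllabic(MinLen, Syllabic):
    length = 4

# StricterSyllabic.filter("kijetesantakalu")  -> expected True
# StricterSyllabic.filter("mu")               -> expected False (shorter than 4)
```
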
@@ -224,11 +288,10 @@ class OrFilter:
      Instead, the user is responsible for building an OrFilter out of their desired filters.
      """
 
-     def __new__(cls, *filters_: Type[Filter]) -> Type[Filter]:
-         if not len(filters_) >= 2:
-             raise ValueError("Must provide at least two Filters to OrFilter.")
+     @staticmethod
+     def __generic_filter(*filters_: Type[Filter]) -> Type[Filter]:
 
-         class AnonymousOrFilter(Filter):
+         class CombinedFilter(Filter):
              filters: List[Type[Filter]] = list(filters_)  # TODO: tuple better?
 
              @classmethod
@@ -240,7 +303,37 @@ class OrFilter:
                          return True
                  return False
 
-         return AnonymousOrFilter
+         return CombinedFilter
+
+     def __new__(cls, *filters: Type[Filter]) -> Type[Filter]:
+         if not len(filters) >= 2:
+             raise ValueError("Provide at least two Filters to OrFilter.")
+
+         member_filters = [f for f in filters if issubclass(f, MemberFilter)]
+         if len(member_filters) >= 2:
+             raise Warning("Use OrMemberFilter for combining two or more MemberFilters.")
+
+         filter = cls.__generic_filter(*filters)
+
+         return filter
+
+
+ class OrMemberFilter:
+     @staticmethod
+     def __member_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
+         all_token_sets: List[Set[str]] = [f.tokens for f in filters]
+         all_tokens: Set[str] = set().union(*all_token_sets)
+
+         class CombinedFilter(MemberFilter):
+             tokens = all_tokens
+
+         return CombinedFilter
+
+     def __new__(cls, *filters_: Type[MemberFilter]) -> Type[MemberFilter]:
+         if not len(filters_) >= 2:
+             raise ValueError("Provide two or more MemberFilters to OrMemberFilter.")
+         filter = cls.__member_filter(*filters_)
+         return filter
 
 
  class AndFilter(Filter):
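
To summarize the split above: `OrFilter` still combines arbitrary filters but now warns when handed two or more `MemberFilter`s, while `OrMemberFilter` merges their token sets into a single membership check up front. A sketch using filter names taken from the Configs.py hunks earlier; `NimiLinkuDiscord` and `SyllabicOrUCSUR` are hypothetical names.

```python
from sonatoki.Filters import (
    OrFilter,
    OrMemberFilter,
    NimiUCSUR,
    Syllabic,
    NimiLinkuCore,
    NimiLinkuCommon,
)

# One merged dictionary lookup, the same combination DiscordConfig uses; the
# result is a single MemberFilter whose tokens are the union of the inputs.
NimiLinkuDiscord = OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR)

# A regex filter plus a single MemberFilter is still fine for OrFilter;
# passing it two or more MemberFilters raises a Warning pointing you toward
# OrMemberFilter instead.
SyllabicOrUCSUR = OrFilter(Syllabic, NimiUCSUR)
```
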
@@ -271,11 +364,15 @@ __all__ = [
      "Alphabetic",
      "AndFilter",
      "EnglishIgnorables",
-     "NimiLinku",
-     "NimiLinkuAle",
+     "LongAlphabetic",
+     "LongPhonotactic",
+     "LongProperName",
+     "LongSyllabic",
+     "MinLen",
+     "NimiLinkuCore",
      "NimiLinkuSandbox",
      "NimiPu",
-     "NimiPuAle",
+     "NimiPuSynonyms",
      "NimiUCSUR",
      "Numeric",
      "OrFilter",