sonatoki 0.5.0__tar.gz → 0.5.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.5.0 → sonatoki-0.5.2}/PKG-INFO +17 -16
- {sonatoki-0.5.0 → sonatoki-0.5.2}/README.md +16 -15
- {sonatoki-0.5.0 → sonatoki-0.5.2}/pyproject.toml +1 -1
- {sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/Configs.py +23 -1
- {sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/Filters.py +4 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/Preprocessors.py +6 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/constants.py +84 -2
- sonatoki-0.5.2/src/sonatoki/linku.json +1 -0
- sonatoki-0.5.2/src/sonatoki/sandbox.json +1 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/tests/test_filters.py +26 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/tests/test_ilo.py +17 -10
- {sonatoki-0.5.0 → sonatoki-0.5.2}/tests/test_preprocessors.py +10 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/tests/test_properties.py +10 -1
- sonatoki-0.5.0/src/sonatoki/linku.json +0 -1
- sonatoki-0.5.0/src/sonatoki/sandbox.json +0 -1
- {sonatoki-0.5.0 → sonatoki-0.5.2}/LICENSE +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/alphabetic.txt +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/syllabic.txt +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/tests/__init__.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/tests/test_cleaners.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/tests/test_scorers.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/tests/test_tokenize.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/tests/test_utils.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.2}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.5.0 → sonatoki-0.5.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.5.0
+Version: 0.5.2
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
@@ -26,9 +26,9 @@ This library, "Language Knowledge," helps you identify whether a message is in T

 I wrote this library with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool now uses this library to great success!

-If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment,
+If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, topic, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language. This complexity applies to Toki Pona too.

-So, this project "solves" that complex problem by offering an opinionated tokenizer and a configurable parser, allowing you to tune its output to your preferences and goals. [Even silly ones.](https://sona.pona.la/wiki/isipin_epiku)
+So, this project "solves" that complex problem by offering an opinionated tokenizer and a configurable parser, allowing you to tune its output to your preferences and goals. [Even silly ones.](https://sona.pona.la/wiki/isipin_epiku)

 ## Quick Start

@@ -61,12 +61,12 @@ Or if you'd prefer to configure on your own:
 from copy import deepcopy
 from sonatoki.ilo import Ilo
 from sonatoki.Configs import BaseConfig
-from sonatoki.Filters import NimiLinkuCore, Phonotactic, ProperName
+from sonatoki.Filters import NimiLinkuCore, NimiLinkuCommon, Phonotactic, ProperName, Or
 from sonatoki.Scorers import SoftPassFail

 def main():
     config = deepcopy(BaseConfig)
-    config["scoring_filters"].extend([NimiLinkuCore, Phonotactic, ProperName])
+    config["scoring_filters"].extend([Or(NimiLinkuCore, NimiLinkuCommon), Phonotactic, ProperName])
     config["scorer"] = SoftPassFail

     ilo = Ilo(**config)
@@ -78,7 +78,7 @@ if __name__ == "__main__":
     main()
 ```

-`Ilo` is highly configurable by necessity, so I recommend looking through the premade configs in `Configs` as well as the individual `Preprocessors`, `Filters`, and `Scorers`.
+`Ilo` is highly configurable by necessity, so I recommend looking through the premade configs in `Configs` as well as the individual `Preprocessors`, `Filters`, and `Scorers`. In `Cleaners`, all you need is `ConsecutiveDuplicates`. In `Tokenizers`, the preferred tokenizers `WordTokenizer` and `SentTokenizer` are already the default in `Ilo`.

 ## Development

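For readers skimming the diff, the revised Quick Start above can be exercised end to end. The sketch below is illustrative only: it assumes `Ilo.is_toki_pona()` as the scoring entry point (as used elsewhere in the project's README), and the sample sentences and expected results are not taken from the package.

```python
# Illustrative sketch of the updated Quick Start config in use.
# Assumes Ilo.is_toki_pona() as shown in the README's Quick Start; sample
# inputs and expected outputs are assumptions, not part of this release.
from copy import deepcopy

from sonatoki.ilo import Ilo
from sonatoki.Configs import BaseConfig
from sonatoki.Filters import NimiLinkuCore, NimiLinkuCommon, Phonotactic, ProperName, Or
from sonatoki.Scorers import SoftPassFail

config = deepcopy(BaseConfig)
config["scoring_filters"].extend([Or(NimiLinkuCore, NimiLinkuCommon), Phonotactic, ProperName])
config["scorer"] = SoftPassFail

ilo = Ilo(**config)
print(ilo.is_toki_pona("mi olin e sina"))               # expected: True
print(ilo.is_toki_pona("this is an English sentence"))  # expected: False
```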
@@ -92,25 +92,26 @@ if __name__ == "__main__":

 The intent is to show our methodology to the Unicode Consortium, particularly to the Script Encoding Working Group (previously the Script Ad Hoc Group). As far as we're aware, zero members of the committee know Toki Pona, which unfortunately means we fall back on English.

-
+I originally intended to translate this file and library into Toki Pona once Unicode had reviewed our proposal, but this library has picked up some interest outside of the Toki Pona community, so this library and README will remain accessible to them.

 ### What's the deal with the tokenizers?

-The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer`
-This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them
-
+The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` attempts to tokenize statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
+This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them such as `'` or `-`.
+Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.

 The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.

 ### Aren't there a lot of false positives?

-
-It's up to you to use this tool responsibly on input you've done your best to clean, such as by using stronger filters before weaker ones.
-For now though, here's a list of relevant false positives:
+For any individual filter, yes. Here are some examples:

-- `ProperName` will errantly match text in languages without a capital/lowercase distinction
-- `Alphabetic`
-- `
+- `ProperName` will errantly match text in languages without a capital/lowercase distinction
+- `Alphabetic` matches words so long as they are only made of letters in Toki Pona's alphabet, which is 14 letters of the Latin alphabet.
+- `Syllabic` and `Phonetic`, despite imposing more structure than `Alphabetic`, will match a surprising amount of English words. For example, every word in "an awesome joke!" matches.
+- `NimiPu` and `NimiLinkuCore` will match `a`, `mute`, `open` regardless of the surrounding language.
+
+This is point of `Ilo` and the `Scorers`: None of these filters would _individually_ be able to correctly identify a Toki Pona statement, but all of them working together with some tuning are able to achieve a surprisingly high accuracy.

 ### Don't some of the cleaners/filters conflict?

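The `Syllabic`/`Phonetic` false positives called out above are easy to reproduce with a toy approximation. The regex below is not the library's implementation; it only sketches the (C)V(n) syllable shape to show why a phrase like "an awesome joke" slips through a syllable-structure check.

```python
import re

# Toy approximation of Toki Pona's (C)V(n) syllable shape, ignoring edge cases
# such as "wu"/"ti"/"nn"; NOT sonatoki's Syllabic filter, just an illustration.
SYLLABLE = r"[jklmnpstw]?[aeiou]n?"
WORD = re.compile(rf"^(?:{SYLLABLE})+$", re.IGNORECASE)

for word in ["an", "awesome", "joke", "strength"]:
    print(word, bool(WORD.match(word)))
# an True, awesome True, joke True, strength False
```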
{sonatoki-0.5.0 → sonatoki-0.5.2}/README.md

@@ -13,9 +13,9 @@ This library, "Language Knowledge," helps you identify whether a message is in T

 I wrote this library with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool now uses this library to great success!

-If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment,
+If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, topic, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language. This complexity applies to Toki Pona too.

-So, this project "solves" that complex problem by offering an opinionated tokenizer and a configurable parser, allowing you to tune its output to your preferences and goals. [Even silly ones.](https://sona.pona.la/wiki/isipin_epiku)
+So, this project "solves" that complex problem by offering an opinionated tokenizer and a configurable parser, allowing you to tune its output to your preferences and goals. [Even silly ones.](https://sona.pona.la/wiki/isipin_epiku)

 ## Quick Start

@@ -48,12 +48,12 @@ Or if you'd prefer to configure on your own:
 from copy import deepcopy
 from sonatoki.ilo import Ilo
 from sonatoki.Configs import BaseConfig
-from sonatoki.Filters import NimiLinkuCore, Phonotactic, ProperName
+from sonatoki.Filters import NimiLinkuCore, NimiLinkuCommon, Phonotactic, ProperName, Or
 from sonatoki.Scorers import SoftPassFail

 def main():
     config = deepcopy(BaseConfig)
-    config["scoring_filters"].extend([NimiLinkuCore, Phonotactic, ProperName])
+    config["scoring_filters"].extend([Or(NimiLinkuCore, NimiLinkuCommon), Phonotactic, ProperName])
     config["scorer"] = SoftPassFail

     ilo = Ilo(**config)
@@ -65,7 +65,7 @@ if __name__ == "__main__":
     main()
 ```

-`Ilo` is highly configurable by necessity, so I recommend looking through the premade configs in `Configs` as well as the individual `Preprocessors`, `Filters`, and `Scorers`.
+`Ilo` is highly configurable by necessity, so I recommend looking through the premade configs in `Configs` as well as the individual `Preprocessors`, `Filters`, and `Scorers`. In `Cleaners`, all you need is `ConsecutiveDuplicates`. In `Tokenizers`, the preferred tokenizers `WordTokenizer` and `SentTokenizer` are already the default in `Ilo`.

 ## Development

@@ -79,25 +79,26 @@ if __name__ == "__main__":

 The intent is to show our methodology to the Unicode Consortium, particularly to the Script Encoding Working Group (previously the Script Ad Hoc Group). As far as we're aware, zero members of the committee know Toki Pona, which unfortunately means we fall back on English.

-
+I originally intended to translate this file and library into Toki Pona once Unicode had reviewed our proposal, but this library has picked up some interest outside of the Toki Pona community, so this library and README will remain accessible to them.

 ### What's the deal with the tokenizers?

-The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer`
-This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them
-
+The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` attempts to tokenize statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
+This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them such as `'` or `-`.
+Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.

 The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.

 ### Aren't there a lot of false positives?

-
-It's up to you to use this tool responsibly on input you've done your best to clean, such as by using stronger filters before weaker ones.
-For now though, here's a list of relevant false positives:
+For any individual filter, yes. Here are some examples:

-- `ProperName` will errantly match text in languages without a capital/lowercase distinction
-- `Alphabetic`
-- `
+- `ProperName` will errantly match text in languages without a capital/lowercase distinction
+- `Alphabetic` matches words so long as they are only made of letters in Toki Pona's alphabet, which is 14 letters of the Latin alphabet.
+- `Syllabic` and `Phonetic`, despite imposing more structure than `Alphabetic`, will match a surprising amount of English words. For example, every word in "an awesome joke!" matches.
+- `NimiPu` and `NimiLinkuCore` will match `a`, `mute`, `open` regardless of the surrounding language.
+
+This is point of `Ilo` and the `Scorers`: None of these filters would _individually_ be able to correctly identify a Toki Pona statement, but all of them working together with some tuning are able to achieve a surprisingly high accuracy.

 ### Don't some of the cleaners/filters conflict?

{sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/Configs.py

@@ -1,6 +1,6 @@
 # STL
 from copy import deepcopy
-from typing import List, Type, TypedDict
+from typing import Set, List, Type, TypedDict, cast

 # PDM
 from typing_extensions import NotRequired
@@ -18,6 +18,7 @@ from sonatoki.Filters import (
     NimiKuLili,
     NimiKuSuli,
     ProperName,
+    Phonotactic,
     Punctuation,
     LongSyllabic,
     Miscellaneous,
@@ -73,6 +74,7 @@ PrefConfig: IloConfig = {
     "scoring_filters": [
         Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
         And(LongSyllabic, Not(FalsePosSyllabic)),
+        # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
         LongProperName,
         LongAlphabetic,
     ],
@@ -101,6 +103,26 @@ CorpusConfig: IloConfig = {
     "scorer": SoftScaling,
     "passing_score": 0.8,
 }
+
+# TODO: create a mechanism to omit tokens from a filter with more granularity
+__corpus_tokens_dict: Set[str] = cast(
+    Set[str],
+    CorpusConfig["scoring_filters"][
+        0
+    ].tokens, # pyright: ignore[reportAttributeAccessIssue]
+)
+__corpus_tokens_dict -= {
+    "an",
+    "i",
+    "me",
+    "ne",
+    "se",
+    "take",
+    "ten",
+    "to",
+    "u",
+    "we",
+}
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
     "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
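The in-place set subtraction above trims ambiguous words out of the first scoring filter's token set. A toy sketch of the same pattern, using a stand-in class rather than sonatoki's MemberFilter, shows the effect:

```python
# Toy stand-in for a set-backed filter; NOT sonatoki's MemberFilter.
class ToyMemberFilter:
    tokens = {"an", "we", "toki", "pona"}

    @classmethod
    def filter(cls, token: str) -> bool:
        return token.lower() in cls.tokens


# Same in-place subtraction as __corpus_tokens_dict above.
ToyMemberFilter.tokens -= {"an", "we"}
print(ToyMemberFilter.filter("toki"))  # True
print(ToyMemberFilter.filter("an"))    # False once trimmed
```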
{sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/Filters.py

@@ -351,6 +351,10 @@ class Or:
         else:
             other_filters.extend(member_filters)

+        if len(other_filters) == 1: # we only had member filters
+            # TODO: this sucks?
+            return other_filters[0]
+
         filter = cls.__generic_filter(*other_filters)
         return filter

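The new branch returns the lone remaining filter instead of wrapping it. A generic sketch of that short-circuit, written against plain predicates rather than sonatoki's filter classes:

```python
from typing import Callable

Predicate = Callable[[str], bool]

def any_of(*preds: Predicate) -> Predicate:
    """Toy combinator: union of predicates, collapsing the single-member case."""
    if len(preds) == 1:  # nothing to combine, reuse the lone predicate as-is
        return preds[0]
    return lambda token: any(p(token) for p in preds)

is_digit: Predicate = str.isdigit
print(any_of(is_digit) is is_digit)           # True: collapsed, no wrapper built
print(any_of(is_digit, str.isalpha)("toki"))  # True: genuine union of two checks
```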
{sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/Preprocessors.py

@@ -90,6 +90,12 @@ class DiscordEmotes(RegexPreprocessor):
     pattern = re.compile(r"<a?:[a-zA-Z0-9_]{2,}:[0-9]{2,}>")


+class ColonEmotes(RegexPreprocessor):
+    """Remove colon-marked emotes `:name:`"""
+
+    pattern = re.compile(r":[a-zA-Z0-9_]{2,}:")
+
+
 class DiscordMentions(RegexPreprocessor):
     pattern = re.compile(r"<@[\!\&]?[0-9]{2,}>")

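The new `ColonEmotes` pattern can be checked in isolation. Assuming `RegexPreprocessor` substitutes matches away with an empty string (the natural reading of these preprocessors), a message loses its `:name:` emotes before tokenizing:

```python
import re

# The ColonEmotes pattern from the diff above, applied directly with re.sub.
pattern = re.compile(r":[a-zA-Z0-9_]{2,}:")
msg = "toki! :wave: sina pona :heart_eyes:"
print(pattern.sub("", msg))  # "toki!  sina pona "
```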
{sonatoki-0.5.0 → sonatoki-0.5.2}/src/sonatoki/constants.py

@@ -519,8 +519,10 @@ ALLOWABLES = {
     "kxk", # ken ala ken
     "wxw", # wile ala wile
     "msa",
+    "anusem",
 }

+# NOTE: This is being tracked manually rather than fetched from syllabics.txt until I am convinced that solution is appropriate
 FALSE_POS_SYLLABIC = {
     # ordered by frequency in previous TPT data
     "like",
@@ -540,6 +542,7 @@ FALSE_POS_SYLLABIC = {
     "man",
     # "son", # sona typo?
     "joke",
+    # pon would go here
     "so",
     "ten",
     "make",
@@ -548,11 +551,14 @@ FALSE_POS_SYLLABIC = {
     # "aka" # in sandbox
     "into",
     "in",
+    "no",
     "some",
+    # "papa", # now in sandbox
     "on",
     "me",
     "ipa",
     "sun",
+    "mine",
     "sense",
     "none",
     "meme",
@@ -561,28 +567,104 @@ FALSE_POS_SYLLABIC = {
     "mon",
     "take",
     "luna",
-    "anti",
     "elo",
+    "japanese",
     "an",
+    "anti",
     "win",
     "won",
-    "we",
+    "we", # word in sandbox
     "men",
     "ton",
     "woke",
+    "sen", # seen
+    "se", # see
     "semi",
     "male",
+    # "pen", # borderline
+    "woman",
+    "line",
+    "meta",
+    "mini",
+    "sine",
+    # "min", # borderline
+    "oposite",
+    "anime",
+    "potato",
+    "japan",
+    "nose",
+    "kilo",
+    "alone",
+    "minute",
+    "late",
+    "women",
+    "leson",
+    "amen",
+    "tote",
+    "lame",
+    "online",
+    "tone",
+    "ate",
+    "mile",
+    "melon",
+    "tense",
+    "nonsense",
+    "nine",
+    "emo",
+    "unlike",
+    "lone",
+    # manual additions
+    "alike",
+    "amuse",
+    "antelope",
+    "antena",
+    "apetite",
+    "asasin",
+    "asasinate",
+    "asinine",
+    "asinine",
+    "asume",
+    "atone",
+    "awake",
+    "awaken",
+    "eliminate",
+    "elite",
+    "emanate",
+    "iluminate",
+    "imense",
+    "imitate",
+    "injoke",
+    "insane",
+    "insolate",
+    "insulate",
+    "intense",
+    "lemon",
+    "manipulate",
+    "misuse",
+    "ne", # "no" in many other languages
+    "wana",
 }

 FALSE_POS_ALPHABETIC: Set[str] = {
     "t",
     "is",
+    "as",
     "not",
+    "link",
+    "wait",
     "lol",
+    "new",
     "also",
     "isn", # TODO: tokenizer....
     "mean",
     "means",
+    "it",
+    "moment",
+    "its",
+    "lmao",
+    "new",
+    "wel",
+    "makes",
 }

 UCSUR_RANGES = [
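The growing `FALSE_POS_SYLLABIC` and `FALSE_POS_ALPHABETIC` sets are consumed as exclusions, mirroring the `And(LongSyllabic, Not(FalsePosSyllabic))` composition visible in the `PrefConfig` hunk above. A toy sketch of that composition (not the library's code) with a crude stand-in for the shape check:

```python
# Toy composition: a word must look plausible AND not be a known false positive.
FALSE_POS = {"like", "joke", "so", "ten", "make"}

def looks_plausible(word: str) -> bool:
    # crude stand-in for the real Syllabic/Alphabetic filters
    return all(c in "aeiouptksmnljw" for c in word.lower()) and len(word) > 2

def passes(word: str) -> bool:
    return looks_plausible(word) and word.lower() not in FALSE_POS

print(passes("joke"))     # False: plausible shape, but blocklisted
print(passes("sitelen"))  # True
```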