sonatoki 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- sonatoki/Configs.py +18 -14
- sonatoki/Filters.py +75 -45
- sonatoki/Preprocessors.py +31 -0
- sonatoki/Tokenizers.py +3 -3
- sonatoki/__main__.py +176 -3
- sonatoki/alphabetic.txt +1771 -0
- sonatoki/constants.py +236 -47
- sonatoki/ilo.py +1 -1
- sonatoki/linku.json +1 -1
- sonatoki/sandbox.json +1 -1
- sonatoki/syllabic.txt +297 -0
- sonatoki/utils.py +0 -56
- {sonatoki-0.4.0.dist-info → sonatoki-0.5.0.dist-info}/METADATA +2 -1
- sonatoki-0.5.0.dist-info/RECORD +20 -0
- sonatoki-0.4.0.dist-info/RECORD +0 -18
- {sonatoki-0.4.0.dist-info → sonatoki-0.5.0.dist-info}/WHEEL +0 -0
- {sonatoki-0.4.0.dist-info → sonatoki-0.5.0.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
```diff
@@ -7,6 +7,9 @@ from typing_extensions import NotRequired
 
 # LOCAL
 from sonatoki.Filters import (
+    Or,
+    And,
+    Not,
     Filter,
     Numeric,
     Syllabic,
@@ -21,8 +24,8 @@ from sonatoki.Filters import (
     NimiLinkuCore,
     LongAlphabetic,
     LongProperName,
-    OrMemberFilter,
     NimiLinkuCommon,
+    FalsePosSyllabic,
     NimiLinkuObscure,
     NimiLinkuSandbox,
     NimiLinkuUncommon,
@@ -32,6 +35,7 @@ from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer
 from sonatoki.Preprocessors import (
     URLs,
+    Emoji,
     Backticks,
     Reference,
     Preprocessor,
@@ -63,12 +67,12 @@ BaseConfig: IloConfig = {
 
 
 PrefConfig: IloConfig = {
-    "preprocessors": [Backticks, URLs, Reference],
+    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-
-        LongSyllabic,
+        Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
         LongAlphabetic,
     ],
@@ -77,11 +81,11 @@ PrefConfig: IloConfig = {
 }
 
 CorpusConfig: IloConfig = {
-    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-
+        Or(
             NimiLinkuCore,
             NimiLinkuCommon,
             NimiLinkuUncommon,
@@ -90,7 +94,7 @@ CorpusConfig: IloConfig = {
             NimiUCSUR,
             Miscellaneous,
         ),
-        LongSyllabic,
+        And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
         LongAlphabetic,
     ],
@@ -99,7 +103,7 @@ CorpusConfig: IloConfig = {
 }
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
-    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
@@ -108,18 +112,18 @@ LazyConfig: IloConfig = {
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
-    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-
+        Or(
             NimiKuSuli,
             NimiKuLili,
             NimiLinkuUncommon,
             NimiLinkuObscure,
             NimiLinkuSandbox,
         ),
-        LongSyllabic,
+        And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
         LongAlphabetic,
     ],
@@ -129,12 +133,12 @@ IsipinEpikuConfig: IloConfig = {
 
 
 DiscordConfig: IloConfig = {
-    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-
-        LongSyllabic,
+        Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
         LongAlphabetic,
     ],
```
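The reworked configs plug into the same pipeline entry point as before. Below is a minimal usage sketch, not part of the diff: it assumes the `Ilo` class in `sonatoki/ilo.py` still accepts the config dict unpacked as keyword arguments and still exposes `is_toki_pona`, as in 0.4.x.

```python
# Minimal sketch (assumption: Ilo(**config) and is_toki_pona() are unchanged from 0.4.x).
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)
# Known words now score via the merged Or(...) set; unknown-but-syllabic words
# are only scored when they are not in the new false-positive word list.
print(ilo.is_toki_pona("mi olin e sina"))
```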
sonatoki/Filters.py
CHANGED
```diff
@@ -6,7 +6,7 @@ from functools import lru_cache as cache  # cache comes in 3.9
 
 # PDM
 import regex
-from typing_extensions import override
+from typing_extensions import override, deprecated
 
 # LOCAL
 from sonatoki.utils import prep_dictionary
@@ -17,18 +17,21 @@ from sonatoki.constants import (
     ALL_PUNCT,
     ALLOWABLES,
     CONSONANTS,
-    IGNORABLES,
     NIMI_UCSUR,
     NIMI_KU_LILI,
     NIMI_KU_SULI,
     NIMI_LINKU_CORE,
-    ALL_PUNCT_RANGES,
     NIMI_PU_SYNONYMS,
     NIMI_LINKU_COMMON,
+    FALSE_POS_SYLLABIC,
     NIMI_LINKU_OBSCURE,
     NIMI_LINKU_SANDBOX,
-
+    NOT_IN_PUNCT_CLASS,
     NIMI_LINKU_UNCOMMON,
+    ALL_PUNCT_RANGES_STR,
+    FALSE_POS_ALPHABETIC,
+    UCSUR_PUNCT_RANGES_STR,
+    EMOJI_VARIATION_SELECTOR_RANGES_STR,
 )
 
 regex.DEFAULT_VERSION = regex.VERSION1
@@ -113,13 +116,18 @@ class Miscellaneous(MemberFilter):
     tokens = prep_dictionary(ALLOWABLES)
 
 
-class 
-    """
-
-    This filter hides words from scoring rather than scoring them poorly,
-    which is more of a benefit than a loss for a word you would like to omit."""
+class FalsePosSyllabic(MemberFilter):
+    """A MemberFilter of words which would match Syllabic (and often Phonetic),
+    but are words in other languages."""
 
-    tokens = prep_dictionary(
+    tokens = prep_dictionary(FALSE_POS_SYLLABIC)
+
+
+class FalsePosAlphabetic(MemberFilter):
+    """A MemberFilter of words which would match Alphabetic, but are words in
+    other languages."""
+
+    tokens = prep_dictionary(FALSE_POS_ALPHABETIC)
 
 
 class ProperName(Filter):
@@ -273,7 +281,7 @@ class PunctuationRe(RegexFilter):
     Goes out of date compared to the `regex` library if UNICODE_PUNCT_RANGES is not updated.
     """
 
-    pattern = re.compile(rf"[{
+    pattern = re.compile(rf"[{ALL_PUNCT_RANGES_STR}]+")
 
 
 class PunctuationRe1(Regex1Filter):
@@ -281,22 +289,24 @@ class PunctuationRe1(Regex1Filter):
     punctuation."""
 
     pattern = regex.compile(
-        rf"[\p{{Punctuation}}\p{{posix_punct}}{
+        rf"[\p{{Punctuation}}\p{{posix_punct}}{NOT_IN_PUNCT_CLASS}{UCSUR_PUNCT_RANGES_STR}{EMOJI_VARIATION_SELECTOR_RANGES_STR}]+"
     )
 
 
-class 
+class Or:
     """Instantiate with more than one filter to compose them into one filter,
     returning True when any individual filter matches or False otherwise.
-    Requires at least two filters.
-
-
-
-
-
-
-
-
+    Requires at least two filters. If two or more MemberFilters are provided,
+    they will be combined by creating a single set with the members of every
+    individual filter.
+
+    Or exists as a compromise between the need to score some filters
+    equally, while not adding custom behavior to scorers. I could have
+    allowed a position to have a list of filters instead of one filter,
+    but this would require cleaning the user's input, and nested
+    handling of lists. It also would not have been as powerful- I would
+    need another param for the and/or switch, or to not give users the
+    choice.
 
     Instead, the user is responsible for building an OrFilter out of
     their desired filters.
@@ -304,7 +314,6 @@ class OrFilter:
 
     @staticmethod
     def __generic_filter(*filters_: Type[Filter]) -> Type[Filter]:
-
        class CombinedFilter(Filter):
            filters: List[Type[Filter]] = list(filters_)  # TODO: tuple better?
 
@@ -319,20 +328,6 @@ class OrFilter:
 
         return CombinedFilter
 
-    def __new__(cls, *filters: Type[Filter]) -> Type[Filter]:
-        if not len(filters) >= 2:
-            raise ValueError("Provide at least two Filters to OrFilter.")
-
-        member_filters = [f for f in filters if issubclass(f, MemberFilter)]
-        if len(member_filters) >= 2:
-            raise Warning("Use OrMemberFilter for combining two or more MemberFilters.")
-
-        filter = cls.__generic_filter(*filters)
-
-        return filter
-
-
-class OrMemberFilter:
     @staticmethod
     def __member_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
         all_token_sets: List[Set[str]] = [f.tokens for f in filters]
@@ -343,14 +338,24 @@ class OrMemberFilter:
 
         return CombinedFilter
 
-    def __new__(cls, *
-        if not len(
-            raise ValueError("Provide
-
+    def __new__(cls, *filters: Type[Filter]) -> Type[Filter]:
+        if not len(filters) >= 2:
+            raise ValueError("Provide at least two Filters to OrFilter.")
+
+        member_filters = [f for f in filters if issubclass(f, MemberFilter)]
+        other_filters = [f for f in filters if not issubclass(f, MemberFilter)]
+        if len(member_filters) >= 2:
+            # we can save some effort by making a single filter out of these
+            member_filter = cls.__member_filter(*member_filters)
+            other_filters.append(member_filter)
+        else:
+            other_filters.extend(member_filters)
+
+        filter = cls.__generic_filter(*other_filters)
         return filter
 
 
-class 
+class And:
     """Instantiate with more than one filter to compose them into one filter,
     returning False when any individual filter fails to match or True
     otherwise.
@@ -377,10 +382,34 @@ class AndFilter:
         return AnonymousAndFilter
 
 
+class Not(Filter):
+    """
+    Meta filter which may be inherited by or constructed with a filter to invert its output.
+    ---
+    ```
+    from sonatoki.Filters import Alphabetic, Not
+
+    my_filter = Not(Alphabetic)
+    class MyFilter(Not, Alphabetic):
+        ...
+    ```
+    """
+
+    @classmethod
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        return not super().filter(token)
+
+    def __new__(cls, filter: Type[Filter]) -> Type[Filter]:
+        class NotFilter(Not, filter): ...
+
+        return NotFilter
+
+
 __all__ = [
     "Alphabetic",
-    "
-    "
+    "And",
+    "FalsePosSyllabic",
     "LongAlphabetic",
     "LongPhonotactic",
     "LongProperName",
@@ -391,8 +420,9 @@ __all__ = [
     "NimiPu",
     "NimiPuSynonyms",
     "NimiUCSUR",
+    "Not",
     "Numeric",
-    "
+    "Or",
     "Phonotactic",
     "ProperName",
     "Punctuation",
```
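The new `Or`, `And`, and `Not` compose filter *classes* into new anonymous filter classes, so the configs above pass the composed result where a single filter used to go. A minimal sketch of the same pattern, using only names present in this diff (the variable names and sample tokens are illustrative, not from the package):

```python
from sonatoki.Filters import And, Not, Or, FalsePosSyllabic, LongSyllabic, NimiLinkuCore, NimiLinkuCommon

# accept long syllabic tokens unless they are known false positives from other languages
StrictSyllabic = And(LongSyllabic, Not(FalsePosSyllabic))

# two MemberFilters: per the Or docstring, their token sets are merged into one set lookup
KnownWords = Or(NimiLinkuCore, NimiLinkuCommon)

for token in ("pona", "kijetesantakalu", "misikeke"):
    print(token, KnownWords.filter(token) or StrictSyllabic.filter(token))
```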
sonatoki/Preprocessors.py
CHANGED
```diff
@@ -21,6 +21,7 @@ import re
 from abc import ABC, abstractmethod
 
 # PDM
+import emoji
 import regex
 from typing_extensions import override
 
@@ -162,6 +163,34 @@ class AllQuotes(RegexPreprocessor):
     )
 
 
+class Emoji(Preprocessor):
+    @classmethod
+    @override
+    def process(cls, msg: str) -> str:
+        return emoji.replace_emoji(msg)
+
+
+class ZeroWidths(RegexPreprocessor):
+    """Remove the Zero Width Joiner and Zero Width Non-Joiner from the input.
+
+    ZWJ and ZWNJ do serve semantic purposes,
+    such as combining many person emojis into the family emojis,
+    or ensuring two characters do not become a ligature.
+    However, all emojis are considered punctuation by this library,
+    so preprocessing ZWJ out is more accurate:
+    It will leave behind the component emojis, which will be ignored.
+
+    But ZWJ cannot be considered punctuation for tokenizing purposes because it is used in the middle of words to render them differently.
+    In this vein, ZWJ is a function character.
+
+    In the future, it may be smarter to omit ZWJ in the tokenization process,
+    or to make the tokenizer smarter by having it keep together collected emojis.
+    But in order to do this, emoji would have to be accurately distinguished from all other punctuation.
+    """
+
+    pattern = re.compile("[\\U0000200C-\\U0000200D]")
+
+
 __all__ = [
     "AllQuotes",
     "AngleBracketObject",
@@ -176,4 +205,6 @@ __all__ = [
     "SingleQuotes",
     "Spoilers",
     "URLs",
+    "ZeroWidths",
+    "Emoji",
 ]
```
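`Emoji` delegates to the third-party `emoji` package and removes whole emoji sequences before tokenization, while `ZeroWidths` strips only the joiner characters. A minimal usage sketch, not from the diff; it assumes the `process` classmethod on `RegexPreprocessor` has the same signature as the one shown for `Emoji`:

```python
from sonatoki.Preprocessors import Emoji, ZeroWidths

msg = "toki 👩‍👩‍👧 pona"
print(Emoji.process(msg))       # emoji sequence removed entirely
print(ZeroWidths.process(msg))  # only the ZWJs removed; component emojis remain
```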
sonatoki/Tokenizers.py
CHANGED
```diff
@@ -10,7 +10,7 @@ from typing_extensions import override, deprecated
 # LOCAL
 from sonatoki.utils import regex_escape
 from sonatoki.Filters import NimiUCSUR  # seriously this sucks
-from sonatoki.constants import ALL_PUNCT, SENTENCE_PUNCT,
+from sonatoki.constants import ALL_PUNCT, SENTENCE_PUNCT, ALL_PUNCT_RANGES_STR
 
 regex.DEFAULT_VERSION = regex.VERSION1
 
@@ -66,7 +66,7 @@ class WordTokenizer(SetTokenizer):
         last_membership = s[0] in cls.delimiters
         for i, char in enumerate(s):
             mem = char in cls.delimiters
-            ucsur = NimiUCSUR.filter(char)
+            ucsur = NimiUCSUR.filter(char)
             changed = (mem != last_membership) or ucsur
             # this keeps contiguous words together, but splits UCSUR
             if not changed:
@@ -94,7 +94,7 @@ class WordTokenizer(SetTokenizer):
     "WordTokenizerRe is a previous reference implementation. Its behavior has diverged from WordTokenizer and it may not be restored."
 )
 class WordTokenizerRe(RegexTokenizer):
-    pattern = re.compile(rf"""([{
+    pattern = re.compile(rf"""([{ALL_PUNCT_RANGES_STR}]+|\s+)""")
 
 
 @deprecated(
```
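The tokenizer change is essentially the renamed constant (`ALL_PUNCT_RANGES` → `ALL_PUNCT_RANGES_STR`); tokenization behavior is unchanged. A minimal sketch of the class-level API, assuming `WordTokenizer.tokenize` remains available as in 0.4.x:

```python
from sonatoki.Tokenizers import WordTokenizer

# contiguous words stay together; punctuation runs become their own tokens
print(WordTokenizer.tokenize("mi pona, sina pona!"))
```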
sonatoki/__main__.py
CHANGED
```diff
@@ -1,9 +1,182 @@
 #!/bin/env python3
+# STL
+import os
+import json
+import argparse
+from typing import Any, Set, Dict, List
 
+# PDM
+import emoji
+import requests
 
-
-
+# LOCAL
+from sonatoki.utils import find_unicode_ranges
+from sonatoki.Filters import (
+    Or,
+    LongSyllabic,
+    NimiLinkuCore,
+    LongAlphabetic,
+    NimiLinkuCommon,
+    NimiLinkuObscure,
+    NimiLinkuUncommon,
+)
+from sonatoki.Cleaners import ConsecutiveDuplicates
+from sonatoki.constants import (
+    UCSUR_PUNCT_RANGES,
+    UNICODE_PUNCT_RANGES,
+    EMOJI_VARIATION_SELECTOR_RANGES,
+)
+
+HERE = os.path.dirname(os.path.realpath(__file__))
+
+UNICODE_DATA = "https://unicode.org/Public/UNIDATA/UnicodeData.txt"
+
+LINKU_WORDS = "https://api.linku.la/v1/words?lang=en"
+LINKU_SANDBOX = "https://api.linku.la/v1/sandbox?lang=en"
+
+WORDS_10K = "https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt"
+WORDS_25K = "https://raw.githubusercontent.com/dolph/dictionary/master/popular.txt"
+WORDS_479K = (
+    "https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt"
+)
+
+HEADERS = {  # pretend to be Chrome 121, just in case
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.3"
+}
+
+
+def download(url: str) -> str:
+    if not url.startswith("https://"):
+        raise ValueError(url)
+
+    resp = requests.get(url, timeout=5, headers=HEADERS)
+    return resp.text
+
+
+def download_json(url: str) -> Dict[str, Any]:
+    resp = download(url)
+    return json.loads(resp)
+
+
+def regen_linku_data():
+    data = download_json(LINKU_WORDS)
+    with open(os.path.join(HERE, "linku.json"), "w") as f:
+        _ = f.write(json.dumps(data))
+
+    data = download_json(LINKU_SANDBOX)
+    with open(os.path.join(HERE, "sandbox.json"), "w") as f:
+        _ = f.write(json.dumps(data))
+
+
+def regen_false_negatives():
+    # TODO: regen from my frequency data where the score is below 0.8?
+    KnownWords = Or(
+        NimiLinkuCore,
+        NimiLinkuCommon,
+        NimiLinkuUncommon,
+        NimiLinkuObscure,
+    )
+
+    syllabic_matches: Set[str] = set()
+    alphabetic_matches: Set[str] = set()
+    data = download(WORDS_25K)
+    for word in data.splitlines():
+        if not word:
+            continue
+        word = ConsecutiveDuplicates.clean(word)
+
+        if KnownWords.filter(word):
+            # ignore dictionary
+            continue
+        if LongSyllabic.filter(word):
+            syllabic_matches.add(word)
+            continue
+        if LongAlphabetic.filter(word):
+            alphabetic_matches.add(word)
+            continue
+
+    # TODO: include short matches or no?
+    with open(os.path.join(HERE, "syllabic.txt"), "w") as f:
+        syllabic_final = sorted([word + "\n" for word in syllabic_matches])
+        f.writelines(syllabic_final)
+
+    with open(os.path.join(HERE, "alphabetic.txt"), "w") as f:
+        alphabetic_final = sorted([word + "\n" for word in alphabetic_matches])
+        f.writelines(alphabetic_final)
+
+
+def regen_unicode_data():
+    PUNCT_CATEGORIES = {
+        # Punctuation
+        "Pc",  # Connector
+        "Pd",  # Dash
+        "Pe",  # Close (end)
+        "Pf",  # Final
+        "Pi",  # Initial
+        "Po",  # Other
+        "Ps",  # Open (sOpen)
+        # Symbol
+        "Sm",  # Math
+        "Sk",  # Modifier (kModifier)
+        "Sc",  # Currency
+        "So",  # Other
+    }
+    r"""These characters are in Symbol other (So) but are not in
+    `\p{Punctuation}` However, I began excluding them again, because it turns
+    out that some sequences of latin alphabet emoji."""
+
+    # NOTE: There are many characters which look like writing characters but are in the punctuation character class. Examples:
+    # - kangxi radicals from ⺀ to ⿕ which are for demonstration, not writing
+    # - parenthesized hangul letters and syllables from ㈀ to ㈜
+    # - circled katakana from ㋐ to ㋾
+    # the latter two shouldn't be in `\p{Punctuation}` if the latin alphabet isn't... oof
+
+    def is_punctuation(data: List[str]):
+        return data[2] in PUNCT_CATEGORIES
+
+    def get_character(data: List[str]):
+        return chr(int(data[0], 16))
+
+    unicode_data = download(UNICODE_DATA)
+    unicode_punctuation = ""
+    for line in unicode_data.split("\n"):
+        if not line:  # damn you, trailing newline
+            continue
+        # NOTE: UnicodeData.txt lists a range if there are many consecutive similar characters
+        # (e.g. CJK Ideograph, First at 4E00 and CJK Ideograph, Last at 9FFF).
+        # This does not apply to any currently defined punctuation category.
+
+        unicode_data = line.split(";")
+        if not is_punctuation(unicode_data):
+            continue
+
+        char = get_character(unicode_data)
+
+        unicode_punctuation += char
+
+    unicode_punctuation = emoji.replace_emoji(unicode_punctuation)
+
+    unicode_ranges = find_unicode_ranges(unicode_punctuation)
+    unicode_ranges.extend(UCSUR_PUNCT_RANGES)
+    # unicode_ranges.extend(EMOJI_VARIATION_SELECTOR_RANGES)  # made unnecessary by emoji library
+    unicode_ranges = sorted(unicode_ranges)
+    # sorted in case my manual additions are out of order
+
+    if unicode_ranges != UNICODE_PUNCT_RANGES:
+        output = json.dumps(unicode_ranges, indent=4, ensure_ascii=True)
+        print(output)
+
+
+def main(argv: argparse.Namespace):
+    regen_unicode_data()
+    regen_linku_data()
+    regen_false_negatives()
 
 
 if __name__ == "__main__":
-
+    """Helper script to fetch UNICODE_PUNCT in constants.py."""
+    parser = argparse.ArgumentParser()
+
+    # TODO: choice between regen unicode data, regen linku, regen english phonomatches
+    argv = parser.parse_args()
+    main(argv)
```
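The expanded `__main__.py` turns the package into a data-regeneration helper, which explains the new `alphabetic.txt` and `syllabic.txt` files in this release. A minimal sketch of calling the helpers directly, equivalent to running `python -m sonatoki`; network access is required and the data files are rewritten in place:

```python
# The three functions below are defined in the diff above; importing the
# __main__ module does not trigger main() because of the __name__ guard.
from sonatoki.__main__ import regen_unicode_data, regen_linku_data, regen_false_negatives

regen_unicode_data()     # prints updated UNICODE_PUNCT_RANGES as JSON if it changed
regen_linku_data()       # rewrites linku.json and sandbox.json next to the module
regen_false_negatives()  # rewrites syllabic.txt and alphabetic.txt
```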