sonatoki 0.3.3__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.
sonatoki/Configs.py CHANGED
@@ -2,8 +2,14 @@
 from copy import deepcopy
 from typing import List, Type, TypedDict
 
+# PDM
+from typing_extensions import NotRequired
+
 # LOCAL
 from sonatoki.Filters import (
+    Or,
+    And,
+    Not,
     Filter,
     Numeric,
     Syllabic,
@@ -18,17 +24,18 @@ from sonatoki.Filters import (
     NimiLinkuCore,
     LongAlphabetic,
     LongProperName,
-    OrMemberFilter,
     NimiLinkuCommon,
+    FalsePosSyllabic,
     NimiLinkuObscure,
     NimiLinkuSandbox,
     NimiLinkuUncommon,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
-from sonatoki.Tokenizers import Tokenizer, WordTokenizer
+from sonatoki.Tokenizers import Tokenizer
 from sonatoki.Preprocessors import (
     URLs,
+    Emoji,
     Backticks,
     Reference,
     Preprocessor,
@@ -38,12 +45,13 @@ from sonatoki.Preprocessors import (
 
 class IloConfig(TypedDict):
     preprocessors: List[Type[Preprocessor]]
-    word_tokenizer: Type[Tokenizer]
     cleaners: List[Type[Cleaner]]
     ignoring_filters: List[Type[Filter]]
     scoring_filters: List[Type[Filter]]
     scorer: Type[Scorer]
     passing_score: Number
+    word_tokenizer: NotRequired[Type[Tokenizer]]
+    sent_tokenizer: NotRequired[Type[Tokenizer]]
 
 
 # TODO: branching configs? config builder?
@@ -55,31 +63,29 @@ BaseConfig: IloConfig = {
     "scoring_filters": [],
     "scorer": PassFail,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 
 
 PrefConfig: IloConfig = {
-    "preprocessors": [Backticks, URLs, Reference],
+    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
-        LongSyllabic,
+        Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
         LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 
 CorpusConfig: IloConfig = {
-    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        OrMemberFilter(
+        Or(
             NimiLinkuCore,
             NimiLinkuCommon,
             NimiLinkuUncommon,
@@ -88,60 +94,56 @@ CorpusConfig: IloConfig = {
             NimiUCSUR,
             Miscellaneous,
         ),
-        LongSyllabic,
+        And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
         LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
-    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
-    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        OrMemberFilter(
+        Or(
            NimiKuSuli,
            NimiKuLili,
            NimiLinkuUncommon,
            NimiLinkuObscure,
            NimiLinkuSandbox,
        ),
-        LongSyllabic,
+        And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
         LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 
 
 DiscordConfig: IloConfig = {
-    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
-        LongSyllabic,
+        Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
         LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 
 TelegramConfig: IloConfig = deepcopy(PrefConfig)
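
The config changes above make word_tokenizer (and the new sent_tokenizer) optional keys and route syllabic scoring through And(LongSyllabic, Not(FalsePosSyllabic)). A minimal usage sketch, assuming the Ilo(**config) constructor and is_toki_pona method documented by the package, neither of which appears in this diff:

    from copy import deepcopy

    from sonatoki.ilo import Ilo
    from sonatoki.Configs import PrefConfig

    # The NotRequired tokenizer keys may simply be absent from the config dict.
    ilo = Ilo(**PrefConfig)
    ilo.is_toki_pona("o pilin insa e ni: sina pilin e sewi")  # expected to pass

    # Deriving a variant the same way TelegramConfig derives from PrefConfig:
    StrictConfig = deepcopy(PrefConfig)
    StrictConfig["passing_score"] = 0.9
    strict_ilo = Ilo(**StrictConfig)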
sonatoki/Filters.py CHANGED
@@ -6,7 +6,7 @@ from functools import lru_cache as cache # cache comes in 3.9
 
 # PDM
 import regex
-from typing_extensions import override
+from typing_extensions import override, deprecated
 
 # LOCAL
 from sonatoki.utils import prep_dictionary
@@ -17,18 +17,21 @@ from sonatoki.constants import (
     ALL_PUNCT,
     ALLOWABLES,
     CONSONANTS,
-    IGNORABLES,
     NIMI_UCSUR,
     NIMI_KU_LILI,
     NIMI_KU_SULI,
     NIMI_LINKU_CORE,
-    ALL_PUNCT_RANGES,
     NIMI_PU_SYNONYMS,
     NIMI_LINKU_COMMON,
+    FALSE_POS_SYLLABIC,
     NIMI_LINKU_OBSCURE,
     NIMI_LINKU_SANDBOX,
-    UCSUR_PUNCT_RANGES,
+    NOT_IN_PUNCT_CLASS,
     NIMI_LINKU_UNCOMMON,
+    ALL_PUNCT_RANGES_STR,
+    FALSE_POS_ALPHABETIC,
+    UCSUR_PUNCT_RANGES_STR,
+    EMOJI_VARIATION_SELECTOR_RANGES_STR,
 )
 
 regex.DEFAULT_VERSION = regex.VERSION1
@@ -113,13 +116,18 @@ class Miscellaneous(MemberFilter):
     tokens = prep_dictionary(ALLOWABLES)
 
 
-class EnglishIgnorables(MemberFilter):
-    """NOTE: Not recommended for use.
-    It is better to use a Long* filter such as LongSyllabic than to use this filter.
-    This filter hides words from scoring rather than scoring them poorly,
-    which is more of a benefit than a loss for a word you would like to omit."""
+class FalsePosSyllabic(MemberFilter):
+    """A MemberFilter of words which would match Syllabic (and often Phonetic),
+    but are words in other languages."""
 
-    tokens = prep_dictionary(IGNORABLES)
+    tokens = prep_dictionary(FALSE_POS_SYLLABIC)
+
+
+class FalsePosAlphabetic(MemberFilter):
+    """A MemberFilter of words which would match Alphabetic, but are words in
+    other languages."""
+
+    tokens = prep_dictionary(FALSE_POS_ALPHABETIC)
 
 
 class ProperName(Filter):
@@ -273,7 +281,7 @@ class PunctuationRe(RegexFilter):
     Goes out of date compared to the `regex` library if UNICODE_PUNCT_RANGES is not updated.
     """
 
-    pattern = re.compile(rf"[{ALL_PUNCT_RANGES}]+")
+    pattern = re.compile(rf"[{ALL_PUNCT_RANGES_STR}]+")
 
 
 class PunctuationRe1(Regex1Filter):
@@ -281,22 +289,24 @@ class PunctuationRe1(Regex1Filter):
     punctuation."""
 
     pattern = regex.compile(
-        rf"[\p{{Punctuation}}\p{{posix_punct}}{UCSUR_PUNCT_RANGES}]+"
+        rf"[\p{{Punctuation}}\p{{posix_punct}}{NOT_IN_PUNCT_CLASS}{UCSUR_PUNCT_RANGES_STR}{EMOJI_VARIATION_SELECTOR_RANGES_STR}]+"
    )
 
 
-class OrFilter:
+class Or:
     """Instantiate with more than one filter to compose them into one filter,
     returning True when any individual filter matches or False otherwise.
-    Requires at least two filters.
-
-    OrFilter exists as a compromise between the need to score some
-    filters equally, while not adding custom behavior to scorers. I
-    could have allowed a position to have a list of filters instead of
-    one filter, but this would require cleaning the user's input, and
-    nested handling of lists. It also would not have been as powerful- I
-    would need another param for the and/or switch, or to not give users
-    the choice.
+    Requires at least two filters. If two or more MemberFilters are provided,
+    they will be combined by creating a single set with the members of every
+    individual filter.
+
+    Or exists as a compromise between the need to score some filters
+    equally, while not adding custom behavior to scorers. I could have
+    allowed a position to have a list of filters instead of one filter,
+    but this would require cleaning the user's input, and nested
+    handling of lists. It also would not have been as powerful- I would
+    need another param for the and/or switch, or to not give users the
+    choice.
 
     Instead, the user is responsible for building an OrFilter out of
     their desired filters.
@@ -304,7 +314,6 @@ class OrFilter:
 
     @staticmethod
     def __generic_filter(*filters_: Type[Filter]) -> Type[Filter]:
-
         class CombinedFilter(Filter):
             filters: List[Type[Filter]] = list(filters_)  # TODO: tuple better?
 
@@ -319,20 +328,6 @@ class OrFilter:
 
         return CombinedFilter
 
-    def __new__(cls, *filters: Type[Filter]) -> Type[Filter]:
-        if not len(filters) >= 2:
-            raise ValueError("Provide at least two Filters to OrFilter.")
-
-        member_filters = [f for f in filters if issubclass(f, MemberFilter)]
-        if len(member_filters) >= 2:
-            raise Warning("Use OrMemberFilter for combining two or more MemberFilters.")
-
-        filter = cls.__generic_filter(*filters)
-
-        return filter
-
-
-class OrMemberFilter:
     @staticmethod
     def __member_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
         all_token_sets: List[Set[str]] = [f.tokens for f in filters]
@@ -343,14 +338,24 @@ class OrMemberFilter:
 
         return CombinedFilter
 
-    def __new__(cls, *filters_: Type[MemberFilter]) -> Type[MemberFilter]:
-        if not len(filters_) >= 2:
-            raise ValueError("Provide two or more MemberFilters to OrMemberFilter.")
-        filter = cls.__member_filter(*filters_)
+    def __new__(cls, *filters: Type[Filter]) -> Type[Filter]:
+        if not len(filters) >= 2:
+            raise ValueError("Provide at least two Filters to OrFilter.")
+
+        member_filters = [f for f in filters if issubclass(f, MemberFilter)]
+        other_filters = [f for f in filters if not issubclass(f, MemberFilter)]
+        if len(member_filters) >= 2:
+            # we can save some effort by making a single filter out of these
+            member_filter = cls.__member_filter(*member_filters)
+            other_filters.append(member_filter)
+        else:
+            other_filters.extend(member_filters)
+
+        filter = cls.__generic_filter(*other_filters)
         return filter
 
 
-class AndFilter:
+class And:
     """Instantiate with more than one filter to compose them into one filter,
     returning False when any individual filter fails to match or True
     otherwise.
@@ -377,10 +382,34 @@ class AndFilter:
         return AnonymousAndFilter
 
 
+class Not(Filter):
+    """
+    Meta filter which may be inherited by or constructed with a filter to invert its output.
+    ---
+    ```
+    from sonatoki.Filters import Alphabetic, Not
+
+    my_filter = Not(Alphabetic)
+    class MyFilter(Not, Alphabetic):
+        ...
+    ```
+    """
+
+    @classmethod
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        return not super().filter(token)
+
+    def __new__(cls, filter: Type[Filter]) -> Type[Filter]:
+        class NotFilter(Not, filter): ...
+
+        return NotFilter
+
+
 __all__ = [
     "Alphabetic",
-    "AndFilter",
-    "EnglishIgnorables",
+    "And",
+    "FalsePosSyllabic",
     "LongAlphabetic",
     "LongPhonotactic",
     "LongProperName",
@@ -391,8 +420,9 @@ __all__ = [
     "NimiPu",
     "NimiPuSynonyms",
     "NimiUCSUR",
+    "Not",
     "Numeric",
-    "OrFilter",
+    "Or",
     "Phonotactic",
     "ProperName",
     "Punctuation",
sonatoki/Preprocessors.py CHANGED
@@ -21,6 +21,7 @@ import re
 from abc import ABC, abstractmethod
 
 # PDM
+import emoji
 import regex
 from typing_extensions import override
 
@@ -162,6 +163,34 @@ class AllQuotes(RegexPreprocessor):
     )
 
 
+class Emoji(Preprocessor):
+    @classmethod
+    @override
+    def process(cls, msg: str) -> str:
+        return emoji.replace_emoji(msg)
+
+
+class ZeroWidths(RegexPreprocessor):
+    """Remove the Zero Width Joiner and Zero Width Non-Joiner from the input.
+
+    ZWJ and ZWNJ do serve semantic purposes,
+    such as combining many person emojis into the family emojis,
+    or ensuring two characters do not become a ligature.
+    However, all emojis are considered punctuation by this library,
+    so preprocessing ZWJ out is more accurate:
+    It will leave behind the component emojis, which will be ignored.
+
+    But ZWJ cannot be considered punctuation for tokenizing purposes because it is used in the middle of words to render them differently.
+    In this vein, ZWJ is a function character.
+
+    In the future, it may be smarter to omit ZWJ in the tokenization process,
+    or to make the tokenizer smarter by having it keep together collected emojis.
+    But in order to do this, emoji would have to be accurately distinguished from all other punctuation.
+    """
+
+    pattern = re.compile("[\\U0000200C-\\U0000200D]")
+
+
 __all__ = [
     "AllQuotes",
     "AngleBracketObject",
@@ -176,4 +205,6 @@ __all__ = [
     "SingleQuotes",
     "Spoilers",
     "URLs",
+    "ZeroWidths",
+    "Emoji",
 ]
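
The new Emoji preprocessor delegates emoji removal to the third-party emoji library, while ZeroWidths strips ZWJ/ZWNJ with a regex. Preprocessors expose a process(msg) classmethod, as the Emoji definition above shows, so a minimal sketch looks like this (the example strings are illustrative):

    from sonatoki.Preprocessors import Emoji, ZeroWidths

    Emoji.process("toki 👋🏽 pona")    # emoji, including multi-codepoint sequences, are removed
    ZeroWidths.process("to\u200dki")  # the zero width joiner is stripped, leaving "toki"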
sonatoki/Tokenizers.py CHANGED
@@ -10,7 +10,7 @@ from typing_extensions import override, deprecated
 # LOCAL
 from sonatoki.utils import regex_escape
 from sonatoki.Filters import NimiUCSUR  # seriously this sucks
-from sonatoki.constants import ALL_PUNCT, SENTENCE_PUNCT, ALL_PUNCT_RANGES
+from sonatoki.constants import ALL_PUNCT, SENTENCE_PUNCT, ALL_PUNCT_RANGES_STR
 
 regex.DEFAULT_VERSION = regex.VERSION1
 
@@ -66,7 +66,7 @@ class WordTokenizer(SetTokenizer):
         last_membership = s[0] in cls.delimiters
         for i, char in enumerate(s):
             mem = char in cls.delimiters
-            ucsur = NimiUCSUR.filter(char)  # always "changed" means
+            ucsur = NimiUCSUR.filter(char)
             changed = (mem != last_membership) or ucsur
             # this keeps contiguous words together, but splits UCSUR
             if not changed:
@@ -94,7 +94,7 @@ class WordTokenizer(SetTokenizer):
     "WordTokenizerRe is a previous reference implementation. Its behavior has diverged from WordTokenizer and it may not be restored."
 )
 class WordTokenizerRe(RegexTokenizer):
-    pattern = re.compile(rf"""([{ALL_PUNCT_RANGES}]+|\s+)""")
+    pattern = re.compile(rf"""([{ALL_PUNCT_RANGES_STR}]+|\s+)""")
 
 
 @deprecated(
sonatoki/__main__.py CHANGED
@@ -1,9 +1,182 @@
 #!/bin/env python3
+# STL
+import os
+import json
+import argparse
+from typing import Any, Set, Dict, List
 
+# PDM
+import emoji
+import requests
 
-def open():
-    pass
+# LOCAL
+from sonatoki.utils import find_unicode_ranges
+from sonatoki.Filters import (
+    Or,
+    LongSyllabic,
+    NimiLinkuCore,
+    LongAlphabetic,
+    NimiLinkuCommon,
+    NimiLinkuObscure,
+    NimiLinkuUncommon,
+)
+from sonatoki.Cleaners import ConsecutiveDuplicates
+from sonatoki.constants import (
+    UCSUR_PUNCT_RANGES,
+    UNICODE_PUNCT_RANGES,
+    EMOJI_VARIATION_SELECTOR_RANGES,
+)
+
+HERE = os.path.dirname(os.path.realpath(__file__))
+
+UNICODE_DATA = "https://unicode.org/Public/UNIDATA/UnicodeData.txt"
+
+LINKU_WORDS = "https://api.linku.la/v1/words?lang=en"
+LINKU_SANDBOX = "https://api.linku.la/v1/sandbox?lang=en"
+
+WORDS_10K = "https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt"
+WORDS_25K = "https://raw.githubusercontent.com/dolph/dictionary/master/popular.txt"
+WORDS_479K = (
+    "https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt"
+)
+
+HEADERS = {  # pretend to be Chrome 121, just in case
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.3"
+}
+
+
+def download(url: str) -> str:
+    if not url.startswith("https://"):
+        raise ValueError(url)
+
+    resp = requests.get(url, timeout=5, headers=HEADERS)
+    return resp.text
+
+
+def download_json(url: str) -> Dict[str, Any]:
+    resp = download(url)
+    return json.loads(resp)
+
+
+def regen_linku_data():
+    data = download_json(LINKU_WORDS)
+    with open(os.path.join(HERE, "linku.json"), "w") as f:
+        _ = f.write(json.dumps(data))
+
+    data = download_json(LINKU_SANDBOX)
+    with open(os.path.join(HERE, "sandbox.json"), "w") as f:
+        _ = f.write(json.dumps(data))
+
+
+def regen_false_negatives():
+    # TODO: regen from my frequency data where the score is below 0.8?
+    KnownWords = Or(
+        NimiLinkuCore,
+        NimiLinkuCommon,
+        NimiLinkuUncommon,
+        NimiLinkuObscure,
+    )
+
+    syllabic_matches: Set[str] = set()
+    alphabetic_matches: Set[str] = set()
+    data = download(WORDS_25K)
+    for word in data.splitlines():
+        if not word:
+            continue
+        word = ConsecutiveDuplicates.clean(word)
+
+        if KnownWords.filter(word):
+            # ignore dictionary
+            continue
+        if LongSyllabic.filter(word):
+            syllabic_matches.add(word)
+            continue
+        if LongAlphabetic.filter(word):
+            alphabetic_matches.add(word)
+            continue
+
+    # TODO: include short matches or no?
+    with open(os.path.join(HERE, "syllabic.txt"), "w") as f:
+        syllabic_final = sorted([word + "\n" for word in syllabic_matches])
+        f.writelines(syllabic_final)
+
+    with open(os.path.join(HERE, "alphabetic.txt"), "w") as f:
+        alphabetic_final = sorted([word + "\n" for word in alphabetic_matches])
+        f.writelines(alphabetic_final)
+
+
+def regen_unicode_data():
+    PUNCT_CATEGORIES = {
+        # Punctuation
+        "Pc",  # Connector
+        "Pd",  # Dash
+        "Pe",  # Close (end)
+        "Pf",  # Final
+        "Pi",  # Initial
+        "Po",  # Other
+        "Ps",  # Open (sOpen)
+        # Symbol
+        "Sm",  # Math
+        "Sk",  # Modifier (kModifier)
+        "Sc",  # Currency
+        "So",  # Other
+    }
+    r"""These characters are in Symbol other (So) but are not in
+    `\p{Punctuation}` However, I began excluding them again, because it turns
+    out that some sequences of latin alphabet emoji."""
+
+    # NOTE: There are many characters which look like writing characters but are in the punctuation character class. Examples:
+    # - kangxi radicals from ⺀ to ⿕ which are for demonstration, not writing
+    # - parenthesized hangul letters and syllables from ㈀ to ㈜
+    # - circled katakana from ㋐ to ㋾
+    # the latter two shouldn't be in `\p{Punctuation}` if the latin alphabet isn't... oof
+
+    def is_punctuation(data: List[str]):
+        return data[2] in PUNCT_CATEGORIES
+
+    def get_character(data: List[str]):
+        return chr(int(data[0], 16))
+
+    unicode_data = download(UNICODE_DATA)
+    unicode_punctuation = ""
+    for line in unicode_data.split("\n"):
+        if not line:  # damn you, trailing newline
+            continue
+        # NOTE: UnicodeData.txt lists a range if there are many consecutive similar characters
+        # (e.g. CJK Ideograph, First at 4E00 and CJK Ideograph, Last at 9FFF).
+        # This does not apply to any currently defined punctuation category.
+
+        unicode_data = line.split(";")
+        if not is_punctuation(unicode_data):
+            continue
+
+        char = get_character(unicode_data)
+
+        unicode_punctuation += char
+
+    unicode_punctuation = emoji.replace_emoji(unicode_punctuation)
+
+    unicode_ranges = find_unicode_ranges(unicode_punctuation)
+    unicode_ranges.extend(UCSUR_PUNCT_RANGES)
+    # unicode_ranges.extend(EMOJI_VARIATION_SELECTOR_RANGES)  # made unnecessary by emoji library
+    unicode_ranges = sorted(unicode_ranges)
+    # sorted in case my manual additions are out of order
+
+    if unicode_ranges != UNICODE_PUNCT_RANGES:
+        output = json.dumps(unicode_ranges, indent=4, ensure_ascii=True)
+        print(output)
+
+
+def main(argv: argparse.Namespace):
+    regen_unicode_data()
+    regen_linku_data()
+    regen_false_negatives()
 
 
 if __name__ == "__main__":
-    open()
+    """Helper script to fetch UNICODE_PUNCT in constants.py."""
+    parser = argparse.ArgumentParser()
+
+    # TODO: choice between regen unicode data, regen linku, regen english phonomatches
+    argv = parser.parse_args()
+    main(argv)
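
The rewritten __main__ is a maintenance helper rather than a public entry point: run as `python -m sonatoki` it takes no CLI options yet (see the TODO above) and performs all three regeneration steps. A hedged sketch of the equivalent programmatic calls, using only the functions shown in this diff:

    from sonatoki import __main__ as regen

    regen.regen_unicode_data()      # prints updated UNICODE_PUNCT_RANGES as JSON if they changed
    regen.regen_linku_data()        # rewrites linku.json and sandbox.json beside the module
    regen.regen_false_negatives()   # rewrites syllabic.txt and alphabetic.txt from the 25k word list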