unicodedata-reader 0.1.6__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {unicodedata-reader-0.1.6 → unicodedata_reader-0.2.0}/PKG-INFO +20 -5
- {unicodedata-reader-0.1.6 → unicodedata_reader-0.2.0}/README.md +14 -2
- {unicodedata-reader-0.1.6 → unicodedata_reader-0.2.0}/pyproject.toml +7 -7
- {unicodedata-reader-0.1.6 → unicodedata_reader-0.2.0}/unicodedata_reader/__init__.py +1 -0
- {unicodedata-reader-0.1.6 → unicodedata_reader-0.2.0}/unicodedata_reader/__main__.py +2 -0
- {unicodedata-reader-0.1.6 → unicodedata_reader-0.2.0}/unicodedata_reader/cli.py +16 -10
- {unicodedata-reader-0.1.6 → unicodedata_reader-0.2.0}/unicodedata_reader/compressor.py +3 -2
- unicodedata_reader-0.2.0/unicodedata_reader/east_asian_width.py +30 -0
- unicodedata_reader-0.2.0/unicodedata_reader/emoji.py +38 -0
- {unicodedata-reader-0.1.6 → unicodedata_reader-0.2.0}/unicodedata_reader/entry.py +29 -13
- {unicodedata-reader-0.1.6 → unicodedata_reader-0.2.0}/unicodedata_reader/general_category.py +1 -0
- {unicodedata-reader-0.1.6 → unicodedata_reader-0.2.0}/unicodedata_reader/line_break.py +1 -0
- {unicodedata-reader-0.1.6 → unicodedata_reader-0.2.0}/unicodedata_reader/reader.py +5 -0
- unicodedata_reader-0.2.0/unicodedata_reader/set.py +72 -0
- {unicodedata-reader-0.1.6 → unicodedata_reader-0.2.0}/unicodedata_reader/vertical_orientation.py +1 -0
- unicodedata-reader-0.1.6/setup.py +0 -34
- unicodedata-reader-0.1.6/unicodedata_reader/emoji.py +0 -21
- {unicodedata-reader-0.1.6 → unicodedata_reader-0.2.0}/LICENSE +0 -0
- {unicodedata-reader-0.1.6 → unicodedata_reader-0.2.0}/unicodedata_reader/bidi_brackets.py +0 -0
|
@@ -1,17 +1,20 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: unicodedata-reader
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary:
|
|
5
5
|
Home-page: https://github.com/kojiishi/unicodedata-reader
|
|
6
6
|
License: Apache-2.0
|
|
7
7
|
Author: Koji Ishii
|
|
8
8
|
Author-email: kojii@chromium.org
|
|
9
|
-
Requires-Python: >=3.8
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
10
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
11
|
Classifier: Programming Language :: Python :: 3
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.8
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
-
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Requires-Dist: platformdirs (>=2.2,<5.0)
|
|
15
18
|
Project-URL: Repository, https://github.com/kojiishi/unicodedata-reader
|
|
16
19
|
Description-Content-Type: text/markdown
|
|
17
20
|
|
|
@@ -36,6 +39,7 @@ that can read the property values of the [Unicode Character Database]
|
|
|
36
39
|
in browsers.
|
|
37
40
|
Please see the [JavaScript] section below.
|
|
38
41
|
|
|
42
|
+
[General_Category property]: http://unicode.org/reports/tr44/#General_Category
|
|
39
43
|
[Line_Break property]: http://unicode.org/reports/tr44/#Line_Break
|
|
40
44
|
[Unicode Character Database]: https://unicode.org/reports/tr44/
|
|
41
45
|
[unicodedata]: https://docs.python.org/3/library/unicodedata.html
|
|
@@ -79,10 +83,21 @@ The [`UnicodeDataCompressor` class] in this package
|
|
|
79
83
|
can generate JavaScript functions that can read the property values
|
|
80
84
|
of the [Unicode Character Database] in browsers.
|
|
81
85
|
|
|
82
|
-
|
|
83
|
-
|
|
86
|
+
Following examples are available in the "`js`" directory:
|
|
87
|
+
* [GeneralCategory.js] is a generated JavaScript file
|
|
88
|
+
for the Unicode [General_Category property].
|
|
89
|
+
* [LineBreak.js] is a generated JavaScript file
|
|
90
|
+
for the Unicode [Line_Break property].
|
|
91
|
+
* [LineBreak.html] for an example usage of [LineBreak.js].
|
|
92
|
+
|
|
93
|
+
The following command generates a JavaScript file for the [Line_Break property]
|
|
94
|
+
using `js/template.js` as the template file:
|
|
95
|
+
```sh
|
|
96
|
+
unicodedata-reader lb -t js/template.js
|
|
97
|
+
```
|
|
84
98
|
|
|
85
99
|
[`UnicodeDataCompressor` class]: https://github.com/kojiishi/unicodedata-reader/blob/main/unicodedata_reader/compressor.py
|
|
100
|
+
[GeneralCategory.js]: https://github.com/kojiishi/unicodedata-reader/blob/main/js/GeneralCategory.js
|
|
86
101
|
[LineBreak.html]: https://github.com/kojiishi/unicodedata-reader/blob/main/js/LineBreak.html
|
|
87
102
|
[LineBreak.js]: https://github.com/kojiishi/unicodedata-reader/blob/main/js/LineBreak.js
|
|
88
103
|
|
|
@@ -19,6 +19,7 @@ that can read the property values of the [Unicode Character Database]
|
|
|
19
19
|
in browsers.
|
|
20
20
|
Please see the [JavaScript] section below.
|
|
21
21
|
|
|
22
|
+
[General_Category property]: http://unicode.org/reports/tr44/#General_Category
|
|
22
23
|
[Line_Break property]: http://unicode.org/reports/tr44/#Line_Break
|
|
23
24
|
[Unicode Character Database]: https://unicode.org/reports/tr44/
|
|
24
25
|
[unicodedata]: https://docs.python.org/3/library/unicodedata.html
|
|
@@ -62,9 +63,20 @@ The [`UnicodeDataCompressor` class] in this package
|
|
|
62
63
|
can generate JavaScript functions that can read the property values
|
|
63
64
|
of the [Unicode Character Database] in browsers.
|
|
64
65
|
|
|
65
|
-
|
|
66
|
-
|
|
66
|
+
Following examples are available in the "`js`" directory:
|
|
67
|
+
* [GeneralCategory.js] is a generated JavaScript file
|
|
68
|
+
for the Unicode [General_Category property].
|
|
69
|
+
* [LineBreak.js] is a generated JavaScript file
|
|
70
|
+
for the Unicode [Line_Break property].
|
|
71
|
+
* [LineBreak.html] for an example usage of [LineBreak.js].
|
|
72
|
+
|
|
73
|
+
The following command generates a JavaScript file for the [Line_Break property]
|
|
74
|
+
using `js/template.js` as the template file:
|
|
75
|
+
```sh
|
|
76
|
+
unicodedata-reader lb -t js/template.js
|
|
77
|
+
```
|
|
67
78
|
|
|
68
79
|
[`UnicodeDataCompressor` class]: https://github.com/kojiishi/unicodedata-reader/blob/main/unicodedata_reader/compressor.py
|
|
80
|
+
[GeneralCategory.js]: https://github.com/kojiishi/unicodedata-reader/blob/main/js/GeneralCategory.js
|
|
69
81
|
[LineBreak.html]: https://github.com/kojiishi/unicodedata-reader/blob/main/js/LineBreak.html
|
|
70
82
|
[LineBreak.js]: https://github.com/kojiishi/unicodedata-reader/blob/main/js/LineBreak.js
|
|
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|
|
4
4
|
|
|
5
5
|
[tool.poetry]
|
|
6
6
|
name = "unicodedata-reader"
|
|
7
|
-
version = "0.
|
|
7
|
+
version = "0.2.0"
|
|
8
8
|
description = ""
|
|
9
9
|
authors = ["Koji Ishii <kojii@chromium.org>"]
|
|
10
10
|
readme = "README.md"
|
|
@@ -12,14 +12,14 @@ repository = "https://github.com/kojiishi/unicodedata-reader"
|
|
|
12
12
|
license = "Apache-2.0"
|
|
13
13
|
|
|
14
14
|
[tool.poetry.dependencies]
|
|
15
|
-
python = ">=3.8
|
|
16
|
-
platformdirs = "
|
|
15
|
+
python = ">=3.8"
|
|
16
|
+
platformdirs = ">=2.2,<5.0"
|
|
17
17
|
|
|
18
18
|
[tool.poetry.dev-dependencies]
|
|
19
|
-
pytest = "
|
|
20
|
-
pytype = "*"
|
|
21
|
-
tox = "^
|
|
22
|
-
yapf = "^0.
|
|
19
|
+
pytest = "*"
|
|
20
|
+
pytype = {version = "*", python = "<3.10"}
|
|
21
|
+
tox = "^4.14.2"
|
|
22
|
+
yapf = "^0.40.2"
|
|
23
23
|
|
|
24
24
|
[tool.poetry.scripts]
|
|
25
25
|
unicodedata-reader = 'unicodedata_reader.__main__:main'
|
|
@@ -2,6 +2,7 @@ import pathlib
|
|
|
2
2
|
import sys
|
|
3
3
|
|
|
4
4
|
import unicodedata_reader.bidi_brackets as bidi_brackets
|
|
5
|
+
import unicodedata_reader.east_asian_width as ea
|
|
5
6
|
import unicodedata_reader.emoji as emoji
|
|
6
7
|
import unicodedata_reader.general_category as gc
|
|
7
8
|
import unicodedata_reader.line_break as lb
|
|
@@ -12,6 +13,7 @@ def main():
|
|
|
12
13
|
args = sys.argv
|
|
13
14
|
sub_commands = {
|
|
14
15
|
'bidi': lambda: bidi_brackets.dump_bidi_brackets(),
|
|
16
|
+
'ea': lambda: ea.UnicodeEastAsianWidthDataCli().main(),
|
|
15
17
|
'emoji': lambda: emoji.UnicodeEmojiDataCli().main(),
|
|
16
18
|
'gc': lambda: gc.UnicodeGeneralCategoryDataCli().main(),
|
|
17
19
|
'lb': lambda: lb.UnicodeLineBreakDataCli().main(),
|
|
@@ -73,14 +73,15 @@ def _init_logging(verbose):
|
|
|
73
73
|
|
|
74
74
|
|
|
75
75
|
class UnicodeDataCli(object):
|
|
76
|
+
|
|
76
77
|
def __init__(self):
|
|
77
|
-
self.
|
|
78
|
+
self._parse_args()
|
|
78
79
|
|
|
79
80
|
def _columns(self) -> Dict[str, Callable[[int, str], Any]]:
|
|
80
81
|
columns = self._core_columns()
|
|
81
82
|
columns = dict(
|
|
82
83
|
itertools.chain({
|
|
83
|
-
'Code': lambda code, ch: u_hex(code),
|
|
84
|
+
'Code': lambda code, ch: 'U' + u_hex(code),
|
|
84
85
|
'Char': lambda code, ch: u_printable_chr(ch),
|
|
85
86
|
}.items(), columns.items(), {
|
|
86
87
|
'Name': lambda code, ch: u_name_or_empty(ch),
|
|
@@ -119,17 +120,22 @@ class UnicodeDataCli(object):
|
|
|
119
120
|
compressor = UnicodeDataCompressor(entries)
|
|
120
121
|
compressor.substitute_template(template, name=self.name, output=output)
|
|
121
122
|
|
|
122
|
-
def
|
|
123
|
+
def _parse_args(self):
|
|
123
124
|
parser = argparse.ArgumentParser()
|
|
124
|
-
parser.add_argument('text',
|
|
125
|
+
parser.add_argument('text',
|
|
126
|
+
nargs='*',
|
|
127
|
+
help='show properties for the text')
|
|
125
128
|
parser.add_argument('-f', '--no-cache', action='store_true')
|
|
126
|
-
parser.add_argument('
|
|
127
|
-
parser.add_argument('-t',
|
|
129
|
+
parser.add_argument('--name', help='$NAME in the template')
|
|
130
|
+
parser.add_argument('-t',
|
|
131
|
+
'--template',
|
|
132
|
+
type=pathlib.Path,
|
|
133
|
+
help='generate a file from the template')
|
|
128
134
|
parser.add_argument('-o', '--output', type=pathlib.Path)
|
|
129
|
-
parser.add_argument(
|
|
130
|
-
|
|
131
|
-
help=
|
|
132
|
-
action=
|
|
135
|
+
parser.add_argument('-v',
|
|
136
|
+
'--verbose',
|
|
137
|
+
help='increase output verbosity',
|
|
138
|
+
action='count',
|
|
133
139
|
default=0)
|
|
134
140
|
parser.parse_args(namespace=self)
|
|
135
141
|
_init_logging(self.verbose) # pytype: disable=attribute-error
|
|
@@ -22,6 +22,7 @@ def _init_logging(verbose: int):
|
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class UnicodeDataCompressor(object):
|
|
25
|
+
|
|
25
26
|
def __init__(self, entries: UnicodeDataEntries):
|
|
26
27
|
self._entries = entries
|
|
27
28
|
|
|
@@ -82,8 +83,8 @@ class UnicodeDataCompressor(object):
|
|
|
82
83
|
len(bytes), len(base64bytes), len(values_for_int),
|
|
83
84
|
value_bits)
|
|
84
85
|
mapping = {
|
|
85
|
-
'
|
|
86
|
-
'
|
|
86
|
+
'NAME': name,
|
|
87
|
+
'BASE64BYTES': base64bytes.decode('ascii'),
|
|
87
88
|
'VALUE_BITS': str(value_bits),
|
|
88
89
|
'VALUE_MASK': str((1 << value_bits) - 1),
|
|
89
90
|
'VALUE_LIST': ','.join(f'"{v}"' for v in values_for_int),
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import unicodedata
|
|
3
|
+
from typing import Any
|
|
4
|
+
from typing import Callable
|
|
5
|
+
from typing import Dict
|
|
6
|
+
|
|
7
|
+
from unicodedata_reader import *
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class UnicodeEastAsianWidthDataCli(UnicodeDataCli):
|
|
11
|
+
|
|
12
|
+
def __init__(self):
|
|
13
|
+
super().__init__()
|
|
14
|
+
self._entries = UnicodeDataReader.default.east_asian_width()
|
|
15
|
+
|
|
16
|
+
def _core_columns(self) -> Dict[str, Callable[[int, str], Any]]:
|
|
17
|
+
return {
|
|
18
|
+
'EA': lambda code, ch: self._entries.value(code),
|
|
19
|
+
'GC': lambda code, ch: unicodedata.category(ch),
|
|
20
|
+
'EAW': lambda code, ch: unicodedata.east_asian_width(ch),
|
|
21
|
+
'cp932': lambda code, ch: u_enc(ch, 'cp932'),
|
|
22
|
+
'sjis04': lambda code, ch: u_enc(ch, 'sjis_2004'),
|
|
23
|
+
'cp936': lambda code, ch: u_enc(ch, 'cp936'),
|
|
24
|
+
'cp949': lambda code, ch: u_enc(ch, 'cp949'),
|
|
25
|
+
'cp950': lambda code, ch: u_enc(ch, 'cp950'),
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
if __name__ == '__main__':
|
|
30
|
+
UnicodeEastAsianWidthDataCli().main()
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from typing import Any
|
|
3
|
+
from typing import Callable
|
|
4
|
+
from typing import Dict
|
|
5
|
+
|
|
6
|
+
from unicodedata_reader import *
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class UnicodeEmojiDataCli(UnicodeDataCli):
|
|
10
|
+
|
|
11
|
+
def __init__(self):
|
|
12
|
+
super().__init__()
|
|
13
|
+
self._entries = UnicodeDataReader.default.emoji()
|
|
14
|
+
|
|
15
|
+
def _emoji_flag_func(self, mask: EmojiType):
|
|
16
|
+
return lambda code, ch: 1 if self._entries.value(code) & mask else 0
|
|
17
|
+
|
|
18
|
+
def _core_columns(self) -> Dict[str, Callable[[int, str], Any]]:
|
|
19
|
+
return {
|
|
20
|
+
'Emoji':
|
|
21
|
+
self._emoji_flag_func(EmojiType.Emoji),
|
|
22
|
+
'Emoji_Presentation':
|
|
23
|
+
self._emoji_flag_func(EmojiType.Emoji_Presentation),
|
|
24
|
+
'Emoji_Modifier':
|
|
25
|
+
self._emoji_flag_func(EmojiType.Emoji_Modifier),
|
|
26
|
+
'Emoji_Modifier_Base':
|
|
27
|
+
self._emoji_flag_func(EmojiType.Emoji_Modifier_Base),
|
|
28
|
+
'Emoji_Component':
|
|
29
|
+
self._emoji_flag_func(EmojiType.Emoji_Component),
|
|
30
|
+
'Extended_Pictographic':
|
|
31
|
+
self._emoji_flag_func(EmojiType.Extended_Pictographic),
|
|
32
|
+
'EmojiCombined':
|
|
33
|
+
lambda code, ch: self._entries.value(code),
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
if __name__ == '__main__':
|
|
38
|
+
UnicodeEmojiDataCli().main()
|
|
@@ -27,6 +27,7 @@ def u_enc(c, encoding):
|
|
|
27
27
|
|
|
28
28
|
|
|
29
29
|
class BidiBrackets(object):
|
|
30
|
+
|
|
30
31
|
def __init__(self, pair: int, type: str):
|
|
31
32
|
self.pair = pair
|
|
32
33
|
self.type = type
|
|
@@ -55,6 +56,9 @@ class UnicodeDataEntry(object):
|
|
|
55
56
|
|
|
56
57
|
[Unicode character database]: https://unicode.org/reports/tr44/
|
|
57
58
|
"""
|
|
59
|
+
|
|
60
|
+
max_code_point = 0x10FFFF
|
|
61
|
+
|
|
58
62
|
def __init__(self, min: int, max: int, value):
|
|
59
63
|
self.min = min
|
|
60
64
|
self.max = max
|
|
@@ -134,19 +138,22 @@ class UnicodeDataEntry(object):
|
|
|
134
138
|
min = -1
|
|
135
139
|
last_code = -1
|
|
136
140
|
for code, value in values:
|
|
141
|
+
assert code > last_code
|
|
137
142
|
if value == last_value and code == last_code + 1:
|
|
138
143
|
last_code = code
|
|
139
144
|
continue
|
|
140
|
-
if min >= 0
|
|
145
|
+
if min >= 0:
|
|
141
146
|
yield UnicodeDataEntry(min, last_code, last_value)
|
|
142
147
|
last_value = value
|
|
143
148
|
min = last_code = code
|
|
144
|
-
if min >= 0
|
|
149
|
+
if min >= 0:
|
|
145
150
|
yield UnicodeDataEntry(min, code, last_value)
|
|
146
151
|
|
|
147
152
|
@staticmethod
|
|
148
153
|
def from_values(values: Iterable[Any]):
|
|
149
|
-
|
|
154
|
+
pairs = enumerate(values)
|
|
155
|
+
pairs = (p for p in pairs if p[1] is not None)
|
|
156
|
+
return UnicodeDataEntry.from_pairs(pairs)
|
|
150
157
|
|
|
151
158
|
@staticmethod
|
|
152
159
|
def values_for_code(entries, missing_value) -> Iterable[Any]:
|
|
@@ -163,6 +170,7 @@ class UnicodeDataEntries(object):
|
|
|
163
170
|
or a list of `UnicodeDataEntry`.
|
|
164
171
|
[Unicode character database]: https://unicode.org/reports/tr44/
|
|
165
172
|
"""
|
|
173
|
+
|
|
166
174
|
def __init__(self,
|
|
167
175
|
entries: Optional[Union[Iterable[UnicodeDataEntry],
|
|
168
176
|
Sequence[UnicodeDataEntry]]] = None,
|
|
@@ -171,12 +179,15 @@ class UnicodeDataEntries(object):
|
|
|
171
179
|
converter=None):
|
|
172
180
|
self._missing_entries = self._default_missing_entries()
|
|
173
181
|
self.name = name
|
|
182
|
+
self._values_for_int = None # type: list
|
|
183
|
+
|
|
174
184
|
if entries is not None:
|
|
185
|
+
assert lines is None
|
|
186
|
+
assert converter is None
|
|
175
187
|
self._entries = entries
|
|
176
188
|
else:
|
|
177
189
|
assert lines is not None
|
|
178
190
|
self._load_lines(lines, converter=converter)
|
|
179
|
-
self._values_for_int = None # type: list
|
|
180
191
|
|
|
181
192
|
def _default_missing_entries(self) -> List[UnicodeDataEntry]:
|
|
182
193
|
return []
|
|
@@ -193,16 +204,16 @@ class UnicodeDataEntries(object):
|
|
|
193
204
|
self._missing_entries.extend(entries)
|
|
194
205
|
assert self._missing_entries
|
|
195
206
|
|
|
196
|
-
def
|
|
207
|
+
def _ensure_multi_iterable(self):
|
|
197
208
|
if isinstance(self._entries, types.GeneratorType):
|
|
198
209
|
self._entries = tuple(self._entries)
|
|
199
210
|
|
|
200
211
|
def __iter__(self):
|
|
201
|
-
self.
|
|
212
|
+
self._ensure_multi_iterable()
|
|
202
213
|
return self._entries.__iter__()
|
|
203
214
|
|
|
204
215
|
def __len__(self):
|
|
205
|
-
self.
|
|
216
|
+
self._ensure_multi_iterable()
|
|
206
217
|
return len(self._entries)
|
|
207
218
|
|
|
208
219
|
def missing_value(self, code: int):
|
|
@@ -235,12 +246,12 @@ class UnicodeDataEntries(object):
|
|
|
235
246
|
|
|
236
247
|
def unicodes(self) -> Iterable[int]:
|
|
237
248
|
"""Returns a list of Unicode code points defined in this entries."""
|
|
238
|
-
self.
|
|
249
|
+
self._ensure_multi_iterable()
|
|
239
250
|
return itertools.chain(*(e.range() for e in self._entries))
|
|
240
251
|
|
|
241
252
|
def value(self, code: int):
|
|
242
253
|
"""Returns the value for the given code point."""
|
|
243
|
-
self.
|
|
254
|
+
self._ensure_multi_iterable()
|
|
244
255
|
for entry in self._entries:
|
|
245
256
|
if code < entry.min:
|
|
246
257
|
return self.missing_value(code)
|
|
@@ -254,7 +265,7 @@ class UnicodeDataEntries(object):
|
|
|
254
265
|
The list includes missing values,
|
|
255
266
|
so that `tuple(values_for_code())[code]` is equal to `value(code)`.
|
|
256
267
|
"""
|
|
257
|
-
self.
|
|
268
|
+
self._ensure_multi_iterable()
|
|
258
269
|
return UnicodeDataEntry.values_for_code(self._entries,
|
|
259
270
|
self.missing_value)
|
|
260
271
|
|
|
@@ -278,8 +289,8 @@ class UnicodeDataEntries(object):
|
|
|
278
289
|
|
|
279
290
|
On return, the original values are stored in `self.value_list`.
|
|
280
291
|
"""
|
|
281
|
-
assert self.
|
|
282
|
-
self.
|
|
292
|
+
assert self._values_for_int is None
|
|
293
|
+
self._ensure_multi_iterable()
|
|
283
294
|
value_map = {}
|
|
284
295
|
for entry in self._entries:
|
|
285
296
|
assert not isinstance(entry.value, int)
|
|
@@ -295,7 +306,7 @@ class UnicodeDataEntries(object):
|
|
|
295
306
|
|
|
296
307
|
def to_dict(self) -> Dict[int, Any]:
|
|
297
308
|
"""Returns a `dict` of values with a Unicode code point as the key."""
|
|
298
|
-
self.
|
|
309
|
+
self._ensure_multi_iterable()
|
|
299
310
|
dict = {}
|
|
300
311
|
for entry in self._entries:
|
|
301
312
|
for code in entry.range():
|
|
@@ -304,12 +315,14 @@ class UnicodeDataEntries(object):
|
|
|
304
315
|
|
|
305
316
|
|
|
306
317
|
class UnicodeBidiBracketsDataEntries(UnicodeDataEntries):
|
|
318
|
+
|
|
307
319
|
def _load_lines(self, lines: Iterable[str], converter=None):
|
|
308
320
|
converter = converter or BidiBrackets.from_values
|
|
309
321
|
super()._load_lines(lines, converter=converter)
|
|
310
322
|
|
|
311
323
|
|
|
312
324
|
class UnicodeEmojiDataEntries(UnicodeDataEntries):
|
|
325
|
+
|
|
313
326
|
def _load_lines(self, lines: Iterable[str], converter=None):
|
|
314
327
|
converter = converter or (lambda v: EmojiType[v])
|
|
315
328
|
super()._load_lines(lines, converter=converter)
|
|
@@ -334,6 +347,7 @@ class UnicodeEmojiDataEntries(UnicodeDataEntries):
|
|
|
334
347
|
|
|
335
348
|
|
|
336
349
|
class UnicodeLineBreakDataEntries(UnicodeDataEntries):
|
|
350
|
+
|
|
337
351
|
def _load_comment(self, comment: str, start_index: int):
|
|
338
352
|
# Load missing value entries. See the comments in:
|
|
339
353
|
# https://www.unicode.org/Public/UNIDATA/LineBreak.txt
|
|
@@ -356,12 +370,14 @@ class UnicodeLineBreakDataEntries(UnicodeDataEntries):
|
|
|
356
370
|
|
|
357
371
|
|
|
358
372
|
class UnicodeScriptExtensionsDataEntries(UnicodeDataEntries):
|
|
373
|
+
|
|
359
374
|
def _load_lines(self, lines: Iterable[str], converter=None):
|
|
360
375
|
converter = converter or (lambda v: v.split())
|
|
361
376
|
super()._load_lines(lines, converter=converter)
|
|
362
377
|
|
|
363
378
|
|
|
364
379
|
class UnicodeVerticalOrientationDataEntries(UnicodeDataEntries):
|
|
380
|
+
|
|
365
381
|
def _load_comment(self, comment: str, start_index: int):
|
|
366
382
|
# Load missing value entries. See the comments in:
|
|
367
383
|
# https://www.unicode.org/Public/UNIDATA/VerticalOrientation.txt
|
|
@@ -33,6 +33,11 @@ class UnicodeDataReader(object):
|
|
|
33
33
|
lines = self.read_lines(name)
|
|
34
34
|
return UnicodeDataEntries(name=name, lines=lines)
|
|
35
35
|
|
|
36
|
+
def east_asian_width(self) -> UnicodeDataEntries:
|
|
37
|
+
name = 'EastAsianWidth'
|
|
38
|
+
lines = self.read_lines(name)
|
|
39
|
+
return UnicodeDataEntries(name=name, lines=lines)
|
|
40
|
+
|
|
36
41
|
def emoji(self) -> UnicodeDataEntries:
|
|
37
42
|
lines = self.read_lines('emoji/emoji-data')
|
|
38
43
|
return UnicodeEmojiDataEntries(name='Emoji', lines=lines)
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from typing import Callable
|
|
2
|
+
|
|
3
|
+
from unicodedata_reader.entry import *
|
|
4
|
+
from unicodedata_reader.reader import *
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Set(object):
|
|
8
|
+
"""A simple set of Unicode code points."""
|
|
9
|
+
|
|
10
|
+
def __init__(self) -> None:
|
|
11
|
+
self.set = set() # type: set[int]
|
|
12
|
+
|
|
13
|
+
def __contains__(self, code_point: int) -> bool:
|
|
14
|
+
return code_point in self.set
|
|
15
|
+
|
|
16
|
+
def __iter__(self) -> Iterable[int]:
|
|
17
|
+
return self.set.__iter__()
|
|
18
|
+
|
|
19
|
+
def __isub__(self, other: 'Set') -> None:
|
|
20
|
+
self.set -= other.set
|
|
21
|
+
|
|
22
|
+
def __iand__(self, other: 'Set') -> None:
|
|
23
|
+
self.set &= other.set
|
|
24
|
+
|
|
25
|
+
def __ior__(self, other: 'Set') -> None:
|
|
26
|
+
self.set |= other.set
|
|
27
|
+
|
|
28
|
+
def add(self, code: int) -> None:
|
|
29
|
+
self.set.add(code)
|
|
30
|
+
|
|
31
|
+
def remove(self, code: int) -> None:
|
|
32
|
+
self.set.discard(code)
|
|
33
|
+
|
|
34
|
+
def add_entries(self, entries: UnicodeDataEntries, pred: Callable[[Any],
|
|
35
|
+
bool]):
|
|
36
|
+
for entry in entries:
|
|
37
|
+
if pred(entry.value):
|
|
38
|
+
for code in entry.range():
|
|
39
|
+
self.set.add(code)
|
|
40
|
+
|
|
41
|
+
@staticmethod
|
|
42
|
+
def east_asian_width(
|
|
43
|
+
value: str,
|
|
44
|
+
reader: UnicodeDataReader = UnicodeDataReader.default) -> 'Set':
|
|
45
|
+
set = Set()
|
|
46
|
+
set.add_entries(reader.east_asian_width(), lambda v: v == value)
|
|
47
|
+
return set
|
|
48
|
+
|
|
49
|
+
@staticmethod
|
|
50
|
+
def general_category(
|
|
51
|
+
value: str,
|
|
52
|
+
reader: UnicodeDataReader = UnicodeDataReader.default) -> 'Set':
|
|
53
|
+
set = Set()
|
|
54
|
+
set.add_entries(reader.general_category(),
|
|
55
|
+
lambda v: v.startswith(value))
|
|
56
|
+
return set
|
|
57
|
+
|
|
58
|
+
@staticmethod
|
|
59
|
+
def scripts(
|
|
60
|
+
value: str,
|
|
61
|
+
reader: UnicodeDataReader = UnicodeDataReader.default) -> 'Set':
|
|
62
|
+
set = Set()
|
|
63
|
+
set.add_entries(reader.scripts(), lambda v: v == value)
|
|
64
|
+
return set
|
|
65
|
+
|
|
66
|
+
@staticmethod
|
|
67
|
+
def script_extensions(
|
|
68
|
+
value: str,
|
|
69
|
+
reader: UnicodeDataReader = UnicodeDataReader.default) -> 'Set':
|
|
70
|
+
set = Set()
|
|
71
|
+
set.add_entries(reader.script_extensions(), lambda v: value in v)
|
|
72
|
+
return set
|
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
from setuptools import setup
|
|
3
|
-
|
|
4
|
-
packages = \
|
|
5
|
-
['unicodedata_reader']
|
|
6
|
-
|
|
7
|
-
package_data = \
|
|
8
|
-
{'': ['*']}
|
|
9
|
-
|
|
10
|
-
install_requires = \
|
|
11
|
-
['platformdirs>=2.2.0,<3.0.0']
|
|
12
|
-
|
|
13
|
-
entry_points = \
|
|
14
|
-
{'console_scripts': ['unicodedata-reader = unicodedata_reader.__main__:main']}
|
|
15
|
-
|
|
16
|
-
setup_kwargs = {
|
|
17
|
-
'name': 'unicodedata-reader',
|
|
18
|
-
'version': '0.1.6',
|
|
19
|
-
'description': '',
|
|
20
|
-
'long_description': '[](https://github.com/kojiishi/unicodedata-reader/actions/workflows/ci.yml)\n[](https://pypi.org/project/unicodedata-reader/)\n[](https://github.com/kojiishi/unicodedata-reader/network/updates)\n\n\n# unicodedata-reader\n\nThis package reads and parses the [Unicode Character Database] files.\n\nMany of them are available in the [unicodedata] module,\nor in other 3rd party modules.\nWhen the desired data is not in any existing modules,\nsuch as the [Line_Break property] or the [Vertical_Orientation property],\nthis package can read the data files\nat <https://www.unicode.org/Public/UNIDATA/>.\n\nThis package can also generate JavaScript functions\nthat can read the property values of the [Unicode Character Database]\nin browsers.\nPlease see the [JavaScript] section below.\n\n[Line_Break property]: http://unicode.org/reports/tr44/#Line_Break\n[Unicode Character Database]: https://unicode.org/reports/tr44/\n[unicodedata]: https://docs.python.org/3/library/unicodedata.html\n[Vertical_Orientation property]: http://unicode.org/reports/tr44/#Vertical_Orientation\n\n## Install\n\n```sh\npip install unicodedata-reader\n```\nIf you want to clone and install using [poetry]:\n```sh\ngit clone https://github.com/kojiishi/unicodedata-reader\ncd unicodedata-reader\npoetry install\npoetry shell\n```\n\n[poetry]: https://github.com/python-poetry/poetry\n\n\n## Python\n\n```python\nimport unicodedata_reader\n\nreader = unicodedata_reader.UnicodeDataReader.default\nlb = reader.line_break()\nprint(lb.value(0x41))\n```\nThe example above prints `AL`,\nthe [Line_Break property] value for U+0041.\nPlease also see [line_break_test.py] for more usages.\n\n[line_break_test.py]: https://github.com/kojiishi/unicodedata-reader/blob/main/tests/line_break_test.py\n\n## JavaScript\n[JavaScript]: #javascript\n\nThe [`UnicodeDataCompressor` class] in this package\ncan generate JavaScript functions that can read the property values\nof the [Unicode Character Database] in browsers.\n\nPlease see [LineBreak.js] for an example of the generated functions\nand [LineBreak.html] for an example usage.\n\n[`UnicodeDataCompressor` class]: https://github.com/kojiishi/unicodedata-reader/blob/main/unicodedata_reader/compressor.py\n[LineBreak.html]: https://github.com/kojiishi/unicodedata-reader/blob/main/js/LineBreak.html\n[LineBreak.js]: https://github.com/kojiishi/unicodedata-reader/blob/main/js/LineBreak.js\n',
|
|
21
|
-
'author': 'Koji Ishii',
|
|
22
|
-
'author_email': 'kojii@chromium.org',
|
|
23
|
-
'maintainer': None,
|
|
24
|
-
'maintainer_email': None,
|
|
25
|
-
'url': 'https://github.com/kojiishi/unicodedata-reader',
|
|
26
|
-
'packages': packages,
|
|
27
|
-
'package_data': package_data,
|
|
28
|
-
'install_requires': install_requires,
|
|
29
|
-
'entry_points': entry_points,
|
|
30
|
-
'python_requires': '>=3.8,<3.10',
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
setup(**setup_kwargs)
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
from typing import Any
|
|
3
|
-
from typing import Callable
|
|
4
|
-
from typing import Dict
|
|
5
|
-
|
|
6
|
-
from unicodedata_reader import *
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class UnicodeEmojiDataCli(UnicodeDataCli):
|
|
10
|
-
def __init__(self):
|
|
11
|
-
super().__init__()
|
|
12
|
-
self._entries = UnicodeDataReader.default.emoji()
|
|
13
|
-
|
|
14
|
-
def _core_columns(self) -> Dict[str, Callable[[int, str], Any]]:
|
|
15
|
-
return {
|
|
16
|
-
'Emoji': lambda code, ch: self._entries.value(code),
|
|
17
|
-
}
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
if __name__ == '__main__':
|
|
21
|
-
UnicodeEmojiDataCli().main()
|
|
File without changes
|
|
File without changes
|