unicodedata-reader 0.1.7__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {unicodedata-reader-0.1.7 → unicodedata_reader-1.0.0}/PKG-INFO +5 -3
- {unicodedata-reader-0.1.7 → unicodedata_reader-1.0.0}/pyproject.toml +4 -4
- {unicodedata-reader-0.1.7 → unicodedata_reader-1.0.0}/unicodedata_reader/__init__.py +1 -0
- {unicodedata-reader-0.1.7 → unicodedata_reader-1.0.0}/unicodedata_reader/__main__.py +2 -0
- {unicodedata-reader-0.1.7 → unicodedata_reader-1.0.0}/unicodedata_reader/cli.py +1 -0
- {unicodedata-reader-0.1.7 → unicodedata_reader-1.0.0}/unicodedata_reader/compressor.py +1 -0
- unicodedata_reader-1.0.0/unicodedata_reader/east_asian_width.py +30 -0
- {unicodedata-reader-0.1.7 → unicodedata_reader-1.0.0}/unicodedata_reader/emoji.py +1 -0
- {unicodedata-reader-0.1.7 → unicodedata_reader-1.0.0}/unicodedata_reader/entry.py +10 -0
- {unicodedata-reader-0.1.7 → unicodedata_reader-1.0.0}/unicodedata_reader/general_category.py +1 -0
- {unicodedata-reader-0.1.7 → unicodedata_reader-1.0.0}/unicodedata_reader/line_break.py +1 -0
- {unicodedata-reader-0.1.7 → unicodedata_reader-1.0.0}/unicodedata_reader/reader.py +5 -0
- unicodedata_reader-1.0.0/unicodedata_reader/set.py +72 -0
- {unicodedata-reader-0.1.7 → unicodedata_reader-1.0.0}/unicodedata_reader/vertical_orientation.py +1 -0
- unicodedata-reader-0.1.7/setup.py +0 -34
- {unicodedata-reader-0.1.7 → unicodedata_reader-1.0.0}/LICENSE +0 -0
- {unicodedata-reader-0.1.7 → unicodedata_reader-1.0.0}/README.md +0 -0
- {unicodedata-reader-0.1.7 → unicodedata_reader-1.0.0}/unicodedata_reader/bidi_brackets.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: unicodedata-reader
|
|
3
|
-
Version: 0.1.7
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary:
|
|
5
5
|
Home-page: https://github.com/kojiishi/unicodedata-reader
|
|
6
6
|
License: Apache-2.0
|
|
@@ -9,10 +9,12 @@ Author-email: kojii@chromium.org
|
|
|
9
9
|
Requires-Python: >=3.8
|
|
10
10
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
11
|
Classifier: Programming Language :: Python :: 3
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
13
12
|
Classifier: Programming Language :: Python :: 3.8
|
|
14
13
|
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
-
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Requires-Dist: platformdirs (>=2.2,<5.0)
|
|
16
18
|
Project-URL: Repository, https://github.com/kojiishi/unicodedata-reader
|
|
17
19
|
Description-Content-Type: text/markdown
|
|
18
20
|
|
|
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|
|
4
4
|
|
|
5
5
|
[tool.poetry]
|
|
6
6
|
name = "unicodedata-reader"
|
|
7
|
-
version = "0.1.7"
|
|
7
|
+
version = "1.0.0"
|
|
8
8
|
description = ""
|
|
9
9
|
authors = ["Koji Ishii <kojii@chromium.org>"]
|
|
10
10
|
readme = "README.md"
|
|
@@ -13,13 +13,13 @@ license = "Apache-2.0"
|
|
|
13
13
|
|
|
14
14
|
[tool.poetry.dependencies]
|
|
15
15
|
python = ">=3.8"
|
|
16
|
-
platformdirs = "
|
|
16
|
+
platformdirs = ">=2.2,<5.0"
|
|
17
17
|
|
|
18
18
|
[tool.poetry.dev-dependencies]
|
|
19
19
|
pytest = "*"
|
|
20
20
|
pytype = {version = "*", python = "<3.10"}
|
|
21
|
-
tox = "^
|
|
22
|
-
yapf = "^0.
|
|
21
|
+
tox = "^4.14.2"
|
|
22
|
+
yapf = "^0.40.2"
|
|
23
23
|
|
|
24
24
|
[tool.poetry.scripts]
|
|
25
25
|
unicodedata-reader = 'unicodedata_reader.__main__:main'
|
|
@@ -2,6 +2,7 @@ import pathlib
|
|
|
2
2
|
import sys
|
|
3
3
|
|
|
4
4
|
import unicodedata_reader.bidi_brackets as bidi_brackets
|
|
5
|
+
import unicodedata_reader.east_asian_width as ea
|
|
5
6
|
import unicodedata_reader.emoji as emoji
|
|
6
7
|
import unicodedata_reader.general_category as gc
|
|
7
8
|
import unicodedata_reader.line_break as lb
|
|
@@ -12,6 +13,7 @@ def main():
|
|
|
12
13
|
args = sys.argv
|
|
13
14
|
sub_commands = {
|
|
14
15
|
'bidi': lambda: bidi_brackets.dump_bidi_brackets(),
|
|
16
|
+
'ea': lambda: ea.UnicodeEastAsianWidthDataCli().main(),
|
|
15
17
|
'emoji': lambda: emoji.UnicodeEmojiDataCli().main(),
|
|
16
18
|
'gc': lambda: gc.UnicodeGeneralCategoryDataCli().main(),
|
|
17
19
|
'lb': lambda: lb.UnicodeLineBreakDataCli().main(),
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import unicodedata
|
|
3
|
+
from typing import Any
|
|
4
|
+
from typing import Callable
|
|
5
|
+
from typing import Dict
|
|
6
|
+
|
|
7
|
+
from unicodedata_reader import *
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class UnicodeEastAsianWidthDataCli(UnicodeDataCli):
    """Command line front end for the `EastAsianWidth` data.

    The values are loaded once, at construction time, from
    `UnicodeDataReader.default.east_asian_width()`.
    """

    def __init__(self):
        super().__init__()
        default_reader = UnicodeDataReader.default
        self._entries = default_reader.east_asian_width()

    def _core_columns(self) -> Dict[str, Callable[[int, str], Any]]:
        """Return the columns as a dict of column name to callable.

        Each callable takes `(code, ch)`: the code point as an `int` and
        the same character as a `str`.
        """

        def encoded_with(encoding: str) -> Callable[[int, str], Any]:
            # `u_enc` is defined elsewhere in the package; presumably it
            # encodes `ch` in the given legacy encoding -- TODO confirm.
            return lambda code, ch: u_enc(ch, encoding)

        columns = {
            # Value from the loaded data file.
            'EA': lambda code, ch: self._entries.value(code),
            # Values from Python's own `unicodedata` module.
            'GC': lambda code, ch: unicodedata.category(ch),
            'EAW': lambda code, ch: unicodedata.east_asian_width(ch),
        }
        # Appended after the property columns to keep the column order.
        legacy_encodings = (
            ('cp932', 'cp932'),
            ('sjis04', 'sjis_2004'),
            ('cp936', 'cp936'),
            ('cp949', 'cp949'),
            ('cp950', 'cp950'),
        )
        for column_name, encoding in legacy_encodings:
            columns[column_name] = encoded_with(encoding)
        return columns


if __name__ == '__main__':
    cli = UnicodeEastAsianWidthDataCli()
    cli.main()
|
|
@@ -27,6 +27,7 @@ def u_enc(c, encoding):
|
|
|
27
27
|
|
|
28
28
|
|
|
29
29
|
class BidiBrackets(object):
|
|
30
|
+
|
|
30
31
|
def __init__(self, pair: int, type: str):
|
|
31
32
|
self.pair = pair
|
|
32
33
|
self.type = type
|
|
@@ -55,6 +56,9 @@ class UnicodeDataEntry(object):
|
|
|
55
56
|
|
|
56
57
|
[Unicode character database]: https://unicode.org/reports/tr44/
|
|
57
58
|
"""
|
|
59
|
+
|
|
60
|
+
max_code_point = 0x10FFFF
|
|
61
|
+
|
|
58
62
|
def __init__(self, min: int, max: int, value):
|
|
59
63
|
self.min = min
|
|
60
64
|
self.max = max
|
|
@@ -166,6 +170,7 @@ class UnicodeDataEntries(object):
|
|
|
166
170
|
or a list of `UnicodeDataEntry`.
|
|
167
171
|
[Unicode character database]: https://unicode.org/reports/tr44/
|
|
168
172
|
"""
|
|
173
|
+
|
|
169
174
|
def __init__(self,
|
|
170
175
|
entries: Optional[Union[Iterable[UnicodeDataEntry],
|
|
171
176
|
Sequence[UnicodeDataEntry]]] = None,
|
|
@@ -310,12 +315,14 @@ class UnicodeDataEntries(object):
|
|
|
310
315
|
|
|
311
316
|
|
|
312
317
|
class UnicodeBidiBracketsDataEntries(UnicodeDataEntries):
|
|
318
|
+
|
|
313
319
|
def _load_lines(self, lines: Iterable[str], converter=None):
|
|
314
320
|
converter = converter or BidiBrackets.from_values
|
|
315
321
|
super()._load_lines(lines, converter=converter)
|
|
316
322
|
|
|
317
323
|
|
|
318
324
|
class UnicodeEmojiDataEntries(UnicodeDataEntries):
|
|
325
|
+
|
|
319
326
|
def _load_lines(self, lines: Iterable[str], converter=None):
|
|
320
327
|
converter = converter or (lambda v: EmojiType[v])
|
|
321
328
|
super()._load_lines(lines, converter=converter)
|
|
@@ -340,6 +347,7 @@ class UnicodeEmojiDataEntries(UnicodeDataEntries):
|
|
|
340
347
|
|
|
341
348
|
|
|
342
349
|
class UnicodeLineBreakDataEntries(UnicodeDataEntries):
|
|
350
|
+
|
|
343
351
|
def _load_comment(self, comment: str, start_index: int):
|
|
344
352
|
# Load missing value entries. See the comments in:
|
|
345
353
|
# https://www.unicode.org/Public/UNIDATA/LineBreak.txt
|
|
@@ -362,12 +370,14 @@ class UnicodeLineBreakDataEntries(UnicodeDataEntries):
|
|
|
362
370
|
|
|
363
371
|
|
|
364
372
|
class UnicodeScriptExtensionsDataEntries(UnicodeDataEntries):
|
|
373
|
+
|
|
365
374
|
def _load_lines(self, lines: Iterable[str], converter=None):
|
|
366
375
|
converter = converter or (lambda v: v.split())
|
|
367
376
|
super()._load_lines(lines, converter=converter)
|
|
368
377
|
|
|
369
378
|
|
|
370
379
|
class UnicodeVerticalOrientationDataEntries(UnicodeDataEntries):
|
|
380
|
+
|
|
371
381
|
def _load_comment(self, comment: str, start_index: int):
|
|
372
382
|
# Load missing value entries. See the comments in:
|
|
373
383
|
# https://www.unicode.org/Public/UNIDATA/VerticalOrientation.txt
|
|
@@ -33,6 +33,11 @@ class UnicodeDataReader(object):
|
|
|
33
33
|
lines = self.read_lines(name)
|
|
34
34
|
return UnicodeDataEntries(name=name, lines=lines)
|
|
35
35
|
|
|
36
|
+
def east_asian_width(self) -> UnicodeDataEntries:
    """Read the `EastAsianWidth` data file and return its entries.

    The data file name is also used as the name of the returned
    `UnicodeDataEntries`.
    """
    data_name = 'EastAsianWidth'
    return UnicodeDataEntries(name=data_name,
                              lines=self.read_lines(data_name))
|
|
40
|
+
|
|
36
41
|
def emoji(self) -> UnicodeDataEntries:
|
|
37
42
|
lines = self.read_lines('emoji/emoji-data')
|
|
38
43
|
return UnicodeEmojiDataEntries(name='Emoji', lines=lines)
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from typing import Callable
|
|
2
|
+
|
|
3
|
+
from unicodedata_reader.entry import *
|
|
4
|
+
from unicodedata_reader.reader import *
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Set(object):
    """A simple set of Unicode code points.

    A thin wrapper around a built-in `set` of `int` code points, with
    factory methods that build a set from the Unicode character database
    properties.

    The in-place operators `-=`, `&=`, and `|=` modify this instance and
    evaluate to this same instance.
    """

    def __init__(self) -> None:
        self.set = set()  # type: set[int]

    def __contains__(self, code_point: int) -> bool:
        return code_point in self.set

    def __iter__(self) -> Iterable[int]:
        return iter(self.set)

    # In-place operators must return the result. Python rebinds the
    # left-hand name to whatever is returned, so returning `None` would
    # turn `s -= t` into `s = None`.
    def __isub__(self, other: 'Set') -> 'Set':
        self.set -= other.set
        return self

    def __iand__(self, other: 'Set') -> 'Set':
        self.set &= other.set
        return self

    def __ior__(self, other: 'Set') -> 'Set':
        self.set |= other.set
        return self

    def add(self, code: int) -> None:
        """Add the code point `code` to this set."""
        self.set.add(code)

    def remove(self, code: int) -> None:
        """Remove the code point `code`; do nothing if it is not in the set."""
        self.set.discard(code)

    def add_entries(self, entries: 'UnicodeDataEntries',
                    pred: Callable[[Any], bool]) -> None:
        """Add all code points of the entries whose value satisfies `pred`.

        Args:
            entries: An iterable of entries. Each entry must have a `value`
                attribute and a `range()` method that yields code points.
            pred: Called with the `value` of each entry. The code points of
                the entry are added when this returns a true value.
        """
        for entry in entries:
            if pred(entry.value):
                self.set.update(entry.range())

    @staticmethod
    def _reader_or_default(reader):
        # Look up the default reader when called, not when this module is
        # imported, so that a later change of `UnicodeDataReader.default`
        # is honored.
        return UnicodeDataReader.default if reader is None else reader

    @staticmethod
    def east_asian_width(
            value: str,
            reader: 'Optional[UnicodeDataReader]' = None) -> 'Set':
        """Create a set of code points whose `EastAsianWidth` is `value`.

        `reader` defaults to `UnicodeDataReader.default`.
        """
        reader = Set._reader_or_default(reader)
        result = Set()
        result.add_entries(reader.east_asian_width(), lambda v: v == value)
        return result

    @staticmethod
    def general_category(
            value: str,
            reader: 'Optional[UnicodeDataReader]' = None) -> 'Set':
        """Create a set of code points whose general category starts with
        `value`.

        This is a prefix match; a single letter such as `L` matches every
        category value that starts with that letter.
        `reader` defaults to `UnicodeDataReader.default`.
        """
        reader = Set._reader_or_default(reader)
        result = Set()
        result.add_entries(reader.general_category(),
                           lambda v: v.startswith(value))
        return result

    @staticmethod
    def scripts(
            value: str,
            reader: 'Optional[UnicodeDataReader]' = None) -> 'Set':
        """Create a set of code points whose script value is `value`.

        `reader` defaults to `UnicodeDataReader.default`.
        """
        reader = Set._reader_or_default(reader)
        result = Set()
        result.add_entries(reader.scripts(), lambda v: v == value)
        return result

    @staticmethod
    def script_extensions(
            value: str,
            reader: 'Optional[UnicodeDataReader]' = None) -> 'Set':
        """Create a set of code points whose script extensions contain
        `value`.

        The test is `value in v`, where `v` is the value of each entry.
        NOTE(review): `v` is presumably a list of script names; if it is a
        plain string, this is a substring match -- confirm.
        `reader` defaults to `UnicodeDataReader.default`.
        """
        reader = Set._reader_or_default(reader)
        result = Set()
        result.add_entries(reader.script_extensions(), lambda v: value in v)
        return result
|
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
from setuptools import setup
|
|
3
|
-
|
|
4
|
-
packages = \
|
|
5
|
-
['unicodedata_reader']
|
|
6
|
-
|
|
7
|
-
package_data = \
|
|
8
|
-
{'': ['*']}
|
|
9
|
-
|
|
10
|
-
install_requires = \
|
|
11
|
-
['platformdirs>=2.2.0,<3.0.0']
|
|
12
|
-
|
|
13
|
-
entry_points = \
|
|
14
|
-
{'console_scripts': ['unicodedata-reader = unicodedata_reader.__main__:main']}
|
|
15
|
-
|
|
16
|
-
setup_kwargs = {
|
|
17
|
-
'name': 'unicodedata-reader',
|
|
18
|
-
'version': '0.1.7',
|
|
19
|
-
'description': '',
|
|
20
|
-
'long_description': '[](https://github.com/kojiishi/unicodedata-reader/actions/workflows/ci.yml)\n[](https://pypi.org/project/unicodedata-reader/)\n[](https://github.com/kojiishi/unicodedata-reader/network/updates)\n\n\n# unicodedata-reader\n\nThis package reads and parses the [Unicode Character Database] files.\n\nMany of them are available in the [unicodedata] module,\nor in other 3rd party modules.\nWhen the desired data is not in any existing modules,\nsuch as the [Line_Break property] or the [Vertical_Orientation property],\nthis package can read the data files\nat <https://www.unicode.org/Public/UNIDATA/>.\n\nThis package can also generate JavaScript functions\nthat can read the property values of the [Unicode Character Database]\nin browsers.\nPlease see the [JavaScript] section below.\n\n[General_Category property]: http://unicode.org/reports/tr44/#General_Category\n[Line_Break property]: http://unicode.org/reports/tr44/#Line_Break\n[Unicode Character Database]: https://unicode.org/reports/tr44/\n[unicodedata]: https://docs.python.org/3/library/unicodedata.html\n[Vertical_Orientation property]: http://unicode.org/reports/tr44/#Vertical_Orientation\n\n## Install\n\n```sh\npip install unicodedata-reader\n```\nIf you want to clone and install using [poetry]:\n```sh\ngit clone https://github.com/kojiishi/unicodedata-reader\ncd unicodedata-reader\npoetry install\npoetry shell\n```\n\n[poetry]: https://github.com/python-poetry/poetry\n\n\n## Python\n\n```python\nimport unicodedata_reader\n\nreader = unicodedata_reader.UnicodeDataReader.default\nlb = reader.line_break()\nprint(lb.value(0x41))\n```\nThe example above prints `AL`,\nthe [Line_Break property] value for U+0041.\nPlease also see [line_break_test.py] for more usages.\n\n[line_break_test.py]: https://github.com/kojiishi/unicodedata-reader/blob/main/tests/line_break_test.py\n\n## JavaScript\n[JavaScript]: #javascript\n\nThe [`UnicodeDataCompressor` class] in this package\ncan generate JavaScript functions 
that can read the property values\nof the [Unicode Character Database] in browsers.\n\nFollowing examples are available in the "`js`" directory:\n* [GeneralCategory.js] is a generated JavaScript file\n for the Unicode [General_Category property].\n* [LineBreak.js] is a generated JavaScript file\n for the Unicode [Line_Break property].\n* [LineBreak.html] for an example usage of [LineBreak.js].\n\nThe following command generates a JavaScript file for the [Line_Break property]\nusing `js/template.js` as the template file:\n```sh\nunicodedata-reader lb -t js/template.js\n```\n\n[`UnicodeDataCompressor` class]: https://github.com/kojiishi/unicodedata-reader/blob/main/unicodedata_reader/compressor.py\n[GeneralCategory.js]: https://github.com/kojiishi/unicodedata-reader/blob/main/js/GeneralCategory.js\n[LineBreak.html]: https://github.com/kojiishi/unicodedata-reader/blob/main/js/LineBreak.html\n[LineBreak.js]: https://github.com/kojiishi/unicodedata-reader/blob/main/js/LineBreak.js\n',
|
|
21
|
-
'author': 'Koji Ishii',
|
|
22
|
-
'author_email': 'kojii@chromium.org',
|
|
23
|
-
'maintainer': None,
|
|
24
|
-
'maintainer_email': None,
|
|
25
|
-
'url': 'https://github.com/kojiishi/unicodedata-reader',
|
|
26
|
-
'packages': packages,
|
|
27
|
-
'package_data': package_data,
|
|
28
|
-
'install_requires': install_requires,
|
|
29
|
-
'entry_points': entry_points,
|
|
30
|
-
'python_requires': '>=3.8',
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
setup(**setup_kwargs)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|