unicodedata-reader 1.3.8__tar.gz → 1.3.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/.github/workflows/ci.yml +9 -10
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/.github/workflows/publish.yml +1 -1
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/PKG-INFO +1 -1
- unicodedata_reader-1.3.9/Taskfile.yml +41 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/pyproject.toml +4 -5
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/src/unicodedata_reader/__main__.py +9 -9
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/src/unicodedata_reader/bidi_brackets.py +12 -11
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/src/unicodedata_reader/cli.py +51 -35
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/src/unicodedata_reader/compressor.py +43 -33
- unicodedata_reader-1.3.9/src/unicodedata_reader/east_asian_width.py +29 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/src/unicodedata_reader/east_asian_width_common.py +2 -2
- unicodedata_reader-1.3.9/src/unicodedata_reader/emoji.py +32 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/src/unicodedata_reader/entry.py +58 -54
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/src/unicodedata_reader/general_category.py +2 -3
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/src/unicodedata_reader/line_break.py +4 -5
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/src/unicodedata_reader/reader.py +32 -30
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/src/unicodedata_reader/set.py +16 -13
- unicodedata_reader-1.3.9/src/unicodedata_reader/vertical_orientation.py +29 -0
- unicodedata_reader-1.3.9/tests/cli_test.py +31 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/tests/conftest.py +1 -1
- unicodedata_reader-1.3.9/tests/entry_test.py +190 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/tests/line_break_test.py +7 -7
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/tests/reader_test.py +10 -10
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/tests/set_test.py +12 -12
- unicodedata_reader-1.3.9/uv.lock +386 -0
- unicodedata_reader-1.3.8/Taskfile.yml +0 -35
- unicodedata_reader-1.3.8/src/unicodedata_reader/east_asian_width.py +0 -30
- unicodedata_reader-1.3.8/src/unicodedata_reader/emoji.py +0 -38
- unicodedata_reader-1.3.8/src/unicodedata_reader/vertical_orientation.py +0 -30
- unicodedata_reader-1.3.8/tests/cli_test.py +0 -31
- unicodedata_reader-1.3.8/tests/entry_test.py +0 -166
- unicodedata_reader-1.3.8/uv.lock +0 -790
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/.github/dependabot.yml +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/.gitignore +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/.yapfignore +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/LICENSE +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/README.md +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/js/GeneralCategory.js +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/js/LineBreak.html +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/js/LineBreak.js +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/js/template.js +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/src/unicodedata_reader/__init__.py +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/tests/__init__.py +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/tests/cache/EastAsianWidth +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/tests/cache/LineBreak +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/tests/cache/ScriptExtensions +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/tests/cache/Scripts +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/tests/cache/extracted/DerivedGeneralCategory +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/tests/cache/extracted/DerivedName +0 -0
- {unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/tox.ini +0 -0
|
@@ -19,7 +19,7 @@ jobs:
|
|
|
19
19
|
python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
|
|
20
20
|
|
|
21
21
|
steps:
|
|
22
|
-
- uses: actions/checkout@
|
|
22
|
+
- uses: actions/checkout@v7
|
|
23
23
|
|
|
24
24
|
# https://docs.astral.sh/uv/guides/integration/github/
|
|
25
25
|
- name: Install uv and set up Python ${{ matrix.python-version }}
|
|
@@ -29,20 +29,19 @@ jobs:
|
|
|
29
29
|
|
|
30
30
|
# https://taskfile.dev/docs/installation#github-actions
|
|
31
31
|
- name: Install Task
|
|
32
|
-
uses: go-task/setup-task@
|
|
32
|
+
uses: go-task/setup-task@v2
|
|
33
33
|
|
|
34
34
|
- name: Install dependencies
|
|
35
|
-
run:
|
|
36
|
-
uv sync --all-extras --dev
|
|
35
|
+
run: uv sync --all-extras --dev
|
|
37
36
|
|
|
38
37
|
- name: Test
|
|
39
|
-
run:
|
|
40
|
-
|
|
38
|
+
run: task test
|
|
39
|
+
|
|
40
|
+
- name: Type
|
|
41
|
+
run: task type
|
|
41
42
|
|
|
42
43
|
- name: Lint
|
|
43
|
-
run:
|
|
44
|
-
task lint
|
|
44
|
+
run: task lint
|
|
45
45
|
|
|
46
46
|
- name: Format check
|
|
47
|
-
run:
|
|
48
|
-
task fmtchk
|
|
47
|
+
run: task fmtchk
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# yaml-language-server: $schema=https://taskfile.dev/schema.json
|
|
2
|
+
|
|
3
|
+
version: '3'
|
|
4
|
+
|
|
5
|
+
tasks:
|
|
6
|
+
default:
|
|
7
|
+
deps: [check]
|
|
8
|
+
|
|
9
|
+
check:
|
|
10
|
+
- task: tests
|
|
11
|
+
- task: type
|
|
12
|
+
- task: lint
|
|
13
|
+
- task: fmtchk
|
|
14
|
+
- git diff --exit-code
|
|
15
|
+
|
|
16
|
+
fix: "{{.RUN}} ruff check --fix {{.CLI_ARGS}}"
|
|
17
|
+
fmt: "{{.RUN}} ruff format {{.CLI_ARGS}}"
|
|
18
|
+
fmtchk: "{{.RUN}} ruff format --check {{.CLI_ARGS}}"
|
|
19
|
+
lint: "{{.RUN}} ruff check {{.CLI_ARGS}}"
|
|
20
|
+
test: "{{.RUN}} pytest tests {{.PYTEST}} {{.CLI_ARGS}}"
|
|
21
|
+
tests: "{{.RUN}} tox {{.TOX}} {{.CLI_ARGS}}"
|
|
22
|
+
type: "{{.RUN}} ty check {{.TY}} {{.CLI_ARGS}}"
|
|
23
|
+
|
|
24
|
+
gen:
|
|
25
|
+
- "{{.RUN}} unicodedata-reader lb -t js/template.js {{.GEN}} {{.CLI_ARGS}}"
|
|
26
|
+
- "{{.RUN}} unicodedata-reader gc -t js/template.js {{.GEN}} {{.CLI_ARGS}}"
|
|
27
|
+
|
|
28
|
+
install-git-hooks:
|
|
29
|
+
desc: Create git hooks
|
|
30
|
+
cmds:
|
|
31
|
+
- echo '#!/bin/sh' > .git/hooks/pre-push
|
|
32
|
+
- echo 'task check' >> .git/hooks/pre-push
|
|
33
|
+
- cmd: chmod +x .git/hooks/pre-push
|
|
34
|
+
platforms: [linux, darwin]
|
|
35
|
+
|
|
36
|
+
vars:
|
|
37
|
+
GEN: -fv
|
|
38
|
+
PYTEST:
|
|
39
|
+
RUN: uv run
|
|
40
|
+
TOX: -p
|
|
41
|
+
TY:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "unicodedata-reader"
|
|
3
|
-
version = "1.3.
|
|
3
|
+
version = "1.3.9"
|
|
4
4
|
description = ""
|
|
5
5
|
authors = [{name = "Koji Ishii", email="kojii@chromium.org"}]
|
|
6
6
|
readme = "README.md"
|
|
@@ -15,12 +15,11 @@ repository = "https://github.com/kojiishi/unicodedata-reader"
|
|
|
15
15
|
|
|
16
16
|
[dependency-groups]
|
|
17
17
|
dev = [
|
|
18
|
-
"pytest>=9.1.
|
|
19
|
-
"
|
|
20
|
-
"ruff>=0.15.17",
|
|
18
|
+
"pytest>=9.1.1",
|
|
19
|
+
"ruff>=0.15.18",
|
|
21
20
|
"tox>=4.55.1",
|
|
22
21
|
"tox-uv>=1.35.2",
|
|
23
|
-
"
|
|
22
|
+
"ty>=0.0.51",
|
|
24
23
|
]
|
|
25
24
|
|
|
26
25
|
[project.scripts]
|
|
@@ -12,12 +12,12 @@ import unicodedata_reader.vertical_orientation as vo
|
|
|
12
12
|
def main():
|
|
13
13
|
args = sys.argv
|
|
14
14
|
sub_commands = {
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
15
|
+
"bidi": lambda: bidi_brackets.dump_bidi_brackets(),
|
|
16
|
+
"ea": lambda: ea.UnicodeEastAsianWidthDataCli().main(),
|
|
17
|
+
"emoji": lambda: emoji.UnicodeEmojiDataCli().main(),
|
|
18
|
+
"gc": lambda: gc.UnicodeGeneralCategoryDataCli().main(),
|
|
19
|
+
"lb": lambda: lb.UnicodeLineBreakDataCli().main(),
|
|
20
|
+
"vo": lambda: vo.UnicodeVerticalOrientationDataCli().main(),
|
|
21
21
|
}
|
|
22
22
|
if len(args) > 1:
|
|
23
23
|
func = sub_commands.get(args[1])
|
|
@@ -27,9 +27,9 @@ def main():
|
|
|
27
27
|
return
|
|
28
28
|
|
|
29
29
|
name = pathlib.Path(args[0]).name
|
|
30
|
-
sub_commands =
|
|
31
|
-
print(f
|
|
30
|
+
sub_commands = "|".join(sub_commands.keys())
|
|
31
|
+
print(f"usage: {name} {sub_commands} [options...]", file=sys.stderr)
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
if __name__ ==
|
|
34
|
+
if __name__ == "__main__":
|
|
35
35
|
main()
|
{unicodedata_reader-1.3.8 → unicodedata_reader-1.3.9}/src/unicodedata_reader/bidi_brackets.py
RENAMED
|
@@ -13,27 +13,28 @@ def dump_bidi_brackets():
|
|
|
13
13
|
|
|
14
14
|
def bidi_brackets_type(code):
|
|
15
15
|
bracket = bidi_brackets.get(code)
|
|
16
|
-
return bracket.type if bracket else
|
|
16
|
+
return bracket.type if bracket else "x"
|
|
17
17
|
|
|
18
18
|
columns = {
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
19
|
+
"Code": lambda code, ch: u_hex(code),
|
|
20
|
+
"Char": lambda code, ch: chr(code),
|
|
21
|
+
"Bidi_Paired_Bracket_Type": lambda code, ch: bidi_brackets_type(code),
|
|
22
|
+
"EAW": lambda code, ch: unicodedata.east_asian_width(ch),
|
|
23
|
+
"Script": lambda code, ch: scripts.get(code),
|
|
24
|
+
"ScriptExt": lambda code, ch: str(script_extensions.get(code, [])),
|
|
25
25
|
}
|
|
26
|
-
print(f
|
|
26
|
+
print(f"# {' '.join(columns.keys())}")
|
|
27
27
|
last_block = None
|
|
28
28
|
for code in get_unicodes_from_args(bidi_brackets.keys()):
|
|
29
29
|
block = blocks[code]
|
|
30
30
|
if block != last_block:
|
|
31
|
-
print(f
|
|
31
|
+
print(f"# {block}")
|
|
32
32
|
last_block = block
|
|
33
33
|
ch = chr(code)
|
|
34
34
|
values = (func(code, ch) for func in columns.values())
|
|
35
|
-
|
|
35
|
+
values = ("" if v is None else str(v) for v in values)
|
|
36
|
+
print(f"{' '.join(values)} # {unicodedata.name(chr(code))}")
|
|
36
37
|
|
|
37
38
|
|
|
38
|
-
if __name__ ==
|
|
39
|
+
if __name__ == "__main__":
|
|
39
40
|
dump_bidi_brackets()
|
|
@@ -8,6 +8,7 @@ from typing import Callable
|
|
|
8
8
|
from typing import Dict
|
|
9
9
|
from typing import Iterable
|
|
10
10
|
from typing import Optional
|
|
11
|
+
from typing import Sequence
|
|
11
12
|
import unicodedata
|
|
12
13
|
|
|
13
14
|
from unicodedata_reader import *
|
|
@@ -16,7 +17,8 @@ from unicodedata_reader import *
|
|
|
16
17
|
def _to_unicodes_from_str(text):
|
|
17
18
|
while text:
|
|
18
19
|
match = re.match(
|
|
19
|
-
r
|
|
20
|
+
r"([uU]\+?)?([0-9a-fA-F]{4,5})(-([0-9a-fA-F]{4,5}))?,?\s*", text
|
|
21
|
+
)
|
|
20
22
|
if match:
|
|
21
23
|
prefix = match.group(1)
|
|
22
24
|
hex = match.group(2)
|
|
@@ -27,7 +29,7 @@ def _to_unicodes_from_str(text):
|
|
|
27
29
|
yield from range(code, int(hex_end, 16) + 1)
|
|
28
30
|
else:
|
|
29
31
|
yield code
|
|
30
|
-
text = text[match.end():]
|
|
32
|
+
text = text[match.end() :]
|
|
31
33
|
continue
|
|
32
34
|
code = ord(text[0])
|
|
33
35
|
yield code
|
|
@@ -42,7 +44,7 @@ def to_unicodes(text):
|
|
|
42
44
|
|
|
43
45
|
def get_unicodes_from_args(default=None):
|
|
44
46
|
parser = argparse.ArgumentParser()
|
|
45
|
-
parser.add_argument(
|
|
47
|
+
parser.add_argument("text", nargs="+" if default is None else "*")
|
|
46
48
|
args = parser.parse_args()
|
|
47
49
|
if args.text:
|
|
48
50
|
return to_unicodes(args.text)
|
|
@@ -51,8 +53,8 @@ def get_unicodes_from_args(default=None):
|
|
|
51
53
|
|
|
52
54
|
def u_printable_chr(ch):
|
|
53
55
|
gc = unicodedata.category(ch)
|
|
54
|
-
if gc ==
|
|
55
|
-
return
|
|
56
|
+
if gc == "Cc":
|
|
57
|
+
return ""
|
|
56
58
|
return ch
|
|
57
59
|
|
|
58
60
|
|
|
@@ -60,7 +62,7 @@ def u_name_or_empty(ch):
|
|
|
60
62
|
try:
|
|
61
63
|
return unicodedata.name(ch)
|
|
62
64
|
except ValueError:
|
|
63
|
-
return
|
|
65
|
+
return ""
|
|
64
66
|
|
|
65
67
|
|
|
66
68
|
def _init_logging(verbose):
|
|
@@ -73,6 +75,14 @@ def _init_logging(verbose):
|
|
|
73
75
|
|
|
74
76
|
|
|
75
77
|
class UnicodeDataCli(object):
|
|
78
|
+
text: Optional[Sequence[str]]
|
|
79
|
+
clear_cache: bool
|
|
80
|
+
no_cache: bool
|
|
81
|
+
name: Optional[str]
|
|
82
|
+
template: Optional[pathlib.Path]
|
|
83
|
+
output: Optional[pathlib.Path]
|
|
84
|
+
verbose: int
|
|
85
|
+
_entries: UnicodeDataEntries
|
|
76
86
|
|
|
77
87
|
def __init__(self):
|
|
78
88
|
self._parse_args()
|
|
@@ -80,39 +90,43 @@ class UnicodeDataCli(object):
|
|
|
80
90
|
def _columns(self) -> Dict[str, Callable[[int, str], Any]]:
|
|
81
91
|
columns = self._core_columns()
|
|
82
92
|
columns = dict(
|
|
83
|
-
itertools.chain(
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
93
|
+
itertools.chain(
|
|
94
|
+
{
|
|
95
|
+
"Code": lambda code, ch: "U" + u_hex(code),
|
|
96
|
+
"Char": lambda code, ch: u_printable_chr(ch),
|
|
97
|
+
}.items(),
|
|
98
|
+
columns.items(),
|
|
99
|
+
{
|
|
100
|
+
"Name": lambda code, ch: u_name_or_empty(ch),
|
|
101
|
+
}.items(),
|
|
102
|
+
)
|
|
103
|
+
)
|
|
89
104
|
return columns
|
|
90
105
|
|
|
91
106
|
def _core_columns(self) -> Dict[str, Callable[[int, str], Any]]:
|
|
92
107
|
raise NotImplementedError()
|
|
93
108
|
|
|
94
|
-
def _unicodes(self) ->
|
|
109
|
+
def _unicodes(self) -> Iterable[int]:
|
|
95
110
|
if self.text:
|
|
96
111
|
return to_unicodes(self.text)
|
|
97
112
|
return self._default_unicodes()
|
|
98
113
|
|
|
99
|
-
def _default_unicodes(self) ->
|
|
114
|
+
def _default_unicodes(self) -> Iterable[int]:
|
|
100
115
|
return self._entries.unicodes()
|
|
101
116
|
|
|
102
117
|
def print(self):
|
|
103
118
|
columns = self._columns()
|
|
104
|
-
print(
|
|
119
|
+
print("\t".join(key for key in columns.keys()))
|
|
105
120
|
for code in self._unicodes():
|
|
106
121
|
try:
|
|
107
122
|
ch = chr(code)
|
|
108
123
|
values = (func(code, ch) for func in columns.values())
|
|
109
|
-
values = (
|
|
110
|
-
print(
|
|
124
|
+
values = ("" if v is None else str(v) for v in values)
|
|
125
|
+
print("\t".join(values))
|
|
111
126
|
except UnicodeEncodeError:
|
|
112
127
|
continue
|
|
113
128
|
|
|
114
|
-
def substitute_template(self, template: pathlib.Path,
|
|
115
|
-
output: pathlib.Path):
|
|
129
|
+
def substitute_template(self, template: pathlib.Path, output: pathlib.Path):
|
|
116
130
|
entries = self._entries
|
|
117
131
|
entries.fill_missing_values()
|
|
118
132
|
entries.map_values_to_int()
|
|
@@ -122,22 +136,24 @@ class UnicodeDataCli(object):
|
|
|
122
136
|
|
|
123
137
|
def _parse_args(self):
|
|
124
138
|
parser = argparse.ArgumentParser()
|
|
125
|
-
parser.add_argument(
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
parser.add_argument(
|
|
129
|
-
parser.add_argument(
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
parser.add_argument(
|
|
136
|
-
parser.add_argument(
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
139
|
+
parser.add_argument("text", nargs="*", help="show properties for the text")
|
|
140
|
+
parser.add_argument("-f", "--clear-cache", action="store_true")
|
|
141
|
+
parser.add_argument("-F", "--no-cache", action="store_true")
|
|
142
|
+
parser.add_argument("--name", help="$NAME in the template")
|
|
143
|
+
parser.add_argument(
|
|
144
|
+
"-t",
|
|
145
|
+
"--template",
|
|
146
|
+
type=pathlib.Path,
|
|
147
|
+
help="generate a file from the template",
|
|
148
|
+
)
|
|
149
|
+
parser.add_argument("-o", "--output", type=pathlib.Path)
|
|
150
|
+
parser.add_argument(
|
|
151
|
+
"-v",
|
|
152
|
+
"--verbose",
|
|
153
|
+
help="increase output verbosity",
|
|
154
|
+
action="count",
|
|
155
|
+
default=0,
|
|
156
|
+
)
|
|
141
157
|
parser.parse_args(namespace=self)
|
|
142
158
|
_init_logging(self.verbose) # pytype: disable=attribute-error
|
|
143
159
|
if self.clear_cache:
|
|
@@ -9,7 +9,7 @@ from typing import Optional
|
|
|
9
9
|
|
|
10
10
|
from unicodedata_reader import *
|
|
11
11
|
|
|
12
|
-
_logger = logging.getLogger(
|
|
12
|
+
_logger = logging.getLogger("UnicodeDataCompressor")
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
def _init_logging(verbose: int):
|
|
@@ -22,7 +22,6 @@ def _init_logging(verbose: int):
|
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class UnicodeDataCompressor(object):
|
|
25
|
-
|
|
26
25
|
def __init__(self, entries: UnicodeDataEntries):
|
|
27
26
|
self._entries = entries
|
|
28
27
|
|
|
@@ -62,16 +61,23 @@ class UnicodeDataCompressor(object):
|
|
|
62
61
|
assert entry.value < (1 << value_bits)
|
|
63
62
|
assert entry.count > 0
|
|
64
63
|
combined = ((entry.count - 1) << value_bits) | entry.value
|
|
65
|
-
_logger.debug(
|
|
66
|
-
|
|
67
|
-
|
|
64
|
+
_logger.debug(
|
|
65
|
+
"%04X %s=%d: %d -> %X",
|
|
66
|
+
entry.min,
|
|
67
|
+
entries.values_for_int()[entry.value],
|
|
68
|
+
entry.value,
|
|
69
|
+
entry.count,
|
|
70
|
+
combined,
|
|
71
|
+
)
|
|
68
72
|
bytes.extend(self._to_bytes(combined))
|
|
69
73
|
return bytes
|
|
70
74
|
|
|
71
|
-
def substitute_template(
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
+
def substitute_template(
|
|
76
|
+
self,
|
|
77
|
+
template: pathlib.Path,
|
|
78
|
+
output: Optional[pathlib.Path] = None,
|
|
79
|
+
name: Optional[str] = None,
|
|
80
|
+
) -> str:
|
|
75
81
|
entries = self._entries
|
|
76
82
|
bytes = self.compress()
|
|
77
83
|
base64bytes = base64.b64encode(bytes)
|
|
@@ -79,15 +85,20 @@ class UnicodeDataCompressor(object):
|
|
|
79
85
|
value_bits = self._bitsize
|
|
80
86
|
name = name or entries.name
|
|
81
87
|
assert name
|
|
82
|
-
_logger.info(
|
|
83
|
-
|
|
84
|
-
|
|
88
|
+
_logger.info(
|
|
89
|
+
"%s: Bytes=%d, Base64=%d, #values=%d (%d bits)",
|
|
90
|
+
name,
|
|
91
|
+
len(bytes),
|
|
92
|
+
len(base64bytes),
|
|
93
|
+
len(values_for_int),
|
|
94
|
+
value_bits,
|
|
95
|
+
)
|
|
85
96
|
mapping = {
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
97
|
+
"NAME": name,
|
|
98
|
+
"BASE64BYTES": base64bytes.decode("ascii"),
|
|
99
|
+
"VALUE_BITS": str(value_bits),
|
|
100
|
+
"VALUE_MASK": str((1 << value_bits) - 1),
|
|
101
|
+
"VALUE_LIST": ",".join(f'"{v}"' for v in values_for_int),
|
|
91
102
|
}
|
|
92
103
|
|
|
93
104
|
text = template.read_text()
|
|
@@ -95,13 +106,13 @@ class UnicodeDataCompressor(object):
|
|
|
95
106
|
text = text.substitute(mapping)
|
|
96
107
|
|
|
97
108
|
if output:
|
|
98
|
-
if str(output) ==
|
|
109
|
+
if str(output) == "-":
|
|
99
110
|
sys.stdout.write(text)
|
|
100
111
|
else:
|
|
101
112
|
if output.is_dir():
|
|
102
|
-
output = output / f
|
|
103
|
-
output.write_text(text, newline=
|
|
104
|
-
_logger.info(
|
|
113
|
+
output = output / f"{name}{template.suffix}"
|
|
114
|
+
output.write_text(text, newline="\n")
|
|
115
|
+
_logger.info("Saved to %s", output)
|
|
105
116
|
|
|
106
117
|
return text
|
|
107
118
|
|
|
@@ -109,16 +120,14 @@ class UnicodeDataCompressor(object):
|
|
|
109
120
|
def main():
|
|
110
121
|
this_dir = pathlib.Path(__file__).resolve().parent
|
|
111
122
|
parser = argparse.ArgumentParser()
|
|
112
|
-
parser.add_argument(
|
|
113
|
-
parser.add_argument(
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
parser.add_argument(
|
|
117
|
-
parser.add_argument(
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
action='count',
|
|
121
|
-
default=0)
|
|
123
|
+
parser.add_argument("--name", default="LineBreak")
|
|
124
|
+
parser.add_argument(
|
|
125
|
+
"--template", type=pathlib.Path, default=this_dir.parent / "js" / "template.js"
|
|
126
|
+
)
|
|
127
|
+
parser.add_argument("-o", "--output", type=pathlib.Path)
|
|
128
|
+
parser.add_argument(
|
|
129
|
+
"-v", "--verbose", help="increase output verbosity", action="count", default=0
|
|
130
|
+
)
|
|
122
131
|
args = parser.parse_args()
|
|
123
132
|
_init_logging(args.verbose)
|
|
124
133
|
|
|
@@ -130,8 +139,9 @@ def main():
|
|
|
130
139
|
output = args.output
|
|
131
140
|
compressor = UnicodeDataCompressor(entries)
|
|
132
141
|
compressor.substitute_template(
|
|
133
|
-
template, output=output if output else template.parent, name=args.name
|
|
142
|
+
template, output=output if output else template.parent, name=args.name
|
|
143
|
+
)
|
|
134
144
|
|
|
135
145
|
|
|
136
|
-
if __name__ ==
|
|
146
|
+
if __name__ == "__main__":
|
|
137
147
|
main()
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import unicodedata
|
|
3
|
+
from typing import Any
|
|
4
|
+
from typing import Callable
|
|
5
|
+
from typing import Dict
|
|
6
|
+
|
|
7
|
+
from unicodedata_reader import *
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class UnicodeEastAsianWidthDataCli(UnicodeDataCli):
|
|
11
|
+
def __init__(self):
|
|
12
|
+
super().__init__()
|
|
13
|
+
self._entries = UnicodeDataReader.default.east_asian_width()
|
|
14
|
+
|
|
15
|
+
def _core_columns(self) -> Dict[str, Callable[[int, str], Any]]:
|
|
16
|
+
return {
|
|
17
|
+
"EA": lambda code, ch: self._entries.value(code),
|
|
18
|
+
"GC": lambda code, ch: unicodedata.category(ch),
|
|
19
|
+
"EAW": lambda code, ch: unicodedata.east_asian_width(ch),
|
|
20
|
+
"cp932": lambda code, ch: u_enc(ch, "cp932"),
|
|
21
|
+
"sjis04": lambda code, ch: u_enc(ch, "sjis_2004"),
|
|
22
|
+
"cp936": lambda code, ch: u_enc(ch, "cp936"),
|
|
23
|
+
"cp949": lambda code, ch: u_enc(ch, "cp949"),
|
|
24
|
+
"cp950": lambda code, ch: u_enc(ch, "cp950"),
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
if __name__ == "__main__":
|
|
29
|
+
UnicodeEastAsianWidthDataCli().main()
|
|
@@ -24,8 +24,7 @@ def dump_east_asian_width():
|
|
|
24
24
|
"Bidi_Paired_Bracket_Type": lambda code, ch: bidi_brackets_type(code),
|
|
25
25
|
"EAW": lambda code, ch: unicodedata.east_asian_width(ch),
|
|
26
26
|
"Script": lambda code, ch: scripts.get(code),
|
|
27
|
-
"ScriptExt":
|
|
28
|
-
lambda code, ch: " ".join(script_extensions.get(code, [])),
|
|
27
|
+
"ScriptExt": lambda code, ch: " ".join(script_extensions.get(code, [])),
|
|
29
28
|
}
|
|
30
29
|
sep = "\t"
|
|
31
30
|
print(f"# {sep.join(columns.keys())},Name")
|
|
@@ -38,6 +37,7 @@ def dump_east_asian_width():
|
|
|
38
37
|
if script != "Common":
|
|
39
38
|
continue
|
|
40
39
|
values = (func(code, ch) for func in columns.values())
|
|
40
|
+
values = ("" if v is None else str(v) for v in values)
|
|
41
41
|
output = sep.join(values)
|
|
42
42
|
try:
|
|
43
43
|
output += f"{sep}{unicodedata.name(chr(code))}"
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from typing import Any
|
|
3
|
+
from typing import Callable
|
|
4
|
+
from typing import Dict
|
|
5
|
+
|
|
6
|
+
from unicodedata_reader import *
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class UnicodeEmojiDataCli(UnicodeDataCli):
|
|
10
|
+
def __init__(self):
|
|
11
|
+
super().__init__()
|
|
12
|
+
self._entries = UnicodeDataReader.default.emoji()
|
|
13
|
+
|
|
14
|
+
def _emoji_flag_func(self, mask: EmojiType):
|
|
15
|
+
return lambda code, ch: 1 if self._entries.value(code) & mask else 0
|
|
16
|
+
|
|
17
|
+
def _core_columns(self) -> Dict[str, Callable[[int, str], Any]]:
|
|
18
|
+
return {
|
|
19
|
+
"Emoji": self._emoji_flag_func(EmojiType.Emoji),
|
|
20
|
+
"Emoji_Presentation": self._emoji_flag_func(EmojiType.Emoji_Presentation),
|
|
21
|
+
"Emoji_Modifier": self._emoji_flag_func(EmojiType.Emoji_Modifier),
|
|
22
|
+
"Emoji_Modifier_Base": self._emoji_flag_func(EmojiType.Emoji_Modifier_Base),
|
|
23
|
+
"Emoji_Component": self._emoji_flag_func(EmojiType.Emoji_Component),
|
|
24
|
+
"Extended_Pictographic": self._emoji_flag_func(
|
|
25
|
+
EmojiType.Extended_Pictographic
|
|
26
|
+
),
|
|
27
|
+
"EmojiCombined": lambda code, ch: self._entries.value(code),
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
if __name__ == "__main__":
|
|
32
|
+
UnicodeEmojiDataCli().main()
|