pinyinparser 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
pinyinparser.py,sha256=tGq4VRRcICA1U5gFDMMY_fC4SQ7_wweXzChzlLXucV0,34208
|
|
2
|
+
pinyinparser-1.0.0.dist-info/METADATA,sha256=lIhxK-NOkWLPYZ1HFj19mfDRDFedIu6HmVKQyHqLKNE,177
|
|
3
|
+
pinyinparser-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
4
|
+
pinyinparser-1.0.0.dist-info/top_level.txt,sha256=6EGG--K0y1GoDpu1iJyOdAleVlfhNtImpxo7QWjj5I4,13
|
|
5
|
+
pinyinparser-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pinyinparser
|
pinyinparser.py
ADDED
|
@@ -0,0 +1,1035 @@
|
|
|
1
|
+
import base64 # 解压那坨位图用的(嗯确实是解压因为a85编码比直接写hex还短这何尝不是一种压缩
|
|
2
|
+
import re # 过滤用的
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from enum import IntEnum, StrEnum
|
|
5
|
+
from functools import cache
|
|
6
|
+
from itertools import chain, pairwise
|
|
7
|
+
from typing import cast, overload
|
|
8
|
+
from unicodedata import normalize
|
|
9
|
+
from warnings import warn
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Initial(IntEnum):
    """Initial (onset) component of a syllable code.

    Values fit in the low five bits of the packed syllable integer
    (``Syllable.__init__`` extracts them with the mask ``0x001F``).
    """

    missing = 0  # component not set; the default of Syllable()
    unspec = 0x0001  # wildcard value (see the commented-out wildcard tokens in TOKENS)
    nul = 0x0002  # zero initial: the syllable starts directly with its final
    H = 0x0003  # pseudo-initial carried by the "hm" / "hng" tokens
    R = 0x0004  # pseudo-initial carried by the first reading of the "ri" tokens
    b = 0x0005
    c = 0x0006
    ch = 0x0007
    d = 0x0008
    f = 0x0009
    g = 0x000A
    h = 0x000B
    j = 0x000C
    k = 0x000D
    l = 0x000E
    m = 0x000F
    n = 0x0010
    p = 0x0011
    q = 0x0012
    r = 0x0013
    s = 0x0014
    sh = 0x0015
    t = 0x0016
    w = 0x0017
    x = 0x0018
    y = 0x0019
    z = 0x001A
    zh = 0x001B
    M = 0x001C  # pseudo-initial carried by the syllabic "m" / "n" / "ng" tokens
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class Final(IntEnum):
    """Final (rime) component of a syllable code.

    Values are multiples of ``0x0100`` and are extracted from the packed
    syllable integer with the mask ``0x3F00`` (see ``Syllable.__init__``).
    Member names are internal spellings; ``FINAL2STR`` lists the finals whose
    written form differs from the member name.
    """

    missing = 0  # component not set; the default of Syllable()
    unspec = 0x0100  # wildcard value (see the commented-out wildcard tokens in TOKENS)
    nul = 0x0200  # rendered as an empty string by Syllable.to_str
    a = 0x0300
    ai = 0x0400
    an = 0x0500
    ang = 0x0600
    ao = 0x0700
    e = 0x0800
    ei = 0x0900
    en = 0x0A00
    eng = 0x0B00
    er = 0x0C00
    hm = 0x0D00
    hng = 0x0E00
    i = 0x0F00  # [i] ji qi xi
    ia = 0x1000
    ian = 0x1100
    iang = 0x1200
    iao = 0x1300
    ieh = 0x1400  # written "ie"
    ien = 0x1500  # written "in"
    ii = 0x1600  # [z]/[ɿ] zi ci si
    ieng = 0x1700  # written "ing"
    iong = 0x1800
    iou = 0x1900  # written "iu"
    ng = 0x1A00  # [ŋ̊]
    o = 0x1B00
    ong = 0x1C00
    ou = 0x1D00
    ri = 0x1E00  # [ʅ] zhi chi shi [ʐ]/[ʅ] ri
    u = 0x1F00
    ua = 0x2000
    uai = 0x2100
    uan = 0x2200
    uang = 0x2300
    uei = 0x2400  # written "ui"
    uen = 0x2500  # written "un"
    ueng = 0x2600
    uo = 0x2700
    v = 0x2800  # written "ü"
    van = 0x2900  # written "üan"
    veh = 0x2A00  # written "üe"
    ven = 0x2B00  # written "ün"
    m = 0x2C00
    n = 0x2D00
    eh = 0x2E00  # ê
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class Tone(IntEnum):
    """Tone component of a syllable code.

    Values are multiples of ``0x0020`` and are extracted from the packed
    syllable integer with the mask ``0x00E0`` (see ``Syllable.__init__``).
    """

    missing = 0  # component not set; the default of Syllable()
    unspec = 0x0020  # wildcard value (see the commented-out "0" token in TOKENS)
    nul = 0x0040  # treated like "no tone" by Syllable.to_str
    t1 = 0x0060
    t2 = 0x0080
    t3 = 0x00A0
    t4 = 0x00C0
    t5 = 0x00E0  # neutral tone; written as digit "5", never as a diacritic
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# Written form of every final whose spelling differs from its enum member
# name. Syllable.to_str falls back to the member name for finals not listed.
FINAL2STR = {
    Final.ii: "i",
    Final.ri: "i",
    Final.iou: "iu",
    Final.uei: "ui",
    Final.ien: "in",
    Final.uen: "un",
    Final.v: "ü",
    Final.veh: "üe",
    Final.van: "üan",
    Final.ven: "ün",
    Final.eh: "ê",
    Final.ieh: "ie",
    Final.ieng: "ing",
}
# Digit used for each tone by the AFTER and RIGHT tone styles.
TONE2STR = {
    Tone.t1: "1",
    Tone.t2: "2",
    Tone.t3: "3",
    Tone.t4: "4",
    Tone.t5: "5",
}
# For each marked tone: plain letter -> the same letter carrying the tone
# diacritic. There is no entry for t5; to_str returns the bare spelling for it
# in ABOVE style before this table is consulted.
FINAL2TONED = {
    Tone.t1: {"a": "ā", "e": "ē", "i": "ī", "o": "ō", "u": "ū", "ü": "ǖ", "ê": "ê̄", "m": "m̄", "n": "n̄"},
    Tone.t2: {"a": "á", "e": "é", "i": "í", "o": "ó", "u": "ú", "ü": "ǘ", "ê": "ế", "m": "ḿ", "n": "ń"},
    Tone.t3: {"a": "ǎ", "e": "ě", "i": "ǐ", "o": "ǒ", "u": "ǔ", "ü": "ǚ", "ê": "ê̌", "m": "m̌", "n": "ň"},
    Tone.t4: {"a": "à", "e": "è", "i": "ì", "o": "ò", "u": "ù", "ü": "ǜ", "ê": "ề", "m": "m̀", "n": "ǹ"},
}
|
|
133
|
+
# Full spelling of a syllable whose initial is Initial.y, keyed by final.
# Syllable.to_str falls back to "y" + the final's written form for other finals.
SYLLMAP_Y = {
    Final.ia: "ya",
    Final.ieh: "ye",
    Final.iao: "yao",
    Final.iou: "you",
    Final.ian: "yan",
    Final.ien: "yin",
    Final.iang: "yang",
    Final.ieng: "ying",
    Final.iong: "yong",
    Final.v: "yu",
    Final.veh: "yue",
    Final.van: "yuan",
    Final.ven: "yun",
}
# Full spelling of a syllable whose initial is Initial.w, keyed by final.
# Syllable.to_str falls back to "w" + the final's written form for other finals.
SYLLMAP_W = {
    Final.ua: "wa",
    Final.uo: "wo",
    Final.uai: "wai",
    Final.uei: "wei",
    Final.uan: "wan",
    Final.uen: "wen",
    Final.uang: "wang",
    Final.ueng: "weng",
}
|
|
158
|
+
# The three tables below drive Syllable.need_sep, which only consults them when
# the following syllable has no real initial (missing / nul / unspec / M).

# Finals of the PRECEDING syllable that always make need_sep return True.
SYLL1SEP: set[Final] = {
    Final.m,
    Final.n,
    Final.ng,
    Final.an,
    Final.ang,
    Final.en,
    Final.eng,
    Final.er,
    Final.hm,
    Final.hng,
    Final.i,
    Final.ri,
    Final.ii,
    Final.ian,
    Final.iang,
    Final.ien,
    Final.ieng,
    Final.iong,
    Final.ong,
    Final.uan,
    Final.uang,
    Final.uen,
    Final.ueng,
    Final.van,
    Final.ven,
}
# Finals of the FOLLOWING syllable that always make need_sep return True.
SYLL2SEP: set[Final] = {Final.m, Final.n, Final.ng, Final.eh}
# Pairwise cases: preceding final -> following finals for which need_sep
# returns True.
SYLLSEP: dict[Final, set[Final]] = {
    Final.a: {Final.o, Final.ong, Final.ou},
    Final.ia: {Final.o, Final.ong, Final.ou},
    Final.u: {Final.a, Final.ai, Final.an, Final.ang, Final.ao, Final.eng, Final.o, Final.ong, Final.ou},
    Final.v: {Final.an, Final.ang, Final.e, Final.ei, Final.en, Final.eng, Final.er, Final.o},
}
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
class ToneStyle(StrEnum):
    """Selects how ``Syllable.to_str`` writes the tone."""

    ABOVE = "above"  # diacritic placed on the tone-bearing letter
    RIGHT = "right"  # tone digit inserted right after the tone-bearing letter
    AFTER = "after"  # tone digit appended to the end of the syllable
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class IncompatibleWarning(UserWarning):
    """Warning category used by ``Syllable.to_str`` for ``ToneStyle.RIGHT`` output."""

    pass
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
class _SyllMeta(type):
    """Metaclass that lets pinyin strings be read as class attributes.

    An attribute that normal lookup does not find on the class is handed to
    ``parse_single``, so ``Syllable.zhong`` evaluates ``parse_single("zhong")``.
    """

    def __getattr__(self, name):
        """Return ``parse_single(name)``.

        Raises:
            AttributeError: if ``name`` is a dunder name or the parser rejects
                it with ``ValueError``.
        """
        # Dunder probes (``__wrapped__``, ``__isabstractmethod__`` ...) issued by
        # inspect / copy / pickle contain "_" characters, which no TOKENS key
        # uses, so they can never be pinyin. Reject them without running the
        # parser.
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(name)
        try:
            return parse_single(name)
        except ValueError as err:
            # Keep the parser's message reachable through __cause__.
            raise AttributeError(name) from err
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
class Syllable(metaclass=_SyllMeta):
|
|
213
|
+
@overload
|
|
214
|
+
def __init__(self, i: Initial = ..., f: Final = ..., t: Tone = ...): ...
|
|
215
|
+
|
|
216
|
+
@overload
|
|
217
|
+
def __init__(self, i: int): ...
|
|
218
|
+
|
|
219
|
+
@overload
|
|
220
|
+
def __init__(self, i: str): ...
|
|
221
|
+
|
|
222
|
+
def __init__(self, i: int | Initial | str = Initial.missing, f: Final = Final.missing, t: Tone = Tone.missing):
|
|
223
|
+
if isinstance(i, str):
|
|
224
|
+
if (f is not Final.missing) or (t is not Tone.missing):
|
|
225
|
+
raise ValueError("不能同时使用字符串初始化和声韵调初始化。")
|
|
226
|
+
sp = parse_single(i)
|
|
227
|
+
self.initial = sp.initial
|
|
228
|
+
self.final = sp.final
|
|
229
|
+
self.tone = sp.tone
|
|
230
|
+
elif isinstance(i, Initial):
|
|
231
|
+
self.initial = i
|
|
232
|
+
self.final = f
|
|
233
|
+
self.tone = t
|
|
234
|
+
else:
|
|
235
|
+
if (f is not Final.missing) or (t is not Tone.missing):
|
|
236
|
+
raise ValueError("不能同时使用uint16初始化和声韵调初始化。")
|
|
237
|
+
if i & ~0x3FFF:
|
|
238
|
+
raise ValueError("int中存在无效的位。使用整数初始化时仅接受高2位为0的uint16。")
|
|
239
|
+
try:
|
|
240
|
+
self.initial = Initial(i & 0x001F)
|
|
241
|
+
self.final = Final(i & 0x3F00)
|
|
242
|
+
self.tone = Tone(i & 0x00E0)
|
|
243
|
+
except ValueError:
|
|
244
|
+
raise ValueError("无效的uint16。")
|
|
245
|
+
|
|
246
|
+
def __int__(self):
|
|
247
|
+
return self.initial | self.final | self.tone
|
|
248
|
+
|
|
249
|
+
def __repr__(self):
|
|
250
|
+
return f"<{self.initial.name}·{self.final.name}·{self.tone.name.removeprefix("t")}{"" if self.is_valid() else "(N/E)"}>"
|
|
251
|
+
|
|
252
|
+
@cache
|
|
253
|
+
def to_str(self, tone_style: ToneStyle = ToneStyle.ABOVE, NO_INCOMPAT_WARNING: bool = False) -> str:
|
|
254
|
+
match self.initial:
|
|
255
|
+
case Initial.missing | Initial.unspec | Initial.nul:
|
|
256
|
+
initial_str = ""
|
|
257
|
+
case Initial.R:
|
|
258
|
+
initial_str = "r"
|
|
259
|
+
case Initial.H:
|
|
260
|
+
initial_str = "h"
|
|
261
|
+
case _:
|
|
262
|
+
initial_str = self.initial.name
|
|
263
|
+
|
|
264
|
+
if self.final in (Final.missing, Final.unspec, Final.nul):
|
|
265
|
+
final_str = ""
|
|
266
|
+
else:
|
|
267
|
+
final_str = FINAL2STR.get(self.final, self.final.name)
|
|
268
|
+
|
|
269
|
+
match self.initial:
|
|
270
|
+
case Initial.y:
|
|
271
|
+
base_str = SYLLMAP_Y.get(self.final, f"y{final_str}")
|
|
272
|
+
case Initial.w:
|
|
273
|
+
base_str = SYLLMAP_W.get(self.final, f"w{final_str}")
|
|
274
|
+
case Initial.j | Initial.q | Initial.x:
|
|
275
|
+
base_str = initial_str + final_str.replace("ü", "u")
|
|
276
|
+
case Initial.R if self.final == Final.ri:
|
|
277
|
+
base_str = "ri"
|
|
278
|
+
case Initial.H if self.final in (Final.hm, Final.hng):
|
|
279
|
+
base_str = final_str
|
|
280
|
+
case Initial.M if self.final in (Final.m, Final.n, Final.ng):
|
|
281
|
+
base_str = final_str
|
|
282
|
+
case _:
|
|
283
|
+
base_str = initial_str + final_str
|
|
284
|
+
|
|
285
|
+
if not base_str:
|
|
286
|
+
return ""
|
|
287
|
+
|
|
288
|
+
if tone_style == ToneStyle.AFTER:
|
|
289
|
+
return base_str + TONE2STR.get(self.tone, "")
|
|
290
|
+
|
|
291
|
+
if self.tone in (Tone.missing, Tone.unspec, Tone.nul):
|
|
292
|
+
return base_str
|
|
293
|
+
|
|
294
|
+
if tone_style == ToneStyle.ABOVE and self.tone == Tone.t5:
|
|
295
|
+
return base_str
|
|
296
|
+
|
|
297
|
+
toned = FINAL2TONED.get(self.tone, {})
|
|
298
|
+
|
|
299
|
+
for v in ["a", "ê", "e", "o"]:
|
|
300
|
+
if (pos := base_str.find(v)) != -1:
|
|
301
|
+
break
|
|
302
|
+
else:
|
|
303
|
+
for pos in range(len(base_str) - 1, -1, -1):
|
|
304
|
+
if base_str[pos] in {"i", "u", "ü"}:
|
|
305
|
+
break
|
|
306
|
+
else:
|
|
307
|
+
for v in ["n", "m"]:
|
|
308
|
+
if (pos := base_str.find(v)) != -1:
|
|
309
|
+
break
|
|
310
|
+
else:
|
|
311
|
+
return base_str
|
|
312
|
+
|
|
313
|
+
if tone_style == ToneStyle.ABOVE:
|
|
314
|
+
if base_str[pos] in toned:
|
|
315
|
+
return f"{base_str[:pos]}{toned[base_str[pos]]}{base_str[pos+1:]}"
|
|
316
|
+
return base_str
|
|
317
|
+
elif tone_style == ToneStyle.RIGHT:
|
|
318
|
+
if not NO_INCOMPAT_WARNING:
|
|
319
|
+
warn(
|
|
320
|
+
"注意:使用RIGHT(数字附标)模式产出的拼音字符串并不通用,且不再能被本解析器解析。如果你明确知道你在做什么,可以传入NO_INCOMPAT_WARNING=True以关闭此警告。",
|
|
321
|
+
IncompatibleWarning,
|
|
322
|
+
)
|
|
323
|
+
return f"{base_str[:pos+1]}{TONE2STR.get(self.tone, '')}{base_str[pos+1:]}"
|
|
324
|
+
|
|
325
|
+
return base_str
|
|
326
|
+
|
|
327
|
+
def __str__(self):
|
|
328
|
+
return self.to_str()
|
|
329
|
+
|
|
330
|
+
def __bool__(self):
|
|
331
|
+
return bool(self.initial or self.final or self.tone)
|
|
332
|
+
|
|
333
|
+
def __eq__(self, other):
|
|
334
|
+
return self.initial == other.initial and self.final == other.final and self.tone == other.tone
|
|
335
|
+
|
|
336
|
+
__hash__ = __int__
|
|
337
|
+
|
|
338
|
+
def is_complete(self):
|
|
339
|
+
return bool(self.initial and self.final and self.tone)
|
|
340
|
+
|
|
341
|
+
def need_sep(self, prev: Syllable) -> bool:
|
|
342
|
+
if self.initial and (self.initial not in {Initial.nul, Initial.unspec, Initial.M}):
|
|
343
|
+
return False # 有有效声母隔着则必不需要
|
|
344
|
+
|
|
345
|
+
if prev.final in SYLL1SEP or self.final in SYLL2SEP:
|
|
346
|
+
return True
|
|
347
|
+
|
|
348
|
+
return prev.final in SYLLSEP and self.final in SYLLSEP[prev.final]
|
|
349
|
+
|
|
350
|
+
def is_valid(self):
|
|
351
|
+
return _check_syllable_valid(int(self))
|
|
352
|
+
|
|
353
|
+
def copy(self):
|
|
354
|
+
return Syllable(self.initial, self.final, self.tone)
|
|
355
|
+
|
|
356
|
+
def _mreject(self, other: Syllable, check_initial: bool = False):
|
|
357
|
+
return bool(
|
|
358
|
+
(self.initial and other.initial)
|
|
359
|
+
or (self.tone and other.tone)
|
|
360
|
+
or ((self.final or self.tone) and (other.initial or other.final))
|
|
361
|
+
or (
|
|
362
|
+
check_initial
|
|
363
|
+
and (self.final or other.final or ((self.initial or other.initial) and (self.tone or other.tone)))
|
|
364
|
+
and ((self.initial == Initial.nul) or (not self.initial))
|
|
365
|
+
and ((other.initial == Initial.nul) or (not other.initial))
|
|
366
|
+
)
|
|
367
|
+
)
|
|
368
|
+
# 化简自(s1 and o1)or(s2 and o2)or(s3 and o3)or(s2 and o1)or(s3 and o1)or(s3 and o2)or(check_initial and(s1 or o1 or s2 or o2)and(s2 or o2 or s3 or o3)and((self.initial==Initial.nul)or(not s1))and((other.initial==Initial.nul)or(not o1)))
|
|
369
|
+
# 其中s1=self.initial!=Initial.missing s2=self.final!=Final.missing s3=self.tone!=Tone.missing o1=other.initial!=Initial.missing o2=other.final!=Final.missing o3=other.tone!=Tone.missing
|
|
370
|
+
|
|
371
|
+
def _merge(self, other: Syllable):
|
|
372
|
+
if other.initial != Initial.missing:
|
|
373
|
+
self.initial = other.initial
|
|
374
|
+
if other.final != Final.missing:
|
|
375
|
+
self.final = other.final
|
|
376
|
+
if other.tone != Tone.missing:
|
|
377
|
+
self.tone = other.tone
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
TOKENS = {
|
|
381
|
+
"iang": [0x1200],
|
|
382
|
+
"iāng": [0x1260], # 江
|
|
383
|
+
"iáng": [0x1280], # 凉
|
|
384
|
+
"iǎng": [0x12A0], # 抢
|
|
385
|
+
"iàng": [0x12C0], # 呛
|
|
386
|
+
"iong": [0x1800],
|
|
387
|
+
"iōng": [0x1860], # 凶
|
|
388
|
+
"ióng": [0x1880], # 穷
|
|
389
|
+
"iǒng": [0x18A0], # 涌
|
|
390
|
+
"iòng": [0x18C0], # 用
|
|
391
|
+
"uang": [0x2300],
|
|
392
|
+
"uāng": [0x2360], # 光
|
|
393
|
+
"uáng": [0x2380], # 狂
|
|
394
|
+
"uǎng": [0x23A0], # 广
|
|
395
|
+
"uàng": [0x23C0], # 旷
|
|
396
|
+
"ueng": [0x2600],
|
|
397
|
+
"uēng": [0x2660], # 翁
|
|
398
|
+
"uéng": [0x2680], # 不存在,然而拒绝解析有违直觉,因而保留,下同
|
|
399
|
+
"uěng": [0x26A0], # 塕
|
|
400
|
+
"uèng": [0x26C0], # 瓮
|
|
401
|
+
"juan": [0x290C],
|
|
402
|
+
"juān": [0x296C], # 捐
|
|
403
|
+
"juán": [0x298C], # 不存在
|
|
404
|
+
"juǎn": [0x29AC], # 卷
|
|
405
|
+
"juàn": [0x29CC], # 倦
|
|
406
|
+
"quan": [0x2912],
|
|
407
|
+
"quān": [0x2972], # 圈
|
|
408
|
+
"quán": [0x2992], # 全
|
|
409
|
+
"quǎn": [0x29B2], # 犬
|
|
410
|
+
"quàn": [0x29D2], # 劝
|
|
411
|
+
"xuan": [0x2918],
|
|
412
|
+
"xuān": [0x2978], # 宣
|
|
413
|
+
"xuán": [0x2998], # 悬
|
|
414
|
+
"xuǎn": [0x29B8], # 选
|
|
415
|
+
"xuàn": [0x29D8], # 炫
|
|
416
|
+
"yang": [0x1219],
|
|
417
|
+
"yāng": [0x1279], # 央
|
|
418
|
+
"yáng": [0x1299], # 阳
|
|
419
|
+
"yǎng": [0x12B9], # 养
|
|
420
|
+
"yàng": [0x12D9], # 样
|
|
421
|
+
"ying": [0x1719],
|
|
422
|
+
"yīng": [0x1779], # 英
|
|
423
|
+
"yíng": [0x1799], # 赢
|
|
424
|
+
"yǐng": [0x17B9], # 影
|
|
425
|
+
"yìng": [0x17D9], # 映
|
|
426
|
+
"yong": [0x1819],
|
|
427
|
+
"yōng": [0x1879], # 拥
|
|
428
|
+
"yóng": [0x1899], # 颙
|
|
429
|
+
"yǒng": [0x18B9], # 泳
|
|
430
|
+
"yòng": [0x18D9], # 用
|
|
431
|
+
"wang": [0x2317],
|
|
432
|
+
"wāng": [0x2377], # 汪
|
|
433
|
+
"wáng": [0x2397], # 王
|
|
434
|
+
"wǎng": [0x23B7], # 网
|
|
435
|
+
"wàng": [0x23D7], # 忘
|
|
436
|
+
"weng": [0x2617],
|
|
437
|
+
"wēng": [0x2677], # 翁
|
|
438
|
+
"wéng": [0x2697], # 不存在
|
|
439
|
+
"wěng": [0x26B7], # 塕
|
|
440
|
+
"wèng": [0x26D7], # 瓮
|
|
441
|
+
"yuan": [0x2919],
|
|
442
|
+
"yuān": [0x2979], # 冤
|
|
443
|
+
"yuán": [0x2999], # 圆
|
|
444
|
+
"yuǎn": [0x29B9], # 远
|
|
445
|
+
"yuàn": [0x29D9], # 怨
|
|
446
|
+
"ang": [0x0602, 0x0600],
|
|
447
|
+
"āng": [0x0662, 0x0660], # 刚 肮
|
|
448
|
+
"áng": [0x0682, 0x0680], # 扛 昂
|
|
449
|
+
"ǎng": [0x06A2, 0x06A0], # 莽 䇦
|
|
450
|
+
"àng": [0x06C2, 0x06C0], # 抗 盎
|
|
451
|
+
"eng": [0x0B02, 0x0B00],
|
|
452
|
+
"ēng": [0x0B62, 0x0B60], # 庚 鞥
|
|
453
|
+
"éng": [0x0B82, 0x0B80], # 横 不存在
|
|
454
|
+
"ěng": [0x0BA2, 0x0BA0], # 冷 不存在
|
|
455
|
+
"èng": [0x0BC2, 0x0BC0], # 赠 不存在
|
|
456
|
+
"ian": [0x1100],
|
|
457
|
+
"iān": [0x1160], # 先
|
|
458
|
+
"ián": [0x1180], # 咸
|
|
459
|
+
"iǎn": [0x11A0], # 显
|
|
460
|
+
"iàn": [0x11C0], # 现
|
|
461
|
+
"iao": [0x1300],
|
|
462
|
+
"iāo": [0x1360], # 交
|
|
463
|
+
"iáo": [0x1380], # 嚼
|
|
464
|
+
"iǎo": [0x13A0], # 缴
|
|
465
|
+
"iào": [0x13C0], # 叫
|
|
466
|
+
"ing": [0x1700],
|
|
467
|
+
"īng": [0x1760], # 星
|
|
468
|
+
"íng": [0x1780], # 形
|
|
469
|
+
"ǐng": [0x17A0], # 醒
|
|
470
|
+
"ìng": [0x17C0], # 幸
|
|
471
|
+
"ong": [0x1C00],
|
|
472
|
+
"ōng": [0x1C60], # 东
|
|
473
|
+
"óng": [0x1C80], # 龙
|
|
474
|
+
"ǒng": [0x1CA0], # 拢
|
|
475
|
+
"òng": [0x1CC0], # 冻
|
|
476
|
+
"uai": [0x2100],
|
|
477
|
+
"uāi": [0x2160], # 乖
|
|
478
|
+
"uái": [0x2180], # 淮
|
|
479
|
+
"uǎi": [0x21A0], # 拐
|
|
480
|
+
"uài": [0x21C0], # 坏
|
|
481
|
+
"uan": [0x2200],
|
|
482
|
+
"uān": [0x2260], # 欢
|
|
483
|
+
"uán": [0x2280], # 环
|
|
484
|
+
"uǎn": [0x22A0], # 缓
|
|
485
|
+
"uàn": [0x22C0], # 换
|
|
486
|
+
"van": [0x2900],
|
|
487
|
+
"vān": [0x2960], # 捐
|
|
488
|
+
"ván": [0x2980], # 全
|
|
489
|
+
"vǎn": [0x29A0], # 犬
|
|
490
|
+
"vàn": [0x29C0], # 倦
|
|
491
|
+
"üan": [0x2900],
|
|
492
|
+
"üān": [0x2960],
|
|
493
|
+
"üán": [0x2980],
|
|
494
|
+
"üǎn": [0x29A0],
|
|
495
|
+
"üàn": [0x29C0],
|
|
496
|
+
"hng": [0x0EE3], # 哼
|
|
497
|
+
"zhi": [0x1E1B], # [ʅ]
|
|
498
|
+
"zhī": [0x1E7B], # 支
|
|
499
|
+
"zhí": [0x1E9B], # 直
|
|
500
|
+
"zhǐ": [0x1EBB], # 纸
|
|
501
|
+
"zhì": [0x1EDB], # 至
|
|
502
|
+
"chi": [0x1E07], # [ʅ]
|
|
503
|
+
"chī": [0x1E67], # 吃
|
|
504
|
+
"chí": [0x1E87], # 持
|
|
505
|
+
"chǐ": [0x1EA7], # 尺
|
|
506
|
+
"chì": [0x1EC7], # 赤
|
|
507
|
+
"shi": [0x1E15], # [ʅ]
|
|
508
|
+
"shī": [0x1E75], # 诗
|
|
509
|
+
"shí": [0x1E95], # 石
|
|
510
|
+
"shǐ": [0x1EB5], # 史
|
|
511
|
+
"shì": [0x1ED5], # 事
|
|
512
|
+
"jue": [0x2A0C],
|
|
513
|
+
"juē": [0x2A6C], # 撅
|
|
514
|
+
"jué": [0x2A8C], # 绝
|
|
515
|
+
"juě": [0x2AAC], # 蹶
|
|
516
|
+
"juè": [0x2ACC], # 倔
|
|
517
|
+
"que": [0x2A12],
|
|
518
|
+
"quē": [0x2A72], # 缺
|
|
519
|
+
"qué": [0x2A92], # 瘸
|
|
520
|
+
"quě": [0x2AB2], # 不存在
|
|
521
|
+
"què": [0x2AD2], # 雀
|
|
522
|
+
"xue": [0x2A18],
|
|
523
|
+
"xuē": [0x2A78], # 薛
|
|
524
|
+
"xué": [0x2A98], # 学
|
|
525
|
+
"xuě": [0x2AB8], # 雪
|
|
526
|
+
"xuè": [0x2AD8], # 谑
|
|
527
|
+
"jun": [0x2B0C],
|
|
528
|
+
"jūn": [0x2B6C], # 君
|
|
529
|
+
"jún": [0x2B8C], # 不存在
|
|
530
|
+
"jǔn": [0x2BAC], # 𢉦(RD广军)
|
|
531
|
+
"jùn": [0x2BCC], # 郡
|
|
532
|
+
"qun": [0x2B12],
|
|
533
|
+
"qūn": [0x2B72], # 逡
|
|
534
|
+
"qún": [0x2B92], # 群
|
|
535
|
+
"qǔn": [0x2BB2], # 䊎
|
|
536
|
+
"qùn": [0x2BD2], # 不存在
|
|
537
|
+
"xun": [0x2B18],
|
|
538
|
+
"xūn": [0x2B78], # 勋
|
|
539
|
+
"xún": [0x2B98], # 寻
|
|
540
|
+
"xǔn": [0x2BB8], # 不存在
|
|
541
|
+
"xùn": [0x2BD8], # 巽
|
|
542
|
+
"yao": [0x1319],
|
|
543
|
+
"yāo": [0x1379], # 邀
|
|
544
|
+
"yáo": [0x1399], # 摇
|
|
545
|
+
"yǎo": [0x13B9], # 咬
|
|
546
|
+
"yào": [0x13D9], # 药
|
|
547
|
+
"you": [0x1919],
|
|
548
|
+
"yōu": [0x1979], # 优
|
|
549
|
+
"yóu": [0x1999], # 游
|
|
550
|
+
"yǒu": [0x19B9], # 有
|
|
551
|
+
"yòu": [0x19D9], # 右
|
|
552
|
+
"yan": [0x1119],
|
|
553
|
+
"yān": [0x1179], # 烟
|
|
554
|
+
"yán": [0x1199], # 盐
|
|
555
|
+
"yǎn": [0x11B9], # 眼
|
|
556
|
+
"yàn": [0x11D9], # 验
|
|
557
|
+
"yin": [0x1519],
|
|
558
|
+
"yīn": [0x1579], # 阴
|
|
559
|
+
"yín": [0x1599], # 银
|
|
560
|
+
"yǐn": [0x15B9], # 饮
|
|
561
|
+
"yìn": [0x15D9], # 印
|
|
562
|
+
"wai": [0x2117],
|
|
563
|
+
"wāi": [0x2177], # 歪
|
|
564
|
+
"wái": [0x2197], # 不存在
|
|
565
|
+
"wǎi": [0x21B7], # 𨂿
|
|
566
|
+
"wài": [0x21D7], # 外
|
|
567
|
+
"wei": [0x2417],
|
|
568
|
+
"wēi": [0x2477], # 威
|
|
569
|
+
"wéi": [0x2497], # 维
|
|
570
|
+
"wěi": [0x24B7], # 尾
|
|
571
|
+
"wèi": [0x24D7], # 味
|
|
572
|
+
"wan": [0x2217],
|
|
573
|
+
"wān": [0x2277], # 弯
|
|
574
|
+
"wán": [0x2297], # 完
|
|
575
|
+
"wǎn": [0x22B7], # 碗
|
|
576
|
+
"wàn": [0x22D7], # 万
|
|
577
|
+
"wen": [0x2517],
|
|
578
|
+
"wēn": [0x2577], # 温
|
|
579
|
+
"wén": [0x2597], # 文
|
|
580
|
+
"wěn": [0x25B7], # 稳
|
|
581
|
+
"wèn": [0x25D7], # 问
|
|
582
|
+
"yue": [0x2A19],
|
|
583
|
+
"yuē": [0x2A79], # 约
|
|
584
|
+
"yué": [0x2A99], # 块(音yué义不详,但字统有记载因而算进来了)
|
|
585
|
+
"yuě": [0x2AB9], # 哕
|
|
586
|
+
"yuè": [0x2AD9], # 月
|
|
587
|
+
"yun": [0x2B19],
|
|
588
|
+
"yūn": [0x2B79], # 晕
|
|
589
|
+
"yún": [0x2B99], # 云
|
|
590
|
+
"yǔn": [0x2BB9], # 允
|
|
591
|
+
"yùn": [0x2BD9], # 韵
|
|
592
|
+
"zi": [0x161A],
|
|
593
|
+
"zī": [0x167A], # 兹
|
|
594
|
+
"zí": [0x169A], # 不存在
|
|
595
|
+
"zǐ": [0x16BA], # 紫
|
|
596
|
+
"zì": [0x16DA], # 字
|
|
597
|
+
"ci": [0x1606],
|
|
598
|
+
"cī": [0x1666], # 呲
|
|
599
|
+
"cí": [0x1686], # 词
|
|
600
|
+
"cǐ": [0x16A6], # 此
|
|
601
|
+
"cì": [0x16C6], # 次
|
|
602
|
+
"si": [0x1614],
|
|
603
|
+
"sī": [0x1674], # 丝
|
|
604
|
+
"sí": [0x1694], # 不存在
|
|
605
|
+
"sǐ": [0x16B4], # 死
|
|
606
|
+
"sì": [0x16D4], # 四
|
|
607
|
+
"ri": [0x1E04, 0x1E00], # [ʐ]/[ʅ]
|
|
608
|
+
"rī": [0x1E64, 0x1E60], # 痴 不存在
|
|
609
|
+
"rí": [0x1E84, 0x1E80], # 迟 不存在
|
|
610
|
+
"rǐ": [0x1EA4, 0x1EA0], # 齿 不存在
|
|
611
|
+
"rì": [0x1EC4, 0x1EC0], # 斥 日
|
|
612
|
+
"hm": [0x0DE3],
|
|
613
|
+
"ai": [0x0402, 0x0400],
|
|
614
|
+
"āi": [0x0462, 0x0460], # 该 挨
|
|
615
|
+
"ái": [0x0482, 0x0480], # 孩 皑
|
|
616
|
+
"ǎi": [0x04A2, 0x04A0], # 改 矮
|
|
617
|
+
"ài": [0x04C2, 0x04C0], # 骇 爱
|
|
618
|
+
"an": [0x0502, 0x0500],
|
|
619
|
+
"ān": [0x0562, 0x0560], # 潘 安
|
|
620
|
+
"án": [0x0582, 0x0580], # 盘 儑
|
|
621
|
+
"ǎn": [0x05A2, 0x05A0], # 懒 俺
|
|
622
|
+
"àn": [0x05C2, 0x05C0], # 烂 暗
|
|
623
|
+
"ao": [0x0702, 0x0700],
|
|
624
|
+
"āo": [0x0762, 0x0760], # 高 凹
|
|
625
|
+
"áo": [0x0782, 0x0780], # 豪 熬
|
|
626
|
+
"ǎo": [0x07A2, 0x07A0], # 好 拗
|
|
627
|
+
"ào": [0x07C2, 0x07C0], # 告 傲
|
|
628
|
+
"ei": [0x0902, 0x0900],
|
|
629
|
+
"ēi": [0x0962, 0x0960], # 飞 不存在(欸等字在eh)
|
|
630
|
+
"éi": [0x0982, 0x0980], # 肥 不存在
|
|
631
|
+
"ěi": [0x09A2, 0x09A0], # 匪 不存在
|
|
632
|
+
"èi": [0x09C2, 0x09C0], # 费 不存在
|
|
633
|
+
"en": [0x0A02, 0x0A00],
|
|
634
|
+
"ēn": [0x0A62, 0x0A60], # 奔 恩
|
|
635
|
+
"én": [0x0A82, 0x0A80], # 盆 不存在
|
|
636
|
+
"ěn": [0x0AA2, 0x0AA0], # 本 不存在
|
|
637
|
+
"èn": [0x0AC2, 0x0AC0], # 笨 摁
|
|
638
|
+
"er": [0x0C02, 0x0C00],
|
|
639
|
+
"ēr": [0x0C62, 0x0C60], # 不存在 不存在
|
|
640
|
+
"ér": [0x0C82, 0x0C80], # 不存在 儿
|
|
641
|
+
"ěr": [0x0CA2, 0x0CA0], # 不存在 尔
|
|
642
|
+
"èr": [0x0CC2, 0x0CC0], # 不存在 佴
|
|
643
|
+
"ia": [0x1000],
|
|
644
|
+
"iā": [0x1060], # 家
|
|
645
|
+
"iá": [0x1080], # 夹
|
|
646
|
+
"iǎ": [0x10A0], # 贾
|
|
647
|
+
"ià": [0x10C0], # 架
|
|
648
|
+
"ie": [0x1400],
|
|
649
|
+
"iē": [0x1460], # 街
|
|
650
|
+
"ié": [0x1480], # 截
|
|
651
|
+
"iě": [0x14A0], # 解
|
|
652
|
+
"iè": [0x14C0], # 借
|
|
653
|
+
"ii": [0x1600], # [z]/[ɿ]
|
|
654
|
+
"in": [0x1500],
|
|
655
|
+
"īn": [0x1560], # 侵
|
|
656
|
+
"ín": [0x1580], # 琴
|
|
657
|
+
"ǐn": [0x15A0], # 寝
|
|
658
|
+
"ìn": [0x15C0], # 沁
|
|
659
|
+
"iu": [0x1900],
|
|
660
|
+
"iū": [0x1960], # 秋
|
|
661
|
+
"iú": [0x1980], # 求
|
|
662
|
+
"iǔ": [0x19A0], # 朽
|
|
663
|
+
"iù": [0x19C0], # 锈
|
|
664
|
+
"ng": [0x1A1C], # [ŋ̊],仅见于唔、嗯二字
|
|
665
|
+
"n̄g": [0x1A7C], # 然而还是为了不违反直觉,在这插一个n1g
|
|
666
|
+
"ńg": [0x1A9C],
|
|
667
|
+
"ňg": [0x1ABC],
|
|
668
|
+
"ǹg": [0x1ADC], # n+macron没有单字符表示,且n1g不存在,
|
|
669
|
+
"ou": [0x1D02, 0x1D00],
|
|
670
|
+
"ōu": [0x1D62, 0x1D60], # 沟 欧
|
|
671
|
+
"óu": [0x1D82, 0x1D80], # 楼 吽
|
|
672
|
+
"ǒu": [0x1DA2, 0x1DA0], # 篓 偶
|
|
673
|
+
"òu": [0x1DC2, 0x1DC0], # 够 沤
|
|
674
|
+
"ua": [0x2000],
|
|
675
|
+
"uā": [0x2060], # 花
|
|
676
|
+
"uá": [0x2080], # 滑
|
|
677
|
+
"uǎ": [0x20A0], # 垮
|
|
678
|
+
"uà": [0x20C0], # 跨
|
|
679
|
+
"ui": [0x2400],
|
|
680
|
+
"uī": [0x2460], # 灰
|
|
681
|
+
"uí": [0x2480], # 回
|
|
682
|
+
"uǐ": [0x24A0], # 毁
|
|
683
|
+
"uì": [0x24C0], # 会
|
|
684
|
+
"un": [0x2500],
|
|
685
|
+
"ūn": [0x2560], # 昆
|
|
686
|
+
"ún": [0x2580], # 仑
|
|
687
|
+
"ǔn": [0x25A0], # 捆
|
|
688
|
+
"ùn": [0x25C0], # 论
|
|
689
|
+
"uo": [0x2700],
|
|
690
|
+
"uō": [0x2760], # 锅
|
|
691
|
+
"uó": [0x2780], # 活
|
|
692
|
+
"uǒ": [0x27A0], # 火
|
|
693
|
+
"uò": [0x27C0], # 过
|
|
694
|
+
"ve": [0x2A00],
|
|
695
|
+
"vē": [0x2A60], # 薛
|
|
696
|
+
"vé": [0x2A80], # 学
|
|
697
|
+
"vě": [0x2AA0], # 雪
|
|
698
|
+
"vè": [0x2AC0], # 谑
|
|
699
|
+
"üe": [0x2A00],
|
|
700
|
+
"üē": [0x2A60],
|
|
701
|
+
"üé": [0x2A80],
|
|
702
|
+
"üě": [0x2AA0],
|
|
703
|
+
"üè": [0x2AC0],
|
|
704
|
+
"vn": [0x2B00],
|
|
705
|
+
"ün": [0x2B00],
|
|
706
|
+
"ǖn": [0x2B60], # 逡
|
|
707
|
+
"ǘn": [0x2B80], # 群
|
|
708
|
+
"ǚn": [0x2BA0], # 允
|
|
709
|
+
"ǜn": [0x2BC0], # 孕
|
|
710
|
+
"ju": [0x280C],
|
|
711
|
+
"jū": [0x286C], # 居
|
|
712
|
+
"jú": [0x288C], # 局
|
|
713
|
+
"jǔ": [0x28AC], # 举
|
|
714
|
+
"jù": [0x28CC], # 句
|
|
715
|
+
"qu": [0x2812],
|
|
716
|
+
"qū": [0x2872], # 区
|
|
717
|
+
"qú": [0x2892], # 渠
|
|
718
|
+
"qǔ": [0x28B2], # 取
|
|
719
|
+
"qù": [0x28D2], # 去
|
|
720
|
+
"xu": [0x2818],
|
|
721
|
+
"xū": [0x2878], # 需
|
|
722
|
+
"xú": [0x2898], # 徐
|
|
723
|
+
"xǔ": [0x28B8], # 许
|
|
724
|
+
"xù": [0x28D8], # 序
|
|
725
|
+
"yi": [0x0F19],
|
|
726
|
+
"yī": [0x0F79], # 一
|
|
727
|
+
"yí": [0x0F99], # 疑
|
|
728
|
+
"yǐ": [0x0FB9], # 以
|
|
729
|
+
"yì": [0x0FD9], # 忆
|
|
730
|
+
"ya": [0x1019],
|
|
731
|
+
"yā": [0x1079], # 压
|
|
732
|
+
"yá": [0x1099], # 牙
|
|
733
|
+
"yǎ": [0x10B9], # 雅
|
|
734
|
+
"yà": [0x10D9], # 亚
|
|
735
|
+
"ye": [0x1419],
|
|
736
|
+
"yē": [0x1479], # 噎
|
|
737
|
+
"yé": [0x1499], # 爷
|
|
738
|
+
"yě": [0x14B9], # 野
|
|
739
|
+
"yè": [0x14D9], # 页
|
|
740
|
+
"wu": [0x1F17],
|
|
741
|
+
"wū": [0x1F77], # 屋
|
|
742
|
+
"wú": [0x1F97], # 无
|
|
743
|
+
"wǔ": [0x1FB7], # 舞
|
|
744
|
+
"wù": [0x1FD7], # 物
|
|
745
|
+
"wa": [0x2017],
|
|
746
|
+
"wā": [0x2077], # 洼
|
|
747
|
+
"wá": [0x2097], # 娃
|
|
748
|
+
"wǎ": [0x20B7], # 瓦
|
|
749
|
+
"wà": [0x20D7], # 袜
|
|
750
|
+
"wo": [0x2717],
|
|
751
|
+
"wō": [0x2777], # 窝
|
|
752
|
+
"wó": [0x2797], # 不存在
|
|
753
|
+
"wǒ": [0x27B7], # 我
|
|
754
|
+
"wò": [0x27D7], # 卧
|
|
755
|
+
"yu": [0x2819],
|
|
756
|
+
"yū": [0x2879], # 淤
|
|
757
|
+
"yú": [0x2899], # 于
|
|
758
|
+
"yǔ": [0x28B9], # 与
|
|
759
|
+
"yù": [0x28D9], # 玉
|
|
760
|
+
"zh": [0x001B],
|
|
761
|
+
"ch": [0x0007],
|
|
762
|
+
"sh": [0x0015],
|
|
763
|
+
"ê": [0x2E00],
|
|
764
|
+
"ê̄": [0x2E60],
|
|
765
|
+
"ế": [0x2E80],
|
|
766
|
+
"ê̌": [0x2EA0],
|
|
767
|
+
"ề": [0x2EC0], # U+1EC1 一、二、三声没有结合形式,只能用组合字符;四声有结合形式
|
|
768
|
+
"a": [0x0302, 0x0300],
|
|
769
|
+
"ā": [0x0362, 0x0360], # 妈 啊
|
|
770
|
+
"á": [0x0382, 0x0380], # 麻 啊
|
|
771
|
+
"ǎ": [0x03A2, 0x03A0], # 马 啊
|
|
772
|
+
"à": [0x03C2, 0x03C0], # 骂 啊
|
|
773
|
+
"e": [0x0802, 0x0800],
|
|
774
|
+
"ē": [0x0862, 0x0860], # 歌 婀
|
|
775
|
+
"é": [0x0882, 0x0880], # 隔 俄
|
|
776
|
+
"ě": [0x08A2, 0x08A0], # 舸 𫫇
|
|
777
|
+
"è": [0x08C2, 0x08C0], # 各 恶
|
|
778
|
+
"i": [0x0F00], # [i]
|
|
779
|
+
"ī": [0x0F60], # 机
|
|
780
|
+
"í": [0x0F80], # 急
|
|
781
|
+
"ǐ": [0x0FA0], # 挤
|
|
782
|
+
"ì": [0x0FC0], # 记
|
|
783
|
+
"o": [0x1B02, 0x1B00], # 咯、哦
|
|
784
|
+
"ō": [0x1B62, 0x1B60], # 此四音是否统合到uo尚有待商榷,暂定为不统合
|
|
785
|
+
"ó": [0x1B82, 0x1B80],
|
|
786
|
+
"ǒ": [0x1BA2, 0x1BA0],
|
|
787
|
+
"ò": [0x1BC2, 0x1BC0],
|
|
788
|
+
"u": [0x1F00],
|
|
789
|
+
"ū": [0x1F60], # 孤
|
|
790
|
+
"ú": [0x1F80], # 湖
|
|
791
|
+
"ǔ": [0x1FA0], # 虎
|
|
792
|
+
"ù": [0x1FC0], # 固
|
|
793
|
+
"v": [0x2800],
|
|
794
|
+
"ü": [0x2800],
|
|
795
|
+
"ǖ": [0x2860], # 屈
|
|
796
|
+
"ǘ": [0x2880], # 渠
|
|
797
|
+
"ǚ": [0x28A0], # 取
|
|
798
|
+
"ǜ": [0x28C0], # 去
|
|
799
|
+
"b": [0x0005],
|
|
800
|
+
"p": [0x0011],
|
|
801
|
+
"m": [0x000F], # 也可为韵母m[m̥],仅见于呒、呣二字
|
|
802
|
+
"f": [0x0009],
|
|
803
|
+
"d": [0x0008],
|
|
804
|
+
"t": [0x0016],
|
|
805
|
+
"n": [0x0010], # 也可为韵母n[n̥]/[ɰ̃],仅见于唔、嗯二字
|
|
806
|
+
"l": [0x000E],
|
|
807
|
+
"g": [0x000A],
|
|
808
|
+
"h": [0x000B],
|
|
809
|
+
"j": [0x000C],
|
|
810
|
+
"k": [0x000D],
|
|
811
|
+
"q": [0x0012],
|
|
812
|
+
"x": [0x0018],
|
|
813
|
+
"r": [0x0013],
|
|
814
|
+
"z": [0x001A],
|
|
815
|
+
"c": [0x0006],
|
|
816
|
+
"s": [0x0014],
|
|
817
|
+
"y": [0x0019], # 伪声母y
|
|
818
|
+
"w": [0x0017], # 伪声母w
|
|
819
|
+
"1": [0x0060],
|
|
820
|
+
"2": [0x0080],
|
|
821
|
+
"3": [0x00A0],
|
|
822
|
+
"4": [0x00C0],
|
|
823
|
+
"5": [0x00E0], # 轻声
|
|
824
|
+
"̄": [0x0060], # ISO 7098:2015 7.1节
|
|
825
|
+
"́": [0x0080],
|
|
826
|
+
"̌": [0x00A0],
|
|
827
|
+
"̀": [0x00C0],
|
|
828
|
+
# 以下按需启用
|
|
829
|
+
# "?": [0x0001, 0x0100, 0x0020],
|
|
830
|
+
# ".": [0x0121], # 通配
|
|
831
|
+
# "*": [0x0001, 0x0100], # 声母韵母通配
|
|
832
|
+
# "0": [0x0020], # 声调通配
|
|
833
|
+
# "/": [0x0002], # 零声母
|
|
834
|
+
# "&": [0x0004], # 伪声母R
|
|
835
|
+
"n̄": [0x2D7C], # 多字符,不存在
|
|
836
|
+
"ń": [0x2D9C],
|
|
837
|
+
"ň": [0x2DBC],
|
|
838
|
+
"ǹ": [0x2DDC],
|
|
839
|
+
"n1": [0x2D7C], # 不存在
|
|
840
|
+
"n2": [0x2D9C],
|
|
841
|
+
"n3": [0x2DBC],
|
|
842
|
+
"n4": [0x2DDC],
|
|
843
|
+
"m̄": [0x2C7C], # 多字符,不存在
|
|
844
|
+
"ḿ": [0x2C9C], # 只有ḿ有单字符表示
|
|
845
|
+
"m̌": [0x2CBC], # 多字符,不存在
|
|
846
|
+
"m̀": [0x2CDC], # 多字符
|
|
847
|
+
"m1": [0x2C7C], # 不存在
|
|
848
|
+
"m2": [0x2C9C],
|
|
849
|
+
"m3": [0x2CBC], # 不存在
|
|
850
|
+
"m4": [0x2CDC],
|
|
851
|
+
}
|
|
852
|
+
# Every character that occurs in at least one TOKENS key.
VALID_CHARS = set(chain.from_iterable(TOKENS))
# Default input filter: matches strings made only of characters in VALID_CHARS
# (including the empty string).
VALID_CHARS_RE_DEFAULT = re.compile("[" + re.escape("".join(VALID_CHARS)) + "]*")
|
|
854
|
+
VALID_SYLLABLES = base64.a85decode(
|
|
855
|
+
b'q>2-6l15_oq>2-6q=>R.a`&4$zzzH&EXE3V!^_H2AQ`H2AQ`84!jfzzzq<K"&Z18e8q>238q>238#Qb,0zzzq>238Z2,(5q>238q>238i(sa^zzzH2AQ`H1DjUH2AWbH2AWb!"],3zzz@64l!@7pq0+U\\VY0j>\\$6$X"DzzzadNb@Ja`pGamf?2RIC1X#S6t9zzz\\Z6,@YbMO,WJ_5gl2(ei+<UXbzzzq7@[M\\FLO<kkPJdq"#O/#Tsrazzzz!<<*"!<<*"!<<*"!<<*"zzzzzzz!WW3#zzzzzzz!WW3#zzz8G(:08G(:08G(:08G(:05S1a3zzz!(=X\'!!L+<5SV$7!!L+<!!IfPzzz8ArmU5k*/$8G(:08G(:0#R!Enzzz!!L+<#Z/>;!*$c7#ZSV?!"aY\\zzz8@6bE!4:,R8G(:0aRmj[!(9W`zzz8G(:089E5Z8CZ#e8G(:0!&T3+zzz#aE.*!:[f)#kYq58<gpj#[kFJzzz&-)h6&-)h6&-)h6&-)h6!!!-&zzz8<hL%!:\\A98G(:08G(:0!&QtAzzz!!L+<!!\'h8!!L(;!!L+<zzzz5ZGQ"!*$c7!-H$W!-H$W!!IfPzzzz!!!!%!!!!%!!!!%zzzz$(ueKM4ahK$(q7u$(q7u$*\\p[zzzE#TPp0N&YdE*F([GZtpc!!*-%zzzn[fbCZ18e7nbX@0EVgdZ!>$(Jzzz+92ZK+92ZK+92ZK,QJ)O+92ZKzzzpmXAJq"m5Wq"m5Wq"m5WW08t:zzz+:BS!!!=DF+:0Ft+:BRt!!",Azzz+9<kl+9DNC+:0Ft+:BS!!!48CzzzE!nc10N\'Y-?s>MkE)Sq&zzzz+:BS!+:94k+:BS!+:BS!!!<3$zzzE!nK)+:::5E!ni3E!ni3&-=6\\zzzE#LP80N\'Y,E#UtC:g6*c!!j\\Izzz!!",Az!!",A!!",AzzzzE"b>9:fAD;:fB1OE*GL.5SF8&zzz!!L+<!*$c7!*$c7!*$c7!!L+<zzz!!L+<!!\'h8!!L+<!!L+<!!!$"zzz!!L+<!!L+<!#33K!*$c7zzzz!!L+<!!\'h8!!IiQ!!L(;!!%NLzzz!!!!%!!!!%z!!!!%zzzzz!!!!%!!!!%!!!!%zzzz!<<*"!<<*"!<<*"!<'
|
|
856
|
+
)
|
|
857
|
+
# Please just ignore the enormous bitmap above; nobody knows why it was inlined
# straight into the code.

# Translation table from the spacing tone letters (ˉ ˊ ˇ ˋ) to the combining
# diacritics that TOKENS uses as tone tokens.
CHRMAP = str.maketrans("ˉˊˇˋ", "̄́̌̀")
|
|
859
|
+
|
|
860
|
+
|
|
861
|
+
@cache
def _check_syllable_valid(i: int) -> bool:
    """Check a packed syllable code against the VALID_SYLLABLES bitmap.

    A component counts as blank when it is ``missing`` or ``unspec``.
    With two or three blank components the code is always accepted. With one
    blank component it is accepted if some non-blank value for that component
    yields a code present in the bitmap. With no blank component the code
    itself is looked up.
    """

    def bit_is_set(code: int) -> bool:
        # Bit 0 of the bitmap stands for code 866 (0x0362); 11105 bits are used.
        index = code - 866
        if index < 0 or index >= 11105:
            return False
        return (VALID_SYLLABLES[index // 8] >> (index % 8)) & 1 == 1

    initial_bits = i & 0x001F
    final_bits = i & 0x3F00
    tone_bits = i & 0x00E0

    initial_blank = initial_bits == Initial.missing or initial_bits == Initial.unspec
    final_blank = final_bits == Final.missing or final_bits == Final.unspec
    tone_blank = tone_bits == Tone.missing or tone_bits == Tone.unspec

    blanks = initial_blank + final_blank + tone_blank
    if blanks >= 2:
        return True
    if blanks == 0:
        return bit_is_set(i)

    # Exactly one component is blank: try every concrete value for it.
    if initial_blank:
        return any(
            bit_is_set(cand | final_bits | tone_bits)
            for cand in Initial
            if cand not in (Initial.unspec, Initial.missing)
        )
    if final_blank:
        return any(
            bit_is_set(initial_bits | cand | tone_bits)
            for cand in Final
            if cand not in (Final.unspec, Final.missing)
        )
    return any(
        bit_is_set(initial_bits | final_bits | cand)
        for cand in Tone
        if cand not in (Tone.unspec, Tone.missing)
    )
|
|
895
|
+
|
|
896
|
+
|
|
897
|
+
def _check_input_valid(s: str, VRE: re.Pattern[str] | str = VALID_CHARS_RE_DEFAULT) -> bool:
    """Return True when the whole of ``s`` matches the pattern ``VRE``."""
    return re.fullmatch(VRE, s) is not None
|
|
899
|
+
|
|
900
|
+
|
|
901
|
+
def __parse(s: str, stack: list[Syllable], force_initial: bool = True, force_valid_syllable: bool = False) -> list[Syllable] | None:
    """Recursively tokenise ``s`` and fold the tokens into syllables (depth-first search).

    Args:
        s: Remaining, not yet consumed input text.
        stack: Syllables built so far; the last entry is the one currently
            being filled. The list is not mutated here, every branch works
            on copies.
        force_initial: Forwarded to ``Syllable._mreject`` when each candidate
            role is tested against the current syllable.
        force_valid_syllable: When true, a syllable has to pass ``is_valid()``
            before a new syllable is started and when the input is exhausted.

    Returns:
        The syllable stack of the first successful parse, or ``None`` when no
        tokenisation of ``s`` succeeds.
    """
    # I know an unpruned DFS makes this function very slow and risks hitting the
    # recursion limit, but I was not able to optimise it any further.
    if not s:
        # Input exhausted: succeed unless the last syllable must be valid and is not.
        if force_valid_syllable and (not stack[-1].is_valid()):
            return None
        return stack
    # Candidate tokens are the prefixes of s (at most 4 characters) known to TOKENS, longest first.
    valid_heads = [s[:n] for n in range(min(4, len(s)), 0, -1) if s[:n] in TOKENS]
    if not valid_heads:
        return None

    # Roles whose first-pass attempt already ran without the initial constraint.
    dont_try_again: set[Syllable] = set()

    # First pass: try every head/role, constraining the next syllable's initial where applicable.
    for head in valid_heads:
        next_force_initial = not (
            (head[-1] not in TOKENS) or (not any(((r & 0x001F) and (r & 0x001F) != Initial.nul) for r in TOKENS[head[-1]]))
        )  # The last character of the current head cannot act as an initial, so no initial fallback is needed.
        # No fallback, but we cannot simply `continue` either (a shorter head would then be tried
        # first and win); the role is put into dont_try_again to avoid redundant computation.
        # Obviously this is the former `for ... in [True, False]` loop unrolled. It is longer, but it
        # has less indentation and is perhaps slightly more efficient.

        for role in (Syllable(v) for v in TOKENS[head]):
            if stack[-1]._mreject(role, force_initial):
                continue

            # Work on copies so that sibling branches still see the caller's stack unchanged.
            current_stack = stack.copy()
            current_stack[-1] = current_stack[-1].copy()
            current_stack[-1]._merge(role)

            for start_new_syll in [True] if role.tone else [False, True]:  # A new syllable must start after a tone.
                if force_valid_syllable and start_new_syll and (not current_stack[-1].is_valid()):
                    continue
                if not next_force_initial:
                    dont_try_again.add(role)
                next_new_stack = current_stack.copy()
                if start_new_syll:
                    next_new_stack.append(Syllable())
                if ret_stack := __parse(
                    s=s[len(head) :],
                    stack=next_new_stack,
                    force_initial=next_force_initial and start_new_syll,
                    force_valid_syllable=force_valid_syllable,
                ):  # When no new syllable is started, the initial is not checked.
                    return ret_stack

    # Second pass (fallback): retry the remaining roles without the initial constraint.
    # NOTE(review): the start_new_syll=False call below appears to repeat a first-pass call
    # with identical arguments; confirm before pruning it.
    for head in valid_heads:
        for role in (Syllable(v) for v in TOKENS[head]):
            if role in dont_try_again:
                continue

            if stack[-1]._mreject(role, force_initial):
                continue

            current_stack = stack.copy()
            current_stack[-1] = current_stack[-1].copy()
            current_stack[-1]._merge(role)

            for start_new_syll in [True] if role.tone else [False, True]:
                if force_valid_syllable and start_new_syll and (not current_stack[-1].is_valid()):
                    continue
                next_new_stack = current_stack.copy()
                if start_new_syll:
                    next_new_stack.append(Syllable())
                if ret_stack := __parse(s[len(head) :], next_new_stack, False, force_valid_syllable):
                    return ret_stack
    return None
|
|
965
|
+
|
|
966
|
+
|
|
967
|
+
# Memoised re.compile: an identical pattern string is compiled only once.
_recompile = cache(re.compile)
|
|
968
|
+
|
|
969
|
+
|
|
970
|
+
def parse(s: str, sep: str = "' -", default_tone_neutral: bool = False, force_valid_syllable: bool = False, missing_as_nul: bool = False) -> list[Syllable]:
    """Parse a pinyin string into a flat list of syllables.

    The input is NFKC-normalised, lower-cased and translated through
    ``CHRMAP``; it is then split on the separator characters and every
    non-empty segment is parsed on its own.

    Args:
        s: Text to parse.
        sep: Characters that separate segments. An empty string means the
            input contains no separators and is parsed as one segment.
        default_tone_neutral: When true, syllables whose tone is
            ``Tone.missing`` get ``Tone.t5``.
        force_valid_syllable: Forwarded to the recursive parser, which then
            only accepts syllables that pass ``is_valid()``.
        missing_as_nul: When true, falsy initial/final/tone values are replaced
            by the ``nul`` member of the respective enum.

    Returns:
        The syllables of all segments, in input order.

    Raises:
        ValueError: If ``s`` contains a character that is neither a valid
            pinyin character nor a separator, or if a segment cannot be parsed.
    """
    s = normalize("NFKC", s).lower().translate(CHRMAP)
    if not _check_input_valid(s, _recompile(f"[{re.escape(''.join(VALID_CHARS|set(sep)))}]*")):
        raise ValueError("无效的输入字符")

    # An empty ``sep`` would produce the invalid pattern "[]" and make re.split
    # raise re.error, so treat it as "no separators at all".
    segments = re.split(f"[{re.escape(sep)}]", s) if sep else [s]

    ret = [
        __parse(s=seg, stack=[Syllable()], force_initial=False, force_valid_syllable=force_valid_syllable)
        for seg in segments
        if seg
    ]
    if not all(ret):
        raise ValueError(f"无法解析 {s}")

    ret = cast(list[list[Syllable]], ret)  # pylance cannot narrow through all(), hence the cast

    # Drop the empty syllable the parser leaves at the end of a segment.
    for r in ret:
        if not r[-1]:
            del r[-1]

    retl = list(chain.from_iterable(ret))
    for syl in retl:
        if default_tone_neutral and syl.tone == Tone.missing:
            syl.tone = Tone.t5
        if missing_as_nul:
            syl.initial = syl.initial or Initial.nul
            syl.final = syl.final or Final.nul
            syl.tone = syl.tone or Tone.nul

    return retl
|
|
999
|
+
|
|
1000
|
+
|
|
1001
|
+
@cache
def _parse_single_cached(s: str, force_valid_syllable: bool = False) -> Syllable:
    """Parse exactly one syllable; the result is memoised and must not be handed out directly."""
    s = normalize("NFKC", s).lower().translate(CHRMAP)
    if not _check_input_valid(s):
        raise ValueError("无效的输入字符")

    ret = __parse(s=s, stack=[Syllable()], force_initial=False, force_valid_syllable=force_valid_syllable)

    # Drop the empty syllable the parser leaves at the end.
    if ret and not ret[-1]:
        del ret[-1]

    if not ret or len(ret) != 1:
        raise ValueError(f"无法解析 {s}")

    rets = ret[0]

    # Fill the parts that were not given with their defaults.
    rets.initial = rets.initial or Initial.nul
    rets.final = rets.final or Final.nul
    rets.tone = rets.tone or Tone.t5

    return rets


def parse_single(s: str, force_valid_syllable=False) -> Syllable:
    """Parse a string that holds exactly one pinyin syllable.

    The input is NFKC-normalised, lower-cased and translated through
    ``CHRMAP``. A falsy initial or final becomes the ``nul`` member of its
    enum and a falsy tone becomes ``Tone.t5``.

    Parsing is memoised, but every call returns a fresh copy: ``Syllable``
    objects are mutable, so handing out the cached instance would let one
    caller's modifications leak into every later call with the same arguments.

    Args:
        s: Text of a single syllable.
        force_valid_syllable: Forwarded to the recursive parser, which then
            only accepts syllables that pass ``is_valid()``.

    Returns:
        The parsed syllable.

    Raises:
        ValueError: If ``s`` contains invalid characters or does not parse to
            exactly one syllable.
    """
    return _parse_single_cached(s, force_valid_syllable).copy()


# Keep the cache-management helpers that the previously @cache-decorated function exposed.
parse_single.cache_info = _parse_single_cached.cache_info  # type: ignore[attr-defined]
parse_single.cache_clear = _parse_single_cached.cache_clear  # type: ignore[attr-defined]
|
|
1022
|
+
|
|
1023
|
+
|
|
1024
|
+
def syllables_to_str(sylls: "Iterable[Syllable]", sep: str = "'") -> str:
    """Join syllables into one string, inserting ``sep`` only where it is needed.

    Falsy (empty) syllables are skipped. ``sep`` is placed in front of a
    syllable when its ``need_sep(previous)`` returns true.

    Args:
        sylls: Syllables to render, in order.
        sep: Separator text inserted between two syllables that need it.

    Returns:
        The concatenated text; an empty string when there is nothing to render.
    """
    parts: list[str] = []
    prev = None
    # Track the previous item by hand: pairing neighbours yields nothing for a
    # single syllable, which would wrongly render a one-syllable input as "".
    for curr in filter(None, sylls):
        if prev is not None and curr.need_sep(prev):
            parts.append(sep)
        parts.append(str(curr))
        prev = curr
    return "".join(parts)
|
|
1033
|
+
|
|
1034
|
+
|
|
1035
|
+
# Public API of the module; every other name is an implementation detail.
__all__ = ["Initial", "Final", "Tone", "Syllable", "parse_single", "syllables_to_str", "parse"]
|