regexp_parser 0.5.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +242 -0
- data/Gemfile +1 -0
- data/README.md +21 -17
- data/Rakefile +31 -0
- data/lib/regexp_parser/expression.rb +11 -9
- data/lib/regexp_parser/expression/classes/alternation.rb +5 -28
- data/lib/regexp_parser/expression/classes/backref.rb +21 -16
- data/lib/regexp_parser/expression/classes/escape.rb +81 -10
- data/lib/regexp_parser/expression/classes/group.rb +20 -20
- data/lib/regexp_parser/expression/classes/{character_class.rb → posix_class.rb} +2 -2
- data/lib/regexp_parser/expression/classes/property.rb +6 -0
- data/lib/regexp_parser/expression/classes/set.rb +10 -93
- data/lib/regexp_parser/expression/classes/set/intersection.rb +9 -0
- data/lib/regexp_parser/expression/classes/set/range.rb +23 -0
- data/lib/regexp_parser/expression/methods/strfregexp.rb +6 -4
- data/lib/regexp_parser/expression/methods/tests.rb +4 -14
- data/lib/regexp_parser/expression/methods/traverse.rb +1 -1
- data/lib/regexp_parser/expression/quantifier.rb +3 -4
- data/lib/regexp_parser/expression/sequence_operation.rb +34 -0
- data/lib/regexp_parser/expression/subexpression.rb +6 -10
- data/lib/regexp_parser/lexer.rb +13 -17
- data/lib/regexp_parser/parser.rb +170 -116
- data/lib/regexp_parser/scanner.rb +952 -2431
- data/lib/regexp_parser/scanner/char_type.rl +31 -0
- data/lib/regexp_parser/scanner/properties/long.yml +561 -0
- data/lib/regexp_parser/scanner/properties/short.yml +225 -0
- data/lib/regexp_parser/scanner/property.rl +7 -806
- data/lib/regexp_parser/scanner/scanner.rl +112 -154
- data/lib/regexp_parser/syntax/base.rb +4 -4
- data/lib/regexp_parser/syntax/tokens.rb +1 -0
- data/lib/regexp_parser/syntax/tokens/backref.rb +2 -2
- data/lib/regexp_parser/syntax/tokens/character_set.rb +3 -38
- data/lib/regexp_parser/syntax/tokens/escape.rb +2 -3
- data/lib/regexp_parser/syntax/tokens/group.rb +5 -4
- data/lib/regexp_parser/syntax/tokens/{character_class.rb → posix_class.rb} +5 -5
- data/lib/regexp_parser/syntax/tokens/unicode_property.rb +519 -266
- data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -4
- data/lib/regexp_parser/syntax/versions/1.9.1.rb +4 -10
- data/lib/regexp_parser/syntax/versions/2.0.0.rb +0 -2
- data/lib/regexp_parser/syntax/versions/2.4.1.rb +1 -1
- data/lib/regexp_parser/version.rb +1 -1
- data/regexp_parser.gemspec +2 -1
- data/test/expression/test_base.rb +2 -1
- data/test/expression/test_clone.rb +0 -57
- data/test/expression/test_set.rb +31 -8
- data/test/expression/test_strfregexp.rb +13 -4
- data/test/expression/test_subexpression.rb +25 -0
- data/test/expression/test_traverse.rb +25 -25
- data/test/helpers.rb +1 -0
- data/test/lexer/test_all.rb +1 -1
- data/test/lexer/test_conditionals.rb +9 -7
- data/test/lexer/test_nesting.rb +39 -21
- data/test/lexer/test_refcalls.rb +4 -4
- data/test/parser/set/test_intersections.rb +127 -0
- data/test/parser/set/test_ranges.rb +111 -0
- data/test/parser/test_all.rb +4 -1
- data/test/parser/test_escapes.rb +41 -9
- data/test/parser/test_groups.rb +22 -3
- data/test/parser/test_posix_classes.rb +27 -0
- data/test/parser/test_properties.rb +17 -290
- data/test/parser/test_refcalls.rb +66 -26
- data/test/parser/test_sets.rb +132 -129
- data/test/scanner/test_all.rb +1 -7
- data/test/scanner/test_conditionals.rb +16 -16
- data/test/scanner/test_errors.rb +0 -30
- data/test/scanner/test_escapes.rb +1 -2
- data/test/scanner/test_free_space.rb +28 -28
- data/test/scanner/test_groups.rb +35 -35
- data/test/scanner/test_meta.rb +1 -1
- data/test/scanner/test_properties.rb +87 -114
- data/test/scanner/test_refcalls.rb +18 -18
- data/test/scanner/test_scripts.rb +19 -351
- data/test/scanner/test_sets.rb +87 -60
- data/test/scanner/test_unicode_blocks.rb +4 -105
- data/test/support/warning_extractor.rb +1 -1
- data/test/syntax/test_syntax.rb +7 -0
- data/test/syntax/versions/test_1.8.rb +2 -4
- metadata +17 -7
- data/ChangeLog +0 -325
- data/test/scanner/test_emojis.rb +0 -31
@@ -0,0 +1,225 @@
|
|
1
|
+
#
|
2
|
+
# THIS FILE IS AUTO-GENERATED BY `rake props:update`, DO NOT EDIT
|
3
|
+
#
|
4
|
+
---
|
5
|
+
adlm: adlam
|
6
|
+
aghb: caucasian_albanian
|
7
|
+
ahex: ascii_hex_digit
|
8
|
+
arab: arabic
|
9
|
+
armi: imperial_aramaic
|
10
|
+
armn: armenian
|
11
|
+
avst: avestan
|
12
|
+
bali: balinese
|
13
|
+
bamu: bamum
|
14
|
+
bass: bassa_vah
|
15
|
+
batk: batak
|
16
|
+
beng: bengali
|
17
|
+
bhks: bhaiksuki
|
18
|
+
bidic: bidi_control
|
19
|
+
bopo: bopomofo
|
20
|
+
brah: brahmi
|
21
|
+
brai: braille
|
22
|
+
bugi: buginese
|
23
|
+
buhd: buhid
|
24
|
+
c: other
|
25
|
+
cakm: chakma
|
26
|
+
cans: canadian_aboriginal
|
27
|
+
cari: carian
|
28
|
+
cc: control
|
29
|
+
cf: format
|
30
|
+
cher: cherokee
|
31
|
+
ci: case_ignorable
|
32
|
+
cn: unassigned
|
33
|
+
co: private_use
|
34
|
+
copt: coptic
|
35
|
+
cprt: cypriot
|
36
|
+
cs: surrogate
|
37
|
+
cwcf: changes_when_casefolded
|
38
|
+
cwcm: changes_when_casemapped
|
39
|
+
cwl: changes_when_lowercased
|
40
|
+
cwt: changes_when_titlecased
|
41
|
+
cwu: changes_when_uppercased
|
42
|
+
cyrl: cyrillic
|
43
|
+
dep: deprecated
|
44
|
+
deva: devanagari
|
45
|
+
di: default_ignorable_code_point
|
46
|
+
dia: diacritic
|
47
|
+
dsrt: deseret
|
48
|
+
dupl: duployan
|
49
|
+
egyp: egyptian_hieroglyphs
|
50
|
+
elba: elbasan
|
51
|
+
ethi: ethiopic
|
52
|
+
ext: extender
|
53
|
+
geor: georgian
|
54
|
+
glag: glagolitic
|
55
|
+
gonm: masaram_gondi
|
56
|
+
goth: gothic
|
57
|
+
gran: grantha
|
58
|
+
grbase: grapheme_base
|
59
|
+
grek: greek
|
60
|
+
grext: grapheme_extend
|
61
|
+
grlink: grapheme_link
|
62
|
+
gujr: gujarati
|
63
|
+
guru: gurmukhi
|
64
|
+
hang: hangul
|
65
|
+
hani: han
|
66
|
+
hano: hanunoo
|
67
|
+
hatr: hatran
|
68
|
+
hebr: hebrew
|
69
|
+
hex: hex_digit
|
70
|
+
hira: hiragana
|
71
|
+
hluw: anatolian_hieroglyphs
|
72
|
+
hmng: pahawh_hmong
|
73
|
+
hung: old_hungarian
|
74
|
+
idc: id_continue
|
75
|
+
ideo: ideographic
|
76
|
+
ids: id_start
|
77
|
+
idsb: ids_binary_operator
|
78
|
+
idst: ids_trinary_operator
|
79
|
+
ital: old_italic
|
80
|
+
java: javanese
|
81
|
+
joinc: join_control
|
82
|
+
kali: kayah_li
|
83
|
+
kana: katakana
|
84
|
+
khar: kharoshthi
|
85
|
+
khmr: khmer
|
86
|
+
khoj: khojki
|
87
|
+
knda: kannada
|
88
|
+
kthi: kaithi
|
89
|
+
l: letter
|
90
|
+
lana: tai_tham
|
91
|
+
laoo: lao
|
92
|
+
latn: latin
|
93
|
+
lc: cased_letter
|
94
|
+
lepc: lepcha
|
95
|
+
limb: limbu
|
96
|
+
lina: linear_a
|
97
|
+
linb: linear_b
|
98
|
+
ll: lowercase_letter
|
99
|
+
lm: modifier_letter
|
100
|
+
lo: other_letter
|
101
|
+
loe: logical_order_exception
|
102
|
+
lt: titlecase_letter
|
103
|
+
lu: uppercase_letter
|
104
|
+
lyci: lycian
|
105
|
+
lydi: lydian
|
106
|
+
m: mark
|
107
|
+
mahj: mahajani
|
108
|
+
mand: mandaic
|
109
|
+
mani: manichaean
|
110
|
+
marc: marchen
|
111
|
+
mc: spacing_mark
|
112
|
+
me: enclosing_mark
|
113
|
+
mend: mende_kikakui
|
114
|
+
merc: meroitic_cursive
|
115
|
+
mero: meroitic_hieroglyphs
|
116
|
+
mlym: malayalam
|
117
|
+
mn: nonspacing_mark
|
118
|
+
mong: mongolian
|
119
|
+
mroo: mro
|
120
|
+
mtei: meetei_mayek
|
121
|
+
mult: multani
|
122
|
+
mymr: myanmar
|
123
|
+
n: number
|
124
|
+
narb: old_north_arabian
|
125
|
+
nbat: nabataean
|
126
|
+
nchar: noncharacter_code_point
|
127
|
+
nd: decimal_number
|
128
|
+
nkoo: nko
|
129
|
+
nl: letter_number
|
130
|
+
'no': other_number
|
131
|
+
nshu: nushu
|
132
|
+
oalpha: other_alphabetic
|
133
|
+
odi: other_default_ignorable_code_point
|
134
|
+
ogam: ogham
|
135
|
+
ogrext: other_grapheme_extend
|
136
|
+
oidc: other_id_continue
|
137
|
+
oids: other_id_start
|
138
|
+
olck: ol_chiki
|
139
|
+
olower: other_lowercase
|
140
|
+
omath: other_math
|
141
|
+
orkh: old_turkic
|
142
|
+
orya: oriya
|
143
|
+
osge: osage
|
144
|
+
osma: osmanya
|
145
|
+
oupper: other_uppercase
|
146
|
+
p: punctuation
|
147
|
+
palm: palmyrene
|
148
|
+
patsyn: pattern_syntax
|
149
|
+
patws: pattern_white_space
|
150
|
+
pauc: pau_cin_hau
|
151
|
+
pc: connector_punctuation
|
152
|
+
pcm: prepended_concatenation_mark
|
153
|
+
pd: dash_punctuation
|
154
|
+
pe: close_punctuation
|
155
|
+
perm: old_permic
|
156
|
+
pf: final_punctuation
|
157
|
+
phag: phags_pa
|
158
|
+
phli: inscriptional_pahlavi
|
159
|
+
phlp: psalter_pahlavi
|
160
|
+
phnx: phoenician
|
161
|
+
pi: initial_punctuation
|
162
|
+
plrd: miao
|
163
|
+
po: other_punctuation
|
164
|
+
prti: inscriptional_parthian
|
165
|
+
ps: open_punctuation
|
166
|
+
qaac: coptic
|
167
|
+
qaai: inherited
|
168
|
+
qmark: quotation_mark
|
169
|
+
ri: regional_indicator
|
170
|
+
rjng: rejang
|
171
|
+
runr: runic
|
172
|
+
s: symbol
|
173
|
+
samr: samaritan
|
174
|
+
sarb: old_south_arabian
|
175
|
+
saur: saurashtra
|
176
|
+
sc: currency_symbol
|
177
|
+
sd: soft_dotted
|
178
|
+
sgnw: signwriting
|
179
|
+
shaw: shavian
|
180
|
+
shrd: sharada
|
181
|
+
sidd: siddham
|
182
|
+
sind: khudawadi
|
183
|
+
sinh: sinhala
|
184
|
+
sk: modifier_symbol
|
185
|
+
sm: math_symbol
|
186
|
+
so: other_symbol
|
187
|
+
sora: sora_sompeng
|
188
|
+
soyo: soyombo
|
189
|
+
sterm: sentence_terminal
|
190
|
+
sund: sundanese
|
191
|
+
sylo: syloti_nagri
|
192
|
+
syrc: syriac
|
193
|
+
tagb: tagbanwa
|
194
|
+
takr: takri
|
195
|
+
tale: tai_le
|
196
|
+
talu: new_tai_lue
|
197
|
+
taml: tamil
|
198
|
+
tang: tangut
|
199
|
+
tavt: tai_viet
|
200
|
+
telu: telugu
|
201
|
+
term: terminal_punctuation
|
202
|
+
tfng: tifinagh
|
203
|
+
tglg: tagalog
|
204
|
+
thaa: thaana
|
205
|
+
tibt: tibetan
|
206
|
+
tirh: tirhuta
|
207
|
+
ugar: ugaritic
|
208
|
+
uideo: unified_ideograph
|
209
|
+
vaii: vai
|
210
|
+
vs: variation_selector
|
211
|
+
wara: warang_citi
|
212
|
+
wspace: white_space
|
213
|
+
xidc: xid_continue
|
214
|
+
xids: xid_start
|
215
|
+
xpeo: old_persian
|
216
|
+
xsux: cuneiform
|
217
|
+
yiii: yi
|
218
|
+
z: separator
|
219
|
+
zanb: zanabazar_square
|
220
|
+
zinh: inherited
|
221
|
+
zl: line_separator
|
222
|
+
zp: paragraph_separator
|
223
|
+
zs: space_separator
|
224
|
+
zyyy: common
|
225
|
+
zzzz: unknown
|
@@ -1,55 +1,9 @@
|
|
1
1
|
%%{
|
2
2
|
machine re_property;
|
3
3
|
|
4
|
-
property_char
|
4
|
+
property_char = [pP];
|
5
5
|
|
6
|
-
|
7
|
-
# yet if this applies to all flavors and in all encodings. A bug has just
|
8
|
-
# been filed against ruby regarding this issue, see:
|
9
|
-
# http://redmine.ruby-lang.org/issues/show/4014
|
10
|
-
property_name_unicode = 'alnum'i | 'alpha'i | 'any'i | 'ascii'i | 'blank'i |
|
11
|
-
'cntrl'i | 'digit'i | 'graph'i | 'lower'i | 'print'i |
|
12
|
-
'punct'i | 'space'i | 'upper'i | 'word'i | 'xdigit'i;
|
13
|
-
|
14
|
-
property_name_posix = 'any'i | 'assigned'i | 'newline'i;
|
15
|
-
|
16
|
-
property_name = property_name_unicode | property_name_posix;
|
17
|
-
|
18
|
-
category_letter = [Ll] . [ultmo]?;
|
19
|
-
category_mark = [Mm] . [nce]?;
|
20
|
-
category_number = [Nn] . [dlo]?;
|
21
|
-
category_punctuation = [Pp] . [cdseifo]?;
|
22
|
-
category_symbol = [Ss] . [mcko]?;
|
23
|
-
category_separator = [Zz] . [slp]?;
|
24
|
-
category_codepoint = [Cc] . [cfson]?;
|
25
|
-
|
26
|
-
general_category = category_letter | category_mark |
|
27
|
-
category_number | category_punctuation |
|
28
|
-
category_symbol | category_separator |
|
29
|
-
category_codepoint;
|
30
|
-
|
31
|
-
property_derived = 'math'i | 'alphabetic'i |
|
32
|
-
'lowercase'i | 'uppercase'i |
|
33
|
-
'id_start'i | 'id_continue'i |
|
34
|
-
'xid_start'i | 'xid_continue'i |
|
35
|
-
'grapheme_base'i | 'grapheme_extend'i |
|
36
|
-
'default_ignorable_code_point'i;
|
37
|
-
|
38
|
-
property_age = 'age=1.1'i | 'age=2.0'i | 'age=2.1'i |
|
39
|
-
'age=3.0'i | 'age=3.1'i | 'age=3.2'i |
|
40
|
-
'age=4.0'i | 'age=4.1'i | 'age=5.0'i |
|
41
|
-
'age=5.1'i | 'age=5.2'i | 'age=6.0'i |
|
42
|
-
'age=6.1'i | 'age=6.2'i | 'age=6.3'i |
|
43
|
-
'age=7.0'i | 'age=8.0'i | 'age=9.0'i |
|
44
|
-
'age=10.0'i;
|
45
|
-
|
46
|
-
property_script = (alnum | space | '_' | '-')+; # everything else
|
47
|
-
|
48
|
-
property_sequence = property_char . '{' . '^'? (
|
49
|
-
property_name | general_category |
|
50
|
-
property_age | property_derived |
|
51
|
-
property_script
|
52
|
-
) . '}';
|
6
|
+
property_sequence = property_char . '{' . '^'? (alnum|space|[_\-\.=])+ '}';
|
53
7
|
|
54
8
|
action premature_property_end {
|
55
9
|
raise PrematureEndError.new('unicode property')
|
@@ -61,767 +15,14 @@
|
|
61
15
|
|
62
16
|
property_sequence < eof(premature_property_end) {
|
63
17
|
text = text(data, ts, te, 1).first
|
64
|
-
|
65
|
-
type = :set
|
66
|
-
else
|
67
|
-
type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
|
68
|
-
end
|
69
|
-
|
70
|
-
name = data[ts+2..te-2].pack('c*').gsub(/[\s_]/,'').downcase
|
71
|
-
if name[0].chr == '^'
|
72
|
-
name = name[1..-1]
|
73
|
-
end
|
74
|
-
|
75
|
-
case name
|
76
|
-
# Named
|
77
|
-
when 'alnum'
|
78
|
-
self.emit(type, :alnum, text, ts-1, te)
|
79
|
-
when 'alpha'
|
80
|
-
self.emit(type, :alpha, text, ts-1, te)
|
81
|
-
when 'ascii'
|
82
|
-
self.emit(type, :ascii, text, ts-1, te)
|
83
|
-
when 'blank'
|
84
|
-
self.emit(type, :blank, text, ts-1, te)
|
85
|
-
when 'cntrl'
|
86
|
-
self.emit(type, :cntrl, text, ts-1, te)
|
87
|
-
when 'digit'
|
88
|
-
self.emit(type, :digit, text, ts-1, te)
|
89
|
-
when 'graph'
|
90
|
-
self.emit(type, :graph, text, ts-1, te)
|
91
|
-
when 'lower'
|
92
|
-
self.emit(type, :lower, text, ts-1, te)
|
93
|
-
when 'print'
|
94
|
-
self.emit(type, :print, text, ts-1, te)
|
95
|
-
when 'punct'
|
96
|
-
self.emit(type, :punct, text, ts-1, te)
|
97
|
-
when 'space'
|
98
|
-
self.emit(type, :space, text, ts-1, te)
|
99
|
-
when 'upper'
|
100
|
-
self.emit(type, :upper, text, ts-1, te)
|
101
|
-
when 'word'
|
102
|
-
self.emit(type, :word, text, ts-1, te)
|
103
|
-
when 'xdigit'
|
104
|
-
self.emit(type, :xdigit, text, ts-1, te)
|
105
|
-
when 'xposixpunct'
|
106
|
-
self.emit(type, :xposixpunct, text, ts-1, te)
|
107
|
-
|
108
|
-
# Only in Oniguruma (old Rubies)
|
109
|
-
when 'newline'
|
110
|
-
self.emit(type, :newline, text, ts-1, te)
|
111
|
-
|
112
|
-
when 'any'
|
113
|
-
self.emit(type, :any, text, ts-1, te)
|
114
|
-
when 'assigned'
|
115
|
-
self.emit(type, :assigned, text, ts-1, te)
|
116
|
-
|
117
|
-
# Letters
|
118
|
-
when 'l', 'letter'
|
119
|
-
self.emit(type, :letter_any, text, ts-1, te)
|
120
|
-
when 'lu', 'uppercaseletter'
|
121
|
-
self.emit(type, :letter_uppercase, text, ts-1, te)
|
122
|
-
when 'll', 'lowercaseletter'
|
123
|
-
self.emit(type, :letter_lowercase, text, ts-1, te)
|
124
|
-
when 'lt', 'titlecaseletter'
|
125
|
-
self.emit(type, :letter_titlecase, text, ts-1, te)
|
126
|
-
when 'lm', 'modifierletter'
|
127
|
-
self.emit(type, :letter_modifier, text, ts-1, te)
|
128
|
-
when 'lo', 'otherletter'
|
129
|
-
self.emit(type, :letter_other, text, ts-1, te)
|
130
|
-
|
131
|
-
# Marks
|
132
|
-
when 'm', 'mark'
|
133
|
-
self.emit(type, :mark_any, text, ts-1, te)
|
134
|
-
when 'mn', 'nonspacingmark'
|
135
|
-
self.emit(type, :mark_nonspacing, text, ts-1, te)
|
136
|
-
when 'mc', 'spacingmark'
|
137
|
-
self.emit(type, :mark_spacing, text, ts-1, te)
|
138
|
-
when 'me', 'enclosingmark'
|
139
|
-
self.emit(type, :mark_enclosing, text, ts-1, te)
|
140
|
-
|
141
|
-
# Numbers
|
142
|
-
when 'n', 'number'
|
143
|
-
self.emit(type, :number_any, text, ts-1, te)
|
144
|
-
when 'nd', 'decimalnumber'
|
145
|
-
self.emit(type, :number_decimal, text, ts-1, te)
|
146
|
-
when 'nl', 'letternumber'
|
147
|
-
self.emit(type, :number_letter, text, ts-1, te)
|
148
|
-
when 'no', 'othernumber'
|
149
|
-
self.emit(type, :number_other, text, ts-1, te)
|
150
|
-
|
151
|
-
# Punctuation
|
152
|
-
when 'p', 'punctuation'
|
153
|
-
self.emit(type, :punct_any, text, ts-1, te)
|
154
|
-
when 'pc', 'connectorpunctuation'
|
155
|
-
self.emit(type, :punct_connector, text, ts-1, te)
|
156
|
-
when 'pd', 'dashpunctuation'
|
157
|
-
self.emit(type, :punct_dash, text, ts-1, te)
|
158
|
-
when 'ps', 'openpunctuation'
|
159
|
-
self.emit(type, :punct_open, text, ts-1, te)
|
160
|
-
when 'pe', 'closepunctuation'
|
161
|
-
self.emit(type, :punct_close, text, ts-1, te)
|
162
|
-
when 'pi', 'initialpunctuation'
|
163
|
-
self.emit(type, :punct_initial, text, ts-1, te)
|
164
|
-
when 'pf', 'finalpunctuation'
|
165
|
-
self.emit(type, :punct_final, text, ts-1, te)
|
166
|
-
when 'po', 'otherpunctuation'
|
167
|
-
self.emit(type, :punct_other, text, ts-1, te)
|
168
|
-
|
169
|
-
# Symbols
|
170
|
-
when 's', 'symbol'
|
171
|
-
self.emit(type, :symbol_any, text, ts-1, te)
|
172
|
-
when 'sm', 'mathsymbol'
|
173
|
-
self.emit(type, :symbol_math, text, ts-1, te)
|
174
|
-
when 'sc', 'currencysymbol'
|
175
|
-
self.emit(type, :symbol_currency, text, ts-1, te)
|
176
|
-
when 'sk', 'modifiersymbol'
|
177
|
-
self.emit(type, :symbol_modifier, text, ts-1, te)
|
178
|
-
when 'so', 'othersymbol'
|
179
|
-
self.emit(type, :symbol_other, text, ts-1, te)
|
180
|
-
|
181
|
-
# Separators
|
182
|
-
when 'z', 'separator'
|
183
|
-
self.emit(type, :separator_any, text, ts-1, te)
|
184
|
-
when 'zs', 'spaceseparator'
|
185
|
-
self.emit(type, :separator_space, text, ts-1, te)
|
186
|
-
when 'zl', 'lineseparator'
|
187
|
-
self.emit(type, :separator_line, text, ts-1, te)
|
188
|
-
when 'zp', 'paragraphseparator'
|
189
|
-
self.emit(type, :separator_para, text, ts-1, te)
|
190
|
-
|
191
|
-
# Codepoints
|
192
|
-
when 'c', 'other'
|
193
|
-
self.emit(type, :other, text, ts-1, te)
|
194
|
-
when 'cc', 'control'
|
195
|
-
self.emit(type, :control, text, ts-1, te)
|
196
|
-
when 'cf', 'format'
|
197
|
-
self.emit(type, :format, text, ts-1, te)
|
198
|
-
when 'cs', 'surrogate'
|
199
|
-
self.emit(type, :surrogate, text, ts-1, te)
|
200
|
-
when 'co', 'privateuse'
|
201
|
-
self.emit(type, :private_use, text, ts-1, te)
|
202
|
-
when 'cn', 'unassigned'
|
203
|
-
self.emit(type, :unassigned, text, ts-1, te)
|
204
|
-
|
205
|
-
# Age
|
206
|
-
when 'age=1.1'
|
207
|
-
self.emit(type, :age_1_1, text, ts-1, te)
|
208
|
-
when 'age=2.0'
|
209
|
-
self.emit(type, :age_2_0, text, ts-1, te)
|
210
|
-
when 'age=2.1'
|
211
|
-
self.emit(type, :age_2_1, text, ts-1, te)
|
212
|
-
when 'age=3.0'
|
213
|
-
self.emit(type, :age_3_0, text, ts-1, te)
|
214
|
-
when 'age=3.1'
|
215
|
-
self.emit(type, :age_3_1, text, ts-1, te)
|
216
|
-
when 'age=3.2'
|
217
|
-
self.emit(type, :age_3_2, text, ts-1, te)
|
218
|
-
when 'age=4.0'
|
219
|
-
self.emit(type, :age_4_0, text, ts-1, te)
|
220
|
-
when 'age=4.1'
|
221
|
-
self.emit(type, :age_4_1, text, ts-1, te)
|
222
|
-
when 'age=5.0'
|
223
|
-
self.emit(type, :age_5_0, text, ts-1, te)
|
224
|
-
when 'age=5.1'
|
225
|
-
self.emit(type, :age_5_1, text, ts-1, te)
|
226
|
-
when 'age=5.2'
|
227
|
-
self.emit(type, :age_5_2, text, ts-1, te)
|
228
|
-
when 'age=6.0'
|
229
|
-
self.emit(type, :age_6_0, text, ts-1, te)
|
230
|
-
when 'age=6.1'
|
231
|
-
self.emit(type, :age_6_1, text, ts-1, te)
|
232
|
-
when 'age=6.2'
|
233
|
-
self.emit(type, :age_6_2, text, ts-1, te)
|
234
|
-
when 'age=6.3'
|
235
|
-
self.emit(type, :age_6_3, text, ts-1, te)
|
236
|
-
when 'age=7.0'
|
237
|
-
self.emit(type, :age_7_0, text, ts-1, te)
|
238
|
-
when 'age=8.0'
|
239
|
-
self.emit(type, :age_8_0, text, ts-1, te)
|
240
|
-
when 'age=9.0'
|
241
|
-
self.emit(type, :age_9_0, text, ts-1, te)
|
242
|
-
when 'age=10.0'
|
243
|
-
self.emit(type, :age_10_0, text, ts-1, te)
|
244
|
-
|
245
|
-
# Derived Properties
|
246
|
-
when 'ahex', 'asciihexdigit'
|
247
|
-
self.emit(type, :ascii_hex, text, ts-1, te)
|
248
|
-
when 'alphabetic'
|
249
|
-
self.emit(type, :alphabetic, text, ts-1, te)
|
250
|
-
when 'cased'
|
251
|
-
self.emit(type, :cased, text, ts-1, te)
|
252
|
-
when 'cwcf', 'changeswhencasefolded'
|
253
|
-
self.emit(type, :changes_when_casefolded, text, ts-1, te)
|
254
|
-
when 'cwcm', 'changeswhencasemapped'
|
255
|
-
self.emit(type, :changes_when_casemapped, text, ts-1, te)
|
256
|
-
when 'cwl', 'changeswhenlowercased'
|
257
|
-
self.emit(type, :changes_when_lowercased, text, ts-1, te)
|
258
|
-
when 'cwt', 'changeswhentitlecased'
|
259
|
-
self.emit(type, :changes_when_titlecased, text, ts-1, te)
|
260
|
-
when 'cwu', 'changeswhenuppercased'
|
261
|
-
self.emit(type, :changes_when_uppercased, text, ts-1, te)
|
262
|
-
when 'ci', 'caseignorable'
|
263
|
-
self.emit(type, :case_ignorable, text, ts-1, te)
|
264
|
-
when 'bidic', 'bidicontrol'
|
265
|
-
self.emit(type, :bidi_control, text, ts-1, te)
|
266
|
-
when 'dash'
|
267
|
-
self.emit(type, :dash, text, ts-1, te)
|
268
|
-
when 'dep', 'deprecated'
|
269
|
-
self.emit(type, :deprecated, text, ts-1, te)
|
270
|
-
when 'di', 'defaultignorablecodepoint'
|
271
|
-
self.emit(type, :default_ignorable_cp, text, ts-1, te)
|
272
|
-
when 'dia', 'diacritic'
|
273
|
-
self.emit(type, :diacritic, text, ts-1, te)
|
274
|
-
when 'ext', 'extender'
|
275
|
-
self.emit(type, :extender, text, ts-1, te)
|
276
|
-
when 'grbase', 'graphemebase'
|
277
|
-
self.emit(type, :grapheme_base, text, ts-1, te)
|
278
|
-
when 'grext', 'graphemeextend'
|
279
|
-
self.emit(type, :grapheme_extend, text, ts-1, te)
|
280
|
-
when 'grlink', 'graphemelink' # NOTE: deprecated as of Unicode 5.0
|
281
|
-
self.emit(type, :grapheme_link, text, ts-1, te)
|
282
|
-
when 'hex', 'hexdigit'
|
283
|
-
self.emit(type, :hex_digit, text, ts-1, te)
|
284
|
-
when 'hyphen' # NOTE: deprecated as of Unicode 6.0
|
285
|
-
self.emit(type, :hyphen, text, ts-1, te)
|
286
|
-
when 'idc', 'idcontinue'
|
287
|
-
self.emit(type, :id_continue, text, ts-1, te)
|
288
|
-
when 'ideo', 'ideographic'
|
289
|
-
self.emit(type, :ideographic, text, ts-1, te)
|
290
|
-
when 'ids', 'idstart'
|
291
|
-
self.emit(type, :id_start, text, ts-1, te)
|
292
|
-
when 'idsb', 'idsbinaryoperator'
|
293
|
-
self.emit(type, :ids_binary_op, text, ts-1, te)
|
294
|
-
when 'idst', 'idstrinaryoperator'
|
295
|
-
self.emit(type, :ids_trinary_op, text, ts-1, te)
|
296
|
-
when 'joinc', 'joincontrol'
|
297
|
-
self.emit(type, :join_control, text, ts-1, te)
|
298
|
-
when 'loe', 'logicalorderexception'
|
299
|
-
self.emit(type, :logical_order_exception, text, ts-1, te)
|
300
|
-
when 'lowercase'
|
301
|
-
self.emit(type, :lowercase, text, ts-1, te)
|
302
|
-
when 'math'
|
303
|
-
self.emit(type, :math, text, ts-1, te)
|
304
|
-
when 'nchar', 'noncharactercodepoint'
|
305
|
-
self.emit(type, :non_character_cp, text, ts-1, te)
|
306
|
-
when 'oalpha', 'otheralphabetic'
|
307
|
-
self.emit(type, :other_alphabetic, text, ts-1, te)
|
308
|
-
when 'odi', 'otherdefaultignorablecodepoint'
|
309
|
-
self.emit(type, :other_default_ignorable_cp, text, ts-1, te)
|
310
|
-
when 'ogrext', 'othergraphemeextend'
|
311
|
-
self.emit(type, :other_grapheme_extended, text, ts-1, te)
|
312
|
-
when 'oidc', 'otheridcontinue'
|
313
|
-
self.emit(type, :other_id_continue, text, ts-1, te)
|
314
|
-
when 'oids', 'otheridstart'
|
315
|
-
self.emit(type, :other_id_start, text, ts-1, te)
|
316
|
-
when 'olower', 'otherlowercase'
|
317
|
-
self.emit(type, :other_lowercase, text, ts-1, te)
|
318
|
-
when 'omath', 'othermath'
|
319
|
-
self.emit(type, :other_math, text, ts-1, te)
|
320
|
-
when 'oupper', 'otheruppercase'
|
321
|
-
self.emit(type, :other_uppercase, text, ts-1, te)
|
322
|
-
when 'patsyn', 'patternsyntax'
|
323
|
-
self.emit(type, :pattern_syntax, text, ts-1, te)
|
324
|
-
when 'patws', 'patternwhitespace'
|
325
|
-
self.emit(type, :pattern_whitespace, text, ts-1, te)
|
326
|
-
when 'qmark', 'quotationmark'
|
327
|
-
self.emit(type, :quotation_mark, text, ts-1, te)
|
328
|
-
when 'radical'
|
329
|
-
self.emit(type, :radical, text, ts-1, te)
|
330
|
-
when 'ri', 'regionalindicator'
|
331
|
-
self.emit(type, :regional_indicator, text, ts-1, te)
|
332
|
-
when 'sd', 'softdotted'
|
333
|
-
self.emit(type, :soft_dotted, text, ts-1, te)
|
334
|
-
when 'sterm'
|
335
|
-
self.emit(type, :sentence_terminal, text, ts-1, te)
|
336
|
-
when 'term', 'terminalpunctuation'
|
337
|
-
self.emit(type, :terminal_punctuation, text, ts-1, te)
|
338
|
-
when 'uideo', 'unifiedideograph'
|
339
|
-
self.emit(type, :unified_ideograph, text, ts-1, te)
|
340
|
-
when 'uppercase'
|
341
|
-
self.emit(type, :uppercase, text, ts-1, te)
|
342
|
-
when 'vs', 'variationselector'
|
343
|
-
self.emit(type, :variation_selector, text, ts-1, te)
|
344
|
-
when 'wspace', 'whitespace'
|
345
|
-
self.emit(type, :whitespace, text, ts-1, te)
|
346
|
-
when 'xids', 'xidstart'
|
347
|
-
self.emit(type, :xid_start, text, ts-1, te)
|
348
|
-
when 'xidc', 'xidcontinue'
|
349
|
-
self.emit(type, :xid_continue, text, ts-1, te)
|
350
|
-
|
351
|
-
# Emoji
|
352
|
-
when 'emoji'
|
353
|
-
self.emit(type, :emoji_any, text, ts-1, te)
|
354
|
-
when 'emojicomponent'
|
355
|
-
self.emit(type, :emoji_component, text, ts-1, te)
|
356
|
-
when 'emojimodifier'
|
357
|
-
self.emit(type, :emoji_modifier, text, ts-1, te)
|
358
|
-
when 'emojimodifierbase'
|
359
|
-
self.emit(type, :emoji_modifier_base, text, ts-1, te)
|
360
|
-
when 'emojipresentation'
|
361
|
-
self.emit(type, :emoji_presentation, text, ts-1, te)
|
362
|
-
|
363
|
-
# Scripts
|
364
|
-
when 'aghb', 'caucasianalbanian'
|
365
|
-
self.emit(type, :script_caucasian_albanian, text, ts-1, te)
|
366
|
-
when 'arab', 'arabic'
|
367
|
-
self.emit(type, :script_arabic, text, ts-1, te)
|
368
|
-
when 'armi', 'imperialaramaic'
|
369
|
-
self.emit(type, :script_imperial_aramaic, text, ts-1, te)
|
370
|
-
when 'armn', 'armenian'
|
371
|
-
self.emit(type, :script_armenian, text, ts-1, te)
|
372
|
-
when 'avst', 'avestan'
|
373
|
-
self.emit(type, :script_avestan, text, ts-1, te)
|
374
|
-
when 'bali', 'balinese'
|
375
|
-
self.emit(type, :script_balinese, text, ts-1, te)
|
376
|
-
when 'bamu', 'bamum'
|
377
|
-
self.emit(type, :script_bamum, text, ts-1, te)
|
378
|
-
when 'bass', 'bassavah'
|
379
|
-
self.emit(type, :script_bassa_vah, text, ts-1, te)
|
380
|
-
when 'batk', 'batak'
|
381
|
-
self.emit(type, :script_batak, text, ts-1, te)
|
382
|
-
when 'beng', 'bengali'
|
383
|
-
self.emit(type, :script_bengali, text, ts-1, te)
|
384
|
-
when 'bopo', 'bopomofo'
|
385
|
-
self.emit(type, :script_bopomofo, text, ts-1, te)
|
386
|
-
when 'brah', 'brahmi'
|
387
|
-
self.emit(type, :script_brahmi, text, ts-1, te)
|
388
|
-
when 'brai', 'braille'
|
389
|
-
self.emit(type, :script_braille, text, ts-1, te)
|
390
|
-
when 'bugi', 'buginese'
|
391
|
-
self.emit(type, :script_buginese, text, ts-1, te)
|
392
|
-
when 'buhd', 'buhid'
|
393
|
-
self.emit(type, :script_buhid, text, ts-1, te)
|
394
|
-
when 'cans', 'canadianaboriginal'
|
395
|
-
self.emit(type, :script_canadian_aboriginal, text, ts-1, te)
|
396
|
-
when 'cari', 'carian'
|
397
|
-
self.emit(type, :script_carian, text, ts-1, te)
|
398
|
-
when 'cham'
|
399
|
-
self.emit(type, :script_cham, text, ts-1, te)
|
400
|
-
when 'cher', 'cherokee'
|
401
|
-
self.emit(type, :script_cherokee, text, ts-1, te)
|
402
|
-
when 'copt', 'coptic', 'qaac'
|
403
|
-
self.emit(type, :script_coptic, text, ts-1, te)
|
404
|
-
when 'cprt', 'cypriot'
|
405
|
-
self.emit(type, :script_cypriot, text, ts-1, te)
|
406
|
-
when 'cyrl', 'cyrillic'
|
407
|
-
self.emit(type, :script_cyrillic, text, ts-1, te)
|
408
|
-
when 'deva', 'devanagari'
|
409
|
-
self.emit(type, :script_devanagari, text, ts-1, te)
|
410
|
-
when 'dsrt', 'deseret'
|
411
|
-
self.emit(type, :script_deseret, text, ts-1, te)
|
412
|
-
when 'dupl', 'duployan'
|
413
|
-
self.emit(type, :script_duployan, text, ts-1, te)
|
414
|
-
when 'egyp', 'egyptianhieroglyphs'
|
415
|
-
self.emit(type, :script_egyptian_hieroglyphs, text, ts-1, te)
|
416
|
-
when 'elba', 'elbasan'
|
417
|
-
self.emit(type, :script_elbasan, text, ts-1, te)
|
418
|
-
when 'ethi', 'ethiopic'
|
419
|
-
self.emit(type, :script_ethiopic, text, ts-1, te)
|
420
|
-
when 'geor', 'georgian'
|
421
|
-
self.emit(type, :script_georgian, text, ts-1, te)
|
422
|
-
when 'glag', 'glagolitic'
|
423
|
-
self.emit(type, :script_glagolitic, text, ts-1, te)
|
424
|
-
when 'goth', 'gothic'
|
425
|
-
self.emit(type, :script_gothic, text, ts-1, te)
|
426
|
-
when 'gran', 'grantha'
|
427
|
-
self.emit(type, :script_grantha, text, ts-1, te)
|
428
|
-
when 'grek', 'greek'
|
429
|
-
self.emit(type, :script_greek, text, ts-1, te)
|
430
|
-
when 'gujr', 'gujarati'
|
431
|
-
self.emit(type, :script_gujarati, text, ts-1, te)
|
432
|
-
when 'guru', 'gurmukhi'
|
433
|
-
self.emit(type, :script_gurmukhi, text, ts-1, te)
|
434
|
-
when 'hang', 'hangul'
|
435
|
-
self.emit(type, :script_hangul, text, ts-1, te)
|
436
|
-
when 'hani', 'han'
|
437
|
-
self.emit(type, :script_han, text, ts-1, te)
|
438
|
-
when 'hano', 'hanunoo'
|
439
|
-
self.emit(type, :script_hanunoo, text, ts-1, te)
|
440
|
-
when 'hebr', 'hebrew'
|
441
|
-
self.emit(type, :script_hebrew, text, ts-1, te)
|
442
|
-
when 'hira', 'hiragana'
|
443
|
-
self.emit(type, :script_hiragana, text, ts-1, te)
|
444
|
-
when 'hmng', 'pahawhhmong'
|
445
|
-
self.emit(type, :script_pahawh_hmong, text, ts-1, te)
|
446
|
-
when 'hrkt', 'katakanaorhiragana'
|
447
|
-
self.emit(type, :script_katakana_or_hiragana, text, ts-1, te)
|
448
|
-
when 'ital', 'olditalic'
|
449
|
-
self.emit(type, :script_old_italic, text, ts-1, te)
|
450
|
-
when 'java', 'javanese'
|
451
|
-
self.emit(type, :script_javanese, text, ts-1, te)
|
452
|
-
when 'kali', 'kayahli'
|
453
|
-
self.emit(type, :script_kayah_li, text, ts-1, te)
|
454
|
-
when 'kana', 'katakana'
|
455
|
-
self.emit(type, :script_katakana, text, ts-1, te)
|
456
|
-
when 'khar', 'kharoshthi'
|
457
|
-
self.emit(type, :script_kharoshthi, text, ts-1, te)
|
458
|
-
when 'khmr', 'khmer'
|
459
|
-
self.emit(type, :script_khmer, text, ts-1, te)
|
460
|
-
when 'khoj', 'khojki'
|
461
|
-
self.emit(type, :script_khojki, text, ts-1, te)
|
462
|
-
when 'knda', 'kannada'
|
463
|
-
self.emit(type, :script_kannada, text, ts-1, te)
|
464
|
-
when 'kthi', 'kaithi'
|
465
|
-
self.emit(type, :script_kaithi, text, ts-1, te)
|
466
|
-
when 'lana', 'taitham'
|
467
|
-
self.emit(type, :script_tai_tham, text, ts-1, te)
|
468
|
-
when 'laoo', 'lao'
|
469
|
-
self.emit(type, :script_lao, text, ts-1, te)
|
470
|
-
when 'latn', 'latin'
|
471
|
-
self.emit(type, :script_latin, text, ts-1, te)
|
472
|
-
when 'lepc', 'lepcha'
|
473
|
-
self.emit(type, :script_lepcha, text, ts-1, te)
|
474
|
-
when 'limb', 'limbu'
|
475
|
-
self.emit(type, :script_limbu, text, ts-1, te)
|
476
|
-
when 'lina', 'lineara'
|
477
|
-
self.emit(type, :script_linear_a, text, ts-1, te)
|
478
|
-
when 'linb', 'linearb'
|
479
|
-
self.emit(type, :script_linear_b, text, ts-1, te)
|
480
|
-
when 'lisu'
|
481
|
-
self.emit(type, :script_lisu, text, ts-1, te)
|
482
|
-
when 'lyci', 'lycian'
|
483
|
-
self.emit(type, :script_lycian, text, ts-1, te)
|
484
|
-
when 'lydi', 'lydian'
|
485
|
-
self.emit(type, :script_lydian, text, ts-1, te)
|
486
|
-
when 'mlym', 'malayalam'
|
487
|
-
self.emit(type, :script_malayalam, text, ts-1, te)
|
488
|
-
when 'mahj', 'mahajani'
|
489
|
-
self.emit(type, :script_mahajani, text, ts-1, te)
|
490
|
-
when 'mand', 'mandaic'
|
491
|
-
self.emit(type, :script_mandaic, text, ts-1, te)
|
492
|
-
when 'mani', 'manichaean'
|
493
|
-
self.emit(type, :script_manichaean, text, ts-1, te)
|
494
|
-
when 'mend', 'mendekikakui'
|
495
|
-
self.emit(type, :script_mende_kikakui, text, ts-1, te)
|
496
|
-
when 'modi'
|
497
|
-
self.emit(type, :script_modi, text, ts-1, te)
|
498
|
-
when 'mong', 'mongolian'
|
499
|
-
self.emit(type, :script_mongolian, text, ts-1, te)
|
500
|
-
when 'mroo', 'mro'
|
501
|
-
self.emit(type, :script_mro, text, ts-1, te)
|
502
|
-
when 'mtei', 'meeteimayek'
|
503
|
-
self.emit(type, :script_meetei_mayek, text, ts-1, te)
|
504
|
-
when 'mymr', 'myanmar'
|
505
|
-
self.emit(type, :script_myanmar, text, ts-1, te)
|
506
|
-
when 'narb', 'oldnortharabian'
|
507
|
-
self.emit(type, :script_old_north_arabian, text, ts-1, te)
|
508
|
-
when 'nbat', 'nabataean'
|
509
|
-
self.emit(type, :script_nabataean, text, ts-1, te)
|
510
|
-
when 'nkoo', 'nko'
|
511
|
-
self.emit(type, :script_nko, text, ts-1, te)
|
512
|
-
when 'ogam', 'ogham'
|
513
|
-
self.emit(type, :script_ogham, text, ts-1, te)
|
514
|
-
when 'olck', 'olchiki'
|
515
|
-
self.emit(type, :script_ol_chiki, text, ts-1, te)
|
516
|
-
when 'orkh', 'oldturkic'
|
517
|
-
self.emit(type, :script_old_turkic, text, ts-1, te)
|
518
|
-
when 'orya', 'oriya'
|
519
|
-
self.emit(type, :script_oriya, text, ts-1, te)
|
520
|
-
when 'osma', 'osmanya'
|
521
|
-
self.emit(type, :script_osmanya, text, ts-1, te)
|
522
|
-
when 'palm', 'palmyrene'
|
523
|
-
self.emit(type, :script_palmyrene, text, ts-1, te)
|
524
|
-
when 'pauc', 'paucinhau'
|
525
|
-
self.emit(type, :script_pau_cin_hau, text, ts-1, te)
|
526
|
-
when 'perm', 'oldpermic'
|
527
|
-
self.emit(type, :script_old_permic, text, ts-1, te)
|
528
|
-
when 'phag', 'phagspa'
|
529
|
-
self.emit(type, :script_phags_pa, text, ts-1, te)
|
530
|
-
when 'phli', 'inscriptionalpahlavi'
|
531
|
-
self.emit(type, :script_inscriptional_pahlavi, text, ts-1, te)
|
532
|
-
when 'phlp', 'psalterpahlavi'
|
533
|
-
self.emit(type, :script_psalter_pahlavi, text, ts-1, te)
|
534
|
-
when 'phnx', 'phoenician'
|
535
|
-
self.emit(type, :script_phoenician, text, ts-1, te)
|
536
|
-
when 'prti', 'inscriptionalparthian'
|
537
|
-
self.emit(type, :script_inscriptional_parthian, text, ts-1, te)
|
538
|
-
when 'rjng', 'rejang'
|
539
|
-
self.emit(type, :script_rejang, text, ts-1, te)
|
540
|
-
when 'runr', 'runic'
|
541
|
-
self.emit(type, :script_runic, text, ts-1, te)
|
542
|
-
when 'samr', 'samaritan'
|
543
|
-
self.emit(type, :script_samaritan, text, ts-1, te)
|
544
|
-
when 'sarb', 'oldsoutharabian'
|
545
|
-
self.emit(type, :script_old_south_arabian, text, ts-1, te)
|
546
|
-
when 'saur', 'saurashtra'
|
547
|
-
self.emit(type, :script_saurashtra, text, ts-1, te)
|
548
|
-
when 'shaw', 'shavian'
|
549
|
-
self.emit(type, :script_shavian, text, ts-1, te)
|
550
|
-
when 'sidd', 'siddham'
|
551
|
-
self.emit(type, :script_siddham, text, ts-1, te)
|
552
|
-
when 'sind', 'khudawadi'
|
553
|
-
self.emit(type, :script_khudawadi, text, ts-1, te)
|
554
|
-
when 'sinh', 'sinhala'
|
555
|
-
self.emit(type, :script_sinhala, text, ts-1, te)
|
556
|
-
when 'sund', 'sundanese'
|
557
|
-
self.emit(type, :script_sundanese, text, ts-1, te)
|
558
|
-
when 'sylo', 'sylotinagri'
|
559
|
-
self.emit(type, :script_syloti_nagri, text, ts-1, te)
|
560
|
-
when 'syrc', 'syriac'
|
561
|
-
self.emit(type, :script_syriac, text, ts-1, te)
|
562
|
-
when 'tagb', 'tagbanwa'
|
563
|
-
self.emit(type, :script_tagbanwa, text, ts-1, te)
|
564
|
-
when 'tale', 'taile'
|
565
|
-
self.emit(type, :script_tai_le, text, ts-1, te)
|
566
|
-
when 'talu', 'newtailue'
|
567
|
-
self.emit(type, :script_new_tai_lue, text, ts-1, te)
|
568
|
-
when 'taml', 'tamil'
|
569
|
-
self.emit(type, :script_tamil, text, ts-1, te)
|
570
|
-
when 'tavt', 'taiviet'
|
571
|
-
self.emit(type, :script_tai_viet, text, ts-1, te)
|
572
|
-
when 'telu', 'telugu'
|
573
|
-
self.emit(type, :script_telugu, text, ts-1, te)
|
574
|
-
when 'tfng', 'tifinagh'
|
575
|
-
self.emit(type, :script_tifinagh, text, ts-1, te)
|
576
|
-
when 'tglg', 'tagalog'
|
577
|
-
self.emit(type, :script_tagalog, text, ts-1, te)
|
578
|
-
when 'thaa', 'thaana'
|
579
|
-
self.emit(type, :script_thaana, text, ts-1, te)
|
580
|
-
when 'thai'
|
581
|
-
self.emit(type, :script_thai, text, ts-1, te)
|
582
|
-
when 'tibt', 'tibetan'
|
583
|
-
self.emit(type, :script_tibetan, text, ts-1, te)
|
584
|
-
when 'tirh', 'tirhuta'
|
585
|
-
self.emit(type, :script_tirhuta, text, ts-1, te)
|
586
|
-
when 'ugar', 'ugaritic'
|
587
|
-
self.emit(type, :script_ugaritic, text, ts-1, te)
|
588
|
-
when 'vaii', 'vai'
|
589
|
-
self.emit(type, :script_vai, text, ts-1, te)
|
590
|
-
when 'wara', 'warangciti'
|
591
|
-
self.emit(type, :script_warang_citi, text, ts-1, te)
|
592
|
-
when 'xpeo', 'oldpersian'
|
593
|
-
self.emit(type, :script_old_persian, text, ts-1, te)
|
594
|
-
when 'xsux', 'cuneiform'
|
595
|
-
self.emit(type, :script_cuneiform, text, ts-1, te)
|
596
|
-
when 'yiii', 'yi'
|
597
|
-
self.emit(type, :script_yi, text, ts-1, te)
|
598
|
-
when 'zinh', 'inherited', 'qaai'
|
599
|
-
self.emit(type, :script_inherited, text, ts-1, te)
|
600
|
-
when 'zyyy', 'common'
|
601
|
-
self.emit(type, :script_common, text, ts-1, te)
|
602
|
-
when 'zzzz', 'unknown'
|
603
|
-
self.emit(type, :script_unknown, text, ts-1, te)
|
18
|
+
type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
|
604
19
|
|
605
|
-
|
606
|
-
when 'inalphabeticpresentationforms'
|
607
|
-
self.emit(type, :block_inalphabetic_presentation_forms, text, ts-1, te)
|
608
|
-
when 'inarabicpresentationforms-a'
|
609
|
-
self.emit(type, :block_inarabic_presentation_forms_a, text, ts-1, te)
|
610
|
-
when 'inarabicpresentationforms-b'
|
611
|
-
self.emit(type, :block_inarabic_presentation_forms_b, text, ts-1, te)
|
612
|
-
when 'inarabic'
|
613
|
-
self.emit(type, :block_inarabic, text, ts-1, te)
|
614
|
-
when 'inarmenian'
|
615
|
-
self.emit(type, :block_inarmenian, text, ts-1, te)
|
616
|
-
when 'inarrows'
|
617
|
-
self.emit(type, :block_inarrows, text, ts-1, te)
|
618
|
-
when 'inbasiclatin'
|
619
|
-
self.emit(type, :block_inbasic_latin, text, ts-1, te)
|
620
|
-
when 'inbengali'
|
621
|
-
self.emit(type, :block_inbengali, text, ts-1, te)
|
622
|
-
when 'inblockelements'
|
623
|
-
self.emit(type, :block_inblock_elements, text, ts-1, te)
|
624
|
-
when 'inbopomofoextended'
|
625
|
-
self.emit(type, :block_inbopomofo_extended, text, ts-1, te)
|
626
|
-
when 'inbopomofo'
|
627
|
-
self.emit(type, :block_inbopomofo, text, ts-1, te)
|
628
|
-
when 'inboxdrawing'
|
629
|
-
self.emit(type, :block_inbox_drawing, text, ts-1, te)
|
630
|
-
when 'inbraillepatterns'
|
631
|
-
self.emit(type, :block_inbraille_patterns, text, ts-1, te)
|
632
|
-
when 'inbuhid'
|
633
|
-
self.emit(type, :block_inbuhid, text, ts-1, te)
|
634
|
-
when 'incjkcompatibilityforms'
|
635
|
-
self.emit(type, :block_incjk_compatibility_forms, text, ts-1, te)
|
636
|
-
when 'incjkcompatibilityideographs'
|
637
|
-
self.emit(type, :block_incjk_compatibility_ideographs, text, ts-1, te)
|
638
|
-
when 'incjkcompatibility'
|
639
|
-
self.emit(type, :block_incjk_compatibility, text, ts-1, te)
|
640
|
-
when 'incjkradicalssupplement'
|
641
|
-
self.emit(type, :block_incjk_radicals_supplement, text, ts-1, te)
|
642
|
-
when 'incjksymbolsandpunctuation'
|
643
|
-
self.emit(type, :block_incjk_symbols_and_punctuation, text, ts-1, te)
|
644
|
-
when 'incjkunifiedideographsextensiona'
|
645
|
-
self.emit(type, :block_incjk_unified_ideographs_extension_a, text, ts-1, te)
|
646
|
-
when 'incjkunifiedideographs'
|
647
|
-
self.emit(type, :block_incjk_unified_ideographs, text, ts-1, te)
|
648
|
-
when 'incherokee'
|
649
|
-
self.emit(type, :block_incherokee, text, ts-1, te)
|
650
|
-
when 'incombiningdiacriticalmarksforsymbols'
|
651
|
-
self.emit(type, :block_incombining_diacritical_marks_for_symbols, text, ts-1, te)
|
652
|
-
when 'incombiningdiacriticalmarks'
|
653
|
-
self.emit(type, :block_incombining_diacritical_marks, text, ts-1, te)
|
654
|
-
when 'incombininghalfmarks'
|
655
|
-
self.emit(type, :block_incombining_half_marks, text, ts-1, te)
|
656
|
-
when 'incontrolpictures'
|
657
|
-
self.emit(type, :block_incontrol_pictures, text, ts-1, te)
|
658
|
-
when 'incurrencysymbols'
|
659
|
-
self.emit(type, :block_incurrency_symbols, text, ts-1, te)
|
660
|
-
when 'incyrillicsupplement'
|
661
|
-
self.emit(type, :block_incyrillic_supplement, text, ts-1, te)
|
662
|
-
when 'incyrillic'
|
663
|
-
self.emit(type, :block_incyrillic, text, ts-1, te)
|
664
|
-
when 'indevanagari'
|
665
|
-
self.emit(type, :block_indevanagari, text, ts-1, te)
|
666
|
-
when 'indingbats'
|
667
|
-
self.emit(type, :block_indingbats, text, ts-1, te)
|
668
|
-
when 'inenclosedalphanumerics'
|
669
|
-
self.emit(type, :block_inenclosed_alphanumerics, text, ts-1, te)
|
670
|
-
when 'inenclosedcjklettersandmonths'
|
671
|
-
self.emit(type, :block_inenclosed_cjk_letters_and_months, text, ts-1, te)
|
672
|
-
when 'inethiopic'
|
673
|
-
self.emit(type, :block_inethiopic, text, ts-1, te)
|
674
|
-
when 'ingeneralpunctuation'
|
675
|
-
self.emit(type, :block_ingeneral_punctuation, text, ts-1, te)
|
676
|
-
when 'ingeometricshapes'
|
677
|
-
self.emit(type, :block_ingeometric_shapes, text, ts-1, te)
|
678
|
-
when 'ingeorgian'
|
679
|
-
self.emit(type, :block_ingeorgian, text, ts-1, te)
|
680
|
-
when 'ingreekextended'
|
681
|
-
self.emit(type, :block_ingreek_extended, text, ts-1, te)
|
682
|
-
when 'ingreekandcoptic'
|
683
|
-
self.emit(type, :block_ingreek_and_coptic, text, ts-1, te)
|
684
|
-
when 'ingujarati'
|
685
|
-
self.emit(type, :block_ingujarati, text, ts-1, te)
|
686
|
-
when 'ingurmukhi'
|
687
|
-
self.emit(type, :block_ingurmukhi, text, ts-1, te)
|
688
|
-
when 'inhalfwidthandfullwidthforms'
|
689
|
-
self.emit(type, :block_inhalfwidth_and_fullwidth_forms, text, ts-1, te)
|
690
|
-
when 'inhangulcompatibilityjamo'
|
691
|
-
self.emit(type, :block_inhangul_compatibility_jamo, text, ts-1, te)
|
692
|
-
when 'inhanguljamo'
|
693
|
-
self.emit(type, :block_inhangul_jamo, text, ts-1, te)
|
694
|
-
when 'inhangulsyllables'
|
695
|
-
self.emit(type, :block_inhangul_syllables, text, ts-1, te)
|
696
|
-
when 'inhanunoo'
|
697
|
-
self.emit(type, :block_inhanunoo, text, ts-1, te)
|
698
|
-
when 'inhebrew'
|
699
|
-
self.emit(type, :block_inhebrew, text, ts-1, te)
|
700
|
-
when 'inhighprivateusesurrogates'
|
701
|
-
self.emit(type, :block_inhigh_private_use_surrogates, text, ts-1, te)
|
702
|
-
when 'inhighsurrogates'
|
703
|
-
self.emit(type, :block_inhigh_surrogates, text, ts-1, te)
|
704
|
-
when 'inhiragana'
|
705
|
-
self.emit(type, :block_inhiragana, text, ts-1, te)
|
706
|
-
when 'inipaextensions'
|
707
|
-
self.emit(type, :block_inipa_extensions, text, ts-1, te)
|
708
|
-
when 'inideographicdescriptioncharacters'
|
709
|
-
self.emit(type, :block_inideographic_description_characters, text, ts-1, te)
|
710
|
-
when 'inkanbun'
|
711
|
-
self.emit(type, :block_inkanbun, text, ts-1, te)
|
712
|
-
when 'inkangxiradicals'
|
713
|
-
self.emit(type, :block_inkangxi_radicals, text, ts-1, te)
|
714
|
-
when 'inkannada'
|
715
|
-
self.emit(type, :block_inkannada, text, ts-1, te)
|
716
|
-
when 'inkatakanaphoneticextensions'
|
717
|
-
self.emit(type, :block_inkatakana_phonetic_extensions, text, ts-1, te)
|
718
|
-
when 'inkatakana'
|
719
|
-
self.emit(type, :block_inkatakana, text, ts-1, te)
|
720
|
-
when 'inkhmersymbols'
|
721
|
-
self.emit(type, :block_inkhmer_symbols, text, ts-1, te)
|
722
|
-
when 'inkhmer'
|
723
|
-
self.emit(type, :block_inkhmer, text, ts-1, te)
|
724
|
-
when 'inlao'
|
725
|
-
self.emit(type, :block_inlao, text, ts-1, te)
|
726
|
-
when 'inlatin-1supplement'
|
727
|
-
self.emit(type, :block_inlatin_1_supplement, text, ts-1, te)
|
728
|
-
when 'inlatinextended-a'
|
729
|
-
self.emit(type, :block_inlatin_extended_a, text, ts-1, te)
|
730
|
-
when 'inlatinextended-b'
|
731
|
-
self.emit(type, :block_inlatin_extended_b, text, ts-1, te)
|
732
|
-
when 'inlatinextendedadditional'
|
733
|
-
self.emit(type, :block_inlatin_extended_additional, text, ts-1, te)
|
734
|
-
when 'inletterlikesymbols'
|
735
|
-
self.emit(type, :block_inletterlike_symbols, text, ts-1, te)
|
736
|
-
when 'inlimbu'
|
737
|
-
self.emit(type, :block_inlimbu, text, ts-1, te)
|
738
|
-
when 'inlowsurrogates'
|
739
|
-
self.emit(type, :block_inlow_surrogates, text, ts-1, te)
|
740
|
-
when 'inmalayalam'
|
741
|
-
self.emit(type, :block_inmalayalam, text, ts-1, te)
|
742
|
-
when 'inmathematicaloperators'
|
743
|
-
self.emit(type, :block_inmathematical_operators, text, ts-1, te)
|
744
|
-
when 'inmiscellaneousmathematicalsymbols-a'
|
745
|
-
self.emit(type, :block_inmiscellaneous_mathematical_symbols_a, text, ts-1, te)
|
746
|
-
when 'inmiscellaneousmathematicalsymbols-b'
|
747
|
-
self.emit(type, :block_inmiscellaneous_mathematical_symbols_b, text, ts-1, te)
|
748
|
-
when 'inmiscellaneoussymbolsandarrows'
|
749
|
-
self.emit(type, :block_inmiscellaneous_symbols_and_arrows, text, ts-1, te)
|
750
|
-
when 'inmiscellaneoussymbols'
|
751
|
-
self.emit(type, :block_inmiscellaneous_symbols, text, ts-1, te)
|
752
|
-
when 'inmiscellaneoustechnical'
|
753
|
-
self.emit(type, :block_inmiscellaneous_technical, text, ts-1, te)
|
754
|
-
when 'inmongolian'
|
755
|
-
self.emit(type, :block_inmongolian, text, ts-1, te)
|
756
|
-
when 'inmyanmar'
|
757
|
-
self.emit(type, :block_inmyanmar, text, ts-1, te)
|
758
|
-
when 'innumberforms'
|
759
|
-
self.emit(type, :block_innumber_forms, text, ts-1, te)
|
760
|
-
when 'inogham'
|
761
|
-
self.emit(type, :block_inogham, text, ts-1, te)
|
762
|
-
when 'inopticalcharacterrecognition'
|
763
|
-
self.emit(type, :block_inoptical_character_recognition, text, ts-1, te)
|
764
|
-
when 'inoriya'
|
765
|
-
self.emit(type, :block_inoriya, text, ts-1, te)
|
766
|
-
when 'inphoneticextensions'
|
767
|
-
self.emit(type, :block_inphonetic_extensions, text, ts-1, te)
|
768
|
-
when 'inprivateusearea'
|
769
|
-
self.emit(type, :block_inprivate_use_area, text, ts-1, te)
|
770
|
-
when 'inrunic'
|
771
|
-
self.emit(type, :block_inrunic, text, ts-1, te)
|
772
|
-
when 'insinhala'
|
773
|
-
self.emit(type, :block_insinhala, text, ts-1, te)
|
774
|
-
when 'insmallformvariants'
|
775
|
-
self.emit(type, :block_insmall_form_variants, text, ts-1, te)
|
776
|
-
when 'inspacingmodifierletters'
|
777
|
-
self.emit(type, :block_inspacing_modifier_letters, text, ts-1, te)
|
778
|
-
when 'inspecials'
|
779
|
-
self.emit(type, :block_inspecials, text, ts-1, te)
|
780
|
-
when 'insuperscriptsandsubscripts'
|
781
|
-
self.emit(type, :block_insuperscripts_and_subscripts, text, ts-1, te)
|
782
|
-
when 'insupplementalarrows-a'
|
783
|
-
self.emit(type, :block_insupplemental_arrows_a, text, ts-1, te)
|
784
|
-
when 'insupplementalarrows-b'
|
785
|
-
self.emit(type, :block_insupplemental_arrows_b, text, ts-1, te)
|
786
|
-
when 'insupplementalmathematicaloperators'
|
787
|
-
self.emit(type, :block_insupplemental_mathematical_operators, text, ts-1, te)
|
788
|
-
when 'insyriac'
|
789
|
-
self.emit(type, :block_insyriac, text, ts-1, te)
|
790
|
-
when 'intagalog'
|
791
|
-
self.emit(type, :block_intagalog, text, ts-1, te)
|
792
|
-
when 'intagbanwa'
|
793
|
-
self.emit(type, :block_intagbanwa, text, ts-1, te)
|
794
|
-
when 'intaile'
|
795
|
-
self.emit(type, :block_intai_le, text, ts-1, te)
|
796
|
-
when 'intamil'
|
797
|
-
self.emit(type, :block_intamil, text, ts-1, te)
|
798
|
-
when 'intelugu'
|
799
|
-
self.emit(type, :block_intelugu, text, ts-1, te)
|
800
|
-
when 'inthaana'
|
801
|
-
self.emit(type, :block_inthaana, text, ts-1, te)
|
802
|
-
when 'inthai'
|
803
|
-
self.emit(type, :block_inthai, text, ts-1, te)
|
804
|
-
when 'intibetan'
|
805
|
-
self.emit(type, :block_intibetan, text, ts-1, te)
|
806
|
-
when 'inunifiedcanadianaboriginalsyllabics'
|
807
|
-
self.emit(type, :block_inunified_canadian_aboriginal_syllabics, text, ts-1, te)
|
808
|
-
when 'invariationselectors'
|
809
|
-
self.emit(type, :block_invariation_selectors, text, ts-1, te)
|
810
|
-
when 'inyiradicals'
|
811
|
-
self.emit(type, :block_inyi_radicals, text, ts-1, te)
|
812
|
-
when 'inyisyllables'
|
813
|
-
self.emit(type, :block_inyi_syllables, text, ts-1, te)
|
814
|
-
when 'inyijinghexagramsymbols'
|
815
|
-
self.emit(type, :block_inyijing_hexagram_symbols, text, ts-1, te)
|
20
|
+
name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
|
816
21
|
|
817
|
-
|
818
|
-
|
819
|
-
# an :unknown for the property be better?
|
820
|
-
#
|
821
|
-
# self.emit(type, :unknown, text, ts-1, te)
|
22
|
+
token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
|
23
|
+
raise UnknownUnicodePropertyError.new(name) unless token
|
822
24
|
|
823
|
-
|
824
|
-
end
|
25
|
+
self.emit(type, token.to_sym, text, ts-1, te)
|
825
26
|
|
826
27
|
fret;
|
827
28
|
};
|