regexp_property_values 0.3.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,133 @@
1
module RegexpPropertyValues
  # Maintenance task that rebuilds the value list and alias list files
  # (RegexpPropertyValues::VALUES_PATH / ALIASES_PATH) from Unicode data files
  # downloaded for the Unicode and emoji versions of the running Ruby.
  #
  # Entry point: RegexpPropertyValues::Updater.call
  module Updater
    module_function

    require 'fileutils'
    require 'set'

    BASE_URL = 'http://www.unicode.org/Public/'

    UCD_FILES = %w[
      Blocks.txt
      DerivedAge.txt
      DerivedCoreProperties.txt
      PropertyAliases.txt
      PropertyValueAliases.txt
      PropList.txt
      Scripts.txt
    ].freeze

    EMOJI_FILES = %w[
      emoji-data.txt
    ].freeze

    TMP_DIR = File.join(__dir__, 'tmp_ucd')

    # Runs the whole update: download, write values, write aliases, report.
    #
    # Stops without writing anything when the user declines the download, and
    # always removes the temporary directory, even if a step raises.
    def call
      prepare_tmp_dir
      return unless download_ucd_files

      write_values
      write_aliases
      print_stats
    ensure
      remove_tmp_dir
    end

    # Creates an empty TMP_DIR, discarding leftovers of an earlier run.
    def prepare_tmp_dir
      # rm_rf is already a no-op for a missing path, no existence check needed
      FileUtils.rm_rf(TMP_DIR)
      FileUtils.mkdir(TMP_DIR)
    end

    # Asks for confirmation on stdin, then downloads all UCD_FILES and
    # EMOJI_FILES into TMP_DIR.
    #
    # Returns true when the files were downloaded, false when the user
    # declined. Raises if a download fails.
    def download_ucd_files
      unicode_version = RbConfig::CONFIG.fetch('UNICODE_VERSION')
      emoji_version   = RbConfig::CONFIG.fetch('UNICODE_EMOJI_VERSION')
      puts 'This will load ucd and emoji data for the CURRENT RUBY '\
           "(#{unicode_version} / #{emoji_version}). Run this on the "\
           'latest Ruby version you want to support. Continue? [y/n]'
      # to_s covers a closed stdin, where gets returns nil
      unless $stdin.gets.to_s.match?(/^y/i)
        puts 'download skipped.'
        return false
      end

      Dir.chdir(TMP_DIR) do
        # File.join avoids the double slash that BASE_URL's trailing "/" would
        # otherwise produce
        UCD_FILES.each do |f|
          download_file(File.join(BASE_URL, unicode_version, 'ucd', f))
        end
        EMOJI_FILES.each do |f|
          download_file(File.join(BASE_URL, 'emoji', emoji_version, f))
        end
      end
      true
    end

    # Fetches one URL into the current directory with wget. The argument is
    # passed without going through a shell, and a failure is reported right
    # away instead of surfacing later as a missing file.
    def download_file(url)
      system('wget', url) || raise("Download failed: #{url}")
    end

    # Collects all known property values into @values and writes them, sorted
    # and newline-separated, to RegexpPropertyValues::VALUES_PATH.
    def write_values
      @values = Set.new

      # posix properties
      @values.merge(%w[
        Alpha Blank Cntrl Digit Graph Lower Print
        Punct Space Upper XDigit Word Alnum ASCII
        XPosixPunct
      ])

      # special properties
      @values.merge(%w[Any Assigned In_No_Block Unknown])

      # legacy properties
      @values.merge(%w[Newline])

      # data lines look like "<codepoint or range> ; <property name> # ..."
      regexp = /^[0-9a-fA-F]+(?:\.\.[0-9a-fA-F]+)? *; (?<prop_name>\w+) +# /
      %w[
        DerivedCoreProperties.txt
        PropList.txt
        Scripts.txt
        emoji-data.txt
      ].each { |file| scan(file, regexp) { |caps| @values << caps[:prop_name] } }

      # long names of the "gc" entries
      scan('PropertyValueAliases.txt', /^gc ; \w+ *; (?<prop_name>\w+)/) do |caps|
        @values << caps[:prop_name]
      end

      # block names become "In_<name>", non-word characters turned into "_"
      scan('Blocks.txt', /^[\dA-F.]+ *; (?<block_name>[-\w ]+)/) do |caps|
        @values << 'In_' + caps[:block_name].gsub(/\W/, '_')
      end

      scan('DerivedAge.txt', /^[\dA-F.]+ *; (?<age_num>[\d.]+)/) do |caps|
        @values << 'Age=' + caps[:age_num]
      end

      File.write(RegexpPropertyValues::VALUES_PATH, @values.sort.join("\n"))
    end

    # Collects [alias, name] pairs for every alias whose target is a known
    # value into @aliases and writes them as sorted "alias;name" lines to
    # RegexpPropertyValues::ALIASES_PATH. Expects write_values to have run.
    def write_aliases
      @aliases = Set.new

      scan('PropertyAliases.txt', /^(?<alias>\w+) *; (?<name>\w+)/) do |caps|
        next unless in_values?(caps[:name])

        add_alias(caps[:alias], caps[:name])
      end

      scan('PropertyValueAliases.txt',
           /^[gs]c ; (?<alias1>\w+) *; (?<name>\w+)(?: *; (?<alias2>\w+))?/) do |caps|
        next unless in_values?(caps[:name])

        add_alias(caps[:alias1], caps[:name])
        add_alias(caps[:alias2], caps[:name])
      end

      File.write(RegexpPropertyValues::ALIASES_PATH,
                 @aliases.sort.map { |pair| pair.join(';') }.join("\n"))
    end

    # Records alias_name => name unless the alias is absent (optional capture
    # group did not match) or is itself already a value.
    def add_alias(alias_name, name)
      return if alias_name.nil? || in_values?(alias_name)

      @aliases << [alias_name, name]
    end

    # True if string equals one of the collected values, ignoring case.
    def in_values?(string)
      @values.any? { |value| value.casecmp?(string) }
    end

    # Yields the MatchData of every match of pattern in the given TMP_DIR file.
    def scan(file, pattern)
      path = File.join(TMP_DIR, file)
      File.read(path).scan(pattern) { yield(Regexp.last_match) }
    end

    # Deletes TMP_DIR and everything in it.
    def remove_tmp_dir
      FileUtils.rm_rf(TMP_DIR)
    end

    # Prints how many values and aliases the last run collected.
    def print_stats
      print "\nFetched #{@values.size} values and #{@aliases.size} aliases.\n\n"
    end
  end
end
@@ -0,0 +1,14 @@
1
module RegexpPropertyValues
  # A single property value name. The behaviour is assembled from
  # SharedMethods plus exactly one lookup backend, chosen when this file loads.
  class Value
    require_relative 'value/shared_methods'
    include SharedMethods

    # Use the adapter for OnigRegexpPropertyHelper when that constant is
    # defined, and the pure Ruby fallback otherwise.
    backend_file, backend_name =
      if const_defined?(:OnigRegexpPropertyHelper)
        ['value/ext_adapter', :ExtAdapter]
      else
        ['value/ruby_fallback', :RubyFallback]
      end

    require_relative backend_file
    include const_get(backend_name)
  end
end
@@ -0,0 +1,19 @@
1
module RegexpPropertyValues
  class Value
    # Lookup backend that delegates to OnigRegexpPropertyHelper.
    # The including object must provide #name and
    # #raise_unsupported_or_unknown_error.
    module ExtAdapter
      # Returns every matched codepoint as a one-character UTF-8 string.
      def matched_characters
        matched_codepoints.map do |codepoint|
          codepoint.chr('utf-8')
        end
      end

      # Returns the matched ranges expanded into one flat list of codepoints.
      def matched_codepoints
        matched_ranges.flat_map { |range| range.to_a }
      end

      # Returns whatever OnigRegexpPropertyHelper.matched_ranges yields for
      # this value's name. An ArgumentError from the helper is translated
      # into the unsupported/unknown property error.
      def matched_ranges
        begin
          OnigRegexpPropertyHelper.matched_ranges(name)
        rescue ArgumentError
          raise_unsupported_or_unknown_error
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module RegexpPropertyValues
  class Value
    # Lookup backend that finds matches by scanning with the value's regexp.
    # The including object must provide #regexp.
    module RubyFallback
      # Returns every matched codepoint as a one-character UTF-8 string.
      def matched_characters
        matched_codepoints.map do |codepoint|
          codepoint.chr('utf-8')
        end
      end

      # Returns all codepoints whose character is matched by #regexp.
      def matched_codepoints
        # Scanning one big string turned out to be the least slow approach.
        # The string is built once and cached; it holds every codepoint up to
        # 0x10FFFF except 0xD800..0xDFFF (the surrogate range).
        @@test_str ||=
          [0..0xD7FF, 0xE000..0x10FFFF]
          .flat_map(&:to_a)
          .map { |cp| cp.chr('utf-8') }
          .join
        hits = @@test_str.scan(regexp)
        hits.flat_map { |hit| hit.codepoints }
      end

      # Returns the matched codepoints compressed by RangeCompressor.
      def matched_ranges
        # loaded lazily, only when ranges are actually requested
        require 'range_compressor'
        codepoints = matched_codepoints
        RangeCompressor.compress(codepoints)
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
module RegexpPropertyValues
  class Value
    # Behaviour common to all Value objects, independent of the lookup
    # backend mixed in next to it.
    module SharedMethods
      attr_reader :name

      # name - the property value name as given by the caller.
      def initialize(name)
        @name = name
      end

      # True if a regexp using this property can be built on the running
      # Ruby, false if building it raises.
      def supported_by_current_ruby?
        !!regexp
      rescue StandardError
        false
      end

      # Two values are equal when their normalized identifiers are equal.
      # Objects without an #identifier (nil, strings, ...) are simply not
      # equal rather than raising NoMethodError.
      def ==(other)
        other.respond_to?(:identifier) && identifier == other.identifier
      end
      alias eql? ==

      # Hash of the identifier, so equal values land in the same Hash/Set slot.
      def hash
        @hash ||= identifier.hash
      end

      # Normalized name: downcased, with everything except digits, a-z,
      # "=" and "." removed.
      def identifier
        @identifier ||= name.to_s.downcase.gsub(/[^0-9a-z=.]/, '')
      end
      alias to_s identifier

      # Name of the matching entry in RegexpPropertyValues.all or
      # RegexpPropertyValues.alias_hash. Raises Error if there is none.
      def full_name
        original = find_original
        original ? original.name : raise_unknown_error
      end

      # Returns a CharacterSet built from #matched_ranges.
      def character_set
        # loaded lazily, only when a character set is actually requested
        require 'character_set'
        CharacterSet.from_ranges(*matched_ranges)
      end

      private

      # Memoized regexp matching this property. Raises Error if the property
      # cannot be compiled on the running Ruby.
      def regexp
        @regexp ||= /\p{#{identifier}}/u
      rescue RegexpError, SyntaxError
        raise_unsupported_or_unknown_error
      end

      # Looks this value up among all known values first, then among aliases.
      def find_original
        RegexpPropertyValues.all.find { |orig| orig.eql?(self) } ||
          RegexpPropertyValues.alias_hash[self]
      end

      # Picks the error by whether the name is known to the value lists.
      def raise_unsupported_or_unknown_error
        find_original ? raise_unsupported_error : raise_unknown_error
      end

      def raise_unsupported_error
        raise Error, "Property name `#{name}` is known, but not in this Ruby"
      end

      def raise_unknown_error
        raise Error, "Property name `#{name}` is not known in any Ruby"
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module RegexpPropertyValues
2
- VERSION = '0.3.5'
2
+ VERSION = '1.0.0'
3
3
  end
@@ -0,0 +1,590 @@
1
+ ASCII
2
+ ASCII_Hex_Digit
3
+ Adlam
4
+ Age=1.1
5
+ Age=10.0
6
+ Age=11.0
7
+ Age=12.0
8
+ Age=12.1
9
+ Age=2.0
10
+ Age=2.1
11
+ Age=3.0
12
+ Age=3.1
13
+ Age=3.2
14
+ Age=4.0
15
+ Age=4.1
16
+ Age=5.0
17
+ Age=5.1
18
+ Age=5.2
19
+ Age=6.0
20
+ Age=6.1
21
+ Age=6.2
22
+ Age=6.3
23
+ Age=7.0
24
+ Age=8.0
25
+ Age=9.0
26
+ Ahom
27
+ Alnum
28
+ Alpha
29
+ Alphabetic
30
+ Anatolian_Hieroglyphs
31
+ Any
32
+ Arabic
33
+ Armenian
34
+ Assigned
35
+ Avestan
36
+ Balinese
37
+ Bamum
38
+ Bassa_Vah
39
+ Batak
40
+ Bengali
41
+ Bhaiksuki
42
+ Bidi_Control
43
+ Blank
44
+ Bopomofo
45
+ Brahmi
46
+ Braille
47
+ Buginese
48
+ Buhid
49
+ Canadian_Aboriginal
50
+ Carian
51
+ Case_Ignorable
52
+ Cased
53
+ Cased_Letter
54
+ Caucasian_Albanian
55
+ Chakma
56
+ Cham
57
+ Changes_When_Casefolded
58
+ Changes_When_Casemapped
59
+ Changes_When_Lowercased
60
+ Changes_When_Titlecased
61
+ Changes_When_Uppercased
62
+ Cherokee
63
+ Close_Punctuation
64
+ Cntrl
65
+ Common
66
+ Connector_Punctuation
67
+ Control
68
+ Coptic
69
+ Cuneiform
70
+ Currency_Symbol
71
+ Cypriot
72
+ Cyrillic
73
+ Dash
74
+ Dash_Punctuation
75
+ Decimal_Number
76
+ Default_Ignorable_Code_Point
77
+ Deprecated
78
+ Deseret
79
+ Devanagari
80
+ Diacritic
81
+ Digit
82
+ Dogra
83
+ Duployan
84
+ Egyptian_Hieroglyphs
85
+ Elbasan
86
+ Elymaic
87
+ Emoji
88
+ Emoji_Component
89
+ Emoji_Modifier
90
+ Emoji_Modifier_Base
91
+ Emoji_Presentation
92
+ Enclosing_Mark
93
+ Ethiopic
94
+ Extender
95
+ Final_Punctuation
96
+ Format
97
+ Georgian
98
+ Glagolitic
99
+ Gothic
100
+ Grantha
101
+ Graph
102
+ Grapheme_Base
103
+ Grapheme_Extend
104
+ Grapheme_Link
105
+ Greek
106
+ Gujarati
107
+ Gunjala_Gondi
108
+ Gurmukhi
109
+ Han
110
+ Hangul
111
+ Hanifi_Rohingya
112
+ Hanunoo
113
+ Hatran
114
+ Hebrew
115
+ Hex_Digit
116
+ Hiragana
117
+ Hyphen
118
+ IDS_Binary_Operator
119
+ IDS_Trinary_Operator
120
+ ID_Continue
121
+ ID_Start
122
+ Ideographic
123
+ Imperial_Aramaic
124
+ In_Adlam
125
+ In_Aegean_Numbers
126
+ In_Ahom
127
+ In_Alchemical_Symbols
128
+ In_Alphabetic_Presentation_Forms
129
+ In_Anatolian_Hieroglyphs
130
+ In_Ancient_Greek_Musical_Notation
131
+ In_Ancient_Greek_Numbers
132
+ In_Ancient_Symbols
133
+ In_Arabic
134
+ In_Arabic_Extended_A
135
+ In_Arabic_Mathematical_Alphabetic_Symbols
136
+ In_Arabic_Presentation_Forms_A
137
+ In_Arabic_Presentation_Forms_B
138
+ In_Arabic_Supplement
139
+ In_Armenian
140
+ In_Arrows
141
+ In_Avestan
142
+ In_Balinese
143
+ In_Bamum
144
+ In_Bamum_Supplement
145
+ In_Basic_Latin
146
+ In_Bassa_Vah
147
+ In_Batak
148
+ In_Bengali
149
+ In_Bhaiksuki
150
+ In_Block_Elements
151
+ In_Bopomofo
152
+ In_Bopomofo_Extended
153
+ In_Box_Drawing
154
+ In_Brahmi
155
+ In_Braille_Patterns
156
+ In_Buginese
157
+ In_Buhid
158
+ In_Byzantine_Musical_Symbols
159
+ In_CJK_Compatibility
160
+ In_CJK_Compatibility_Forms
161
+ In_CJK_Compatibility_Ideographs
162
+ In_CJK_Compatibility_Ideographs_Supplement
163
+ In_CJK_Radicals_Supplement
164
+ In_CJK_Strokes
165
+ In_CJK_Symbols_and_Punctuation
166
+ In_CJK_Unified_Ideographs
167
+ In_CJK_Unified_Ideographs_Extension_A
168
+ In_CJK_Unified_Ideographs_Extension_B
169
+ In_CJK_Unified_Ideographs_Extension_C
170
+ In_CJK_Unified_Ideographs_Extension_D
171
+ In_CJK_Unified_Ideographs_Extension_E
172
+ In_CJK_Unified_Ideographs_Extension_F
173
+ In_Carian
174
+ In_Caucasian_Albanian
175
+ In_Chakma
176
+ In_Cham
177
+ In_Cherokee
178
+ In_Cherokee_Supplement
179
+ In_Chess_Symbols
180
+ In_Combining_Diacritical_Marks
181
+ In_Combining_Diacritical_Marks_Extended
182
+ In_Combining_Diacritical_Marks_Supplement
183
+ In_Combining_Diacritical_Marks_for_Symbols
184
+ In_Combining_Half_Marks
185
+ In_Common_Indic_Number_Forms
186
+ In_Control_Pictures
187
+ In_Coptic
188
+ In_Coptic_Epact_Numbers
189
+ In_Counting_Rod_Numerals
190
+ In_Cuneiform
191
+ In_Cuneiform_Numbers_and_Punctuation
192
+ In_Currency_Symbols
193
+ In_Cypriot_Syllabary
194
+ In_Cyrillic
195
+ In_Cyrillic_Extended_A
196
+ In_Cyrillic_Extended_B
197
+ In_Cyrillic_Extended_C
198
+ In_Cyrillic_Supplement
199
+ In_Deseret
200
+ In_Devanagari
201
+ In_Devanagari_Extended
202
+ In_Dingbats
203
+ In_Dogra
204
+ In_Domino_Tiles
205
+ In_Duployan
206
+ In_Early_Dynastic_Cuneiform
207
+ In_Egyptian_Hieroglyph_Format_Controls
208
+ In_Egyptian_Hieroglyphs
209
+ In_Elbasan
210
+ In_Elymaic
211
+ In_Emoticons
212
+ In_Enclosed_Alphanumeric_Supplement
213
+ In_Enclosed_Alphanumerics
214
+ In_Enclosed_CJK_Letters_and_Months
215
+ In_Enclosed_Ideographic_Supplement
216
+ In_Ethiopic
217
+ In_Ethiopic_Extended
218
+ In_Ethiopic_Extended_A
219
+ In_Ethiopic_Supplement
220
+ In_General_Punctuation
221
+ In_Geometric_Shapes
222
+ In_Geometric_Shapes_Extended
223
+ In_Georgian
224
+ In_Georgian_Extended
225
+ In_Georgian_Supplement
226
+ In_Glagolitic
227
+ In_Glagolitic_Supplement
228
+ In_Gothic
229
+ In_Grantha
230
+ In_Greek_Extended
231
+ In_Greek_and_Coptic
232
+ In_Gujarati
233
+ In_Gunjala_Gondi
234
+ In_Gurmukhi
235
+ In_Halfwidth_and_Fullwidth_Forms
236
+ In_Hangul_Compatibility_Jamo
237
+ In_Hangul_Jamo
238
+ In_Hangul_Jamo_Extended_A
239
+ In_Hangul_Jamo_Extended_B
240
+ In_Hangul_Syllables
241
+ In_Hanifi_Rohingya
242
+ In_Hanunoo
243
+ In_Hatran
244
+ In_Hebrew
245
+ In_High_Private_Use_Surrogates
246
+ In_High_Surrogates
247
+ In_Hiragana
248
+ In_IPA_Extensions
249
+ In_Ideographic_Description_Characters
250
+ In_Ideographic_Symbols_and_Punctuation
251
+ In_Imperial_Aramaic
252
+ In_Indic_Siyaq_Numbers
253
+ In_Inscriptional_Pahlavi
254
+ In_Inscriptional_Parthian
255
+ In_Javanese
256
+ In_Kaithi
257
+ In_Kana_Extended_A
258
+ In_Kana_Supplement
259
+ In_Kanbun
260
+ In_Kangxi_Radicals
261
+ In_Kannada
262
+ In_Katakana
263
+ In_Katakana_Phonetic_Extensions
264
+ In_Kayah_Li
265
+ In_Kharoshthi
266
+ In_Khmer
267
+ In_Khmer_Symbols
268
+ In_Khojki
269
+ In_Khudawadi
270
+ In_Lao
271
+ In_Latin_1_Supplement
272
+ In_Latin_Extended_A
273
+ In_Latin_Extended_Additional
274
+ In_Latin_Extended_B
275
+ In_Latin_Extended_C
276
+ In_Latin_Extended_D
277
+ In_Latin_Extended_E
278
+ In_Lepcha
279
+ In_Letterlike_Symbols
280
+ In_Limbu
281
+ In_Linear_A
282
+ In_Linear_B_Ideograms
283
+ In_Linear_B_Syllabary
284
+ In_Lisu
285
+ In_Low_Surrogates
286
+ In_Lycian
287
+ In_Lydian
288
+ In_Mahajani
289
+ In_Mahjong_Tiles
290
+ In_Makasar
291
+ In_Malayalam
292
+ In_Mandaic
293
+ In_Manichaean
294
+ In_Marchen
295
+ In_Masaram_Gondi
296
+ In_Mathematical_Alphanumeric_Symbols
297
+ In_Mathematical_Operators
298
+ In_Mayan_Numerals
299
+ In_Medefaidrin
300
+ In_Meetei_Mayek
301
+ In_Meetei_Mayek_Extensions
302
+ In_Mende_Kikakui
303
+ In_Meroitic_Cursive
304
+ In_Meroitic_Hieroglyphs
305
+ In_Miao
306
+ In_Miscellaneous_Mathematical_Symbols_A
307
+ In_Miscellaneous_Mathematical_Symbols_B
308
+ In_Miscellaneous_Symbols
309
+ In_Miscellaneous_Symbols_and_Arrows
310
+ In_Miscellaneous_Symbols_and_Pictographs
311
+ In_Miscellaneous_Technical
312
+ In_Modi
313
+ In_Modifier_Tone_Letters
314
+ In_Mongolian
315
+ In_Mongolian_Supplement
316
+ In_Mro
317
+ In_Multani
318
+ In_Musical_Symbols
319
+ In_Myanmar
320
+ In_Myanmar_Extended_A
321
+ In_Myanmar_Extended_B
322
+ In_NKo
323
+ In_Nabataean
324
+ In_Nandinagari
325
+ In_New_Tai_Lue
326
+ In_Newa
327
+ In_No_Block
328
+ In_Number_Forms
329
+ In_Nushu
330
+ In_Nyiakeng_Puachue_Hmong
331
+ In_Ogham
332
+ In_Ol_Chiki
333
+ In_Old_Hungarian
334
+ In_Old_Italic
335
+ In_Old_North_Arabian
336
+ In_Old_Permic
337
+ In_Old_Persian
338
+ In_Old_Sogdian
339
+ In_Old_South_Arabian
340
+ In_Old_Turkic
341
+ In_Optical_Character_Recognition
342
+ In_Oriya
343
+ In_Ornamental_Dingbats
344
+ In_Osage
345
+ In_Osmanya
346
+ In_Ottoman_Siyaq_Numbers
347
+ In_Pahawh_Hmong
348
+ In_Palmyrene
349
+ In_Pau_Cin_Hau
350
+ In_Phags_pa
351
+ In_Phaistos_Disc
352
+ In_Phoenician
353
+ In_Phonetic_Extensions
354
+ In_Phonetic_Extensions_Supplement
355
+ In_Playing_Cards
356
+ In_Private_Use_Area
357
+ In_Psalter_Pahlavi
358
+ In_Rejang
359
+ In_Rumi_Numeral_Symbols
360
+ In_Runic
361
+ In_Samaritan
362
+ In_Saurashtra
363
+ In_Sharada
364
+ In_Shavian
365
+ In_Shorthand_Format_Controls
366
+ In_Siddham
367
+ In_Sinhala
368
+ In_Sinhala_Archaic_Numbers
369
+ In_Small_Form_Variants
370
+ In_Small_Kana_Extension
371
+ In_Sogdian
372
+ In_Sora_Sompeng
373
+ In_Soyombo
374
+ In_Spacing_Modifier_Letters
375
+ In_Specials
376
+ In_Sundanese
377
+ In_Sundanese_Supplement
378
+ In_Superscripts_and_Subscripts
379
+ In_Supplemental_Arrows_A
380
+ In_Supplemental_Arrows_B
381
+ In_Supplemental_Arrows_C
382
+ In_Supplemental_Mathematical_Operators
383
+ In_Supplemental_Punctuation
384
+ In_Supplemental_Symbols_and_Pictographs
385
+ In_Supplementary_Private_Use_Area_A
386
+ In_Supplementary_Private_Use_Area_B
387
+ In_Sutton_SignWriting
388
+ In_Syloti_Nagri
389
+ In_Symbols_and_Pictographs_Extended_A
390
+ In_Syriac
391
+ In_Syriac_Supplement
392
+ In_Tagalog
393
+ In_Tagbanwa
394
+ In_Tags
395
+ In_Tai_Le
396
+ In_Tai_Tham
397
+ In_Tai_Viet
398
+ In_Tai_Xuan_Jing_Symbols
399
+ In_Takri
400
+ In_Tamil
401
+ In_Tamil_Supplement
402
+ In_Tangut
403
+ In_Tangut_Components
404
+ In_Telugu
405
+ In_Thaana
406
+ In_Thai
407
+ In_Tibetan
408
+ In_Tifinagh
409
+ In_Tirhuta
410
+ In_Transport_and_Map_Symbols
411
+ In_Ugaritic
412
+ In_Unified_Canadian_Aboriginal_Syllabics
413
+ In_Unified_Canadian_Aboriginal_Syllabics_Extended
414
+ In_Vai
415
+ In_Variation_Selectors
416
+ In_Variation_Selectors_Supplement
417
+ In_Vedic_Extensions
418
+ In_Vertical_Forms
419
+ In_Wancho
420
+ In_Warang_Citi
421
+ In_Yi_Radicals
422
+ In_Yi_Syllables
423
+ In_Yijing_Hexagram_Symbols
424
+ In_Zanabazar_Square
425
+ Inherited
426
+ Initial_Punctuation
427
+ Inscriptional_Pahlavi
428
+ Inscriptional_Parthian
429
+ Javanese
430
+ Join_Control
431
+ Kaithi
432
+ Kannada
433
+ Katakana
434
+ Kayah_Li
435
+ Kharoshthi
436
+ Khmer
437
+ Khojki
438
+ Khudawadi
439
+ Lao
440
+ Latin
441
+ Lepcha
442
+ Letter
443
+ Letter_Number
444
+ Limbu
445
+ Line_Separator
446
+ Linear_A
447
+ Linear_B
448
+ Lisu
449
+ Logical_Order_Exception
450
+ Lower
451
+ Lowercase
452
+ Lowercase_Letter
453
+ Lycian
454
+ Lydian
455
+ Mahajani
456
+ Makasar
457
+ Malayalam
458
+ Mandaic
459
+ Manichaean
460
+ Marchen
461
+ Mark
462
+ Masaram_Gondi
463
+ Math
464
+ Math_Symbol
465
+ Medefaidrin
466
+ Meetei_Mayek
467
+ Mende_Kikakui
468
+ Meroitic_Cursive
469
+ Meroitic_Hieroglyphs
470
+ Miao
471
+ Modi
472
+ Modifier_Letter
473
+ Modifier_Symbol
474
+ Mongolian
475
+ Mro
476
+ Multani
477
+ Myanmar
478
+ Nabataean
479
+ Nandinagari
480
+ New_Tai_Lue
481
+ Newa
482
+ Newline
483
+ Nko
484
+ Noncharacter_Code_Point
485
+ Nonspacing_Mark
486
+ Number
487
+ Nushu
488
+ Nyiakeng_Puachue_Hmong
489
+ Ogham
490
+ Ol_Chiki
491
+ Old_Hungarian
492
+ Old_Italic
493
+ Old_North_Arabian
494
+ Old_Permic
495
+ Old_Persian
496
+ Old_Sogdian
497
+ Old_South_Arabian
498
+ Old_Turkic
499
+ Open_Punctuation
500
+ Oriya
501
+ Osage
502
+ Osmanya
503
+ Other
504
+ Other_Alphabetic
505
+ Other_Default_Ignorable_Code_Point
506
+ Other_Grapheme_Extend
507
+ Other_ID_Continue
508
+ Other_ID_Start
509
+ Other_Letter
510
+ Other_Lowercase
511
+ Other_Math
512
+ Other_Number
513
+ Other_Punctuation
514
+ Other_Symbol
515
+ Other_Uppercase
516
+ Pahawh_Hmong
517
+ Palmyrene
518
+ Paragraph_Separator
519
+ Pattern_Syntax
520
+ Pattern_White_Space
521
+ Pau_Cin_Hau
522
+ Phags_Pa
523
+ Phoenician
524
+ Prepended_Concatenation_Mark
525
+ Print
526
+ Private_Use
527
+ Psalter_Pahlavi
528
+ Punct
529
+ Punctuation
530
+ Quotation_Mark
531
+ Radical
532
+ Regional_Indicator
533
+ Rejang
534
+ Runic
535
+ Samaritan
536
+ Saurashtra
537
+ Sentence_Terminal
538
+ Separator
539
+ Sharada
540
+ Shavian
541
+ Siddham
542
+ SignWriting
543
+ Sinhala
544
+ Soft_Dotted
545
+ Sogdian
546
+ Sora_Sompeng
547
+ Soyombo
548
+ Space
549
+ Space_Separator
550
+ Spacing_Mark
551
+ Sundanese
552
+ Surrogate
553
+ Syloti_Nagri
554
+ Symbol
555
+ Syriac
556
+ Tagalog
557
+ Tagbanwa
558
+ Tai_Le
559
+ Tai_Tham
560
+ Tai_Viet
561
+ Takri
562
+ Tamil
563
+ Tangut
564
+ Telugu
565
+ Terminal_Punctuation
566
+ Thaana
567
+ Thai
568
+ Tibetan
569
+ Tifinagh
570
+ Tirhuta
571
+ Titlecase_Letter
572
+ Ugaritic
573
+ Unassigned
574
+ Unified_Ideograph
575
+ Unknown
576
+ Upper
577
+ Uppercase
578
+ Uppercase_Letter
579
+ Vai
580
+ Variation_Selector
581
+ Wancho
582
+ Warang_Citi
583
+ White_Space
584
+ Word
585
+ XDigit
586
+ XID_Continue
587
+ XID_Start
588
+ XPosixPunct
589
+ Yi
590
+ Zanabazar_Square