unicode_script_detector 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a0febe236b556e42077b401d8e117b3996c6065dcbb33c974a572a8af64d14a4
|
|
4
|
+
data.tar.gz: f8ef874b90e0ca8e387bc16d5fc947745fcbe34bbd8ea53c328f98d8c3d8eddd
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: c5b931f1a8f527900f9c37ba0af67ca4e1fbfc0a0fde2f382fc1c2853a2c8813654f1d334f692e39882421eb3a4825f8e056d91d92d58fb4c2c1c22266c347b3
|
|
7
|
+
data.tar.gz: 69ac1e2314cefd944af0958cba15764afdf05e6e6eb314b68bc75172c7d1085be00725d40df5fe7f1af23abccc0243fa5d134c0b590da62d83913dce6570f0c1
|
|
@@ -1,55 +1,65 @@
|
|
|
1
1
|
module UnicodeScriptDetector
|
|
2
2
|
class Detector
|
|
3
|
-
attr_reader :
|
|
3
|
+
attr_reader :scripts
|
|
4
4
|
|
|
5
5
|
def initialize(string)
|
|
6
6
|
@string = string
|
|
7
|
-
@
|
|
8
|
-
@
|
|
7
|
+
@char_scripts = []
|
|
8
|
+
@script_names = []
|
|
9
9
|
|
|
10
10
|
detect_scripts
|
|
11
11
|
end
|
|
12
12
|
|
|
13
13
|
def scripts
|
|
14
|
-
@
|
|
15
|
-
end
|
|
16
|
-
|
|
17
|
-
def detect_scripts
|
|
18
|
-
@string.chars.each_with_index do |char, index|
|
|
19
|
-
detected = false
|
|
20
|
-
Scripts::LIST.each_with_index do |script_data, index|
|
|
21
|
-
if char.match?(script_data[:regex])
|
|
22
|
-
@characters << Character.new(char, script_data[:script], script_data[:name])
|
|
23
|
-
@scripts << script_data[:script]
|
|
24
|
-
detected = true
|
|
25
|
-
break
|
|
26
|
-
end
|
|
27
|
-
end
|
|
28
|
-
@characters << Character.new(char, :Other, "Other") unless detected
|
|
29
|
-
@scripts << :Other unless detected
|
|
30
|
-
end
|
|
14
|
+
@char_scripts.uniq
|
|
31
15
|
end
|
|
32
16
|
|
|
33
17
|
def contains?(scripts)
|
|
34
|
-
return @
|
|
35
|
-
|
|
36
|
-
scripts.all? { |script| @scripts.include?(script) }
|
|
18
|
+
return @char_scripts.include?(scripts) if scripts.is_a?(Symbol)
|
|
19
|
+
scripts.all? { |script| @char_scripts.include?(script) }
|
|
37
20
|
end
|
|
38
21
|
|
|
39
22
|
def contains_only?(scripts)
|
|
40
|
-
return @
|
|
41
|
-
|
|
42
|
-
@scripts.uniq.sort == scripts.uniq.sort
|
|
23
|
+
return @char_scripts.uniq == [scripts] if scripts.is_a?(Symbol)
|
|
24
|
+
@char_scripts.uniq.sort == scripts.uniq.sort
|
|
43
25
|
end
|
|
44
26
|
|
|
45
27
|
def script_groups
|
|
46
|
-
@
|
|
47
|
-
.
|
|
48
|
-
.
|
|
28
|
+
@string.chars
|
|
29
|
+
.zip(@char_scripts, @script_names)
|
|
30
|
+
.chunk { |_, script, _| script }
|
|
31
|
+
.map do |script, char_data|
|
|
32
|
+
chars = char_data.map(&:first)
|
|
33
|
+
name = char_data.first[2]
|
|
34
|
+
ScriptGroup.new(script, chars, name)
|
|
35
|
+
end
|
|
49
36
|
end
|
|
50
37
|
|
|
51
38
|
def grouped_scripts_hash
|
|
52
39
|
script_groups.map { |group| [group.script, group.text] }.to_h
|
|
53
40
|
end
|
|
41
|
+
|
|
42
|
+
def characters
|
|
43
|
+
@characters ||= @string.chars.zip(@char_scripts, @script_names).map do |char, script, name|
|
|
44
|
+
Character.new(char, script, name)
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
private
|
|
49
|
+
def detect_scripts
|
|
50
|
+
@string.chars.each do |char|
|
|
51
|
+
script_info = find_script_for_char(char)
|
|
52
|
+
@char_scripts << script_info[:script]
|
|
53
|
+
@script_names << script_info[:name]
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def find_script_for_char(char)
|
|
58
|
+
Scripts::LIST.each do |script_data|
|
|
59
|
+
return script_data if char.match?(script_data[:regex])
|
|
60
|
+
end
|
|
61
|
+
{ script: :Other, name: "Other" }
|
|
62
|
+
end
|
|
63
|
+
|
|
54
64
|
end
|
|
55
65
|
end
|
|
@@ -1,19 +1,20 @@
|
|
|
1
1
|
module UnicodeScriptDetector
|
|
2
2
|
class ScriptGroup
|
|
3
|
-
attr_reader :script, :
|
|
3
|
+
attr_reader :script, :text, :name
|
|
4
4
|
|
|
5
|
-
def initialize(script,
|
|
5
|
+
def initialize(script, chars, name)
|
|
6
6
|
@script = script
|
|
7
|
-
@
|
|
8
|
-
@text =
|
|
7
|
+
@chars = chars
|
|
8
|
+
@text = chars.join
|
|
9
|
+
@name = name
|
|
9
10
|
end
|
|
10
11
|
|
|
11
12
|
def length
|
|
12
|
-
@
|
|
13
|
+
@chars.length
|
|
13
14
|
end
|
|
14
15
|
|
|
15
|
-
def
|
|
16
|
-
@characters.
|
|
16
|
+
def characters
|
|
17
|
+
@characters ||= @chars.map { |char| Character.new(char, @script, @name) }
|
|
17
18
|
end
|
|
18
19
|
end
|
|
19
20
|
end
|
|
@@ -1,21 +1,6 @@
|
|
|
1
1
|
module UnicodeScriptDetector
|
|
2
2
|
class Scripts
|
|
3
3
|
LIST = [
|
|
4
|
-
{
|
|
5
|
-
script: :Whitespace,
|
|
6
|
-
name: "Whitespace",
|
|
7
|
-
regex: /\s/
|
|
8
|
-
},
|
|
9
|
-
{
|
|
10
|
-
script: :Digit,
|
|
11
|
-
name: "Digit",
|
|
12
|
-
regex: /\d/
|
|
13
|
-
},
|
|
14
|
-
{
|
|
15
|
-
script: :Punctuation,
|
|
16
|
-
name: "Punctuation",
|
|
17
|
-
regex: /[[:punct:]]/
|
|
18
|
-
},
|
|
19
4
|
{
|
|
20
5
|
script: :Adlam,
|
|
21
6
|
name: "Adlam",
|
|
@@ -28,7 +13,7 @@ module UnicodeScriptDetector
|
|
|
28
13
|
},
|
|
29
14
|
{
|
|
30
15
|
script: :Anatolian_Hieroglyphs,
|
|
31
|
-
name: "
|
|
16
|
+
name: "Anatolian Hieroglyphs",
|
|
32
17
|
regex: /\p{Anatolian_Hieroglyphs}/,
|
|
33
18
|
},
|
|
34
19
|
{
|
|
@@ -58,7 +43,7 @@ module UnicodeScriptDetector
|
|
|
58
43
|
},
|
|
59
44
|
{
|
|
60
45
|
script: :Bassa_Vah,
|
|
61
|
-
name: "
|
|
46
|
+
name: "Bassa Vah",
|
|
62
47
|
regex: /\p{Bassa_Vah}/,
|
|
63
48
|
},
|
|
64
49
|
{
|
|
@@ -103,7 +88,7 @@ module UnicodeScriptDetector
|
|
|
103
88
|
},
|
|
104
89
|
{
|
|
105
90
|
script: :Canadian_Aboriginal,
|
|
106
|
-
name: "
|
|
91
|
+
name: "Canadian Aboriginal",
|
|
107
92
|
regex: /\p{Canadian_Aboriginal}/,
|
|
108
93
|
},
|
|
109
94
|
{
|
|
@@ -113,7 +98,7 @@ module UnicodeScriptDetector
|
|
|
113
98
|
},
|
|
114
99
|
{
|
|
115
100
|
script: :Caucasian_Albanian,
|
|
116
|
-
name: "
|
|
101
|
+
name: "Caucasian Albanian",
|
|
117
102
|
regex: /\p{Caucasian_Albanian}/,
|
|
118
103
|
},
|
|
119
104
|
{
|
|
@@ -153,7 +138,7 @@ module UnicodeScriptDetector
|
|
|
153
138
|
},
|
|
154
139
|
{
|
|
155
140
|
script: :Cypro_Minoan,
|
|
156
|
-
name: "
|
|
141
|
+
name: "Cypro Minoan",
|
|
157
142
|
regex: /\p{Cypro_Minoan}/,
|
|
158
143
|
},
|
|
159
144
|
{
|
|
@@ -173,7 +158,7 @@ module UnicodeScriptDetector
|
|
|
173
158
|
},
|
|
174
159
|
{
|
|
175
160
|
script: :Dives_Akuru,
|
|
176
|
-
name: "
|
|
161
|
+
name: "Dives Akuru",
|
|
177
162
|
regex: /\p{Dives_Akuru}/,
|
|
178
163
|
},
|
|
179
164
|
{
|
|
@@ -188,8 +173,8 @@ module UnicodeScriptDetector
|
|
|
188
173
|
},
|
|
189
174
|
{
|
|
190
175
|
script: :Egyptian_Hieroglyphs,
|
|
191
|
-
name: "
|
|
192
|
-
regex: /\p{Egyptian_Hieroglyphs}/,
|
|
176
|
+
name: "Egyptian Hieroglyphs",
|
|
177
|
+
regex: /\p{Egyptian_Hieroglyphs}|[\u{13460}-\u{1355F}]/,
|
|
193
178
|
},
|
|
194
179
|
{
|
|
195
180
|
script: :Elbasan,
|
|
@@ -206,7 +191,11 @@ module UnicodeScriptDetector
|
|
|
206
191
|
name: "Ethiopic",
|
|
207
192
|
regex: /\p{Ethiopic}/,
|
|
208
193
|
},
|
|
209
|
-
|
|
194
|
+
{
|
|
195
|
+
script: :Garay,
|
|
196
|
+
name: "Garay",
|
|
197
|
+
regex: /[\u{10D40}-\u{10D8F}]/,
|
|
198
|
+
},
|
|
210
199
|
{
|
|
211
200
|
script: :Georgian,
|
|
212
201
|
name: "Georgian",
|
|
@@ -239,7 +228,7 @@ module UnicodeScriptDetector
|
|
|
239
228
|
},
|
|
240
229
|
{
|
|
241
230
|
script: :Gunjala_Gondi,
|
|
242
|
-
name: "
|
|
231
|
+
name: "Gunjala Gondi",
|
|
243
232
|
regex: /\p{Gunjala_Gondi}/,
|
|
244
233
|
},
|
|
245
234
|
{
|
|
@@ -247,11 +236,15 @@ module UnicodeScriptDetector
|
|
|
247
236
|
name: "Gurmukhi",
|
|
248
237
|
regex: /\p{Gurmukhi}/,
|
|
249
238
|
},
|
|
250
|
-
|
|
239
|
+
{
|
|
240
|
+
script: :Gurung_Khema,
|
|
241
|
+
name: "Gurung Khema",
|
|
242
|
+
regex: /[\u{16100}-\u{1613F}]/,
|
|
243
|
+
},
|
|
251
244
|
{
|
|
252
245
|
script: :Han,
|
|
253
246
|
name: "Han",
|
|
254
|
-
regex: /\p{Han}/,
|
|
247
|
+
regex: /\p{Han}|[\u{323B0}-\u{3347F}]/,
|
|
255
248
|
},
|
|
256
249
|
{
|
|
257
250
|
script: :Hangul,
|
|
@@ -260,7 +253,7 @@ module UnicodeScriptDetector
|
|
|
260
253
|
},
|
|
261
254
|
{
|
|
262
255
|
script: :Hanifi_Rohingya,
|
|
263
|
-
name: "
|
|
256
|
+
name: "Hanifi Rohingya",
|
|
264
257
|
regex: /\p{Hanifi_Rohingya}/,
|
|
265
258
|
},
|
|
266
259
|
{
|
|
@@ -285,7 +278,7 @@ module UnicodeScriptDetector
|
|
|
285
278
|
},
|
|
286
279
|
{
|
|
287
280
|
script: :Imperial_Aramaic,
|
|
288
|
-
name: "
|
|
281
|
+
name: "Imperial Aramaic",
|
|
289
282
|
regex: /\p{Imperial_Aramaic}/,
|
|
290
283
|
},
|
|
291
284
|
{
|
|
@@ -295,12 +288,12 @@ module UnicodeScriptDetector
|
|
|
295
288
|
},
|
|
296
289
|
{
|
|
297
290
|
script: :Inscriptional_Pahlavi,
|
|
298
|
-
name: "
|
|
291
|
+
name: "Inscriptional Pahlavi",
|
|
299
292
|
regex: /\p{Inscriptional_Pahlavi}/,
|
|
300
293
|
},
|
|
301
294
|
{
|
|
302
295
|
script: :Inscriptional_Parthian,
|
|
303
|
-
name: "
|
|
296
|
+
name: "Inscriptional Parthian",
|
|
304
297
|
regex: /\p{Inscriptional_Parthian}/,
|
|
305
298
|
},
|
|
306
299
|
{
|
|
@@ -323,7 +316,6 @@ module UnicodeScriptDetector
|
|
|
323
316
|
name: "Katakana",
|
|
324
317
|
regex: /\p{Katakana}/,
|
|
325
318
|
},
|
|
326
|
-
|
|
327
319
|
{
|
|
328
320
|
script: :Kawi,
|
|
329
321
|
name: "Kawi",
|
|
@@ -331,7 +323,7 @@ module UnicodeScriptDetector
|
|
|
331
323
|
},
|
|
332
324
|
{
|
|
333
325
|
script: :Kayah_Li,
|
|
334
|
-
name: "
|
|
326
|
+
name: "Kayah Li",
|
|
335
327
|
regex: /\p{Kayah_Li}/,
|
|
336
328
|
},
|
|
337
329
|
{
|
|
@@ -341,7 +333,7 @@ module UnicodeScriptDetector
|
|
|
341
333
|
},
|
|
342
334
|
{
|
|
343
335
|
script: :Khitan_Small_Script,
|
|
344
|
-
name: "
|
|
336
|
+
name: "Khitan Small Script",
|
|
345
337
|
regex: /\p{Khitan_Small_Script}/,
|
|
346
338
|
},
|
|
347
339
|
{
|
|
@@ -359,7 +351,11 @@ module UnicodeScriptDetector
|
|
|
359
351
|
name: "Khudawadi",
|
|
360
352
|
regex: /\p{Khudawadi}/,
|
|
361
353
|
},
|
|
362
|
-
|
|
354
|
+
{
|
|
355
|
+
script: :Kirat_Rai,
|
|
356
|
+
name: "Kirat Rai",
|
|
357
|
+
regex: /[\u{16D40}-\u{16D7F}]/,
|
|
358
|
+
},
|
|
363
359
|
{
|
|
364
360
|
script: :Lao,
|
|
365
361
|
name: "Lao",
|
|
@@ -382,12 +378,12 @@ module UnicodeScriptDetector
|
|
|
382
378
|
},
|
|
383
379
|
{
|
|
384
380
|
script: :Linear_A,
|
|
385
|
-
name: "
|
|
381
|
+
name: "Linear A",
|
|
386
382
|
regex: /\p{Linear_A}/,
|
|
387
383
|
},
|
|
388
384
|
{
|
|
389
385
|
script: :Linear_B,
|
|
390
|
-
name: "
|
|
386
|
+
name: "Linear B",
|
|
391
387
|
regex: /\p{Linear_B}/,
|
|
392
388
|
},
|
|
393
389
|
{
|
|
@@ -437,7 +433,7 @@ module UnicodeScriptDetector
|
|
|
437
433
|
},
|
|
438
434
|
{
|
|
439
435
|
script: :Masaram_Gondi,
|
|
440
|
-
name: "
|
|
436
|
+
name: "Masaram Gondi",
|
|
441
437
|
regex: /\p{Masaram_Gondi}/,
|
|
442
438
|
},
|
|
443
439
|
{
|
|
@@ -447,22 +443,22 @@ module UnicodeScriptDetector
|
|
|
447
443
|
},
|
|
448
444
|
{
|
|
449
445
|
script: :Meetei_Mayek,
|
|
450
|
-
name: "
|
|
446
|
+
name: "Meetei Mayek",
|
|
451
447
|
regex: /\p{Meetei_Mayek}/,
|
|
452
448
|
},
|
|
453
449
|
{
|
|
454
450
|
script: :Mende_Kikakui,
|
|
455
|
-
name: "
|
|
451
|
+
name: "Mende Kikakui",
|
|
456
452
|
regex: /\p{Mende_Kikakui}/,
|
|
457
453
|
},
|
|
458
454
|
{
|
|
459
455
|
script: :Meroitic_Cursive,
|
|
460
|
-
name: "
|
|
456
|
+
name: "Meroitic Cursive",
|
|
461
457
|
regex: /\p{Meroitic_Cursive}/,
|
|
462
458
|
},
|
|
463
459
|
{
|
|
464
460
|
script: :Meroitic_Hieroglyphs,
|
|
465
|
-
name: "
|
|
461
|
+
name: "Meroitic Hieroglyphs",
|
|
466
462
|
regex: /\p{Meroitic_Hieroglyphs}/,
|
|
467
463
|
},
|
|
468
464
|
{
|
|
@@ -493,7 +489,7 @@ module UnicodeScriptDetector
|
|
|
493
489
|
{
|
|
494
490
|
script: :Myanmar,
|
|
495
491
|
name: "Myanmar",
|
|
496
|
-
regex: /\p{Myanmar}/,
|
|
492
|
+
regex: /\p{Myanmar}|[\u{116D0}-\u{116FF}]/,
|
|
497
493
|
},
|
|
498
494
|
{
|
|
499
495
|
script: :Nabataean,
|
|
@@ -502,7 +498,7 @@ module UnicodeScriptDetector
|
|
|
502
498
|
},
|
|
503
499
|
{
|
|
504
500
|
script: :Nag_Mundari,
|
|
505
|
-
name: "
|
|
501
|
+
name: "Nag Mundari",
|
|
506
502
|
regex: /\p{Nag_Mundari}/,
|
|
507
503
|
},
|
|
508
504
|
{
|
|
@@ -512,7 +508,7 @@ module UnicodeScriptDetector
|
|
|
512
508
|
},
|
|
513
509
|
{
|
|
514
510
|
script: :New_Tai_Lue,
|
|
515
|
-
name: "
|
|
511
|
+
name: "New Tai Lue",
|
|
516
512
|
regex: /\p{New_Tai_Lue}/,
|
|
517
513
|
},
|
|
518
514
|
{
|
|
@@ -532,7 +528,7 @@ module UnicodeScriptDetector
|
|
|
532
528
|
},
|
|
533
529
|
{
|
|
534
530
|
script: :Nyiakeng_Puachue_Hmong,
|
|
535
|
-
name: "
|
|
531
|
+
name: "Nyiakeng Puachue Hmong",
|
|
536
532
|
regex: /\p{Nyiakeng_Puachue_Hmong}/,
|
|
537
533
|
},
|
|
538
534
|
{
|
|
@@ -542,53 +538,57 @@ module UnicodeScriptDetector
|
|
|
542
538
|
},
|
|
543
539
|
{
|
|
544
540
|
script: :Ol_Chiki,
|
|
545
|
-
name: "
|
|
541
|
+
name: "Ol Chiki",
|
|
546
542
|
regex: /\p{Ol_Chiki}/,
|
|
547
543
|
},
|
|
548
|
-
|
|
544
|
+
{
|
|
545
|
+
script: :Ol_Onal,
|
|
546
|
+
name: "Ol Onal",
|
|
547
|
+
regex: /[\u{1E5D0}-\u{1E5FF}]/,
|
|
548
|
+
},
|
|
549
549
|
{
|
|
550
550
|
script: :Old_Hungarian,
|
|
551
|
-
name: "
|
|
551
|
+
name: "Old Hungarian",
|
|
552
552
|
regex: /\p{Old_Hungarian}/,
|
|
553
553
|
},
|
|
554
554
|
{
|
|
555
555
|
script: :Old_Italic,
|
|
556
|
-
name: "
|
|
556
|
+
name: "Old Italic",
|
|
557
557
|
regex: /\p{Old_Italic}/,
|
|
558
558
|
},
|
|
559
559
|
{
|
|
560
560
|
script: :Old_North_Arabian,
|
|
561
|
-
name: "
|
|
561
|
+
name: "Old North Arabian",
|
|
562
562
|
regex: /\p{Old_North_Arabian}/,
|
|
563
563
|
},
|
|
564
564
|
{
|
|
565
565
|
script: :Old_Permic,
|
|
566
|
-
name: "
|
|
566
|
+
name: "Old Permic",
|
|
567
567
|
regex: /\p{Old_Permic}/,
|
|
568
568
|
},
|
|
569
569
|
{
|
|
570
570
|
script: :Old_Persian,
|
|
571
|
-
name: "
|
|
571
|
+
name: "Old Persian",
|
|
572
572
|
regex: /\p{Old_Persian}/,
|
|
573
573
|
},
|
|
574
574
|
{
|
|
575
575
|
script: :Old_Sogdian,
|
|
576
|
-
name: "
|
|
576
|
+
name: "Old Sogdian",
|
|
577
577
|
regex: /\p{Old_Sogdian}/,
|
|
578
578
|
},
|
|
579
579
|
{
|
|
580
580
|
script: :Old_South_Arabian,
|
|
581
|
-
name: "
|
|
581
|
+
name: "Old South Arabian",
|
|
582
582
|
regex: /\p{Old_South_Arabian}/,
|
|
583
583
|
},
|
|
584
584
|
{
|
|
585
585
|
script: :Old_Turkic,
|
|
586
|
-
name: "
|
|
586
|
+
name: "Old Turkic",
|
|
587
587
|
regex: /\p{Old_Turkic}/,
|
|
588
588
|
},
|
|
589
589
|
{
|
|
590
590
|
script: :Old_Uyghur,
|
|
591
|
-
name: "
|
|
591
|
+
name: "Old Uyghur",
|
|
592
592
|
regex: /\p{Old_Uyghur}/,
|
|
593
593
|
},
|
|
594
594
|
{
|
|
@@ -608,7 +608,7 @@ module UnicodeScriptDetector
|
|
|
608
608
|
},
|
|
609
609
|
{
|
|
610
610
|
script: :Pahawh_Hmong,
|
|
611
|
-
name: "
|
|
611
|
+
name: "Pahawh Hmong",
|
|
612
612
|
regex: /\p{Pahawh_Hmong}/,
|
|
613
613
|
},
|
|
614
614
|
{
|
|
@@ -618,12 +618,12 @@ module UnicodeScriptDetector
|
|
|
618
618
|
},
|
|
619
619
|
{
|
|
620
620
|
script: :Pau_Cin_Hau,
|
|
621
|
-
name: "
|
|
621
|
+
name: "Pau Cin Hau",
|
|
622
622
|
regex: /\p{Pau_Cin_Hau}/,
|
|
623
623
|
},
|
|
624
624
|
{
|
|
625
625
|
script: :Phags_Pa,
|
|
626
|
-
name: "
|
|
626
|
+
name: "Phags Pa",
|
|
627
627
|
regex: /\p{Phags_Pa}/,
|
|
628
628
|
},
|
|
629
629
|
{
|
|
@@ -633,7 +633,7 @@ module UnicodeScriptDetector
|
|
|
633
633
|
},
|
|
634
634
|
{
|
|
635
635
|
script: :Psalter_Pahlavi,
|
|
636
|
-
name: "
|
|
636
|
+
name: "Psalter Pahlavi",
|
|
637
637
|
regex: /\p{Psalter_Pahlavi}/,
|
|
638
638
|
},
|
|
639
639
|
{
|
|
@@ -659,7 +659,7 @@ module UnicodeScriptDetector
|
|
|
659
659
|
{
|
|
660
660
|
script: :Sharada,
|
|
661
661
|
name: "Sharada",
|
|
662
|
-
regex: /\p{Sharada}/,
|
|
662
|
+
regex: /\p{Sharada}|[\u{11B60}-\u{11B7F}]/,
|
|
663
663
|
},
|
|
664
664
|
{
|
|
665
665
|
script: :Shavian,
|
|
@@ -671,6 +671,11 @@ module UnicodeScriptDetector
|
|
|
671
671
|
name: "Siddham",
|
|
672
672
|
regex: /\p{Siddham}/,
|
|
673
673
|
},
|
|
674
|
+
{
|
|
675
|
+
script: :Sidetic,
|
|
676
|
+
name: "Sidetic",
|
|
677
|
+
regex: /[\u{10940}-\u{1095F}]/
|
|
678
|
+
},
|
|
674
679
|
{
|
|
675
680
|
script: :SignWriting,
|
|
676
681
|
name: "SignWriting",
|
|
@@ -688,7 +693,7 @@ module UnicodeScriptDetector
|
|
|
688
693
|
},
|
|
689
694
|
{
|
|
690
695
|
script: :Sora_Sompeng,
|
|
691
|
-
name: "
|
|
696
|
+
name: "Sora Sompeng",
|
|
692
697
|
regex: /\p{Sora_Sompeng}/,
|
|
693
698
|
},
|
|
694
699
|
{
|
|
@@ -701,10 +706,14 @@ module UnicodeScriptDetector
|
|
|
701
706
|
name: "Sundanese",
|
|
702
707
|
regex: /\p{Sundanese}/,
|
|
703
708
|
},
|
|
704
|
-
|
|
709
|
+
{
|
|
710
|
+
script: :Sunuwar,
|
|
711
|
+
name: "Sunuwar",
|
|
712
|
+
regex: /[\u{11BC0}-\u{11BFF}]/,
|
|
713
|
+
},
|
|
705
714
|
{
|
|
706
715
|
script: :Syloti_Nagri,
|
|
707
|
-
name: "
|
|
716
|
+
name: "Syloti Nagri",
|
|
708
717
|
regex: /\p{Syloti_Nagri}/,
|
|
709
718
|
},
|
|
710
719
|
{
|
|
@@ -724,19 +733,24 @@ module UnicodeScriptDetector
|
|
|
724
733
|
},
|
|
725
734
|
{
|
|
726
735
|
script: :Tai_Le,
|
|
727
|
-
name: "
|
|
736
|
+
name: "Tai Le",
|
|
728
737
|
regex: /\p{Tai_Le}/,
|
|
729
738
|
},
|
|
730
739
|
{
|
|
731
740
|
script: :Tai_Tham,
|
|
732
|
-
name: "
|
|
741
|
+
name: "Tai Tham",
|
|
733
742
|
regex: /\p{Tai_Tham}/,
|
|
734
743
|
},
|
|
735
744
|
{
|
|
736
745
|
script: :Tai_Viet,
|
|
737
|
-
name: "
|
|
746
|
+
name: "Tai Viet",
|
|
738
747
|
regex: /\p{Tai_Viet}/,
|
|
739
748
|
},
|
|
749
|
+
{
|
|
750
|
+
script: :Tai_Yo,
|
|
751
|
+
name: "Tai Yo",
|
|
752
|
+
regex: /[\u{1E6C0}-\u{1E6FF}]/,
|
|
753
|
+
},
|
|
740
754
|
{
|
|
741
755
|
script: :Takri,
|
|
742
756
|
name: "Takri",
|
|
@@ -755,7 +769,7 @@ module UnicodeScriptDetector
|
|
|
755
769
|
{
|
|
756
770
|
script: :Tangut,
|
|
757
771
|
name: "Tangut",
|
|
758
|
-
regex: /\p{Tangut}/,
|
|
772
|
+
regex: /\p{Tangut}|[\u{18D80}-\u{18DFF}]/,
|
|
759
773
|
},
|
|
760
774
|
{
|
|
761
775
|
script: :Telugu,
|
|
@@ -787,13 +801,26 @@ module UnicodeScriptDetector
|
|
|
787
801
|
name: "Tirhuta",
|
|
788
802
|
regex: /\p{Tirhuta}/,
|
|
789
803
|
},
|
|
790
|
-
|
|
804
|
+
{
|
|
805
|
+
script: :Todhri,
|
|
806
|
+
name: "Todhri",
|
|
807
|
+
regex: /[\u{105C0}-\u{105FF}]/,
|
|
808
|
+
},
|
|
809
|
+
{
|
|
810
|
+
script: :Tolong_Siki,
|
|
811
|
+
name: "Tolong Siki",
|
|
812
|
+
regex: /[\u{11DB0}-\u{11DEF}]/,
|
|
813
|
+
},
|
|
791
814
|
{
|
|
792
815
|
script: :Toto,
|
|
793
816
|
name: "Toto",
|
|
794
817
|
regex: /\p{Toto}/,
|
|
795
818
|
},
|
|
796
|
-
|
|
819
|
+
{
|
|
820
|
+
script: :Tulu_Tigalari,
|
|
821
|
+
name: "Tulu Tigalari",
|
|
822
|
+
regex: /[\u{11380}-\u{113FF}]/,
|
|
823
|
+
},
|
|
797
824
|
{
|
|
798
825
|
script: :Ugaritic,
|
|
799
826
|
name: "Ugaritic",
|
|
@@ -821,7 +848,7 @@ module UnicodeScriptDetector
|
|
|
821
848
|
},
|
|
822
849
|
{
|
|
823
850
|
script: :Warang_Citi,
|
|
824
|
-
name: "
|
|
851
|
+
name: "Warang Citi",
|
|
825
852
|
regex: /\p{Warang_Citi}/,
|
|
826
853
|
},
|
|
827
854
|
{
|
|
@@ -836,18 +863,35 @@ module UnicodeScriptDetector
|
|
|
836
863
|
},
|
|
837
864
|
{
|
|
838
865
|
script: :Zanabazar_Square,
|
|
839
|
-
name: "
|
|
866
|
+
name: "Zanabazar Square",
|
|
840
867
|
regex: /\p{Zanabazar_Square}/,
|
|
841
868
|
},
|
|
869
|
+
|
|
870
|
+
#Special characters
|
|
871
|
+
{
|
|
872
|
+
script: :Whitespace,
|
|
873
|
+
name: "Whitespace",
|
|
874
|
+
regex: /\s/
|
|
875
|
+
},
|
|
876
|
+
{
|
|
877
|
+
script: :Digit,
|
|
878
|
+
name: "Digit",
|
|
879
|
+
regex: /\d/
|
|
880
|
+
},
|
|
842
881
|
{
|
|
843
882
|
script: :Emoji,
|
|
844
883
|
name: "Emoji",
|
|
845
|
-
regex: /\p{
|
|
884
|
+
regex: /\p{Emoji_Presentation}/,
|
|
885
|
+
},
|
|
886
|
+
{
|
|
887
|
+
script: :Punctuation,
|
|
888
|
+
name: "Punctuation",
|
|
889
|
+
regex: /[[:punct:]]/
|
|
846
890
|
},
|
|
847
891
|
{
|
|
848
892
|
script: :Common,
|
|
849
893
|
name: "Common",
|
|
850
|
-
regex: /\p{Common}/,
|
|
894
|
+
regex: /\p{Common}|[\u{1CEC0}-\u{1CEFF}]|[\u{1CC00}-\u{1CEBF}]/,
|
|
851
895
|
},
|
|
852
896
|
]
|
|
853
897
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: unicode_script_detector
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
4
|
+
version: 0.0.7
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- David Arendsen
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2026-01-02 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: zeitwerk
|