unicode_script_detector 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +30 -8
- data/lib/unicode_script_detector/character.rb +13 -1
- data/lib/unicode_script_detector/detector.rb +11 -1
- data/lib/unicode_script_detector/script_group.rb +19 -0
- data/lib/unicode_script_detector/scripts.rb +347 -9
- data/lib/unicode_script_detector/version.rb +1 -1
- data/lib/unicode_script_detector.rb +5 -1
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 4795cdb246ac34ccb0ee5183ff0e704d25e4e67410acee321f36f4446dd28356
|
|
4
|
+
data.tar.gz: 0c7b9c4c835718f2fc7509225204e38c0a7148949c5745f1448b299c27e0e88d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5da7422c57295f4ac3dee3ac9ccfaa99b5586418de956a88876035541da023e9fa4afe609a4aa79d4c3a1a5f9b1ffe64370984657844c06fc6a575578beb5ee2
|
|
7
|
+
data.tar.gz: aa9fecf48386b6eb5a0074cbbec8819af80153c6111e042debd9e9c312145bc11a936b3003a406370a28e8a27a48b2bd0409c0cf043b26fe465f4c58ee9669e2
|
data/README.md
CHANGED
|
@@ -20,23 +20,45 @@ $ gem install unicode_script_detector
|
|
|
20
20
|
UnicodeScriptDetector.detect_characters "Hel6б"
|
|
21
21
|
|
|
22
22
|
#Output:
|
|
23
|
-
[
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
23
|
+
[
|
|
24
|
+
#<UnicodeScriptDetector::Character:0x00007768fefdead8 @char="H", @name="Latin", @script=:Latin>,
|
|
25
|
+
#<UnicodeScriptDetector::Character:0x00007768fefdea10 @char="e", @name="Latin", @script=:Latin>,
|
|
26
|
+
#<UnicodeScriptDetector::Character:0x00007768fefde970 @char="l", @name="Latin", @script=:Latin>,
|
|
27
|
+
#<UnicodeScriptDetector::Character:0x00007768fefde8d0 @char="6", @name="Digit", @script=:Digit>,
|
|
28
|
+
#<UnicodeScriptDetector::Character:0x00007768fefde830 @char="б", @name="Cyrillic", @script=:Cyrillic>
|
|
29
|
+
]
|
|
28
30
|
```
|
|
29
31
|
|
|
30
32
|
## Detect if a string contains certain scripts
|
|
31
33
|
```ruby
|
|
32
34
|
# This will return true because it contains Latin and Cyrillic
|
|
33
|
-
UnicodeScriptDetector.contains? "
|
|
35
|
+
UnicodeScriptDetector.contains? "Helб🔥", [:Latin, :Cyrillic]
|
|
34
36
|
```
|
|
35
37
|
|
|
36
38
|
## Detect if a string contains only certain scripts
|
|
37
39
|
```ruby
|
|
38
40
|
# This will return false because it contains an Emoji as well
|
|
39
|
-
UnicodeScriptDetector.contains_only? "
|
|
41
|
+
UnicodeScriptDetector.contains_only? "Helб🔥", [:Latin, :Cyrillic]
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Detect all the characters of a string, grouped by script
|
|
45
|
+
```ruby
|
|
46
|
+
UnicodeScriptDetector.script_groups("Hel6б how are you?").each do |group|
|
|
47
|
+
puts "#{group.name}: #{group.text} (#{group.length} characters)"
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
#Output:
|
|
51
|
+
|
|
52
|
+
Latin: Hel (3 characters)
|
|
53
|
+
Digit: 6 (1 characters)
|
|
54
|
+
Cyrillic: б (1 characters)
|
|
55
|
+
Whitespace: (1 characters)
|
|
56
|
+
Latin: how (3 characters)
|
|
57
|
+
Whitespace: (1 characters)
|
|
58
|
+
Latin: are (3 characters)
|
|
59
|
+
Whitespace: (1 characters)
|
|
60
|
+
Latin: you (3 characters)
|
|
61
|
+
Punctuation: ? (1 characters)
|
|
40
62
|
```
|
|
41
63
|
|
|
42
64
|
## Development
|
|
@@ -47,4 +69,4 @@ Run the tests with `bin/test`.
|
|
|
47
69
|
You're welcome to contribute to this project. See https://github.com/davidarendsen/unicode_script_detector.
|
|
48
70
|
|
|
49
71
|
## License
|
|
50
|
-
This software is released under the [MIT license](LICENSE).
|
|
72
|
+
This software is released under the [MIT license](LICENSE).
|
|
@@ -15,5 +15,17 @@ module UnicodeScriptDetector
|
|
|
15
15
|
def hiragana?
|
|
16
16
|
@script === :Hiragana
|
|
17
17
|
end
|
|
18
|
+
|
|
19
|
+
def punctuation?
|
|
20
|
+
@script === :Punctuation
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def emoji?
|
|
24
|
+
@script === :Emoji
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def digit?
|
|
28
|
+
@script === :Digit
|
|
29
|
+
end
|
|
18
30
|
end
|
|
19
|
-
end
|
|
31
|
+
end
|
|
@@ -41,5 +41,15 @@ module UnicodeScriptDetector
|
|
|
41
41
|
|
|
42
42
|
@scripts.uniq.sort == scripts.uniq.sort
|
|
43
43
|
end
|
|
44
|
+
|
|
45
|
+
def script_groups
|
|
46
|
+
@characters
|
|
47
|
+
.chunk { |char| char.script }
|
|
48
|
+
.map { |script, chars| ScriptGroup.new(script, chars) }
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def grouped_scripts_hash
|
|
52
|
+
script_groups.map { |group| [group.script, group.text] }.to_h
|
|
53
|
+
end
|
|
44
54
|
end
|
|
45
|
-
end
|
|
55
|
+
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
module UnicodeScriptDetector
|
|
2
|
+
class ScriptGroup
|
|
3
|
+
attr_reader :script, :characters, :text
|
|
4
|
+
|
|
5
|
+
def initialize(script, characters)
|
|
6
|
+
@script = script
|
|
7
|
+
@characters = characters
|
|
8
|
+
@text = characters.map(&:char).join
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def length
|
|
12
|
+
@characters.length
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def name
|
|
16
|
+
@characters.first&.name
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
@@ -1,15 +1,35 @@
|
|
|
1
1
|
module UnicodeScriptDetector
|
|
2
2
|
class Scripts
|
|
3
3
|
LIST = [
|
|
4
|
+
{
|
|
5
|
+
script: :Whitespace,
|
|
6
|
+
name: "Whitespace",
|
|
7
|
+
regex: /\s/
|
|
8
|
+
},
|
|
4
9
|
{
|
|
5
10
|
script: :Digit,
|
|
6
11
|
name: "Digit",
|
|
7
|
-
regex: /\d
|
|
12
|
+
regex: /\d/
|
|
8
13
|
},
|
|
9
14
|
{
|
|
10
|
-
script: :
|
|
11
|
-
name: "
|
|
12
|
-
regex:
|
|
15
|
+
script: :Punctuation,
|
|
16
|
+
name: "Punctuation",
|
|
17
|
+
regex: /[[:punct:]]/
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
script: :Adlam,
|
|
21
|
+
name: "Adlam",
|
|
22
|
+
regex: /\p{Adlam}/,
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
script: :Ahom,
|
|
26
|
+
name: "Ahom",
|
|
27
|
+
regex: /\p{Ahom}/,
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
script: :Anatolian_Hieroglyphs,
|
|
31
|
+
name: "Anatolian_Hieroglyphs",
|
|
32
|
+
regex: /\p{Anatolian_Hieroglyphs}/,
|
|
13
33
|
},
|
|
14
34
|
{
|
|
15
35
|
script: :Arabic,
|
|
@@ -21,6 +41,11 @@ module UnicodeScriptDetector
|
|
|
21
41
|
name: "Armenian",
|
|
22
42
|
regex: /\p{Armenian}/,
|
|
23
43
|
},
|
|
44
|
+
{
|
|
45
|
+
script: :Avestan,
|
|
46
|
+
name: "Avestan",
|
|
47
|
+
regex: /\p{Avestan}/,
|
|
48
|
+
},
|
|
24
49
|
{
|
|
25
50
|
script: :Balinese,
|
|
26
51
|
name: "Balinese",
|
|
@@ -31,6 +56,11 @@ module UnicodeScriptDetector
|
|
|
31
56
|
name: "Bamum",
|
|
32
57
|
regex: /\p{Bamum}/,
|
|
33
58
|
},
|
|
59
|
+
{
|
|
60
|
+
script: :Bassa_Vah,
|
|
61
|
+
name: "Bassa_Vah",
|
|
62
|
+
regex: /\p{Bassa_Vah}/,
|
|
63
|
+
},
|
|
34
64
|
{
|
|
35
65
|
script: :Batak,
|
|
36
66
|
name: "Batak",
|
|
@@ -41,6 +71,11 @@ module UnicodeScriptDetector
|
|
|
41
71
|
name: "Bengali",
|
|
42
72
|
regex: /\p{Bengali}/,
|
|
43
73
|
},
|
|
74
|
+
{
|
|
75
|
+
script: :Bhaiksuki,
|
|
76
|
+
name: "Bhaiksuki",
|
|
77
|
+
regex: /\p{Bhaiksuki}/,
|
|
78
|
+
},
|
|
44
79
|
{
|
|
45
80
|
script: :Bopomofo,
|
|
46
81
|
name: "Bopomofo",
|
|
@@ -76,6 +111,11 @@ module UnicodeScriptDetector
|
|
|
76
111
|
name: "Carian",
|
|
77
112
|
regex: /\p{Carian}/,
|
|
78
113
|
},
|
|
114
|
+
{
|
|
115
|
+
script: :Caucasian_Albanian,
|
|
116
|
+
name: "Caucasian_Albanian",
|
|
117
|
+
regex: /\p{Caucasian_Albanian}/,
|
|
118
|
+
},
|
|
79
119
|
{
|
|
80
120
|
script: :Chakma,
|
|
81
121
|
name: "Chakma",
|
|
@@ -91,6 +131,11 @@ module UnicodeScriptDetector
|
|
|
91
131
|
name: "Cherokee",
|
|
92
132
|
regex: /\p{Cherokee}/,
|
|
93
133
|
},
|
|
134
|
+
{
|
|
135
|
+
script: :Chorasmian,
|
|
136
|
+
name: "Chorasmian",
|
|
137
|
+
regex: /\p{Chorasmian}/,
|
|
138
|
+
},
|
|
94
139
|
{
|
|
95
140
|
script: :Coptic,
|
|
96
141
|
name: "Coptic",
|
|
@@ -106,6 +151,11 @@ module UnicodeScriptDetector
|
|
|
106
151
|
name: "Cypriot",
|
|
107
152
|
regex: /\p{Cypriot}/,
|
|
108
153
|
},
|
|
154
|
+
{
|
|
155
|
+
script: :Cypro_Minoan,
|
|
156
|
+
name: "Cypro_Minoan",
|
|
157
|
+
regex: /\p{Cypro_Minoan}/,
|
|
158
|
+
},
|
|
109
159
|
{
|
|
110
160
|
script: :Cyrillic,
|
|
111
161
|
name: "Cyrillic",
|
|
@@ -121,16 +171,42 @@ module UnicodeScriptDetector
|
|
|
121
171
|
name: "Devanagari",
|
|
122
172
|
regex: /\p{Devanagari}/,
|
|
123
173
|
},
|
|
174
|
+
{
|
|
175
|
+
script: :Dives_Akuru,
|
|
176
|
+
name: "Dives_Akuru",
|
|
177
|
+
regex: /\p{Dives_Akuru}/,
|
|
178
|
+
},
|
|
179
|
+
{
|
|
180
|
+
script: :Dogra,
|
|
181
|
+
name: "Dogra",
|
|
182
|
+
regex: /\p{Dogra}/,
|
|
183
|
+
},
|
|
184
|
+
{
|
|
185
|
+
script: :Duployan,
|
|
186
|
+
name: "Duployan",
|
|
187
|
+
regex: /\p{Duployan}/,
|
|
188
|
+
},
|
|
124
189
|
{
|
|
125
190
|
script: :Egyptian_Hieroglyphs,
|
|
126
191
|
name: "Egyptian_Hieroglyphs",
|
|
127
192
|
regex: /\p{Egyptian_Hieroglyphs}/,
|
|
128
193
|
},
|
|
194
|
+
{
|
|
195
|
+
script: :Elbasan,
|
|
196
|
+
name: "Elbasan",
|
|
197
|
+
regex: /\p{Elbasan}/,
|
|
198
|
+
},
|
|
199
|
+
{
|
|
200
|
+
script: :Elymaic,
|
|
201
|
+
name: "Elymaic",
|
|
202
|
+
regex: /\p{Elymaic}/,
|
|
203
|
+
},
|
|
129
204
|
{
|
|
130
205
|
script: :Ethiopic,
|
|
131
206
|
name: "Ethiopic",
|
|
132
207
|
regex: /\p{Ethiopic}/,
|
|
133
208
|
},
|
|
209
|
+
|
|
134
210
|
{
|
|
135
211
|
script: :Georgian,
|
|
136
212
|
name: "Georgian",
|
|
@@ -146,6 +222,11 @@ module UnicodeScriptDetector
|
|
|
146
222
|
name: "Gothic",
|
|
147
223
|
regex: /\p{Gothic}/,
|
|
148
224
|
},
|
|
225
|
+
{
|
|
226
|
+
script: :Grantha,
|
|
227
|
+
name: "Grantha",
|
|
228
|
+
regex: /\p{Grantha}/,
|
|
229
|
+
},
|
|
149
230
|
{
|
|
150
231
|
script: :Greek,
|
|
151
232
|
name: "Greek",
|
|
@@ -156,11 +237,17 @@ module UnicodeScriptDetector
|
|
|
156
237
|
name: "Gujarati",
|
|
157
238
|
regex: /\p{Gujarati}/,
|
|
158
239
|
},
|
|
240
|
+
{
|
|
241
|
+
script: :Gunjala_Gondi,
|
|
242
|
+
name: "Gunjala_Gondi",
|
|
243
|
+
regex: /\p{Gunjala_Gondi}/,
|
|
244
|
+
},
|
|
159
245
|
{
|
|
160
246
|
script: :Gurmukhi,
|
|
161
247
|
name: "Gurmukhi",
|
|
162
248
|
regex: /\p{Gurmukhi}/,
|
|
163
249
|
},
|
|
250
|
+
|
|
164
251
|
{
|
|
165
252
|
script: :Han,
|
|
166
253
|
name: "Han",
|
|
@@ -171,11 +258,21 @@ module UnicodeScriptDetector
|
|
|
171
258
|
name: "Hangul",
|
|
172
259
|
regex: /\p{Hangul}/,
|
|
173
260
|
},
|
|
261
|
+
{
|
|
262
|
+
script: :Hanifi_Rohingya,
|
|
263
|
+
name: "Hanifi_Rohingya",
|
|
264
|
+
regex: /\p{Hanifi_Rohingya}/,
|
|
265
|
+
},
|
|
174
266
|
{
|
|
175
267
|
script: :Hanunoo,
|
|
176
268
|
name: "Hanunoo",
|
|
177
269
|
regex: /\p{Hanunoo}/,
|
|
178
270
|
},
|
|
271
|
+
{
|
|
272
|
+
script: :Hatran,
|
|
273
|
+
name: "Hatran",
|
|
274
|
+
regex: /\p{Hatran}/,
|
|
275
|
+
},
|
|
179
276
|
{
|
|
180
277
|
script: :Hebrew,
|
|
181
278
|
name: "Hebrew",
|
|
@@ -226,6 +323,12 @@ module UnicodeScriptDetector
|
|
|
226
323
|
name: "Katakana",
|
|
227
324
|
regex: /\p{Katakana}/,
|
|
228
325
|
},
|
|
326
|
+
|
|
327
|
+
{
|
|
328
|
+
script: :Kawi,
|
|
329
|
+
name: "Kawi",
|
|
330
|
+
regex: /\p{Kawi}/,
|
|
331
|
+
},
|
|
229
332
|
{
|
|
230
333
|
script: :Kayah_Li,
|
|
231
334
|
name: "Kayah_Li",
|
|
@@ -236,11 +339,27 @@ module UnicodeScriptDetector
|
|
|
236
339
|
name: "Kharoshthi",
|
|
237
340
|
regex: /\p{Kharoshthi}/,
|
|
238
341
|
},
|
|
342
|
+
{
|
|
343
|
+
script: :Khitan_Small_Script,
|
|
344
|
+
name: "Khitan_Small_Script",
|
|
345
|
+
regex: /\p{Khitan_Small_Script}/,
|
|
346
|
+
},
|
|
239
347
|
{
|
|
240
348
|
script: :Khmer,
|
|
241
349
|
name: "Khmer",
|
|
242
350
|
regex: /\p{Khmer}/,
|
|
243
351
|
},
|
|
352
|
+
{
|
|
353
|
+
script: :Khojki,
|
|
354
|
+
name: "Khojki",
|
|
355
|
+
regex: /\p{Khojki}/,
|
|
356
|
+
},
|
|
357
|
+
{
|
|
358
|
+
script: :Khudawadi,
|
|
359
|
+
name: "Khudawadi",
|
|
360
|
+
regex: /\p{Khudawadi}/,
|
|
361
|
+
},
|
|
362
|
+
|
|
244
363
|
{
|
|
245
364
|
script: :Lao,
|
|
246
365
|
name: "Lao",
|
|
@@ -261,11 +380,21 @@ module UnicodeScriptDetector
|
|
|
261
380
|
name: "Limbu",
|
|
262
381
|
regex: /\p{Limbu}/,
|
|
263
382
|
},
|
|
383
|
+
{
|
|
384
|
+
script: :Linear_A,
|
|
385
|
+
name: "Linear_A",
|
|
386
|
+
regex: /\p{Linear_A}/,
|
|
387
|
+
},
|
|
264
388
|
{
|
|
265
389
|
script: :Linear_B,
|
|
266
390
|
name: "Linear_B",
|
|
267
391
|
regex: /\p{Linear_B}/,
|
|
268
392
|
},
|
|
393
|
+
{
|
|
394
|
+
script: :Lisu,
|
|
395
|
+
name: "Lisu",
|
|
396
|
+
regex: /\p{Lisu}/,
|
|
397
|
+
},
|
|
269
398
|
{
|
|
270
399
|
script: :Lycian,
|
|
271
400
|
name: "Lycian",
|
|
@@ -276,6 +405,16 @@ module UnicodeScriptDetector
|
|
|
276
405
|
name: "Lydian",
|
|
277
406
|
regex: /\p{Lydian}/,
|
|
278
407
|
},
|
|
408
|
+
{
|
|
409
|
+
script: :Mahajani,
|
|
410
|
+
name: "Mahajani",
|
|
411
|
+
regex: /\p{Mahajani}/,
|
|
412
|
+
},
|
|
413
|
+
{
|
|
414
|
+
script: :Makasar,
|
|
415
|
+
name: "Makasar",
|
|
416
|
+
regex: /\p{Makasar}/,
|
|
417
|
+
},
|
|
279
418
|
{
|
|
280
419
|
script: :Malayalam,
|
|
281
420
|
name: "Malayalam",
|
|
@@ -286,11 +425,36 @@ module UnicodeScriptDetector
|
|
|
286
425
|
name: "Mandaic",
|
|
287
426
|
regex: /\p{Mandaic}/,
|
|
288
427
|
},
|
|
428
|
+
{
|
|
429
|
+
script: :Manichaean,
|
|
430
|
+
name: "Manichaean",
|
|
431
|
+
regex: /\p{Manichaean}/,
|
|
432
|
+
},
|
|
433
|
+
{
|
|
434
|
+
script: :Marchen,
|
|
435
|
+
name: "Marchen",
|
|
436
|
+
regex: /\p{Marchen}/,
|
|
437
|
+
},
|
|
438
|
+
{
|
|
439
|
+
script: :Masaram_Gondi,
|
|
440
|
+
name: "Masaram_Gondi",
|
|
441
|
+
regex: /\p{Masaram_Gondi}/,
|
|
442
|
+
},
|
|
443
|
+
{
|
|
444
|
+
script: :Medefaidrin,
|
|
445
|
+
name: "Medefaidrin",
|
|
446
|
+
regex: /\p{Medefaidrin}/,
|
|
447
|
+
},
|
|
289
448
|
{
|
|
290
449
|
script: :Meetei_Mayek,
|
|
291
450
|
name: "Meetei_Mayek",
|
|
292
451
|
regex: /\p{Meetei_Mayek}/,
|
|
293
452
|
},
|
|
453
|
+
{
|
|
454
|
+
script: :Mende_Kikakui,
|
|
455
|
+
name: "Mende_Kikakui",
|
|
456
|
+
regex: /\p{Mende_Kikakui}/,
|
|
457
|
+
},
|
|
294
458
|
{
|
|
295
459
|
script: :Meroitic_Cursive,
|
|
296
460
|
name: "Meroitic_Cursive",
|
|
@@ -306,26 +470,71 @@ module UnicodeScriptDetector
|
|
|
306
470
|
name: "Miao",
|
|
307
471
|
regex: /\p{Miao}/,
|
|
308
472
|
},
|
|
473
|
+
{
|
|
474
|
+
script: :Modi,
|
|
475
|
+
name: "Modi",
|
|
476
|
+
regex: /\p{Modi}/,
|
|
477
|
+
},
|
|
309
478
|
{
|
|
310
479
|
script: :Mongolian,
|
|
311
480
|
name: "Mongolian",
|
|
312
481
|
regex: /\p{Mongolian}/,
|
|
313
482
|
},
|
|
483
|
+
{
|
|
484
|
+
script: :Mro,
|
|
485
|
+
name: "Mro",
|
|
486
|
+
regex: /\p{Mro}/,
|
|
487
|
+
},
|
|
488
|
+
{
|
|
489
|
+
script: :Multani,
|
|
490
|
+
name: "Multani",
|
|
491
|
+
regex: /\p{Multani}/,
|
|
492
|
+
},
|
|
314
493
|
{
|
|
315
494
|
script: :Myanmar,
|
|
316
495
|
name: "Myanmar",
|
|
317
496
|
regex: /\p{Myanmar}/,
|
|
318
497
|
},
|
|
498
|
+
{
|
|
499
|
+
script: :Nabataean,
|
|
500
|
+
name: "Nabataean",
|
|
501
|
+
regex: /\p{Nabataean}/,
|
|
502
|
+
},
|
|
503
|
+
{
|
|
504
|
+
script: :Nag_Mundari,
|
|
505
|
+
name: "Nag_Mundari",
|
|
506
|
+
regex: /\p{Nag_Mundari}/,
|
|
507
|
+
},
|
|
508
|
+
{
|
|
509
|
+
script: :Nandinagari,
|
|
510
|
+
name: "Nandinagari",
|
|
511
|
+
regex: /\p{Nandinagari}/,
|
|
512
|
+
},
|
|
319
513
|
{
|
|
320
514
|
script: :New_Tai_Lue,
|
|
321
515
|
name: "New_Tai_Lue",
|
|
322
516
|
regex: /\p{New_Tai_Lue}/,
|
|
323
517
|
},
|
|
518
|
+
{
|
|
519
|
+
script: :Newa,
|
|
520
|
+
name: "Newa",
|
|
521
|
+
regex: /\p{Newa}/,
|
|
522
|
+
},
|
|
324
523
|
{
|
|
325
524
|
script: :Nko,
|
|
326
525
|
name: "Nko",
|
|
327
526
|
regex: /\p{Nko}/,
|
|
328
527
|
},
|
|
528
|
+
{
|
|
529
|
+
script: :Nushu,
|
|
530
|
+
name: "Nushu",
|
|
531
|
+
regex: /\p{Nushu}/,
|
|
532
|
+
},
|
|
533
|
+
{
|
|
534
|
+
script: :Nyiakeng_Puachue_Hmong,
|
|
535
|
+
name: "Nyiakeng_Puachue_Hmong",
|
|
536
|
+
regex: /\p{Nyiakeng_Puachue_Hmong}/,
|
|
537
|
+
},
|
|
329
538
|
{
|
|
330
539
|
script: :Ogham,
|
|
331
540
|
name: "Ogham",
|
|
@@ -336,16 +545,37 @@ module UnicodeScriptDetector
|
|
|
336
545
|
name: "Ol_Chiki",
|
|
337
546
|
regex: /\p{Ol_Chiki}/,
|
|
338
547
|
},
|
|
548
|
+
|
|
549
|
+
{
|
|
550
|
+
script: :Old_Hungarian,
|
|
551
|
+
name: "Old_Hungarian",
|
|
552
|
+
regex: /\p{Old_Hungarian}/,
|
|
553
|
+
},
|
|
339
554
|
{
|
|
340
555
|
script: :Old_Italic,
|
|
341
556
|
name: "Old_Italic",
|
|
342
557
|
regex: /\p{Old_Italic}/,
|
|
343
558
|
},
|
|
559
|
+
{
|
|
560
|
+
script: :Old_North_Arabian,
|
|
561
|
+
name: "Old_North_Arabian",
|
|
562
|
+
regex: /\p{Old_North_Arabian}/,
|
|
563
|
+
},
|
|
564
|
+
{
|
|
565
|
+
script: :Old_Permic,
|
|
566
|
+
name: "Old_Permic",
|
|
567
|
+
regex: /\p{Old_Permic}/,
|
|
568
|
+
},
|
|
344
569
|
{
|
|
345
570
|
script: :Old_Persian,
|
|
346
571
|
name: "Old_Persian",
|
|
347
572
|
regex: /\p{Old_Persian}/,
|
|
348
573
|
},
|
|
574
|
+
{
|
|
575
|
+
script: :Old_Sogdian,
|
|
576
|
+
name: "Old_Sogdian",
|
|
577
|
+
regex: /\p{Old_Sogdian}/,
|
|
578
|
+
},
|
|
349
579
|
{
|
|
350
580
|
script: :Old_South_Arabian,
|
|
351
581
|
name: "Old_South_Arabian",
|
|
@@ -356,16 +586,41 @@ module UnicodeScriptDetector
|
|
|
356
586
|
name: "Old_Turkic",
|
|
357
587
|
regex: /\p{Old_Turkic}/,
|
|
358
588
|
},
|
|
589
|
+
{
|
|
590
|
+
script: :Old_Uyghur,
|
|
591
|
+
name: "Old_Uyghur",
|
|
592
|
+
regex: /\p{Old_Uyghur}/,
|
|
593
|
+
},
|
|
359
594
|
{
|
|
360
595
|
script: :Oriya,
|
|
361
596
|
name: "Oriya",
|
|
362
597
|
regex: /\p{Oriya}/,
|
|
363
598
|
},
|
|
599
|
+
{
|
|
600
|
+
script: :Osage,
|
|
601
|
+
name: "Osage",
|
|
602
|
+
regex: /\p{Osage}/,
|
|
603
|
+
},
|
|
364
604
|
{
|
|
365
605
|
script: :Osmanya,
|
|
366
606
|
name: "Osmanya",
|
|
367
607
|
regex: /\p{Osmanya}/,
|
|
368
608
|
},
|
|
609
|
+
{
|
|
610
|
+
script: :Pahawh_Hmong,
|
|
611
|
+
name: "Pahawh_Hmong",
|
|
612
|
+
regex: /\p{Pahawh_Hmong}/,
|
|
613
|
+
},
|
|
614
|
+
{
|
|
615
|
+
script: :Palmyrene,
|
|
616
|
+
name: "Palmyrene",
|
|
617
|
+
regex: /\p{Palmyrene}/,
|
|
618
|
+
},
|
|
619
|
+
{
|
|
620
|
+
script: :Pau_Cin_Hau,
|
|
621
|
+
name: "Pau_Cin_Hau",
|
|
622
|
+
regex: /\p{Pau_Cin_Hau}/,
|
|
623
|
+
},
|
|
369
624
|
{
|
|
370
625
|
script: :Phags_Pa,
|
|
371
626
|
name: "Phags_Pa",
|
|
@@ -376,6 +631,11 @@ module UnicodeScriptDetector
|
|
|
376
631
|
name: "Phoenician",
|
|
377
632
|
regex: /\p{Phoenician}/,
|
|
378
633
|
},
|
|
634
|
+
{
|
|
635
|
+
script: :Psalter_Pahlavi,
|
|
636
|
+
name: "Psalter_Pahlavi",
|
|
637
|
+
regex: /\p{Psalter_Pahlavi}/,
|
|
638
|
+
},
|
|
379
639
|
{
|
|
380
640
|
script: :Rejang,
|
|
381
641
|
name: "Rejang",
|
|
@@ -386,6 +646,11 @@ module UnicodeScriptDetector
|
|
|
386
646
|
name: "Runic",
|
|
387
647
|
regex: /\p{Runic}/,
|
|
388
648
|
},
|
|
649
|
+
{
|
|
650
|
+
script: :Samaritan,
|
|
651
|
+
name: "Samaritan",
|
|
652
|
+
regex: /\p{Samaritan}/,
|
|
653
|
+
},
|
|
389
654
|
{
|
|
390
655
|
script: :Saurashtra,
|
|
391
656
|
name: "Saurashtra",
|
|
@@ -401,21 +666,42 @@ module UnicodeScriptDetector
|
|
|
401
666
|
name: "Shavian",
|
|
402
667
|
regex: /\p{Shavian}/,
|
|
403
668
|
},
|
|
669
|
+
{
|
|
670
|
+
script: :Siddham,
|
|
671
|
+
name: "Siddham",
|
|
672
|
+
regex: /\p{Siddham}/,
|
|
673
|
+
},
|
|
674
|
+
{
|
|
675
|
+
script: :SignWriting,
|
|
676
|
+
name: "SignWriting",
|
|
677
|
+
regex: /\p{SignWriting}/,
|
|
678
|
+
},
|
|
404
679
|
{
|
|
405
680
|
script: :Sinhala,
|
|
406
681
|
name: "Sinhala",
|
|
407
682
|
regex: /\p{Sinhala}/,
|
|
408
683
|
},
|
|
684
|
+
{
|
|
685
|
+
script: :Sogdian,
|
|
686
|
+
name: "Sogdian",
|
|
687
|
+
regex: /\p{Sogdian}/,
|
|
688
|
+
},
|
|
409
689
|
{
|
|
410
690
|
script: :Sora_Sompeng,
|
|
411
691
|
name: "Sora_Sompeng",
|
|
412
692
|
regex: /\p{Sora_Sompeng}/,
|
|
413
693
|
},
|
|
694
|
+
{
|
|
695
|
+
script: :Soyombo,
|
|
696
|
+
name: "Soyombo",
|
|
697
|
+
regex: /\p{Soyombo}/,
|
|
698
|
+
},
|
|
414
699
|
{
|
|
415
700
|
script: :Sundanese,
|
|
416
701
|
name: "Sundanese",
|
|
417
702
|
regex: /\p{Sundanese}/,
|
|
418
703
|
},
|
|
704
|
+
|
|
419
705
|
{
|
|
420
706
|
script: :Syloti_Nagri,
|
|
421
707
|
name: "Syloti_Nagri",
|
|
@@ -461,6 +747,16 @@ module UnicodeScriptDetector
|
|
|
461
747
|
name: "Tamil",
|
|
462
748
|
regex: /\p{Tamil}/,
|
|
463
749
|
},
|
|
750
|
+
{
|
|
751
|
+
script: :Tangsa,
|
|
752
|
+
name: "Tangsa",
|
|
753
|
+
regex: /\p{Tangsa}/,
|
|
754
|
+
},
|
|
755
|
+
{
|
|
756
|
+
script: :Tangut,
|
|
757
|
+
name: "Tangut",
|
|
758
|
+
regex: /\p{Tangut}/,
|
|
759
|
+
},
|
|
464
760
|
{
|
|
465
761
|
script: :Telugu,
|
|
466
762
|
name: "Telugu",
|
|
@@ -486,25 +782,67 @@ module UnicodeScriptDetector
|
|
|
486
782
|
name: "Tifinagh",
|
|
487
783
|
regex: /\p{Tifinagh}/,
|
|
488
784
|
},
|
|
785
|
+
{
|
|
786
|
+
script: :Tirhuta,
|
|
787
|
+
name: "Tirhuta",
|
|
788
|
+
regex: /\p{Tirhuta}/,
|
|
789
|
+
},
|
|
790
|
+
|
|
791
|
+
{
|
|
792
|
+
script: :Toto,
|
|
793
|
+
name: "Toto",
|
|
794
|
+
regex: /\p{Toto}/,
|
|
795
|
+
},
|
|
796
|
+
|
|
489
797
|
{
|
|
490
798
|
script: :Ugaritic,
|
|
491
799
|
name: "Ugaritic",
|
|
492
800
|
regex: /\p{Ugaritic}/,
|
|
493
801
|
},
|
|
802
|
+
{
|
|
803
|
+
script: :Unknown,
|
|
804
|
+
name: "Unknown",
|
|
805
|
+
regex: /\p{Unknown}/,
|
|
806
|
+
},
|
|
494
807
|
{
|
|
495
808
|
script: :Vai,
|
|
496
809
|
name: "Vai",
|
|
497
810
|
regex: /\p{Vai}/,
|
|
498
811
|
},
|
|
812
|
+
{
|
|
813
|
+
script: :Vithkuqi,
|
|
814
|
+
name: "Vithkuqi",
|
|
815
|
+
regex: /\p{Vithkuqi}/,
|
|
816
|
+
},
|
|
817
|
+
{
|
|
818
|
+
script: :Wancho,
|
|
819
|
+
name: "Wancho",
|
|
820
|
+
regex: /\p{Wancho}/,
|
|
821
|
+
},
|
|
822
|
+
{
|
|
823
|
+
script: :Warang_Citi,
|
|
824
|
+
name: "Warang_Citi",
|
|
825
|
+
regex: /\p{Warang_Citi}/,
|
|
826
|
+
},
|
|
827
|
+
{
|
|
828
|
+
script: :Yezidi,
|
|
829
|
+
name: "Yezidi",
|
|
830
|
+
regex: /\p{Yezidi}/,
|
|
831
|
+
},
|
|
499
832
|
{
|
|
500
833
|
script: :Yi,
|
|
501
834
|
name: "Yi",
|
|
502
835
|
regex: /\p{Yi}/,
|
|
503
836
|
},
|
|
504
|
-
{
|
|
505
|
-
script: :
|
|
506
|
-
name: "
|
|
507
|
-
regex: /\p{
|
|
837
|
+
{
|
|
838
|
+
script: :Zanabazar_Square,
|
|
839
|
+
name: "Zanabazar_Square",
|
|
840
|
+
regex: /\p{Zanabazar_Square}/,
|
|
841
|
+
},
|
|
842
|
+
{
|
|
843
|
+
script: :Emoji,
|
|
844
|
+
name: "Emoji",
|
|
845
|
+
regex: /\p{Emoji}/,
|
|
508
846
|
},
|
|
509
847
|
{
|
|
510
848
|
script: :Common,
|
|
@@ -513,4 +851,4 @@ module UnicodeScriptDetector
|
|
|
513
851
|
},
|
|
514
852
|
]
|
|
515
853
|
end
|
|
516
|
-
end
|
|
854
|
+
end
|
|
@@ -9,6 +9,10 @@ module UnicodeScriptDetector
|
|
|
9
9
|
UnicodeScriptDetector::Detector.new(string).characters
|
|
10
10
|
end
|
|
11
11
|
|
|
12
|
+
def script_groups(string)
|
|
13
|
+
UnicodeScriptDetector::Detector.new(string).script_groups
|
|
14
|
+
end
|
|
15
|
+
|
|
12
16
|
def contains?(string, scripts)
|
|
13
17
|
UnicodeScriptDetector::Detector.new(string).contains?(scripts)
|
|
14
18
|
end
|
|
@@ -17,4 +21,4 @@ module UnicodeScriptDetector
|
|
|
17
21
|
UnicodeScriptDetector::Detector.new(string).contains_only?(scripts)
|
|
18
22
|
end
|
|
19
23
|
end
|
|
20
|
-
end
|
|
24
|
+
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: unicode_script_detector
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
4
|
+
version: 0.0.5
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- David Arendsen
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2025-12-31 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: zeitwerk
|
|
@@ -55,6 +55,7 @@ files:
|
|
|
55
55
|
- lib/unicode_script_detector.rb
|
|
56
56
|
- lib/unicode_script_detector/character.rb
|
|
57
57
|
- lib/unicode_script_detector/detector.rb
|
|
58
|
+
- lib/unicode_script_detector/script_group.rb
|
|
58
59
|
- lib/unicode_script_detector/scripts.rb
|
|
59
60
|
- lib/unicode_script_detector/version.rb
|
|
60
61
|
homepage: https://github.com/davidarendsen/unicode_script_detector
|