unicode-scripts 1.9.0 → 1.11.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/Gemfile.lock +1 -1
- data/MIT-LICENSE.txt +1 -1
- data/README.md +81 -358
- data/data/scripts.marshal.gz +0 -0
- data/lib/unicode/scripts/constants.rb +4 -2
- data/lib/unicode/scripts.rb +70 -4
- data/spec/unicode_scripts_spec.rb +54 -0
- metadata +7 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: feaabd20c3a3869a96e62e34d7c39b83739365549904ecdb129e83d9f73540d4
|
4
|
+
data.tar.gz: 40af16102c2aa63b35051f09b65cd8e2d14c32fbce21a7c802ea260121ade5b5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8d5f215ed6b03d5192eef673d22f0705cac149e3701427570ab52b4e3c538ac1537b7ca5a0768a66e6d5ffdd4d66b9363b4fdafbdf74112df1e7e59ab639cf2c
|
7
|
+
data.tar.gz: 735b9611f0bfee72dd074a8873c3c269d23a150b6d7a9da64ca33b7d02c5a65316a45eaa47bb60fc44a554b70d3efbd3bcd8cf0c94185e01d7fdd5f767766839
|
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
data/MIT-LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
# Unicode::Scripts [![[version]](https://badge.fury.io/rb/unicode-scripts.svg)](https://badge.fury.io/rb/unicode-scripts) [![[ci]](https://github.com/janlelis/unicode-scripts/workflows/Test/badge.svg)](https://github.com/janlelis/unicode-scripts/actions?query=workflow%3ATest)
|
2
2
|
|
3
|
-
Retrieve
|
3
|
+
Retrieve all [Unicode script(s)](https://en.wikipedia.org/wiki/Script_%28Unicode%29) a string belongs to. Can also return the *Script_Extension* property (scx) which is defined as characters which are "commonly used with more than one script, but with a limited number of scripts".
|
4
4
|
|
5
|
-
|
5
|
+
Based on the *Script_Extension*, this library can also return the [augmented script set](https://www.unicode.org/reports/tr39/#def-augmented-script-set) to figure out if a string is **mixed-script** or **single-script**. Mixed scripts can be an indicator of suspicious user inputs.
|
6
6
|
|
7
|
-
|
7
|
+
Unicode version: **16.0.0** (September 2024)
|
8
8
|
|
9
|
-
|
9
|
+
Supported Rubies: **3.x** (might work: **2.x**)
|
10
10
|
|
11
11
|
## Gemfile
|
12
12
|
|
@@ -14,7 +14,7 @@ Old Rubies that might still work: **2.X**
|
|
14
14
|
gem "unicode-scripts"
|
15
15
|
```
|
16
16
|
|
17
|
-
## Usage
|
17
|
+
## Usage - Scripts and Script Extensions
|
18
18
|
|
19
19
|
```ruby
|
20
20
|
require "unicode/scripts"
|
@@ -29,381 +29,104 @@ Unicode::Scripts.script("ᴦ") # => "Greek"
|
|
29
29
|
|
30
30
|
# Script_Extension property
|
31
31
|
Unicode::Scripts.script_extensions("॥")
|
32
|
-
# => ["Bengali", "Devanagari", "Dogra", "Grantha", "Gujarati","Gunjala_Gondi", "Gurmukhi",
|
33
|
-
"Khudawadi",
|
34
|
-
"Syloti_Nagri", "Takri", "Tamil", "Telugu", "Tirhuta"]
|
32
|
+
# => ["Bengali", "Devanagari", "Dogra", "Grantha", "Gujarati", "Gunjala_Gondi", "Gurmukhi", "Gurung_Khema",
|
33
|
+
"Kannada", "Khudawadi", "Limbu", "Mahajani", "Malayalam", "Masaram_Gondi", "Nandinagari", "Ol_Onal",
|
34
|
+
"Oriya", "Sinhala", "Syloti_Nagri", "Takri", "Tamil", "Telugu", "Tirhuta"]
|
35
35
|
```
|
36
36
|
|
37
|
-
##
|
38
|
-
### Regex Matching
|
37
|
+
## Usage - Augmented Scripts
|
39
38
|
|
40
|
-
|
39
|
+
Like script extensions, but adds meta scripts for Asian languages and treats _Common_/_Inherited_ values as ALL scripts.
|
41
40
|
|
42
41
|
```ruby
|
43
|
-
|
42
|
+
require "unicode/scripts"
|
43
|
+
|
44
|
+
Unicode::Scripts.augmented_scripts("ねガ") # => ['Hira', 'Jpan', 'Kana']
|
45
|
+
Unicode::Scripts.augmented_scripts("1") # => ["Adlm", "Aghb", "Ahom", … ]
|
44
46
|
```
|
45
47
|
|
46
|
-
|
48
|
+
## Usage - Resolved Script
|
49
|
+
|
50
|
+
Intersection of all augmented scripts per character.
|
51
|
+
|
52
|
+
```ruby
|
53
|
+
require "unicode/scripts"
|
54
|
+
|
55
|
+
Unicode::Scripts.resolved_scripts("СігсӀе") # => [ 'Cyrl' ]
|
56
|
+
Unicode::Scripts.resolved_scripts("Сirсlе") # => []
|
57
|
+
Unicode::Scripts.resolved_scripts("𝖢𝗂𝗋𝖼𝗅𝖾") # => ['Adlm', 'Aghb', 'Ahom', … ]
|
58
|
+
Unicode::Scripts.resolved_scripts("1") # => ['Adlm', 'Aghb', 'Ahom', … ]
|
59
|
+
Unicode::Scripts.resolved_scripts("ねガ") # => ['Jpan']
|
60
|
+
```
|
61
|
+
|
62
|
+
Please note that the **resolved script** set can contain multiple scripts, as per the standard.
|
63
|
+
|
64
|
+
## Usage - Mixed-Script Detection
|
47
65
|
|
48
|
-
|
66
|
+
A string is **mixed-script** if its resolved script set is empty, and **single-script** otherwise.
|
67
|
+
|
68
|
+
```ruby
|
69
|
+
require "unicode/scripts"
|
70
|
+
|
71
|
+
Unicode::Scripts.mixed?("СігсӀе"); # => false
|
72
|
+
Unicode::Scripts.mixed?("Сirсlе"); # => true
|
73
|
+
Unicode::Scripts.mixed?("𝖢𝗂𝗋𝖼𝗅𝖾"); # => false
|
74
|
+
Unicode::Scripts.mixed?("1"); # => false
|
75
|
+
Unicode::Scripts.mixed?("ねガ"); # => false
|
76
|
+
|
77
|
+
Unicode::Scripts.single?("СігсӀе"); # => true
|
78
|
+
Unicode::Scripts.single?("Сirсlе"); # => false
|
79
|
+
Unicode::Scripts.single?("𝖢𝗂𝗋𝖼𝗅𝖾"); # => true
|
80
|
+
Unicode::Scripts.single?("1"); # => true
|
81
|
+
Unicode::Scripts.single?("ねガ"); # => true
|
82
|
+
```
|
83
|
+
|
84
|
+
Please note that a **single-script** string might actually contain multiple scripts, as per the standard (e.g. for Asian languages).
|
85
|
+
|
86
|
+
### List of All Scripts
|
49
87
|
|
50
88
|
You can extract all script names from the gem like this:
|
51
89
|
|
52
90
|
```ruby
|
53
91
|
require "unicode/scripts"
|
54
|
-
puts Unicode::Scripts.names
|
55
|
-
|
56
|
-
# # # Output # # #
|
57
|
-
|
58
|
-
Adlam
|
59
|
-
Ahom
|
60
|
-
Anatolian_Hieroglyphs
|
61
|
-
Arabic
|
62
|
-
Armenian
|
63
|
-
Avestan
|
64
|
-
Balinese
|
65
|
-
Bamum
|
66
|
-
Bassa_Vah
|
67
|
-
Batak
|
68
|
-
Bengali
|
69
|
-
Bhaiksuki
|
70
|
-
Bopomofo
|
71
|
-
Brahmi
|
72
|
-
Braille
|
73
|
-
Buginese
|
74
|
-
Buhid
|
75
|
-
Canadian_Aboriginal
|
76
|
-
Carian
|
77
|
-
Caucasian_Albanian
|
78
|
-
Chakma
|
79
|
-
Cham
|
80
|
-
Cherokee
|
81
|
-
Chorasmian
|
82
|
-
Common
|
83
|
-
Coptic
|
84
|
-
Cuneiform
|
85
|
-
Cypriot
|
86
|
-
Cypro_Minoan
|
87
|
-
Cyrillic
|
88
|
-
Deseret
|
89
|
-
Devanagari
|
90
|
-
Dives_Akuru
|
91
|
-
Dogra
|
92
|
-
Duployan
|
93
|
-
Egyptian_Hieroglyphs
|
94
|
-
Elbasan
|
95
|
-
Elymaic
|
96
|
-
Ethiopic
|
97
|
-
Georgian
|
98
|
-
Glagolitic
|
99
|
-
Gothic
|
100
|
-
Grantha
|
101
|
-
Greek
|
102
|
-
Gujarati
|
103
|
-
Gunjala_Gondi
|
104
|
-
Gurmukhi
|
105
|
-
Han
|
106
|
-
Hangul
|
107
|
-
Hanifi_Rohingya
|
108
|
-
Hanunoo
|
109
|
-
Hatran
|
110
|
-
Hebrew
|
111
|
-
Hiragana
|
112
|
-
Imperial_Aramaic
|
113
|
-
Inherited
|
114
|
-
Inscriptional_Pahlavi
|
115
|
-
Inscriptional_Parthian
|
116
|
-
Javanese
|
117
|
-
Kaithi
|
118
|
-
Kannada
|
119
|
-
Katakana
|
120
|
-
Katakana_Or_Hiragana
|
121
|
-
Kawi
|
122
|
-
Kayah_Li
|
123
|
-
Kharoshthi
|
124
|
-
Khitan_Small_Script
|
125
|
-
Khmer
|
126
|
-
Khojki
|
127
|
-
Khudawadi
|
128
|
-
Lao
|
129
|
-
Latin
|
130
|
-
Lepcha
|
131
|
-
Limbu
|
132
|
-
Linear_A
|
133
|
-
Linear_B
|
134
|
-
Lisu
|
135
|
-
Lycian
|
136
|
-
Lydian
|
137
|
-
Mahajani
|
138
|
-
Makasar
|
139
|
-
Malayalam
|
140
|
-
Mandaic
|
141
|
-
Manichaean
|
142
|
-
Marchen
|
143
|
-
Masaram_Gondi
|
144
|
-
Medefaidrin
|
145
|
-
Meetei_Mayek
|
146
|
-
Mende_Kikakui
|
147
|
-
Meroitic_Cursive
|
148
|
-
Meroitic_Hieroglyphs
|
149
|
-
Miao
|
150
|
-
Modi
|
151
|
-
Mongolian
|
152
|
-
Mro
|
153
|
-
Multani
|
154
|
-
Myanmar
|
155
|
-
Nabataean
|
156
|
-
Nag_Mundari
|
157
|
-
Nandinagari
|
158
|
-
New_Tai_Lue
|
159
|
-
Newa
|
160
|
-
Nko
|
161
|
-
Nushu
|
162
|
-
Nyiakeng_Puachue_Hmong
|
163
|
-
Ogham
|
164
|
-
Ol_Chiki
|
165
|
-
Old_Hungarian
|
166
|
-
Old_Italic
|
167
|
-
Old_North_Arabian
|
168
|
-
Old_Permic
|
169
|
-
Old_Persian
|
170
|
-
Old_Sogdian
|
171
|
-
Old_South_Arabian
|
172
|
-
Old_Turkic
|
173
|
-
Old_Uyghur
|
174
|
-
Oriya
|
175
|
-
Osage
|
176
|
-
Osmanya
|
177
|
-
Pahawh_Hmong
|
178
|
-
Palmyrene
|
179
|
-
Pau_Cin_Hau
|
180
|
-
Phags_Pa
|
181
|
-
Phoenician
|
182
|
-
Psalter_Pahlavi
|
183
|
-
Rejang
|
184
|
-
Runic
|
185
|
-
Samaritan
|
186
|
-
Saurashtra
|
187
|
-
Sharada
|
188
|
-
Shavian
|
189
|
-
Siddham
|
190
|
-
SignWriting
|
191
|
-
Sinhala
|
192
|
-
Sogdian
|
193
|
-
Sora_Sompeng
|
194
|
-
Soyombo
|
195
|
-
Sundanese
|
196
|
-
Syloti_Nagri
|
197
|
-
Syriac
|
198
|
-
Tagalog
|
199
|
-
Tagbanwa
|
200
|
-
Tai_Le
|
201
|
-
Tai_Tham
|
202
|
-
Tai_Viet
|
203
|
-
Takri
|
204
|
-
Tamil
|
205
|
-
Tangsa
|
206
|
-
Tangut
|
207
|
-
Telugu
|
208
|
-
Thaana
|
209
|
-
Thai
|
210
|
-
Tibetan
|
211
|
-
Tifinagh
|
212
|
-
Tirhuta
|
213
|
-
Toto
|
214
|
-
Ugaritic
|
215
|
-
Unknown
|
216
|
-
Vai
|
217
|
-
Vithkuqi
|
218
|
-
Wancho
|
219
|
-
Warang_Citi
|
220
|
-
Yezidi
|
221
|
-
Yi
|
222
|
-
Zanabazar_Square
|
92
|
+
puts Unicode::Scripts.names # list of scripts
|
223
93
|
```
|
224
94
|
|
225
|
-
|
95
|
+
To get all four-letter script codes (ISO 15924):
|
96
|
+
|
97
|
+
```ruby
|
98
|
+
require "unicode/scripts"
|
99
|
+
puts Unicode::Scripts.names(format: :short) # list of four-letter script codes
|
100
|
+
```
|
226
101
|
|
227
|
-
|
102
|
+
Augmented scripts:
|
228
103
|
|
229
104
|
```ruby
|
230
105
|
require "unicode/scripts"
|
231
|
-
puts Unicode::Scripts.names(format: :short)
|
232
|
-
|
233
|
-
# # # Output # # #
|
234
|
-
|
235
|
-
Adlm
|
236
|
-
Aghb
|
237
|
-
Ahom
|
238
|
-
Arab
|
239
|
-
Armi
|
240
|
-
Armn
|
241
|
-
Avst
|
242
|
-
Bali
|
243
|
-
Bamu
|
244
|
-
Bass
|
245
|
-
Batk
|
246
|
-
Beng
|
247
|
-
Bhks
|
248
|
-
Bopo
|
249
|
-
Brah
|
250
|
-
Brai
|
251
|
-
Bugi
|
252
|
-
Buhd
|
253
|
-
Cakm
|
254
|
-
Cans
|
255
|
-
Cari
|
256
|
-
Cham
|
257
|
-
Cher
|
258
|
-
Chrs
|
259
|
-
Copt
|
260
|
-
Cpmn
|
261
|
-
Cprt
|
262
|
-
Cyrl
|
263
|
-
Deva
|
264
|
-
Diak
|
265
|
-
Dogr
|
266
|
-
Dsrt
|
267
|
-
Dupl
|
268
|
-
Egyp
|
269
|
-
Elba
|
270
|
-
Elym
|
271
|
-
Ethi
|
272
|
-
Geor
|
273
|
-
Glag
|
274
|
-
Gong
|
275
|
-
Gonm
|
276
|
-
Goth
|
277
|
-
Gran
|
278
|
-
Grek
|
279
|
-
Gujr
|
280
|
-
Guru
|
281
|
-
Hang
|
282
|
-
Hani
|
283
|
-
Hano
|
284
|
-
Hatr
|
285
|
-
Hebr
|
286
|
-
Hira
|
287
|
-
Hluw
|
288
|
-
Hmng
|
289
|
-
Hmnp
|
290
|
-
Hrkt
|
291
|
-
Hung
|
292
|
-
Ital
|
293
|
-
Java
|
294
|
-
Kali
|
295
|
-
Kana
|
296
|
-
Kawi
|
297
|
-
Khar
|
298
|
-
Khmr
|
299
|
-
Khoj
|
300
|
-
Kits
|
301
|
-
Knda
|
302
|
-
Kthi
|
303
|
-
Lana
|
304
|
-
Laoo
|
305
|
-
Latn
|
306
|
-
Lepc
|
307
|
-
Limb
|
308
|
-
Lina
|
309
|
-
Linb
|
310
|
-
Lisu
|
311
|
-
Lyci
|
312
|
-
Lydi
|
313
|
-
Mahj
|
314
|
-
Maka
|
315
|
-
Mand
|
316
|
-
Mani
|
317
|
-
Marc
|
318
|
-
Medf
|
319
|
-
Mend
|
320
|
-
Merc
|
321
|
-
Mero
|
322
|
-
Mlym
|
323
|
-
Modi
|
324
|
-
Mong
|
325
|
-
Mroo
|
326
|
-
Mtei
|
327
|
-
Mult
|
328
|
-
Mymr
|
329
|
-
Nagm
|
330
|
-
Nand
|
331
|
-
Narb
|
332
|
-
Nbat
|
333
|
-
Newa
|
334
|
-
Nkoo
|
335
|
-
Nshu
|
336
|
-
Ogam
|
337
|
-
Olck
|
338
|
-
Orkh
|
339
|
-
Orya
|
340
|
-
Osge
|
341
|
-
Osma
|
342
|
-
Ougr
|
343
|
-
Palm
|
344
|
-
Pauc
|
345
|
-
Perm
|
346
|
-
Phag
|
347
|
-
Phli
|
348
|
-
Phlp
|
349
|
-
Phnx
|
350
|
-
Plrd
|
351
|
-
Prti
|
352
|
-
Qaac
|
353
|
-
Qaai
|
354
|
-
Rjng
|
355
|
-
Rohg
|
356
|
-
Runr
|
357
|
-
Samr
|
358
|
-
Sarb
|
359
|
-
Saur
|
360
|
-
Sgnw
|
361
|
-
Shaw
|
362
|
-
Shrd
|
363
|
-
Sidd
|
364
|
-
Sind
|
365
|
-
Sinh
|
366
|
-
Sogd
|
367
|
-
Sogo
|
368
|
-
Sora
|
369
|
-
Soyo
|
370
|
-
Sund
|
371
|
-
Sylo
|
372
|
-
Syrc
|
373
|
-
Tagb
|
374
|
-
Takr
|
375
|
-
Tale
|
376
|
-
Talu
|
377
|
-
Taml
|
378
|
-
Tang
|
379
|
-
Tavt
|
380
|
-
Telu
|
381
|
-
Tfng
|
382
|
-
Tglg
|
383
|
-
Thaa
|
384
|
-
Thai
|
385
|
-
Tibt
|
386
|
-
Tirh
|
387
|
-
Tnsa
|
388
|
-
Toto
|
389
|
-
Ugar
|
390
|
-
Vaii
|
391
|
-
Vith
|
392
|
-
Wara
|
393
|
-
Wcho
|
394
|
-
Xpeo
|
395
|
-
Xsux
|
396
|
-
Yezi
|
397
|
-
Yiii
|
398
|
-
Zanb
|
399
|
-
Zinh
|
400
|
-
Zyyy
|
401
|
-
Zzzz
|
106
|
+
puts Unicode::Scripts.names(format: :short, augmented: :only)
|
402
107
|
```
|
403
108
|
|
404
|
-
|
109
|
+
You can find a list of all scripts in Unicode, with links to Wikipedia, on [character.construction/scripts](https://character.construction/scripts).
|
110
|
+
|
111
|
+
## Hints
|
112
|
+
### Regex Matching
|
113
|
+
|
114
|
+
If you have a string and want to match a substring/character from a specific Unicode script, you actually won't need this gem. Instead, you can use the [Regexp Unicode Property Syntax `\p{}`](https://ruby-doc.org/core/Regexp.html#class-Regexp-label-Character+Properties):
|
115
|
+
|
116
|
+
```ruby
|
117
|
+
"Coptic letter: ⲁ".scan(/\p{Coptic}/) # => ["ⲁ"]
|
118
|
+
```
|
119
|
+
|
120
|
+
See [Idiosyncratic Ruby: Proper Unicoding](https://idiosyncratic-ruby.com/41-proper-unicoding.html) for more info.
|
121
|
+
|
122
|
+
## Also See
|
123
|
+
|
124
|
+
- JavaScript implementation (same data & algorithms): [unicode-script.js](https://github.com/janlelis/unicode-script.js)
|
125
|
+
- Index created with: [unicoder](https://github.com/janlelis/unicoder)
|
126
|
+
- Get the Unicode blocks of a string: [unicode-blocks gem](https://github.com/janlelis/unicode-blocks)
|
127
|
+
- See [unicode-x](https://github.com/janlelis/unicode-x) for more Unicode related micro libraries for Ruby.
|
405
128
|
|
406
129
|
## MIT License
|
407
130
|
|
408
|
-
- Copyright (C) 2016-
|
131
|
+
- Copyright (C) 2016-2024 Jan Lelis <https://janlelis.com>. Released under the MIT license.
|
409
132
|
- Unicode data: https://www.unicode.org/copyright.html#Exhibit1
|
data/data/scripts.marshal.gz
CHANGED
Binary file
|
@@ -2,9 +2,11 @@
|
|
2
2
|
|
3
3
|
module Unicode
|
4
4
|
module Scripts
|
5
|
-
VERSION = "1.
|
6
|
-
UNICODE_VERSION = "
|
5
|
+
VERSION = "1.11.0"
|
6
|
+
UNICODE_VERSION = "16.0.0"
|
7
7
|
DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../../data/").freeze
|
8
8
|
INDEX_FILENAME = (DATA_DIRECTORY + "/scripts.marshal.gz").freeze
|
9
|
+
|
10
|
+
AUGMENTED_SCRIPT_CODES = ["Hanb", "Jpan", "Kore"]
|
9
11
|
end
|
10
12
|
end
|
data/lib/unicode/scripts.rb
CHANGED
@@ -46,11 +46,77 @@ module Unicode
|
|
46
46
|
}.sort
|
47
47
|
end
|
48
48
|
|
49
|
-
def self.
|
49
|
+
def self.augmented_scripts(string)
|
50
50
|
require_relative 'scripts/index' unless defined? ::Unicode::Scripts::INDEX
|
51
|
-
|
52
|
-
|
53
|
-
|
51
|
+
|
52
|
+
augmented = string.each_codepoint.inject([]){ |res, codepoint|
|
53
|
+
if new_scripts = INDEX[:SCRIPT_EXTENSIONS][codepoint]
|
54
|
+
script_extension_names = new_scripts.map{ |new_script|
|
55
|
+
INDEX[:SCRIPT_ALIASES].key(new_script)
|
56
|
+
}
|
57
|
+
else
|
58
|
+
script_extension_names = scripts([codepoint].pack("U"), format: :short)
|
59
|
+
end
|
60
|
+
|
61
|
+
res | script_extension_names
|
62
|
+
}
|
63
|
+
|
64
|
+
if augmented.include? "Hani"
|
65
|
+
augmented |= ["Hanb", "Jpan", "Kore"]
|
66
|
+
end
|
67
|
+
if augmented.include?("Hira") || augmented.include?("Kana")
|
68
|
+
augmented |= ["Jpan"]
|
69
|
+
end
|
70
|
+
if augmented.include? "Hang"
|
71
|
+
augmented |= ["Kore"]
|
72
|
+
end
|
73
|
+
if augmented.include? "Bopo"
|
74
|
+
augmented |= ["Hanb"]
|
75
|
+
end
|
76
|
+
if augmented.include?("Zyyy") || augmented.include?("Zinh")
|
77
|
+
augmented |= names(format: :short, augmented: :include )
|
78
|
+
end
|
79
|
+
|
80
|
+
augmented.sort
|
81
|
+
end
|
82
|
+
|
83
|
+
def self.resolved_scripts(string)
|
84
|
+
string.chars.reduce(
|
85
|
+
Unicode::Scripts.names(format: :short, augmented: :include)
|
86
|
+
){ |acc, char|
|
87
|
+
acc & augmented_scripts(char)
|
88
|
+
}
|
89
|
+
end
|
90
|
+
|
91
|
+
def self.mixed?(string)
|
92
|
+
resolved_scripts(string).empty?
|
93
|
+
end
|
94
|
+
|
95
|
+
def self.single?(string)
|
96
|
+
!resolved_scripts(string).empty?
|
97
|
+
end
|
98
|
+
|
99
|
+
# Lists scripts. Options:
|
100
|
+
# - format - :long, :short
|
101
|
+
# - augmented - :include, :exclude, :only
|
102
|
+
def self.names(format: :long, augmented: :exclude)
|
103
|
+
if format == :long && augmented != :exclude
|
104
|
+
raise ArgumentError, "only short four-letter script codes (ISO 15924) supported when listing augmented scripts"
|
105
|
+
end
|
106
|
+
|
107
|
+
if augmented == :only
|
108
|
+
return AUGMENTED_SCRIPT_CODES
|
109
|
+
end
|
110
|
+
|
111
|
+
require_relative 'scripts/index' unless defined? ::Unicode::Scripts::INDEX
|
112
|
+
|
113
|
+
if format == :long
|
114
|
+
INDEX[:SCRIPT_NAMES].sort
|
115
|
+
elsif augmented == :exclude
|
116
|
+
INDEX[:SCRIPT_ALIASES].keys.sort
|
117
|
+
else
|
118
|
+
(INDEX[:SCRIPT_ALIASES].keys + AUGMENTED_SCRIPT_CODES).sort
|
119
|
+
end
|
54
120
|
end
|
55
121
|
end
|
56
122
|
end
|
@@ -63,6 +63,7 @@ describe Unicode::Scripts do
|
|
63
63
|
"Gujarati",
|
64
64
|
"Gunjala_Gondi",
|
65
65
|
"Gurmukhi",
|
66
|
+
"Gurung_Khema",
|
66
67
|
"Kannada",
|
67
68
|
"Khudawadi",
|
68
69
|
"Limbu",
|
@@ -70,6 +71,7 @@ describe Unicode::Scripts do
|
|
70
71
|
"Malayalam",
|
71
72
|
"Masaram_Gondi",
|
72
73
|
"Nandinagari",
|
74
|
+
"Ol_Onal",
|
73
75
|
"Oriya",
|
74
76
|
"Sinhala",
|
75
77
|
"Syloti_Nagri",
|
@@ -89,12 +91,14 @@ describe Unicode::Scripts do
|
|
89
91
|
"Gonm",
|
90
92
|
"Gran",
|
91
93
|
"Gujr",
|
94
|
+
"Gukh",
|
92
95
|
"Guru",
|
93
96
|
"Knda",
|
94
97
|
"Limb",
|
95
98
|
"Mahj",
|
96
99
|
"Mlym",
|
97
100
|
"Nand",
|
101
|
+
"Onao",
|
98
102
|
"Orya",
|
99
103
|
"Sind",
|
100
104
|
"Sinh",
|
@@ -126,11 +130,61 @@ describe Unicode::Scripts do
|
|
126
130
|
end
|
127
131
|
end
|
128
132
|
|
133
|
+
describe ".augmented_scripts" do
|
134
|
+
it "will always return an Array" do
|
135
|
+
assert_equal [], Unicode::Scripts.augmented_scripts("")
|
136
|
+
end
|
137
|
+
|
138
|
+
it "will return all extended scripts that characters in the string belong to + augmented" do
|
139
|
+
assert_equal ["Hira", "Jpan", "Kana"], Unicode::Scripts.augmented_scripts("ねガ")
|
140
|
+
end
|
141
|
+
|
142
|
+
it "will replace Common with all scripts" do
|
143
|
+
assert_equal \
|
144
|
+
Unicode::Scripts.names(format: :short, augmented: :include),
|
145
|
+
Unicode::Scripts.augmented_scripts("1")
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
describe ".resolved_scripts" do
|
150
|
+
it "return intersection of augmented scripts per character" do
|
151
|
+
assert_equal ["Cyrl"], Unicode::Scripts.resolved_scripts("СігсӀе")
|
152
|
+
assert_equal [], Unicode::Scripts.resolved_scripts("Сirсlе")
|
153
|
+
assert_equal \
|
154
|
+
Unicode::Scripts.names(format: :short, augmented: :include),
|
155
|
+
Unicode::Scripts.resolved_scripts("𝖢𝗂𝗋𝖼𝗅𝖾")
|
156
|
+
end
|
157
|
+
end
|
158
|
+
|
159
|
+
describe "mixed?" do
|
160
|
+
it "will return true if .resolved_scripts(string) is empty" do
|
161
|
+
assert_equal false, Unicode::Scripts.mixed?("СігсӀе")
|
162
|
+
assert Unicode::Scripts.mixed?("Сirсlе")
|
163
|
+
assert_equal false, Unicode::Scripts.mixed?("𝖢𝗂𝗋𝖼𝗅𝖾")
|
164
|
+
assert_equal false, Unicode::Scripts.mixed?("1")
|
165
|
+
assert_equal false, Unicode::Scripts.mixed?("ねガ")
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
169
|
+
describe "single?" do
|
170
|
+
it "will return true if .resolved_scripts(string) is not empty" do
|
171
|
+
assert Unicode::Scripts.single?("СігсӀе")
|
172
|
+
assert_equal false, Unicode::Scripts.single?("Сirсlе")
|
173
|
+
assert Unicode::Scripts.single?("𝖢𝗂𝗋𝖼𝗅𝖾")
|
174
|
+
assert Unicode::Scripts.single?("1")
|
175
|
+
assert Unicode::Scripts.single?("ねガ")
|
176
|
+
end
|
177
|
+
end
|
178
|
+
|
129
179
|
describe ".names" do
|
130
180
|
it "will return a list of all script names" do
|
131
181
|
assert_kind_of Array, Unicode::Scripts.names
|
132
182
|
assert_includes Unicode::Scripts.names, "Inscriptional_Parthian"
|
133
183
|
end
|
184
|
+
|
185
|
+
it "will return a list of all augmented script codes" do
|
186
|
+
assert_equal Unicode::Scripts.names(format: :short, augmented: :only), ["Hanb", "Jpan", "Kore"]
|
187
|
+
end
|
134
188
|
end
|
135
189
|
end
|
136
190
|
|
metadata
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicode-scripts
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.11.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Lelis
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-11-03 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description: "[Unicode
|
13
|
+
description: "[Unicode 16.0.0] Retrieve the Unicode script(s) a string belongs to.
|
14
14
|
Can also return the Script_Extension property which is defined as characters which
|
15
15
|
are 'commonly used with more than one script, but with a limited number of scripts'. "
|
16
16
|
email:
|
@@ -39,7 +39,7 @@ licenses:
|
|
39
39
|
- MIT
|
40
40
|
metadata:
|
41
41
|
rubygems_mfa_required: 'true'
|
42
|
-
post_install_message:
|
42
|
+
post_install_message:
|
43
43
|
rdoc_options: []
|
44
44
|
require_paths:
|
45
45
|
- lib
|
@@ -54,8 +54,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
54
54
|
- !ruby/object:Gem::Version
|
55
55
|
version: '0'
|
56
56
|
requirements: []
|
57
|
-
rubygems_version: 3.
|
58
|
-
signing_key:
|
57
|
+
rubygems_version: 3.5.21
|
58
|
+
signing_key:
|
59
59
|
specification_version: 4
|
60
60
|
summary: Which script(s) does a Unicode string belong to?
|
61
61
|
test_files:
|