unicode_script_detector 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +50 -0
- data/lib/unicode_script_detector/character.rb +19 -0
- data/lib/unicode_script_detector/detector.rb +45 -0
- data/lib/unicode_script_detector/scripts.rb +516 -0
- data/lib/unicode_script_detector/version.rb +3 -0
- data/lib/unicode_script_detector.rb +20 -0
- metadata +83 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 642b20bc8d5ac4e9235bae7468d7b78eba92ff63522473ae4a17ee78bba1b82c
|
|
4
|
+
data.tar.gz: dc8c392cf8301c17f16718ff07f6ce3b3ba6e940126b3b1e23917340b61c7819
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: '0478c770eec54d36171fc251a6b061b6babddd222db0abc62b858c388213bf74711f44ebc6e2a3351509fcdc8c44addde5fcd29cd683dabea53c38d081499acb'
|
|
7
|
+
data.tar.gz: 45fac137742fc05c44514745018483c930fccd37839c96cc537bed1009b59a49287d1df0b2b8379ef59e5155bbb79b4cf2accc972c911dd3c75cc09937271072
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 David Arendsen
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# Unicode Script Detector
|
|
2
|
+
|
|
3
|
+
Detect all Unicode scripts in a text.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
Add this line to your application's Gemfile:
|
|
8
|
+
```ruby
|
|
9
|
+
gem "unicode_script_detector"
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
Or install it globally:
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
$ gem install unicode_script_detector
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Detect the script of every character in a string
|
|
19
|
+
```ruby
|
|
20
|
+
UnicodeScriptDetector.detect_characters "Hel6б"
|
|
21
|
+
|
|
22
|
+
#Output:
|
|
23
|
+
[#<UnicodeScriptDetector::Character:0x00007768fefdead8 @char="H", @name="Latin", @script=:Latin>,
|
|
24
|
+
#<UnicodeScriptDetector::Character:0x00007768fefdea10 @char="e", @name="Latin", @script=:Latin>,
|
|
25
|
+
#<UnicodeScriptDetector::Character:0x00007768fefde970 @char="l", @name="Latin", @script=:Latin>,
|
|
26
|
+
#<UnicodeScriptDetector::Character:0x00007768fefde8d0 @char="6", @name="Digit", @script=:Digit>,
|
|
27
|
+
#<UnicodeScriptDetector::Character:0x00007768fefde830 @char="б", @name="Cyrillic", @script=:Cyrillic>]
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Detect if a string contains certain scripts
|
|
31
|
+
```ruby
|
|
32
|
+
# This will return true because it contains Latin and Cyrillic
|
|
33
|
+
UnicodeScriptDetector.contains? "Hellб🔥", [:Latin, :Cyrillic]
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Detect if a string contains only certain scripts
|
|
37
|
+
```ruby
|
|
38
|
+
# This will return false because it contains an Emoji as well
|
|
39
|
+
UnicodeScriptDetector.contains_only? "Hellб🔥", [:Latin, :Cyrillic]
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Development
|
|
43
|
+
Start the console with `bin/console`.
|
|
44
|
+
Run the tests with `bin/test`.
|
|
45
|
+
|
|
46
|
+
## Contributing
|
|
47
|
+
You're welcome to contribute to this project. See https://github.com/davidarendsen/unicode_script_detector.
|
|
48
|
+
|
|
49
|
+
## License
|
|
50
|
+
This software is released under the [MIT license](LICENSE).
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
module UnicodeScriptDetector
  # Value object describing one character of an analysed string together with
  # the script it was classified as.
  class Character
    # @return [String] the character itself
    # @return [Symbol] the script identifier, e.g. +:Latin+ or +:Other+
    # @return [String] the human-readable script name, e.g. "Latin"
    attr_reader :char, :script, :name

    # @param char [String] the character that was classified
    # @param script [Symbol] identifier of the detected script
    # @param name [String] display name of the detected script
    def initialize(char, script, name)
      @char = char
      @script = script
      @name = name
    end

    # @return [Boolean] true when the character was classified as Latin
    def latin?
      @script == :Latin
    end

    # @return [Boolean] true when the character was classified as Hiragana
    def hiragana?
      @script == :Hiragana
    end
  end
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
module UnicodeScriptDetector
  # Classifies every character of a string against Scripts::LIST and answers
  # questions about which scripts the string contains.
  class Detector
    # @return [Array<Character>] one entry per character, in string order
    attr_reader :characters

    # Runs the detection immediately, so the readers are usable right after
    # construction.
    #
    # @param string [String] the text to analyse
    def initialize(string)
      @string = string
      @characters = []
      @scripts = []

      detect_scripts
    end

    # @return [Array<Symbol>] the distinct scripts found, in order of first
    #   appearance
    def scripts
      @scripts.uniq
    end

    # (Re)classifies every character of the string. The state is rebuilt from
    # scratch on each call, so calling this again does not duplicate entries.
    #
    # @return [void]
    def detect_scripts
      @characters = @string.each_char.map { |char| classify(char) }
      @scripts = @characters.map(&:script)
    end

    # Tells whether every given script occurs at least once in the string.
    # Other scripts may be present as well.
    #
    # @param scripts [Symbol, Array<Symbol>] one script or a list of scripts
    # @return [Boolean]
    def contains?(scripts)
      Array(scripts).all? { |script| @scripts.include?(script) }
    end

    # Tells whether the set of scripts found in the string is exactly the given
    # set: every listed script must occur and no other script may occur. Note
    # that pseudo-scripts such as :Whitespace, :Digit and :Other count too.
    #
    # @param scripts [Symbol, Array<Symbol>] one script or a list of scripts
    # @return [Boolean]
    def contains_only?(scripts)
      @scripts.uniq.sort == Array(scripts).uniq.sort
    end

    private

    # Builds the Character for +char+ from the first Scripts::LIST entry whose
    # regex matches, falling back to the :Other pseudo-script.
    def classify(char)
      entry = Scripts::LIST.find { |script_data| char.match?(script_data[:regex]) }
      return Character.new(char, :Other, "Other") unless entry

      Character.new(char, entry[:script], entry[:name])
    end
  end
end
|
|
@@ -0,0 +1,516 @@
|
|
|
1
|
+
module UnicodeScriptDetector
  # Ordered lookup table used to classify a single character.
  #
  # Each entry of LIST is a Hash with the keys +:script+ (Symbol identifier),
  # +:name+ (display String) and +:regex+ (Regexp matching one character of
  # that script). Consumers try the entries in order and take the first match,
  # so the order is significant:
  #
  # * :Digit and :Whitespace come first so that those characters are not
  #   swallowed by the broader :Emoji / :Common entries further down.
  # * :Emoji precedes :Common because pictographic emoji belong to the Common
  #   script and would otherwise never be reported as emoji.
  class Scripts
    # Unicode script property names; each one is matched with +\p{Name}+.
    UNICODE_SCRIPTS = %i[
      Arabic Armenian Balinese Bamum Batak Bengali Bopomofo Brahmi Braille
      Buginese Buhid Canadian_Aboriginal Carian Chakma Cham Cherokee Coptic
      Cuneiform Cypriot Cyrillic Deseret Devanagari Egyptian_Hieroglyphs
      Ethiopic Georgian Glagolitic Gothic Greek Gujarati Gurmukhi Han Hangul
      Hanunoo Hebrew Hiragana Imperial_Aramaic Inherited Inscriptional_Pahlavi
      Inscriptional_Parthian Javanese Kaithi Kannada Katakana Kayah_Li
      Kharoshthi Khmer Lao Latin Lepcha Limbu Linear_B Lycian Lydian Malayalam
      Mandaic Meetei_Mayek Meroitic_Cursive Meroitic_Hieroglyphs Miao Mongolian
      Myanmar New_Tai_Lue Nko Ogham Ol_Chiki Old_Italic Old_Persian
      Old_South_Arabian Old_Turkic Oriya Osmanya Phags_Pa Phoenician Rejang
      Runic Saurashtra Sharada Shavian Sinhala Sora_Sompeng Sundanese
      Syloti_Nagri Syriac Tagalog Tagbanwa Tai_Le Tai_Tham Tai_Viet Takri Tamil
      Telugu Thaana Thai Tibetan Tifinagh Ugaritic Vai Yi
    ].freeze

    LIST = [
      { script: :Digit, name: "Digit", regex: /\d/ },
      { script: :Whitespace, name: "Whitespace", regex: /\s/ },
      *UNICODE_SCRIPTS.map { |script| { script: script, name: script.to_s, regex: Regexp.new("\\p{#{script}}") } },
      # The Unicode Emoji property also covers the ASCII keycap bases "#" and
      # "*" (and the digits, already handled by :Digit above). Exclude them so
      # plain punctuation falls through to :Common instead of being reported
      # as an emoji.
      { script: :Emoji, name: "Emoji", regex: /(?![\#*])\p{Emoji}/ },
      { script: :Common, name: "Common", regex: /\p{Common}/ },
    ].each(&:freeze).freeze
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
require "zeitwerk"
|
|
2
|
+
|
|
3
|
+
loader = Zeitwerk::Loader.for_gem
|
|
4
|
+
loader.setup
|
|
5
|
+
|
|
6
|
+
# Public entry points of the gem. Every call analyses the given string with a
# fresh Detector and returns that detector's answer.
module UnicodeScriptDetector
  # Classifies each character of +string+.
  #
  # @param string [String] the text to analyse
  # @return [Array<Character>] one Character per character of the string
  def self.detect_characters(string)
    Detector.new(string).characters
  end

  # Asks the detector whether +string+ contains the given script(s).
  #
  # @param string [String] the text to analyse
  # @param scripts [Symbol, Array<Symbol>] one script or a list of scripts
  # @return [Boolean]
  def self.contains?(string, scripts)
    Detector.new(string).contains?(scripts)
  end

  # Asks the detector whether +string+ contains only the given script(s).
  #
  # @param string [String] the text to analyse
  # @param scripts [Symbol, Array<Symbol>] one script or a list of scripts
  # @return [Boolean]
  def self.contains_only?(string, scripts)
    Detector.new(string).contains_only?(scripts)
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: unicode_script_detector
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- David Arendsen
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2024-12-02 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: zeitwerk
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '2.5'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '2.5'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: activesupport
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - "~>"
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '7.2'
|
|
34
|
+
- - ">="
|
|
35
|
+
- !ruby/object:Gem::Version
|
|
36
|
+
version: 7.2.2
|
|
37
|
+
type: :runtime
|
|
38
|
+
prerelease: false
|
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
40
|
+
requirements:
|
|
41
|
+
- - "~>"
|
|
42
|
+
- !ruby/object:Gem::Version
|
|
43
|
+
version: '7.2'
|
|
44
|
+
- - ">="
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: 7.2.2
|
|
47
|
+
description: Detect the unicode script per character
|
|
48
|
+
email: davidarendsen@hey.com
|
|
49
|
+
executables: []
|
|
50
|
+
extensions: []
|
|
51
|
+
extra_rdoc_files: []
|
|
52
|
+
files:
|
|
53
|
+
- LICENSE
|
|
54
|
+
- README.md
|
|
55
|
+
- lib/unicode_script_detector.rb
|
|
56
|
+
- lib/unicode_script_detector/character.rb
|
|
57
|
+
- lib/unicode_script_detector/detector.rb
|
|
58
|
+
- lib/unicode_script_detector/scripts.rb
|
|
59
|
+
- lib/unicode_script_detector/version.rb
|
|
60
|
+
homepage: https://rubygems.org/gems/unicode_script_detector
|
|
61
|
+
licenses:
|
|
62
|
+
- MIT
|
|
63
|
+
metadata: {}
|
|
64
|
+
post_install_message:
|
|
65
|
+
rdoc_options: []
|
|
66
|
+
require_paths:
|
|
67
|
+
- lib
|
|
68
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
69
|
+
requirements:
|
|
70
|
+
- - ">="
|
|
71
|
+
- !ruby/object:Gem::Version
|
|
72
|
+
version: 3.1.0
|
|
73
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
74
|
+
requirements:
|
|
75
|
+
- - ">="
|
|
76
|
+
- !ruby/object:Gem::Version
|
|
77
|
+
version: '0'
|
|
78
|
+
requirements: []
|
|
79
|
+
rubygems_version: 3.5.11
|
|
80
|
+
signing_key:
|
|
81
|
+
specification_version: 4
|
|
82
|
+
summary: Unicode Script Detector
|
|
83
|
+
test_files: []
|