unicode-script 0.1.0 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -13
- data/lib/unicode_script/charts.rb +159 -159
- data/lib/unicode_script/core.rb +44 -36
- data/lib/unicode_script/version.rb +2 -3
- data/spec/lib/unicode_script_spec.rb +8 -8
- metadata +8 -8
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
MTM2NzVkNDBmZGRmMWE3MTk4YTA5ODIzMDQxN2VmNDQzZWQ5MGJkNg==
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: beeafac906b9c14b8d3510ad8bfee403d7d648f2
|
4
|
+
data.tar.gz: 36bdbfa5841bebe1ba16a5e31238ef53bd94f873
|
7
5
|
SHA512:
|
8
|
-
metadata.gz:
|
9
|
-
|
10
|
-
ZTgzYmQyMGE0MGU0OGVkZGJkMGE0YzE1NzY4MzA5MTg5NzAwOWI1YjMxZmJj
|
11
|
-
MzYwNWJjN2Q1YjNkZDJiNTgzZTE3OTNlZWM3Yjg0NjM4NTYxODQ=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
NzAzMzY1ODRhZWFkYzczMjUwZGYyOThmMGVjNDhiNWUxOGY5M2FmOTY2MDVm
|
14
|
-
YmRmOWU2Njk3MGRmYjJmNDIwYmZkMzJiZGIyYzg3OTBmYjg1YjhjNmUzOWM4
|
15
|
-
YTk1MGRkMjQ2OGViOTQwYTJlM2NiNmNjNGE3YzAwNDcwNDhmOTA=
|
6
|
+
metadata.gz: 3d3c9bf74deb4bf4dc3f1207ada3f0151824560ef9de8e14bdac665df2251dcd872341a3f94483d2705e24e750db987742e3422da380a43fdeb3423eadca34f4
|
7
|
+
data.tar.gz: 47680ec779b08865201ac5f1e1fa0e7f2f5b3e34eaaafe4d79cc06a19ebd176a62c32dce2839cff5c41c035dbdc0abe2499a6dd2560651238c403b18950b1a5c
|
@@ -1,161 +1,161 @@
|
|
1
1
|
module UnicodeScript
|
2
|
-
CHARTS =
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
2
|
+
CHARTS = {'armenian' => (0x0530..0x058f),
|
3
|
+
'coptic' => (0x2c80..0x2cff),
|
4
|
+
'greek and coptic' => (0x0370..0x03ff),
|
5
|
+
'cypriot syllabary' => (0x10800..0x1083f),
|
6
|
+
'cyrilic' => (0x0400..0x04ff),
|
7
|
+
'cyrilic supplement' => (0x0500..0x052f),
|
8
|
+
'cyrillic extended-a' => (0x2de0..0x2dff),
|
9
|
+
'cyrillic extended-b' => (0xa640..0xa69f),
|
10
|
+
'georgian' => (0x10a0..0x10ff),
|
11
|
+
'georgian supplement' => (0x2d00..0x2d2f),
|
12
|
+
'hiragana' => (0x3040..0x309f),
|
13
|
+
'glagolitic' => (0x2c00..0x2c5f),
|
14
|
+
'gothic' => (0x10330..0x1034f),
|
15
|
+
'greek extended' => (0x1f00..0x1fff),
|
16
|
+
'basic latin' => (0x0000..0x007f),
|
17
|
+
'c1 controls and latin-1 supplement' => (0x0080..0x00ff),
|
18
|
+
'latin extended-a' => (0x0100..0x017f),
|
19
|
+
'latin extended-b' => (0x0180..0x024f),
|
20
|
+
'latin extended-c' => (0x2c60..0x2c7f),
|
21
|
+
'latin extended-d' => (0xa720..0xa7ff),
|
22
|
+
'latin extended additional' => (0x1e00..0x1eff),
|
23
|
+
'fullwidth ascii' => (0x0020..0x007e),
|
24
|
+
'halfwidth cjk punctuation' => (0x3000..0x303f),
|
25
|
+
'halfwidth hangul' => (0x3130..0x318f),
|
26
|
+
'linear b syllabary' => (0x10000..0x1007f),
|
27
|
+
'linear b ideograms' => (0x10080..0x100ff),
|
28
|
+
'ogham' => (0x1680..0x169f),
|
29
|
+
'old italic' => (0x10300..0x1032f),
|
30
|
+
'phaistos disc' => (0x101d0..0x101ff),
|
31
|
+
'runic' => (0x16a0..0x16ff),
|
32
|
+
'shavian' => (0x10450..0x1047f),
|
33
|
+
'ipa extensions' => (0x0250..0x02af),
|
34
|
+
'phonetic extensions' => (0x1d00..0x1d7f),
|
35
|
+
'phonetic extensions supplement' => (0x1d80..0x1dbf),
|
36
|
+
'modifier tone letters' => (0xa700..0xa71f),
|
37
|
+
'spacing modifier letters' => (0x02b0..0x02ff),
|
38
|
+
'superscripts and subscripts' => (0x2070..0x209f),
|
39
|
+
'combining diacritical marks' => (0x0300..0x036f),
|
40
|
+
'combining diacritical marks supplement' => (0x1dc0..0x1dff),
|
41
|
+
'combining half marks' => (0xfe20..0xfe2f),
|
42
|
+
'bamum' => (0xa6a0..0xa6ff),
|
43
|
+
'bamum supplement' => (0x16800..0x16a3f),
|
44
|
+
'egyptian hieroglyphs' => (0x13000..0x1342f),
|
45
|
+
'ethiopic' => (0x1200..0x137f),
|
46
|
+
'ethiopic supplement' => (0x1380..0x139f),
|
47
|
+
'ethiopic extended' => (0x2d80..0x2ddf),
|
48
|
+
'ethiopic extended-a' => (0xab00..0xab2f),
|
49
|
+
'meroitic cursive' => (0x109a0..0x109ff),
|
50
|
+
'meroitic hieroglyphs' => (0x10980..0x1099f),
|
51
|
+
'nko' => (0x07c0..0x07ff),
|
52
|
+
'osmanya' => (0x10480..0x104af),
|
53
|
+
'tifinagh' => (0x2d30..0x2d7f),
|
54
|
+
'vai' => (0xa500..0xa63f),
|
55
|
+
'arabic' => (0x0600..0x06ff),
|
56
|
+
'arabic supplement' => (0x0750..0x077f),
|
57
|
+
'arabic extended-a' => (0x08a0..0x08ff),
|
58
|
+
'arabic presentation forms-a' => (0xfb50..0xfdff),
|
59
|
+
'arabic presentation forms-b' => (0xfe70..0xfeff),
|
60
|
+
'imperial aramaic' => (0x10840..0x1085f),
|
61
|
+
'avestan' => (0x10b00..0x10b3f),
|
62
|
+
'carian' => (0x102a0..0x102df),
|
63
|
+
'cuneiform' => (0x12000..0x123ff),
|
64
|
+
'cuneiform numbers and punctuation' => (0x12400..0x1247f),
|
65
|
+
'old persian' => (0x103a0..0x103df),
|
66
|
+
'ugaritic' => (0x10380..0x1039f),
|
67
|
+
'hebrew' => (0x0590..0x05ff),
|
68
|
+
'lycian' => (0x10280..0x1029f),
|
69
|
+
'lydian' => (0x10920..0x1093f),
|
70
|
+
'mandaic' => (0x0840..0x085f),
|
71
|
+
'old south arabian' => (0x10a60..0x10a7f),
|
72
|
+
'inscriptional pahlavi' => (0x10b60..0x10b7f),
|
73
|
+
'inscriptional parthian' => (0x10b40..0x10b5f),
|
74
|
+
'phoenician' => (0x10900..0x1091f),
|
75
|
+
'samaritan' => (0x0800..0x083f),
|
76
|
+
'syriac' => (0x0700..0x074f),
|
77
|
+
'mongolian' => (0x1800..0x18af),
|
78
|
+
'old turkic' => (0x10c00..0x10c4f),
|
79
|
+
'phags-pa' => (0xa840..0xa87f),
|
80
|
+
'tibetan' => (0x0f00..0x0fff),
|
81
|
+
'bengali' => (0x0980..0x09ff),
|
82
|
+
'brahmi' => (0x11000..0x1107f),
|
83
|
+
'chakma' => (0x11100..0x1114f),
|
84
|
+
'devanagari' => (0x0900..0x097f),
|
85
|
+
'devanagari extended' => (0xa8e0..0xa8ff),
|
86
|
+
'gujarati' => (0x0a80..0x0aff),
|
87
|
+
'gurmukhi' => (0x0a00..0x0a7f),
|
88
|
+
'kaithi' => (0x11080..0x110cf),
|
89
|
+
'kannada' => (0x0c80..0x0cff),
|
90
|
+
'kharoshthi' => (0x10a00..0x10a5f),
|
91
|
+
'lepcha' => (0x1c00..0x1c4f),
|
92
|
+
'limbu' => (0x1900..0x194f),
|
93
|
+
'malayalam' => (0x0d00..0x0d7f),
|
94
|
+
'meetei mayek' => (0xabc0..0xabff),
|
95
|
+
'meetei mayek extensions' => (0xaae0..0xaaff),
|
96
|
+
'ol chiki' => (0x1c50..0x1c7f),
|
97
|
+
'oriya' => (0x0b00..0x0b7f),
|
98
|
+
'saurashtra' => (0xa880..0xa8df),
|
99
|
+
'sharada' => (0x11180..0x111df),
|
100
|
+
'sinhala' => (0x0d80..0x0dff),
|
101
|
+
'sora sompeng' => (0x110d0..0x110ff),
|
102
|
+
'syloti nagri' => (0xa800..0xa82f),
|
103
|
+
'takri' => (0x11680..0x116cf),
|
104
|
+
'tamil' => (0x0b80..0x0bff),
|
105
|
+
'telugu' => (0x0c00..0x0c7f),
|
106
|
+
'thaana' => (0x0780..0x07bf),
|
107
|
+
'vedic extensions' => (0x1cd0..0x1cff),
|
108
|
+
'balinese' => (0x1b00..0x1b7f),
|
109
|
+
'batak' => (0x1bc0..0x1bff),
|
110
|
+
'buginese' => (0x1a00..0x1a1f),
|
111
|
+
'cham' => (0xaa00..0xaa5f),
|
112
|
+
'javanese' => (0xa980..0xa9df),
|
113
|
+
'kayah li' => (0xa900..0xa92f),
|
114
|
+
'khmer' => (0x1780..0x17ff),
|
115
|
+
'khmer symbols' => (0x19e0..0x19ff),
|
116
|
+
'lao' => (0x0e80..0x0eff),
|
117
|
+
'myanmar' => (0x1000..0x109f),
|
118
|
+
'myanmar extended-a' => (0xaa60..0xaa7f),
|
119
|
+
'new tai lue' => (0x1980..0x19df),
|
120
|
+
'rejang' => (0xa930..0xa95f),
|
121
|
+
'sundanese' => (0x1b80..0x1bbf),
|
122
|
+
'sundanese supplement' => (0x1cc0..0x1ccf),
|
123
|
+
'tai le' => (0x1950..0x197f),
|
124
|
+
'tai tham' => (0x1a20..0x1aaf),
|
125
|
+
'tai viet' => (0xaa80..0xaadf),
|
126
|
+
'thai' => (0x0e00..0x0e7f),
|
127
|
+
'buhid' => (0x1740..0x175f),
|
128
|
+
'hanunoo' => (0x1720..0x173f),
|
129
|
+
'tagalog' => (0x1700..0x171f),
|
130
|
+
'tagbanwa' => (0x1760..0x177f),
|
131
|
+
'bopomofo' => (0x3100..0x312f),
|
132
|
+
'bopomofo extended' => (0x31a0..0x31bf),
|
133
|
+
'cjk unified ideographs' => (0x4e00..0x9fcc),
|
134
|
+
'cjk unified ideographs extension a' => (0x3400..0x4db5),
|
135
|
+
'cjk unified ideographs extension b' => (0x20000..0x2a6d6),
|
136
|
+
'cjk unified ideographs extension c' => (0x2a700..0x2b734),
|
137
|
+
'cjk unified ideographs extension d' => (0x2b740..0x2b81d),
|
138
|
+
'cjk compatibility ideographs' => (0xf900..0xfaff),
|
139
|
+
'cjk compatibility ideographs supplement' => (0x2f800..0x2fa1f),
|
140
|
+
'kangxi radicals' => (0x2f00..0x2fdf),
|
141
|
+
'cjk radicals supplement' => (0x2e80..0x2eff),
|
142
|
+
'cjk strokes' => (0x31c0..0x31ef),
|
143
|
+
'hangul jamo' => (0x1100..0x11ff),
|
144
|
+
'hangul jamo extended-a' => (0xa960..0xa97f),
|
145
|
+
'hangul jamo extended-b' => (0xd7b0..0xd7ff),
|
146
|
+
'hangul compatibility jamo' => (0x3130..0x318f),
|
147
|
+
'hiragana' => (0x3040..0x309f),
|
148
|
+
'katakana' => (0x30a0..0x30ff),
|
149
|
+
'katakana phonetic extensions' => (0x31f0..0x31ff),
|
150
|
+
'kana supplement' => (0x1b000..0x1b0ff),
|
151
|
+
'kanbun' => (0x3190..0x319f),
|
152
|
+
'lisu' => (0xa4d0..0xa4ff),
|
153
|
+
'miao' => (0x16f00..0x16f9f),
|
154
|
+
'yi syllables' => (0xa000..0xa48f),
|
155
|
+
'yi radicals' => (0xa490..0xa4cf),
|
156
|
+
'cherokee' => (0x13a0..0x13ff),
|
157
|
+
'deseret' => (0x10400..0x1044f),
|
158
|
+
'unified canadian aboriginal syllabics' => (0x1400..0x167f),
|
159
|
+
'unified canadian aboriginal syllabics extended' => (0x18b0..0x18ff)
|
160
|
+
}
|
161
161
|
end
|
data/lib/unicode_script/core.rb
CHANGED
@@ -1,46 +1,54 @@
|
|
1
1
|
module UnicodeScript
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
end
|
3
|
+
def self.detect(string)
|
4
|
+
res = []
|
5
|
+
string.tr!(' ','')
|
6
|
+
string.codepoints.each do |c|
|
7
|
+
script = find_script(c)
|
8
|
+
index = res.find_index{|v| v[:script] == script}
|
9
|
+
if script
|
10
|
+
if index
|
11
|
+
res[index][:value].push(c.chr)
|
12
|
+
else
|
13
|
+
res.push({script: script, value: [].push(c.chr)})
|
15
14
|
end
|
16
|
-
|
17
|
-
end
|
18
|
-
res.each do |r|
|
19
|
-
r[:value] = r[:value].join('')
|
20
15
|
end
|
21
|
-
|
16
|
+
|
22
17
|
end
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
18
|
+
res.each do |r|
|
19
|
+
r[:value] = r[:value].join('')
|
20
|
+
end
|
21
|
+
res
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.method_missing(method, val)
|
25
|
+
script_name = method.to_s.gsub('_', ' ').chop
|
26
|
+
puts script_name
|
27
|
+
if charted? script_name
|
28
|
+
val.codepoints.each do |point|
|
29
|
+
return false if !(CHARTS[script_name].include?(point))
|
33
30
|
end
|
31
|
+
return true
|
32
|
+
else
|
34
33
|
super
|
35
34
|
end
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
35
|
+
end
|
36
|
+
|
37
|
+
def self.respond_to_missing?(method, include_private = false)
|
38
|
+
charted?(method.to_s.gsub('_', ' ').chop) || super
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
def self.charted? script
|
44
|
+
CHARTS.has_key?(script)
|
45
|
+
end
|
46
|
+
|
47
|
+
def self.find_script(codepoint)
|
48
|
+
CHARTS.each do |k, v|
|
49
|
+
return k if v.include? codepoint
|
44
50
|
end
|
51
|
+
nil
|
52
|
+
end
|
45
53
|
|
46
|
-
end
|
54
|
+
end
|
@@ -6,16 +6,16 @@ describe 'UnicodeScript' do
|
|
6
6
|
h = 'ひらがな'
|
7
7
|
k = 'カタカナ'
|
8
8
|
mixed = "東京 Tokyo"
|
9
|
-
UnicodeScript.detect(h).should eq([{:script => '
|
10
|
-
UnicodeScript.detect(k).should eq([{:script => '
|
11
|
-
UnicodeScript.detect(mixed).should eq([{:script => '
|
12
|
-
|
9
|
+
UnicodeScript.detect(h).should eq([{:script => 'hiragana', :value => 'ひらがな'}])
|
10
|
+
UnicodeScript.detect(k).should eq([{:script => 'katakana', :value => 'カタカナ'}])
|
11
|
+
UnicodeScript.detect(mixed).should eq([{:script => 'cjk unified ideographs', :value => '東京'},
|
12
|
+
{:script => 'basic latin', :value => 'Tokyo'}])
|
13
13
|
end
|
14
|
-
|
14
|
+
|
15
15
|
it 'should be able to check whether string belongs to certain script' do
|
16
|
-
h = '
|
16
|
+
h = '漢字'
|
17
17
|
mixed = 'ひらaaaがな'
|
18
|
-
UnicodeScript.
|
18
|
+
UnicodeScript.cjk_unified_ideographs?(h).should eq(true)
|
19
19
|
UnicodeScript.hiragana?(mixed).should eq(false)
|
20
20
|
end
|
21
|
-
end
|
21
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicode-script
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yuri-gg
|
@@ -14,28 +14,28 @@ dependencies:
|
|
14
14
|
name: rake
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rspec
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
description: Small utility that allows you to detect scripts (languages) in unicode
|
@@ -61,17 +61,17 @@ require_paths:
|
|
61
61
|
- lib
|
62
62
|
required_ruby_version: !ruby/object:Gem::Requirement
|
63
63
|
requirements:
|
64
|
-
- -
|
64
|
+
- - ">="
|
65
65
|
- !ruby/object:Gem::Version
|
66
66
|
version: '0'
|
67
67
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
68
|
requirements:
|
69
|
-
- -
|
69
|
+
- - ">="
|
70
70
|
- !ruby/object:Gem::Version
|
71
71
|
version: '0'
|
72
72
|
requirements: []
|
73
73
|
rubyforge_project:
|
74
|
-
rubygems_version: 2.1
|
74
|
+
rubygems_version: 2.2.0.preview.1
|
75
75
|
signing_key:
|
76
76
|
specification_version: 4
|
77
77
|
summary: Unicode script detector
|