unicode-script 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ N2M5NTcxNWNkMDVkYWFiMDg3M2U1ZDRjMjEwODhhMGQ1MjI5M2MyNw==
5
+ data.tar.gz: !binary |-
6
+ MTM2NzVkNDBmZGRmMWE3MTk4YTA5ODIzMDQxN2VmNDQzZWQ5MGJkNg==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ ZTg5ZWVlMmJkYjA0MGNlMGJjZDk2YTI1ZWMyYjEyODIzZmUyZTI1N2JlZDI3
10
+ ZTgzYmQyMGE0MGU0OGVkZGJkMGE0YzE1NzY4MzA5MTg5NzAwOWI1YjMxZmJj
11
+ MzYwNWJjN2Q1YjNkZDJiNTgzZTE3OTNlZWM3Yjg0NjM4NTYxODQ=
12
+ data.tar.gz: !binary |-
13
+ NzAzMzY1ODRhZWFkYzczMjUwZGYyOThmMGVjNDhiNWUxOGY5M2FmOTY2MDVm
14
+ YmRmOWU2Njk3MGRmYjJmNDIwYmZkMzJiZGIyYzg3OTBmYjg1YjhjNmUzOWM4
15
+ YTk1MGRkMjQ2OGViOTQwYTJlM2NiNmNjNGE3YzAwNDcwNDhmOTA=
@@ -0,0 +1,9 @@
# Entry point of the unicode-script gem: loads each library component in
# order (chart table, detection logic, version constants).
%w[charts core version].each do |component|
  require "unicode_script/#{component}"
end

module UnicodeScript
  # NOTE(review): this assignment is process-wide, not scoped to this module;
  # requiring the gem changes Encoding.default_internal for the whole program.
  # Presumably it exists so that Integer#chr calls made without an explicit
  # encoding can produce characters above 0xFF -- verify before removing.
  Encoding.default_internal = Encoding::UTF_8
end
@@ -0,0 +1,161 @@
module UnicodeScript
  # Lookup table mapping a chart (script / block) name to the inclusive range
  # of Unicode code points it covers.
  #
  # Each entry is a frozen Hash of the form
  #   {:name => String, :range => Range of Integer code points}
  #
  # Lookups walk the table from top to bottom and stop at the first entry
  # whose range contains the code point, so ranges must not overlap;
  # otherwise the later entry can never be reported.
  #
  # Corrections relative to the first release of the table:
  # * 'Hiragana' was listed twice; the stray copy between the Georgian and
  #   Glagolitic entries is gone (the range is unchanged).
  # * 'Fullwidth ASCII' used the plain ASCII range 0x0020..0x007E, which is
  #   already covered by 'Basic Latin'. It now points at the fullwidth ASCII
  #   variants U+FF01..U+FF5E.
  # * 'Halfwidth Hangul' used the 'Hangul Compatibility Jamo' range and hid
  #   that entry. It now points at the halfwidth Hangul variants
  #   U+FFA0..U+FFDC.
  # * 'Halfwidth CJK punctuation' used U+3000..U+303F, which is the
  #   'CJK Symbols and Punctuation' block. That block now has its own entry
  #   and the halfwidth entry points at U+FF61..U+FF64.
  #
  # NOTE(review): 'Cyrilic' and 'Cyrilic Supplement' are misspellings of
  # 'Cyrillic'. They are kept as-is because chart names are part of the
  # public output; renaming them needs a deprecation path.
  CHARTS = [
    {:name => 'Armenian', :range => (0x0530..0x058F)},
    {:name => 'Coptic', :range => (0x2C80..0x2CFF)},
    {:name => 'Greek and Coptic', :range => (0x0370..0x03FF)},
    {:name => 'Cypriot Syllabary', :range => (0x10800..0x1083F)},
    {:name => 'Cyrilic', :range => (0x0400..0x04FF)},
    {:name => 'Cyrilic Supplement', :range => (0x0500..0x052F)},
    {:name => 'Cyrillic Extended-A', :range => (0x2DE0..0x2DFF)},
    {:name => 'Cyrillic Extended-B', :range => (0xA640..0xA69F)},
    {:name => 'Georgian', :range => (0x10A0..0x10FF)},
    {:name => 'Georgian Supplement', :range => (0x2D00..0x2D2F)},
    {:name => 'Glagolitic', :range => (0x2C00..0x2C5F)},
    {:name => 'Gothic', :range => (0x10330..0x1034F)},
    {:name => 'Greek Extended', :range => (0x1F00..0x1FFF)},
    {:name => 'Basic Latin', :range => (0x0000..0x007F)},
    {:name => 'C1 Controls and Latin-1 Supplement', :range => (0x0080..0x00FF)},
    {:name => 'Latin Extended-A', :range => (0x0100..0x017F)},
    {:name => 'Latin Extended-B', :range => (0x0180..0x024F)},
    {:name => 'Latin Extended-C', :range => (0x2C60..0x2C7F)},
    {:name => 'Latin Extended-D', :range => (0xA720..0xA7FF)},
    {:name => 'Latin Extended Additional', :range => (0x1E00..0x1EFF)},
    {:name => 'Fullwidth ASCII', :range => (0xFF01..0xFF5E)},
    {:name => 'Halfwidth CJK punctuation', :range => (0xFF61..0xFF64)},
    {:name => 'CJK Symbols and Punctuation', :range => (0x3000..0x303F)},
    {:name => 'Halfwidth Hangul', :range => (0xFFA0..0xFFDC)},
    {:name => 'Linear B Syllabary', :range => (0x10000..0x1007F)},
    {:name => 'Linear B Ideograms', :range => (0x10080..0x100FF)},
    {:name => 'Ogham', :range => (0x1680..0x169F)},
    {:name => 'Old Italic', :range => (0x10300..0x1032F)},
    {:name => 'Phaistos Disc', :range => (0x101D0..0x101FF)},
    {:name => 'Runic', :range => (0x16A0..0x16FF)},
    {:name => 'Shavian', :range => (0x10450..0x1047F)},
    {:name => 'IPA Extensions', :range => (0x0250..0x02AF)},
    {:name => 'Phonetic Extensions', :range => (0x1D00..0x1D7F)},
    {:name => 'Phonetic Extensions Supplement', :range => (0x1D80..0x1DBF)},
    {:name => 'Modifier Tone Letters', :range => (0xA700..0xA71F)},
    {:name => 'Spacing Modifier Letters', :range => (0x02B0..0x02FF)},
    {:name => 'Superscripts and Subscripts', :range => (0x2070..0x209F)},
    {:name => 'Combining Diacritical Marks', :range => (0x0300..0x036F)},
    {:name => 'Combining Diacritical Marks Supplement', :range => (0x1DC0..0x1DFF)},
    {:name => 'Combining Half Marks', :range => (0xFE20..0xFE2F)},
    {:name => 'Bamum', :range => (0xA6A0..0xA6FF)},
    {:name => 'Bamum Supplement', :range => (0x16800..0x16A3F)},
    {:name => 'Egyptian Hieroglyphs', :range => (0x13000..0x1342F)},
    {:name => 'Ethiopic', :range => (0x1200..0x137F)},
    {:name => 'Ethiopic Supplement', :range => (0x1380..0x139F)},
    {:name => 'Ethiopic Extended', :range => (0x2D80..0x2DDF)},
    {:name => 'Ethiopic Extended-A', :range => (0xAB00..0xAB2F)},
    {:name => 'Meroitic Cursive', :range => (0x109A0..0x109FF)},
    {:name => 'Meroitic Hieroglyphs', :range => (0x10980..0x1099F)},
    {:name => 'NKo', :range => (0x07C0..0x07FF)},
    {:name => 'Osmanya', :range => (0x10480..0x104AF)},
    {:name => 'Tifinagh', :range => (0x2D30..0x2D7F)},
    {:name => 'Vai', :range => (0xA500..0xA63F)},
    {:name => 'Arabic', :range => (0x0600..0x06FF)},
    {:name => 'Arabic Supplement', :range => (0x0750..0x077F)},
    {:name => 'Arabic Extended-A', :range => (0x08A0..0x08FF)},
    {:name => 'Arabic Presentation Forms-A', :range => (0xFB50..0xFDFF)},
    {:name => 'Arabic Presentation Forms-B', :range => (0xFE70..0xFEFF)},
    {:name => 'Imperial Aramaic', :range => (0x10840..0x1085F)},
    {:name => 'Avestan', :range => (0x10B00..0x10B3F)},
    {:name => 'Carian', :range => (0x102A0..0x102DF)},
    {:name => 'Cuneiform', :range => (0x12000..0x123FF)},
    {:name => 'Cuneiform Numbers and Punctuation', :range => (0x12400..0x1247F)},
    {:name => 'Old Persian', :range => (0x103A0..0x103DF)},
    {:name => 'Ugaritic', :range => (0x10380..0x1039F)},
    {:name => 'Hebrew', :range => (0x0590..0x05FF)},
    {:name => 'Lycian', :range => (0x10280..0x1029F)},
    {:name => 'Lydian', :range => (0x10920..0x1093F)},
    {:name => 'Mandaic', :range => (0x0840..0x085F)},
    {:name => 'Old South Arabian', :range => (0x10A60..0x10A7F)},
    {:name => 'Inscriptional Pahlavi', :range => (0x10B60..0x10B7F)},
    {:name => 'Inscriptional Parthian', :range => (0x10B40..0x10B5F)},
    {:name => 'Phoenician', :range => (0x10900..0x1091F)},
    {:name => 'Samaritan', :range => (0x0800..0x083F)},
    {:name => 'Syriac', :range => (0x0700..0x074F)},
    {:name => 'Mongolian', :range => (0x1800..0x18AF)},
    {:name => 'Old Turkic', :range => (0x10C00..0x10C4F)},
    {:name => 'Phags-pa', :range => (0xA840..0xA87F)},
    {:name => 'Tibetan', :range => (0x0F00..0x0FFF)},
    {:name => 'Bengali', :range => (0x0980..0x09FF)},
    {:name => 'Brahmi', :range => (0x11000..0x1107F)},
    {:name => 'Chakma', :range => (0x11100..0x1114F)},
    {:name => 'Devanagari', :range => (0x0900..0x097F)},
    {:name => 'Devanagari Extended', :range => (0xA8E0..0xA8FF)},
    {:name => 'Gujarati', :range => (0x0A80..0x0AFF)},
    {:name => 'Gurmukhi', :range => (0x0A00..0x0A7F)},
    {:name => 'Kaithi', :range => (0x11080..0x110CF)},
    {:name => 'Kannada', :range => (0x0C80..0x0CFF)},
    {:name => 'Kharoshthi', :range => (0x10A00..0x10A5F)},
    {:name => 'Lepcha', :range => (0x1C00..0x1C4F)},
    {:name => 'Limbu', :range => (0x1900..0x194F)},
    {:name => 'Malayalam', :range => (0x0D00..0x0D7F)},
    {:name => 'Meetei Mayek', :range => (0xABC0..0xABFF)},
    {:name => 'Meetei Mayek Extensions', :range => (0xAAE0..0xAAFF)},
    {:name => 'Ol Chiki', :range => (0x1C50..0x1C7F)},
    {:name => 'Oriya', :range => (0x0B00..0x0B7F)},
    {:name => 'Saurashtra', :range => (0xA880..0xA8DF)},
    {:name => 'Sharada', :range => (0x11180..0x111DF)},
    {:name => 'Sinhala', :range => (0x0D80..0x0DFF)},
    {:name => 'Sora Sompeng', :range => (0x110D0..0x110FF)},
    {:name => 'Syloti Nagri', :range => (0xA800..0xA82F)},
    {:name => 'Takri', :range => (0x11680..0x116CF)},
    {:name => 'Tamil', :range => (0x0B80..0x0BFF)},
    {:name => 'Telugu', :range => (0x0C00..0x0C7F)},
    {:name => 'Thaana', :range => (0x0780..0x07BF)},
    {:name => 'Vedic Extensions', :range => (0x1CD0..0x1CFF)},
    {:name => 'Balinese', :range => (0x1B00..0x1B7F)},
    {:name => 'Batak', :range => (0x1BC0..0x1BFF)},
    {:name => 'Buginese', :range => (0x1A00..0x1A1F)},
    {:name => 'Cham', :range => (0xAA00..0xAA5F)},
    {:name => 'Javanese', :range => (0xA980..0xA9DF)},
    {:name => 'Kayah Li', :range => (0xA900..0xA92F)},
    {:name => 'Khmer', :range => (0x1780..0x17FF)},
    {:name => 'Khmer Symbols', :range => (0x19E0..0x19FF)},
    {:name => 'Lao', :range => (0x0E80..0x0EFF)},
    {:name => 'Myanmar', :range => (0x1000..0x109F)},
    {:name => 'Myanmar Extended-A', :range => (0xAA60..0xAA7F)},
    {:name => 'New Tai Lue', :range => (0x1980..0x19DF)},
    {:name => 'Rejang', :range => (0xA930..0xA95F)},
    {:name => 'Sundanese', :range => (0x1B80..0x1BBF)},
    {:name => 'Sundanese Supplement', :range => (0x1CC0..0x1CCF)},
    {:name => 'Tai Le', :range => (0x1950..0x197F)},
    {:name => 'Tai Tham', :range => (0x1A20..0x1AAF)},
    {:name => 'Tai Viet', :range => (0xAA80..0xAADF)},
    {:name => 'Thai', :range => (0x0E00..0x0E7F)},
    {:name => 'Buhid', :range => (0x1740..0x175F)},
    {:name => 'Hanunoo', :range => (0x1720..0x173F)},
    {:name => 'Tagalog', :range => (0x1700..0x171F)},
    {:name => 'Tagbanwa', :range => (0x1760..0x177F)},
    {:name => 'Bopomofo', :range => (0x3100..0x312F)},
    {:name => 'Bopomofo Extended', :range => (0x31A0..0x31BF)},
    {:name => 'CJK Unified Ideographs', :range => (0x4E00..0x9FCC)},
    {:name => 'CJK Unified Ideographs Extension A', :range => (0x3400..0x4DB5)},
    {:name => 'CJK Unified Ideographs Extension B', :range => (0x20000..0x2A6D6)},
    {:name => 'CJK Unified Ideographs Extension C', :range => (0x2A700..0x2B734)},
    {:name => 'CJK Unified Ideographs Extension D', :range => (0x2B740..0x2B81D)},
    {:name => 'CJK Compatibility Ideographs', :range => (0xF900..0xFAFF)},
    {:name => 'CJK Compatibility Ideographs Supplement', :range => (0x2F800..0x2FA1F)},
    {:name => 'Kangxi Radicals', :range => (0x2F00..0x2FDF)},
    {:name => 'CJK Radicals Supplement', :range => (0x2E80..0x2EFF)},
    {:name => 'CJK Strokes', :range => (0x31C0..0x31EF)},
    {:name => 'Hangul Jamo', :range => (0x1100..0x11FF)},
    {:name => 'Hangul Jamo Extended-A', :range => (0xA960..0xA97F)},
    {:name => 'Hangul Jamo Extended-B', :range => (0xD7B0..0xD7FF)},
    {:name => 'Hangul Compatibility Jamo', :range => (0x3130..0x318F)},
    {:name => 'Hiragana', :range => (0x3040..0x309F)},
    {:name => 'Katakana', :range => (0x30A0..0x30FF)},
    {:name => 'Katakana Phonetic Extensions', :range => (0x31F0..0x31FF)},
    {:name => 'Kana Supplement', :range => (0x1B000..0x1B0FF)},
    {:name => 'Kanbun', :range => (0x3190..0x319F)},
    {:name => 'Lisu', :range => (0xA4D0..0xA4FF)},
    {:name => 'Miao', :range => (0x16F00..0x16F9F)},
    {:name => 'Yi Syllables', :range => (0xA000..0xA48F)},
    {:name => 'Yi Radicals', :range => (0xA490..0xA4CF)},
    {:name => 'Cherokee', :range => (0x13A0..0x13FF)},
    {:name => 'Deseret', :range => (0x10400..0x1044F)},
    {:name => 'Unified Canadian Aboriginal Syllabics', :range => (0x1400..0x167F)},
    {:name => 'Unified Canadian Aboriginal Syllabics Extended', :range => (0x18B0..0x18FF)}
  ].each(&:freeze).freeze # shared lookup data: guard against accidental mutation
end
@@ -0,0 +1,46 @@
# Script detection and per-script predicates driven by the CHARTS table.
module UnicodeScript
  # Groups the characters of +string+ by the chart (script) they belong to.
  #
  # ASCII spaces are ignored and characters that match no chart are dropped.
  # Groups appear in order of first occurrence and characters keep their
  # relative order inside a group. The argument is never modified, so frozen
  # strings are accepted.
  #
  # @param string [String] text to analyse
  # @return [Array<Hash>] one +{:script => name, :value => characters}+ hash
  #   per detected script; empty when nothing matches
  def self.detect(string)
    grouped = Hash.new { |groups, script| groups[script] = [] }

    # Reuse the characters of the input instead of rebuilding them from code
    # points: Integer#chr without an encoding yields binary bytes for
    # U+0080..U+00FF and depends on Encoding.default_internal above that.
    string.each_char do |char|
      next if char == ' '

      script = find_script(char.ord)
      grouped[script] << char if script
    end

    grouped.map { |script, chars| {:script => script, :value => chars.join} }
  end

  # Implements the dynamic +<chart name>?+ predicates, e.g.
  # +UnicodeScript.hiragana?(text)+ or +UnicodeScript.basic_latin?(text)+.
  #
  # A predicate is true when every code point of the argument lies inside the
  # chart's range (so an empty string is accepted by every predicate).
  # Anything that is not a known predicate falls through to the default
  # handler and raises NoMethodError.
  #
  # @param method [Symbol] name of the missing method
  # @param args [Array] call arguments; a predicate takes exactly one String
  # @return [Boolean]
  # @raise [ArgumentError] when a predicate is not given exactly one argument
  # @raise [NoMethodError] when +method+ is not a chart predicate
  def self.method_missing(method, *args)
    chart = chart_for_predicate(method)
    return super unless chart

    unless args.size == 1
      raise ArgumentError, "wrong number of arguments (given #{args.size}, expected 1)"
    end

    args.first.codepoints.all? { |codepoint| chart[:range].include?(codepoint) }
  end

  # Keeps +respond_to?+ and +method+ consistent with the dynamic predicates.
  def self.respond_to_missing?(method, include_private = false)
    !chart_for_predicate(method).nil? || super
  end

  # Returns the name of the first chart whose range contains +codepoint+,
  # or nil when no chart matches.
  def self.find_script(codepoint)
    chart = CHARTS.find { |entry| entry[:range].include?(codepoint) }
    chart && chart[:name]
  end

  # Resolves a predicate name to its chart, or nil when +method+ is not a
  # predicate. The name must end in "?"; the rest is compared with the
  # lower-cased chart name, either verbatim ("basic latin") or with every run
  # of non-alphanumeric characters replaced by "_" ("basic_latin",
  # "latin_extended_a"), which makes multi-word charts callable.
  def self.chart_for_predicate(method)
    name = method.to_s
    return nil unless name.end_with?('?')

    wanted = name[0...-1]
    CHARTS.find do |entry|
      plain = entry[:name].downcase
      plain == wanted || plain.gsub(/[^a-z0-9]+/, '_') == wanted
    end
  end

  # A bare `private` has no effect on `def self.` methods, so the helpers are
  # hidden explicitly.
  private_class_method :find_script, :chart_for_predicate
end
@@ -0,0 +1,9 @@
module UnicodeScript
  # Version of the gem, exposed both as numeric components and as a string.
  module Version
    MAJOR = 0
    MINOR = 1
    TINY  = 0

    # Dotted version string built from the components above ("0.1.0").
    # nil components are skipped; the result is frozen so that the shared
    # constant cannot be modified in place.
    STRING = [MAJOR, MINOR, TINY].compact.join('.').freeze
  end
end
@@ -0,0 +1,21 @@
# encoding: UTF-8
require 'spec_helper'

# Specs for the two public entry points: UnicodeScript.detect and the
# dynamic "<script>?" predicates.
describe 'UnicodeScript' do
  it 'should be able to determine script of text' do
    hiragana_text = 'ひらがな'
    katakana_text = 'カタカナ'
    mixed_text = "東京 Tokyo"

    hiragana_only = [{:script => 'Hiragana', :value => 'ひらがな'}]
    katakana_only = [{:script => 'Katakana', :value => 'カタカナ'}]
    # Groups are expected in order of first appearance; the space is dropped.
    kanji_then_latin = [
      {:script => 'CJK Unified Ideographs', :value => '東京'},
      {:script => 'Basic Latin', :value => 'Tokyo'}
    ]

    UnicodeScript.detect(hiragana_text).should eq(hiragana_only)
    UnicodeScript.detect(katakana_text).should eq(katakana_only)
    UnicodeScript.detect(mixed_text).should eq(kanji_then_latin)
  end

  it 'should be able to check whether string belongs to certain script' do
    pure_hiragana = 'ひらがな'
    hiragana_with_latin = 'ひらaaaがな'

    UnicodeScript.hiragana?(pure_hiragana).should eq(true)
    UnicodeScript.hiragana?(hiragana_with_latin).should eq(false)
  end
end
@@ -0,0 +1,19 @@
# Generated by `rspec --init`. By convention specs live under a `spec`
# directory, which RSpec adds to the `$LOAD_PATH`; load this file with
# `require "spec_helper"` so that it is only loaded once.
#
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration

require 'unicode-script'

RSpec.configure do |rspec|
  # Keep this first: it controls how the bare :focus symbol below is read.
  rspec.treat_symbols_as_metadata_keys_with_true_values = true
  rspec.run_all_when_everything_filtered = true
  rspec.filter_run :focus

  # Random ordering surfaces order dependencies between specs. To debug one,
  # re-run with the seed printed after each run:
  #     --seed 1234
  rspec.order = 'random'
end
metadata ADDED
@@ -0,0 +1,80 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: unicode-script
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - yuri-gg
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-03-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ! '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: Small utility that allows you to detect scripts (languages) in unicode
42
+ text
43
+ email: yuri.goncharenko@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - lib/unicode-script.rb
49
+ - lib/unicode_script/charts.rb
50
+ - lib/unicode_script/core.rb
51
+ - lib/unicode_script/version.rb
52
+ - spec/lib/unicode_script_spec.rb
53
+ - spec/spec_helper.rb
54
+ homepage: https://github.com/yuri-g/unicode-script
55
+ licenses:
56
+ - MIT
57
+ metadata: {}
58
+ post_install_message:
59
+ rdoc_options: []
60
+ require_paths:
61
+ - lib
62
+ required_ruby_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ! '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ! '>='
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ requirements: []
73
+ rubyforge_project:
74
+ rubygems_version: 2.1.11
75
+ signing_key:
76
+ specification_version: 4
77
+ summary: Unicode script detector
78
+ test_files:
79
+ - spec/lib/unicode_script_spec.rb
80
+ - spec/spec_helper.rb