script_detector 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,48 @@
1
+ # encoding: UTF-8
2
+
3
+ require File.dirname(__FILE__) + '/chinese_detector.rb'
4
+
5
# Script-detection predicates added to every String.
#
# Detection is purely character-class based: each predicate reports whether at
# least one character of the relevant script occurs anywhere in the string.
# The traditional/simplified regexes are not defined here; they are expected
# to be supplied by the included ChineseDetector module.
class String
  include ChineseDetector

  # Returns true if the string contains Chinese (Han) characters _and_ no
  # Japanese kana or Korean hangul characters.
  #
  # Note that a string made up solely of Han characters is reported as
  # Chinese, because only kana/hangul are used to rule Chinese out.
  def chinese?
    look_for_chars_in(/\p{Han}/) && !japanese? && !korean?
  end

  # Returns true if the string contains traditional Chinese characters (繁體字)
  def traditional_chinese?
    look_for_chars_in(traditional_chinese_regex)
  end

  # Returns true if the string contains simplified Chinese characters (简体字)
  def simplified_chinese?
    look_for_chars_in(simplified_chinese_regex)
  end

  # Returns true if the string contains specifically Japanese (hiragana or
  # katakana) characters. Han characters alone do not count.
  def japanese?
    look_for_chars_in(/\p{Katakana}|\p{Hiragana}/)
  end

  # Returns true if the string contains specifically Korean (hangul) characters
  def korean?
    look_for_chars_in(/\p{Hangul}/)
  end

  # Try to detect the script and return one of "Japanese", "Korean",
  # "Traditional Chinese", "Simplified Chinese", "Ambiguous Chinese" or
  # "Unknown".
  #
  # The checks run in that order and the first match wins, so kana or hangul
  # anywhere in the string take precedence over any Chinese classification,
  # and traditional characters take precedence over simplified ones.
  def identify_script
    return "Japanese" if japanese?
    return "Korean" if korean?
    return "Traditional Chinese" if traditional_chinese?
    return "Simplified Chinese" if simplified_chinese?
    return "Ambiguous Chinese" if chinese?

    "Unknown"
  end

  private

  # Returns true when +regex+ matches anywhere in the string, false otherwise.
  # The double negation turns the match index (or nil) into a strict boolean.
  def look_for_chars_in(regex)
    !!(self =~ regex)
  end
end
data/spec/helper.rb ADDED
@@ -0,0 +1,21 @@
1
# Spec bootstrap: resolves the bundle, starts coverage measurement, puts the
# library and spec directories on the load path, then loads the library.
require 'rubygems'
require 'bundler'

# Abort with Bundler's own exit status when the bundle cannot be resolved.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts bundler_error.message, "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

# Coverage is started before the library is required below.
require 'simplecov'
SimpleCov.start do
  %w[config spec vendor].each { |excluded_path| add_filter excluded_path }
  coverage_dir "target/reports/coverage"
end

# The spec directory ends up first on the load path, followed by ../lib.
spec_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(spec_dir, File.join(spec_dir, '..', 'lib'))
require 'script_detector'
@@ -0,0 +1,132 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'helper'
4
+
5
# Exercises the String script-detection predicates and #identify_script
# against one sample sentence per script, plus a non-CJK control string.
describe 'Script detection for' do
  describe 'Simplified Chinese text' do
    let(:sample) { ' 我的气垫船充满了鳝鱼.' }

    it('is Chinese') { sample.should be_chinese }

    it('is simplified Chinese') { sample.should be_simplified_chinese }

    it('is not traditional Chinese') { sample.should_not be_traditional_chinese }

    it('is not Japanese') { sample.should_not be_japanese }

    it('is identified as Simplified Chinese') { sample.identify_script.should == "Simplified Chinese" }
  end

  describe 'Traditional Chinese text' do
    let(:sample) { ' 我的氣墊船充滿了鱔魚.' }

    it('is Chinese') { sample.should be_chinese }

    it('is traditional Chinese') { sample.should be_traditional_chinese }

    it('is not simplified Chinese') { sample.should_not be_simplified_chinese }

    it('is not Japanese') { sample.should_not be_japanese }

    it('is identified as traditional Chinese') { sample.identify_script.should == "Traditional Chinese" }
  end

  describe 'Ambiguous Chinese text' do
    let(:sample) { '你好.' }

    it('is Chinese') { sample.should be_chinese }

    it 'is neither traditional nor simplified Chinese' do
      sample.should_not be_simplified_chinese
      sample.should_not be_traditional_chinese
    end

    it('is identified as ambiguous Chinese') { sample.identify_script.should == "Ambiguous Chinese" }
  end

  describe 'Japanese text' do
    let(:sample) { ' 私のホバークラフトは鰻でいっぱいです.' }

    it('is Japanese') { sample.should be_japanese }

    it 'is not Chinese or Korean' do
      sample.should_not be_chinese
      sample.should_not be_korean
    end

    it('is identified as Japanese') { sample.identify_script.should == "Japanese" }
  end

  describe 'Korean text' do
    let(:sample) { ' 내 호버크라프트는 장어로 가득 차 있어요.' }

    it('is Korean') { sample.should be_korean }

    it 'is not Chinese or Japanese' do
      sample.should_not be_chinese
      sample.should_not be_japanese
    end

    it('is identified as Korean') { sample.identify_script.should == "Korean" }
  end

  describe 'Non-CJK text' do
    let(:sample) { 'Hello world.' }

    it 'is none of the above' do
      sample.should_not be_chinese
      sample.should_not be_simplified_chinese
      sample.should_not be_traditional_chinese
      sample.should_not be_japanese
      sample.should_not be_korean
    end

    it('is identified as unknown') { sample.identify_script.should == "Unknown" }
  end
end
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
1
+ --colour --format progress
metadata ADDED
@@ -0,0 +1,176 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: script_detector
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jani Patokallio
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-05-30 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: shoulda
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rdoc
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: '3.12'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: '3.12'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec-rails
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: simplecov
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: nokogiri
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: bundler
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ~>
100
+ - !ruby/object:Gem::Version
101
+ version: 1.1.0
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ~>
108
+ - !ruby/object:Gem::Version
109
+ version: 1.1.0
110
+ - !ruby/object:Gem::Dependency
111
+ name: jeweler
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ~>
116
+ - !ruby/object:Gem::Version
117
+ version: 1.8.3
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ~>
124
+ - !ruby/object:Gem::Version
125
+ version: 1.8.3
126
+ description: Utility library for determining if string is traditional Chinese, simplified
127
+ Chinese, Japanese or Korean
128
+ email: jpatokal@iki.fi
129
+ executables: []
130
+ extensions: []
131
+ extra_rdoc_files:
132
+ - LICENSE.txt
133
+ - README.rdoc
134
+ files:
135
+ - .document
136
+ - Gemfile
137
+ - Gemfile.lock
138
+ - LICENSE.txt
139
+ - README.rdoc
140
+ - Rakefile
141
+ - VERSION
142
+ - lib/chinese_detector.rb
143
+ - lib/script_detector.rb
144
+ - spec/helper.rb
145
+ - spec/script_detector_spec.rb
146
+ - spec/spec.opts
147
+ homepage: http://github.com/jpatokal/script_detector
148
+ licenses:
149
+ - MIT
150
+ post_install_message:
151
+ rdoc_options: []
152
+ require_paths:
153
+ - lib
154
+ required_ruby_version: !ruby/object:Gem::Requirement
155
+ none: false
156
+ requirements:
157
+ - - ! '>='
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ segments:
161
+ - 0
162
+ hash: -1438461098142701375
163
+ required_rubygems_version: !ruby/object:Gem::Requirement
164
+ none: false
165
+ requirements:
166
+ - - ! '>='
167
+ - !ruby/object:Gem::Version
168
+ version: '0'
169
+ requirements: []
170
+ rubyforge_project:
171
+ rubygems_version: 1.8.21
172
+ signing_key:
173
+ specification_version: 3
174
+ summary: Utility library for determining if string is traditional Chinese, simplified
175
+ Chinese, Japanese or Korean
176
+ test_files: []