script_detector 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +91 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +60 -0
- data/Rakefile +65 -0
- data/VERSION +1 -0
- data/lib/chinese_detector.rb +5559 -0
- data/lib/script_detector.rb +48 -0
- data/spec/helper.rb +21 -0
- data/spec/script_detector_spec.rb +132 -0
- data/spec/spec.opts +1 -0
- metadata +176 -0
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require File.dirname(__FILE__) + '/chinese_detector.rb'
|
4
|
+
|
5
|
+
# Reopens String to add CJK script-detection predicates.
#
# NOTE(review): +traditional_chinese_regex+ and +simplified_chinese_regex+
# are not defined in this file; presumably they come from the
# ChineseDetector mixin loaded from chinese_detector.rb -- confirm there.
class String
  include ChineseDetector

  # Returns true if the string contains Chinese (Han) characters _and_ no
  # Japanese or Korean characters.
  def chinese?
    look_for_chars_in(/\p{Han}/) && !japanese? && !korean?
  end

  # Returns true if the string contains traditional Chinese characters (繁體字)
  def traditional_chinese?
    look_for_chars_in(traditional_chinese_regex)
  end

  # Returns true if the string contains simplified Chinese characters (简体字)
  def simplified_chinese?
    look_for_chars_in(simplified_chinese_regex)
  end

  # Returns true if the string contains specifically Japanese (hiragana or
  # katakana) characters. Han characters alone do not count.
  def japanese?
    look_for_chars_in(/(\p{Katakana}|\p{Hiragana})/)
  end

  # Returns true if the string contains specifically Korean (hangul) characters
  def korean?
    look_for_chars_in(/\p{Hangul}/)
  end

  # Try to detect script and return one of "Japanese", "Korean",
  # "Traditional Chinese", "Simplified Chinese", "Ambiguous Chinese" or
  # "Unknown".
  #
  # The checks run in the order listed and the first match wins, so a string
  # containing any kana is reported as "Japanese" even if it also contains
  # Han characters, and "Ambiguous Chinese" is only reached when neither the
  # traditional nor the simplified check matched.
  def identify_script
    return "Japanese" if japanese?
    return "Korean" if korean?
    return "Traditional Chinese" if traditional_chinese?
    return "Simplified Chinese" if simplified_chinese?
    return "Ambiguous Chinese" if chinese?

    "Unknown"
  end

  private

  # Returns true if +regex+ matches anywhere in the string, false otherwise.
  #
  # +=~+ yields the match offset (an Integer, truthy even when 0) or nil;
  # the double negation turns that into a strict boolean.
  def look_for_chars_in(regex)
    !!(self =~ regex)
  end
end
|
data/spec/helper.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# Spec bootstrap: activates the bundled gems, starts coverage tracking and
# then loads the library under test.
require 'rubygems'
require 'bundler'

begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  # Report the problem and exit with the status code Bundler supplies.
  $stderr.puts bundler_error.message, "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

# Coverage is started before the library is required further down.
require 'simplecov'
SimpleCov.start do
  %w[config spec vendor].each { |dir| add_filter dir }
  coverage_dir "target/reports/coverage"
end

# Put ../lib and then this spec directory at the front of the load path.
spec_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(spec_dir, '..', 'lib'))
$LOAD_PATH.unshift(spec_dir)
require 'script_detector'
|
@@ -0,0 +1,132 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'helper'
|
4
|
+
|
5
|
+
# Exercises the String predicates (chinese?, simplified_chinese?,
# traditional_chinese?, japanese?, korean?) and identify_script against one
# sample sentence per script. Each group exposes its sample through a
# memoized +let+ fixture.
describe 'Script detection for' do
  describe 'Simplified Chinese text' do
    let(:chinese_simpl) { ' 我的气垫船充满了鳝鱼.' }

    it 'is Chinese' do
      chinese_simpl.should be_chinese
    end

    it 'is simplified Chinese' do
      chinese_simpl.should be_simplified_chinese
    end

    it 'is not traditional Chinese' do
      chinese_simpl.should_not be_traditional_chinese
    end

    it 'is not Japanese' do
      chinese_simpl.should_not be_japanese
    end

    it 'is identified as Simplified Chinese' do
      chinese_simpl.identify_script.should == "Simplified Chinese"
    end
  end

  describe 'Traditional Chinese text' do
    let(:chinese_trad) { ' 我的氣墊船充滿了鱔魚.' }

    it 'is Chinese' do
      chinese_trad.should be_chinese
    end

    it 'is traditional Chinese' do
      chinese_trad.should be_traditional_chinese
    end

    it 'is not simplified Chinese' do
      chinese_trad.should_not be_simplified_chinese
    end

    it 'is not Japanese' do
      chinese_trad.should_not be_japanese
    end

    it 'is identified as traditional Chinese' do
      chinese_trad.identify_script.should == "Traditional Chinese"
    end
  end

  # A sample that the specs expect to match neither the simplified nor the
  # traditional check.
  describe 'Ambiguous Chinese text' do
    let(:chinese_amb) { '你好.' }

    it 'is Chinese' do
      chinese_amb.should be_chinese
    end

    it 'is neither traditional nor simplified Chinese' do
      chinese_amb.should_not be_simplified_chinese
      chinese_amb.should_not be_traditional_chinese
    end

    it 'is identified as ambiguous Chinese' do
      chinese_amb.identify_script.should == "Ambiguous Chinese"
    end
  end

  describe 'Japanese text' do
    let(:japanese) { ' 私のホバークラフトは鰻でいっぱいです.' }

    it 'is Japanese' do
      japanese.should be_japanese
    end

    it 'is not Chinese or Korean' do
      japanese.should_not be_chinese
      japanese.should_not be_korean
    end

    it 'is identified as Japanese' do
      japanese.identify_script.should == "Japanese"
    end
  end

  describe 'Korean text' do
    let(:korean) { ' 내 호버크라프트는 장어로 가득 차 있어요.' }

    it 'is Korean' do
      korean.should be_korean
    end

    it 'is not Chinese or Japanese' do
      korean.should_not be_chinese
      korean.should_not be_japanese
    end

    it 'is identified as Korean' do
      korean.identify_script.should == "Korean"
    end
  end

  describe 'Non-CJK text' do
    let(:unknown) { 'Hello world.' }

    it 'is none of the above' do
      unknown.should_not be_chinese
      unknown.should_not be_simplified_chinese
      unknown.should_not be_traditional_chinese
      unknown.should_not be_japanese
      unknown.should_not be_korean
    end

    it 'is identified as unknown' do
      unknown.identify_script.should == "Unknown"
    end
  end
end
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--colour --format progress
|
metadata
ADDED
@@ -0,0 +1,176 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: script_detector
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.2
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Jani Patokallio
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-05-30 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: shoulda
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rdoc
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ~>
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '3.12'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '3.12'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rspec-rails
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: simplecov
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: nokogiri
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: bundler
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ~>
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: 1.1.0
|
102
|
+
type: :development
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ~>
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 1.1.0
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: jeweler
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ~>
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: 1.8.3
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ~>
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: 1.8.3
|
126
|
+
description: Utility library for determining if string is traditional Chinese, simplified
|
127
|
+
Chinese, Japanese or Korean
|
128
|
+
email: jpatokal@iki.fi
|
129
|
+
executables: []
|
130
|
+
extensions: []
|
131
|
+
extra_rdoc_files:
|
132
|
+
- LICENSE.txt
|
133
|
+
- README.rdoc
|
134
|
+
files:
|
135
|
+
- .document
|
136
|
+
- Gemfile
|
137
|
+
- Gemfile.lock
|
138
|
+
- LICENSE.txt
|
139
|
+
- README.rdoc
|
140
|
+
- Rakefile
|
141
|
+
- VERSION
|
142
|
+
- lib/chinese_detector.rb
|
143
|
+
- lib/script_detector.rb
|
144
|
+
- spec/helper.rb
|
145
|
+
- spec/script_detector_spec.rb
|
146
|
+
- spec/spec.opts
|
147
|
+
homepage: http://github.com/jpatokal/script_detector
|
148
|
+
licenses:
|
149
|
+
- MIT
|
150
|
+
post_install_message:
|
151
|
+
rdoc_options: []
|
152
|
+
require_paths:
|
153
|
+
- lib
|
154
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
155
|
+
none: false
|
156
|
+
requirements:
|
157
|
+
- - ! '>='
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '0'
|
160
|
+
segments:
|
161
|
+
- 0
|
162
|
+
hash: -1438461098142701375
|
163
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
164
|
+
none: false
|
165
|
+
requirements:
|
166
|
+
- - ! '>='
|
167
|
+
- !ruby/object:Gem::Version
|
168
|
+
version: '0'
|
169
|
+
requirements: []
|
170
|
+
rubyforge_project:
|
171
|
+
rubygems_version: 1.8.21
|
172
|
+
signing_key:
|
173
|
+
specification_version: 3
|
174
|
+
summary: Utility library for determining if string is traditional Chinese, simplified
|
175
|
+
Chinese, Japanese or Korean
|
176
|
+
test_files: []
|