langusta 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ before_install: "sudo apt-get -y install libonig-dev libyajl-dev"
2
+ language: ruby
3
+ rvm:
4
+ - 1.8.7
5
+ - 1.9.2
6
+ - 1.9.3
7
+
data/Gemfile CHANGED
@@ -1,11 +1,14 @@
1
1
  source :gemcutter
2
- gem "oniguruma", "1.1.0"
3
- gem "yajl-ruby", "0.8.2"
4
2
 
5
- group :development do
6
- gem "bundler", "~> 1.0.0"
7
- gem "jeweler", "~> 1.5.2"
8
- gem "rcov"
3
+ platform :mri_18 do
4
+ gem "oniguruma", "1.1.0"
5
+ end
6
+
7
+ gem "yajl-ruby", "0.8.2", :require => 'yajl'
8
+
9
+ gem "bundler"
10
+ gem "jeweler"
11
+
12
+ group :test do
9
13
  gem "mocha"
10
- gem "ruby-debug"
11
14
  end
data/Gemfile.lock CHANGED
@@ -1,32 +1,28 @@
1
1
  GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
- columnize (0.3.2)
5
4
  git (1.2.5)
6
- jeweler (1.5.2)
7
- bundler (~> 1.0.0)
5
+ jeweler (1.8.3)
6
+ bundler (~> 1.0)
8
7
  git (>= 1.2.5)
9
8
  rake
10
- linecache (0.43)
11
- mocha (0.9.12)
9
+ rdoc
10
+ json (1.6.5)
11
+ metaclass (0.0.1)
12
+ mocha (0.10.5)
13
+ metaclass (~> 0.0.1)
12
14
  oniguruma (1.1.0)
13
- rake (0.8.7)
14
- rcov (0.9.9)
15
- ruby-debug (0.10.4)
16
- columnize (>= 0.1)
17
- ruby-debug-base (~> 0.10.4.0)
18
- ruby-debug-base (0.10.4)
19
- linecache (>= 0.3)
15
+ rake (0.9.2.2)
16
+ rdoc (3.12)
17
+ json (~> 1.4)
20
18
  yajl-ruby (0.8.2)
21
19
 
22
20
  PLATFORMS
23
21
  ruby
24
22
 
25
23
  DEPENDENCIES
26
- bundler (~> 1.0.0)
27
- jeweler (~> 1.5.2)
24
+ bundler
25
+ jeweler
28
26
  mocha
29
27
  oniguruma (= 1.1.0)
30
- rcov
31
- ruby-debug
32
28
  yajl-ruby (= 0.8.2)
@@ -1,25 +1,41 @@
1
- = langusta
1
+ # langusta
2
2
 
3
3
  Langusta is a language detection library based on a method designed and implemented by Nakatani Shuyo. This work is almost a direct 1-to-1 port of the original Java library which can be found at: http://code.google.com/p/language-detection.
4
4
 
5
5
  For more information about the method (naive bayesian classification), have a look at this presentation: http://www.slideshare.net/shuyo/language-detection-library-for-java. This implementation uses some resources from the original library, specifically the language profiles.
6
6
 
7
- == Runtime dependencies
7
+ ## Build status
8
8
 
9
- * oniguruma - regular expressions swiss army knife
9
+ [![Build Status](https://secure.travis-ci.org/jasiek/langusta.png?branch=master)](http://travis-ci.org/jasiek/langusta)
10
+
11
+ ## Runtime dependencies
12
+
13
+ * oniguruma - regular expressions swiss army knife (only required for 1.8.7)
10
14
  * yajl-ruby - a quick and elegant JSON parser
11
15
 
12
- == Usage
16
+ ## Usage
17
+
18
+ The simplest way to use this library is to use the facade provided with this package.
13
19
 
14
- See lib/langusta/language_detection_facade.rb for an example, a canonical way to use the library is through this class.
20
+ ```ruby
21
+ require 'langusta'
22
+ facade = Langusta::LanguageDetectionFacade.new
23
+ facade.detect('zażółć gęślą jaźń') #=> 'pl'
24
+ ```
15
25
 
16
- == Compatibility
26
+ If you don't need all 49 profiles, you can boost your detection speed and reduce memory consumption by writing your own facade-like class.
27
+
28
+ ## Compatibility
17
29
 
18
30
  * Ruby 1.8.7
31
+ * Ruby 1.9.2
32
+ * Ruby 1.9.3
33
+
34
+ ## Caveats
19
35
 
20
- A version for Ruby 1.9 is in the works.
36
+ Langusta is a memory hog - 49 profiles will take up about 80MB of RAM.
21
37
 
22
- == Contributing to langusta
38
+ ## Contributing to langusta
23
39
 
24
40
  * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
25
41
  * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
@@ -29,6 +45,7 @@ A version for Ruby 1.9 is in the works.
29
45
  * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
30
46
  * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate the change to its own commit so I can cherry-pick around it.
31
47
 
32
- == Copyright
48
+ ## Copyright
49
+
50
+ Copyright (c) 2011, 2012 Jan Szumiec. See LICENSE.txt for further details.
33
51
 
34
- Copyright (c) 2011 Jan Szumiec. See LICENSE.txt for further details.
data/Rakefile CHANGED
@@ -24,21 +24,14 @@ Jeweler::RubygemsDotOrgTasks.new
24
24
 
25
25
  require 'rake/testtask'
26
26
  Rake::TestTask.new(:test) do |test|
27
- test.libs << 'lib' << 'test'
28
- test.pattern = 'test/test_*.rb'
29
- test.verbose = true
30
- end
31
-
32
- require 'rcov/rcovtask'
33
- Rcov::RcovTask.new do |test|
34
- test.libs << 'test'
27
+ test.libs << 'lib' << 'test' << '.'
35
28
  test.pattern = 'test/test_*.rb'
36
29
  test.verbose = true
37
30
  end
38
31
 
39
32
  task :default => :test
40
33
 
41
- require 'rake/rdoctask'
34
+ require 'rdoc/task'
42
35
  Rake::RDocTask.new do |rdoc|
43
36
  version = File.exist?('VERSION') ? File.read('VERSION') : ""
44
37
 
@@ -49,7 +42,7 @@ Rake::RDocTask.new do |rdoc|
49
42
  end
50
43
 
51
44
  Rake::TestTask.new('test:quality') do |test|
52
- test.libs << 'test/quality'
45
+ test.libs << 'test/quality' << 'lib' << '.'
53
46
  test.pattern = 'test/quality/test_*.rb'
54
47
  test.verbose = true
55
48
  end
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.1
1
+ 0.2.0
data/langusta.gemspec CHANGED
@@ -4,26 +4,26 @@
4
4
  # -*- encoding: utf-8 -*-
5
5
 
6
6
  Gem::Specification.new do |s|
7
- s.name = %q{langusta}
8
- s.version = "0.1.1"
7
+ s.name = "langusta"
8
+ s.version = "0.2.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Jan Szumiec"]
12
- s.date = %q{2011-04-10}
13
- s.default_executable = %q{langusta}
14
- s.description = %q{Highly accurate language detection library, uses naive bayesian filter.}
15
- s.email = %q{jan.szumiec@gmail.com}
12
+ s.date = "2012-03-04"
13
+ s.description = "Highly accurate language detection library, uses naive bayesian filter."
14
+ s.email = "jan.szumiec@gmail.com"
16
15
  s.executables = ["langusta"]
17
16
  s.extra_rdoc_files = [
18
17
  "LICENSE.txt",
19
- "README.rdoc"
18
+ "README.md"
20
19
  ]
21
20
  s.files = [
22
21
  ".document",
22
+ ".travis.yml",
23
23
  "Gemfile",
24
24
  "Gemfile.lock",
25
25
  "LICENSE.txt",
26
- "README.rdoc",
26
+ "README.md",
27
27
  "Rakefile",
28
28
  "VERSION",
29
29
  "bin/langusta",
@@ -31,9 +31,12 @@ Gem::Specification.new do |s|
31
31
  "data/uppercase.bin",
32
32
  "langusta.gemspec",
33
33
  "lib/langusta.rb",
34
+ "lib/langusta/codepoints.rb",
34
35
  "lib/langusta/command.rb",
35
36
  "lib/langusta/detector.rb",
36
37
  "lib/langusta/detector_factory.rb",
38
+ "lib/langusta/guard.rb",
39
+ "lib/langusta/inspector.rb",
37
40
  "lib/langusta/java_property_reader.rb",
38
41
  "lib/langusta/lang_profile.rb",
39
42
  "lib/langusta/language.rb",
@@ -41,7 +44,6 @@ Gem::Specification.new do |s|
41
44
  "lib/langusta/n_gram.rb",
42
45
  "lib/langusta/regex_helper.rb",
43
46
  "lib/langusta/tag_extractor.rb",
44
- "lib/langusta/ucs2_string.rb",
45
47
  "lib/langusta/unicode_block.rb",
46
48
  "profiles/af",
47
49
  "profiles/ar",
@@ -152,59 +154,33 @@ Gem::Specification.new do |s|
152
154
  "test/test_langusta.rb",
153
155
  "test/test_n_gram.rb",
154
156
  "test/test_tag_extractor.rb",
155
- "test/test_ucs2_string.rb",
156
157
  "test/test_unicode_block.rb"
157
158
  ]
158
- s.homepage = %q{http://github.com/jasiek/langusta}
159
+ s.homepage = "http://github.com/jasiek/langusta"
159
160
  s.licenses = ["Apache 2.0"]
160
161
  s.require_paths = ["lib"]
161
- s.rubygems_version = %q{1.5.1}
162
- s.summary = %q{Language detection library based on http://code.google.com/p/language-detection/.}
163
- s.test_files = [
164
- "test/helper.rb",
165
- "test/quality/test_falsified.rb",
166
- "test/test_command.rb",
167
- "test/test_detector.rb",
168
- "test/test_detector_factory.rb",
169
- "test/test_java_property_reader.rb",
170
- "test/test_lang_profile.rb",
171
- "test/test_language.rb",
172
- "test/test_language_detection_facade.rb",
173
- "test/test_langusta.rb",
174
- "test/test_n_gram.rb",
175
- "test/test_tag_extractor.rb",
176
- "test/test_ucs2_string.rb",
177
- "test/test_unicode_block.rb"
178
- ]
162
+ s.rubygems_version = "1.8.17"
163
+ s.summary = "Language detection library based on http://code.google.com/p/language-detection/."
179
164
 
180
165
  if s.respond_to? :specification_version then
181
166
  s.specification_version = 3
182
167
 
183
168
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
184
- s.add_runtime_dependency(%q<oniguruma>, ["= 1.1.0"])
169
+ s.add_runtime_dependency(%q<oniguruma>, ["= 1.1.0"]) if RUBY_VERSION < "1.9"
185
170
  s.add_runtime_dependency(%q<yajl-ruby>, ["= 0.8.2"])
186
- s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
187
- s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
188
- s.add_development_dependency(%q<rcov>, [">= 0"])
189
- s.add_development_dependency(%q<mocha>, [">= 0"])
190
- s.add_development_dependency(%q<ruby-debug>, [">= 0"])
171
+ s.add_runtime_dependency(%q<bundler>, [">= 0"])
172
+ s.add_runtime_dependency(%q<jeweler>, [">= 0"])
191
173
  else
192
- s.add_dependency(%q<oniguruma>, ["= 1.1.0"])
174
+ s.add_dependency(%q<oniguruma>, ["= 1.1.0"]) if RUBY_VERSION < "1.9"
193
175
  s.add_dependency(%q<yajl-ruby>, ["= 0.8.2"])
194
- s.add_dependency(%q<bundler>, ["~> 1.0.0"])
195
- s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
196
- s.add_dependency(%q<rcov>, [">= 0"])
197
- s.add_dependency(%q<mocha>, [">= 0"])
198
- s.add_dependency(%q<ruby-debug>, [">= 0"])
176
+ s.add_dependency(%q<bundler>, [">= 0"])
177
+ s.add_dependency(%q<jeweler>, [">= 0"])
199
178
  end
200
179
  else
201
- s.add_dependency(%q<oniguruma>, ["= 1.1.0"])
180
+ s.add_dependency(%q<oniguruma>, ["= 1.1.0"]) if RUBY_VERSION < "1.9"
202
181
  s.add_dependency(%q<yajl-ruby>, ["= 0.8.2"])
203
- s.add_dependency(%q<bundler>, ["~> 1.0.0"])
204
- s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
205
- s.add_dependency(%q<rcov>, [">= 0"])
206
- s.add_dependency(%q<mocha>, [">= 0"])
207
- s.add_dependency(%q<ruby-debug>, [">= 0"])
182
+ s.add_dependency(%q<bundler>, [">= 0"])
183
+ s.add_dependency(%q<jeweler>, [">= 0"])
208
184
  end
209
185
  end
210
186
 
data/lib/langusta.rb CHANGED
@@ -2,20 +2,18 @@ $: << File.expand_path(File.dirname(__FILE__))
2
2
 
3
3
  require 'rubygems'
4
4
  require 'bundler'
5
- Bundler.setup
5
+ Bundler.require
6
6
 
7
7
  require 'optparse'
8
- require 'iconv'
9
-
10
- # Required gems
11
- require 'oniguruma'
12
- require 'yajl'
8
+ require 'iconv' if RUBY_VERSION < "1.9"
13
9
 
14
10
  module Langusta
15
11
  VERSION = '0.1.1'
16
12
 
13
+ autoload :Guard, 'langusta/guard'
14
+ autoload :Inspector, 'langusta/inspector'
17
15
  autoload :RegexHelper, 'langusta/regex_helper'
18
- autoload :UCS2String, 'langusta/ucs2_string'
16
+ autoload :Codepoints, 'langusta/codepoints'
19
17
  autoload :Language, 'langusta/language'
20
18
  autoload :LangProfile, 'langusta/lang_profile'
21
19
  autoload :Detector, 'langusta/detector'
@@ -33,8 +31,36 @@ module Langusta
33
31
  UPPERCASE_BIN = File.join(ABSOLUTE_PATH, 'data/uppercase.bin')
34
32
  MESSAGES_PROPERTIES = File.join(ABSOLUTE_PATH, 'data/messages.properties')
35
33
 
36
- class DuplicateProfilesError < StandardError; end
37
- class NoProfilesLoadedError < StandardError; end
38
- class NoFeaturesInTextError < StandardError; end
34
+ class Error < StandardError; end
35
+ class DuplicateProfilesError < Error; end
36
+ class NoProfilesLoadedError < Error; end
37
+ class NoFeaturesInTextError < Error; end
38
+
39
+ UTF82CP_SELECTOR = RUBY_VERSION < "1.9" ? :utf82cp_18 : :utf82cp_19
40
+ CP2UTF8_SELECTOR = RUBY_VERSION < "1.9" ? :cp2utf8_18 : :cp2utf8_19
41
+
42
+ def self.utf82cp(utf8_string)
43
+ send(UTF82CP_SELECTOR, utf8_string)
44
+ end
45
+
46
+ def self.utf82cp_18(utf8_string)
47
+ Iconv.conv('ucs-2be', 'utf-8', utf8_string).unpack('n*')
48
+ end
49
+
50
+ def self.utf82cp_19(utf8_string)
51
+ utf8_string.encode('ucs-2be').unpack('n*')
52
+ end
53
+
54
+ def self.cp2utf8(cp_array)
55
+ send(CP2UTF8_SELECTOR, cp_array)
56
+ end
57
+
58
+ def self.cp2utf8_18(cp_array)
59
+ Iconv.conv('utf-8', 'ucs-2be', cp_array.pack('n*'))
60
+ end
61
+
62
+ def self.cp2utf8_19(cp_array)
63
+ cp_array.pack('n*').force_encoding('ucs-2be').encode('utf-8')
64
+ end
39
65
  end
40
66
 
@@ -0,0 +1,19 @@
1
+ module Langusta
2
+ module Codepoints
3
+ GSUB_SELECTOR = RUBY_VERSION < "1.9" ? :gsub18 : :gsub19
4
+
5
+ def self.gsub!(codepoint_array, regex, replacement)
6
+ string = Langusta.cp2utf8(codepoint_array)
7
+ string = send(GSUB_SELECTOR, string, regex, replacement)
8
+ codepoint_array.replace(Langusta.utf82cp(string))
9
+ end
10
+
11
+ def self.gsub18(string, oregex, replacement)
12
+ oregex.gsub(string, replacement)
13
+ end
14
+
15
+ def self.gsub19(string, regex, replacement)
16
+ string.gsub(regex, replacement)
17
+ end
18
+ end
19
+ end
@@ -55,7 +55,7 @@ EOF
55
55
  end
56
56
 
57
57
  def detect_single_lang(filename, alpha)
58
- ucs2_content = UCS2String.from_utf8(File.open(filename).read)
58
+ ucs2_content = Langusta.utf82cp(File.open(filename).read)
59
59
  detector = @detector_factory.create(alpha)
60
60
  detector.append(ucs2_content)
61
61
 
@@ -64,8 +64,8 @@ EOF
64
64
 
65
65
  def initialize_factory(profile_directory)
66
66
  profiles = load_profiles(profile_directory)
67
- profiles.each_with_index do |profile, index|
68
- @detector_factory.add_profile(profile, index, profiles.length)
67
+ profiles.each do |profile|
68
+ @detector_factory.add_profile(profile)
69
69
  end
70
70
  end
71
71
 
@@ -13,7 +13,7 @@ module Langusta
13
13
  def initialize(factory)
14
14
  @word_lang_prob_map = factory.word_lang_prob_map
15
15
  @lang_list = factory.lang_list
16
- @text = UCS2String.new('')
16
+ @text = []
17
17
  @langprob = nil
18
18
  @alpha = ALPHA_DEFAULT
19
19
  @n_trial = 7
@@ -25,13 +25,15 @@ module Langusta
25
25
  # Append more text to be recognized.
26
26
  # @param text [UCS2String] text to be recognized
27
27
  def append(text)
28
- raise TypeError.new("Expected: UCS2String, got: #{text.class}") unless text.is_a?(UCS2String)
29
- text.gsub!(RegexHelper::URL_REGEX, "\x00\x20")
30
- text.gsub!(RegexHelper::MAIL_REGEX, "\x00\x20")
28
+ Guard.klass(text, Array, __method__)
29
+
30
+ text = Codepoints.gsub!(text, RegexHelper::URL_REGEX, "\x00\x20")
31
+ text = Codepoints.gsub!(text, RegexHelper::MAIL_REGEX, "\x00\x20")
32
+
31
33
  text = text.map do |c|
32
34
  NGram.normalize(c)
33
35
  end
34
- @text = text.gsub!(RegexHelper::SPACE_REGEX, "\x00\x20")
36
+ @text = Codepoints.gsub!(text, RegexHelper::SPACE_REGEX, "\x00\x20")
35
37
  end
36
38
 
37
39
  # Detect the language.
@@ -102,17 +104,17 @@ module Langusta
102
104
 
103
105
  def cleaning_text
104
106
  non_latin_count = latin_count = 0
105
- @text.each_char do |c|
106
- if c < "\00z" && c >= "\x00A"
107
+ @text.each do |c|
108
+ if c < 0x007a && c > 0x0041 # c > "z" && c < "A"
107
109
  latin_count += 1
108
- elsif c >= "\x03\x00" && UnicodeBlock.of(c) != UnicodeBlock::LATIN_EXTENDED_ADDITIONAL
110
+ elsif c >= 0x3000 && UnicodeBlock.of(c) != UnicodeBlock::LATIN_EXTENDED_ADDITIONAL
109
111
  non_latin_count += 1
110
112
  end
111
113
  end
112
114
  if latin_count * 2 < non_latin_count
113
- text_without_latin = UCS2String.new('')
114
- @text.each_char do |c|
115
- text_without_latin << c if c > "\x00z" || c < "\x00A"
115
+ text_without_latin = []
116
+ @text.each do |c|
117
+ text_without_latin << c if c > 0x007a || c < 0x0041 # c > "z" || c < "A"
116
118
  end
117
119
  @text = text_without_latin
118
120
  end
@@ -121,7 +123,7 @@ module Langusta
121
123
  def extract_ngrams
122
124
  list = []
123
125
  ngram = NGram.new
124
- @text.each_char do |char|
126
+ @text.each do |char|
125
127
  ngram.add_char(char)
126
128
  (1..NGram::N_GRAM).each do |n|
127
129
  w = ngram.get(n)
@@ -169,7 +171,8 @@ module Langusta
169
171
  # verbose
170
172
  weight = alpha / BASE_FREQ
171
173
  prob.length.times do |i|
172
- prob[i] *= weight + lang_prob_map[i]
174
+ # tiny workaround for nil values in word freq array
175
+ prob[i] *= weight + (lang_prob_map[i] || 0.0)
173
176
  end
174
177
  true
175
178
  end