langusta 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/langusta.gemspec +3 -3
- data/lib/langusta.rb +5 -1
- data/lib/langusta/detector.rb +2 -2
- data/lib/langusta/detector_factory.rb +2 -4
- data/test/test_detector.rb +8 -0
- data/test/test_detector_factory.rb +15 -3
- metadata +5 -5
data/Rakefile
CHANGED
@@ -16,7 +16,7 @@ Jeweler::Tasks.new do |gem|
|
|
16
16
|
gem.homepage = "http://github.com/jasiek/langusta"
|
17
17
|
gem.license = "Apache 2.0"
|
18
18
|
gem.summary = %Q{Language detection library based on http://code.google.com/p/language-detection/.}
|
19
|
-
gem.description = %Q{
|
19
|
+
gem.description = %Q{Highly accurate language detection library, uses naive bayesian filter.}
|
20
20
|
gem.email = "jan.szumiec@gmail.com"
|
21
21
|
gem.authors = ["Jan Szumiec"]
|
22
22
|
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.1.1
|
data/langusta.gemspec
CHANGED
@@ -5,13 +5,13 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{langusta}
|
8
|
-
s.version = "0.1.0"
|
8
|
+
s.version = "0.1.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Jan Szumiec"]
|
12
|
-
s.date = %q{2011-04-
|
12
|
+
s.date = %q{2011-04-10}
|
13
13
|
s.default_executable = %q{langusta}
|
14
|
-
s.description = %q{
|
14
|
+
s.description = %q{Highly accurate language detection library, uses naive bayesian filter.}
|
15
15
|
s.email = %q{jan.szumiec@gmail.com}
|
16
16
|
s.executables = ["langusta"]
|
17
17
|
s.extra_rdoc_files = [
|
data/lib/langusta.rb
CHANGED
@@ -12,7 +12,7 @@ require 'oniguruma'
|
|
12
12
|
require 'yajl'
|
13
13
|
|
14
14
|
module Langusta
|
15
|
-
VERSION = '0.1.0'
|
15
|
+
VERSION = '0.1.1'
|
16
16
|
|
17
17
|
autoload :RegexHelper, 'langusta/regex_helper'
|
18
18
|
autoload :UCS2String, 'langusta/ucs2_string'
|
@@ -32,5 +32,9 @@ module Langusta
|
|
32
32
|
PROFILES_PATH = File.join(ABSOLUTE_PATH, 'profiles')
|
33
33
|
UPPERCASE_BIN = File.join(ABSOLUTE_PATH, 'data/uppercase.bin')
|
34
34
|
MESSAGES_PROPERTIES = File.join(ABSOLUTE_PATH, 'data/messages.properties')
|
35
|
+
|
36
|
+
class DuplicateProfilesError < StandardError; end
|
37
|
+
class NoProfilesLoadedError < StandardError; end
|
38
|
+
class NoFeaturesInTextError < StandardError; end
|
35
39
|
end
|
36
40
|
|
data/lib/langusta/detector.rb
CHANGED
@@ -45,7 +45,7 @@ module Langusta
|
|
45
45
|
def detect_block
|
46
46
|
cleaning_text()
|
47
47
|
ngrams = extract_ngrams()
|
48
|
-
raise
|
48
|
+
raise NoFeaturesInTextError if ngrams.empty?
|
49
49
|
@langprob = Array.new(@lang_list.length, 0.0)
|
50
50
|
|
51
51
|
@n_trial.times do
|
@@ -68,6 +68,7 @@ module Langusta
|
|
68
68
|
end
|
69
69
|
end
|
70
70
|
|
71
|
+
# TODO: this looks like it's not referenced anywhere.
|
71
72
|
def set_prior_map(prior_map)
|
72
73
|
@prior_map = Array.new[@lang_list.length]
|
73
74
|
sump = 0.0
|
@@ -99,7 +100,6 @@ module Langusta
|
|
99
100
|
maxp
|
100
101
|
end
|
101
102
|
|
102
|
-
private
|
103
103
|
def cleaning_text
|
104
104
|
non_latin_count = latin_count = 0
|
105
105
|
@text.each_char do |c|
|
data/lib/langusta/detector_factory.rb
CHANGED
@@ -1,6 +1,4 @@
|
|
1
1
|
module Langusta
|
2
|
-
class LangDetectException < StandardError; end
|
3
|
-
|
4
2
|
class DetectorFactory
|
5
3
|
attr_reader :word_lang_prob_map, :lang_list
|
6
4
|
|
@@ -14,7 +12,7 @@ module Langusta
|
|
14
12
|
# @param [Fixnum] index at which the language profile is to be added.
|
15
13
|
# @param [Fixnum] counts how many language profiles are to be added to this factory in total.
|
16
14
|
def add_profile(profile, index, langsize)
|
17
|
-
raise
|
15
|
+
raise DuplicateProfilesError.new(profile.name) if @lang_list.include?(profile.name)
|
18
16
|
@lang_list << profile.name
|
19
17
|
profile.freq.keys.each do |word|
|
20
18
|
if not @word_lang_prob_map.has_key?(word)
|
@@ -39,7 +37,7 @@ module Langusta
|
|
39
37
|
|
40
38
|
private
|
41
39
|
def create_detector
|
42
|
-
raise
|
40
|
+
raise NoProfilesLoadedError if @lang_list.empty?
|
43
41
|
detector = Detector.new(self)
|
44
42
|
end
|
45
43
|
end
|
data/test/test_detector.rb
CHANGED
@@ -49,4 +49,12 @@ class DetectorTest < Test::Unit::TestCase
|
|
49
49
|
detector.append(UCS2String.new("\x30\x42\x30\x42\x30\x42\x30\x42\x00a"))
|
50
50
|
assert_equal("jp", detector.detect())
|
51
51
|
end
|
52
|
+
|
53
|
+
def test_exceptions
|
54
|
+
detector = @factory.create()
|
55
|
+
detector.append(UCS2String.new(''))
|
56
|
+
assert_raises(NoFeaturesInTextError) do
|
57
|
+
detector.detect()
|
58
|
+
end
|
59
|
+
end
|
52
60
|
end
|
data/test/test_detector_factory.rb
CHANGED
@@ -6,11 +6,23 @@ class DetectorFactoryTest < Test::Unit::TestCase
|
|
6
6
|
factory = DetectorFactory.new
|
7
7
|
|
8
8
|
factory.add_profile(profile, 0, 1)
|
9
|
-
assert_raises(LangDetectException) do
|
10
|
-
factory.add_profile(profile, 1, 1)
|
11
|
-
end
|
12
9
|
|
13
10
|
detector = factory.create(0.123)
|
14
11
|
assert_equal(0.123, detector.alpha)
|
15
12
|
end
|
13
|
+
|
14
|
+
def test_exceptions
|
15
|
+
profile = LangProfile.new
|
16
|
+
factory = DetectorFactory.new
|
17
|
+
|
18
|
+
assert_raises(NoProfilesLoadedError) do
|
19
|
+
factory.create()
|
20
|
+
end
|
21
|
+
|
22
|
+
factory.add_profile(profile, 0, 2)
|
23
|
+
|
24
|
+
assert_raises(DuplicateProfilesError) do
|
25
|
+
factory.add_profile(profile, 1, 2)
|
26
|
+
end
|
27
|
+
end
|
16
28
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: langusta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
- 0
|
10
|
-
version: 0.1.0
|
9
|
+
- 1
|
10
|
+
version: 0.1.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jan Szumiec
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-04-
|
18
|
+
date: 2011-04-10 00:00:00 +02:00
|
19
19
|
default_executable: langusta
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -124,7 +124,7 @@ dependencies:
|
|
124
124
|
name: ruby-debug
|
125
125
|
version_requirements: *id007
|
126
126
|
prerelease: false
|
127
|
-
description:
|
127
|
+
description: Highly accurate language detection library, uses naive bayesian filter.
|
128
128
|
email: jan.szumiec@gmail.com
|
129
129
|
executables:
|
130
130
|
- langusta
|