langusta 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/langusta.gemspec +3 -3
- data/lib/langusta.rb +5 -1
- data/lib/langusta/detector.rb +2 -2
- data/lib/langusta/detector_factory.rb +2 -4
- data/test/test_detector.rb +8 -0
- data/test/test_detector_factory.rb +15 -3
- metadata +5 -5
data/Rakefile
CHANGED
@@ -16,7 +16,7 @@ Jeweler::Tasks.new do |gem|
|
|
16
16
|
gem.homepage = "http://github.com/jasiek/langusta"
|
17
17
|
gem.license = "Apache 2.0"
|
18
18
|
gem.summary = %Q{Language detection library based on http://code.google.com/p/language-detection/.}
|
19
|
-
gem.description = %Q{
|
19
|
+
gem.description = %Q{Highly accurate language detection library, uses naive bayesian filter.}
|
20
20
|
gem.email = "jan.szumiec@gmail.com"
|
21
21
|
gem.authors = ["Jan Szumiec"]
|
22
22
|
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.1.1
|
data/langusta.gemspec
CHANGED
@@ -5,13 +5,13 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{langusta}
|
8
|
-
s.version = "0.1.0"
|
8
|
+
s.version = "0.1.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Jan Szumiec"]
|
12
|
-
s.date = %q{2011-04-
|
12
|
+
s.date = %q{2011-04-10}
|
13
13
|
s.default_executable = %q{langusta}
|
14
|
-
s.description = %q{
|
14
|
+
s.description = %q{Highly accurate language detection library, uses naive bayesian filter.}
|
15
15
|
s.email = %q{jan.szumiec@gmail.com}
|
16
16
|
s.executables = ["langusta"]
|
17
17
|
s.extra_rdoc_files = [
|
data/lib/langusta.rb
CHANGED
@@ -12,7 +12,7 @@ require 'oniguruma'
|
|
12
12
|
require 'yajl'
|
13
13
|
|
14
14
|
module Langusta
|
15
|
-
VERSION = '0.1.0'
|
15
|
+
VERSION = '0.1.1'
|
16
16
|
|
17
17
|
autoload :RegexHelper, 'langusta/regex_helper'
|
18
18
|
autoload :UCS2String, 'langusta/ucs2_string'
|
@@ -32,5 +32,9 @@ module Langusta
|
|
32
32
|
PROFILES_PATH = File.join(ABSOLUTE_PATH, 'profiles')
|
33
33
|
UPPERCASE_BIN = File.join(ABSOLUTE_PATH, 'data/uppercase.bin')
|
34
34
|
MESSAGES_PROPERTIES = File.join(ABSOLUTE_PATH, 'data/messages.properties')
|
35
|
+
|
36
|
+
class DuplicateProfilesError < StandardError; end
|
37
|
+
class NoProfilesLoadedError < StandardError; end
|
38
|
+
class NoFeaturesInTextError < StandardError; end
|
35
39
|
end
|
36
40
|
|
data/lib/langusta/detector.rb
CHANGED
@@ -45,7 +45,7 @@ module Langusta
|
|
45
45
|
def detect_block
|
46
46
|
cleaning_text()
|
47
47
|
ngrams = extract_ngrams()
|
48
|
-
raise
|
48
|
+
raise NoFeaturesInTextError if ngrams.empty?
|
49
49
|
@langprob = Array.new(@lang_list.length, 0.0)
|
50
50
|
|
51
51
|
@n_trial.times do
|
@@ -68,6 +68,7 @@ module Langusta
|
|
68
68
|
end
|
69
69
|
end
|
70
70
|
|
71
|
+
# TODO: this looks like it's not referenced anywhere.
|
71
72
|
def set_prior_map(prior_map)
|
72
73
|
@prior_map = Array.new[@lang_list.length]
|
73
74
|
sump = 0.0
|
@@ -99,7 +100,6 @@ module Langusta
|
|
99
100
|
maxp
|
100
101
|
end
|
101
102
|
|
102
|
-
private
|
103
103
|
def cleaning_text
|
104
104
|
non_latin_count = latin_count = 0
|
105
105
|
@text.each_char do |c|
|
data/lib/langusta/detector_factory.rb
CHANGED
@@ -1,6 +1,4 @@
|
|
1
1
|
module Langusta
|
2
|
-
class LangDetectException < StandardError; end
|
3
|
-
|
4
2
|
class DetectorFactory
|
5
3
|
attr_reader :word_lang_prob_map, :lang_list
|
6
4
|
|
@@ -14,7 +12,7 @@ module Langusta
|
|
14
12
|
# @param [Fixnum] index at which the language profile is to be added.
|
15
13
|
# @param [Fixnum] counts how many language profiles are to be added to this factory in total.
|
16
14
|
def add_profile(profile, index, langsize)
|
17
|
-
raise
|
15
|
+
raise DuplicateProfilesError.new(profile.name) if @lang_list.include?(profile.name)
|
18
16
|
@lang_list << profile.name
|
19
17
|
profile.freq.keys.each do |word|
|
20
18
|
if not @word_lang_prob_map.has_key?(word)
|
@@ -39,7 +37,7 @@ module Langusta
|
|
39
37
|
|
40
38
|
private
|
41
39
|
def create_detector
|
42
|
-
raise
|
40
|
+
raise NoProfilesLoadedError if @lang_list.empty?
|
43
41
|
detector = Detector.new(self)
|
44
42
|
end
|
45
43
|
end
|
data/test/test_detector.rb
CHANGED
@@ -49,4 +49,12 @@ class DetectorTest < Test::Unit::TestCase
|
|
49
49
|
detector.append(UCS2String.new("\x30\x42\x30\x42\x30\x42\x30\x42\x00a"))
|
50
50
|
assert_equal("jp", detector.detect())
|
51
51
|
end
|
52
|
+
|
53
|
+
def test_exceptions
|
54
|
+
detector = @factory.create()
|
55
|
+
detector.append(UCS2String.new(''))
|
56
|
+
assert_raises(NoFeaturesInTextError) do
|
57
|
+
detector.detect()
|
58
|
+
end
|
59
|
+
end
|
52
60
|
end
|
data/test/test_detector_factory.rb
CHANGED
@@ -6,11 +6,23 @@ class DetectorFactoryTest < Test::Unit::TestCase
|
|
6
6
|
factory = DetectorFactory.new
|
7
7
|
|
8
8
|
factory.add_profile(profile, 0, 1)
|
9
|
-
assert_raises(LangDetectException) do
|
10
|
-
factory.add_profile(profile, 1, 1)
|
11
|
-
end
|
12
9
|
|
13
10
|
detector = factory.create(0.123)
|
14
11
|
assert_equal(0.123, detector.alpha)
|
15
12
|
end
|
13
|
+
|
14
|
+
def test_exceptions
|
15
|
+
profile = LangProfile.new
|
16
|
+
factory = DetectorFactory.new
|
17
|
+
|
18
|
+
assert_raises(NoProfilesLoadedError) do
|
19
|
+
factory.create()
|
20
|
+
end
|
21
|
+
|
22
|
+
factory.add_profile(profile, 0, 2)
|
23
|
+
|
24
|
+
assert_raises(DuplicateProfilesError) do
|
25
|
+
factory.add_profile(profile, 1, 2)
|
26
|
+
end
|
27
|
+
end
|
16
28
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: langusta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
- 0
|
10
|
-
version: 0.1.0
|
9
|
+
- 1
|
10
|
+
version: 0.1.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jan Szumiec
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-04-
|
18
|
+
date: 2011-04-10 00:00:00 +02:00
|
19
19
|
default_executable: langusta
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -124,7 +124,7 @@ dependencies:
|
|
124
124
|
name: ruby-debug
|
125
125
|
version_requirements: *id007
|
126
126
|
prerelease: false
|
127
|
-
description:
|
127
|
+
description: Highly accurate language detection library, uses naive bayesian filter.
|
128
128
|
email: jan.szumiec@gmail.com
|
129
129
|
executables:
|
130
130
|
- langusta
|