ruby_ngrams_language_detector 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +31 -0
- data/Rakefile +1 -0
- data/language_detector.gemspec +21 -0
- data/lib/language_detector.rb +55 -0
- data/lib/language_detector/profile.rb +123 -0
- data/lib/language_detector/training_data/english.txt +1452 -0
- data/lib/language_detector/training_data/spanish.txt +1559 -0
- data/lib/language_detector/version.rb +3 -0
- data/lib/model.yml +4027 -0
- data/spec/english.txt +1 -0
- data/spec/language_detector_spec.rb +17 -0
- data/spec/profile_spec.rb +104 -0
- data/spec/spanish.txt +1 -0
- data/spec/spec_helper.rb +3 -0
- metadata +83 -0
data/spec/english.txt
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
this is a test of the Emergency text categorizing system.
|
data/spec/language_detector_spec.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
# Specs for LanguageDetector::Detector#detect_language: each example passes
# one of the spec/ fixture names to the detector and checks the language
# code it returns.
describe LanguageDetector do
  # The group label previously named is_valid_character(), which is not what
  # these examples exercise; it now names the method actually under test.
  describe "Test detect_language() method" do
    # NOTE(review): the detector is retrained before every example; presumably
    # one training pass per run would be enough -- confirm before changing the
    # hook scope.
    before do
      LanguageDetector::Detector.train
      @language_detector = LanguageDetector::Detector.new
    end

    # NOTE(review): the fixture arguments are relative ("spec/..."), so they are
    # presumably resolved against the working directory -- confirm the suite is
    # always run from the gem root.
    it "Test detect spanish" do
      @language_detector.detect_language("spec/spanish.txt").should eql "es"
    end

    it "Test detect english" do
      @language_detector.detect_language("spec/english.txt").should eql "en"
    end
  end
end
|
data/spec/profile_spec.rb
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
# Specs for LanguageDetector::Profile, covering character validation,
# tokenizing, n-gram counting, profile construction from a string and the
# distance between two profiles.
describe LanguageDetector::Profile do
  # Every example gets a fresh profile. This setup used to be repeated in each
  # nested group; the compute_distance group builds its own profiles as well.
  before do
    @profile = LanguageDetector::Profile.new("test")
  end

  describe "Test is_valid_character() method" do
    it "Test '.' is not a valid character" do
      @profile.is_valid_character?(?.).should be_false
    end

    it "Test ',' is not a valid character" do
      @profile.is_valid_character?(?,).should be_false
    end

    it "Test ':' is not a valid character" do
      @profile.is_valid_character?(?:).should be_false
    end

    it "Test ';' is not a valid character" do
      @profile.is_valid_character?(?;).should be_false
    end

    it "Test 'A' is not a valid character" do
      @profile.is_valid_character?(?A).should be_false
    end

    it "Test 'a' is a valid character" do
      @profile.is_valid_character?(?a).should be_true
    end
  end

  describe "Test tokenize() method" do
    # The example label was copied from the is_valid_character() group; it now
    # describes what the expectation below actually checks.
    it "Test punctuation, digits and whitespace are dropped and tokens are lowercased" do
      @profile.tokenize("this is; ,+_ A \t 123 test:").should match_array(["this", "is", "a", "test"])
    end
  end

  describe "Test count_ngram() method" do
    # For sizes 2..5 the expected keys include "_"-padded n-grams at the word
    # boundaries; the size-1 expectation lists only the letters themselves.
    it "Test 1" do
      @profile.count_ngram('words', 1, {}).should include("w"=>1, "o"=>1, "r"=>1, "d"=>1, "s"=>1)
    end

    it "Test 2" do
      @profile.count_ngram('words', 2, {}).should include("wo"=>1, "or"=>1, "rd"=>1, "ds"=>1, "_w" => 1, "s_" => 1)
    end

    it "Test 3" do
      @profile.count_ngram('words', 3, {}).should include("wor"=>1, "ord"=>1, "rds"=>1, "_wo" => 1, "ds_" => 1, "s__" => 1)
    end

    it "Test 4" do
      @profile.count_ngram('words', 4, {}).should include("word"=>1, "ords"=>1, "_wor" => 1, "rds_" => 1, "ds__" => 1, "s___" => 1)
    end

    it "Test 5" do
      @profile.count_ngram('words', 5, {}).should include("words"=>1, "_word" => 1, "ords_" => 1, "rds__" => 1, "ds___" => 1, "s____" => 1)
    end

    it "Test 6" do
      # NOTE(review): include() is given no expected entries, so this example
      # presumably only proves that the call does not raise. The intended
      # result for an n-gram size larger than the word is not visible from
      # here -- TODO state the expected entries explicitly.
      @profile.count_ngram('words', 6, {}).should include()
    end
  end

  describe "Test init_with_string() method" do
    it "Test 1" do
      @profile.init_with_string("this is; ,+_ A \t 123 test:")
      # The expected values run 1..30 without gaps; presumably each is the
      # n-gram's rank within the profile -- confirm against Profile#ngrams.
      @profile.ngrams.should include("_t"=>1, "s_"=>2, "is"=>3, "_i"=>4, "th"=>5, "_th"=>6, "thi"=>7, "his"=>8, "is_"=>9, "s__"=>10, "_thi"=>11, "this"=>12, "his_"=>13, "is__"=>14, "s___"=>15, "hi"=>16, "te"=>17, "es"=>18, "st"=>19, "t_"=>20, "_te"=>21, "tes"=>22, "est"=>23, "st_"=>24, "t__"=>25, "_tes"=>26, "test"=>27, "est_"=>28, "st__"=>29, "t___"=>30)
    end
  end

  describe "Test compute_distance() method" do
    # profile1 and profile2 are built from the same text; profile3 is built
    # from a different string.
    before do
      @profile1 = LanguageDetector::Profile.new("test")
      @profile1.init_with_string("this is ,+_ A \t 123 test")

      @profile2 = LanguageDetector::Profile.new("test")
      @profile2.init_with_string("this is ,+_ A \t 123 test")

      @profile3 = LanguageDetector::Profile.new("test")
      @profile3.init_with_string("xxxx")
    end

    it "Test 1" do
      @profile1.compute_distance(@profile2).should eql 0
    end

    it "Test 2" do
      @profile1.compute_distance(@profile3).should eql 24000
    end
  end
end
|
data/spec/spanish.txt
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
para poner este importante proyecto en
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ruby_ngrams_language_detector
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- cexposito
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-02-14 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '2.6'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '2.6'
|
30
|
+
description: ngram based language detector written in ruby
|
31
|
+
email:
|
32
|
+
- carlosexposito68@gmail.com
|
33
|
+
executables: []
|
34
|
+
extensions: []
|
35
|
+
extra_rdoc_files: []
|
36
|
+
files:
|
37
|
+
- .gitignore
|
38
|
+
- Gemfile
|
39
|
+
- LICENSE.txt
|
40
|
+
- README.md
|
41
|
+
- Rakefile
|
42
|
+
- language_detector.gemspec
|
43
|
+
- lib/language_detector.rb
|
44
|
+
- lib/language_detector/profile.rb
|
45
|
+
- lib/language_detector/training_data/english.txt
|
46
|
+
- lib/language_detector/training_data/spanish.txt
|
47
|
+
- lib/language_detector/version.rb
|
48
|
+
- lib/model.yml
|
49
|
+
- spec/english.txt
|
50
|
+
- spec/language_detector_spec.rb
|
51
|
+
- spec/profile_spec.rb
|
52
|
+
- spec/spanish.txt
|
53
|
+
- spec/spec_helper.rb
|
54
|
+
homepage: ''
|
55
|
+
licenses: []
|
56
|
+
post_install_message:
|
57
|
+
rdoc_options: []
|
58
|
+
require_paths:
|
59
|
+
- lib
|
60
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ! '>='
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
requirements: []
|
73
|
+
rubyforge_project:
|
74
|
+
rubygems_version: 1.8.25
|
75
|
+
signing_key:
|
76
|
+
specification_version: 3
|
77
|
+
summary: ngram based language detector
|
78
|
+
test_files:
|
79
|
+
- spec/english.txt
|
80
|
+
- spec/language_detector_spec.rb
|
81
|
+
- spec/profile_spec.rb
|
82
|
+
- spec/spanish.txt
|
83
|
+
- spec/spec_helper.rb
|