ruby_ngrams_language_detector 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +31 -0
- data/Rakefile +1 -0
- data/language_detector.gemspec +21 -0
- data/lib/language_detector.rb +55 -0
- data/lib/language_detector/profile.rb +123 -0
- data/lib/language_detector/training_data/english.txt +1452 -0
- data/lib/language_detector/training_data/spanish.txt +1559 -0
- data/lib/language_detector/version.rb +3 -0
- data/lib/model.yml +4027 -0
- data/spec/english.txt +1 -0
- data/spec/language_detector_spec.rb +17 -0
- data/spec/profile_spec.rb +104 -0
- data/spec/spanish.txt +1 -0
- data/spec/spec_helper.rb +3 -0
- metadata +83 -0
data/spec/english.txt
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
this is a test of the Emergency text categorizing system.
|
data/spec/language_detector_spec.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
# Specs for LanguageDetector::Detector#detect_language.
describe LanguageDetector do
  describe "Test detect_language() method" do
    # Runs before every example: trains the detector class, then builds the
    # instance the examples call.
    before do
      LanguageDetector::Detector.train
      @language_detector = LanguageDetector::Detector.new
    end

    # NOTE(review): the argument looks like a path relative to the current
    # working directory, so these examples presumably have to be run from the
    # gem root -- confirm against Detector#detect_language.
    it "Test detect spanish" do
      @language_detector.detect_language("spec/spanish.txt").should eql "es"
    end

    it "Test detect english" do
      @language_detector.detect_language("spec/english.txt").should eql "en"
    end
  end
end
|
data/spec/profile_spec.rb
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
# Specs for LanguageDetector::Profile: character validation, tokenizing,
# n-gram counting, building a profile from a string, and profile distance.
describe LanguageDetector::Profile do
  describe "Test is_valid_character() method" do
    before do
      @profile = LanguageDetector::Profile.new("test")
    end

    # The arguments below use Ruby's `?x` character-literal syntax.
    it "Test '.' is not a valid character" do
      @profile.is_valid_character?(?.).should be_false
    end

    it "Test ',' is not a valid character" do
      @profile.is_valid_character?(?,).should be_false
    end

    it "Test ':' is not a valid character" do
      @profile.is_valid_character?(?:).should be_false
    end

    it "Test ';' is not a valid character" do
      @profile.is_valid_character?(?;).should be_false
    end

    it "Test 'A' is not a valid character" do
      @profile.is_valid_character?(?A).should be_false
    end

    it "Test 'a' is a valid character" do
      @profile.is_valid_character?(?a).should be_true
    end
  end

  describe "Test tokenize() method" do
    before do
      @profile = LanguageDetector::Profile.new("test")
    end

    # The input mixes punctuation, a tab escape, digits and an upper-case
    # letter; only the four lower-cased words are expected back.
    it "Test tokenize returns only the lower-cased words" do
      @profile.tokenize("this is; ,+_ A \t 123 test:").should match_array(["this", "is", "a", "test"])
    end
  end

  describe "Test count_ngram() method" do
    before do
      @profile = LanguageDetector::Profile.new("test")
    end

    # Each example passes the word, the n-gram length and an empty hash, and
    # checks that the returned hash contains at least the listed counts
    # (`include` does not reject extra keys).
    it "Test 1-grams of 'words'" do
      @profile.count_ngram('words', 1, {}).should include("w"=>1, "o"=>1, "r"=>1, "d"=>1, "s"=>1)
    end

    it "Test 2-grams of 'words'" do
      @profile.count_ngram('words', 2, {}).should include("wo"=>1, "or"=>1, "rd"=>1, "ds"=>1, "_w" => 1, "s_" => 1)
    end

    it "Test 3-grams of 'words'" do
      @profile.count_ngram('words', 3, {}).should include("wor"=>1, "ord"=>1, "rds"=>1, "_wo" => 1, "ds_" => 1, "s__" => 1)
    end

    it "Test 4-grams of 'words'" do
      @profile.count_ngram('words', 4, {}).should include("word"=>1, "ords"=>1, "_wor" => 1, "rds_" => 1, "ds__" => 1, "s___" => 1)
    end

    it "Test 5-grams of 'words'" do
      @profile.count_ngram('words', 5, {}).should include("words"=>1, "_word" => 1, "ords_" => 1, "rds__" => 1, "ds___" => 1, "s____" => 1)
    end

    # NOTE(review): `include()` is given nothing to look for, so this example
    # appears to verify only that the call does not raise. If 6-grams are
    # meant to be rejected, assert that explicitly -- confirm the intended
    # result against Profile#count_ngram.
    it "Test 6-grams of 'words'" do
      @profile.count_ngram('words', 6, {}).should include()
    end
  end

  describe "Test init_with_string() method" do
    before do
      @profile = LanguageDetector::Profile.new("test")
    end

    # After init_with_string, `ngrams` is expected to map each listed n-gram
    # to the listed integer.
    # NOTE(review): the values 1..30 look like rank positions -- confirm
    # against Profile#init_with_string.
    it "Test ngrams built from a string" do
      @profile.init_with_string("this is; ,+_ A \t 123 test:")
      @profile.ngrams.should include("_t"=>1, "s_"=>2, "is"=>3, "_i"=>4, "th"=>5, "_th"=>6, "thi"=>7, "his"=>8, "is_"=>9, "s__"=>10, "_thi"=>11, "this"=>12, "his_"=>13, "is__"=>14, "s___"=>15, "hi"=>16, "te"=>17, "es"=>18, "st"=>19, "t_"=>20, "_te"=>21, "tes"=>22, "est"=>23, "st_"=>24, "t__"=>25, "_tes"=>26, "test"=>27, "est_"=>28, "st__"=>29, "t___"=>30)
    end
  end

  describe "Test compute_distance() method" do
    # profile1 and profile2 are built from the same string; profile3 is built
    # from "xxxx".
    before do
      @profile1 = LanguageDetector::Profile.new("test")
      @profile1.init_with_string("this is ,+_ A \t 123 test")

      @profile2 = LanguageDetector::Profile.new("test")
      @profile2.init_with_string("this is ,+_ A \t 123 test")

      @profile3 = LanguageDetector::Profile.new("test")
      @profile3.init_with_string("xxxx")
    end

    it "Test distance between profiles built from the same string is 0" do
      @profile1.compute_distance(@profile2).should eql 0
    end

    it "Test distance to the 'xxxx' profile is 24000" do
      @profile1.compute_distance(@profile3).should eql 24000
    end
  end
end
|
data/spec/spanish.txt
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
para poner este importante proyecto en
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ruby_ngrams_language_detector
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- cexposito
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-02-14 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '2.6'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '2.6'
|
30
|
+
description: ngram based language detector written in ruby
|
31
|
+
email:
|
32
|
+
- carlosexposito68@gmail.com
|
33
|
+
executables: []
|
34
|
+
extensions: []
|
35
|
+
extra_rdoc_files: []
|
36
|
+
files:
|
37
|
+
- .gitignore
|
38
|
+
- Gemfile
|
39
|
+
- LICENSE.txt
|
40
|
+
- README.md
|
41
|
+
- Rakefile
|
42
|
+
- language_detector.gemspec
|
43
|
+
- lib/language_detector.rb
|
44
|
+
- lib/language_detector/profile.rb
|
45
|
+
- lib/language_detector/training_data/english.txt
|
46
|
+
- lib/language_detector/training_data/spanish.txt
|
47
|
+
- lib/language_detector/version.rb
|
48
|
+
- lib/model.yml
|
49
|
+
- spec/english.txt
|
50
|
+
- spec/language_detector_spec.rb
|
51
|
+
- spec/profile_spec.rb
|
52
|
+
- spec/spanish.txt
|
53
|
+
- spec/spec_helper.rb
|
54
|
+
homepage: ''
|
55
|
+
licenses: []
|
56
|
+
post_install_message:
|
57
|
+
rdoc_options: []
|
58
|
+
require_paths:
|
59
|
+
- lib
|
60
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ! '>='
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
requirements: []
|
73
|
+
rubyforge_project:
|
74
|
+
rubygems_version: 1.8.25
|
75
|
+
signing_key:
|
76
|
+
specification_version: 3
|
77
|
+
summary: ngram based language detector
|
78
|
+
test_files:
|
79
|
+
- spec/english.txt
|
80
|
+
- spec/language_detector_spec.rb
|
81
|
+
- spec/profile_spec.rb
|
82
|
+
- spec/spanish.txt
|
83
|
+
- spec/spec_helper.rb
|