ruby_ngrams_language_detector 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/spec/english.txt ADDED
@@ -0,0 +1 @@
1
+ this is a test of the Emergency text categorizing system.
data/spec/language_detector_spec.rb ADDED
@@ -0,0 +1,17 @@
1
+ require 'spec_helper'
2
+ describe LanguageDetector do
3
+ describe "Test is_valid_character() method" do
4
+ before do
5
+ LanguageDetector::Detector.train
6
+ @language_detector = LanguageDetector::Detector.new
7
+ end
8
+
9
+ it "Test detect spanish" do
10
+ @language_detector.detect_language("spec/spanish.txt").should eql "es"
11
+ end
12
+
13
+ it "Test detect english" do
14
+ @language_detector.detect_language("spec/english.txt").should eql "en"
15
+ end
16
+ end
17
+ end
data/spec/profile_spec.rb ADDED
@@ -0,0 +1,104 @@
1
+ require 'spec_helper'
2
+ describe LanguageDetector::Profile do
3
+ describe "Test is_valid_character() method" do
4
+ before do
5
+ @profile = LanguageDetector::Profile.new("test")
6
+ end
7
+
8
+ it "Test '.' is not a valid character" do
9
+ @profile.is_valid_character?(?.).should be_false
10
+ end
11
+
12
+ it "Test ',' is not a valid character" do
13
+ @profile.is_valid_character?(?,).should be_false
14
+ end
15
+
16
+ it "Test ':' is not a valid character" do
17
+ @profile.is_valid_character?(?:).should be_false
18
+ end
19
+
20
+ it "Test ';' is not a valid character" do
21
+ @profile.is_valid_character?(?;).should be_false
22
+ end
23
+
24
+ it "Test 'A' is not a valid character" do
25
+ @profile.is_valid_character?(?A).should be_false
26
+ end
27
+
28
+ it "Test 'a' is a valid character" do
29
+ @profile.is_valid_character?(?a).should be_true
30
+ end
31
+ end
32
+
33
+ describe "Test tokenize() method" do
34
+ before do
35
+ @profile = LanguageDetector::Profile.new("test")
36
+ end
37
+
38
+ it "Test '.' is not a valid character" do
39
+ @profile.tokenize("this is; ,+_ A \t 123 test:").should match_array(["this", "is", "a", "test"])
40
+ end
41
+ end
42
+
43
+ describe "Test count_ngram() method" do
44
+ before do
45
+ @profile = LanguageDetector::Profile.new("test")
46
+ end
47
+
48
+ it "Test 1" do
49
+ @profile.count_ngram('words', 1, {}).should include("w"=>1, "o"=>1, "r"=>1, "d"=>1, "s"=>1)
50
+ end
51
+
52
+ it "Test 2" do
53
+ @profile.count_ngram('words', 2, {}).should include("wo"=>1, "or"=>1, "rd"=>1, "ds"=>1, "_w" => 1, "s_" => 1)
54
+ end
55
+
56
+ it "Test 3" do
57
+ @profile.count_ngram('words', 3, {}).should include("wor"=>1, "ord"=>1, "rds"=>1, "_wo" => 1, "ds_" => 1, "s__" => 1)
58
+ end
59
+
60
+ it "Test 4" do
61
+ @profile.count_ngram('words', 4, {}).should include("word"=>1, "ords"=>1, "_wor" => 1, "rds_" => 1, "ds__" => 1, "s___" => 1)
62
+ end
63
+
64
+ it "Test 5" do
65
+ @profile.count_ngram('words', 5, {}).should include("words"=>1, "_word" => 1, "ords_" => 1, "rds__" => 1, "ds___" => 1, "s____" => 1)
66
+ end
67
+
68
+ it "Test 6" do
69
+ @profile.count_ngram('words', 6, {}).should include()
70
+ end
71
+ end
72
+
73
+ describe "Test init_with_string() method" do
74
+ before do
75
+ @profile = LanguageDetector::Profile.new("test")
76
+ end
77
+
78
+ it "Test 1" do
79
+ @profile.init_with_string("this is; ,+_ A \t 123 test:")
80
+ @profile.ngrams.should include("_t"=>1, "s_"=>2, "is"=>3, "_i"=>4, "th"=>5, "_th"=>6, "thi"=>7, "his"=>8, "is_"=>9, "s__"=>10, "_thi"=>11, "this"=>12, "his_"=>13, "is__"=>14, "s___"=>15, "hi"=>16, "te"=>17, "es"=>18, "st"=>19, "t_"=>20, "_te"=>21, "tes"=>22, "est"=>23, "st_"=>24, "t__"=>25, "_tes"=>26, "test"=>27, "est_"=>28, "st__"=>29, "t___"=>30)
81
+ end
82
+ end
83
+
84
+ describe "Test compute_distance() method" do
85
+ before do
86
+ @profile1 = LanguageDetector::Profile.new("test")
87
+ @profile1.init_with_string("this is ,+_ A \t 123 test")
88
+
89
+ @profile2 = LanguageDetector::Profile.new("test")
90
+ @profile2.init_with_string("this is ,+_ A \t 123 test")
91
+
92
+ @profile3 = LanguageDetector::Profile.new("test")
93
+ @profile3.init_with_string("xxxx")
94
+ end
95
+
96
+ it "Test 1" do
97
+ @profile1.compute_distance(@profile2).should eql 0
98
+ end
99
+
100
+ it "Test 2" do
101
+ @profile1.compute_distance(@profile3).should eql 24000
102
+ end
103
+ end
104
+ end
data/spec/spanish.txt ADDED
@@ -0,0 +1 @@
1
+ para poner este importante proyecto en
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,3 @@
1
+ require_relative '../lib/language_detector/profile'
2
+ require_relative '../lib/language_detector'
3
+ require 'yaml'
metadata ADDED
@@ -0,0 +1,83 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ruby_ngrams_language_detector
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - cexposito
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-02-14 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '2.6'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '2.6'
30
+ description: ngram based language detector written in ruby
31
+ email:
32
+ - carlosexposito68@gmail.com
33
+ executables: []
34
+ extensions: []
35
+ extra_rdoc_files: []
36
+ files:
37
+ - .gitignore
38
+ - Gemfile
39
+ - LICENSE.txt
40
+ - README.md
41
+ - Rakefile
42
+ - language_detector.gemspec
43
+ - lib/language_detector.rb
44
+ - lib/language_detector/profile.rb
45
+ - lib/language_detector/training_data/english.txt
46
+ - lib/language_detector/training_data/spanish.txt
47
+ - lib/language_detector/version.rb
48
+ - lib/model.yml
49
+ - spec/english.txt
50
+ - spec/language_detector_spec.rb
51
+ - spec/profile_spec.rb
52
+ - spec/spanish.txt
53
+ - spec/spec_helper.rb
54
+ homepage: ''
55
+ licenses: []
56
+ post_install_message:
57
+ rdoc_options: []
58
+ require_paths:
59
+ - lib
60
+ required_ruby_version: !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ required_rubygems_version: !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ! '>='
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ requirements: []
73
+ rubyforge_project:
74
+ rubygems_version: 1.8.25
75
+ signing_key:
76
+ specification_version: 3
77
+ summary: ngram based language detector
78
+ test_files:
79
+ - spec/english.txt
80
+ - spec/language_detector_spec.rb
81
+ - spec/profile_spec.rb
82
+ - spec/spanish.txt
83
+ - spec/spec_helper.rb