nlp_backpack 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +22 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/lib/nlp_backpack.rb +10 -0
- data/lib/nlp_backpack/chunker.rb +5 -0
- data/lib/nlp_backpack/chunker/regex_chunker.rb +107 -0
- data/lib/nlp_backpack/chunker/tag_pattern.rb +31 -0
- data/lib/nlp_backpack/classifier.rb +5 -0
- data/lib/nlp_backpack/classifier/base.rb +28 -0
- data/lib/nlp_backpack/classifier/naive_bayes.rb +83 -0
- data/lib/nlp_backpack/evaluation.rb +6 -0
- data/lib/nlp_backpack/evaluation/accuracy.rb +46 -0
- data/lib/nlp_backpack/evaluation/base.rb +12 -0
- data/lib/nlp_backpack/evaluation/confusion_matrix.rb +66 -0
- data/lib/nlp_backpack/frequency_distribution.rb +47 -0
- data/lib/nlp_backpack/pos.rb +5 -0
- data/lib/nlp_backpack/pos/brill_tagger.rb +142 -0
- data/lib/nlp_backpack/pos/brill_tagger/lexicon.txt +93696 -0
- data/lib/nlp_backpack/pos/pos_array.rb +32 -0
- data/lib/nlp_backpack/stop_words.rb +17 -0
- data/lib/nlp_backpack/stop_words/stop_words.txt +429 -0
- data/lib/nlp_backpack/tokenizers/custom.rb +13 -0
- data/lib/nlp_backpack/tokenizers/line.rb +13 -0
- data/lib/nlp_backpack/tokenizers/space.rb +13 -0
- data/lib/nlp_backpack/tokenizers/tab.rb +13 -0
- data/lib/nlp_backpack/tokenizers/whitespace.rb +13 -0
- data/lib/nlp_backpack/tokenizers/word.rb +13 -0
- data/nlp_backpack.gemspec +109 -0
- data/spec/chunkers/regex_chunker_spec.rb +46 -0
- data/spec/chunkers/tag_pattern_spec.rb +40 -0
- data/spec/classifiers/naive_bayes_spec.rb +68 -0
- data/spec/evaluation/accuracy_spec.rb +29 -0
- data/spec/evaluation/confusion_matrix_spec.rb +29 -0
- data/spec/frequency_distribution_spec.rb +53 -0
- data/spec/nlp_backpack_spec.rb +4 -0
- data/spec/pos/brill_tagger_spec.rb +24 -0
- data/spec/pos/pos_array_spec.rb +45 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +18 -0
- data/spec/stop_words_spec.rb +15 -0
- data/spec/test_saves/naive.nb +1 -0
- data/spec/tokenizers/custom_spec.rb +24 -0
- data/spec/tokenizers/line_spec.rb +15 -0
- data/spec/tokenizers/space_spec.rb +15 -0
- data/spec/tokenizers/tab_spec.rb +15 -0
- data/spec/tokenizers/whitespace_spec.rb +16 -0
- data/spec/tokenizers/word_spec.rb +15 -0
- metadata +141 -0
@@ -0,0 +1,109 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
# Gem specification for nlp_backpack 0.0.0.
#
# Declares the gem's identity, the files shipped in the package, the spec
# files, and a dependency on rspec (>= 1.2.9).
Gem::Specification.new do |s|
  s.name = %q{nlp_backpack}
  s.version = "0.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["reddavis"]
  s.date = %q{2010-06-21}
  s.description = %q{A backpack full of useful toys}
  s.email = %q{reddavis@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/nlp_backpack.rb",
    "lib/nlp_backpack/chunker.rb",
    "lib/nlp_backpack/chunker/regex_chunker.rb",
    "lib/nlp_backpack/chunker/tag_pattern.rb",
    "lib/nlp_backpack/classifier.rb",
    "lib/nlp_backpack/classifier/base.rb",
    "lib/nlp_backpack/classifier/naive_bayes.rb",
    "lib/nlp_backpack/evaluation.rb",
    "lib/nlp_backpack/evaluation/accuracy.rb",
    "lib/nlp_backpack/evaluation/base.rb",
    "lib/nlp_backpack/evaluation/confusion_matrix.rb",
    "lib/nlp_backpack/frequency_distribution.rb",
    "lib/nlp_backpack/pos.rb",
    "lib/nlp_backpack/pos/brill_tagger.rb",
    "lib/nlp_backpack/pos/brill_tagger/lexicon.txt",
    "lib/nlp_backpack/pos/pos_array.rb",
    "lib/nlp_backpack/stop_words.rb",
    "lib/nlp_backpack/stop_words/stop_words.txt",
    "lib/nlp_backpack/tokenizers/custom.rb",
    "lib/nlp_backpack/tokenizers/line.rb",
    "lib/nlp_backpack/tokenizers/space.rb",
    "lib/nlp_backpack/tokenizers/tab.rb",
    "lib/nlp_backpack/tokenizers/whitespace.rb",
    "lib/nlp_backpack/tokenizers/word.rb",
    "nlp_backpack.gemspec",
    "spec/chunkers/regex_chunker_spec.rb",
    "spec/chunkers/tag_pattern_spec.rb",
    "spec/classifiers/naive_bayes_spec.rb",
    "spec/evaluation/accuracy_spec.rb",
    "spec/evaluation/confusion_matrix_spec.rb",
    "spec/frequency_distribution_spec.rb",
    "spec/nlp_backpack_spec.rb",
    "spec/pos/brill_tagger_spec.rb",
    "spec/pos/pos_array_spec.rb",
    "spec/spec.opts",
    "spec/spec_helper.rb",
    "spec/stop_words_spec.rb",
    "spec/test_saves/naive.nb",
    "spec/tokenizers/custom_spec.rb",
    "spec/tokenizers/line_spec.rb",
    "spec/tokenizers/space_spec.rb",
    "spec/tokenizers/tab_spec.rb",
    "spec/tokenizers/whitespace_spec.rb",
    "spec/tokenizers/word_spec.rb"
  ]
  s.homepage = %q{http://github.com/reddavis/NLP-Backpack}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.6}
  s.summary = %q{A backpack full of useful toys}
  s.test_files = [
    "spec/chunkers/regex_chunker_spec.rb",
    "spec/chunkers/tag_pattern_spec.rb",
    "spec/classifiers/naive_bayes_spec.rb",
    "spec/evaluation/accuracy_spec.rb",
    "spec/evaluation/confusion_matrix_spec.rb",
    "spec/frequency_distribution_spec.rb",
    "spec/nlp_backpack_spec.rb",
    "spec/pos/brill_tagger_spec.rb",
    "spec/pos/pos_array_spec.rb",
    "spec/spec_helper.rb",
    "spec/stop_words_spec.rb",
    "spec/tokenizers/custom_spec.rb",
    "spec/tokenizers/line_spec.rb",
    "spec/tokenizers/space_spec.rb",
    "spec/tokenizers/tab_spec.rb",
    "spec/tokenizers/whitespace_spec.rb",
    "spec/tokenizers/word_spec.rb"
  ]

  # rspec is only needed to run the specs, so it is declared as a development
  # dependency whenever the running RubyGems supports that distinction; the
  # fallbacks register it as a regular dependency instead.
  if s.respond_to? :specification_version
    s.specification_version = 3

    # Gem::VERSION replaces the old Gem::RubyGemsVersion constant, which is
    # no longer defined by current RubyGems and raised NameError here.
    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0')
      s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
    else
      s.add_dependency(%q<rspec>, [">= 1.2.9"])
    end
  else
    s.add_dependency(%q<rspec>, [">= 1.2.9"])
  end
end
|
109
|
+
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require 'nlp_backpack/pos/pos_array'
|
3
|
+
|
4
|
+
include NLPBackpack
|
5
|
+
|
6
|
+
# Specs for Chunker::RegexChunker.
#
# A chunker is built from a tag grammar such as "<DT><VBZ><DT><NN>" and
# matched against a POSArray of [word, tag] pairs. The expectations below
# show each match coming back as the matched words joined by single spaces.
describe Chunker::RegexChunker do
  describe "Matching specified chunks" do
    before do
      # Feed every [word, tag] pair of the fixture sentence into a POSArray.
      @pos_a = POS::POSArray.new
      sentence.each { |word| @pos_a << word }
    end

    describe "Simple chunk" do
      it "should return 'this is some text'" do
        chunker = Chunker::RegexChunker.new("<DT><VBZ><DT><NN>")
        chunker.match(@pos_a).first.should == "this is some text"
      end
    end

    describe "Chunk with conditional tag" do
      it "should return 'this is some' and 'this some' when the middle tag is optional (?)" do
        chunker = Chunker::RegexChunker.new("<DT><VBZ>?<DT>")
        chunker.match(@pos_a)[0].should == "this is some"
        chunker.match(@pos_a)[1].should == "this some"
      end

      it "should return 'this is some' and 'this some' when the middle tag may repeat (*)" do
        chunker = Chunker::RegexChunker.new("<DT><VBZ>*<DT>")
        chunker.match(@pos_a)[0].should == "this is some"
        chunker.match(@pos_a)[1].should == "this some"
      end
    end

    describe "Chunk with tag regex" do
      it "should return 'this is some' and 'this some' when a tag is written as a regex" do
        chunker = Chunker::RegexChunker.new('<D\w><VBZ>?<DT>')
        chunker.match(@pos_a)[0].should == "this is some"
        chunker.match(@pos_a)[1].should == "this some"
      end
    end
  end

  # Fixture: a tagged sentence as [word, tag] pairs. It contains both
  # "this is some" (DT VBZ DT) and a trailing "this some" (DT DT).
  def sentence
    [["this", "DT"], ["is", "VBZ"], ["some", "DT"], ["text", "NN"], ["text", "NN"], ["that", "IN"], ["I", "PRP"], ["want", "VBP"], ["analyzing", "VBG"], ["this", "DT"], ["some", "DT"]]
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require 'nlp_backpack/chunker/tag_pattern'
|
3
|
+
|
4
|
+
include NLPBackpack
|
5
|
+
|
6
|
+
# Specs for Chunker::TagPattern.
#
# A TagPattern is built from one "<...>" tag token, optionally followed by a
# condition character. The expectations show #tag returning the text inside
# the angle brackets as a Regexp, and #conditions returning what follows them.
describe Chunker::TagPattern do
  describe "Tag" do
    before do
      @tp = Chunker::TagPattern.new("<NN>")
    end

    it "should return the tag as NN" do
      @tp.tag.should == /NN/
    end
  end

  describe "Tag with condition" do
    before do
      # The "*" sits inside the brackets, so it is part of the tag regex.
      @tp = Chunker::TagPattern.new("<NN*>")
    end

    it "should return the tag as NN*" do
      @tp.tag.should == /NN*/
    end
  end

  describe "Tag with external condition" do
    before do
      # The "?" sits outside the brackets, so it is reported via #conditions.
      @tp = Chunker::TagPattern.new("<NN.+>?")
    end

    it "should return the tag as NN.+" do
      @tp.tag.should == /NN.+/
    end

    it "should return conditions as ?" do
      @tp.conditions.should == "?"
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
include NLPBackpack::Classifier
|
4
|
+
|
5
|
+
# Specs for Classifier::NaiveBayes: classifying after training, saving the
# classifier to a file, and loading it back from that file.
describe NaiveBayes do
  describe "Classification" do
    before do
      @classifier = create_and_train_classifier
    end

    it "should classify as spam with a score of 0.5" do
      # The expectations index the result: [0] is the category, [1] the score.
      result = @classifier.classify('bad', 'word')
      result[0].should == :spam
      result[1].should == 0.5
    end
  end

  describe "Saving the NB" do
    describe "DB filepath has been set" do
      before do
        @classifier = NaiveBayes.new(:spam, :ham)
        @classifier.db_filepath = db_filepath
      end

      it "should save to the filepath provided" do
        # Delete any file left by an earlier run so the existence check below
        # can only pass if #save wrote it.
        FileUtils.rm(db_filepath, :force => true)
        @classifier.save
        # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
        File.exist?(db_filepath).should be_true
      end
    end

    describe "DB filepath has not been set" do
      it "should raise an error" do
        lambda do
          NaiveBayes.new(:spam, :ham).save
        end.should raise_error
      end
    end
  end

  describe "Load" do
    before do
      # Train and persist a classifier so the example can load it from disk.
      classifier = NaiveBayes.new(:spam, :ham)
      classifier.db_filepath = db_filepath
      classifier.train(:spam, 'bad', 'word')
      classifier.train(:ham, 'we', 'bad')
      classifier.save
    end

    it "should return 0.5" do
      classifier = NaiveBayes.load(db_filepath)
      classifier.classify('bad', 'word')[1].should == 0.5
    end
  end

  private

  # Builds a :spam/:ham classifier trained with one example per category.
  def create_and_train_classifier
    a = NaiveBayes.new(:spam, :ham)
    a.train(:spam, 'bad', 'word')
    a.train(:ham, 'we', 'bad')
    a
  end

  # Absolute path of the file the save/load examples write to and read from.
  def db_filepath
    File.expand_path(File.dirname(__FILE__) + '/../test_saves/naive.nb')
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
include NLPBackpack::Evaluation
|
4
|
+
|
5
|
+
# Specs for Evaluation::Accuracy.
#
# The fixture is built from two label arrays — presumably (correct, predicted),
# mirroring the ConfusionMatrix constructor; confirm against Accuracy#initialize.
# With these inputs label 1 is expected to score 100 and label 2 to score 50.
describe Accuracy do
  before(:all) { @accuracy = Accuracy.new([1,1,2,2], [1,1,2,1]) }

  describe "Specific element" do
    it("should return 100") { @accuracy.accuracy_of(1).should == 100 }

    it("should return 50") { @accuracy.accuracy_of(2).should == 50 }
  end

  describe "Inspect" do
    it("should match 100%") { @accuracy.inspect.should match(/100%/) }

    it("should match 50%") { @accuracy.inspect.should match(/50%/) }
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
include NLPBackpack::Evaluation
|
4
|
+
|
5
|
+
# Specs for Evaluation::ConfusionMatrix.
#
# NOTE(review): correct_results and test_results are not defined in this spec
# file; presumably they come from spec_helper — confirm.
describe ConfusionMatrix do
  before(:all) { @cm = ConfusionMatrix.new(correct_results, test_results) }

  describe "Specific element" do
    it("should return 90%") { @cm.results_for(1, 1).should == "90%" }

    it("should return 10%") { @cm.results_for(2, 3).should == "10%" }
  end

  describe "Inspect" do
    it("should match <90%>") { @cm.inspect.should match(/<90%>/) }

    it("should match 10%") { @cm.inspect.should match(/10%/) }
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
include NLPBackpack
|
4
|
+
|
5
|
+
# Specs for FrequencyDistribution.
#
# The distribution is built from a hash of condition => word list plus the
# events to count ("happy" and "sad"). #process is expected to return a Hash
# of per-condition counts, and #to_tabulation text that mentions each event.
describe FrequencyDistribution do
  before do
    @cfd = FrequencyDistribution.new({:happy => happy_text, :sad => sad_text}, "happy", "sad")
    @results = @cfd.process
  end

  it("should return a hash") { @results.should be_a(Hash) }

  describe "Happy condition" do
    it("should return 2 for happy") { @results[:happy]["happy"].should == 2 }

    it("should return 0 for sad") { @results[:happy]["sad"].should == 0 }
  end

  describe "Sad condition" do
    it("should return 0 for happy") { @results[:sad]["happy"].should == 0 }

    it("should return 2 for sad") { @results[:sad]["sad"].should == 2 }
  end

  describe "Tabulation" do
    it "should include the events" do
      rendered = @cfd.to_tabulation
      [/happy/, /sad/].each { |event| rendered.should match(event) }
    end
  end

  private

  # Word list for the :happy condition; contains "happy" twice and no "sad".
  def happy_text
    ["when", "happy", "things", "happen", "it", "makes", "me", "happy"]
  end

  # Word list for the :sad condition; contains "sad" twice and no "happy".
  def sad_text
    ["when", "sad", "things", "happen", "it", "makes", "me", "sad"]
  end
end
|