nlp_backpack 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +22 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/lib/nlp_backpack.rb +10 -0
- data/lib/nlp_backpack/chunker.rb +5 -0
- data/lib/nlp_backpack/chunker/regex_chunker.rb +107 -0
- data/lib/nlp_backpack/chunker/tag_pattern.rb +31 -0
- data/lib/nlp_backpack/classifier.rb +5 -0
- data/lib/nlp_backpack/classifier/base.rb +28 -0
- data/lib/nlp_backpack/classifier/naive_bayes.rb +83 -0
- data/lib/nlp_backpack/evaluation.rb +6 -0
- data/lib/nlp_backpack/evaluation/accuracy.rb +46 -0
- data/lib/nlp_backpack/evaluation/base.rb +12 -0
- data/lib/nlp_backpack/evaluation/confusion_matrix.rb +66 -0
- data/lib/nlp_backpack/frequency_distribution.rb +47 -0
- data/lib/nlp_backpack/pos.rb +5 -0
- data/lib/nlp_backpack/pos/brill_tagger.rb +142 -0
- data/lib/nlp_backpack/pos/brill_tagger/lexicon.txt +93696 -0
- data/lib/nlp_backpack/pos/pos_array.rb +32 -0
- data/lib/nlp_backpack/stop_words.rb +17 -0
- data/lib/nlp_backpack/stop_words/stop_words.txt +429 -0
- data/lib/nlp_backpack/tokenizers/custom.rb +13 -0
- data/lib/nlp_backpack/tokenizers/line.rb +13 -0
- data/lib/nlp_backpack/tokenizers/space.rb +13 -0
- data/lib/nlp_backpack/tokenizers/tab.rb +13 -0
- data/lib/nlp_backpack/tokenizers/whitespace.rb +13 -0
- data/lib/nlp_backpack/tokenizers/word.rb +13 -0
- data/nlp_backpack.gemspec +109 -0
- data/spec/chunkers/regex_chunker_spec.rb +46 -0
- data/spec/chunkers/tag_pattern_spec.rb +40 -0
- data/spec/classifiers/naive_bayes_spec.rb +68 -0
- data/spec/evaluation/accuracy_spec.rb +29 -0
- data/spec/evaluation/confusion_matrix_spec.rb +29 -0
- data/spec/frequency_distribution_spec.rb +53 -0
- data/spec/nlp_backpack_spec.rb +4 -0
- data/spec/pos/brill_tagger_spec.rb +24 -0
- data/spec/pos/pos_array_spec.rb +45 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +18 -0
- data/spec/stop_words_spec.rb +15 -0
- data/spec/test_saves/naive.nb +1 -0
- data/spec/tokenizers/custom_spec.rb +24 -0
- data/spec/tokenizers/line_spec.rb +15 -0
- data/spec/tokenizers/space_spec.rb +15 -0
- data/spec/tokenizers/tab_spec.rb +15 -0
- data/spec/tokenizers/whitespace_spec.rb +16 -0
- data/spec/tokenizers/word_spec.rb +15 -0
- metadata +141 -0
@@ -0,0 +1,109 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
# Gem specification for nlp_backpack 0.0.0: package metadata, the manifest of
# shipped files, the spec files, and the rspec dependency.
Gem::Specification.new do |s|
  s.name = %q{nlp_backpack}
  s.version = "0.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["reddavis"]
  s.date = %q{2010-06-21}
  s.description = %q{A backpack full of useful toys}
  s.email = %q{reddavis@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  # Every file packaged into the gem.
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/nlp_backpack.rb",
    "lib/nlp_backpack/chunker.rb",
    "lib/nlp_backpack/chunker/regex_chunker.rb",
    "lib/nlp_backpack/chunker/tag_pattern.rb",
    "lib/nlp_backpack/classifier.rb",
    "lib/nlp_backpack/classifier/base.rb",
    "lib/nlp_backpack/classifier/naive_bayes.rb",
    "lib/nlp_backpack/evaluation.rb",
    "lib/nlp_backpack/evaluation/accuracy.rb",
    "lib/nlp_backpack/evaluation/base.rb",
    "lib/nlp_backpack/evaluation/confusion_matrix.rb",
    "lib/nlp_backpack/frequency_distribution.rb",
    "lib/nlp_backpack/pos.rb",
    "lib/nlp_backpack/pos/brill_tagger.rb",
    "lib/nlp_backpack/pos/brill_tagger/lexicon.txt",
    "lib/nlp_backpack/pos/pos_array.rb",
    "lib/nlp_backpack/stop_words.rb",
    "lib/nlp_backpack/stop_words/stop_words.txt",
    "lib/nlp_backpack/tokenizers/custom.rb",
    "lib/nlp_backpack/tokenizers/line.rb",
    "lib/nlp_backpack/tokenizers/space.rb",
    "lib/nlp_backpack/tokenizers/tab.rb",
    "lib/nlp_backpack/tokenizers/whitespace.rb",
    "lib/nlp_backpack/tokenizers/word.rb",
    "nlp_backpack.gemspec",
    "spec/chunkers/regex_chunker_spec.rb",
    "spec/chunkers/tag_pattern_spec.rb",
    "spec/classifiers/naive_bayes_spec.rb",
    "spec/evaluation/accuracy_spec.rb",
    "spec/evaluation/confusion_matrix_spec.rb",
    "spec/frequency_distribution_spec.rb",
    "spec/nlp_backpack_spec.rb",
    "spec/pos/brill_tagger_spec.rb",
    "spec/pos/pos_array_spec.rb",
    "spec/spec.opts",
    "spec/spec_helper.rb",
    "spec/stop_words_spec.rb",
    "spec/test_saves/naive.nb",
    "spec/tokenizers/custom_spec.rb",
    "spec/tokenizers/line_spec.rb",
    "spec/tokenizers/space_spec.rb",
    "spec/tokenizers/tab_spec.rb",
    "spec/tokenizers/whitespace_spec.rb",
    "spec/tokenizers/word_spec.rb"
  ]
  s.homepage = %q{http://github.com/reddavis/NLP-Backpack}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.6}
  s.summary = %q{A backpack full of useful toys}
  s.test_files = [
    "spec/chunkers/regex_chunker_spec.rb",
    "spec/chunkers/tag_pattern_spec.rb",
    "spec/classifiers/naive_bayes_spec.rb",
    "spec/evaluation/accuracy_spec.rb",
    "spec/evaluation/confusion_matrix_spec.rb",
    "spec/frequency_distribution_spec.rb",
    "spec/nlp_backpack_spec.rb",
    "spec/pos/brill_tagger_spec.rb",
    "spec/pos/pos_array_spec.rb",
    "spec/spec_helper.rb",
    "spec/stop_words_spec.rb",
    "spec/tokenizers/custom_spec.rb",
    "spec/tokenizers/line_spec.rb",
    "spec/tokenizers/space_spec.rb",
    "spec/tokenizers/tab_spec.rb",
    "spec/tokenizers/whitespace_spec.rb",
    "spec/tokenizers/word_spec.rb"
  ]

  if s.respond_to? :specification_version then
    s.specification_version = 3

    # Gem::RubyGemsVersion no longer exists in current RubyGems, so prefer
    # Gem::VERSION and only fall back to the old constant when it is missing.
    installed_rubygems = defined?(Gem::VERSION) ? Gem::VERSION : Gem::RubyGemsVersion

    if Gem::Version.new(installed_rubygems) >= Gem::Version.new('1.2.0') then
      s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
    else
      # No development-dependency support: declare rspec as a plain dependency.
      s.add_dependency(%q<rspec>, [">= 1.2.9"])
    end
  else
    s.add_dependency(%q<rspec>, [">= 1.2.9"])
  end
end
|
109
|
+
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require 'nlp_backpack/pos/pos_array'
|
3
|
+
|
4
|
+
include NLPBackpack
|
5
|
+
|
6
|
+
# Specs for Chunker::RegexChunker#match run against a POS-tagged sentence.
describe Chunker::RegexChunker do
  describe "Matching specified chunks" do
    # Fill a fresh POSArray with every [word, tag] pair from +sentence+.
    before do
      @pos_a = POS::POSArray.new
      sentence.each { |word| @pos_a << word }
    end

    describe "Simple chunk" do
      it "should return 'this is some text'" do
        grammar = Chunker::RegexChunker.new("<DT><VBZ><DT><NN>")
        grammar.match(@pos_a).first.should == "this is some text"
      end
    end

    describe "Chunk with conditional tag" do
      # <VBZ>? makes the verb tag optional.
      it "should return 'this is some' and 'this some' for an optional (?) tag" do
        grammar = Chunker::RegexChunker.new("<DT><VBZ>?<DT>")
        grammar.match(@pos_a)[0].should == "this is some"
        grammar.match(@pos_a)[1].should == "this some"
      end

      # <VBZ>* is expected to yield the same two chunks as <VBZ>? here.
      it "should return 'this is some' and 'this some' for a repeated (*) tag" do
        grammar = Chunker::RegexChunker.new("<DT><VBZ>*<DT>")
        grammar.match(@pos_a)[0].should == "this is some"
        grammar.match(@pos_a)[1].should == "this some"
      end
    end

    describe "Chunk with tag regex" do
      # The first tag is itself a regex (D\w) rather than a literal tag name.
      it "should return 'this is some' and 'this some' for a regex tag" do
        grammar = Chunker::RegexChunker.new('<D\w><VBZ>?<DT>')
        grammar.match(@pos_a)[0].should == "this is some"
        grammar.match(@pos_a)[1].should == "this some"
      end
    end
  end

  # Fixture: the tagged sentence as an array of [word, POS tag] pairs.
  def sentence
    [["this", "DT"], ["is", "VBZ"], ["some", "DT"], ["text", "NN"], ["text", "NN"], ["that", "IN"], ["I", "PRP"], ["want", "VBP"], ["analyzing", "VBG"], ["this", "DT"], ["some", "DT"]]
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require 'nlp_backpack/chunker/tag_pattern'
|
3
|
+
|
4
|
+
include NLPBackpack
|
5
|
+
|
6
|
+
# Specs for Chunker::TagPattern: how a "<...>" pattern string is exposed
# through #tag (a Regexp) and #conditions (the trailing quantifier).
describe Chunker::TagPattern do
  describe "Tag" do
    before { @tp = Chunker::TagPattern.new("<NN>") }

    it "should return the tag as NN" do
      @tp.tag.should == /NN/
    end
  end

  describe "Tag with condition" do
    # The quantifier sits inside the angle brackets here.
    before { @tp = Chunker::TagPattern.new("<NN*>") }

    it "should return the tag as NN*" do
      @tp.tag.should == /NN*/
    end
  end

  describe "Tag with external condition" do
    # The quantifier follows the closing bracket here.
    before { @tp = Chunker::TagPattern.new("<NN.+>?") }

    it "should return the tag as NN" do
      @tp.tag.should == /NN.+/
    end

    it "should return conditions as ?" do
      @tp.conditions.should == "?"
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
include NLPBackpack::Classifier
|
4
|
+
|
5
|
+
# Specs for the NaiveBayes classifier: classification, saving to disk and
# loading a previously saved classifier.
describe NaiveBayes do
  describe "Classification" do
    before do
      @classifier = create_and_train_classifier
    end

    it "should classify as spam with a score of 0.5" do
      a = @classifier.classify('bad', 'word')
      a[0].should == :spam
      a[1].should == 0.5
    end
  end

  describe "Saving the NB" do
    describe "DB filepath has been set" do
      before do
        @classifier = NaiveBayes.new(:spam, :ham)
        @classifier.db_filepath = db_filepath
      end

      it "should save to the filepath provided" do
        # Remove any earlier save so the check proves #save wrote the file.
        FileUtils.rm(db_filepath, :force => true)
        @classifier.save
        # File.exist? rather than File.exists?, which Ruby 3.2 removed.
        File.exist?(db_filepath).should be_true
      end
    end

    describe "DB filepath has no been set" do
      it "should raise an error" do
        lambda do
          NaiveBayes.new(:spam, :ham).save
        end.should raise_error
      end
    end
  end

  describe "Load" do
    # Train and persist a classifier so each example can load it from disk.
    before do
      classifier = NaiveBayes.new(:spam, :ham)
      classifier.db_filepath = db_filepath
      classifier.train(:spam, 'bad', 'word')
      classifier.train(:ham, 'we', 'bad')
      classifier.save
    end

    it "should return 0.5" do
      classifier = NaiveBayes.load(db_filepath)
      classifier.classify('bad', 'word')[1].should == 0.5
    end
  end

  private

  # Returns a NaiveBayes over :spam/:ham trained with one sample per category.
  def create_and_train_classifier
    a = NaiveBayes.new(:spam, :ham)
    a.train(:spam, 'bad', 'word')
    a.train(:ham, 'we', 'bad')
    a
  end

  # Absolute path of the save file used by these specs
  # (../test_saves/naive.nb relative to this spec's directory).
  def db_filepath
    File.expand_path(File.dirname(__FILE__) + '/../test_saves/naive.nb')
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
include NLPBackpack::Evaluation
|
4
|
+
|
5
|
+
# Specs for Evaluation::Accuracy: per-element accuracy and the #inspect report.
describe Accuracy do
  # One shared instance for all examples, built from two four-element result
  # lists that differ only in their last element.
  before(:all) { @accuracy = Accuracy.new([1,1,2,2], [1,1,2,1]) }

  describe "Specific element" do
    it "should return 100" do
      @accuracy.accuracy_of(1).should == 100
    end

    it "should return 50" do
      @accuracy.accuracy_of(2).should == 50
    end
  end

  describe "Inspect" do
    it "should match 100%" do
      @accuracy.inspect.should match(/100%/)
    end

    it "should match 50%" do
      @accuracy.inspect.should match(/50%/)
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
include NLPBackpack::Evaluation
|
4
|
+
|
5
|
+
# Specs for Evaluation::ConfusionMatrix: single-cell lookups and #inspect.
describe ConfusionMatrix do
  # NOTE(review): correct_results and test_results are not defined in this
  # file; presumably they come from spec_helper -- TODO confirm.
  before(:all) { @cm = ConfusionMatrix.new(correct_results, test_results) }

  describe "Specific element" do
    it "should return 90%" do
      @cm.results_for(1, 1).should == "90%"
    end

    it "should return 10%" do
      @cm.results_for(2, 3).should == "10%"
    end
  end

  describe "Inspect" do
    it "should match <90%>" do
      @cm.inspect.should match(/<90%>/)
    end

    it "should match 10%" do
      @cm.inspect.should match(/10%/)
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
include NLPBackpack
|
4
|
+
|
5
|
+
# Specs for FrequencyDistribution: counts of the events "happy" and "sad"
# under the :happy and :sad conditions, plus the tabulated output.
describe FrequencyDistribution do
  before do
    texts_by_condition = {:happy => happy_text, :sad => sad_text}
    @cfd = FrequencyDistribution.new(texts_by_condition, "happy", "sad")
    @results = @cfd.process
  end

  it "should return a hash" do
    @results.should be_a(Hash)
  end

  describe "Happy condition" do
    it "should return 2 for happy" do
      @results[:happy]["happy"].should == 2
    end

    it "should return 0 for sad" do
      @results[:happy]["sad"].should == 0
    end
  end

  describe "Sad condition" do
    it "should return 0 for happy" do
      @results[:sad]["happy"].should == 0
    end

    it "should return 2 for sad" do
      @results[:sad]["sad"].should == 2
    end
  end

  describe "Tabulation" do
    it "should include the events" do
      tabulated = @cfd.to_tabulation
      tabulated.should match(/happy/)
      tabulated.should match(/sad/)
    end
  end

  private

  # Word list for the :happy condition; contains "happy" twice.
  def happy_text
    %w[when happy things happen it makes me happy]
  end

  # Word list for the :sad condition; contains "sad" twice.
  def sad_text
    %w[when sad things happen it makes me sad]
  end
end
|