nlp_backpack 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. data/.document +5 -0
  2. data/.gitignore +21 -0
  3. data/LICENSE +20 -0
  4. data/README.rdoc +22 -0
  5. data/Rakefile +45 -0
  6. data/VERSION +1 -0
  7. data/lib/nlp_backpack.rb +10 -0
  8. data/lib/nlp_backpack/chunker.rb +5 -0
  9. data/lib/nlp_backpack/chunker/regex_chunker.rb +107 -0
  10. data/lib/nlp_backpack/chunker/tag_pattern.rb +31 -0
  11. data/lib/nlp_backpack/classifier.rb +5 -0
  12. data/lib/nlp_backpack/classifier/base.rb +28 -0
  13. data/lib/nlp_backpack/classifier/naive_bayes.rb +83 -0
  14. data/lib/nlp_backpack/evaluation.rb +6 -0
  15. data/lib/nlp_backpack/evaluation/accuracy.rb +46 -0
  16. data/lib/nlp_backpack/evaluation/base.rb +12 -0
  17. data/lib/nlp_backpack/evaluation/confusion_matrix.rb +66 -0
  18. data/lib/nlp_backpack/frequency_distribution.rb +47 -0
  19. data/lib/nlp_backpack/pos.rb +5 -0
  20. data/lib/nlp_backpack/pos/brill_tagger.rb +142 -0
  21. data/lib/nlp_backpack/pos/brill_tagger/lexicon.txt +93696 -0
  22. data/lib/nlp_backpack/pos/pos_array.rb +32 -0
  23. data/lib/nlp_backpack/stop_words.rb +17 -0
  24. data/lib/nlp_backpack/stop_words/stop_words.txt +429 -0
  25. data/lib/nlp_backpack/tokenizers/custom.rb +13 -0
  26. data/lib/nlp_backpack/tokenizers/line.rb +13 -0
  27. data/lib/nlp_backpack/tokenizers/space.rb +13 -0
  28. data/lib/nlp_backpack/tokenizers/tab.rb +13 -0
  29. data/lib/nlp_backpack/tokenizers/whitespace.rb +13 -0
  30. data/lib/nlp_backpack/tokenizers/word.rb +13 -0
  31. data/nlp_backpack.gemspec +109 -0
  32. data/spec/chunkers/regex_chunker_spec.rb +46 -0
  33. data/spec/chunkers/tag_pattern_spec.rb +40 -0
  34. data/spec/classifiers/naive_bayes_spec.rb +68 -0
  35. data/spec/evaluation/accuracy_spec.rb +29 -0
  36. data/spec/evaluation/confusion_matrix_spec.rb +29 -0
  37. data/spec/frequency_distribution_spec.rb +53 -0
  38. data/spec/nlp_backpack_spec.rb +4 -0
  39. data/spec/pos/brill_tagger_spec.rb +24 -0
  40. data/spec/pos/pos_array_spec.rb +45 -0
  41. data/spec/spec.opts +1 -0
  42. data/spec/spec_helper.rb +18 -0
  43. data/spec/stop_words_spec.rb +15 -0
  44. data/spec/test_saves/naive.nb +1 -0
  45. data/spec/tokenizers/custom_spec.rb +24 -0
  46. data/spec/tokenizers/line_spec.rb +15 -0
  47. data/spec/tokenizers/space_spec.rb +15 -0
  48. data/spec/tokenizers/tab_spec.rb +15 -0
  49. data/spec/tokenizers/whitespace_spec.rb +16 -0
  50. data/spec/tokenizers/word_spec.rb +15 -0
  51. metadata +141 -0
@@ -0,0 +1,13 @@
module NLPBackpack
  module Tokenizer
    # Tokenizer that splits text on a delimiter chosen by the caller.
    class Custom
      # Splits +string+ by handing +spliter+ straight to String#split.
      #
      # string  - the text to tokenize.
      # spliter - the delimiter passed to String#split.
      #
      # Returns the Array of tokens produced by String#split.
      def self.tokenize(string, spliter)
        string.split(spliter)
      end
    end
  end
end
@@ -0,0 +1,13 @@
module NLPBackpack
  module Tokenizer
    # Tokenizer that splits text into lines.
    class Line
      class << self
        # Splits +string+ at every line break.
        #
        # Both "\n" and "\r\n" are treated as a line break, so text with
        # Windows line endings does not leave a trailing "\r" on each token.
        # Input that only contains "\n" is tokenized exactly as before.
        #
        # string - the text to tokenize.
        #
        # Returns an Array with one String per line (trailing empty lines are
        # dropped by String#split).
        def tokenize(string)
          string.split(/\r?\n/)
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
module NLPBackpack
  module Tokenizer
    # Tokenizer that splits text on the space character.
    class Space
      class << self
        # Splits +string+ at every single space character.
        #
        # The delimiter is a literal space rather than \s: \s also matches
        # tabs and newlines, which made this tokenizer overlap with the Tab,
        # Line and Whitespace tokenizers. A Regexp is used because
        # String#split treats the String " " as "split on any whitespace".
        #
        # string - the text to tokenize.
        #
        # Returns an Array of tokens; consecutive spaces yield empty tokens.
        def tokenize(string)
          string.split(/ /)
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
module NLPBackpack
  module Tokenizer
    # Tokenizer that splits text on tab characters.
    class Tab
      # Splits +string+ at every single tab character.
      #
      # string - the text to tokenize.
      #
      # Returns an Array of tokens; consecutive tabs yield empty tokens.
      def self.tokenize(string)
        string.split("\t")
      end
    end
  end
end
@@ -0,0 +1,13 @@
module NLPBackpack
  module Tokenizer
    # Tokenizer built on String#split's default (argument-less) behaviour.
    class Whitespace
      # Splits +string+ by calling String#split with no arguments.
      #
      # string - the text to tokenize.
      #
      # Returns the Array of tokens produced by String#split.
      def self.tokenize(string)
        string.split
      end
    end
  end
end
@@ -0,0 +1,13 @@
module NLPBackpack
  module Tokenizer
    # Tokenizer that splits text into words, dropping the punctuation that
    # trails each word.
    class Word
      class << self
        # Splits +string+ at whitespace, swallowing any non-word characters
        # that sit directly in front of the break.
        #
        # The delimiter also matches at the end of the string (\z), so the
        # final word loses its trailing punctuation just like every other
        # word does ("Hello, world." => ["Hello", "world"]). Punctuation
        # inside a word (for example an apostrophe) is left untouched.
        #
        # string - the text to tokenize.
        #
        # Returns an Array of word Strings.
        def tokenize(string)
          string.split(/\W*(?:\s|\z)/)
        end
      end
    end
  end
end
@@ -0,0 +1,109 @@
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
# -*- encoding: utf-8 -*-

# Gem specification for nlp_backpack 0.0.0: metadata, packaged files, test
# files and the rspec dependency.
Gem::Specification.new do |s|
  s.name = %q{nlp_backpack}
  s.version = "0.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["reddavis"]
  s.date = %q{2010-06-21}
  s.description = %q{A backpack full of useful toys}
  s.email = %q{reddavis@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/nlp_backpack.rb",
    "lib/nlp_backpack/chunker.rb",
    "lib/nlp_backpack/chunker/regex_chunker.rb",
    "lib/nlp_backpack/chunker/tag_pattern.rb",
    "lib/nlp_backpack/classifier.rb",
    "lib/nlp_backpack/classifier/base.rb",
    "lib/nlp_backpack/classifier/naive_bayes.rb",
    "lib/nlp_backpack/evaluation.rb",
    "lib/nlp_backpack/evaluation/accuracy.rb",
    "lib/nlp_backpack/evaluation/base.rb",
    "lib/nlp_backpack/evaluation/confusion_matrix.rb",
    "lib/nlp_backpack/frequency_distribution.rb",
    "lib/nlp_backpack/pos.rb",
    "lib/nlp_backpack/pos/brill_tagger.rb",
    "lib/nlp_backpack/pos/brill_tagger/lexicon.txt",
    "lib/nlp_backpack/pos/pos_array.rb",
    "lib/nlp_backpack/stop_words.rb",
    "lib/nlp_backpack/stop_words/stop_words.txt",
    "lib/nlp_backpack/tokenizers/custom.rb",
    "lib/nlp_backpack/tokenizers/line.rb",
    "lib/nlp_backpack/tokenizers/space.rb",
    "lib/nlp_backpack/tokenizers/tab.rb",
    "lib/nlp_backpack/tokenizers/whitespace.rb",
    "lib/nlp_backpack/tokenizers/word.rb",
    "nlp_backpack.gemspec",
    "spec/chunkers/regex_chunker_spec.rb",
    "spec/chunkers/tag_pattern_spec.rb",
    "spec/classifiers/naive_bayes_spec.rb",
    "spec/evaluation/accuracy_spec.rb",
    "spec/evaluation/confusion_matrix_spec.rb",
    "spec/frequency_distribution_spec.rb",
    "spec/nlp_backpack_spec.rb",
    "spec/pos/brill_tagger_spec.rb",
    "spec/pos/pos_array_spec.rb",
    "spec/spec.opts",
    "spec/spec_helper.rb",
    "spec/stop_words_spec.rb",
    "spec/test_saves/naive.nb",
    "spec/tokenizers/custom_spec.rb",
    "spec/tokenizers/line_spec.rb",
    "spec/tokenizers/space_spec.rb",
    "spec/tokenizers/tab_spec.rb",
    "spec/tokenizers/whitespace_spec.rb",
    "spec/tokenizers/word_spec.rb"
  ]
  s.homepage = %q{http://github.com/reddavis/NLP-Backpack}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.6}
  s.summary = %q{A backpack full of useful toys}
  s.test_files = [
    "spec/chunkers/regex_chunker_spec.rb",
    "spec/chunkers/tag_pattern_spec.rb",
    "spec/classifiers/naive_bayes_spec.rb",
    "spec/evaluation/accuracy_spec.rb",
    "spec/evaluation/confusion_matrix_spec.rb",
    "spec/frequency_distribution_spec.rb",
    "spec/nlp_backpack_spec.rb",
    "spec/pos/brill_tagger_spec.rb",
    "spec/pos/pos_array_spec.rb",
    "spec/spec_helper.rb",
    "spec/stop_words_spec.rb",
    "spec/tokenizers/custom_spec.rb",
    "spec/tokenizers/line_spec.rb",
    "spec/tokenizers/space_spec.rb",
    "spec/tokenizers/tab_spec.rb",
    "spec/tokenizers/whitespace_spec.rb",
    "spec/tokenizers/word_spec.rb"
  ]

  if s.respond_to? :specification_version then
    s.specification_version = 3

    # Gem::RubyGemsVersion is a legacy constant; prefer Gem::VERSION when it
    # is defined so this file still loads where the legacy name is missing,
    # and fall back to the legacy name otherwise.
    running_rubygems = defined?(Gem::VERSION) ? Gem::VERSION : Gem::RubyGemsVersion

    # Development dependencies are only declared as such on RubyGems >= 1.2.0;
    # older versions get a regular dependency instead.
    if Gem::Version.new(running_rubygems) >= Gem::Version.new('1.2.0') then
      s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
    else
      s.add_dependency(%q<rspec>, [">= 1.2.9"])
    end
  else
    s.add_dependency(%q<rspec>, [">= 1.2.9"])
  end
end

@@ -0,0 +1,46 @@
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
require 'nlp_backpack/pos/pos_array'

include NLPBackpack

# Exercises Chunker::RegexChunker#match against a tagged sentence that is
# loaded into a POS::POSArray from the +sentence+ helper below.
describe Chunker::RegexChunker do
  describe "Matching specified chunks" do
    before do
      # Rebuild the tagged sentence before every example.
      @pos_a = POS::POSArray.new
      sentence.each { |word| @pos_a << word }
    end

    describe "Simple chunk" do
      it "should return 'this is some text'" do
        grammar = Chunker::RegexChunker.new("<DT><VBZ><DT><NN>")
        grammar.match(@pos_a).first.should == "this is some text"
      end
    end

    describe "Chunk with conditional tag" do
      it "should return 'this is some' and 'this some' when <VBZ> is followed by ?" do
        grammar = Chunker::RegexChunker.new("<DT><VBZ>?<DT>")
        grammar.match(@pos_a)[0].should == "this is some"
        grammar.match(@pos_a)[1].should == "this some"
      end

      it "should return 'this is some' and 'this some' when <VBZ> is followed by *" do
        grammar = Chunker::RegexChunker.new("<DT><VBZ>*<DT>")
        grammar.match(@pos_a)[0].should == "this is some"
        grammar.match(@pos_a)[1].should == "this some"
      end
    end

    describe "Chunk with tag regex" do
      it "should return 'this is some' and 'this some' for a regex tag" do
        grammar = Chunker::RegexChunker.new('<D\w><VBZ>?<DT>')
        grammar.match(@pos_a)[0].should == "this is some"
        grammar.match(@pos_a)[1].should == "this some"
      end
    end
  end

  # Word/tag pairs used as the fixture sentence for every example.
  def sentence
    [["this", "DT"], ["is", "VBZ"], ["some", "DT"], ["text", "NN"], ["text", "NN"], ["that", "IN"], ["I", "PRP"], ["want", "VBP"], ["analyzing", "VBG"], ["this", "DT"], ["some", "DT"]]
  end
end
@@ -0,0 +1,40 @@
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
require 'nlp_backpack/chunker/tag_pattern'

include NLPBackpack

# Checks what Chunker::TagPattern exposes through #tag and #conditions for
# tag strings with and without a condition.
describe Chunker::TagPattern do
  describe "Tag" do
    before do
      @tp = Chunker::TagPattern.new("<NN>")
    end

    it "should return the tag as NN" do
      @tp.tag.should == /NN/
    end
  end

  describe "Tag with condition" do
    before do
      # The condition sits inside the angle brackets here.
      @tp = Chunker::TagPattern.new("<NN*>")
    end

    it "should return the tag as NN*" do
      @tp.tag.should == /NN*/
    end
  end

  describe "Tag with external condition" do
    before do
      # The trailing ? sits outside the angle brackets here.
      @tp = Chunker::TagPattern.new("<NN.+>?")
    end

    it "should return the tag as NN.+" do
      @tp.tag.should == /NN.+/
    end

    it "should return conditions as ?" do
      @tp.conditions.should == "?"
    end
  end
end
@@ -0,0 +1,68 @@
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
require 'fileutils' # FileUtils.rm is used below; do not rely on spec_helper loading it

include NLPBackpack::Classifier

# Covers classification, saving and loading of the NaiveBayes classifier.
describe NaiveBayes do
  describe "Classification" do
    before do
      @classifier = create_and_train_classifier
    end

    it "should classify as spam with a score of 0.5" do
      a = @classifier.classify('bad', 'word')
      a[0].should == :spam
      a[1].should == 0.5
    end
  end

  describe "Saving the NB" do
    describe "DB filepath has been set" do
      before do
        @classifier = NaiveBayes.new(:spam, :ham)
        @classifier.db_filepath = db_filepath
      end

      it "should save to the filepath provided" do
        # Remove any file left by an earlier run so the check below proves
        # that #save created it.
        FileUtils.rm(db_filepath, :force => true)
        @classifier.save
        # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
        File.exist?(db_filepath).should be_true
      end
    end

    describe "DB filepath has not been set" do
      it "should raise an error" do
        lambda do
          NaiveBayes.new(:spam, :ham).save
        end.should raise_error
      end
    end
  end

  describe "Load" do
    before do
      # Train and persist a classifier so the example can load it from disk.
      classifier = NaiveBayes.new(:spam, :ham)
      classifier.db_filepath = db_filepath
      classifier.train(:spam, 'bad', 'word')
      classifier.train(:ham, 'we', 'bad')
      classifier.save
    end

    it "should return 0.5" do
      classifier = NaiveBayes.load(db_filepath)
      classifier.classify('bad', 'word')[1].should == 0.5
    end
  end

  private

  # Builds a classifier trained with one spam and one ham sample.
  def create_and_train_classifier
    a = NaiveBayes.new(:spam, :ham)
    a.train(:spam, 'bad', 'word')
    a.train(:ham, 'we', 'bad')
    a
  end

  # Absolute path of the file the save/load examples write to and read from.
  def db_filepath
    File.expand_path(File.dirname(__FILE__) + '/../test_saves/naive.nb')
  end
end
@@ -0,0 +1,29 @@
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')

include NLPBackpack::Evaluation

# Checks Accuracy#accuracy_of and Accuracy#inspect for one fixed pair of
# result arrays.
describe Accuracy do
  # NOTE(review): which argument is the expected set and which is the
  # observed set is not visible from this spec - confirm against Accuracy.
  before(:all) { @accuracy = Accuracy.new([1,1,2,2], [1,1,2,1]) }

  describe "Specific element" do
    it("should return 100") { @accuracy.accuracy_of(1).should == 100 }

    it("should return 50") { @accuracy.accuracy_of(2).should == 50 }
  end

  describe "Inspect" do
    it("should match 100%") { @accuracy.inspect.should match(/100%/) }

    it("should match 50%") { @accuracy.inspect.should match(/50%/) }
  end
end
@@ -0,0 +1,29 @@
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')

include NLPBackpack::Evaluation

# Checks ConfusionMatrix#results_for and ConfusionMatrix#inspect.
describe ConfusionMatrix do
  # NOTE(review): correct_results and test_results are not defined in this
  # file - presumably they come from spec_helper; verify before relying on it.
  before(:all) { @cm = ConfusionMatrix.new(correct_results, test_results) }

  describe "Specific element" do
    it("should return 90%") { @cm.results_for(1, 1).should == "90%" }

    it("should return 10%") { @cm.results_for(2, 3).should == "10%" }
  end

  describe "Inspect" do
    it("should match <90%>") { @cm.inspect.should match(/<90%>/) }

    it("should match 10%") { @cm.inspect.should match(/10%/) }
  end
end
@@ -0,0 +1,53 @@
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')

include NLPBackpack

# Checks the per-condition counts returned by FrequencyDistribution#process
# and the text returned by #to_tabulation for two small word lists.
describe FrequencyDistribution do
  before do
    @cfd = FrequencyDistribution.new({:happy => happy_text, :sad => sad_text}, "happy", "sad")
    @results = @cfd.process
  end

  it("should return a hash") { @results.should be_a(Hash) }

  describe "Happy condition" do
    it("should return 2 for happy") { @results[:happy]["happy"].should == 2 }

    it("should return 0 for sad") { @results[:happy]["sad"].should == 0 }
  end

  describe "Sad condition" do
    it("should return 0 for happy") { @results[:sad]["happy"].should == 0 }

    it("should return 2 for sad") { @results[:sad]["sad"].should == 2 }
  end

  describe "Tabulation" do
    it "should include the events" do
      rendered = @cfd.to_tabulation
      [/happy/, /sad/].each { |event| rendered.should match(event) }
    end
  end

  private

  # Word list for the :happy condition ("happy" appears twice).
  def happy_text
    %w[when happy things happen it makes me happy]
  end

  # Word list for the :sad condition ("sad" appears twice).
  def sad_text
    %w[when sad things happen it makes me sad]
  end
end