nlp_backpack 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. data/.document +5 -0
  2. data/.gitignore +21 -0
  3. data/LICENSE +20 -0
  4. data/README.rdoc +22 -0
  5. data/Rakefile +45 -0
  6. data/VERSION +1 -0
  7. data/lib/nlp_backpack.rb +10 -0
  8. data/lib/nlp_backpack/chunker.rb +5 -0
  9. data/lib/nlp_backpack/chunker/regex_chunker.rb +107 -0
  10. data/lib/nlp_backpack/chunker/tag_pattern.rb +31 -0
  11. data/lib/nlp_backpack/classifier.rb +5 -0
  12. data/lib/nlp_backpack/classifier/base.rb +28 -0
  13. data/lib/nlp_backpack/classifier/naive_bayes.rb +83 -0
  14. data/lib/nlp_backpack/evaluation.rb +6 -0
  15. data/lib/nlp_backpack/evaluation/accuracy.rb +46 -0
  16. data/lib/nlp_backpack/evaluation/base.rb +12 -0
  17. data/lib/nlp_backpack/evaluation/confusion_matrix.rb +66 -0
  18. data/lib/nlp_backpack/frequency_distribution.rb +47 -0
  19. data/lib/nlp_backpack/pos.rb +5 -0
  20. data/lib/nlp_backpack/pos/brill_tagger.rb +142 -0
  21. data/lib/nlp_backpack/pos/brill_tagger/lexicon.txt +93696 -0
  22. data/lib/nlp_backpack/pos/pos_array.rb +32 -0
  23. data/lib/nlp_backpack/stop_words.rb +17 -0
  24. data/lib/nlp_backpack/stop_words/stop_words.txt +429 -0
  25. data/lib/nlp_backpack/tokenizers/custom.rb +13 -0
  26. data/lib/nlp_backpack/tokenizers/line.rb +13 -0
  27. data/lib/nlp_backpack/tokenizers/space.rb +13 -0
  28. data/lib/nlp_backpack/tokenizers/tab.rb +13 -0
  29. data/lib/nlp_backpack/tokenizers/whitespace.rb +13 -0
  30. data/lib/nlp_backpack/tokenizers/word.rb +13 -0
  31. data/nlp_backpack.gemspec +109 -0
  32. data/spec/chunkers/regex_chunker_spec.rb +46 -0
  33. data/spec/chunkers/tag_pattern_spec.rb +40 -0
  34. data/spec/classifiers/naive_bayes_spec.rb +68 -0
  35. data/spec/evaluation/accuracy_spec.rb +29 -0
  36. data/spec/evaluation/confusion_matrix_spec.rb +29 -0
  37. data/spec/frequency_distribution_spec.rb +53 -0
  38. data/spec/nlp_backpack_spec.rb +4 -0
  39. data/spec/pos/brill_tagger_spec.rb +24 -0
  40. data/spec/pos/pos_array_spec.rb +45 -0
  41. data/spec/spec.opts +1 -0
  42. data/spec/spec_helper.rb +18 -0
  43. data/spec/stop_words_spec.rb +15 -0
  44. data/spec/test_saves/naive.nb +1 -0
  45. data/spec/tokenizers/custom_spec.rb +24 -0
  46. data/spec/tokenizers/line_spec.rb +15 -0
  47. data/spec/tokenizers/space_spec.rb +15 -0
  48. data/spec/tokenizers/tab_spec.rb +15 -0
  49. data/spec/tokenizers/whitespace_spec.rb +16 -0
  50. data/spec/tokenizers/word_spec.rb +15 -0
  51. metadata +141 -0
@@ -0,0 +1,13 @@
module NLPBackpack
  module Tokenizer
    # Tokenizer whose delimiter is chosen by the caller.
    class Custom
      # Splits +string+ on the given delimiter.
      #
      # string  - the String to break apart.
      # spliter - the delimiter, passed unchanged to String#split.
      #
      # Returns the Array of tokens produced by String#split.
      def self.tokenize(string, spliter)
        delimiter = spliter
        string.split(delimiter)
      end
    end
  end
end
@@ -0,0 +1,13 @@
module NLPBackpack

  module Tokenizer
    # Tokenizer that breaks text into lines.
    class Line
      class << self
        # Splits +string+ into one token per line.
        #
        # Every line break yields a token boundary, so consecutive breaks
        # produce empty tokens between them. Both "\n" and "\r\n" are
        # treated as a single line break, so CRLF text does not leave a
        # trailing "\r" on each token.
        #
        # string - the String to break apart.
        #
        # Returns an Array of Strings without their line terminators.
        def tokenize(string)
          string.split(/\r?\n/)
        end
      end
    end
  end

end
@@ -0,0 +1,13 @@
module NLPBackpack

  module Tokenizer
    # Tokenizer that breaks text on single space characters.
    class Space
      class << self
        # Splits +string+ on every individual space character (" ").
        #
        # Only the literal space is a delimiter; tabs and newlines stay inside
        # the tokens. Consecutive spaces produce empty tokens between them.
        #
        # string - the String to break apart.
        #
        # Returns an Array of Strings.
        def tokenize(string)
          # A Regexp is required here: String#split(" ") is the special
          # "awk" form that splits on runs of any whitespace.
          string.split(/ /)
        end
      end
    end
  end

end
@@ -0,0 +1,13 @@
module NLPBackpack
  module Tokenizer
    # Tokenizer that breaks text on tab characters.
    class Tab
      # Splits +string+ on every individual tab character.
      #
      # Consecutive tabs produce empty tokens between them.
      #
      # string - the String to break apart.
      #
      # Returns an Array of Strings.
      def self.tokenize(string)
        string.split("\t")
      end
    end
  end
end
@@ -0,0 +1,13 @@
module NLPBackpack
  module Tokenizer
    # Tokenizer built on String#split called without arguments.
    class Whitespace
      # Splits +string+ using String#split's default behaviour.
      #
      # string - the String to break apart.
      #
      # Returns an Array of Strings.
      def self.tokenize(string)
        tokens = string.split
        tokens
      end
    end
  end
end
@@ -0,0 +1,13 @@
module NLPBackpack
  module Tokenizer
    # Tokenizer that breaks text into words.
    class Word
      # Splits +string+ wherever a whitespace character occurs, also
      # discarding any run of non-word characters directly in front of it.
      #
      # Punctuation that is not followed by whitespace (for instance at the
      # very end of the string) is left attached to its token.
      #
      # string - the String to break apart.
      #
      # Returns an Array of Strings.
      def self.tokenize(string)
        boundary = /\W*\s/
        string.split(boundary)
      end
    end
  end
end
@@ -0,0 +1,109 @@
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
# -*- encoding: utf-8 -*-

# Gem specification for nlp_backpack 0.0.0: package metadata, the list of
# shipped files, the spec files, and the rspec dependency.
Gem::Specification.new do |s|
  s.name = %q{nlp_backpack}
  s.version = "0.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["reddavis"]
  s.date = %q{2010-06-21}
  s.description = %q{A backpack full of useful toys}
  s.email = %q{reddavis@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/nlp_backpack.rb",
    "lib/nlp_backpack/chunker.rb",
    "lib/nlp_backpack/chunker/regex_chunker.rb",
    "lib/nlp_backpack/chunker/tag_pattern.rb",
    "lib/nlp_backpack/classifier.rb",
    "lib/nlp_backpack/classifier/base.rb",
    "lib/nlp_backpack/classifier/naive_bayes.rb",
    "lib/nlp_backpack/evaluation.rb",
    "lib/nlp_backpack/evaluation/accuracy.rb",
    "lib/nlp_backpack/evaluation/base.rb",
    "lib/nlp_backpack/evaluation/confusion_matrix.rb",
    "lib/nlp_backpack/frequency_distribution.rb",
    "lib/nlp_backpack/pos.rb",
    "lib/nlp_backpack/pos/brill_tagger.rb",
    "lib/nlp_backpack/pos/brill_tagger/lexicon.txt",
    "lib/nlp_backpack/pos/pos_array.rb",
    "lib/nlp_backpack/stop_words.rb",
    "lib/nlp_backpack/stop_words/stop_words.txt",
    "lib/nlp_backpack/tokenizers/custom.rb",
    "lib/nlp_backpack/tokenizers/line.rb",
    "lib/nlp_backpack/tokenizers/space.rb",
    "lib/nlp_backpack/tokenizers/tab.rb",
    "lib/nlp_backpack/tokenizers/whitespace.rb",
    "lib/nlp_backpack/tokenizers/word.rb",
    "nlp_backpack.gemspec",
    "spec/chunkers/regex_chunker_spec.rb",
    "spec/chunkers/tag_pattern_spec.rb",
    "spec/classifiers/naive_bayes_spec.rb",
    "spec/evaluation/accuracy_spec.rb",
    "spec/evaluation/confusion_matrix_spec.rb",
    "spec/frequency_distribution_spec.rb",
    "spec/nlp_backpack_spec.rb",
    "spec/pos/brill_tagger_spec.rb",
    "spec/pos/pos_array_spec.rb",
    "spec/spec.opts",
    "spec/spec_helper.rb",
    "spec/stop_words_spec.rb",
    "spec/test_saves/naive.nb",
    "spec/tokenizers/custom_spec.rb",
    "spec/tokenizers/line_spec.rb",
    "spec/tokenizers/space_spec.rb",
    "spec/tokenizers/tab_spec.rb",
    "spec/tokenizers/whitespace_spec.rb",
    "spec/tokenizers/word_spec.rb"
  ]
  s.homepage = %q{http://github.com/reddavis/NLP-Backpack}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.6}
  s.summary = %q{A backpack full of useful toys}
  s.test_files = [
    "spec/chunkers/regex_chunker_spec.rb",
    "spec/chunkers/tag_pattern_spec.rb",
    "spec/classifiers/naive_bayes_spec.rb",
    "spec/evaluation/accuracy_spec.rb",
    "spec/evaluation/confusion_matrix_spec.rb",
    "spec/frequency_distribution_spec.rb",
    "spec/nlp_backpack_spec.rb",
    "spec/pos/brill_tagger_spec.rb",
    "spec/pos/pos_array_spec.rb",
    "spec/spec_helper.rb",
    "spec/stop_words_spec.rb",
    "spec/tokenizers/custom_spec.rb",
    "spec/tokenizers/line_spec.rb",
    "spec/tokenizers/space_spec.rb",
    "spec/tokenizers/tab_spec.rb",
    "spec/tokenizers/whitespace_spec.rb",
    "spec/tokenizers/word_spec.rb"
  ]

  if s.respond_to? :specification_version then
    s.specification_version = 3

    # Gem::VERSION is used instead of the legacy Gem::RubyGemsVersion
    # constant, which newer RubyGems releases no longer define; referencing
    # it there aborts loading of this file with a NameError.
    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
    else
      s.add_dependency(%q<rspec>, [">= 1.2.9"])
    end
  else
    s.add_dependency(%q<rspec>, [">= 1.2.9"])
  end
end
@@ -0,0 +1,46 @@
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
require 'nlp_backpack/pos/pos_array'

include NLPBackpack

# Specs for Chunker::RegexChunker: a tag grammar is matched against a
# POS::POSArray filled from the [word, tag] pairs returned by #sentence, and
# the matched chunks are compared as space-joined word strings.
describe Chunker::RegexChunker do
  describe "Matching specified chunks" do
    before do
      @pos_a = POS::POSArray.new
      sentence.each {|word| @pos_a << word}
    end

    describe "Simple chunk" do
      it "should return 'this is some text'" do
        grammar = Chunker::RegexChunker.new("<DT><VBZ><DT><NN>")
        grammar.match(@pos_a).first.should == "this is some text"
      end
    end

    describe "Chunk with conditional tag" do
      it "should return 'this is some' and 'this some' with an optional (?) tag" do
        grammar = Chunker::RegexChunker.new("<DT><VBZ>?<DT>")
        grammar.match(@pos_a)[0].should == "this is some"
        grammar.match(@pos_a)[1].should == "this some"
      end

      it "should return 'this is some' and 'this some' with a repeated (*) tag" do
        grammar = Chunker::RegexChunker.new("<DT><VBZ>*<DT>")
        grammar.match(@pos_a)[0].should == "this is some"
        grammar.match(@pos_a)[1].should == "this some"
      end
    end

    describe "Chunk with tag regex" do
      it "should return 'this is some' and 'this some'" do
        grammar = Chunker::RegexChunker.new('<D\w><VBZ>?<DT>')
        grammar.match(@pos_a)[0].should == "this is some"
        grammar.match(@pos_a)[1].should == "this some"
      end
    end
  end

  # Fixture: [word, tag] pairs. The closing "this"/"some" pair is what the
  # second expected match ("this some") in the examples above refers to.
  def sentence
    [["this", "DT"], ["is", "VBZ"], ["some", "DT"], ["text", "NN"], ["text", "NN"], ["that", "IN"], ["I", "PRP"], ["want", "VBP"], ["analyzing", "VBG"], ["this", "DT"], ["some", "DT"]]
  end
end
@@ -0,0 +1,40 @@
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
require 'nlp_backpack/chunker/tag_pattern'

include NLPBackpack

# Specs for Chunker::TagPattern: a pattern string such as "<NN>" is expected
# to expose its bracketed part as a Regexp through #tag and anything after
# the closing bracket through #conditions.
describe Chunker::TagPattern do
  describe "Tag" do
    before do
      @tp = Chunker::TagPattern.new("<NN>")
    end

    it "should return the tag as NN" do
      @tp.tag.should == /NN/
    end
  end

  describe "Tag with condition" do
    before do
      @tp = Chunker::TagPattern.new("<NN*>")
    end

    it "should return the tag as NN*" do
      @tp.tag.should == /NN*/
    end
  end

  describe "Tag with external condition" do
    before do
      @tp = Chunker::TagPattern.new("<NN.+>?")
    end

    it "should return the tag as NN.+" do
      @tp.tag.should == /NN.+/
    end

    it "should return conditions as ?" do
      @tp.conditions.should == "?"
    end
  end
end
@@ -0,0 +1,68 @@
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
# FileUtils is used below to clear the saved classifier file.
require 'fileutils'

include NLPBackpack::Classifier

# Specs for NaiveBayes: classification after training, saving to
# #db_filepath, and loading a saved classifier back.
describe NaiveBayes do
  describe "Classification" do
    before do
      @classifier = create_and_train_classifier
    end

    it "should classify as spam with a score of 0.5" do
      a = @classifier.classify('bad', 'word')
      a[0].should == :spam
      a[1].should == 0.5
    end
  end

  describe "Saving the NB" do
    describe "DB filepath has been set" do
      before do
        @classifier = NaiveBayes.new(:spam, :ham)
        @classifier.db_filepath = db_filepath
      end

      it "should save to the filepath provided" do
        # Remove any earlier save so the existence check proves #save wrote it.
        FileUtils.rm(db_filepath, :force => true)
        @classifier.save
        # File.exist? rather than File.exists?, which Ruby 3.2 removed.
        File.exist?(db_filepath).should be_true
      end
    end

    describe "DB filepath has not been set" do
      it "should raise an error" do
        lambda do
          NaiveBayes.new(:spam, :ham).save
        end.should raise_error
      end
    end
  end

  describe "Load" do
    before do
      classifier = NaiveBayes.new(:spam, :ham)
      classifier.db_filepath = db_filepath
      classifier.train(:spam, 'bad', 'word')
      classifier.train(:ham, 'we', 'bad')
      classifier.save
    end

    it "should return 0.5" do
      classifier = NaiveBayes.load(db_filepath)
      classifier.classify('bad', 'word')[1].should == 0.5
    end
  end

  private

  # Builds a two-category classifier trained with one spam and one ham sample.
  def create_and_train_classifier
    a = NaiveBayes.new(:spam, :ham)
    a.train(:spam, 'bad', 'word')
    a.train(:ham, 'we', 'bad')
    a
  end

  # Absolute path of the file the save/load examples write to and read from.
  def db_filepath
    File.expand_path(File.dirname(__FILE__) + '/../test_saves/naive.nb')
  end
end
@@ -0,0 +1,29 @@
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')

include NLPBackpack::Evaluation

# Specs for Accuracy: per-label accuracy figures and their presence in the
# #inspect output.
describe Accuracy do
  before(:all) do
    # Presumably (reference labels, labels under test) - the expectations
    # below are consistent with that order; confirm against Accuracy.
    reference = [1,1,2,2]
    candidate = [1,1,2,1]
    @scorer = Accuracy.new(reference, candidate)
  end

  describe "Specific element" do
    it "should return 100" do
      score = @scorer.accuracy_of(1)
      score.should == 100
    end

    it "should return 50" do
      score = @scorer.accuracy_of(2)
      score.should == 50
    end
  end

  describe "Inspect" do
    it "should match 100%" do
      report = @scorer.inspect
      report.should match(/100%/)
    end

    it "should match 50%" do
      report = @scorer.inspect
      report.should match(/50%/)
    end
  end
end
@@ -0,0 +1,29 @@
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')

include NLPBackpack::Evaluation

# Specs for ConfusionMatrix: individual cells via #results_for and their
# presence in the #inspect output.
describe ConfusionMatrix do
  before(:all) do
    # NOTE(review): correct_results and test_results are not defined in this
    # file; presumably they come from spec_helper - confirm.
    @matrix = ConfusionMatrix.new(correct_results, test_results)
  end

  describe "Specific element" do
    it "should return 90%" do
      cell = @matrix.results_for(1, 1)
      cell.should == "90%"
    end

    it "should return 10%" do
      cell = @matrix.results_for(2, 3)
      cell.should == "10%"
    end
  end

  describe "Inspect" do
    it "should match <90%>" do
      rendered = @matrix.inspect
      rendered.should match(/<90%>/)
    end

    it "should match 10%" do
      rendered = @matrix.inspect
      rendered.should match(/10%/)
    end
  end
end
@@ -0,0 +1,53 @@
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')

include NLPBackpack

# Specs for FrequencyDistribution: two word lists keyed by condition are
# processed for the events "happy" and "sad", and the per-condition counts
# and the tabulated output are checked.
describe FrequencyDistribution do
  before do
    samples = {:happy => happy_text, :sad => sad_text}
    @cfd = FrequencyDistribution.new(samples, "happy", "sad")
    @results = @cfd.process
  end

  it "should return a hash" do
    @results.should be_a(Hash)
  end

  describe "Happy condition" do
    it "should return 2 for happy" do
      happy_counts = @results[:happy]
      happy_counts["happy"].should == 2
    end

    it "should return 0 for sad" do
      happy_counts = @results[:happy]
      happy_counts["sad"].should == 0
    end
  end

  describe "Sad condition" do
    it "should return 0 for happy" do
      sad_counts = @results[:sad]
      sad_counts["happy"].should == 0
    end

    it "should return 2 for sad" do
      sad_counts = @results[:sad]
      sad_counts["sad"].should == 2
    end
  end

  describe "Tabulation" do
    it "should include the events" do
      rendered = @cfd.to_tabulation
      rendered.should match(/happy/)
      rendered.should match(/sad/)
    end
  end

  private

  # Word list in which "happy" occurs twice and "sad" never.
  def happy_text
    %w[when happy things happen it makes me happy]
  end

  # Word list in which "sad" occurs twice and "happy" never.
  def sad_text
    %w[when sad things happen it makes me sad]
  end
end