nlp_backpack 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51):
  1. data/.document +5 -0
  2. data/.gitignore +21 -0
  3. data/LICENSE +20 -0
  4. data/README.rdoc +22 -0
  5. data/Rakefile +45 -0
  6. data/VERSION +1 -0
  7. data/lib/nlp_backpack.rb +10 -0
  8. data/lib/nlp_backpack/chunker.rb +5 -0
  9. data/lib/nlp_backpack/chunker/regex_chunker.rb +107 -0
  10. data/lib/nlp_backpack/chunker/tag_pattern.rb +31 -0
  11. data/lib/nlp_backpack/classifier.rb +5 -0
  12. data/lib/nlp_backpack/classifier/base.rb +28 -0
  13. data/lib/nlp_backpack/classifier/naive_bayes.rb +83 -0
  14. data/lib/nlp_backpack/evaluation.rb +6 -0
  15. data/lib/nlp_backpack/evaluation/accuracy.rb +46 -0
  16. data/lib/nlp_backpack/evaluation/base.rb +12 -0
  17. data/lib/nlp_backpack/evaluation/confusion_matrix.rb +66 -0
  18. data/lib/nlp_backpack/frequency_distribution.rb +47 -0
  19. data/lib/nlp_backpack/pos.rb +5 -0
  20. data/lib/nlp_backpack/pos/brill_tagger.rb +142 -0
  21. data/lib/nlp_backpack/pos/brill_tagger/lexicon.txt +93696 -0
  22. data/lib/nlp_backpack/pos/pos_array.rb +32 -0
  23. data/lib/nlp_backpack/stop_words.rb +17 -0
  24. data/lib/nlp_backpack/stop_words/stop_words.txt +429 -0
  25. data/lib/nlp_backpack/tokenizers/custom.rb +13 -0
  26. data/lib/nlp_backpack/tokenizers/line.rb +13 -0
  27. data/lib/nlp_backpack/tokenizers/space.rb +13 -0
  28. data/lib/nlp_backpack/tokenizers/tab.rb +13 -0
  29. data/lib/nlp_backpack/tokenizers/whitespace.rb +13 -0
  30. data/lib/nlp_backpack/tokenizers/word.rb +13 -0
  31. data/nlp_backpack.gemspec +109 -0
  32. data/spec/chunkers/regex_chunker_spec.rb +46 -0
  33. data/spec/chunkers/tag_pattern_spec.rb +40 -0
  34. data/spec/classifiers/naive_bayes_spec.rb +68 -0
  35. data/spec/evaluation/accuracy_spec.rb +29 -0
  36. data/spec/evaluation/confusion_matrix_spec.rb +29 -0
  37. data/spec/frequency_distribution_spec.rb +53 -0
  38. data/spec/nlp_backpack_spec.rb +4 -0
  39. data/spec/pos/brill_tagger_spec.rb +24 -0
  40. data/spec/pos/pos_array_spec.rb +45 -0
  41. data/spec/spec.opts +1 -0
  42. data/spec/spec_helper.rb +18 -0
  43. data/spec/stop_words_spec.rb +15 -0
  44. data/spec/test_saves/naive.nb +1 -0
  45. data/spec/tokenizers/custom_spec.rb +24 -0
  46. data/spec/tokenizers/line_spec.rb +15 -0
  47. data/spec/tokenizers/space_spec.rb +15 -0
  48. data/spec/tokenizers/tab_spec.rb +15 -0
  49. data/spec/tokenizers/whitespace_spec.rb +16 -0
  50. data/spec/tokenizers/word_spec.rb +15 -0
  51. metadata +141 -0
@@ -0,0 +1,4 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
+ describe "NlpBackpack" do
4
+ end
@@ -0,0 +1,24 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+
3
+ include NLPBackpack
4
+
5
+ describe POS::BrillTagger do
6
+ before(:all) do
7
+ @pos = POS::BrillTagger.analyze(text)
8
+ end
9
+
10
+ it "should properly tag 'the fast fox'" do
11
+ @pos[0][1].should == "DT"
12
+ @pos[1][1].should == "JJ"
13
+ @pos[2][1].should == "NN"
14
+ end
15
+
16
+ it "should return a POSData object" do
17
+ @pos.should be_a(POS::POSArray)
18
+ end
19
+
20
+ def text
21
+ "This regular expression is read in the following manner: Zero or more adjectives or nouns, followed by an option group of a noun and a preposition, followed again by zero or more adjectives or nouns, followed by a single noun. A sequence of tags matching this pattern ensures that the corresponding words make up a noun phrase.
22
+ In addition to simply pulling out the phrases, it is common to do some simple post processing to link variants together (For example, unpluralizing plural variants)."
23
+ end
24
+ end
@@ -0,0 +1,45 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require 'nlp_backpack/pos/pos_array'
3
+
4
+ include NLPBackpack
5
+
6
+ describe POS::POSArray do
7
+ before do
8
+ @pos_a = POS::POSArray.new
9
+ end
10
+
11
+ describe "Appending words" do
12
+ describe "Using <<" do
13
+ it "should raise an error if we insert too much data" do
14
+ lambda do
15
+ @pos_a << ["hello", :NN, "LOL"]
16
+ end.should raise_error
17
+ end
18
+
19
+ it "size should be 2" do
20
+ 2.times { @pos_a << word }
21
+ @pos_a.size.should == 2
22
+ end
23
+ end
24
+
25
+ describe "Using #append" do
26
+ it "size should be 2" do
27
+ 2.times { @pos_a.append(*word) }
28
+ @pos_a.size.should == 2
29
+ end
30
+ end
31
+ end
32
+
33
+ describe "to_s" do
34
+ it "should return properly formed string for ChunkGrammer" do
35
+ @pos_a << word
36
+ @pos_a << word
37
+
38
+ @pos_a.to_s.should == "hello/NN hello/NN"
39
+ end
40
+ end
41
+
42
+ def word
43
+ ["hello", :NN]
44
+ end
45
+ end
@@ -0,0 +1 @@
1
+ --color
@@ -0,0 +1,18 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ require 'rubygems'
4
+ require 'nlp_backpack'
5
+ require 'spec'
6
+ require 'spec/autorun'
7
+
8
+ Spec::Runner.configure do |config|
9
+
10
+ end
11
+
12
+ def correct_results
13
+ [1,1,2,1,1,1,1,1,1,1]
14
+ end
15
+
16
+ def test_results
17
+ [1,1,3,1,1,1,1,1,1,1]
18
+ end
@@ -0,0 +1,15 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
+ include NLPBackpack
4
+
5
+ describe StopWords do
6
+ before(:all) { @stop_words = StopWords.all }
7
+
8
+ it "should return z" do
9
+ @stop_words.last.should == "z"
10
+ end
11
+
12
+ it "should return a" do
13
+ @stop_words.first.should == "a"
14
+ end
15
+ end
@@ -0,0 +1 @@
1
+ o:(NLPBackpack::Classifier::NaiveBayes :@features_count{: spam}" wordf1"badf1f0:ham}"wef1"badf1@ :@db_filepath"M/Users/reddavis/Documents/projects/nlp_backpack/spec/test_saves/naive.nb:
@@ -0,0 +1,24 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require 'nlp_backpack/tokenizers/custom'
3
+
4
+ include NLPBackpack
5
+
6
+ describe Tokenizer::Custom do
7
+ describe "split by ands" do
8
+ it "should return 3" do
9
+ a = Tokenizer::Custom.tokenize(text, "and")
10
+ a.size.should == 3
11
+ end
12
+ end
13
+
14
+ describe "split by any character" do
15
+ it "should return 3" do
16
+ a = Tokenizer::Custom.tokenize("and", //)
17
+ a.size.should == 3
18
+ end
19
+ end
20
+
21
+ def text
22
+ "hello and there and this"
23
+ end
24
+ end
@@ -0,0 +1,15 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require 'nlp_backpack/tokenizers/line'
3
+
4
+ include NLPBackpack
5
+
6
+ describe Tokenizer::Line do
7
+ it "should split text by whitespace" do
8
+ a = Tokenizer::Line.tokenize(text)
9
+ a.size.should == 3
10
+ end
11
+
12
+ def text
13
+ "hello\n\nthere this"
14
+ end
15
+ end
@@ -0,0 +1,15 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require 'nlp_backpack/tokenizers/space'
3
+
4
+ include NLPBackpack
5
+
6
+ describe Tokenizer::Space do
7
+ it "should split text by single spaces" do
8
+ a = Tokenizer::Space.tokenize(text)
9
+ a.size.should == 4
10
+ end
11
+
12
+ def text
13
+ "hello there this"
14
+ end
15
+ end
@@ -0,0 +1,15 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require 'nlp_backpack/tokenizers/tab'
3
+
4
+ include NLPBackpack
5
+
6
+ describe Tokenizer::Tab do
7
+ it "should split text by single tabs" do
8
+ a = Tokenizer::Tab.tokenize(text)
9
+ a.size.should == 4
10
+ end
11
+
12
+ def text
13
+ "hello\tthere\t\tthis"
14
+ end
15
+ end
@@ -0,0 +1,16 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require 'nlp_backpack/tokenizers/whitespace'
3
+
4
+ include NLPBackpack
5
+
6
+ describe Tokenizer::Whitespace do
7
+ it "should split text by whitespace" do
8
+ a = Tokenizer::Whitespace.tokenize(text)
9
+ a.size.should == 3
10
+ a[0].should == "hello"
11
+ end
12
+
13
+ def text
14
+ "hello there this"
15
+ end
16
+ end
@@ -0,0 +1,15 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require 'nlp_backpack/tokenizers/word'
3
+
4
+ include NLPBackpack
5
+
6
+ describe Tokenizer::Word do
7
+ it "should split text by words" do
8
+ a = Tokenizer::Word.tokenize(text)
9
+ a.size.should == 5
10
+ end
11
+
12
+ def text
13
+ "hello; there, this. that you're"
14
+ end
15
+ end
metadata ADDED
@@ -0,0 +1,141 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: nlp_backpack
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 0
9
+ version: 0.0.0
10
+ platform: ruby
11
+ authors:
12
+ - reddavis
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-06-21 00:00:00 +01:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rspec
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 1
29
+ - 2
30
+ - 9
31
+ version: 1.2.9
32
+ type: :development
33
+ version_requirements: *id001
34
+ description: A backpack full of useful toys
35
+ email: reddavis@gmail.com
36
+ executables: []
37
+
38
+ extensions: []
39
+
40
+ extra_rdoc_files:
41
+ - LICENSE
42
+ - README.rdoc
43
+ files:
44
+ - .document
45
+ - .gitignore
46
+ - LICENSE
47
+ - README.rdoc
48
+ - Rakefile
49
+ - VERSION
50
+ - lib/nlp_backpack.rb
51
+ - lib/nlp_backpack/chunker.rb
52
+ - lib/nlp_backpack/chunker/regex_chunker.rb
53
+ - lib/nlp_backpack/chunker/tag_pattern.rb
54
+ - lib/nlp_backpack/classifier.rb
55
+ - lib/nlp_backpack/classifier/base.rb
56
+ - lib/nlp_backpack/classifier/naive_bayes.rb
57
+ - lib/nlp_backpack/evaluation.rb
58
+ - lib/nlp_backpack/evaluation/accuracy.rb
59
+ - lib/nlp_backpack/evaluation/base.rb
60
+ - lib/nlp_backpack/evaluation/confusion_matrix.rb
61
+ - lib/nlp_backpack/frequency_distribution.rb
62
+ - lib/nlp_backpack/pos.rb
63
+ - lib/nlp_backpack/pos/brill_tagger.rb
64
+ - lib/nlp_backpack/pos/brill_tagger/lexicon.txt
65
+ - lib/nlp_backpack/pos/pos_array.rb
66
+ - lib/nlp_backpack/stop_words.rb
67
+ - lib/nlp_backpack/stop_words/stop_words.txt
68
+ - lib/nlp_backpack/tokenizers/custom.rb
69
+ - lib/nlp_backpack/tokenizers/line.rb
70
+ - lib/nlp_backpack/tokenizers/space.rb
71
+ - lib/nlp_backpack/tokenizers/tab.rb
72
+ - lib/nlp_backpack/tokenizers/whitespace.rb
73
+ - lib/nlp_backpack/tokenizers/word.rb
74
+ - nlp_backpack.gemspec
75
+ - spec/chunkers/regex_chunker_spec.rb
76
+ - spec/chunkers/tag_pattern_spec.rb
77
+ - spec/classifiers/naive_bayes_spec.rb
78
+ - spec/evaluation/accuracy_spec.rb
79
+ - spec/evaluation/confusion_matrix_spec.rb
80
+ - spec/frequency_distribution_spec.rb
81
+ - spec/nlp_backpack_spec.rb
82
+ - spec/pos/brill_tagger_spec.rb
83
+ - spec/pos/pos_array_spec.rb
84
+ - spec/spec.opts
85
+ - spec/spec_helper.rb
86
+ - spec/stop_words_spec.rb
87
+ - spec/test_saves/naive.nb
88
+ - spec/tokenizers/custom_spec.rb
89
+ - spec/tokenizers/line_spec.rb
90
+ - spec/tokenizers/space_spec.rb
91
+ - spec/tokenizers/tab_spec.rb
92
+ - spec/tokenizers/whitespace_spec.rb
93
+ - spec/tokenizers/word_spec.rb
94
+ has_rdoc: true
95
+ homepage: http://github.com/reddavis/NLP-Backpack
96
+ licenses: []
97
+
98
+ post_install_message:
99
+ rdoc_options:
100
+ - --charset=UTF-8
101
+ require_paths:
102
+ - lib
103
+ required_ruby_version: !ruby/object:Gem::Requirement
104
+ requirements:
105
+ - - ">="
106
+ - !ruby/object:Gem::Version
107
+ segments:
108
+ - 0
109
+ version: "0"
110
+ required_rubygems_version: !ruby/object:Gem::Requirement
111
+ requirements:
112
+ - - ">="
113
+ - !ruby/object:Gem::Version
114
+ segments:
115
+ - 0
116
+ version: "0"
117
+ requirements: []
118
+
119
+ rubyforge_project:
120
+ rubygems_version: 1.3.6
121
+ signing_key:
122
+ specification_version: 3
123
+ summary: A backpack full of useful toys
124
+ test_files:
125
+ - spec/chunkers/regex_chunker_spec.rb
126
+ - spec/chunkers/tag_pattern_spec.rb
127
+ - spec/classifiers/naive_bayes_spec.rb
128
+ - spec/evaluation/accuracy_spec.rb
129
+ - spec/evaluation/confusion_matrix_spec.rb
130
+ - spec/frequency_distribution_spec.rb
131
+ - spec/nlp_backpack_spec.rb
132
+ - spec/pos/brill_tagger_spec.rb
133
+ - spec/pos/pos_array_spec.rb
134
+ - spec/spec_helper.rb
135
+ - spec/stop_words_spec.rb
136
+ - spec/tokenizers/custom_spec.rb
137
+ - spec/tokenizers/line_spec.rb
138
+ - spec/tokenizers/space_spec.rb
139
+ - spec/tokenizers/tab_spec.rb
140
+ - spec/tokenizers/whitespace_spec.rb
141
+ - spec/tokenizers/word_spec.rb