nlp_backpack 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +22 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/lib/nlp_backpack.rb +10 -0
- data/lib/nlp_backpack/chunker.rb +5 -0
- data/lib/nlp_backpack/chunker/regex_chunker.rb +107 -0
- data/lib/nlp_backpack/chunker/tag_pattern.rb +31 -0
- data/lib/nlp_backpack/classifier.rb +5 -0
- data/lib/nlp_backpack/classifier/base.rb +28 -0
- data/lib/nlp_backpack/classifier/naive_bayes.rb +83 -0
- data/lib/nlp_backpack/evaluation.rb +6 -0
- data/lib/nlp_backpack/evaluation/accuracy.rb +46 -0
- data/lib/nlp_backpack/evaluation/base.rb +12 -0
- data/lib/nlp_backpack/evaluation/confusion_matrix.rb +66 -0
- data/lib/nlp_backpack/frequency_distribution.rb +47 -0
- data/lib/nlp_backpack/pos.rb +5 -0
- data/lib/nlp_backpack/pos/brill_tagger.rb +142 -0
- data/lib/nlp_backpack/pos/brill_tagger/lexicon.txt +93696 -0
- data/lib/nlp_backpack/pos/pos_array.rb +32 -0
- data/lib/nlp_backpack/stop_words.rb +17 -0
- data/lib/nlp_backpack/stop_words/stop_words.txt +429 -0
- data/lib/nlp_backpack/tokenizers/custom.rb +13 -0
- data/lib/nlp_backpack/tokenizers/line.rb +13 -0
- data/lib/nlp_backpack/tokenizers/space.rb +13 -0
- data/lib/nlp_backpack/tokenizers/tab.rb +13 -0
- data/lib/nlp_backpack/tokenizers/whitespace.rb +13 -0
- data/lib/nlp_backpack/tokenizers/word.rb +13 -0
- data/nlp_backpack.gemspec +109 -0
- data/spec/chunkers/regex_chunker_spec.rb +46 -0
- data/spec/chunkers/tag_pattern_spec.rb +40 -0
- data/spec/classifiers/naive_bayes_spec.rb +68 -0
- data/spec/evaluation/accuracy_spec.rb +29 -0
- data/spec/evaluation/confusion_matrix_spec.rb +29 -0
- data/spec/frequency_distribution_spec.rb +53 -0
- data/spec/nlp_backpack_spec.rb +4 -0
- data/spec/pos/brill_tagger_spec.rb +24 -0
- data/spec/pos/pos_array_spec.rb +45 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +18 -0
- data/spec/stop_words_spec.rb +15 -0
- data/spec/test_saves/naive.nb +1 -0
- data/spec/tokenizers/custom_spec.rb +24 -0
- data/spec/tokenizers/line_spec.rb +15 -0
- data/spec/tokenizers/space_spec.rb +15 -0
- data/spec/tokenizers/tab_spec.rb +15 -0
- data/spec/tokenizers/whitespace_spec.rb +16 -0
- data/spec/tokenizers/word_spec.rb +15 -0
- metadata +141 -0
@@ -0,0 +1,24 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
include NLPBackpack
|
4
|
+
|
5
|
+
describe POS::BrillTagger do
|
6
|
+
before(:all) do
|
7
|
+
@pos = POS::BrillTagger.analyze(text)
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should properly tag 'the fast fox'" do
|
11
|
+
@pos[0][1].should == "DT"
|
12
|
+
@pos[1][1].should == "JJ"
|
13
|
+
@pos[2][1].should == "NN"
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should return a POSArray object" do
|
17
|
+
@pos.should be_a(POS::POSArray)
|
18
|
+
end
|
19
|
+
|
20
|
+
def text
|
21
|
+
"This regular expression is read in the following manner: Zero or more adjectives or nouns, followed by an option group of a noun and a preposition, followed again by zero or more adjectives or nouns, followed by a single noun. A sequence of tags matching this pattern ensures that the corresponding words make up a noun phrase.
|
22
|
+
In addition to simply pulling out the phrases, it is common to do some simple post processing to link variants together (For example, unpluralizing plural variants)."
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require 'nlp_backpack/pos/pos_array'
|
3
|
+
|
4
|
+
include NLPBackpack
|
5
|
+
|
6
|
+
describe POS::POSArray do
|
7
|
+
before do
|
8
|
+
@pos_a = POS::POSArray.new
|
9
|
+
end
|
10
|
+
|
11
|
+
describe "Appending words" do
|
12
|
+
describe "Using <<" do
|
13
|
+
it "should raise an error if we insert too much data" do
|
14
|
+
lambda do
|
15
|
+
@pos_a << ["hello", :NN, "LOL"]
|
16
|
+
end.should raise_error
|
17
|
+
end
|
18
|
+
|
19
|
+
it "size should be 2" do
|
20
|
+
2.times { @pos_a << word }
|
21
|
+
@pos_a.size.should == 2
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
describe "Using #append" do
|
26
|
+
it "size should be 2" do
|
27
|
+
2.times { @pos_a.append(*word) }
|
28
|
+
@pos_a.size.should == 2
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
describe "to_s" do
|
34
|
+
it "should return properly formed string for ChunkGrammar" do
|
35
|
+
@pos_a << word
|
36
|
+
@pos_a << word
|
37
|
+
|
38
|
+
@pos_a.to_s.should == "hello/NN hello/NN"
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def word
|
43
|
+
["hello", :NN]
|
44
|
+
end
|
45
|
+
end
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
2
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
3
|
+
require 'rubygems'
|
4
|
+
require 'nlp_backpack'
|
5
|
+
require 'spec'
|
6
|
+
require 'spec/autorun'
|
7
|
+
|
8
|
+
Spec::Runner.configure do |config|
|
9
|
+
|
10
|
+
end
|
11
|
+
|
12
|
+
def correct_results
|
13
|
+
[1,1,2,1,1,1,1,1,1,1]
|
14
|
+
end
|
15
|
+
|
16
|
+
def test_results
|
17
|
+
[1,1,3,1,1,1,1,1,1,1]
|
18
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
include NLPBackpack
|
4
|
+
|
5
|
+
describe StopWords do
|
6
|
+
before(:all) { @stop_words = StopWords.all }
|
7
|
+
|
8
|
+
it "should return z" do
|
9
|
+
@stop_words.last.should == "z"
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should return a" do
|
13
|
+
@stop_words.first.should == "a"
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
o:(NLPBackpack::Classifier::NaiveBayes :@features_count{: spam}" wordf1"badf1f0:ham}"wef1"badf1@:@db_filepath"M/Users/reddavis/Documents/projects/nlp_backpack/spec/test_saves/naive.nb:
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require 'nlp_backpack/tokenizers/custom'
|
3
|
+
|
4
|
+
include NLPBackpack
|
5
|
+
|
6
|
+
describe Tokenizer::Custom do
|
7
|
+
describe "split by ands" do
|
8
|
+
it "should return 3" do
|
9
|
+
a = Tokenizer::Custom.tokenize(text, "and")
|
10
|
+
a.size.should == 3
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
describe "split by any character" do
|
15
|
+
it "should return 3" do
|
16
|
+
a = Tokenizer::Custom.tokenize("and", //)
|
17
|
+
a.size.should == 3
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def text
|
22
|
+
"hello and there and this"
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require 'nlp_backpack/tokenizers/line'
|
3
|
+
|
4
|
+
include NLPBackpack
|
5
|
+
|
6
|
+
describe Tokenizer::Line do
|
7
|
+
it "should split text by lines" do
|
8
|
+
a = Tokenizer::Line.tokenize(text)
|
9
|
+
a.size.should == 3
|
10
|
+
end
|
11
|
+
|
12
|
+
def text
|
13
|
+
"hello\n\nthere this"
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require 'nlp_backpack/tokenizers/space'
|
3
|
+
|
4
|
+
include NLPBackpack
|
5
|
+
|
6
|
+
describe Tokenizer::Space do
|
7
|
+
it "should split text by single spaces" do
|
8
|
+
a = Tokenizer::Space.tokenize(text)
|
9
|
+
a.size.should == 4
|
10
|
+
end
|
11
|
+
|
12
|
+
def text
|
13
|
+
"hello there this"
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require 'nlp_backpack/tokenizers/tab'
|
3
|
+
|
4
|
+
include NLPBackpack
|
5
|
+
|
6
|
+
describe Tokenizer::Tab do
|
7
|
+
it "should split text by single tabs" do
|
8
|
+
a = Tokenizer::Tab.tokenize(text)
|
9
|
+
a.size.should == 4
|
10
|
+
end
|
11
|
+
|
12
|
+
def text
|
13
|
+
"hello\tthere\t\tthis"
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require 'nlp_backpack/tokenizers/whitespace'
|
3
|
+
|
4
|
+
include NLPBackpack
|
5
|
+
|
6
|
+
describe Tokenizer::Whitespace do
|
7
|
+
it "should split text by whitespace" do
|
8
|
+
a = Tokenizer::Whitespace.tokenize(text)
|
9
|
+
a.size.should == 3
|
10
|
+
a[0].should == "hello"
|
11
|
+
end
|
12
|
+
|
13
|
+
def text
|
14
|
+
"hello there this"
|
15
|
+
end
|
16
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require 'nlp_backpack/tokenizers/word'
|
3
|
+
|
4
|
+
include NLPBackpack
|
5
|
+
|
6
|
+
describe Tokenizer::Word do
|
7
|
+
it "should split text by words" do
|
8
|
+
a = Tokenizer::Word.tokenize(text)
|
9
|
+
a.size.should == 5
|
10
|
+
end
|
11
|
+
|
12
|
+
def text
|
13
|
+
"hello; there, this. that you're"
|
14
|
+
end
|
15
|
+
end
|
metadata
ADDED
@@ -0,0 +1,141 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: nlp_backpack
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
version: 0.0.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- reddavis
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-06-21 00:00:00 +01:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: rspec
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 1
|
29
|
+
- 2
|
30
|
+
- 9
|
31
|
+
version: 1.2.9
|
32
|
+
type: :development
|
33
|
+
version_requirements: *id001
|
34
|
+
description: A backpack full of useful toys
|
35
|
+
email: reddavis@gmail.com
|
36
|
+
executables: []
|
37
|
+
|
38
|
+
extensions: []
|
39
|
+
|
40
|
+
extra_rdoc_files:
|
41
|
+
- LICENSE
|
42
|
+
- README.rdoc
|
43
|
+
files:
|
44
|
+
- .document
|
45
|
+
- .gitignore
|
46
|
+
- LICENSE
|
47
|
+
- README.rdoc
|
48
|
+
- Rakefile
|
49
|
+
- VERSION
|
50
|
+
- lib/nlp_backpack.rb
|
51
|
+
- lib/nlp_backpack/chunker.rb
|
52
|
+
- lib/nlp_backpack/chunker/regex_chunker.rb
|
53
|
+
- lib/nlp_backpack/chunker/tag_pattern.rb
|
54
|
+
- lib/nlp_backpack/classifier.rb
|
55
|
+
- lib/nlp_backpack/classifier/base.rb
|
56
|
+
- lib/nlp_backpack/classifier/naive_bayes.rb
|
57
|
+
- lib/nlp_backpack/evaluation.rb
|
58
|
+
- lib/nlp_backpack/evaluation/accuracy.rb
|
59
|
+
- lib/nlp_backpack/evaluation/base.rb
|
60
|
+
- lib/nlp_backpack/evaluation/confusion_matrix.rb
|
61
|
+
- lib/nlp_backpack/frequency_distribution.rb
|
62
|
+
- lib/nlp_backpack/pos.rb
|
63
|
+
- lib/nlp_backpack/pos/brill_tagger.rb
|
64
|
+
- lib/nlp_backpack/pos/brill_tagger/lexicon.txt
|
65
|
+
- lib/nlp_backpack/pos/pos_array.rb
|
66
|
+
- lib/nlp_backpack/stop_words.rb
|
67
|
+
- lib/nlp_backpack/stop_words/stop_words.txt
|
68
|
+
- lib/nlp_backpack/tokenizers/custom.rb
|
69
|
+
- lib/nlp_backpack/tokenizers/line.rb
|
70
|
+
- lib/nlp_backpack/tokenizers/space.rb
|
71
|
+
- lib/nlp_backpack/tokenizers/tab.rb
|
72
|
+
- lib/nlp_backpack/tokenizers/whitespace.rb
|
73
|
+
- lib/nlp_backpack/tokenizers/word.rb
|
74
|
+
- nlp_backpack.gemspec
|
75
|
+
- spec/chunkers/regex_chunker_spec.rb
|
76
|
+
- spec/chunkers/tag_pattern_spec.rb
|
77
|
+
- spec/classifiers/naive_bayes_spec.rb
|
78
|
+
- spec/evaluation/accuracy_spec.rb
|
79
|
+
- spec/evaluation/confusion_matrix_spec.rb
|
80
|
+
- spec/frequency_distribution_spec.rb
|
81
|
+
- spec/nlp_backpack_spec.rb
|
82
|
+
- spec/pos/brill_tagger_spec.rb
|
83
|
+
- spec/pos/pos_array_spec.rb
|
84
|
+
- spec/spec.opts
|
85
|
+
- spec/spec_helper.rb
|
86
|
+
- spec/stop_words_spec.rb
|
87
|
+
- spec/test_saves/naive.nb
|
88
|
+
- spec/tokenizers/custom_spec.rb
|
89
|
+
- spec/tokenizers/line_spec.rb
|
90
|
+
- spec/tokenizers/space_spec.rb
|
91
|
+
- spec/tokenizers/tab_spec.rb
|
92
|
+
- spec/tokenizers/whitespace_spec.rb
|
93
|
+
- spec/tokenizers/word_spec.rb
|
94
|
+
has_rdoc: true
|
95
|
+
homepage: http://github.com/reddavis/NLP-Backpack
|
96
|
+
licenses: []
|
97
|
+
|
98
|
+
post_install_message:
|
99
|
+
rdoc_options:
|
100
|
+
- --charset=UTF-8
|
101
|
+
require_paths:
|
102
|
+
- lib
|
103
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
104
|
+
requirements:
|
105
|
+
- - ">="
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
segments:
|
108
|
+
- 0
|
109
|
+
version: "0"
|
110
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
111
|
+
requirements:
|
112
|
+
- - ">="
|
113
|
+
- !ruby/object:Gem::Version
|
114
|
+
segments:
|
115
|
+
- 0
|
116
|
+
version: "0"
|
117
|
+
requirements: []
|
118
|
+
|
119
|
+
rubyforge_project:
|
120
|
+
rubygems_version: 1.3.6
|
121
|
+
signing_key:
|
122
|
+
specification_version: 3
|
123
|
+
summary: A backpack full of useful toys
|
124
|
+
test_files:
|
125
|
+
- spec/chunkers/regex_chunker_spec.rb
|
126
|
+
- spec/chunkers/tag_pattern_spec.rb
|
127
|
+
- spec/classifiers/naive_bayes_spec.rb
|
128
|
+
- spec/evaluation/accuracy_spec.rb
|
129
|
+
- spec/evaluation/confusion_matrix_spec.rb
|
130
|
+
- spec/frequency_distribution_spec.rb
|
131
|
+
- spec/nlp_backpack_spec.rb
|
132
|
+
- spec/pos/brill_tagger_spec.rb
|
133
|
+
- spec/pos/pos_array_spec.rb
|
134
|
+
- spec/spec_helper.rb
|
135
|
+
- spec/stop_words_spec.rb
|
136
|
+
- spec/tokenizers/custom_spec.rb
|
137
|
+
- spec/tokenizers/line_spec.rb
|
138
|
+
- spec/tokenizers/space_spec.rb
|
139
|
+
- spec/tokenizers/tab_spec.rb
|
140
|
+
- spec/tokenizers/whitespace_spec.rb
|
141
|
+
- spec/tokenizers/word_spec.rb
|