nlp_backpack 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +22 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/lib/nlp_backpack.rb +10 -0
- data/lib/nlp_backpack/chunker.rb +5 -0
- data/lib/nlp_backpack/chunker/regex_chunker.rb +107 -0
- data/lib/nlp_backpack/chunker/tag_pattern.rb +31 -0
- data/lib/nlp_backpack/classifier.rb +5 -0
- data/lib/nlp_backpack/classifier/base.rb +28 -0
- data/lib/nlp_backpack/classifier/naive_bayes.rb +83 -0
- data/lib/nlp_backpack/evaluation.rb +6 -0
- data/lib/nlp_backpack/evaluation/accuracy.rb +46 -0
- data/lib/nlp_backpack/evaluation/base.rb +12 -0
- data/lib/nlp_backpack/evaluation/confusion_matrix.rb +66 -0
- data/lib/nlp_backpack/frequency_distribution.rb +47 -0
- data/lib/nlp_backpack/pos.rb +5 -0
- data/lib/nlp_backpack/pos/brill_tagger.rb +142 -0
- data/lib/nlp_backpack/pos/brill_tagger/lexicon.txt +93696 -0
- data/lib/nlp_backpack/pos/pos_array.rb +32 -0
- data/lib/nlp_backpack/stop_words.rb +17 -0
- data/lib/nlp_backpack/stop_words/stop_words.txt +429 -0
- data/lib/nlp_backpack/tokenizers/custom.rb +13 -0
- data/lib/nlp_backpack/tokenizers/line.rb +13 -0
- data/lib/nlp_backpack/tokenizers/space.rb +13 -0
- data/lib/nlp_backpack/tokenizers/tab.rb +13 -0
- data/lib/nlp_backpack/tokenizers/whitespace.rb +13 -0
- data/lib/nlp_backpack/tokenizers/word.rb +13 -0
- data/nlp_backpack.gemspec +109 -0
- data/spec/chunkers/regex_chunker_spec.rb +46 -0
- data/spec/chunkers/tag_pattern_spec.rb +40 -0
- data/spec/classifiers/naive_bayes_spec.rb +68 -0
- data/spec/evaluation/accuracy_spec.rb +29 -0
- data/spec/evaluation/confusion_matrix_spec.rb +29 -0
- data/spec/frequency_distribution_spec.rb +53 -0
- data/spec/nlp_backpack_spec.rb +4 -0
- data/spec/pos/brill_tagger_spec.rb +24 -0
- data/spec/pos/pos_array_spec.rb +45 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +18 -0
- data/spec/stop_words_spec.rb +15 -0
- data/spec/test_saves/naive.nb +1 -0
- data/spec/tokenizers/custom_spec.rb +24 -0
- data/spec/tokenizers/line_spec.rb +15 -0
- data/spec/tokenizers/space_spec.rb +15 -0
- data/spec/tokenizers/tab_spec.rb +15 -0
- data/spec/tokenizers/whitespace_spec.rb +16 -0
- data/spec/tokenizers/word_spec.rb +15 -0
- metadata +141 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
Copyright (c) 2009 reddavis

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc
ADDED
@@ -0,0 +1,22 @@
= NLP Backpack

An NLTK-like playground for Rubyists. A notebook of my studies.

If a fish is a fish, what is a fish?

== TODO

* Lots

Classifiers
* Lots of them!

Chunkers
* IOB tags
* Tree structure
* Unigram Chunker

Data

Evaluations
* F Score
data/Rakefile
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

begin
  require 'jeweler'
  # Gem packaging/release tasks (rake build, rake install, rake release, ...).
  Jeweler::Tasks.new do |gem|
    gem.name = "nlp_backpack"
    gem.summary = %Q{A backpack full of useful toys}
    gem.description = %Q{A backpack full of useful toys}
    gem.email = "reddavis@gmail.com"
    gem.homepage = "http://github.com/reddavis/NLP-Backpack"
    gem.authors = ["reddavis"]
    gem.add_development_dependency "rspec", ">= 1.2.9"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# RSpec 1.x tasks: `rake spec` runs every *_spec.rb under spec/.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# `rake rcov` runs the same specs under rcov for coverage reporting.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

task :spec => :check_dependencies

task :default => :spec

# `rake rdoc` builds API docs into rdoc/, titled with the gem VERSION.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "nlp_backpack #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.0
|
data/lib/nlp_backpack.rb
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
$:.unshift File.expand_path(File.dirname(__FILE__))

module NLPBackpack
  # Register each sub-library for lazy loading: the file is only
  # required the first time its constant is referenced.
  {
    :FrequencyDistribution => "nlp_backpack/frequency_distribution",
    :Evaluation            => "nlp_backpack/evaluation",
    :Classifier            => "nlp_backpack/classifier",
    :StopWords             => "nlp_backpack/stop_words",
    :Chunker               => "nlp_backpack/chunker",
    :POS                   => "nlp_backpack/pos"
  }.each { |const, path| autoload(const, path) }
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
# ChunkGrammar
# Chunk grammars are regex-esque patterns that indicate how sentences should be chunked.
# The patterns are made up of Tag Patterns.
# (DT)?(JJ.*)* == Optional determiner, followed by zero or more of any type of adjective.
|
5
|
+
|
6
|
+
require 'nlp_backpack/chunker/tag_pattern'
|
7
|
+
|
8
|
+
module NLPBackpack

  module Chunker
    # Internal control-flow signal: raised so the current word is
    # re-checked against the next tag pattern. Subclasses StandardError
    # (not Exception) so it cannot mask Interrupt/SystemExit; the only
    # rescue for it is the explicit `rescue Retry` in #match.
    class Retry < StandardError; end;

    # Walks a POS-tagged word list against a chunk-grammar pattern
    # (a sequence of TagPatterns, see #prepare_pattern) and extracts
    # every run of words the grammar matches.
    class RegexChunker

      def initialize(pattern)
        @pattern = prepare_pattern(pattern)

        # Frozen master copy used to rewind @pattern between attempts.
        @cached_pattern = @pattern.clone
        @cached_pattern.freeze

        @matched_patterns = []   # confirmed chunks (arrays of words)
        @potential_pattern = []  # words gathered for the match in progress
      end

      # Extract all matches.
      # pos_array is an array of [word, tag] pairs; returns the matched
      # chunks, each joined into a single space-separated string.
      def match(pos_array)
        next_pattern(:start)

        pos_array.each do |word|
          begin
            pos = word[1]

            if pos.match(@current_pattern.tag)
              @potential_pattern << word[0]
              next_pattern(:matched)
            else
              next_pattern(:no_match)
            end
          rescue Retry
            # Re-run the same word against the pattern selected above.
            retry
          end
        end

        pop_potential_pattern!

        @matched_patterns.map {|pattern| pattern.join(" ") }
      end

      private

      # Advance @current_pattern according to the match state.
      # NOTE(review): when @pattern is empty this recurses but does not
      # return, so the case below still runs afterwards — appears to be
      # relied on for rewinding; confirm against the specs before changing.
      def next_pattern(state)
        if @pattern.empty?
          reset_pattern!
          next_pattern(:start)
        end

        @current_pattern = case state
        when :start, :next
          @pattern.pop
        when :matched
          # "+" and "*" may match repeatedly, so stay on the same pattern.
          if ["+", "*"].include?(@current_pattern.conditions)
            @current_pattern
          else
            @pattern.pop
          end
        when :no_match
          # "?" and "*" are optional: advance, then retry the same word.
          if ["?", "*"].include?(@current_pattern.conditions)
            next_pattern(:next)
            raise Retry #Prob not the best way to do this(?)
          else
            # Start again
            reset_pattern!
            pop_potential_pattern!
            next_pattern(:start)
          end
        end
      end

      # Promote the gathered words to a confirmed match when every
      # mandatory tag pattern was satisfied, then clear the buffer.
      def pop_potential_pattern!
        if @potential_pattern.size >= minimum_potential_pattern_size
          @matched_patterns << @potential_pattern
        end

        reset_potential_pattern!
      end

      def reset_pattern!
        @pattern = @cached_pattern.dup
      end

      def reset_potential_pattern!
        @potential_pattern = []
      end

      # Number of mandatory tag patterns (those not marked "?" or "*").
      def minimum_potential_pattern_size
        @cached_pattern.reject {|x| ["?", "*"].include?(x.conditions)}.size
      end

      # Extract patterns like (DT)? -- (JJ.*)*
      # Reversed so Array#pop yields them in source order.
      def prepare_pattern(pattern)
        pattern.scan(/(\<[^\>]+\>[^\<]?)/).flatten.map { |x| TagPattern.new(x) }.reverse
      end

    end
  end

end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module NLPBackpack

  module Chunker
    # Parses a single chunk-grammar token such as "<DT>?" or "<JJ.*>*"
    # into a tag regexp plus an optional repetition/optionality marker.
    class TagPattern

      attr_reader :tag, :conditions

      # Example inputs:
      # <DT>?
      # <JJ.*>*
      def initialize(pattern)
        extract_tag_and_options(pattern)
      end

      private

      # TODO Make this work for strings wrapped in " " as well as ' '
      def extract_tag_and_options(pattern)
        tag_match = pattern.match(/\<([^\>]+)\>/)
        @tag = Regexp.new(tag_match[1]) if tag_match

        # The single character (if any) after the closing ">" is the
        # condition marker: "?", "*" or "+".
        condition_match = pattern.match(/\<[^\>]+\>(.)/)
        @conditions = condition_match[1] if condition_match
      end

    end
  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module NLPBackpack
  module Classifier

    # Shared persistence for classifiers: instances are serialized to
    # disk with Marshal and restored via Classifier::Base.load.
    class Base
      class << self
        # Restore a previously saved classifier from db_path.
        # SECURITY: Marshal.load must only be fed trusted files —
        # deserializing untrusted data can execute arbitrary code.
        def load(db_path)
          # Binary read keeps the Marshal byte stream intact on every
          # platform (text mode would mangle it on Windows).
          Marshal.load(File.binread(db_path))
        end
      end

      # Path the classifier is saved to / loaded from.
      attr_accessor :db_filepath

      # Serialize self to @db_filepath (binary mode, since Marshal data
      # is not text). Raises RuntimeError when no path has been set.
      def save
        raise "You haven't set a db_filepath, I don't know where to save" if @db_filepath.nil?
        File.open(@db_filepath, "wb") do |f|
          f.write(Marshal.dump(self))
        end
      end
    end

  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# Bayes Theorem
# P(A|B) = P(B|A) * P(A) / P(B)
#
# Terminology
# An ITEM is made up of FEATURES
# An ITEM belongs to a CLASS
#
# Bayes With Our Terminology
# P(Class | Item) = P(Item | Class) * P(Class) / P(Item)
#
# However, when classifying, P(Item) is the same across all calculations
# so we don't bother to calculate it.
|
13
|
+
|
14
|
+
require "nlp_backpack/classifier/base"
|
15
|
+
|
16
|
+
module NLPBackpack
  module Classifier

    # Naive Bayes classifier: scores each class with
    # P(Class | Item) proportional to P(Item | Class) * P(Class),
    # assuming feature independence. P(Item) is constant across classes
    # so it is never computed.
    class NaiveBayes < Base
      # klasses: the class labels this classifier may assign.
      def initialize(*klasses)
        @features_count = {}
        @klass_count = {}
        @klasses = klasses

        klasses.each do |klass|
          # Default of 0.0 gives unseen features an implicit zero count
          # (and keeps all arithmetic in floats).
          @features_count[klass] = Hash.new(0.0)
          @klass_count[klass] = 0.0
        end
      end

      # Record one training item: its class label plus its features.
      # Duplicate features within a single item are counted once.
      def train(klass, *features)
        features.uniq.each do |feature|
          @features_count[klass][feature] += 1
        end
        @klass_count[klass] += 1
      end

      #P(Class | Item) = P(Item | Class) * P(Class)
      # Returns the winning [klass, score] pair.
      def classify(*features)
        scores = {}
        @klasses.each do |klass|
          scores[klass] = (prob_of_item_given_a_class(features, klass) * prob_of_class(klass))
        end
        scores.sort {|a,b| b[1] <=> a[1]}[0]
      end

      private

      # P(Item | Class): product of the per-feature probabilities
      # (the naive independence assumption).
      def prob_of_item_given_a_class(features, klass)
        features.inject(1.0) do |product, feature|
          product * prob_of_feature_given_a_class(feature, klass)
        end
      end

      # P(Feature | Class): relative frequency of the feature within the
      # class, falling back to a smoothed estimate for unseen features.
      def prob_of_feature_given_a_class(feature, klass)
        return assumed_probability if @features_count[klass][feature] == 0
        @features_count[klass][feature] / @klass_count[klass]
      end

      # P(Class): fraction of all training items labelled klass.
      def prob_of_class(klass)
        @klass_count[klass] / total_items
      end

      # Total number of training items seen across all classes (Float).
      def total_items
        @klass_count.inject(0) do |sum, klass|
          sum += klass[1]
        end
      end

      # If we have only trained a little bit a class may not have had a feature yet
      # give it a probability of 0 may not be true so we produce a assumed probability
      # which gets smaller more we train
      def assumed_probability
        0.5 / (total_items/2)
      end
    end

  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'nlp_backpack/evaluation/base'
|
2
|
+
|
3
|
+
module NLPBackpack
  module Evaluation

    # Per-class accuracy report, comparing the expected labels
    # (@correct_results) with the predicted ones (@test_results),
    # both supplied by Evaluation::Base.
    class Accuracy < Base

      # Rounded percentage of correct predictions for klass.
      def accuracy_of(klass)
        results[klass]
      end

      # Human-readable summary, one "<class>: <n>% correct" line each.
      def inspect
        results.map { |klass, result| "#{klass}: #{result}% correct\n" }.join
      end

      private

      # Lazily build {class => rounded % correct} from the paired lists.
      def results
        @results ||= begin
          correct_klass_count = Hash.new {|h,k| h[k] = 0.0}
          total_klass_count = Hash.new {|h,k| h[k] = 0.0}

          @correct_results.each_with_index do |correct_result, index|
            total_klass_count[correct_result] += 1
            correct_klass_count[correct_result] += 1 if correct_result == @test_results[index]
          end

          total_klass_count.each_with_object({}) do |(klass, total_count), acc|
            acc[klass] = (correct_klass_count[klass] / total_count * 100).round
          end
        end
      end
    end

  end
end
|