nlp_backpack 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. data/.document +5 -0
  2. data/.gitignore +21 -0
  3. data/LICENSE +20 -0
  4. data/README.rdoc +22 -0
  5. data/Rakefile +45 -0
  6. data/VERSION +1 -0
  7. data/lib/nlp_backpack.rb +10 -0
  8. data/lib/nlp_backpack/chunker.rb +5 -0
  9. data/lib/nlp_backpack/chunker/regex_chunker.rb +107 -0
  10. data/lib/nlp_backpack/chunker/tag_pattern.rb +31 -0
  11. data/lib/nlp_backpack/classifier.rb +5 -0
  12. data/lib/nlp_backpack/classifier/base.rb +28 -0
  13. data/lib/nlp_backpack/classifier/naive_bayes.rb +83 -0
  14. data/lib/nlp_backpack/evaluation.rb +6 -0
  15. data/lib/nlp_backpack/evaluation/accuracy.rb +46 -0
  16. data/lib/nlp_backpack/evaluation/base.rb +12 -0
  17. data/lib/nlp_backpack/evaluation/confusion_matrix.rb +66 -0
  18. data/lib/nlp_backpack/frequency_distribution.rb +47 -0
  19. data/lib/nlp_backpack/pos.rb +5 -0
  20. data/lib/nlp_backpack/pos/brill_tagger.rb +142 -0
  21. data/lib/nlp_backpack/pos/brill_tagger/lexicon.txt +93696 -0
  22. data/lib/nlp_backpack/pos/pos_array.rb +32 -0
  23. data/lib/nlp_backpack/stop_words.rb +17 -0
  24. data/lib/nlp_backpack/stop_words/stop_words.txt +429 -0
  25. data/lib/nlp_backpack/tokenizers/custom.rb +13 -0
  26. data/lib/nlp_backpack/tokenizers/line.rb +13 -0
  27. data/lib/nlp_backpack/tokenizers/space.rb +13 -0
  28. data/lib/nlp_backpack/tokenizers/tab.rb +13 -0
  29. data/lib/nlp_backpack/tokenizers/whitespace.rb +13 -0
  30. data/lib/nlp_backpack/tokenizers/word.rb +13 -0
  31. data/nlp_backpack.gemspec +109 -0
  32. data/spec/chunkers/regex_chunker_spec.rb +46 -0
  33. data/spec/chunkers/tag_pattern_spec.rb +40 -0
  34. data/spec/classifiers/naive_bayes_spec.rb +68 -0
  35. data/spec/evaluation/accuracy_spec.rb +29 -0
  36. data/spec/evaluation/confusion_matrix_spec.rb +29 -0
  37. data/spec/frequency_distribution_spec.rb +53 -0
  38. data/spec/nlp_backpack_spec.rb +4 -0
  39. data/spec/pos/brill_tagger_spec.rb +24 -0
  40. data/spec/pos/pos_array_spec.rb +45 -0
  41. data/spec/spec.opts +1 -0
  42. data/spec/spec_helper.rb +18 -0
  43. data/spec/stop_words_spec.rb +15 -0
  44. data/spec/test_saves/naive.nb +1 -0
  45. data/spec/tokenizers/custom_spec.rb +24 -0
  46. data/spec/tokenizers/line_spec.rb +15 -0
  47. data/spec/tokenizers/space_spec.rb +15 -0
  48. data/spec/tokenizers/tab_spec.rb +15 -0
  49. data/spec/tokenizers/whitespace_spec.rb +16 -0
  50. data/spec/tokenizers/word_spec.rb +15 -0
  51. metadata +141 -0
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 reddavis
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,22 @@
1
+ = NLP Backpack
2
+
3
+ An NLTK-like playground for Rubyists. A notebook of my studies.
4
+
5
+ If a fish is a fish, what is a fish?
6
+
7
+ == TODO
8
+
9
+ * Lots
10
+
11
+ Classifiers
12
+ * Lots of them!
13
+
14
+ Chunkers
15
+ * IOB tags
16
+ * Tree structure
17
+ * Unigram Chunker
18
+
19
+ Data
20
+
21
+ Evaluations
22
+ * F Score
@@ -0,0 +1,45 @@
# Rakefile for the nlp_backpack gem.
#
# Sets up, when the corresponding libraries can be loaded:
#   * Jeweler packaging tasks and Gemcutter release tasks
#   * :spec - runs every file matching spec/**/*_spec.rb
#   * :rcov - runs the same specs with rcov enabled
#   * rdoc generation through Rake::RDocTask
# The default task is :spec.
require 'rubygems'
require 'rake'

begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "nlp_backpack"
    gem.summary = %Q{A backpack full of useful toys}
    gem.description = %Q{A backpack full of useful toys}
    gem.email = "reddavis@gmail.com"
    gem.homepage = "http://github.com/reddavis/NLP-Backpack"
    gem.authors = ["reddavis"]
    gem.add_development_dependency "rspec", ">= 1.2.9"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  # Jeweler is optional: the remaining tasks are still defined without it.
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# :check_dependencies is not defined in this Rakefile (presumably it comes from
# Jeweler::Tasks - confirm). Only hook it in when it exists, so that `rake spec`
# keeps working after the LoadError fallback above.
task :spec => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip so a trailing newline in VERSION does not end up in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "nlp_backpack #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
@@ -0,0 +1,10 @@
# Entry point of the library: puts this file's directory on the load path so
# the "nlp_backpack/..." paths registered below can be resolved.
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__)))

# Top-level namespace. Every component is registered with autoload, so its
# file is only required the first time the constant is referenced.
module NLPBackpack
  [
    [:FrequencyDistribution, "nlp_backpack/frequency_distribution"],
    [:Evaluation,            "nlp_backpack/evaluation"],
    [:Classifier,            "nlp_backpack/classifier"],
    [:StopWords,             "nlp_backpack/stop_words"],
    [:Chunker,               "nlp_backpack/chunker"],
    [:POS,                   "nlp_backpack/pos"]
  ].each do |constant_name, path|
    autoload constant_name, path
  end
end
@@ -0,0 +1,5 @@
module NLPBackpack
  # Namespace for chunkers. RegexChunker is registered with autoload, so its
  # file is required on first reference to the constant.
  module Chunker
    autoload(:RegexChunker, "nlp_backpack/chunker/regex_chunker")
  end
end
@@ -0,0 +1,107 @@
1
+ # ChunkGrammar
2
+ # Chunk grammars are regex-like patterns that indicate how sentences should be chunked.
3
+ # The patterns are made up of tag patterns.
4
+ # <DT>?<JJ.*>* == Optional determiner, followed by zero or more of any type of adjective.
5
+
6
+ require 'nlp_backpack/chunker/tag_pattern'
7
+
module NLPBackpack

  module Chunker
    # Kept only so existing references to the constant keep resolving; the
    # chunker below no longer raises it.
    class Retry < Exception; end

    # Extracts chunks of words whose part-of-speech tags match a chunk grammar.
    #
    # A grammar is a string of tag patterns such as "<DT>?<JJ.*>*<NN.*>+".
    # Each "<...>" holds a regular expression that is tested against a word's
    # tag, optionally followed by one quantifier character:
    #   ?  zero or one word
    #   *  zero or more words
    #   +  one or more words
    #   (anything else / nothing)  exactly one word
    #
    # Matching is greedy and does not backtrack: every tag pattern consumes as
    # many consecutive words as its quantifier allows before the next tag
    # pattern is tried.
    class RegexChunker

      # pattern - the chunk grammar String.
      def initialize(pattern)
        @pattern = prepare_pattern(pattern).freeze
      end

      # Extract all matches.
      #
      # pos_array - any object whose #each yields words, where word[0] is the
      #             text and word[1] is the tag String.
      #
      # Returns an Array of Strings, one per chunk, each being the chunk's
      # words joined by a single space. Chunks never overlap. The chunker keeps
      # no state between calls, so repeated calls are independent.
      def match(pos_array)
        words = []
        pos_array.each { |word| words << word }

        chunks = []
        index = 0

        while index < words.size
          length = chunk_length_at(words, index)

          if length > 0
            chunks << words[index, length].map { |word| word[0] }.join(" ")
            index += length
          else
            # No chunk starts here; try the next word as a starting point.
            index += 1
          end
        end

        chunks
      end

      private

      # Number of words, starting at +start+, covered by one pass through the
      # whole grammar. Returns 0 when a required tag pattern finds no word (or
      # when every tag pattern was optional and none matched).
      def chunk_length_at(words, start)
        position = start

        @pattern.each do |tag_pattern|
          quantifier = tag_pattern.conditions
          optional   = ["?", "*"].include?(quantifier)
          repeatable = ["+", "*"].include?(quantifier)

          consumed = 0
          while position + consumed < words.size &&
                (repeatable || consumed.zero?) &&
                words[position + consumed][1].match(tag_pattern.tag)
            consumed += 1
          end

          return 0 if consumed.zero? && !optional

          position += consumed
        end

        position - start
      end

      # Extract patterns like <DT>? -- <JJ.*>* and wrap each in a TagPattern,
      # in the order they appear in the grammar.
      def prepare_pattern(pattern)
        pattern.scan(/(\<[^\>]+\>[^\<]?)/).flatten.map { |x| TagPattern.new(x) }
      end

    end
  end

end
@@ -0,0 +1,31 @@
module NLPBackpack

  module Chunker
    # One element of a chunk grammar: a tag regular expression plus the single
    # character, if any, that follows the closing ">".
    #
    # Example inputs:
    #   <DT>?    => tag /DT/,   conditions "?"
    #   <JJ.*>*  => tag /JJ.*/, conditions "*"
    #   <NN>     => tag /NN/,   conditions nil
    class TagPattern

      # tag        - Regexp built from the text between "<" and ">"
      #              (nil when the input contains no "<...>").
      # conditions - the character following ">" (nil when there is none).
      attr_reader :tag, :conditions

      def initialize(pattern)
        extract_tag_and_options(pattern)
      end

      private

      # TODO Make this work for strings wrapped in " " as well as ' '
      #
      # Sets @tag and @conditions from +pattern+; each is left unset when its
      # part of the pattern is absent.
      def extract_tag_and_options(pattern)
        tag_match = pattern.match(/\<([^\>]+)\>/)
        @tag = /#{tag_match[1]}/ if tag_match

        condition_match = pattern.match(/\<[^\>]+\>(.)/)
        @conditions = condition_match[1] if condition_match
      end

    end
  end

end
@@ -0,0 +1,5 @@
module NLPBackpack
  # Namespace for classifiers. NaiveBayes is registered with autoload, so its
  # file is required on first reference to the constant.
  module Classifier
    autoload(:NaiveBayes, "nlp_backpack/classifier/naive_bayes")
  end
end
@@ -0,0 +1,28 @@
module NLPBackpack
  module Classifier

    # Persistence support shared by classifiers: an instance is written to
    # disk with Marshal.dump and restored with Marshal.load.
    class Base
      class << self
        # Restores a classifier previously written by #save.
        #
        # db_path - path of the file to read.
        #
        # Returns whatever object was marshalled into the file.
        #
        # NOTE: Marshal.load can instantiate arbitrary objects; only load
        # files that come from a trusted source.
        def load(db_path)
          # Marshal output is binary, so read it in binary mode.
          data = File.open(db_path, "rb") { |f| f.read }
          Marshal.load(data)
        end
      end

      # Path that #save writes to.
      attr_accessor :db_filepath

      # Writes the marshalled form of this object to db_filepath, replacing
      # any existing file.
      #
      # Raises RuntimeError when db_filepath has not been set.
      def save
        raise "You haven't set a db_filepath, I dont know where to save" if @db_filepath.nil?
        # Binary mode: Marshal output must not go through newline translation.
        File.open(@db_filepath, "wb") do |f|
          f.write(Marshal.dump(self))
        end
      end
    end

  end
end
@@ -0,0 +1,83 @@
1
+ # Bayes Theorem
2
+ # P(A|B) = P(B|A) * P(A) / P(B)
3
+
4
+ # Terminology
5
+ # An ITEM is made up of FEATURES
6
+ # An ITEM belongs to a CLASS
7
+
8
+ # Bayes With Our Terminology
9
+ # P(Class | Item) = P(Item | Class) * P(Class) / P(Item)
10
+
11
+ # However, when classifying, P(Item) is the same across all calculations
12
+ # So we don't bother to calculate it
13
+
14
+ require "nlp_backpack/classifier/base"
15
+
module NLPBackpack
  module Classifier

    # Naive Bayes classifier over a fixed set of classes.
    #
    # Scores are P(Item | Class) * P(Class); P(Item) is left out because it is
    # the same for every class being compared.
    class NaiveBayes < Base
      # klasses - the classes this classifier can be trained on and can return.
      def initialize(*klasses)
        @klasses = klasses
        @features_count = {}
        @klass_count = {}

        klasses.each do |klass|
          # Float counters keep the later divisions in floating point.
          @features_count[klass] = Hash.new(0.0)
          @klass_count[klass] = 0.0
        end
      end

      # Records one training item for +klass+.
      #
      # klass    - one of the classes given to the constructor.
      # features - the item's features; duplicates are counted once per item.
      #
      # Returns the updated number of items seen for +klass+.
      # Raises ArgumentError when +klass+ was not given to the constructor.
      def train(klass, *features)
        unless @klass_count.key?(klass)
          raise ArgumentError, "Unknown class #{klass.inspect}; expected one of #{@klasses.inspect}"
        end

        features.uniq.each do |feature|
          @features_count[klass][feature] += 1
        end
        @klass_count[klass] += 1
      end

      # P(Class | Item) = P(Item | Class) * P(Class)
      #
      # Returns a two-element Array [klass, score] for the highest score.
      def classify(*features)
        scores = {}
        @klasses.each do |klass|
          scores[klass] = prob_of_item_given_a_class(features, klass) * prob_of_class(klass)
        end
        scores.sort { |a, b| b[1] <=> a[1] }[0]
      end

      private

      # P(Item | Class): the product of the per-feature probabilities.
      def prob_of_item_given_a_class(features, klass)
        features.inject(1.0) do |product, feature|
          product * prob_of_feature_given_a_class(feature, klass)
        end
      end

      # P(Feature | Class): the share of the class's items that had the
      # feature, or the assumed probability when the class never had it.
      def prob_of_feature_given_a_class(feature, klass)
        count = @features_count[klass][feature]
        return assumed_probability if count == 0
        count / @klass_count[klass]
      end

      # P(Class): the share of all trained items that belong to +klass+.
      def prob_of_class(klass)
        @klass_count[klass] / total_items
      end

      # Number of items trained so far, across every class.
      def total_items
        @klass_count.values.inject(0) { |sum, count| sum + count }
      end

      # If we have only trained a little, a class may not have seen a feature
      # yet. Giving it a probability of 0 may not be true, so we use an assumed
      # probability instead, which gets smaller the more we train
      # (0.5 / (total_items / 2), i.e. 1 / total_items).
      def assumed_probability
        0.5 / (total_items / 2)
      end
    end

  end
end
@@ -0,0 +1,6 @@
module NLPBackpack
  # Namespace for evaluation tools. Both are registered with autoload, so each
  # file is required on first reference to its constant.
  module Evaluation
    autoload(:Accuracy, "nlp_backpack/evaluation/accuracy")
    autoload(:ConfusionMatrix, "nlp_backpack/evaluation/confusion_matrix")
  end
end
@@ -0,0 +1,46 @@
1
+ require 'nlp_backpack/evaluation/base'
2
+
module NLPBackpack
  module Evaluation

    # Per-class accuracy: for every class found in @correct_results, the
    # rounded percentage of positions where @test_results holds the same value.
    # NOTE(review): @correct_results and @test_results are not set in this
    # class; presumably Base assigns them - confirm.
    class Accuracy < Base

      # Returns the rounded percentage for +klass+, or nil when +klass+ never
      # appears in @correct_results.
      def accuracy_of(klass)
        results[klass]
      end

      # Returns one "<klass>: <percentage>% correct" line per class.
      def inspect
        results.inject("") do |report, (klass, result)|
          report << "#{klass}: #{result}% correct\n"
        end
      end

      private

      # Hash of class => rounded percentage; computed once and memoised.
      def results
        @results ||= begin
          totals = Hash.new { |hash, key| hash[key] = 0.0 }
          hits = Hash.new { |hash, key| hash[key] = 0.0 }

          @correct_results.each_with_index do |expected, index|
            totals[expected] += 1
            hits[expected] += 1 if expected == @test_results[index]
          end

          percentages = {}
          totals.each do |klass, total|
            percentages[klass] = (hits[klass] / total * 100).round
          end
          percentages
        end
      end
    end

  end
end