nlp_backpack 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. data/.document +5 -0
  2. data/.gitignore +21 -0
  3. data/LICENSE +20 -0
  4. data/README.rdoc +22 -0
  5. data/Rakefile +45 -0
  6. data/VERSION +1 -0
  7. data/lib/nlp_backpack.rb +10 -0
  8. data/lib/nlp_backpack/chunker.rb +5 -0
  9. data/lib/nlp_backpack/chunker/regex_chunker.rb +107 -0
  10. data/lib/nlp_backpack/chunker/tag_pattern.rb +31 -0
  11. data/lib/nlp_backpack/classifier.rb +5 -0
  12. data/lib/nlp_backpack/classifier/base.rb +28 -0
  13. data/lib/nlp_backpack/classifier/naive_bayes.rb +83 -0
  14. data/lib/nlp_backpack/evaluation.rb +6 -0
  15. data/lib/nlp_backpack/evaluation/accuracy.rb +46 -0
  16. data/lib/nlp_backpack/evaluation/base.rb +12 -0
  17. data/lib/nlp_backpack/evaluation/confusion_matrix.rb +66 -0
  18. data/lib/nlp_backpack/frequency_distribution.rb +47 -0
  19. data/lib/nlp_backpack/pos.rb +5 -0
  20. data/lib/nlp_backpack/pos/brill_tagger.rb +142 -0
  21. data/lib/nlp_backpack/pos/brill_tagger/lexicon.txt +93696 -0
  22. data/lib/nlp_backpack/pos/pos_array.rb +32 -0
  23. data/lib/nlp_backpack/stop_words.rb +17 -0
  24. data/lib/nlp_backpack/stop_words/stop_words.txt +429 -0
  25. data/lib/nlp_backpack/tokenizers/custom.rb +13 -0
  26. data/lib/nlp_backpack/tokenizers/line.rb +13 -0
  27. data/lib/nlp_backpack/tokenizers/space.rb +13 -0
  28. data/lib/nlp_backpack/tokenizers/tab.rb +13 -0
  29. data/lib/nlp_backpack/tokenizers/whitespace.rb +13 -0
  30. data/lib/nlp_backpack/tokenizers/word.rb +13 -0
  31. data/nlp_backpack.gemspec +109 -0
  32. data/spec/chunkers/regex_chunker_spec.rb +46 -0
  33. data/spec/chunkers/tag_pattern_spec.rb +40 -0
  34. data/spec/classifiers/naive_bayes_spec.rb +68 -0
  35. data/spec/evaluation/accuracy_spec.rb +29 -0
  36. data/spec/evaluation/confusion_matrix_spec.rb +29 -0
  37. data/spec/frequency_distribution_spec.rb +53 -0
  38. data/spec/nlp_backpack_spec.rb +4 -0
  39. data/spec/pos/brill_tagger_spec.rb +24 -0
  40. data/spec/pos/pos_array_spec.rb +45 -0
  41. data/spec/spec.opts +1 -0
  42. data/spec/spec_helper.rb +18 -0
  43. data/spec/stop_words_spec.rb +15 -0
  44. data/spec/test_saves/naive.nb +1 -0
  45. data/spec/tokenizers/custom_spec.rb +24 -0
  46. data/spec/tokenizers/line_spec.rb +15 -0
  47. data/spec/tokenizers/space_spec.rb +15 -0
  48. data/spec/tokenizers/tab_spec.rb +15 -0
  49. data/spec/tokenizers/whitespace_spec.rb +16 -0
  50. data/spec/tokenizers/word_spec.rb +15 -0
  51. metadata +141 -0
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 reddavis
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,22 @@
1
+ = NLP Backpack
2
+
3
+ An NLTK-like playground for Rubyists. A notebook of my studies.
4
+
5
+ If a fish is a fish, what is a fish?
6
+
7
+ == TODO
8
+
9
+ * Lots
10
+
11
+ Classifiers
12
+ * Lots of them!
13
+
14
+ Chunkers
15
+ * IOB tags
16
+ * Tree structure
17
+ * Unigram Chunker
18
+
19
+ Data
20
+
21
+ Evaluations
22
+ * F Score
@@ -0,0 +1,45 @@
1
# Build, test and documentation tasks for the nlp_backpack gem.
require 'rubygems'
require 'rake'

# Gem packaging tasks come from Jeweler. If it cannot be loaded, only a hint
# is printed and the remaining tasks below are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "nlp_backpack"
    gem.summary = %Q{A backpack full of useful toys}
    gem.description = %Q{A backpack full of useful toys}
    gem.email = "reddavis@gmail.com"
    gem.homepage = "http://github.com/reddavis/NLP-Backpack"
    gem.authors = ["reddavis"]
    gem.add_development_dependency "rspec", ">= 1.2.9"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Runs every *_spec.rb file under spec/.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# Same specs, with rcov coverage enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# :check_dependencies is not defined in this file (presumably it is provided
# by Jeweler::Tasks -- confirm). Only depend on it when it actually exists, so
# `rake spec` keeps working after the LoadError fallback above.
task :spec => :check_dependencies if Rake::Task.task_defined?('check_dependencies')

task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip surrounding whitespace so a trailing newline in VERSION does not
  # end up inside the documentation title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "nlp_backpack #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
@@ -0,0 +1,10 @@
1
+ $:.unshift File.expand_path(File.dirname(__FILE__))
2
+
3
+ module NLPBackpack
4
+ autoload :FrequencyDistribution, "nlp_backpack/frequency_distribution"
5
+ autoload :Evaluation, "nlp_backpack/evaluation"
6
+ autoload :Classifier, "nlp_backpack/classifier"
7
+ autoload :StopWords, "nlp_backpack/stop_words"
8
+ autoload :Chunker, "nlp_backpack/chunker"
9
+ autoload :POS, "nlp_backpack/pos"
10
+ end
@@ -0,0 +1,5 @@
1
+ module NLPBackpack
2
+ module Chunker
3
+ autoload :RegexChunker, "nlp_backpack/chunker/regex_chunker"
4
+ end
5
+ end
@@ -0,0 +1,107 @@
1
+ # ChunkGrammar
2
+ # Chunk grammars are regex-esque patterns that indicate how sentences should be chunked.
3
+ # The patterns are made up of Tag Patterns.
4
+ # <DT>?<JJ.*>* == Optional determiner, followed by zero or more of any type of adjective.
5
+
6
+ require 'nlp_backpack/chunker/tag_pattern'
7
+
8
module NLPBackpack

  module Chunker
    # Internal signal raised by RegexChunker#next_pattern to ask #match to
    # re-test the current word against the next grammar element. It is always
    # rescued inside #match. It derives from StandardError because it is an
    # ordinary library-level signal, not a system-level exception.
    class Retry < StandardError; end

    # Extracts chunks (runs of consecutive words) whose part-of-speech tags
    # follow a chunk grammar such as "<DT>?<JJ.*>*<NN>".
    class RegexChunker

      # pattern - String chunk grammar made of "<TAG>" elements, each
      #           optionally followed by a one-character quantifier.
      def initialize(pattern)
        @pattern = prepare_pattern(pattern)

        # Frozen master copy; @pattern is the working stack consumed by pops.
        @cached_pattern = @pattern.clone
        @cached_pattern.freeze

        @matched_patterns = []
        @potential_pattern = []
      end

      # Extract all matches.
      #
      # pos_array - Enumerable of [word, pos_tag] pairs.
      #
      # Returns an Array of Strings, one per chunk, words joined by a space.
      def match(pos_array)
        # A grammar without any <TAG> element can never match anything (and
        # would otherwise recurse endlessly in next_pattern).
        return [] if @cached_pattern.empty?

        # Start from a clean slate so repeated calls on the same chunker do
        # not see the previous call's position or results.
        reset_pattern!
        @matched_patterns = []
        reset_potential_pattern!
        next_pattern(:start)

        pos_array.each do |word|
          retries = 0 # lives outside the begin block so `retry` keeps the count

          begin
            pos = word[1]

            if pos.match(@current_pattern.tag)
              @potential_pattern << word[0]
              next_pattern(:matched)
            else
              next_pattern(:no_match)
            end
          rescue Retry
            retries += 1
            # Bounded retry: grammars whose elements are all optional would
            # otherwise cycle forever (or end up with no current element).
            retry if @current_pattern && retries <= retry_limit
            abandon_word!
          end
        end

        pop_potential_pattern!

        @matched_patterns.map {|pattern| pattern.join(" ") }
      end

      private

      # Advances the state machine and stores the element to test next in
      # @current_pattern.
      #
      # state - :start / :next  take the next element from the stack
      #         :matched        stay on a repeatable ("+" or "*") element,
      #                         otherwise take the next one
      #         :no_match       skip an optional ("?" or "*") element and raise
      #                         Retry so the same word is tested again;
      #                         otherwise flush the candidate chunk and restart
      def next_pattern(state)
        # Stack exhausted: reload it and step onto the first element before
        # handling +state+, so the checks below look at that first element.
        if @pattern.empty?
          reset_pattern!
          next_pattern(:start)
        end

        @current_pattern = case state
        when :start, :next
          @pattern.pop
        when :matched
          if ["+", "*"].include?(@current_pattern.conditions)
            @current_pattern
          else
            @pattern.pop
          end
        when :no_match
          if ["?", "*"].include?(@current_pattern.conditions)
            next_pattern(:next)
            raise Retry #Prob not the best way to do this(?)
          else
            # Start again
            reset_pattern!
            pop_potential_pattern!
            next_pattern(:start)
          end
        end
      end

      # Gives up on the current word: flushes the candidate chunk and restarts
      # the grammar from its first element.
      def abandon_word!
        reset_pattern!
        pop_potential_pattern!
        next_pattern(:start)
      end

      # Maximum number of Retry signals honoured for a single word.
      def retry_limit
        @cached_pattern.size * 2
      end

      # Records the candidate chunk when it is non-empty and covers at least
      # every mandatory grammar element, then clears it.
      def pop_potential_pattern!
        if !@potential_pattern.empty? && @potential_pattern.size >= minimum_potential_pattern_size
          @matched_patterns << @potential_pattern
        end

        reset_potential_pattern!
      end

      # Reloads the working stack from the frozen master copy.
      def reset_pattern!
        @pattern = @cached_pattern.dup
      end

      def reset_potential_pattern!
        @potential_pattern = []
      end

      # Number of grammar elements that are not optional ("?" or "*").
      def minimum_potential_pattern_size
        @cached_pattern.reject {|x| ["?", "*"].include?(x.conditions)}.size
      end

      # Splits a grammar such as "<DT>?<JJ.*>*" into TagPattern objects.
      # The list is reversed so that Array#pop yields elements in grammar order.
      def prepare_pattern(pattern)
        pattern.scan(/(\<[^\>]+\>[^\<]?)/).flatten.map { |x| TagPattern.new(x) }.reverse
      end

    end
  end

end
@@ -0,0 +1,31 @@
1
module NLPBackpack

  module Chunker
    # One element of a chunk grammar: a part-of-speech tag expression plus an
    # optional one-character quantifier.
    #
    # tag        - Regexp built from the text between "<" and ">"
    #              (nil when the input contains no "<...>").
    # conditions - String holding the single character that directly follows
    #              ">" (e.g. "?", "*", "+"), or nil when there is none.
    class TagPattern

      # Captures the tag text and, if present, the character right after ">".
      # A single expression guarantees both pieces come from the same "<...>".
      TAG_AND_CONDITION = /<([^>]+)>(.)?/

      attr_reader :tag, :conditions

      # Example inputs:
      # <DT>?
      # <JJ.*>*
      def initialize(pattern)
        extract_tag_and_options(pattern)
      end

      private

      # TODO Make this work for strings wrapped in " " as well as ' '
      #
      # Leaves both attributes unset (nil) when +pattern+ has no "<...>" part.
      def extract_tag_and_options(pattern)
        parsed = TAG_AND_CONDITION.match(pattern)
        return unless parsed

        @tag = /#{parsed[1]}/
        @conditions = parsed[2]
      end

    end
  end

end
@@ -0,0 +1,5 @@
1
+ module NLPBackpack
2
+ module Classifier
3
+ autoload :NaiveBayes, "nlp_backpack/classifier/naive_bayes"
4
+ end
5
+ end
@@ -0,0 +1,28 @@
1
module NLPBackpack
  module Classifier

    # Persistence support shared by classifiers: an instance is written to
    # disk with Marshal and can be restored later with Base.load.
    class Base
      class << self
        # Restores an object previously written by #save.
        #
        # db_path - path of the file to read.
        #
        # Returns whatever object the file's Marshal data describes.
        #
        # SECURITY: Marshal.load can instantiate arbitrary objects; only load
        # files that come from a trusted source.
        def load(db_path)
          # Marshal output is binary, so read the file in binary mode and in
          # one piece rather than line by line in text mode.
          data = File.open(db_path, "rb") { |f| f.read }
          Marshal.load(data)
        end
      end

      # Path of the file #save writes to.
      attr_accessor :db_filepath

      # Serialises this object with Marshal into db_filepath, replacing any
      # existing file.
      #
      # Returns the value of IO#write (number of bytes written).
      # Raises RuntimeError when db_filepath has not been set.
      def save
        raise "You haven't set a db_filepath, I dont know where to save" if @db_filepath.nil?
        # Binary mode keeps the Marshal bytes untouched on every platform.
        File.open(@db_filepath, "wb") do |f|
          f.write(Marshal.dump(self))
        end
      end
    end

  end
end
@@ -0,0 +1,83 @@
1
+ # Bayes Theorem
2
+ # P(A|B) = P(B|A) * P(A) / P(B)
3
+
4
+ # Terminology
5
+ # An ITEM is made up of FEATURES
6
+ # An ITEM belongs to a CLASS
7
+
8
+ # Bayes With Our Terminology
9
+ # P(Class | Item) = P(Item | Class) * P(Class) / P(Item)
10
+
11
+ # However, when classifying, P(Item) is the same across all calculations
12
+ # So we don't bother to calculate it
13
+
14
+ require "nlp_backpack/classifier/base"
15
+
16
module NLPBackpack
  module Classifier

    # Naive Bayes classifier over arbitrary feature values.
    #
    # Scores are P(Item | Class) * P(Class); P(Item) is left out because it is
    # the same for every class being compared.
    class NaiveBayes < Base
      # klasses - the class labels this classifier distinguishes between.
      def initialize(*klasses)
        @features_count = {}
        @klass_count = {}
        @klasses = klasses

        klasses.each do |klass|
          # Float counters so later divisions are floating-point divisions.
          @features_count[klass] = Hash.new(0.0)
          @klass_count[klass] = 0.0
        end
      end

      # Records one training item.
      #
      # klass    - label of the item; must be one of the labels passed to new.
      # features - the item's features; duplicates are counted once per item.
      #
      # Returns the updated number of items seen for +klass+.
      def train(klass, *features)
        features.uniq.each do |feature|
          @features_count[klass][feature] += 1
        end
        @klass_count[klass] += 1
      end

      # P(Class | Item) = P(Item | Class) * P(Class)
      #
      # Returns a two-element Array [klass, score] for the best scoring class
      # (the first one in construction order on ties), or nil when the
      # classifier was created without classes. Before any training every
      # score is 0.0.
      def classify(*features)
        scores = {}
        @klasses.each do |klass|
          scores[klass] = prob_of_item_given_a_class(features, klass) * prob_of_class(klass)
        end
        scores.max_by { |_klass, score| score }
      end

      private

      # P(Item | Class): product of the per-feature probabilities.
      def prob_of_item_given_a_class(features, klass)
        features.inject(1.0) do |product, feature|
          product * prob_of_feature_given_a_class(feature, klass)
        end
      end

      # P(Feature | Class); falls back to assumed_probability for a feature
      # never seen with +klass+.
      def prob_of_feature_given_a_class(feature, klass)
        return assumed_probability if @features_count[klass][feature] == 0
        @features_count[klass][feature] / @klass_count[klass]
      end

      # P(Class); 0.0 while nothing has been trained (avoids 0/0 => NaN).
      def prob_of_class(klass)
        total = total_items
        return 0.0 if total == 0
        @klass_count[klass] / total
      end

      # Number of training items seen across all classes.
      def total_items
        @klass_count.inject(0) { |sum, (_klass, count)| sum + count }
      end

      # If we have only trained a little bit, a class may not have seen a
      # feature yet. Giving it a probability of 0 may not be true, so we
      # produce an assumed probability which gets smaller the more we train.
      # Returns 0.0 while nothing has been trained (avoids dividing by zero).
      def assumed_probability
        total = total_items
        return 0.0 if total == 0
        0.5 / (total / 2)
      end
    end

  end
end
@@ -0,0 +1,6 @@
1
+ module NLPBackpack
2
+ module Evaluation
3
+ autoload :Accuracy, "nlp_backpack/evaluation/accuracy"
4
+ autoload :ConfusionMatrix, "nlp_backpack/evaluation/confusion_matrix"
5
+ end
6
+ end
@@ -0,0 +1,46 @@
1
+ require 'nlp_backpack/evaluation/base'
2
+
3
module NLPBackpack
  module Evaluation

    # Per-class accuracy report: for each expected class, the percentage of
    # items whose test result equals the expected result.
    #
    # Reads @correct_results and @test_results, which are not assigned in this
    # class (presumably set by Base -- confirm). They are compared position by
    # position.
    class Accuracy < Base

      # Returns the rounded percentage of correct results for +klass+, or nil
      # when +klass+ never occurs among the expected results.
      def accuracy_of(klass)
        results[klass]
      end

      # Returns the report as text, one "<class>: <n>% correct" line per class.
      def inspect
        results.map { |klass, percentage| "#{klass}: #{percentage}% correct\n" }.join
      end

      private

      # Computes the class => percentage Hash on first use and memoizes it.
      def results
        @results ||= begin
          totals = Hash.new(0.0)
          hits = Hash.new(0.0)

          @correct_results.each_with_index do |expected, position|
            totals[expected] += 1
            hits[expected] += 1 if expected == @test_results[position]
          end

          percentages = {}
          totals.each do |klass, total|
            percentages[klass] = (hits[klass] / total * 100).round
          end
          percentages
        end
      end
    end

  end
end