RubyGems - nlp_backpack - Versions diffs - 0.0.0 - Mend

nlp_backpack 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51)

data/.document +5 -0
data/.gitignore +21 -0
data/LICENSE +20 -0
data/README.rdoc +22 -0
data/Rakefile +45 -0
data/VERSION +1 -0
data/lib/nlp_backpack.rb +10 -0
data/lib/nlp_backpack/chunker.rb +5 -0
data/lib/nlp_backpack/chunker/regex_chunker.rb +107 -0
data/lib/nlp_backpack/chunker/tag_pattern.rb +31 -0
data/lib/nlp_backpack/classifier.rb +5 -0
data/lib/nlp_backpack/classifier/base.rb +28 -0
data/lib/nlp_backpack/classifier/naive_bayes.rb +83 -0
data/lib/nlp_backpack/evaluation.rb +6 -0
data/lib/nlp_backpack/evaluation/accuracy.rb +46 -0
data/lib/nlp_backpack/evaluation/base.rb +12 -0
data/lib/nlp_backpack/evaluation/confusion_matrix.rb +66 -0
data/lib/nlp_backpack/frequency_distribution.rb +47 -0
data/lib/nlp_backpack/pos.rb +5 -0
data/lib/nlp_backpack/pos/brill_tagger.rb +142 -0
data/lib/nlp_backpack/pos/brill_tagger/lexicon.txt +93696 -0
data/lib/nlp_backpack/pos/pos_array.rb +32 -0
data/lib/nlp_backpack/stop_words.rb +17 -0
data/lib/nlp_backpack/stop_words/stop_words.txt +429 -0
data/lib/nlp_backpack/tokenizers/custom.rb +13 -0
data/lib/nlp_backpack/tokenizers/line.rb +13 -0
data/lib/nlp_backpack/tokenizers/space.rb +13 -0
data/lib/nlp_backpack/tokenizers/tab.rb +13 -0
data/lib/nlp_backpack/tokenizers/whitespace.rb +13 -0
data/lib/nlp_backpack/tokenizers/word.rb +13 -0
data/nlp_backpack.gemspec +109 -0
data/spec/chunkers/regex_chunker_spec.rb +46 -0
data/spec/chunkers/tag_pattern_spec.rb +40 -0
data/spec/classifiers/naive_bayes_spec.rb +68 -0
data/spec/evaluation/accuracy_spec.rb +29 -0
data/spec/evaluation/confusion_matrix_spec.rb +29 -0
data/spec/frequency_distribution_spec.rb +53 -0
data/spec/nlp_backpack_spec.rb +4 -0
data/spec/pos/brill_tagger_spec.rb +24 -0
data/spec/pos/pos_array_spec.rb +45 -0
data/spec/spec.opts +1 -0
data/spec/spec_helper.rb +18 -0
data/spec/stop_words_spec.rb +15 -0
data/spec/test_saves/naive.nb +1 -0
data/spec/tokenizers/custom_spec.rb +24 -0
data/spec/tokenizers/line_spec.rb +15 -0
data/spec/tokenizers/space_spec.rb +15 -0
data/spec/tokenizers/tab_spec.rb +15 -0
data/spec/tokenizers/whitespace_spec.rb +16 -0
data/spec/tokenizers/word_spec.rb +15 -0
metadata +141 -0

data/lib/nlp_backpack/tokenizers/custom.rb ADDED

@@ -0,0 +1,13 @@
+module NLPBackpack
+  module Tokenizer
+    class Custom # Tokenizer that splits on a delimiter chosen by the caller.
+      class << self # Methods below are class-level: call Custom.tokenize(...) directly.
+        def tokenize(string, spliter) # string: text to split; spliter: delimiter handed unchanged to String#split.
+          string.split(spliter) # The return value is whatever String#split produces for that delimiter.
+        end
+      end
+    end
+  end
+end

data/lib/nlp_backpack/tokenizers/line.rb ADDED

@@ -0,0 +1,13 @@
+module NLPBackpack
+  module Tokenizer
+    class Line # Tokenizer that breaks text apart at newline characters.
+      class << self # Methods below are class-level: call Line.tokenize(...) directly.
+        def tokenize(string) # string: text to split; returns the result of String#split.
+          string.split(/\n{1}/) # Split at each "\n"; the {1} quantifier is equivalent to a bare \n.
+        end
+      end
+    end
+  end
+end

data/lib/nlp_backpack/tokenizers/space.rb ADDED

@@ -0,0 +1,13 @@
+module NLPBackpack
+  module Tokenizer
+    class Space # Tokenizer that splits at every single whitespace character.
+      class << self # Methods below are class-level: call Space.tokenize(...) directly.
+        def tokenize(string) # string: text to split; returns the result of String#split.
+          string.split(/\s{1}/) # NOTE(review): \s matches any whitespace (tab, newline, ...), not only " " - confirm this is intended for a "Space" tokenizer.
+        end
+      end
+    end
+  end
+end

data/lib/nlp_backpack/tokenizers/tab.rb ADDED

@@ -0,0 +1,13 @@
+module NLPBackpack
+  module Tokenizer
+    class Tab # Tokenizer that breaks text apart at tab characters.
+      class << self # Methods below are class-level: call Tab.tokenize(...) directly.
+        def tokenize(string) # string: text to split; returns the result of String#split.
+          string.split(/\t{1}/) # Split at each tab; the {1} quantifier is equivalent to a bare \t.
+        end
+      end
+    end
+  end
+end

data/lib/nlp_backpack/tokenizers/whitespace.rb ADDED

@@ -0,0 +1,13 @@
+module NLPBackpack
+  module Tokenizer
+    class Whitespace # Tokenizer that relies on String#split's default behaviour.
+      class << self # Methods below are class-level: call Whitespace.tokenize(...) directly.
+        def tokenize(string) # string: text to split; returns the result of String#split.
+          string.split # No separator given, so String#split's default whitespace splitting applies.
+        end
+      end
+    end
+  end
+end

data/lib/nlp_backpack/tokenizers/word.rb ADDED

@@ -0,0 +1,13 @@
+module NLPBackpack
+  module Tokenizer
+    class Word # Tokenizer that splits on whitespace and drops non-word characters sitting directly before it.
+      class << self # Methods below are class-level: call Word.tokenize(...) directly.
+        def tokenize(string) # string: text to split; returns the result of String#split.
+          string.split(/\W*\s/) # Separator is a whitespace character plus any non-word characters before it; punctuation with no whitespace after it (e.g. at the end of the string) stays on the token.
+        end
+      end
+    end
+  end
+end

data/nlp_backpack.gemspec ADDED

@@ -0,0 +1,109 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
+# -*- encoding: utf-8 -*-
+Gem::Specification.new do |s| # Gem specification for nlp_backpack; s is the specification object being filled in.
+  s.name = %q{nlp_backpack}
+  s.version = "0.0.0"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version= # Set only when this RubyGems exposes the setter.
+  s.authors = ["reddavis"]
+  s.date = %q{2010-06-21}
+  s.description = %q{A backpack full of useful toys}
+  s.email = %q{reddavis@gmail.com}
+  s.extra_rdoc_files = [ # Non-code files handed to RDoc.
+    "LICENSE",
+     "README.rdoc"
+  ]
+  s.files = [ # Full list of files recorded for the gem.
+    ".document",
+     ".gitignore",
+     "LICENSE",
+     "README.rdoc",
+     "Rakefile",
+     "VERSION",
+     "lib/nlp_backpack.rb",
+     "lib/nlp_backpack/chunker.rb",
+     "lib/nlp_backpack/chunker/regex_chunker.rb",
+     "lib/nlp_backpack/chunker/tag_pattern.rb",
+     "lib/nlp_backpack/classifier.rb",
+     "lib/nlp_backpack/classifier/base.rb",
+     "lib/nlp_backpack/classifier/naive_bayes.rb",
+     "lib/nlp_backpack/evaluation.rb",
+     "lib/nlp_backpack/evaluation/accuracy.rb",
+     "lib/nlp_backpack/evaluation/base.rb",
+     "lib/nlp_backpack/evaluation/confusion_matrix.rb",
+     "lib/nlp_backpack/frequency_distribution.rb",
+     "lib/nlp_backpack/pos.rb",
+     "lib/nlp_backpack/pos/brill_tagger.rb",
+     "lib/nlp_backpack/pos/brill_tagger/lexicon.txt",
+     "lib/nlp_backpack/pos/pos_array.rb",
+     "lib/nlp_backpack/stop_words.rb",
+     "lib/nlp_backpack/stop_words/stop_words.txt",
+     "lib/nlp_backpack/tokenizers/custom.rb",
+     "lib/nlp_backpack/tokenizers/line.rb",
+     "lib/nlp_backpack/tokenizers/space.rb",
+     "lib/nlp_backpack/tokenizers/tab.rb",
+     "lib/nlp_backpack/tokenizers/whitespace.rb",
+     "lib/nlp_backpack/tokenizers/word.rb",
+     "nlp_backpack.gemspec",
+     "spec/chunkers/regex_chunker_spec.rb",
+     "spec/chunkers/tag_pattern_spec.rb",
+     "spec/classifiers/naive_bayes_spec.rb",
+     "spec/evaluation/accuracy_spec.rb",
+     "spec/evaluation/confusion_matrix_spec.rb",
+     "spec/frequency_distribution_spec.rb",
+     "spec/nlp_backpack_spec.rb",
+     "spec/pos/brill_tagger_spec.rb",
+     "spec/pos/pos_array_spec.rb",
+     "spec/spec.opts",
+     "spec/spec_helper.rb",
+     "spec/stop_words_spec.rb",
+     "spec/test_saves/naive.nb",
+     "spec/tokenizers/custom_spec.rb",
+     "spec/tokenizers/line_spec.rb",
+     "spec/tokenizers/space_spec.rb",
+     "spec/tokenizers/tab_spec.rb",
+     "spec/tokenizers/whitespace_spec.rb",
+     "spec/tokenizers/word_spec.rb"
+  ]
+  s.homepage = %q{http://github.com/reddavis/NLP-Backpack}
+  s.rdoc_options = ["--charset=UTF-8"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = %q{1.3.6}
+  s.summary = %q{A backpack full of useful toys}
+  s.test_files = [ # Ruby spec files only; spec/spec.opts and spec/test_saves/naive.nb are in s.files but not listed here.
+    "spec/chunkers/regex_chunker_spec.rb",
+     "spec/chunkers/tag_pattern_spec.rb",
+     "spec/classifiers/naive_bayes_spec.rb",
+     "spec/evaluation/accuracy_spec.rb",
+     "spec/evaluation/confusion_matrix_spec.rb",
+     "spec/frequency_distribution_spec.rb",
+     "spec/nlp_backpack_spec.rb",
+     "spec/pos/brill_tagger_spec.rb",
+     "spec/pos/pos_array_spec.rb",
+     "spec/spec_helper.rb",
+     "spec/stop_words_spec.rb",
+     "spec/tokenizers/custom_spec.rb",
+     "spec/tokenizers/line_spec.rb",
+     "spec/tokenizers/space_spec.rb",
+     "spec/tokenizers/tab_spec.rb",
+     "spec/tokenizers/whitespace_spec.rb",
+     "spec/tokenizers/word_spec.rb"
+  ]
+  if s.respond_to? :specification_version then # Branch for RubyGems that understand specification_version.
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION # NOTE(review): assigned but never read in the lines shown.
+    s.specification_version = 3
+    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then # NOTE(review): Gem::RubyGemsVersion is a legacy constant (Gem::VERSION in current RubyGems) - confirm before regenerating.
+      s.add_development_dependency(%q<rspec>, [">= 1.2.9"]) # RubyGems 1.2.0 or later: rspec is declared as development-only.
+    else
+      s.add_dependency(%q<rspec>, [">= 1.2.9"]) # Earlier RubyGems: falls back to an ordinary dependency.
+    end
+  else
+    s.add_dependency(%q<rspec>, [">= 1.2.9"]) # No specification_version support: ordinary dependency.
+  end
+end

data/spec/chunkers/regex_chunker_spec.rb ADDED

@@ -0,0 +1,46 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+require 'nlp_backpack/pos/pos_array'
+include NLPBackpack
+describe Chunker::RegexChunker do # Specs for matching tag-pattern grammars against a tagged sentence.
+  describe "Matching specified chunks" do
+    before do # Builds @pos_a from the #sentence fixture defined at the bottom of this group.
+      @pos_a = POS::POSArray.new
+      sentence.each {|word| @pos_a << word} # Each element appended is a [word, tag] pair.
+    end
+    describe "Simple chunk" do
+      it "should return 'this is some text'" do
+        grammer = Chunker::RegexChunker.new("<DT><VBZ><DT><NN>") # Four-tag pattern: DT, VBZ, DT, NN.
+        grammer.match(@pos_a).first.should == "this is some text" # The first match is returned as the words joined by spaces.
+      end
+    end
+    describe "Chunk with conditional tag" do
+      it "should return 'this is some text'" do # NOTE(review): description says 'this is some text' but the assertions expect "this is some" and "this some".
+        grammer = Chunker::RegexChunker.new("<DT><VBZ>?<DT>") # "?" follows the VBZ tag.
+        grammer.match(@pos_a)[0].should == "this is some"
+        grammer.match(@pos_a)[1].should == "this some" # Second match has no VBZ word between the two DT words.
+      end
+      it "should return 'this is some text'" do # NOTE(review): same description/assertion mismatch as the example above.
+        grammer = Chunker::RegexChunker.new("<DT><VBZ>*<DT>") # "*" follows the VBZ tag; the expected matches are the same as for "?".
+        grammer.match(@pos_a)[0].should == "this is some"
+        grammer.match(@pos_a)[1].should == "this some"
+      end
+    end
+    describe "Chunk with tag regex" do
+      it "should return 'this is some text'" do # NOTE(review): same description/assertion mismatch as above.
+        grammer = Chunker::RegexChunker.new('<D\w><VBZ>?<DT>') # First tag is written as the regex D\w rather than a literal tag name.
+        grammer.match(@pos_a)[0].should == "this is some"
+        grammer.match(@pos_a)[1].should == "this some"
+      end
+    end
+  end
+  def sentence # Fixture: [word, tag] pairs fed into the POSArray in the before block.
+    [["this", "DT"], ["is", "VBZ"], ["some", "DT"], ["text", "NN"], ["text", "NN"], ["that", "IN"], ["I", "PRP"], ["want", "VBP"], ["analyzing", "VBG"], ["this", "DT"], ["some", "DT"]]
+  end
+end

data/spec/chunkers/tag_pattern_spec.rb ADDED

@@ -0,0 +1,40 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+require 'nlp_backpack/chunker/tag_pattern'
+include NLPBackpack
+describe Chunker::TagPattern do # Specs for how a single "<...>" pattern is split into a tag and its conditions.
+  describe "Tag" do
+    before do
+      @tp = Chunker::TagPattern.new("<NN>")
+    end
+    it "should return the tag as NN" do
+      @tp.tag.should == /NN/ # The tag is expected as a Regexp made from the text between < and >.
+    end
+  end
+  describe "Tag with condition" do
+    before do
+      @tp = Chunker::TagPattern.new("<NN*>") # Quantifier written inside the angle brackets.
+    end
+    it "should return the tag as NN*" do
+      @tp.tag.should == /NN*/ # The "*" inside the brackets is expected to remain part of the tag regex.
+    end
+  end
+  describe "Tag with external condition" do
+    before do
+      @tp = Chunker::TagPattern.new("<NN.+>?") # Quantifier written after the closing bracket.
+    end
+    it "should return the tag as NN" do # NOTE(review): description says NN but the assertion expects /NN.+/.
+      @tp.tag.should == /NN.+/
+    end
+    it "should return conditions as ?" do
+      @tp.conditions.should == "?" # The trailing "?" is expected from #conditions as a String, separate from the tag.
+    end
+  end
+end

data/spec/classifiers/naive_bayes_spec.rb ADDED

@@ -0,0 +1,68 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+include NLPBackpack::Classifier
+describe NaiveBayes do # Specs for classifying, saving and loading the naive Bayes classifier.
+  describe "Classification" do
+    before do
+      @classifier = create_and_train_classifier # Helper defined at the bottom of this group.
+    end
+    it "should classify as spam with a score of 0.5" do
+      a = @classifier.classify('bad', 'word') # Result is read below as [label, score].
+      a[0].should == :spam
+      a[1].should == 0.5
+    end
+  end
+  describe "Saving the NB" do
+    describe "DB filepath has been set" do
+      before do
+        @classifier = NaiveBayes.new(:spam, :ham)
+        @classifier.db_filepath = db_filepath
+      end
+      it "should save to the filepath provided" do
+        FileUtils.rm(db_filepath, :force => true) # Clear any earlier save first; FileUtils is not required in the lines shown - presumably loaded via spec_helper, verify.
+        @classifier.save
+        File.exists?(db_filepath).should be_true # NOTE(review): File.exists? is gone in Ruby 3.2+ (File.exist? is the current name) and be_true is the pre-RSpec-3 matcher.
+      end
+    end
+    describe "DB filepath has no been set" do
+      it "should raise an error" do
+        lambda do
+          NaiveBayes.new(:spam, :ham).save # No db_filepath assigned before saving.
+        end.should raise_error # No error class is given, so any exception satisfies this.
+      end
+    end
+  end
+  describe "Load" do
+    before do # Train and save a classifier so the example below can load it from disk.
+      classifier = NaiveBayes.new(:spam, :ham)
+      classifier.db_filepath = db_filepath
+      classifier.train(:spam, 'bad', 'word')
+      classifier.train(:ham, 'we', 'bad')
+      classifier.save
+    end
+    it "should return 0.5" do
+      classifier = NaiveBayes.load(db_filepath)
+      classifier.classify('bad', 'word')[1].should == 0.5 # Same score as the freshly trained classifier in "Classification".
+    end
+  end
+  private
+  def create_and_train_classifier # Returns a NaiveBayes with one :spam and one :ham training call applied.
+    a = NaiveBayes.new(:spam, :ham)
+    a.train(:spam, 'bad', 'word')
+    a.train(:ham, 'we', 'bad')
+    a
+  end
+  def db_filepath # Absolute path of ../test_saves/naive.nb relative to this spec file's directory.
+    File.expand_path(File.dirname(__FILE__) + '/../test_saves/naive.nb')
+  end
+end

data/spec/evaluation/accuracy_spec.rb ADDED

@@ -0,0 +1,29 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+include NLPBackpack::Evaluation
+describe Accuracy do # Specs for per-label accuracy and its inspect output.
+  before(:all) do
+    @accuracy = Accuracy.new([1,1,2,2], [1,1,2,1]) # Two label lists that differ only in the last position; presumably (expected, predicted) - confirm against Accuracy#initialize.
+  end
+  describe "Specific element" do
+    it "should return 100" do
+      @accuracy.accuracy_of(1).should == 100 # Both lists agree wherever the first list has label 1.
+    end
+    it "should return 50" do
+      @accuracy.accuracy_of(2).should == 50 # The lists agree on one of the two positions where the first list has label 2.
+    end
+  end
+  describe "Inspect" do
+    it "should match 100%" do
+      @accuracy.inspect.should match(/100%/) # inspect is expected to contain the percentage text.
+    end
+    it "should match 50%" do
+      @accuracy.inspect.should match(/50%/)
+    end
+  end
+end

data/spec/evaluation/confusion_matrix_spec.rb ADDED

@@ -0,0 +1,29 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+include NLPBackpack::Evaluation
+describe ConfusionMatrix do # Specs for per-cell results and the inspect output of the confusion matrix.
+  before(:all) do
+    @cm = ConfusionMatrix.new(correct_results, test_results) # NOTE(review): correct_results and test_results are not defined in the lines shown - presumably helpers from elsewhere; verify.
+  end
+  describe "Specific element" do
+    it "should return 90%" do
+      @cm.results_for(1, 1).should == "90%" # results_for is expected to return a percentage String.
+    end
+    it "should return 10%" do
+      @cm.results_for(2, 3).should == "10%"
+    end
+  end
+  describe "Inspect" do
+    it "should match <90%>" do
+      @cm.inspect.should match(/<90%>/) # This value is expected wrapped in angle brackets in the inspect output.
+    end
+    it "should match 10%" do
+      @cm.inspect.should match(/10%/)
+    end
+  end
+end

data/spec/frequency_distribution_spec.rb ADDED

@@ -0,0 +1,53 @@
+require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+include NLPBackpack
+describe FrequencyDistribution do # Specs for counting chosen words per condition and tabulating the result.
+  before do
+    conditions = {:happy => happy_text, :sad => sad_text} # Condition name => array of words (fixtures at the bottom of this group).
+    @cfd = FrequencyDistribution.new(conditions, "happy", "sad") # The trailing arguments are the words whose counts are asserted below.
+    @results = @cfd.process
+  end
+  it "should return a hash" do
+    @results.should be_a(Hash)
+  end
+  describe "Happy condition" do
+    it "should return 2 for happy" do
+      @results[:happy]["happy"].should == 2 # Results are indexed by condition, then by word; "happy" appears twice in happy_text.
+    end
+    it "should return 0 for sad" do
+      @results[:happy]["sad"].should == 0
+    end
+  end
+  describe "Sad condition" do
+    it "should return 0 for happy" do
+      @results[:sad]["happy"].should == 0
+    end
+    it "should return 2 for sad" do
+      @results[:sad]["sad"].should == 2 # "sad" appears twice in sad_text.
+    end
+  end
+  describe "Tabulation" do
+    it "should include the events" do
+      table = @cfd.to_tabulation # Checked with match below, so the table is expected to be text.
+      table.should match(/happy/)
+      table.should match(/sad/)
+    end
+  end
+  private
+  def happy_text # Fixture: word list containing "happy" twice and "sad" not at all.
+    %w{when happy things happen it makes me happy}
+  end
+  def sad_text # Fixture: word list containing "sad" twice and "happy" not at all.
+    %w{when sad things happen it makes me sad}
+  end
+end