RubyGems - yanbi-ml - Versions diffs - 0.1.0 - Mend

yanbi-ml 0.1.0

Files changed (17) hide show

data/.travis.yml +4 -0
data/Gemfile +4 -0
data/README.md +34 -0
data/Rakefile +5 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/lib/bayes/bayes.rb +112 -0
data/lib/bayes/fisher.rb +62 -0
data/lib/corpus.rb +63 -0
data/lib/version.rb +7 -0
data/lib/wordbags/diadbag.rb +36 -0
data/lib/wordbags/stembag.rb +20 -0
data/lib/wordbags/stemmed_diadbag.rb +16 -0
data/lib/wordbags/wordbag.rb +104 -0
data/lib/yanbi.rb +17 -0
data/yanbi-ml.gemspec +25 -0
metadata +126 -0

data/.travis.yml ADDED

@@ -0,0 +1,4 @@
+language: ruby
+rvm:
+  - 2.2.2
+before_install: gem install bundler -v 1.11.2

data/Gemfile ADDED

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in yanbi-ml.gemspec
+gemspec

data/README.md ADDED

@@ -0,0 +1,34 @@
+# YANBI-ML
+Yet Another Naive Bayes Implementation
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'yanbi-ml'
+```
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install yanbi-ml
+## Usage
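+```ruby
+require 'yanbi'
+# A classifier needs at least two categories
+classifier = Yanbi::Bayes.default(:even, :odd)
+classifier.train_raw(:even, 'two four six eight')
+classifier.train_raw(:odd, 'one three five seven')
+classifier.classify_raw('four six')   # => :even
+```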
+## Contributing
+Bug reports and pull requests are welcome on GitHub at https://github.com/rdormer/yanbi-ml.
+## License
+The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).

data/Rakefile ADDED

@@ -0,0 +1,5 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/bin/console ADDED

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require "bundler/setup"
+require "yanbi/ml"
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+# (If you use this, don't forget to add pry to your Gemfile!)
+# require "pry"
+# Pry.start
+require "irb"
+IRB.start

data/bin/setup ADDED

@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+set -vx
+bundle install
+# Do any other automated setup that you need to do here

data/lib/bayes/bayes.rb ADDED

@@ -0,0 +1,112 @@
+# Author::    Robert Dormer (mailto:rdormer@gmail.com)
+# Copyright:: Copyright (c) 2016 Robert Dormer
+# License::   MIT
+# Naive Bayesian classifier.  Training and classification are both done via passed in
+# word bags, as opposed to raw text.  The first argument to new is the class of WordBag
+# that you want newdoc to create.  From then on, you can use newdoc to process text instead
+# of manually creating word bags yourself, which will help to keep the word bag type
+# consistent for a given classifier object.  Note that if you really want to, you can train
+# or classify with a different type of word bag than you passed in, although I can't imagine
+# why you would want to.  There's also a default constructor if you just want to create a
+# classifier without being bothered about which word bag it uses.
+module Yanbi
+  class Bayes
+    def initialize(klass, *categories)
+      raise ArgumentError unless categories.size > 1
+      @categories = categories
+      @category_counts = {}
+      @document_counts = {}
+      @categories.each do |category|
+        cat = category.to_sym
+        @category_counts[cat] = {}
+        @document_counts[cat] = 0
+      end
+      @bag_class = klass.to_s.split('::').last
+    end
+    def self.default(*categories)
+      self.new(WordBag, *categories)
+    end
+    def train(category, document)
+      cat = category.to_sym
+      @document_counts[cat] += 1
+      document.words.uniq.each do |word|
+        @category_counts[cat][word] ||= 0
+        @category_counts[cat][word] += 1
+      end
+    end
+    def classify(document)
+      max_score(document) do |cat, doc|
+        cond_prob(cat, doc)
+      end
+    end
+    def train_raw(category, text)
+      train(category, self.newdoc(text))
+    end
+    def classify_raw(text)
+      classify(self.newdoc(text))
+    end
+    def set_significance(cutoff, category=nil)
+      categories = (category.nil? ? @categories : [category])
+      categories.each do |category|
+        cat = category.to_sym
+        @category_counts[cat].reject! {|k,v| v < cutoff}
+      end
+    end
+    def newdoc(doc)
+      Yanbi.const_get(@bag_class).new(doc)
+    end
+    def save(name)
+      File.open(name + ".obj", 'w') do |out|
+         YAML.dump(self, out)
+      end
+    end
+    private
+    def cond_prob(cat, document)
+      total_docs = @document_counts.values.reduce(:+).to_f
+      document_prob = document.words.uniq.map {|word| word_prob(cat, word)}.reduce(:+)
+      document_prob + Math.log(@document_counts[cat] / total_docs)
+    end
+    def word_prob(cat, word)
+      all_word_count = @category_counts[cat].values.reduce(&:+)
+      count = @category_counts[cat].has_key?(word) ? @category_counts[cat][word].to_f : 0.1
+      Math.log(count / all_word_count)
+    end
+    def max_score(document)
+      scores = []
+      @categories.each do |c|
+        score = yield c, document
+        scores << score
+      end
+      i = scores.rindex(scores.max)
+      @categories[i]
+    end
+#    def weighted_prob(word, category, basicprob, weight=1.0, ap=0.5)
+#      #basicprob = word_prob(category, word) if basicprob.nil?
+#      totals = @category_counts.inject(0) {|sum, cat| sum += cat.last[word].to_i}
+#      ((weight * ap) + (totals*basicprob)) / (weight + totals)
+#    end
+  end
+end

data/lib/bayes/fisher.rb ADDED

@@ -0,0 +1,62 @@
+module Yanbi
+  class Fisher < Yanbi::Bayes
+    def classify(text)
+      max_score(text) do |cat, doc|
+        fisher_score(cat, doc)
+      end
+    end
+    private
+    def fisher_score(category, document)
+      features = document.words.uniq
+      pscores = 1
+###
+#compute weighted probabilities for each word/cat tuple
+#and then multiply them all together...
+##
+      features.each do |word|
+        clf = word_prob(category, word)
+        freqsum = @categories.reduce(0) {|sum, x| sum + word_prob(x, word)}
+        pscores *= (clf / freqsum) if clf > 0
+      end
+#####
+#compute fisher factor of pscores
+      score = -2 * Math.log(pscores)
+#this is okay
+      invchi2(score, features.count * 2)
+    end
+    def word_prob(cat, word)
+      @category_counts[cat][word].to_f / @document_counts[cat]
+    end
+    def invchi2(chi, df)
+      m = chi / 2.0
+      sum = Math.exp(-m)
+      term = Math.exp(-m)
+      (1..df/2).each do |i|
+        term *= (m / i)
+        sum += term
+      end
+      [sum, 1.0].min
+    rescue
+      1.0
+    end
+  end
+end

data/lib/corpus.rb ADDED

@@ -0,0 +1,63 @@
+# Author::    Robert Dormer (mailto:rdormer@gmail.com)
+# Copyright:: Copyright (c) 2016 Robert Dormer
+# License::   MIT
+# This is the class for managing a corpus of documents.  It's recommended, though not necessary,
+# that all of the documents in a given corpus be in the same category, if you're using the corpus
+# to train your classifier.  Can accept either raw strings through add_doc, or files through add_file.
+# Files can be delimited so that you can have more than one document in them, and a comment
+# pattern can be supplied; everything it matches is stripped from each document before it is added.
+$: << File.dirname(__FILE__)
+require 'yanbi'
+module Yanbi
+  class Corpus
+    attr_reader :docs
+    attr_reader :bags
+    attr_reader :all
+    def initialize(klass=WordBag)
+      @all = klass.new
+      @docs = []
+      @bags = []
+    end
+    def size
+      @docs.size
+    end
+    def add_file(docpath, delim=nil, comment=nil)
+      infile = File.open(docpath, 'r')
+      raw = infile.read
+      infile.close
+      if delim
+        docs = raw.split(delim)
+        docs.each {|d| add_doc(d, comment)}
+      else
+        add_doc(raw, comment)
+      end
+    end
+    def add_doc(doc, comment=nil)
+      doc.gsub! comment, '' if comment
+      doc.strip!
+      unless doc.length.zero?
+        @bags << @all.class.new(doc)
+        @all.add_text doc
+        @docs << doc
+      end
+    end
+    def each_doc
+      @bags.each do |bag|
+        yield bag
+      end
+    end
+  end
+end

data/lib/version.rb ADDED

@@ -0,0 +1,7 @@
+# Author::    Robert Dormer (mailto:rdormer@gmail.com)
+# Copyright:: Copyright (c) 2016 Robert Dormer
+# License::   MIT
+module Yanbi
+  # Release number of the gem; yanbi-ml.gemspec reads it as the gem version.
+  VERSION = "0.1.0"
+end

data/lib/wordbags/diadbag.rb ADDED

@@ -0,0 +1,36 @@
+# Author::    Robert Dormer (mailto:rdormer@gmail.com)
+# Copyright:: Copyright (c) 2016 Robert Dormer
+# License::   MIT
+# A word bag that stores the words as diads instead of individual words.
+# i.e. "the quick brown fox" becomes "the quick", "quick brown", "brown fox".
+# This type of shingling is often recommended as a way to boost the accuracy
+# of Bayes classifiers
+$: << File.dirname(__FILE__)
+require 'wordbag'
+module Yanbi
+  class DiadBag < WordBag
+    def process(raw)
+      processed = raw.downcase
+      processed.gsub!(/[^\w\s'\-]/, ' ')
+      words = processed.split
+      words = words.map {|x| x.split /-/}.flatten
+      if block_given?
+        words.map! {|x| yield x}
+      end
+      diads = []
+      words.each_with_index {|w, i| diads << [w, words[i+1]]}
+      diads.delete_at(-1)
+      words = diads.map {|x| "#{x.first} #{x.last}"}
+      update_counts(words)
+      @words.concat(words)
+    end
+  end
+end

data/lib/wordbags/stembag.rb ADDED

@@ -0,0 +1,20 @@
+# Author::    Robert Dormer (mailto:rdormer@gmail.com)
+# Copyright:: Copyright (c) 2016 Robert Dormer
+# License::   MIT
+# This is a word bag with a post-processing step to stem (lemmatize)
+# the words in the bag
+$: << File.dirname(__FILE__)
+require 'fast_stemmer'
+require 'wordbag'
+module Yanbi
+  class StemmedWordBag < WordBag
+    def standardize(raw)
+      process(raw) {|word| word.stem}
+    end
+  end
+end

data/lib/wordbags/stemmed_diadbag.rb ADDED

@@ -0,0 +1,16 @@
+# Author::    Robert Dormer (mailto:rdormer@gmail.com)
+# Copyright:: Copyright (c) 2016 Robert Dormer
+# License::   MIT
+$: << File.dirname(__FILE__)
+require 'diadbag'
+module Yanbi
+  class StemmedDiadBag < DiadBag
+    def standardize(raw)
+      process(raw) {|word| word.stem}
+    end
+  end
+end

data/lib/wordbags/wordbag.rb ADDED

@@ -0,0 +1,104 @@
+# Author::    Robert Dormer (mailto:rdormer@gmail.com)
+# Copyright:: Copyright (c) 2016 Robert Dormer
+# License::   MIT
+# Word bag class, implementing the bag of words / multi-set that is so popular in text
+# classification literature.  A single bag can contain multiple documents if you want
+# it to, although for training a Bayes classifier this is probably not recommended.
+$: << File.dirname(__FILE__)
+require 'yaml'
+module Yanbi
+  class WordBag
+    attr_reader :words
+    def initialize(corpus=nil)
+      @words = []
+      @counts = {}
+      standardize(corpus) if corpus
+    end
+    def add_file(filename)
+      raw = File.open(filename).read
+      standardize(raw)
+    end
+    def add_text(text)
+      standardize(text)
+    end
+    def save(filename)
+      out = File.new(filename + ".yml", "w")
+      out.write(@words.to_yaml)
+      out.close
+    end
+    def load(filename)
+      @words = YAML.load_file(filename + ".yml")
+      update_counts(@words)
+    end
+    def self.load(filename)
+      WordBag.new.load(filename)
+    end
+    def word_counts(min=1)
+      @counts.select {|key, value| value >= min}
+    end
+    def remove(words)
+      words.each do |word|
+        @words.reject! {|x| x == word}
+        @counts.delete(word)
+      end
+    end
+    def between_counts(min, max=nil)
+      counts = @counts.select{|key, value| value >= min}
+      counts.select! {|key, value| value <= max} unless max.nil?
+      @words.select {|word| counts.keys.include? word}
+    end
+    def intersection(other)
+      self.words & other.words
+    end
+    def empty?
+      @words.empty?
+    end
+    private
+    def standardize(raw)
+      process(raw)
+    end
+    def process(raw)
+      processed = raw.downcase
+      processed.gsub!(/[^\w\s'\-]/, ' ')
+      words = processed.split
+      words = words.map {|x| x.split /-/}.flatten
+      if block_given?
+        words.map! {|x| yield x}
+      end
+      update_counts(words)
+      @words.concat(words)
+    end
+    def update_counts(data)
+      data.each do |word|
+        if @counts[word].nil?
+          @counts[word] = 1
+        else
+          @counts[word] += 1
+        end
+      end
+    end
+  end
+end

data/lib/yanbi.rb ADDED

@@ -0,0 +1,17 @@
+# Author::    Robert Dormer (mailto:rdormer@gmail.com)
+# Copyright:: Copyright (c) 2016 Robert Dormer
+# License::   MIT
+base = File.dirname(__FILE__)
+$: << base
+Dir[base + "/wordbags/**/*.rb"].each do |bag|
+  require bag
+end
+Dir[base + "/bayes/**/*.rb"].each do |c|
+  require c
+end
+require 'corpus'
+require 'version'

data/yanbi-ml.gemspec ADDED

@@ -0,0 +1,25 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'version'
+Gem::Specification.new do |spec|
+  spec.name          = "yanbi-ml"
+  spec.version       = Yanbi::VERSION
+  spec.authors       = ["Robert Dormer"]
+  spec.email         = ["rdormer@gmail.com"]
+  spec.summary       = %q{Yet Another Naive Bayes Implementation}
+  spec.homepage      = "http://github.com/rdormer/yanbi-ml"
+  spec.license       = "MIT"
+  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  spec.bindir        = "exe"
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+  spec.add_development_dependency "bundler", "~> 1.11"
+  spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_development_dependency "rspec", "~> 3.4.0"
+  spec.add_development_dependency "fast-stemmer", "~> 1.0.2"
+end

metadata ADDED

@@ -0,0 +1,126 @@
+--- !ruby/object:Gem::Specification
+name: yanbi-ml
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+  prerelease:
+platform: ruby
+authors:
+- Robert Dormer
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2016-07-05 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.11'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.11'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 3.4.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 3.4.0
+- !ruby/object:Gem::Dependency
+  name: fast-stemmer
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.0.2
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.0.2
+description:
+email:
+- rdormer@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .travis.yml
+- Gemfile
+- README.md
+- Rakefile
+- bin/console
+- bin/setup
+- lib/bayes/bayes.rb
+- lib/bayes/fisher.rb
+- lib/corpus.rb
+- lib/version.rb
+- lib/wordbags/diadbag.rb
+- lib/wordbags/stembag.rb
+- lib/wordbags/stemmed_diadbag.rb
+- lib/wordbags/wordbag.rb
+- lib/yanbi.rb
+- yanbi-ml.gemspec
+homepage: http://github.com/rdormer/yanbi-ml
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.25
+signing_key:
+specification_version: 3
+summary: Yet Another Naive Bayes Implementation
+test_files: []