RubyGems - yanbi-ml - Versions diffs - 0.1.0 - Mend

yanbi-ml 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

data/.travis.yml +4 -0
data/Gemfile +4 -0
data/README.md +34 -0
data/Rakefile +5 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/lib/bayes/bayes.rb +112 -0
data/lib/bayes/fisher.rb +62 -0
data/lib/corpus.rb +63 -0
data/lib/version.rb +7 -0
data/lib/wordbags/diadbag.rb +36 -0
data/lib/wordbags/stembag.rb +20 -0
data/lib/wordbags/stemmed_diadbag.rb +16 -0
data/lib/wordbags/wordbag.rb +104 -0
data/lib/yanbi.rb +17 -0
data/yanbi-ml.gemspec +25 -0
metadata +126 -0

data/.travis.yml ADDED

@@ -0,0 +1,4 @@
+language: ruby
+rvm:
+  - 2.2.2
+before_install: gem install bundler -v 1.11.2

data/Gemfile ADDED

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in yanbi-ml.gemspec
+gemspec

data/README.md ADDED

@@ -0,0 +1,34 @@
+# YANBI-ML
+Yet Another Naive Bayes Implementation
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'yanbi-ml'
+```
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install yanbi-ml
+## Usage
+TODO: Write usage instructions here
+## Contributing
+Bug reports and pull requests are welcome on GitHub at https://github.com/rdormer/yanbi-ml.
+## License
+The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).

data/Rakefile ADDED

@@ -0,0 +1,5 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/bin/console ADDED

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require "bundler/setup"
+require "yanbi/ml"
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+# (If you use this, don't forget to add pry to your Gemfile!)
+# require "pry"
+# Pry.start
+require "irb"
+IRB.start

data/bin/setup ADDED

@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+set -vx
+bundle install
+# Do any other automated setup that you need to do here

data/lib/bayes/bayes.rb ADDED

@@ -0,0 +1,112 @@
+# Author::    Robert Dormer (mailto:rdormer@gmail.com)
+# Copyright:: Copyright (c) 2016 Robert Dormer
+# License::   MIT
+# Naive Bayesian classifier.  Training and classification are both done via passed in
+# word bags, as opposed to raw text.  The first argument to new is the class of WordBag
+# that you want newdoc to create.  From then on, you can use newdoc to process text instead
+# of manually creating word bags yourself, which will help to keep the word bag type
+# consistent for a given classifier object.  Note that if you really want to, you can train
+# or classify with a different type of word bag than you passed in, although I can't imagine
+# why you would want to.  There's also a default constructor if you just want to create a
+# classifier without being bothered about which word bag it uses.
+module Yanbi
+  class Bayes
+    def initialize(klass, *categories)
+      raise ArgumentError unless categories.size > 1
+      @categories = categories
+      @category_counts = {}
+      @document_counts = {}
+      @categories.each do |category|
+        cat = category.to_sym
+        @category_counts[cat] = {}
+        @document_counts[cat] = 0
+      end
+      @bag_class = klass.to_s.split('::').last
+    end
+    def self.default(*categories)
+      self.new(WordBag, *categories)
+    end
+    def train(category, document)
+      cat = category.to_sym
+      @document_counts[cat] += 1
+      document.words.uniq.each do |word|
+        @category_counts[cat][word] ||= 0
+        @category_counts[cat][word] += 1
+      end
+    end
+    def classify(document)
+      max_score(document) do |cat, doc|
+        cond_prob(cat, doc)
+      end
+    end
+    def train_raw(category, text)
+      train(category, self.newdoc(text))
+    end
+    def classify_raw(text)
+      classify(self.newdoc(text))
+    end
+    def set_significance(cutoff, category=nil)
+      categories = (category.nil? ? @categories : [category])
+      categories.each do |category|
+        cat = category.to_sym
+        @category_counts[cat].reject! {|k,v| v < cutoff}
+      end
+    end
+    def newdoc(doc)
+      Yanbi.const_get(@bag_class).new(doc)
+    end
+    def save(name)
+      File.open(name + ".obj", 'w') do |out|
+         YAML.dump(self, out)
+      end
+    end
+    private
+    def cond_prob(cat, document)
+      total_docs = @document_counts.values.reduce(:+).to_f
+      document_prob = document.words.uniq.map {|word| word_prob(cat, word)}.reduce(:+)
+      document_prob + Math.log(@document_counts[cat] / total_docs)
+    end
+    def word_prob(cat, word)
+      all_word_count = @category_counts[cat].values.reduce(&:+)
+      count = @category_counts[cat].has_key?(word) ? @category_counts[cat][word].to_f : 0.1
+      Math.log(count / all_word_count)
+    end
+    def max_score(document)
+      scores = []
+      @categories.each do |c|
+        score = yield c, document
+        scores << score
+      end
+      i = scores.rindex(scores.max)
+      @categories[i]
+    end
+#    def weighted_prob(word, category, basicprob, weight=1.0, ap=0.5)
+#      #basicprob = word_prob(category, word) if basicprob.nil?
+#      totals = @category_counts.inject(0) {|sum, cat| sum += cat.last[word].to_i}
+#      ((weight * ap) + (totals*basicprob)) / (weight + totals)
+#    end
+  end
+end

data/lib/bayes/fisher.rb ADDED

@@ -0,0 +1,62 @@
+module Yanbi
+  class Fisher < Yanbi::Bayes
+    def classify(text)
+      max_score(text) do |cat, doc|
+        fisher_score(cat, doc)
+      end
+    end
+    private
+    def fisher_score(category, document)
+      features = document.words.uniq
+      pscores = 1
+###
+#compute weighted probabilities for each word/cat tuple
+#and then multiply them all together...
+##
+      features.each do |word|
+        clf = word_prob(category, word)
+        freqsum = @categories.reduce(0) {|sum, x| sum + word_prob(x, word)}
+        pscores *= (clf / freqsum) if clf > 0
+      end
+#####
+#compute fisher factor of pscores
+      score = -2 * Math.log(pscores)
+#this is okay
+      invchi2(score, features.count * 2)
+    end
+    def word_prob(cat, word)
+      @category_counts[cat][word].to_f / @document_counts[cat]
+    end
+    def invchi2(chi, df)
+      m = chi / 2.0
+      sum = Math.exp(-m)
+      term = Math.exp(-m)
+      (1..df/2).each do |i|
+        term *= (m / i)
+        sum += term
+      end
+      [sum, 1.0].min
+    rescue
+      1.0
+    end
+  end
+end

data/lib/corpus.rb ADDED

@@ -0,0 +1,63 @@
+# Author::    Robert Dormer (mailto:rdormer@gmail.com)
+# Copyright:: Copyright (c) 2016 Robert Dormer
+# License::   MIT
+# This is the class for managing a corpus of documents.  It's recommended, though not necessary,
+# that all of the documents in a given corpus be in the same category, if you're using the corpus
+# to train your classifier.  Can accept either raw strings through add_doc, or files through add_file.
+# Files can be delimited so that you can have more than one document in them, and a comment
+# pattern can be supplied; any text matching it is removed from each document before it is added.
+$: << File.dirname(__FILE__)
+require 'yanbi'
+module Yanbi
+  class Corpus
+    attr_reader :docs
+    attr_reader :bags
+    attr_reader :all
+    def initialize(klass=WordBag)
+      @all = klass.new
+      @docs = []
+      @bags = []
+    end
+    def size
+      @docs.size
+    end
+    def add_file(docpath, delim=nil, comment=nil)
+      infile = File.open(docpath, 'r')
+      raw = infile.read
+      infile.close
+      if delim
+        docs = raw.split(delim)
+        docs.each {|d| add_doc(d, comment)}
+      else
+        add_doc(raw, comment)
+      end
+    end
+    def add_doc(doc, comment=nil)
+      doc.gsub! comment, '' if comment
+      doc.strip!
+      unless doc.length.zero?
+        @bags << @all.class.new(doc)
+        @all.add_text doc
+        @docs << doc
+      end
+    end
+    def each_doc
+      @bags.each do |bag|
+        yield bag
+      end
+    end
+  end
+end

data/lib/version.rb ADDED

@@ -0,0 +1,7 @@
+# Author::    Robert Dormer (mailto:rdormer@gmail.com)
+# Copyright:: Copyright (c) 2016 Robert Dormer
+# License::   MIT
+module Yanbi
+  VERSION = "0.1.0"
+end

data/lib/wordbags/diadbag.rb ADDED

@@ -0,0 +1,36 @@
+# Author::    Robert Dormer (mailto:rdormer@gmail.com)
+# Copyright:: Copyright (c) 2016 Robert Dormer
+# License::   MIT
+# A word bag that stores the words as diads instead of individual words.
+# i.e. "the quick brown fox" becomes "the quick", "quick brown", "brown fox".
+# This type of shingling is often recommended as a way to boost the accuracy
+# of Bayes classifiers
+$: << File.dirname(__FILE__)
+require 'wordbag'
+module Yanbi
+  class DiadBag < WordBag
+    def process(raw)
+      processed = raw.downcase
+      processed.gsub!(/[^\w\s'\-]/, ' ')
+      words = processed.split
+      words = words.map {|x| x.split /-/}.flatten
+      if block_given?
+        words.map! {|x| yield x}
+      end
+      diads = []
+      words.each_with_index {|w, i| diads << [w, words[i+1]]}
+      diads.delete_at(-1)
+      words = diads.map {|x| "#{x.first} #{x.last}"}
+      update_counts(words)
+      @words.concat(words)
+    end
+  end
+end

data/lib/wordbags/stembag.rb ADDED

@@ -0,0 +1,20 @@
+# Author::    Robert Dormer (mailto:rdormer@gmail.com)
+# Copyright:: Copyright (c) 2016 Robert Dormer
+# License::   MIT
+# This is a word bag with a post-processing step to stem (lemmatize)
+# the words in the bag
+$: << File.dirname(__FILE__)
+require 'fast_stemmer'
+require 'wordbag'
+module Yanbi
+  class StemmedWordBag < WordBag
+    def standardize(raw)
+      process(raw) {|word| word.stem}
+    end
+  end
+end

data/lib/wordbags/stemmed_diadbag.rb ADDED

@@ -0,0 +1,16 @@
+# Author::    Robert Dormer (mailto:rdormer@gmail.com)
+# Copyright:: Copyright (c) 2016 Robert Dormer
+# License::   MIT
+$: << File.dirname(__FILE__)
+require 'diadbag'
+module Yanbi
+  class StemmedDiadBag < DiadBag
+    def standardize(raw)
+      process(raw) {|word| word.stem}
+    end
+  end
+end

data/lib/wordbags/wordbag.rb ADDED

@@ -0,0 +1,104 @@
+# Author::    Robert Dormer (mailto:rdormer@gmail.com)
+# Copyright:: Copyright (c) 2016 Robert Dormer
+# License::   MIT
+# Word bag class, implementing the bag of words / multi-set that is so popular in text
+# classification literature.  A single bag can contain multiple documents if you want
+# it to, although for training a Bayes classifier this is probably not recommended.
+$: << File.dirname(__FILE__)
+require 'yaml'
+module Yanbi
+  class WordBag
+    attr_reader :words
+    def initialize(corpus=nil)
+      @words = []
+      @counts = {}
+      standardize(corpus) if corpus
+    end
+    def add_file(filename)
+      raw = File.open(filename).read
+      standardize(raw)
+    end
+    def add_text(text)
+      standardize(text)
+    end
+    def save(filename)
+      out = File.new(filename + ".yml", "w")
+      out.write(@words.to_yaml)
+      out.close
+    end
+    def load(filename)
+      @words = YAML.load_file(filename + ".yml")
+      update_counts(@words)
+    end
+    def self.load(filename)
+      WordBag.new.load(filename)
+    end
+    def word_counts(min=1)
+      @counts.select {|key, value| value >= min}
+    end
+    def remove(words)
+      words.each do |word|
+        @words.reject! {|x| x == word}
+        @counts.delete(word)
+      end
+    end
+    def between_counts(min, max=nil)
+      counts = @counts.select{|key, value| value >= min}
+      counts.select! {|key, value| value <= max} unless max.nil?
+      @words.select {|word| counts.keys.include? word}
+    end
+    def intersection(other)
+      self.words & other.words
+    end
+    def empty?
+      @words.empty?
+    end
+    private
+    def standardize(raw)
+      process(raw)
+    end
+    def process(raw)
+      processed = raw.downcase
+      processed.gsub!(/[^\w\s'\-]/, ' ')
+      words = processed.split
+      words = words.map {|x| x.split /-/}.flatten
+      if block_given?
+        words.map! {|x| yield x}
+      end
+      update_counts(words)
+      @words.concat(words)
+    end
+    def update_counts(data)
+      data.each do |word|
+        if @counts[word].nil?
+          @counts[word] = 1
+        else
+          @counts[word] += 1
+        end
+      end
+    end
+  end
+end

data/lib/yanbi.rb ADDED

@@ -0,0 +1,17 @@
+# Author::    Robert Dormer (mailto:rdormer@gmail.com)
+# Copyright:: Copyright (c) 2016 Robert Dormer
+# License::   MIT
+base = File.dirname(__FILE__)
+$: << base
+Dir[base + "/wordbags/**/*.rb"].each do |bag|
+  require bag
+end
+Dir[base + "/bayes/**/*.rb"].each do |c|
+  require c
+end
+require 'corpus'
+require 'version'

data/yanbi-ml.gemspec ADDED

@@ -0,0 +1,25 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'version'
+Gem::Specification.new do |spec|
+  spec.name          = "yanbi-ml"
+  spec.version       = Yanbi::VERSION
+  spec.authors       = ["Robert Dormer"]
+  spec.email         = ["rdormer@gmail.com"]
+  spec.summary       = %q{Yet Another Naive Bayes Implementation}
+  spec.homepage      = "http://github.com/rdormer/yanbi-ml"
+  spec.license       = "MIT"
+  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  spec.bindir        = "exe"
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+  spec.add_development_dependency "bundler", "~> 1.11"
+  spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_development_dependency "rspec", "~> 3.4.0"
+  spec.add_development_dependency "fast-stemmer", "~> 1.0.2"
+end

metadata ADDED

@@ -0,0 +1,126 @@
+--- !ruby/object:Gem::Specification
+name: yanbi-ml
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+  prerelease:
+platform: ruby
+authors:
+- Robert Dormer
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2016-07-05 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.11'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.11'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 3.4.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 3.4.0
+- !ruby/object:Gem::Dependency
+  name: fast-stemmer
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.0.2
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.0.2
+description:
+email:
+- rdormer@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .travis.yml
+- Gemfile
+- README.md
+- Rakefile
+- bin/console
+- bin/setup
+- lib/bayes/bayes.rb
+- lib/bayes/fisher.rb
+- lib/corpus.rb
+- lib/version.rb
+- lib/wordbags/diadbag.rb
+- lib/wordbags/stembag.rb
+- lib/wordbags/stemmed_diadbag.rb
+- lib/wordbags/wordbag.rb
+- lib/yanbi.rb
+- yanbi-ml.gemspec
+homepage: http://github.com/rdormer/yanbi-ml
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.25
+signing_key:
+specification_version: 3
+summary: Yet Another Naive Bayes Implementation
+test_files: []