yanbi-ml 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.2
4
+ before_install: gem install bundler -v 1.11.2
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in yanbi-ml.gemspec
4
+ gemspec
@@ -0,0 +1,34 @@
1
+ # YANBI-ML
2
+
3
+ Yet Another Naive Bayes Implementation
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'yanbi-ml'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install yanbi-ml
20
+
21
+ ## Usage
22
+
23
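+ Require the library with `require 'yanbi'`, build a classifier with `Yanbi::Bayes.default(:spam, :ham)`, train it with `train_raw(category, text)`, and classify new text with `classify_raw(text)`.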
24
+
25
+
26
+ ## Contributing
27
+
28
+ Bug reports and pull requests are welcome on GitHub at https://github.com/rdormer/yanbi-ml.
29
+
30
+
31
+ ## License
32
+
33
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
34
+
@@ -0,0 +1,5 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+ task :default => :spec
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby

# Interactive console with the gem preloaded, for experimenting during development.

require "bundler/setup"
# The library's entry point is lib/yanbi.rb; the gem ships no lib/yanbi/ml.rb.
require "yanbi"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

require "irb"
IRB.start
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,112 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
5
+ # Naive Bayesian classifier. Training and classification are both done via passed in
6
+ # word bags, as opposed to raw text. The first argument to new is the class of WordBag
7
+ # that you want newdoc to create. From then on, you can use newdoc to process text instead
8
+ # of manually creating word bags yourself, which will help to keep the word bag type
9
+ # consistent for a given classifier object. Note that if you really want to, you can train
10
+ # or classify with a different type of word bag than you passed in, although I can't imagine
11
+ # why you would want to. There's also a default constructor if you just want to create a
12
+ # classifier without being bothered about which word bag it uses.
13
+
require 'yaml'

module Yanbi

  # Naive Bayes classifier trained on word bags.
  #
  # Category names are normalised to symbols. For every category the
  # classifier records how many documents were trained and, per word, in how
  # many of those documents the word occurred.
  class Bayes

    # klass      - word bag class that newdoc instantiates; it is looked up by
    #              its unqualified name inside the Yanbi namespace.
    # categories - two or more category names (anything responding to to_sym).
    #
    # Raises ArgumentError when fewer than two categories are given.
    def initialize(klass, *categories)
      raise ArgumentError, 'at least two categories are required' unless categories.size > 1

      # Store symbols so that lookups during classification use the same keys
      # as the count hashes, even when the caller passed strings.
      @categories = categories.map(&:to_sym)
      @category_counts = {}
      @document_counts = {}

      @categories.each do |cat|
        @category_counts[cat] = {}
        @document_counts[cat] = 0
      end

      @bag_class = klass.to_s.split('::').last
    end

    # Builds a classifier that uses the plain WordBag for newdoc.
    def self.default(*categories)
      new(WordBag, *categories)
    end

    # Records one training document for category. Each distinct word of the
    # document (document.words) is counted once.
    #
    # Raises ArgumentError for a category the classifier was not created with.
    def train(category, document)
      cat = known_category(category)
      @document_counts[cat] += 1

      document.words.uniq.each do |word|
        @category_counts[cat][word] = @category_counts[cat].fetch(word, 0) + 1
      end
    end

    # Returns the category (as a symbol) with the highest score for document.
    # When several categories tie, the one listed last wins.
    def classify(document)
      max_score(document) do |cat, doc|
        cond_prob(cat, doc)
      end
    end

    # Same as train, but builds the word bag from raw text via newdoc.
    def train_raw(category, text)
      train(category, newdoc(text))
    end

    # Same as classify, but builds the word bag from raw text via newdoc.
    def classify_raw(text)
      classify(newdoc(text))
    end

    # Drops every word whose count is below cutoff, either for one category
    # or, when category is nil, for all of them.
    #
    # Raises ArgumentError for a category the classifier was not created with.
    def set_significance(cutoff, category=nil)
      targets = category.nil? ? @categories : [category]
      targets.each do |target|
        @category_counts[known_category(target)].reject! { |_word, count| count < cutoff }
      end
    end

    # Wraps raw text in the word bag class chosen at construction time.
    def newdoc(doc)
      Yanbi.const_get(@bag_class).new(doc)
    end

    # Serialises the classifier as YAML into "<name>.obj".
    def save(name)
      File.open(name + ".obj", 'w') do |out|
        YAML.dump(self, out)
      end
    end

    private

    # Symbol form of category, verified to be one this classifier knows.
    def known_category(category)
      cat = category.to_sym
      raise ArgumentError, "unknown category: #{category.inspect}" unless @category_counts.key?(cat)
      cat
    end

    # Log score of document for cat: the sum of the per-word log
    # probabilities plus the log of the category's share of training documents.
    def cond_prob(cat, document)
      trained = @document_counts[cat]
      # A category that never saw a document cannot be the answer; returning
      # early also avoids taking the log of zero or dividing zero by zero.
      return -Float::INFINITY if trained.zero?

      total_docs = @document_counts.values.reduce(0, :+).to_f
      # The explicit 0.0 seed keeps an empty document from producing nil.
      likelihood = document.words.uniq.map { |word| word_prob(cat, word) }.reduce(0.0, :+)
      likelihood + Math.log(trained / total_docs)
    end

    # Log of the word's count relative to the sum of all word counts in cat.
    # Words never seen in the category are given a pseudo-count of 0.1.
    def word_prob(cat, word)
      counts = @category_counts[cat]
      all_word_count = counts.values.reduce(0, :+)
      # No words left (e.g. after set_significance): nothing supports the word.
      return -Float::INFINITY if all_word_count.zero?

      count = counts.key?(word) ? counts[word].to_f : 0.1
      Math.log(count / all_word_count)
    end

    # Yields each category together with document and returns the category
    # whose block result is largest (the last one on ties).
    def max_score(document)
      scores = @categories.map { |cat| yield cat, document }
      @categories[scores.rindex(scores.max)]
    end
  end

end
@@ -0,0 +1,62 @@
module Yanbi

  # Classifier that scores categories with Fisher's method: per-word
  # probabilities are combined and the result is passed through the inverse
  # chi-square function. Training is inherited from Bayes.
  class Fisher < Yanbi::Bayes

    # Returns the category with the highest Fisher score for the given word
    # bag (the parameter is a document object responding to words, despite
    # its name).
    def classify(text)
      max_score(text) do |cat, doc|
        fisher_score(cat, doc)
      end
    end

    private

    # Fisher score of document for category.
    #
    # For every distinct word the category's share of the word's frequency
    # across all categories is computed; the logs of those shares are summed
    # (summing logs instead of multiplying the shares avoids floating point
    # underflow on long documents) and fed to invchi2.
    #
    # NOTE(review): words the category has never seen are skipped, so a
    # category that matches no word at all ends up with the maximum score of
    # 1.0 -- confirm whether a weighted/assumed probability was intended.
    def fisher_score(category, document)
      features = document.words.uniq

      log_product = features.reduce(0.0) do |acc, word|
        clf = word_prob(category, word)
        next acc unless clf > 0

        freqsum = @categories.reduce(0) { |sum, cat| sum + word_prob(cat, word) }
        acc + Math.log(clf / freqsum)
      end

      invchi2(-2 * log_product, features.count * 2)
    end

    # Fraction of the category's training documents that contain word;
    # 0.0 when the category has no training documents.
    def word_prob(cat, word)
      key = cat.to_sym
      docs = @document_counts[key]
      return 0.0 if docs.zero?

      @category_counts[key][word].to_f / docs
    end

    # Inverse chi-square: for an even number of degrees of freedom df this is
    # exp(-m) * sum(m**i / i!) for i in 0...(df / 2), with m = chi / 2,
    # capped at 1.0. Returns 1.0 for a non-positive or non-finite chi.
    def invchi2(chi, df)
      m = chi / 2.0
      return 1.0 unless m.finite? && m > 0

      # Terms are tracked as logarithms so that a large m does not flush
      # exp(-m) to zero before the growing m**i / i! factor is applied.
      log_term = -m
      sum = Math.exp(log_term)

      # Exclusive upper bound: the i = 0 term is already in sum, so only
      # terms 1 .. df/2 - 1 remain.
      (1...df / 2).each do |i|
        log_term += Math.log(m / i)
        sum += Math.exp(log_term)
      end

      [sum, 1.0].min
    end
  end

end
@@ -0,0 +1,63 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
5
+ # This is the class for managing a corpus of documents. It's recommended, though not necessary,
6
+ # that all of the documents in a given corpus be in the same category, if you're using the corpus
7
+ # to train your classifier. Can accept either raw strings through add_doc, or files through add_file.
8
+ # Files can be split on a delimiter so that a single file can hold more than one document, and
9
+ # text matching an optional comment pattern is stripped from each document before it is added.
10
+
11
+ $: << File.dirname(__FILE__)
12
+ require 'yanbi'
13
+
module Yanbi

  # A collection of documents. Each added document is kept three ways: as
  # cleaned text (docs), as its own word bag (bags), and merged into one
  # word bag covering the whole corpus (all).
  class Corpus

    attr_reader :docs
    attr_reader :bags
    attr_reader :all

    # klass - word bag class used both for the corpus-wide bag and for the
    #         per-document bags.
    def initialize(klass=WordBag)
      @all = klass.new
      @docs = []
      @bags = []
    end

    # Number of documents added so far.
    def size
      @docs.size
    end

    # Reads docpath and adds its content.
    #
    # delim   - when given, the content is split on it and every piece is
    #           added as a separate document.
    # comment - passed through to add_doc.
    def add_file(docpath, delim=nil, comment=nil)
      # File.read closes the file even when reading raises.
      raw = File.read(docpath)

      if delim
        raw.split(delim).each { |piece| add_doc(piece, comment) }
      else
        add_doc(raw, comment)
      end
    end

    # Adds one document. Everything matching comment (a String or Regexp) is
    # removed first, then surrounding whitespace is stripped; documents that
    # end up empty are ignored. The caller's string is left untouched.
    def add_doc(doc, comment=nil)
      text = comment ? doc.gsub(comment, '') : doc
      text = text.strip
      return if text.empty?

      @bags << @all.class.new(text)
      @all.add_text text
      @docs << text
    end

    # Yields the word bag of every document; returns an Enumerator when
    # called without a block.
    def each_doc(&block)
      return enum_for(:each_doc) unless block_given?

      @bags.each(&block)
    end
  end

end
@@ -0,0 +1,7 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
module Yanbi
  # Release number of the gem; read by the gemspec.
  VERSION = '0.1.0'
end
@@ -0,0 +1,36 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
5
+ # A word bag that stores the words as diads instead of individual words.
6
+ # i.e. "the quick brown fox" becomes "the quick", "quick brown", "brown fox".
7
+ # This type of shingling is often recommended as a way to boost the accuracy
8
+ # of Bayes classifiers
9
+
10
+ $: << File.dirname(__FILE__)
11
+ require 'wordbag'
12
+
module Yanbi

  # Word bag whose entries are pairs of neighbouring words ("diads") rather
  # than single words.
  class DiadBag < WordBag

    # Tokenises raw and stores every pair of adjacent tokens, joined by a
    # single space, in the bag.
    #
    # The text is lower-cased; characters other than word characters,
    # whitespace, apostrophes and hyphens become spaces; the result is split
    # on whitespace and on hyphens. When a block is given, each token is
    # replaced by the block's result before the pairs are built.
    def process(raw)
      cleaned = raw.downcase.gsub(/[^\w\s'\-]/, ' ')

      # Leading or doubled hyphens ("-a", "a--b") split into empty strings;
      # drop them so they cannot become half of a diad.
      tokens = cleaned.split.flat_map { |chunk| chunk.split('-') }.reject(&:empty?)
      tokens.map! { |token| yield token } if block_given?

      diads = tokens.each_cons(2).map { |first, second| "#{first} #{second}" }
      update_counts(diads)
      @words.concat(diads)
    end
  end

end
@@ -0,0 +1,20 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
5
+ # This is a word bag with a post-processing step that stems each word
6
+ # in the bag (reduces it to its root form)
7
+
8
+ $: << File.dirname(__FILE__)
9
+ require 'fast_stemmer'
10
+ require 'wordbag'
11
+
module Yanbi

  # WordBag variant that stores the stem of every word.
  class StemmedWordBag < WordBag

    # Runs the regular tokenisation and replaces each token with the result
    # of calling stem on it.
    def standardize(raw)
      process(raw, &:stem)
    end
  end

end
@@ -0,0 +1,16 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
5
+ $: << File.dirname(__FILE__)
6
+ require 'diadbag'
7
+
# This file calls String#stem, so it must load the stemmer itself instead of
# relying on another file having required it first.
# NOTE(review): String#stem is presumably provided by fast_stemmer -- confirm.
require 'fast_stemmer'

module Yanbi

  # DiadBag variant whose words are stemmed before they are paired up.
  class StemmedDiadBag < DiadBag

    # Runs the diad tokenisation with every token replaced by its stem.
    def standardize(raw)
      process(raw) { |word| word.stem }
    end
  end

end
@@ -0,0 +1,104 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
5
+ # Word bag class, implementing the bag of words / multi-set that is so popular in text
6
+ # classification literature. A single bag can contain multiple documents if you want
7
+ # it to, although for training a Bayes classifier this is probably not recommended.
8
+
9
+ $: << File.dirname(__FILE__)
10
+ require 'yaml'
11
+
module Yanbi

  # Bag of words (multi-set) built from raw text. Keeps every word in order
  # of appearance (words) together with a per-word occurrence count.
  class WordBag

    attr_reader :words

    # corpus - optional raw text to add immediately.
    def initialize(corpus=nil)
      @words = []
      @counts = {}
      standardize(corpus) if corpus
    end

    # Adds the content of the named file to the bag.
    def add_file(filename)
      # File.read closes the file; File.open(...).read left it open.
      standardize(File.read(filename))
    end

    # Adds raw text to the bag.
    def add_text(text)
      standardize(text)
    end

    # Writes the word list as YAML to "<filename>.yml".
    def save(filename)
      # Block form closes the file even when writing raises.
      File.open(filename + ".yml", "w") { |out| out.write(@words.to_yaml) }
      nil
    end

    # Replaces the bag's content with the word list stored in
    # "<filename>.yml" and rebuilds the counts from it.
    def load(filename)
      # NOTE(review): depending on the YAML library version, load_file may
      # instantiate arbitrary objects -- only load files you trust.
      @words = YAML.load_file(filename + ".yml")
      # Start from scratch; otherwise loading into a used bag double-counts.
      @counts = {}
      update_counts(@words)
    end

    # Builds a new bag of the receiving class from "<filename>.yml" and
    # returns it.
    def self.load(filename)
      bag = new
      bag.load(filename)
      bag
    end

    # Hash of word => count for all words occurring at least min times.
    def word_counts(min=1)
      @counts.select { |_word, count| count >= min }
    end

    # Removes every occurrence of each of the given words from the bag.
    def remove(words)
      words.each do |word|
        @words.delete(word)
        @counts.delete(word)
      end
    end

    # All stored words (with repetitions, in order) whose count is at least
    # min and, when max is given, at most max.
    def between_counts(min, max=nil)
      @words.select do |word|
        count = @counts[word]
        count && count >= min && (max.nil? || count <= max)
      end
    end

    # Distinct words present in both this bag and other.
    def intersection(other)
      words & other.words
    end

    def empty?
      @words.empty?
    end

    private

    # Hook for subclasses: turns raw text into stored words.
    def standardize(raw)
      process(raw)
    end

    # Tokenises raw and appends the tokens to the bag.
    #
    # The text is lower-cased; characters other than word characters,
    # whitespace, apostrophes and hyphens become spaces; the result is split
    # on whitespace and on hyphens. When a block is given, each token is
    # replaced by the block's result.
    def process(raw)
      cleaned = raw.downcase.gsub(/[^\w\s'\-]/, ' ')

      # Leading or doubled hyphens ("-a", "a--b") split into empty strings;
      # drop them so they are not stored as words.
      tokens = cleaned.split.flat_map { |chunk| chunk.split('-') }.reject(&:empty?)
      tokens.map! { |token| yield token } if block_given?

      update_counts(tokens)
      @words.concat(tokens)
    end

    # Adds one occurrence to the count of every word in data.
    def update_counts(data)
      data.each do |word|
        @counts[word] = @counts.fetch(word, 0) + 1
      end
    end
  end

end
@@ -0,0 +1,17 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
# Library entry point: loads every word bag and classifier, then the corpus
# and version files.
base = File.expand_path(File.dirname(__FILE__))

# The individual files require their siblings by bare name, so the library
# directory has to stay on the load path.
$: << base unless $:.include?(base)

# Dir[] does not return a guaranteed order on every Ruby version. Sorting
# makes the load order deterministic; in particular bayes.rb is loaded before
# fisher.rb, which subclasses Yanbi::Bayes without requiring it.
Dir[File.join(base, "wordbags", "**", "*.rb")].sort.each do |bag|
  require bag
end

Dir[File.join(base, "bayes", "**", "*.rb")].sort.each do |classifier|
  require classifier
end

# Absolute paths: bare names such as 'version' are easily shadowed by a file
# of the same name earlier on the load path.
require File.join(base, 'corpus')
require File.join(base, 'version')
@@ -0,0 +1,25 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'version'

Gem::Specification.new do |spec|
  spec.name          = "yanbi-ml"
  spec.version       = Yanbi::VERSION
  spec.authors       = ["Robert Dormer"]
  spec.email         = ["rdormer@gmail.com"]

  spec.summary       = %q{Yet Another Naive Bayes Implementation}
  spec.homepage      = "http://github.com/rdormer/yanbi-ml"
  spec.license       = "MIT"

  # Package everything tracked by git except test directories.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Runtime dependency: lib/wordbags/stembag.rb requires fast_stemmer, and
  # lib/yanbi.rb loads every file under lib/wordbags, so the gem cannot be
  # loaded without it.
  spec.add_dependency "fast-stemmer", "~> 1.0.2"

  spec.add_development_dependency "bundler", "~> 1.11"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.4.0"
end
metadata ADDED
@@ -0,0 +1,126 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: yanbi-ml
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Robert Dormer
9
+ autorequire:
10
+ bindir: exe
11
+ cert_chain: []
12
+ date: 2016-07-05 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '1.11'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '1.11'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: '10.0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: '10.0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: 3.4.0
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 3.4.0
62
+ - !ruby/object:Gem::Dependency
63
+ name: fast-stemmer
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ~>
68
+ - !ruby/object:Gem::Version
69
+ version: 1.0.2
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ version: 1.0.2
78
+ description:
79
+ email:
80
+ - rdormer@gmail.com
81
+ executables: []
82
+ extensions: []
83
+ extra_rdoc_files: []
84
+ files:
85
+ - .travis.yml
86
+ - Gemfile
87
+ - README.md
88
+ - Rakefile
89
+ - bin/console
90
+ - bin/setup
91
+ - lib/bayes/bayes.rb
92
+ - lib/bayes/fisher.rb
93
+ - lib/corpus.rb
94
+ - lib/version.rb
95
+ - lib/wordbags/diadbag.rb
96
+ - lib/wordbags/stembag.rb
97
+ - lib/wordbags/stemmed_diadbag.rb
98
+ - lib/wordbags/wordbag.rb
99
+ - lib/yanbi.rb
100
+ - yanbi-ml.gemspec
101
+ homepage: http://github.com/rdormer/yanbi-ml
102
+ licenses:
103
+ - MIT
104
+ post_install_message:
105
+ rdoc_options: []
106
+ require_paths:
107
+ - lib
108
+ required_ruby_version: !ruby/object:Gem::Requirement
109
+ none: false
110
+ requirements:
111
+ - - ! '>='
112
+ - !ruby/object:Gem::Version
113
+ version: '0'
114
+ required_rubygems_version: !ruby/object:Gem::Requirement
115
+ none: false
116
+ requirements:
117
+ - - ! '>='
118
+ - !ruby/object:Gem::Version
119
+ version: '0'
120
+ requirements: []
121
+ rubyforge_project:
122
+ rubygems_version: 1.8.25
123
+ signing_key:
124
+ specification_version: 3
125
+ summary: Yet Another Naive Bayes Implementation
126
+ test_files: []