yanbi-ml 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,4 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.2
4
+ before_install: gem install bundler -v 1.11.2
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in yanbi-ml.gemspec
4
+ gemspec
@@ -0,0 +1,34 @@
1
+ # YANBI-ML
2
+
3
+ Yet Another Naive Bayes Implementation
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'yanbi-ml'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install yanbi-ml
20
+
21
+ ## Usage
22
+
23
+ TODO: Write usage instructions here
24
+
25
+
26
+ ## Contributing
27
+
28
+ Bug reports and pull requests are welcome on GitHub at https://github.com/rdormer/yanbi-ml.
29
+
30
+
31
+ ## License
32
+
33
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
34
+
@@ -0,0 +1,5 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+ task :default => :spec
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby

# Interactive console with the gem preloaded, for experimenting by hand.

require "bundler/setup"
# The library's entry point is lib/yanbi.rb; there is no lib/yanbi/ml.rb in
# the packaged file list, so requiring "yanbi/ml" raised LoadError.
require "yanbi"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

require "irb"
IRB.start
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,112 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
5
+ # Naive Bayesian classifier. Training and classification are both done via passed in
6
+ # word bags, as opposed to raw text. The first argument to new is the class of WordBag
7
+ # that you want newdoc to create. From then on, you can use newdoc to process text instead
8
+ # of manually creating word bags yourself, which will help to keep the word bag type
9
+ # consistent for a given classifier object. Note that if you really want to, you can train
10
+ # or classify with a different type of word bag than you passed in, although I can't imagine
11
+ # why you would want to. There's also a default constructor if you just want to create a
12
+ # classifier without being bothered about which word bag it uses.
13
+
14
module Yanbi

  # Naive Bayes classifier that trains on, and classifies, word bag objects
  # (anything responding to #words). Per category it keeps the number of
  # training documents and, per word, the number of training documents that
  # contained that word. Scores are computed in log space.
  class Bayes

    # klass      - the word bag class newdoc should instantiate; only the last
    #              segment of its name is kept and it is resolved inside the
    #              Yanbi namespace.
    # categories - two or more category labels. Each must respond to #to_sym;
    #              the labels are returned from classify exactly as given here.
    #
    # Raises ArgumentError when fewer than two categories are supplied.
    def initialize(klass, *categories)
      raise ArgumentError, 'at least two categories are required' unless categories.size > 1

      @categories = categories
      @category_counts = {}
      @document_counts = {}

      @categories.each do |category|
        cat = category.to_sym
        @category_counts[cat] = {}
        @document_counts[cat] = 0
      end

      @bag_class = klass.to_s.split('::').last
    end

    # Builds a classifier that uses the plain WordBag for newdoc.
    def self.default(*categories)
      new(WordBag, *categories)
    end

    # Records one training document for +category+. Each distinct word in the
    # document is counted once, no matter how often it occurs in the document.
    #
    # Raises ArgumentError if +category+ was not given to the constructor.
    def train(category, document)
      cat = category_key(category)
      @document_counts[cat] += 1

      document.words.uniq.each do |word|
        @category_counts[cat][word] = @category_counts[cat].fetch(word, 0) + 1
      end
    end

    # Returns the category label (as passed to the constructor) with the
    # highest score for +document+. Ties go to the category listed last.
    def classify(document)
      max_score(document) { |cat, doc| cond_prob(cat, doc) }
    end

    # Same as train, but wraps raw text in a word bag via newdoc first.
    def train_raw(category, text)
      train(category, newdoc(text))
    end

    # Same as classify, but wraps raw text in a word bag via newdoc first.
    def classify_raw(text)
      classify(newdoc(text))
    end

    # Drops every word whose count is below +cutoff+, either from the single
    # +category+ given or, when it is nil, from all categories.
    #
    # Raises ArgumentError if +category+ was not given to the constructor.
    def set_significance(cutoff, category=nil)
      targets = category.nil? ? @categories : [category]
      targets.each do |target|
        @category_counts[category_key(target)].reject! { |_word, count| count < cutoff }
      end
    end

    # Creates a word bag of the class chosen at construction time from +doc+.
    def newdoc(doc)
      Yanbi.const_get(@bag_class).new(doc)
    end

    # Serializes this classifier as YAML into "<name>.obj".
    def save(name)
      File.open(name + ".obj", 'w') do |out|
        YAML.dump(self, out)
      end
    end

    private

    # Maps a caller-supplied label to the symbol key used by the count tables,
    # rejecting labels this classifier was not constructed with.
    def category_key(category)
      cat = category.to_sym
      unless @document_counts.key?(cat)
        raise ArgumentError, "unknown category: #{category.inspect}"
      end
      cat
    end

    # Log score of +document+ under +cat+: the sum of the per-word log
    # probabilities plus the log of the category's share of training documents.
    def cond_prob(cat, document)
      # A category with no documents or no remaining words cannot be scored
      # (the word probabilities would divide by zero), so it can never win.
      return -Float::INFINITY if @document_counts[cat].zero? || @category_counts[cat].empty?

      total_docs = @document_counts.values.reduce(0, :+).to_f
      # Start from 0.0 so a document without words scores on the prior alone
      # instead of failing on nil.
      document_prob = document.words.uniq.reduce(0.0) { |sum, word| sum + word_prob(cat, word) }
      document_prob + Math.log(@document_counts[cat] / total_docs)
    end

    # Log of (count of +word+ in +cat+) / (sum of all word counts in +cat+).
    # Words never seen in the category get a pseudo-count of 0.1.
    def word_prob(cat, word)
      all_word_count = @category_counts[cat].values.reduce(0, :+)
      count = @category_counts[cat].fetch(word, 0.1).to_f
      Math.log(count / all_word_count)
    end

    # Yields each category (as a symbol) with +document+, collects the block's
    # scores, and returns the label of the best one. rindex means the last
    # category wins when scores are tied.
    def max_score(document)
      scores = @categories.map { |c| yield c.to_sym, document }
      @categories[scores.rindex(scores.max)]
    end

    # def weighted_prob(word, category, basicprob, weight=1.0, ap=0.5)
    #   #basicprob = word_prob(category, word) if basicprob.nil?
    #   totals = @category_counts.inject(0) {|sum, cat| sum += cat.last[word].to_i}
    #   ((weight * ap) + (totals*basicprob)) / (weight + totals)
    # end
  end

end
@@ -0,0 +1,62 @@
1
module Yanbi

  # Classifier that reuses the training data kept by Bayes but scores each
  # category by combining per-word probabilities: -2 * sum(ln p) is fed to an
  # inverse chi-square function, and the category with the highest result wins.
  class Fisher < Yanbi::Bayes

    # Returns the best-scoring category label for +text+. Despite the name,
    # +text+ must be a word bag style object responding to #words.
    def classify(text)
      max_score(text) { |cat, doc| fisher_score(cat, doc) }
    end

    private

    # Score of +document+ for +category+, capped at 1.0 by invchi2.
    def fisher_score(category, document)
      features = document.words.uniq
      cat = category.to_sym

      # Accumulate ln(p) instead of multiplying the probabilities: the raw
      # product underflows to 0.0 once a document has enough words, which
      # made the score meaningless.
      log_pscores = 0.0

      features.each do |word|
        clf = word_prob(cat, word)
        # Words never seen in this category contribute nothing.
        next unless clf > 0

        freqsum = @categories.reduce(0) { |sum, x| sum + word_prob(x.to_sym, word) }
        log_pscores += Math.log(clf / freqsum)
      end

      score = -2 * log_pscores

      # NOTE(review): df counts every feature, including the ones skipped
      # above - confirm that is intended.
      invchi2(score, features.count * 2)
    end

    # Fraction of +cat+'s training documents that contained +word+
    # (0.0 when the category has no training documents, avoiding 0/0 = NaN).
    def word_prob(cat, word)
      docs = @document_counts[cat]
      return 0.0 if docs.zero?

      @category_counts[cat][word].to_f / docs
    end

    # Upper-tail probability of a chi-square value +chi+ with an even number
    # of degrees of freedom +df+:
    #
    #   exp(-m) * sum_{i=0}^{df/2 - 1} m**i / i!,  with m = chi / 2
    #
    # The result is capped at 1.0.
    def invchi2(chi, df)
      m = chi / 2.0
      # Explicit handling of the degenerate inputs that the former blanket
      # rescue used to hide.
      return 1.0 if m.nan?
      return (m > 0 ? 0.0 : 1.0) if m.infinite?

      term = Math.exp(-m)
      sum = term

      # The i = 0 term is the initial value, so the loop covers 1 .. df/2 - 1.
      # The previous inclusive range added one extra term and inflated scores.
      (1...df / 2).each do |i|
        term *= (m / i)
        sum += term
      end

      [sum, 1.0].min
    end
  end

end
@@ -0,0 +1,63 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
5
+ # This is the class for managing a corpus of documents. It's recommended, though not necessary,
6
+ # that all of the documents in a given corpus be in the same category, if you're using the corpus
7
+ # to train your classifier. Can accept either raw strings through add_doc, or files through add_file.
8
+ # Files can be delimited so that you can have more than one document in them, and commenting is
9
+ # available
10
+
11
+ $: << File.dirname(__FILE__)
12
+ require 'yanbi'
13
+
14
module Yanbi

  # Collection of documents. Every non-blank document added is stored three
  # ways: as cleaned text (docs), as its own word bag (bags), and merged into
  # one aggregate word bag (all).
  class Corpus

    attr_reader :docs
    attr_reader :bags
    attr_reader :all

    # klass - word bag class used for the aggregate bag and for each
    #         per-document bag.
    def initialize(klass=WordBag)
      @all = klass.new
      @docs = []
      @bags = []
    end

    # Number of documents stored.
    def size
      @docs.size
    end

    # Reads +docpath+ and adds its contents. When +delim+ is given the file is
    # split on it and every piece is added as a separate document. +comment+
    # is passed through to add_doc.
    def add_file(docpath, delim=nil, comment=nil)
      # File.read closes the handle even if reading raises.
      raw = File.read(docpath)

      if delim
        raw.split(delim).each { |d| add_doc(d, comment) }
      else
        add_doc(raw, comment)
      end
    end

    # Adds one document. Every match of +comment+ (a String or Regexp, as
    # accepted by String#gsub) is removed, then surrounding whitespace is
    # stripped; documents that end up empty are ignored.
    #
    # The caller's string is left untouched, so frozen strings are accepted.
    def add_doc(doc, comment=nil)
      text = comment ? doc.gsub(comment, '') : doc
      text = text.strip
      return if text.empty?

      @bags << @all.class.new(text)
      @all.add_text text
      @docs << text
    end

    # Yields the word bag of each stored document, in insertion order.
    def each_doc
      @bags.each { |bag| yield bag }
    end
  end

end
@@ -0,0 +1,7 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
5
module Yanbi
  # Gem version string. Frozen so the constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,36 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
5
+ # A word bag that stores the words as diads instead of individual words.
6
+ # i.e. "the quick brown fox" becomes "the quick", "quick brown", "brown fox".
7
+ # This type of shingling is often recommended as a way to boost the accuracy
8
+ # of Bayes classifiers
9
+
10
+ $: << File.dirname(__FILE__)
11
+ require 'wordbag'
12
+
13
module Yanbi

  # Word bag whose entries are pairs of adjacent words joined by a space,
  # rather than single words.
  class DiadBag < WordBag
    # Lowercases +raw+, blanks out every character that is not a word
    # character, whitespace, apostrophe or hyphen, splits on whitespace and
    # then on hyphens, optionally maps each token through the given block,
    # and finally stores each pair of neighbouring tokens as one entry.
    def process(raw)
      text = raw.downcase
      text.gsub!(/[^\w\s'\-]/, ' ')

      tokens = text.split.map { |token| token.split(/-/) }.flatten
      tokens.map! { |token| yield token } if block_given?

      # each_cons(2) walks the sliding window of neighbours; fewer than two
      # tokens produce no pairs.
      pairs = tokens.each_cons(2).map { |left, right| "#{left} #{right}" }

      update_counts(pairs)
      @words.concat(pairs)
    end
  end

end
@@ -0,0 +1,20 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
5
+ # This is a word bag with a post-processing step to stem (lemmatize)
6
+ # the words in the bag
7
+
8
+ $: << File.dirname(__FILE__)
9
+ require 'fast_stemmer'
10
+ require 'wordbag'
11
+
12
module Yanbi

  # Word bag that stores the stem of each word instead of the word itself.
  class StemmedWordBag < WordBag
    # Runs the normal processing pipeline, replacing every token with the
    # result of calling #stem on it.
    def standardize(raw)
      process(raw, &:stem)
    end
  end

end
@@ -0,0 +1,16 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
5
+ $: << File.dirname(__FILE__)
6
+ require 'diadbag'
7
+
8
module Yanbi

  # Diad bag whose tokens are stemmed before they are paired up.
  class StemmedDiadBag < DiadBag
    # Runs DiadBag's processing pipeline, replacing every token with the
    # result of calling #stem on it.
    def standardize(raw)
      process(raw, &:stem)
    end
  end

end
@@ -0,0 +1,104 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
5
+ # Word bag class, implementing the bag of words / multi-set that is so popular in text
6
+ # classification literature. A single bag can contain multiple documents if you want
7
+ # it to, although for training a Bayes classifier this is probably not recommended.
8
+
9
+ $: << File.dirname(__FILE__)
10
+ require 'yaml'
11
+
12
module Yanbi

  # Bag of words (multiset): keeps every processed word in insertion order
  # in +words+ together with a per-word occurrence count.
  class WordBag

    attr_reader :words

    # corpus - optional raw text to process into the bag immediately.
    def initialize(corpus=nil)
      @words = []
      @counts = {}
      standardize(corpus) if corpus
    end

    # Reads +filename+ and adds its text to the bag.
    def add_file(filename)
      # File.read closes the handle; File.open(...).read left it open.
      standardize(File.read(filename))
    end

    # Adds raw +text+ to the bag.
    def add_text(text)
      standardize(text)
    end

    # Writes the word list as YAML to "<filename>.yml".
    def save(filename)
      # Block form closes the file even if the write raises.
      File.open(filename + ".yml", "w") { |out| out.write(@words.to_yaml) }
      nil
    end

    # Replaces the bag's contents with the word list stored in
    # "<filename>.yml" and rebuilds the counts from it.
    #
    # NOTE(review): this deserializes YAML from disk; only load files from a
    # trusted source.
    def load(filename)
      @words = YAML.load_file(filename + ".yml")
      # Start from empty counts; previously the counts of the replaced words
      # were kept and the loaded words were added on top of them.
      @counts = {}
      update_counts(@words)
    end

    # Builds a new bag from "<filename>.yml" and returns the bag itself
    # (previously the raw word Array leaked out instead). Uses +new+ so that
    # subclasses get an instance of their own class.
    def self.load(filename)
      bag = new
      bag.load(filename)
      bag
    end

    # Hash of word => count for every word occurring at least +min+ times.
    def word_counts(min=1)
      @counts.select { |_word, count| count >= min }
    end

    # Removes every occurrence of each of the given +words+ and their counts.
    def remove(words)
      words.each do |word|
        @words.delete(word)
        @counts.delete(word)
      end
    end

    # All stored words (duplicates included, original order) whose count is
    # at least +min+ and, when +max+ is given, at most +max+.
    def between_counts(min, max=nil)
      counts = @counts.select { |_word, count| count >= min && (max.nil? || count <= max) }
      # key? is a hash lookup; keys.include? rebuilt and scanned an Array
      # for every word.
      @words.select { |word| counts.key?(word) }
    end

    # Distinct words present in both this bag and +other+.
    def intersection(other)
      words & other.words
    end

    def empty?
      @words.empty?
    end

    private

    # Hook for subclasses: turns raw text into stored words.
    def standardize(raw)
      process(raw)
    end

    # Lowercases +raw+, blanks out every character that is not a word
    # character, whitespace, apostrophe or hyphen, splits on whitespace and
    # then on hyphens, optionally maps each word through the given block,
    # then records the words and their counts.
    def process(raw)
      processed = raw.downcase
      processed.gsub!(/[^\w\s'\-]/, ' ')
      words = processed.split.flat_map { |x| x.split(/-/) }

      words.map! { |x| yield x } if block_given?

      update_counts(words)
      @words.concat(words)
    end

    # Increments the count of every word in +data+.
    def update_counts(data)
      data.each { |word| @counts[word] = @counts.fetch(word, 0) + 1 }
    end
  end

end
@@ -0,0 +1,17 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
5
# Library entry point: puts this directory on the load path and requires
# every word bag and classifier file, then the corpus and version files.
base = File.dirname(__FILE__)
$: << base

# Dir[] only returns sorted results from Ruby 3.0 on. Sort explicitly so the
# load order is the same everywhere.
Dir[base + "/wordbags/**/*.rb"].sort.each do |bag|
  require bag
end

# Order matters here: bayes/fisher.rb subclasses Yanbi::Bayes without
# requiring bayes/bayes.rb itself, so bayes.rb has to be loaded first.
Dir[base + "/bayes/**/*.rb"].sort.each do |c|
  require c
end

require 'corpus'
require 'version'
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'version'
5
+
6
# Gem packaging metadata for yanbi-ml.
Gem::Specification.new do |spec|
  spec.name          = "yanbi-ml"
  spec.version       = Yanbi::VERSION
  spec.authors       = ["Robert Dormer"]
  spec.email         = ["rdormer@gmail.com"]

  spec.summary       = %q{Yet Another Naive Bayes Implementation}
  spec.homepage      = "http://github.com/rdormer/yanbi-ml"
  spec.license       = "MIT"

  # Package everything tracked by git except test/spec/features directories.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # lib/wordbags/stembag.rb requires 'fast_stemmer' when the library loads,
  # so this has to be a runtime dependency; as a development dependency it
  # was not installed for users of the gem.
  spec.add_dependency "fast-stemmer", "~> 1.0.2"

  spec.add_development_dependency "bundler", "~> 1.11"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.4.0"
end
metadata ADDED
@@ -0,0 +1,126 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: yanbi-ml
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Robert Dormer
9
+ autorequire:
10
+ bindir: exe
11
+ cert_chain: []
12
+ date: 2016-07-05 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '1.11'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '1.11'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: '10.0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: '10.0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: 3.4.0
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 3.4.0
62
+ - !ruby/object:Gem::Dependency
63
+ name: fast-stemmer
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ~>
68
+ - !ruby/object:Gem::Version
69
+ version: 1.0.2
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ version: 1.0.2
78
+ description:
79
+ email:
80
+ - rdormer@gmail.com
81
+ executables: []
82
+ extensions: []
83
+ extra_rdoc_files: []
84
+ files:
85
+ - .travis.yml
86
+ - Gemfile
87
+ - README.md
88
+ - Rakefile
89
+ - bin/console
90
+ - bin/setup
91
+ - lib/bayes/bayes.rb
92
+ - lib/bayes/fisher.rb
93
+ - lib/corpus.rb
94
+ - lib/version.rb
95
+ - lib/wordbags/diadbag.rb
96
+ - lib/wordbags/stembag.rb
97
+ - lib/wordbags/stemmed_diadbag.rb
98
+ - lib/wordbags/wordbag.rb
99
+ - lib/yanbi.rb
100
+ - yanbi-ml.gemspec
101
+ homepage: http://github.com/rdormer/yanbi-ml
102
+ licenses:
103
+ - MIT
104
+ post_install_message:
105
+ rdoc_options: []
106
+ require_paths:
107
+ - lib
108
+ required_ruby_version: !ruby/object:Gem::Requirement
109
+ none: false
110
+ requirements:
111
+ - - ! '>='
112
+ - !ruby/object:Gem::Version
113
+ version: '0'
114
+ required_rubygems_version: !ruby/object:Gem::Requirement
115
+ none: false
116
+ requirements:
117
+ - - ! '>='
118
+ - !ruby/object:Gem::Version
119
+ version: '0'
120
+ requirements: []
121
+ rubyforge_project:
122
+ rubygems_version: 1.8.25
123
+ signing_key:
124
+ specification_version: 3
125
+ summary: Yet Another Naive Bayes Implementation
126
+ test_files: []