simple_naive_bayes 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: fe3a97c568b49df1ee13a85badfa0c106e800f84
4
+ data.tar.gz: 3ce26714bcbbd3cd914dd96b996447e6c0150db4
5
+ SHA512:
6
+ metadata.gz: 652a9f57aa077f89e7d3611649630684bf4a49e6236e886685952d433d2081ba65a2d7c78fa8d1d0085f458378e30ed46a6e3e64e82659af10e78c39b8433151
7
+ data.tar.gz: d313812706be35cedaa51cbe93b3db5d41fbb6eb1659d733ec1b4556a7732ac14777bc56c731b269260537112a4a4aaf4746738bc96cd82bc51fd4475a5712d4
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in simple_naive_bayes.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 y42sora
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,61 @@
1
+ # SimpleNaiveBayes
2
+
3
+ This is a very simple naive bayes written in ruby.
4
+
5
+
6
+ ## Installation
7
+
8
+ $ gem install simple_naive_bayes
9
+
10
+ ## Usage
11
+
12
+ ```ruby
13
+ require 'simple_naive_bayes'
14
+ cl = SimpleNaiveBayes::NaiveBayes.new
15
+ cl.training("yes", ["Chinese", "Beijing", "Chinese"])
16
+ cl.training("yes", ["Chinese", "Chinese", "Shanghai"])
17
+ cl.training("yes", ["Chinese", "Macao"])
18
+ cl.training("no", ["Tokyo", "Japan", "Chinese"])
19
+
20
+ cl.classify(["Tokyo"])
21
+ ```
22
+
23
+ See example/example.rb for a complete, runnable example.
24
+
25
+
26
+ ## Supported Ruby Versions
27
+ Ruby 2.0.0
28
+
29
+ ## Performance
30
+ To measure the performance of the filter, I ran the following test.
31
+ The data source is the SpamAssassin public corpus (http://spamassassin.apache.org/publiccorpus/).
32
+ This data is a mail corpus, so the test classifies mails.
33
+ The mails are of three types: spam, easy_ham, and hard_ham.
34
+ The test script is publiccorpus_test.rb.
35
+
36
+ ### Data sources
37
+ #### Training Data
38
+ * http://spamassassin.apache.org/publiccorpus/20021010_easy_ham.tar.bz2
39
+ * http://spamassassin.apache.org/publiccorpus/20021010_hard_ham.tar.bz2
40
+ * http://spamassassin.apache.org/publiccorpus/20021010_spam.tar.bz2
41
+
42
+ #### Test Data
43
+ * http://spamassassin.apache.org/publiccorpus/20030228_easy_ham.tar.bz2
44
+ * http://spamassassin.apache.org/publiccorpus/20030228_hard_ham.tar.bz2
45
+ * http://spamassassin.apache.org/publiccorpus/20030228_spam.tar.bz2
46
+
47
+ ### Result
48
+ * spam accuracy rate is 99.6% (498/500)
49
+ * easy ham accuracy rate is 99.88% (2497/2500)
50
+ * hard ham accuracy rate is 81.6% (204/250)
51
+
52
+ ## License
53
+ MIT License
54
+
55
+ ## Contributing
56
+
57
+ 1. Fork it
58
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
59
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
60
+ 4. Push to the branch (`git push origin my-new-feature`)
61
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,19 @@
1
+ require 'simple_naive_bayes'
2
+
3
+ cl = SimpleNaiveBayes::NaiveBayes.new
4
+
5
+ data = [
6
+ ["yes", ["Chinese", "Beijing", "Chinese"]],
7
+ ["yes", ["Chinese", "Chinese", "Shanghai"]],
8
+ ["yes", ["Chinese", "Macao"]],
9
+ ["no", ["Tokyo", "Japan", "Chinese"]]
10
+ ]
11
+
12
+ data.each do |cat, doc|
13
+ cl.training(cat, doc)
14
+ end
15
+
16
+ test = ["Chinese", "Chinese", "Chinese", "Tokyo", "Japan"]
17
+
18
+ p cl.classify(test)
19
+ p cl.classify_with_all_result(test)
@@ -0,0 +1,155 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # test script for http://spamassassin.apache.org/publiccorpus/
4
+ require 'find'
5
+ require 'simple_naive_bayes'
6
+
7
+ train_spam_folder = "20021010/spam"
8
+ train_ham_folder = "20021010/easy_ham"
9
+ train_hard_ham_folder = "20021010/hard_ham"
10
+
11
+ test_spam_folder = "20030228/spam"
12
+ test_ham_folder = "20030228/easy_ham"
13
+ test_hard_ham_folder = "20030228/hard_ham"
14
+
15
+ @header_regxp = /[\w-]*: .*/
16
+ @nb_classifier = SimpleNaiveBayes::NaiveBayes.new
17
+
18
+
19
+ # Delete all mail headers and keep only the lines after them.
20
+ # The headers end at the first non-blank line that does not look like a mail header and whose previous line is blank.
21
+ # It expects a mail that looks like this:
22
+
23
+ # X-Original-Date: Wed, 4 Dec 2002 11:54:45 +0000
24
+ # Date: Wed, 4 Dec 2002 11:54:45 +0000
25
+ #
26
+ #
27
+ # Hi,
28
+ # I think you need to give us a little more detailed information.
29
+ # ...
30
+ def get_context_from_file(filepath)
31
+ context = []
32
+
33
+ end_header = false
34
+ before_line = "before"
35
+
36
+ open(filepath) {|f|
37
+ f.each {|line|
38
+ line = line.encode("UTF-16BE", :invalid => :replace, :undef => :replace, :replace => '?').encode("UTF-8")
39
+ line = line.chomp
40
+
41
+ if before_line.empty? and not line.empty? and not @header_regxp.match(line)
42
+ end_header = true
43
+ end
44
+
45
+ context << line if end_header
46
+
47
+ before_line = line
48
+ }
49
+ }
50
+ context.join(" ")
51
+ end
52
+
53
+ # Split the context string into a list of words.
54
+ # Returns an array like [word1, word2, word3].
55
+ # Words shorter than 3 characters are treated as stop words and dropped.
56
+ def get_word_from_context(context)
57
+ words = []
58
+
59
+ context.split(" ").each do |word|
60
+ if word[-1] == "." or word[-1] == "," or
61
+ word[-1] == "?" or word[-1] == "!" or
62
+ word[-1] == ":"
63
+
64
+ word = word[0..-2]
65
+ end
66
+
67
+ words << word unless word.size < 3
68
+ end
69
+
70
+ words
71
+ end
72
+
73
+ def train_data_from_file(category, filepath)
74
+ context = get_context_from_file(filepath)
75
+ words = get_word_from_context(context)
76
+
77
+ @nb_classifier.training(category, words)
78
+ end
79
+
80
+ def train_data_from_folder(category, folder)
81
+ all_num = 0
82
+ t0 = Time.now
83
+ Find.find(folder) do |filepath|
84
+ if File::ftype(filepath) == "file"
85
+ train_data_from_file(category, filepath)
86
+ all_num += 1
87
+ end
88
+ end
89
+ t1 = Time.now
90
+
91
+ puts "training #{category} #{t1 - t0} sec and #{all_num} file"
92
+ end
93
+
94
+
95
+ # Check the correct-classification rate for a folder; returns [all_num, correct_num].
96
+ def check_data_from_folder(category, folder)
97
+ correct_num = 0
98
+ all_num = 0
99
+
100
+ t0 = Time.now
101
+ Find.find(folder) do |filepath|
102
+ if File::ftype(filepath) == "file"
103
+ context = get_context_from_file(filepath)
104
+ words = get_word_from_context(context)
105
+ correct_num += 1 if category == @nb_classifier.classify(words)
106
+ all_num += 1
107
+ end
108
+ end
109
+ t1 = Time.now
110
+
111
+ puts "check #{category} #{t1 - t0} sec"
112
+ [all_num, correct_num]
113
+ end
114
+
115
+
116
+ train_data_from_folder("spam", train_spam_folder)
117
+ train_data_from_folder("ham", train_ham_folder)
118
+ train_data_from_folder("hard", train_hard_ham_folder)
119
+
120
+ puts "----check spam----"
121
+
122
+ ans = check_data_from_folder("spam", test_spam_folder)
123
+ puts "spam rate is " + (ans[1].to_f / ans[0]).to_s
124
+ puts "all #{ans[0]} correct #{ans[1]}"
125
+
126
+ puts "----check ham----"
127
+
128
+ ans = check_data_from_folder("ham", test_ham_folder)
129
+ puts "ham rate is " + (ans[1].to_f / ans[0]).to_s
130
+ puts "all #{ans[0]} correct #{ans[1]}"
131
+
132
+ puts "----check hard_ham----"
133
+
134
+ ans = check_data_from_folder("hard", test_hard_ham_folder)
135
+ puts "hard ham rate is " + (ans[1].to_f / ans[0]).to_s
136
+ puts "all #{ans[0]} correct #{ans[1]}"
137
+
138
+ =begin
139
+ training spam 2.337407645 sec and 501 file
140
+ training ham 7.85665402 sec and 2551 file
141
+ training hard 6.014818518 sec and 250 file
142
+ ----check spam----
143
+ check spam 4.681404607 sec
144
+ spam rate is 0.996
145
+ all 500 correct 498
146
+ ----check ham----
147
+ check ham 11.444270327 sec
148
+ ham rate is 0.9988
149
+ all 2500 correct 2497
150
+ ----check hard_ham----
151
+ check hard 8.78753183 sec
152
+ hard ham rate is 0.816
153
+ all 250 correct 204
154
+ =end
155
+
@@ -0,0 +1,3 @@
1
+ module SimpleNaiveBayes
2
+ VERSION = "0.0.2"
3
+ end
@@ -0,0 +1,116 @@
1
+ require "simple_naive_bayes/version"
2
+ require 'set'
3
+
4
+ module SimpleNaiveBayes
5
+ class NaiveBayes
6
+ =begin
7
+
8
+ P(cat|doc) = P(doc|cat) * P(cat) / P(doc)
9
+
10
+ P(doc) is the same for every category, so it can be ignored.
11
+
12
+ P(cat) = @categories_count[cat] / @all_category_num
13
+
14
+ P(doc|cat) = P(word1|cat) * P(word2|cat)....
15
+ P(word1|cat) = T(cat, word1) / (T(cat, word1) + T(cat, word2) + ...)
16
+ T(cat, word1) = @categories_word[cat][word]
17
+ (T(cat, word1) + T(cat, word2) + ...) = sum(T(cat, word)) = @categories_all_word_count[cat]
18
+
19
+ Additive smoothing
20
+
21
+ P(word1|cat) = (T(cat, word1) + a) / sum(T(cat, word) + a)
22
+ sum(T(cat, word) + a) = sum(T(cat, word)) + @all_word_set.length() * @additive = @laplace_categories_all_word_count[cat]
23
+
24
+
25
+ arg max P(cat|doc) = arg max log(P(cat|doc))
26
+ log(P(cat|doc)) = log(P(doc|cat)) + log( P(cat))
27
+
28
+ log(P(cat)) = log(@categories_count[cat]) - log(@all_category_num)
29
+
30
+ log(P(doc|cat)) = log(P(word1|cat)) + log(P(word2|cat)) + ....
31
+ log(P(word1|cat)) = log(T(cat, word1)) - log(sum(T(cat, word)))
32
+
33
+
34
+ http://aidiary.hatenablog.com/entry/20100613/1276389337
35
+ =end
36
+
37
+ def initialize()
38
+ @all_category_num = 0
39
+ @all_word_set = Set.new
40
+ @categories_count = Hash.new(0)
41
+
42
+ @categories_word = Hash.new
43
+ @categories_all_word_count = Hash.new(0)
44
+ @laplace_categories_all_word_count = Hash.new(0)
45
+ @additive = 0.5
46
+ end
47
+
48
+ """
49
+ doc = [word1, word2, word3...]
50
+ """
51
+ def training(category, doc)
52
+ @categories_count[category] += 1
53
+ @all_category_num += 1
54
+
55
+ @categories_word[category] = Hash.new(0) unless @categories_word.key?(category)
56
+ doc.each do |word|
57
+ @all_word_set.add(word)
58
+ @categories_word[category][word] += 1
59
+ @categories_all_word_count[category] += 1
60
+ end
61
+
62
+ # sum(T(cat, word) + a), where a = @additive
63
+ # Additive smoothing
64
+ @laplace_categories_all_word_count[category] = @categories_all_word_count[category] + @all_word_set.length() * @additive
65
+ end
66
+
67
+ # classify and return best category
68
+ def classify(doc)
69
+ result = classify_with_all_result(doc)
70
+
71
+ best = result.max_by { |classify_relust| classify_relust[1] }
72
+ best[0]
73
+ end
74
+
75
+ # Classify and return the log-probability score of every category.
76
+ # get all log(P(cat|doc))
77
+ # return [ [category1, probability1], [category2, probability2]... ]
78
+ def classify_with_all_result(doc)
79
+ result = []
80
+
81
+ @categories_count.keys().each do |category|
82
+ # log(P(doc|cat))
83
+ document_category = calc_document_category(doc, category)
84
+
85
+ # log(P(cat)) = log(@categories_count[cat]) - log( @all_category_num )
86
+ category_probability = Math.log2(@categories_count[category]) - Math.log2(@all_category_num)
87
+
88
+ # log(P(cat|doc)) = log(P(doc|cat)) + log(P(cat))
89
+ category_document_probability = document_category + category_probability
90
+
91
+ result << [category, category_document_probability]
92
+ end
93
+ result
94
+ end
95
+
96
+ # log(P(doc|cat)) = log(P(word1|cat)) + log(P(word2|cat)) + ....
97
+ def calc_document_category(doc, category)
98
+ probability = 0
99
+
100
+ # log(P(word1|cat)) + log(P(word2|cat)) + ....
101
+ doc.each do |word|
102
+ # log(T(cat, word1))
103
+ # Additive smoothing
104
+ category_word = Math.log2(@categories_word[category][word] + @additive)
105
+
106
+ # log(sum(T(cat, word) + a)), where a = @additive
107
+ all_category_word = Math.log2(@laplace_categories_all_word_count[category])
108
+
109
+ # log(P(word1|cat)) = log(T(cat, word1) + a) - log(sum(T(cat, word) + a))
110
+ prob = category_word - all_category_word
111
+ probability += prob if prob.finite?
112
+ end
113
+ probability
114
+ end
115
+ end
116
+ end
@@ -0,0 +1,23 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'simple_naive_bayes/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "simple_naive_bayes"
8
+ spec.version = SimpleNaiveBayes::VERSION
9
+ spec.authors = ["y42sora"]
10
+ spec.email = ["y42sora@y42sora.com"]
11
+ spec.description = %q{Simple pure ruby naive bayes}
12
+ spec.summary = %q{Simple pure ruby naive bayes}
13
+ spec.homepage = "https://github.com/y42sora/simple_naive_bayes"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.3"
22
+ spec.add_development_dependency "rake"
23
+ end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: simple_naive_bayes
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - y42sora
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-08-17 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: Simple pure ruby naive bayes
42
+ email:
43
+ - y42sora@y42sora.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .gitignore
49
+ - Gemfile
50
+ - LICENSE.txt
51
+ - README.md
52
+ - Rakefile
53
+ - example/example.rb
54
+ - example/publiccorpus_test.rb
55
+ - lib/simple_naive_bayes.rb
56
+ - lib/simple_naive_bayes/version.rb
57
+ - simple_naive_bayes.gemspec
58
+ homepage: https://github.com/y42sora/simple_naive_bayes
59
+ licenses:
60
+ - MIT
61
+ metadata: {}
62
+ post_install_message:
63
+ rdoc_options: []
64
+ require_paths:
65
+ - lib
66
+ required_ruby_version: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - '>='
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
71
+ required_rubygems_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ requirements: []
77
+ rubyforge_project:
78
+ rubygems_version: 2.0.0
79
+ signing_key:
80
+ specification_version: 4
81
+ summary: Simple pure ruby naive bayes
82
+ test_files: []