omnicat 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.txt +3 -0
- data/README.md +1 -1
- data/lib/omnicat/classifiers/bayes.rb +68 -34
- data/lib/omnicat/classifiers/bayes_internals/category.rb +2 -1
- data/lib/omnicat/version.rb +1 -1
- data/lib/test/unit/hash_test.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ea920e881bd63f956dd1237f666d008f893668af
|
4
|
+
data.tar.gz: f9d1ec2fe73eb047c5ac661c42600cff033fd35f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4c65cec9bf29fc07b9b0f0eee51da3bfc40f2ba8e443daf287b3e76f499b9084e8526baeb7b7319acd7eeda826ff9a892a0e761848d23e52af2e4545cfbd60ff
|
7
|
+
data.tar.gz: 3f153307273e1c94bea62399a1d1f8d039b4c17956187779f08726429329a84acbce2ede51c7ade3c2ef2b1a778f37da664ae9855144f07a2c906f23d0ee5d80
|
data/CHANGELOG.txt
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# OmniCat
|
2
2
|
|
3
|
-
[![Build Status](https://travis-ci.org/mustafaturan/omnicat.png)](https://travis-ci.org/mustafaturan/omnicat)
|
3
|
+
[![Build Status](https://travis-ci.org/mustafaturan/omnicat.png)](https://travis-ci.org/mustafaturan/omnicat) [![Code Climate](https://codeclimate.com/github/mustafaturan/omnicat.png)](https://codeclimate.com/github/mustafaturan/omnicat)
|
4
4
|
|
5
5
|
A generalized framework for text classifications. For now, it only supports Naive Bayes algorithm for text classification.
|
6
6
|
|
@@ -2,8 +2,12 @@ module OmniCat
|
|
2
2
|
module Classifiers
|
3
3
|
class Bayes < ::OmniCat::Classifiers::Base
|
4
4
|
|
5
|
-
attr_accessor :categories
|
6
|
-
attr_accessor :
|
5
|
+
attr_accessor :categories # ::OmniCat::Hash - Hash of categories
|
6
|
+
attr_accessor :category_count # Integer - Total category count
|
7
|
+
attr_accessor :doc_count # Integer - Total document count
|
8
|
+
attr_accessor :token_count # Integer - Total token count
|
9
|
+
attr_accessor :uniq_token_count # Integer - Total unique token count
|
10
|
+
attr_accessor :k_value # Integer - Helper value for skipping some Bayes algorithm errors
|
7
11
|
|
8
12
|
def initialize(bayes_hash = {})
|
9
13
|
self.categories = ::OmniCat::Hash.new
|
@@ -56,20 +60,11 @@ module OmniCat
|
|
56
60
|
# bayes.train("neutral", "how is the management gui")
|
57
61
|
def train(category_name, doc)
|
58
62
|
if category_exists?(category_name)
|
59
|
-
|
60
|
-
|
63
|
+
increment_doc_counts(category_name)
|
64
|
+
update_priors
|
61
65
|
doc.tokenize_with_counts.each do |token, count|
|
62
|
-
|
63
|
-
categories.each do |name, category|
|
64
|
-
if category.tokens.has_key?(token)
|
65
|
-
uniq_token_addition = 1
|
66
|
-
break
|
67
|
-
end
|
68
|
-
end
|
69
|
-
self.uniq_token_count += 1 if uniq_token_addition == 0
|
70
|
-
self.token_count += count
|
66
|
+
increment_token_counts(category_name, token, count)
|
71
67
|
self.categories[category_name].tokens[token] = self.categories[category_name].tokens[token].to_i + count
|
72
|
-
self.categories[category_name].token_count += count
|
73
68
|
end
|
74
69
|
else
|
75
70
|
raise StandardError,
|
@@ -99,27 +94,13 @@ module OmniCat
|
|
99
94
|
end
|
100
95
|
score = -1000000
|
101
96
|
result = ::OmniCat::Result.new
|
102
|
-
categories.each do |
|
103
|
-
|
104
|
-
result.scores[
|
105
|
-
|
106
|
-
|
107
|
-
result.scores[name] *= k_value / token_count
|
108
|
-
else
|
109
|
-
result.scores[name] *= (
|
110
|
-
count * (
|
111
|
-
(category.tokens[token].to_i + k_value) /
|
112
|
-
(category.token_count + uniq_token_count)
|
113
|
-
)
|
114
|
-
)
|
115
|
-
end
|
97
|
+
self.categories.each do |category_name, category|
|
98
|
+
result.scores[category_name] = doc_probability(category, doc)
|
99
|
+
if result.scores[category_name] > score
|
100
|
+
result.category[:name] = category_name
|
101
|
+
score = result.scores[category_name]
|
116
102
|
end
|
117
|
-
result.
|
118
|
-
if result.scores[name] > score
|
119
|
-
result.category[:name] = name;
|
120
|
-
score = result.scores[name];
|
121
|
-
end
|
122
|
-
result.total_score += result.scores[name]
|
103
|
+
result.total_score += result.scores[category_name]
|
123
104
|
end
|
124
105
|
result.total_score = 1 if result.total_score == 0
|
125
106
|
result.category[:percentage] = (
|
@@ -135,6 +116,59 @@ module OmniCat
|
|
135
116
|
categories.has_key?(category_name)
|
136
117
|
end
|
137
118
|
|
119
|
+
# nodoc
|
120
|
+
def increment_doc_counts(category_name)
|
121
|
+
self.doc_count += 1
|
122
|
+
self.categories[category_name].doc_count += 1
|
123
|
+
end
|
124
|
+
|
125
|
+
# nodoc
|
126
|
+
def update_priors
|
127
|
+
self.categories.each do |_, category|
|
128
|
+
category.prior = category.doc_count / doc_count.to_f
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
# nodoc
|
133
|
+
def increment_token_counts(category_name, token, count)
|
134
|
+
increment_uniq_token_count(token)
|
135
|
+
self.token_count += count
|
136
|
+
self.categories[category_name].token_count += count
|
137
|
+
end
|
138
|
+
|
139
|
+
# nodoc
|
140
|
+
def increment_uniq_token_count(token)
|
141
|
+
uniq_token_addition = 0
|
142
|
+
categories.each do |_, category|
|
143
|
+
if category.tokens.has_key?(token)
|
144
|
+
uniq_token_addition = 1
|
145
|
+
break
|
146
|
+
end
|
147
|
+
end
|
148
|
+
self.uniq_token_count += 1 if uniq_token_addition == 0
|
149
|
+
end
|
150
|
+
|
151
|
+
# nodoc
|
152
|
+
def doc_probability(category, doc)
|
153
|
+
score = k_value
|
154
|
+
doc.tokenize_with_counts.each do |token, count|
|
155
|
+
score *= token_probability(category, token, count)
|
156
|
+
end
|
157
|
+
category.prior * score
|
158
|
+
end
|
159
|
+
|
160
|
+
# nodoc
|
161
|
+
def token_probability(category, token, count)
|
162
|
+
if category.tokens[token].to_i == 0
|
163
|
+
k_value / token_count
|
164
|
+
else
|
165
|
+
count * (
|
166
|
+
(category.tokens[token].to_i + k_value) /
|
167
|
+
(category.token_count + uniq_token_count)
|
168
|
+
)
|
169
|
+
end
|
170
|
+
end
|
171
|
+
|
138
172
|
end
|
139
173
|
end
|
140
174
|
end
|
@@ -2,10 +2,11 @@ module OmniCat
|
|
2
2
|
module Classifiers
|
3
3
|
module BayesInternals
|
4
4
|
class Category < ::OmniCat::Base
|
5
|
-
attr_accessor :doc_count, :tokens, :token_count
|
5
|
+
attr_accessor :doc_count, :prior, :tokens, :token_count
|
6
6
|
|
7
7
|
def initialize(category_hash = {})
|
8
8
|
self.doc_count = category_hash[:doc_count].to_i
|
9
|
+
self.prior = category_hash[:prior].to_f
|
9
10
|
self.tokens = category_hash[:tokens] || {}
|
10
11
|
self.token_count = category_hash[:token_count].to_i
|
11
12
|
end
|
data/lib/omnicat/version.rb
CHANGED
data/lib/test/unit/hash_test.rb
CHANGED
@@ -2,7 +2,7 @@ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
|
|
2
2
|
|
3
3
|
class TestHash < Test::Unit::TestCase
|
4
4
|
def test_to_hash
|
5
|
-
categories_hash = { "pos" => { doc_count: 0, tokens: {}, token_count: 0 } }
|
5
|
+
categories_hash = { "pos" => { doc_count: 0, prior: 0.0, tokens: {}, token_count: 0 } }
|
6
6
|
categories = OmniCat::Hash.new
|
7
7
|
categories["pos"] = OmniCat::Classifiers::BayesInternals::Category.new(categories_hash["pos"])
|
8
8
|
assert_equal(categories_hash, categories.to_hash)
|