omnicat 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.txt +3 -0
- data/README.md +1 -1
- data/lib/omnicat/classifiers/bayes.rb +68 -34
- data/lib/omnicat/classifiers/bayes_internals/category.rb +2 -1
- data/lib/omnicat/version.rb +1 -1
- data/lib/test/unit/hash_test.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ea920e881bd63f956dd1237f666d008f893668af
|
4
|
+
data.tar.gz: f9d1ec2fe73eb047c5ac661c42600cff033fd35f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4c65cec9bf29fc07b9b0f0eee51da3bfc40f2ba8e443daf287b3e76f499b9084e8526baeb7b7319acd7eeda826ff9a892a0e761848d23e52af2e4545cfbd60ff
|
7
|
+
data.tar.gz: 3f153307273e1c94bea62399a1d1f8d039b4c17956187779f08726429329a84acbce2ede51c7ade3c2ef2b1a778f37da664ae9855144f07a2c906f23d0ee5d80
|
data/CHANGELOG.txt
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# OmniCat
|
2
2
|
|
3
|
-
[Build Status](https://travis-ci.org/mustafaturan/omnicat)
|
3
|
+
[Build Status](https://travis-ci.org/mustafaturan/omnicat) [Code Climate](https://codeclimate.com/github/mustafaturan/omnicat)
|
4
4
|
|
5
5
|
A generalized framework for text classifications. For now, it only supports Naive Bayes algorithm for text classification.
|
6
6
|
|
@@ -2,8 +2,12 @@ module OmniCat
|
|
2
2
|
module Classifiers
|
3
3
|
class Bayes < ::OmniCat::Classifiers::Base
|
4
4
|
|
5
|
-
attr_accessor :categories
|
6
|
-
attr_accessor :
|
5
|
+
attr_accessor :categories # ::OmniCat::Hash - Hash of categories
|
6
|
+
attr_accessor :category_count # Integer - Total category count
|
7
|
+
attr_accessor :doc_count # Integer - Total document count
|
8
|
+
attr_accessor :token_count # Integer - Total token count
|
9
|
+
attr_accessor :uniq_token_count # Integer - Total uniq token count
|
10
|
+
attr_accessor :k_value # Integer - Smoothing constant added to token counts and used for tokens a category has not seen
|
7
11
|
|
8
12
|
def initialize(bayes_hash = {})
|
9
13
|
self.categories = ::OmniCat::Hash.new
|
@@ -56,20 +60,11 @@ module OmniCat
|
|
56
60
|
# bayes.train("neutral", "how is the management gui")
|
57
61
|
def train(category_name, doc)
|
58
62
|
if category_exists?(category_name)
|
59
|
-
|
60
|
-
|
63
|
+
increment_doc_counts(category_name)
|
64
|
+
update_priors
|
61
65
|
doc.tokenize_with_counts.each do |token, count|
|
62
|
-
|
63
|
-
categories.each do |name, category|
|
64
|
-
if category.tokens.has_key?(token)
|
65
|
-
uniq_token_addition = 1
|
66
|
-
break
|
67
|
-
end
|
68
|
-
end
|
69
|
-
self.uniq_token_count += 1 if uniq_token_addition == 0
|
70
|
-
self.token_count += count
|
66
|
+
increment_token_counts(category_name, token, count)
|
71
67
|
self.categories[category_name].tokens[token] = self.categories[category_name].tokens[token].to_i + count
|
72
|
-
self.categories[category_name].token_count += count
|
73
68
|
end
|
74
69
|
else
|
75
70
|
raise StandardError,
|
@@ -99,27 +94,13 @@ module OmniCat
|
|
99
94
|
end
|
100
95
|
score = -1000000
|
101
96
|
result = ::OmniCat::Result.new
|
102
|
-
categories.each do |
|
103
|
-
|
104
|
-
result.scores[
|
105
|
-
|
106
|
-
|
107
|
-
result.scores[name] *= k_value / token_count
|
108
|
-
else
|
109
|
-
result.scores[name] *= (
|
110
|
-
count * (
|
111
|
-
(category.tokens[token].to_i + k_value) /
|
112
|
-
(category.token_count + uniq_token_count)
|
113
|
-
)
|
114
|
-
)
|
115
|
-
end
|
97
|
+
self.categories.each do |category_name, category|
|
98
|
+
result.scores[category_name] = doc_probability(category, doc)
|
99
|
+
if result.scores[category_name] > score
|
100
|
+
result.category[:name] = category_name
|
101
|
+
score = result.scores[category_name]
|
116
102
|
end
|
117
|
-
result.
|
118
|
-
if result.scores[name] > score
|
119
|
-
result.category[:name] = name;
|
120
|
-
score = result.scores[name];
|
121
|
-
end
|
122
|
-
result.total_score += result.scores[name]
|
103
|
+
result.total_score += result.scores[category_name]
|
123
104
|
end
|
124
105
|
result.total_score = 1 if result.total_score == 0
|
125
106
|
result.category[:percentage] = (
|
@@ -135,6 +116,59 @@ module OmniCat
|
|
135
116
|
categories.has_key?(category_name)
|
136
117
|
end
|
137
118
|
|
119
|
+
# nodoc
|
120
|
+
def increment_doc_counts(category_name)
|
121
|
+
self.doc_count += 1
|
122
|
+
self.categories[category_name].doc_count += 1
|
123
|
+
end
|
124
|
+
|
125
|
+
# nodoc
|
126
|
+
def update_priors
|
127
|
+
self.categories.each do |_, category|
|
128
|
+
category.prior = category.doc_count / doc_count.to_f
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
# nodoc
|
133
|
+
def increment_token_counts(category_name, token, count)
|
134
|
+
increment_uniq_token_count(token)
|
135
|
+
self.token_count += count
|
136
|
+
self.categories[category_name].token_count += count
|
137
|
+
end
|
138
|
+
|
139
|
+
# nodoc
|
140
|
+
def increment_uniq_token_count(token)
|
141
|
+
uniq_token_addition = 0
|
142
|
+
categories.each do |_, category|
|
143
|
+
if category.tokens.has_key?(token)
|
144
|
+
uniq_token_addition = 1
|
145
|
+
break
|
146
|
+
end
|
147
|
+
end
|
148
|
+
self.uniq_token_count += 1 if uniq_token_addition == 0
|
149
|
+
end
|
150
|
+
|
151
|
+
# nodoc
|
152
|
+
def doc_probability(category, doc)
|
153
|
+
score = k_value
|
154
|
+
doc.tokenize_with_counts.each do |token, count|
|
155
|
+
score *= token_probability(category, token, count)
|
156
|
+
end
|
157
|
+
category.prior * score
|
158
|
+
end
|
159
|
+
|
160
|
+
# nodoc
|
161
|
+
def token_probability(category, token, count)
|
162
|
+
if category.tokens[token].to_i == 0
|
163
|
+
k_value / token_count
|
164
|
+
else
|
165
|
+
count * (
|
166
|
+
(category.tokens[token].to_i + k_value) /
|
167
|
+
(category.token_count + uniq_token_count)
|
168
|
+
)
|
169
|
+
end
|
170
|
+
end
|
171
|
+
|
138
172
|
end
|
139
173
|
end
|
140
174
|
end
|
@@ -2,10 +2,11 @@ module OmniCat
|
|
2
2
|
module Classifiers
|
3
3
|
module BayesInternals
|
4
4
|
class Category < ::OmniCat::Base
|
5
|
-
attr_accessor :doc_count, :tokens, :token_count
|
5
|
+
attr_accessor :doc_count, :prior, :tokens, :token_count
|
6
6
|
|
7
7
|
def initialize(category_hash = {})
|
8
8
|
self.doc_count = category_hash[:doc_count].to_i
|
9
|
+
self.prior = category_hash[:prior].to_f
|
9
10
|
self.tokens = category_hash[:tokens] || {}
|
10
11
|
self.token_count = category_hash[:token_count].to_i
|
11
12
|
end
|
data/lib/omnicat/version.rb
CHANGED
data/lib/test/unit/hash_test.rb
CHANGED
@@ -2,7 +2,7 @@ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
|
|
2
2
|
|
3
3
|
class TestHash < Test::Unit::TestCase
|
4
4
|
def test_to_hash
|
5
|
-
categories_hash = { "pos" => { doc_count: 0, tokens: {}, token_count: 0 } }
|
5
|
+
categories_hash = { "pos" => { doc_count: 0, prior: 0.0, tokens: {}, token_count: 0 } }
|
6
6
|
categories = OmniCat::Hash.new
|
7
7
|
categories["pos"] = OmniCat::Classifiers::BayesInternals::Category.new(categories_hash["pos"])
|
8
8
|
assert_equal(categories_hash, categories.to_hash)
|