omnicat 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5d04e29b0e2e16e019592553041b62eb9b759d3d
4
- data.tar.gz: 1a4614e75aef06179e7c9589a73bf9c0b3552d20
3
+ metadata.gz: ea920e881bd63f956dd1237f666d008f893668af
4
+ data.tar.gz: f9d1ec2fe73eb047c5ac661c42600cff033fd35f
5
5
  SHA512:
6
- metadata.gz: d30317305905b877570cc2626665b74b56d3d4278422398ed05bb836d2316b2cb3fe4faee1cb0e26e1068121151fb0ba49b1eac61b79541fd2e44c23c6d19c03
7
- data.tar.gz: bc998d7a815212af3881fb81bfa18bda74d1f68df1a273f9c17fdbf76340579de26953a384eb30edbdfac51409ddb7279e6195bac1e86d51f6e11bde48e5029e
6
+ metadata.gz: 4c65cec9bf29fc07b9b0f0eee51da3bfc40f2ba8e443daf287b3e76f499b9084e8526baeb7b7319acd7eeda826ff9a892a0e761848d23e52af2e4545cfbd60ff
7
+ data.tar.gz: 3f153307273e1c94bea62399a1d1f8d039b4c17956187779f08726429329a84acbce2ede51c7ade3c2ef2b1a778f37da664ae9855144f07a2c906f23d0ee5d80
@@ -1,3 +1,6 @@
1
+ 0.1.3
2
+ # refactoring of the bayes algorithm
3
+
1
4
  0.1.2
2
5
  # fix the bayes algorithm (so important changes!)
3
6
 
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # OmniCat
2
2
 
3
- [![Build Status](https://travis-ci.org/mustafaturan/omnicat.png)](https://travis-ci.org/mustafaturan/omnicat)
3
+ [![Build Status](https://travis-ci.org/mustafaturan/omnicat.png)](https://travis-ci.org/mustafaturan/omnicat) [![Code Climate](https://codeclimate.com/github/mustafaturan/omnicat.png)](https://codeclimate.com/github/mustafaturan/omnicat)
4
4
 
5
5
  A generalized framework for text classifications. For now, it only supports Naive Bayes algorithm for text classification.
6
6
 
@@ -2,8 +2,12 @@ module OmniCat
2
2
  module Classifiers
3
3
  class Bayes < ::OmniCat::Classifiers::Base
4
4
 
5
- attr_accessor :categories, :category_count, :doc_count, :token_count, :uniq_token_count
6
- attr_accessor :k_value # helper val for skipping some Bayes theorem errors
5
+ attr_accessor :categories # ::OmniCat::Hash - Hash of categories
6
+ attr_accessor :category_count # Integer - Total category count
7
+ attr_accessor :doc_count # Integer - Total document count
8
+ attr_accessor :token_count # Integer - Total token count
9
+ attr_accessor :uniq_token_count # Integer - Total uniq token count
10
+ attr_accessor :k_value # Integer - Helper value for skipping some Bayes algorithm errors
7
11
 
8
12
  def initialize(bayes_hash = {})
9
13
  self.categories = ::OmniCat::Hash.new
@@ -56,20 +60,11 @@ module OmniCat
56
60
  # bayes.train("neutral", "how is the management gui")
57
61
  def train(category_name, doc)
58
62
  if category_exists?(category_name)
59
- self.doc_count += 1
60
- categories[category_name].doc_count += 1
63
+ increment_doc_counts(category_name)
64
+ update_priors
61
65
  doc.tokenize_with_counts.each do |token, count|
62
- uniq_token_addition = 0
63
- categories.each do |name, category|
64
- if category.tokens.has_key?(token)
65
- uniq_token_addition = 1
66
- break
67
- end
68
- end
69
- self.uniq_token_count += 1 if uniq_token_addition == 0
70
- self.token_count += count
66
+ increment_token_counts(category_name, token, count)
71
67
  self.categories[category_name].tokens[token] = self.categories[category_name].tokens[token].to_i + count
72
- self.categories[category_name].token_count += count
73
68
  end
74
69
  else
75
70
  raise StandardError,
@@ -99,27 +94,13 @@ module OmniCat
99
94
  end
100
95
  score = -1000000
101
96
  result = ::OmniCat::Result.new
102
- categories.each do |name, category|
103
- prior = category.doc_count / doc_count.to_f
104
- result.scores[name] = k_value
105
- doc.tokenize_with_counts.each do |token, count|
106
- if category.tokens[token].to_i == 0
107
- result.scores[name] *= k_value / token_count
108
- else
109
- result.scores[name] *= (
110
- count * (
111
- (category.tokens[token].to_i + k_value) /
112
- (category.token_count + uniq_token_count)
113
- )
114
- )
115
- end
97
+ self.categories.each do |category_name, category|
98
+ result.scores[category_name] = doc_probability(category, doc)
99
+ if result.scores[category_name] > score
100
+ result.category[:name] = category_name
101
+ score = result.scores[category_name]
116
102
  end
117
- result.scores[name] = prior * result.scores[name]
118
- if result.scores[name] > score
119
- result.category[:name] = name;
120
- score = result.scores[name];
121
- end
122
- result.total_score += result.scores[name]
103
+ result.total_score += result.scores[category_name]
123
104
  end
124
105
  result.total_score = 1 if result.total_score == 0
125
106
  result.category[:percentage] = (
@@ -135,6 +116,59 @@ module OmniCat
135
116
  categories.has_key?(category_name)
136
117
  end
137
118
 
119
+ # nodoc
120
+ def increment_doc_counts(category_name)
121
+ self.doc_count += 1
122
+ self.categories[category_name].doc_count += 1
123
+ end
124
+
125
+ # nodoc
126
+ def update_priors
127
+ self.categories.each do |_, category|
128
+ category.prior = category.doc_count / doc_count.to_f
129
+ end
130
+ end
131
+
132
+ # nodoc
133
+ def increment_token_counts(category_name, token, count)
134
+ increment_uniq_token_count(token)
135
+ self.token_count += count
136
+ self.categories[category_name].token_count += count
137
+ end
138
+
139
+ # nodoc
140
+ def increment_uniq_token_count(token)
141
+ uniq_token_addition = 0
142
+ categories.each do |_, category|
143
+ if category.tokens.has_key?(token)
144
+ uniq_token_addition = 1
145
+ break
146
+ end
147
+ end
148
+ self.uniq_token_count += 1 if uniq_token_addition == 0
149
+ end
150
+
151
+ # nodoc
152
+ def doc_probability(category, doc)
153
+ score = k_value
154
+ doc.tokenize_with_counts.each do |token, count|
155
+ score *= token_probability(category, token, count)
156
+ end
157
+ category.prior * score
158
+ end
159
+
160
+ # nodoc
161
+ def token_probability(category, token, count)
162
+ if category.tokens[token].to_i == 0
163
+ k_value / token_count
164
+ else
165
+ count * (
166
+ (category.tokens[token].to_i + k_value) /
167
+ (category.token_count + uniq_token_count)
168
+ )
169
+ end
170
+ end
171
+
138
172
  end
139
173
  end
140
174
  end
@@ -2,10 +2,11 @@ module OmniCat
2
2
  module Classifiers
3
3
  module BayesInternals
4
4
  class Category < ::OmniCat::Base
5
- attr_accessor :doc_count, :tokens, :token_count
5
+ attr_accessor :doc_count, :prior, :tokens, :token_count
6
6
 
7
7
  def initialize(category_hash = {})
8
8
  self.doc_count = category_hash[:doc_count].to_i
9
+ self.prior = category_hash[:prior].to_f
9
10
  self.tokens = category_hash[:tokens] || {}
10
11
  self.token_count = category_hash[:token_count].to_i
11
12
  end
@@ -1,3 +1,3 @@
1
1
  module OmniCat
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
@@ -2,7 +2,7 @@ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
2
2
 
3
3
  class TestHash < Test::Unit::TestCase
4
4
  def test_to_hash
5
- categories_hash = { "pos" => { doc_count: 0, tokens: {}, token_count: 0 } }
5
+ categories_hash = { "pos" => { doc_count: 0, prior: 0.0, tokens: {}, token_count: 0 } }
6
6
  categories = OmniCat::Hash.new
7
7
  categories["pos"] = OmniCat::Classifiers::BayesInternals::Category.new(categories_hash["pos"])
8
8
  assert_equal(categories_hash, categories.to_hash)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: omnicat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mustafa Turan