omnicat 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5d04e29b0e2e16e019592553041b62eb9b759d3d
4
- data.tar.gz: 1a4614e75aef06179e7c9589a73bf9c0b3552d20
3
+ metadata.gz: ea920e881bd63f956dd1237f666d008f893668af
4
+ data.tar.gz: f9d1ec2fe73eb047c5ac661c42600cff033fd35f
5
5
  SHA512:
6
- metadata.gz: d30317305905b877570cc2626665b74b56d3d4278422398ed05bb836d2316b2cb3fe4faee1cb0e26e1068121151fb0ba49b1eac61b79541fd2e44c23c6d19c03
7
- data.tar.gz: bc998d7a815212af3881fb81bfa18bda74d1f68df1a273f9c17fdbf76340579de26953a384eb30edbdfac51409ddb7279e6195bac1e86d51f6e11bde48e5029e
6
+ metadata.gz: 4c65cec9bf29fc07b9b0f0eee51da3bfc40f2ba8e443daf287b3e76f499b9084e8526baeb7b7319acd7eeda826ff9a892a0e761848d23e52af2e4545cfbd60ff
7
+ data.tar.gz: 3f153307273e1c94bea62399a1d1f8d039b4c17956187779f08726429329a84acbce2ede51c7ade3c2ef2b1a778f37da664ae9855144f07a2c906f23d0ee5d80
@@ -1,3 +1,6 @@
1
+ 0.1.3
2
+ # refactoring of the bayes algorithm
3
+
1
4
  0.1.2
2
5
  # fix the bayes algorithm (so important changes!)
3
6
 
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # OmniCat
2
2
 
3
- [![Build Status](https://travis-ci.org/mustafaturan/omnicat.png)](https://travis-ci.org/mustafaturan/omnicat)
3
+ [![Build Status](https://travis-ci.org/mustafaturan/omnicat.png)](https://travis-ci.org/mustafaturan/omnicat) [![Code Climate](https://codeclimate.com/github/mustafaturan/omnicat.png)](https://codeclimate.com/github/mustafaturan/omnicat)
4
4
 
5
5
  A generalized framework for text classifications. For now, it only supports Naive Bayes algorithm for text classification.
6
6
 
@@ -2,8 +2,12 @@ module OmniCat
2
2
  module Classifiers
3
3
  class Bayes < ::OmniCat::Classifiers::Base
4
4
 
5
- attr_accessor :categories, :category_count, :doc_count, :token_count, :uniq_token_count
6
- attr_accessor :k_value # helper val for skipping some Bayes theorem errors
5
+ attr_accessor :categories # ::OmniCat::Hash - Hash of categories
6
+ attr_accessor :category_count # Integer - Total category count
7
+ attr_accessor :doc_count # Integer - Total doc count
8
+ attr_accessor :token_count # Integer - Total token count
9
+ attr_accessor :uniq_token_count # Integer - Total uniq token count
10
+ attr_accessor :k_value # Integer - Helper value for skipping some Bayes algorithm errors
7
11
 
8
12
  def initialize(bayes_hash = {})
9
13
  self.categories = ::OmniCat::Hash.new
@@ -56,20 +60,11 @@ module OmniCat
56
60
  # bayes.train("neutral", "how is the management gui")
57
61
  def train(category_name, doc)
58
62
  if category_exists?(category_name)
59
- self.doc_count += 1
60
- categories[category_name].doc_count += 1
63
+ increment_doc_counts(category_name)
64
+ update_priors
61
65
  doc.tokenize_with_counts.each do |token, count|
62
- uniq_token_addition = 0
63
- categories.each do |name, category|
64
- if category.tokens.has_key?(token)
65
- uniq_token_addition = 1
66
- break
67
- end
68
- end
69
- self.uniq_token_count += 1 if uniq_token_addition == 0
70
- self.token_count += count
66
+ increment_token_counts(category_name, token, count)
71
67
  self.categories[category_name].tokens[token] = self.categories[category_name].tokens[token].to_i + count
72
- self.categories[category_name].token_count += count
73
68
  end
74
69
  else
75
70
  raise StandardError,
@@ -99,27 +94,13 @@ module OmniCat
99
94
  end
100
95
  score = -1000000
101
96
  result = ::OmniCat::Result.new
102
- categories.each do |name, category|
103
- prior = category.doc_count / doc_count.to_f
104
- result.scores[name] = k_value
105
- doc.tokenize_with_counts.each do |token, count|
106
- if category.tokens[token].to_i == 0
107
- result.scores[name] *= k_value / token_count
108
- else
109
- result.scores[name] *= (
110
- count * (
111
- (category.tokens[token].to_i + k_value) /
112
- (category.token_count + uniq_token_count)
113
- )
114
- )
115
- end
97
+ self.categories.each do |category_name, category|
98
+ result.scores[category_name] = doc_probability(category, doc)
99
+ if result.scores[category_name] > score
100
+ result.category[:name] = category_name
101
+ score = result.scores[category_name]
116
102
  end
117
- result.scores[name] = prior * result.scores[name]
118
- if result.scores[name] > score
119
- result.category[:name] = name;
120
- score = result.scores[name];
121
- end
122
- result.total_score += result.scores[name]
103
+ result.total_score += result.scores[category_name]
123
104
  end
124
105
  result.total_score = 1 if result.total_score == 0
125
106
  result.category[:percentage] = (
@@ -135,6 +116,59 @@ module OmniCat
135
116
  categories.has_key?(category_name)
136
117
  end
137
118
 
119
+ # nodoc
120
+ def increment_doc_counts(category_name)
121
+ self.doc_count += 1
122
+ self.categories[category_name].doc_count += 1
123
+ end
124
+
125
+ # nodoc
126
+ def update_priors
127
+ self.categories.each do |_, category|
128
+ category.prior = category.doc_count / doc_count.to_f
129
+ end
130
+ end
131
+
132
+ # nodoc
133
+ def increment_token_counts(category_name, token, count)
134
+ increment_uniq_token_count(token)
135
+ self.token_count += count
136
+ self.categories[category_name].token_count += count
137
+ end
138
+
139
+ # nodoc
140
+ def increment_uniq_token_count(token)
141
+ uniq_token_addition = 0
142
+ categories.each do |_, category|
143
+ if category.tokens.has_key?(token)
144
+ uniq_token_addition = 1
145
+ break
146
+ end
147
+ end
148
+ self.uniq_token_count += 1 if uniq_token_addition == 0
149
+ end
150
+
151
+ # nodoc
152
+ def doc_probability(category, doc)
153
+ score = k_value
154
+ doc.tokenize_with_counts.each do |token, count|
155
+ score *= token_probability(category, token, count)
156
+ end
157
+ category.prior * score
158
+ end
159
+
160
+ # nodoc
161
+ def token_probability(category, token, count)
162
+ if category.tokens[token].to_i == 0
163
+ k_value / token_count
164
+ else
165
+ count * (
166
+ (category.tokens[token].to_i + k_value) /
167
+ (category.token_count + uniq_token_count)
168
+ )
169
+ end
170
+ end
171
+
138
172
  end
139
173
  end
140
174
  end
@@ -2,10 +2,11 @@ module OmniCat
2
2
  module Classifiers
3
3
  module BayesInternals
4
4
  class Category < ::OmniCat::Base
5
- attr_accessor :doc_count, :tokens, :token_count
5
+ attr_accessor :doc_count, :prior, :tokens, :token_count
6
6
 
7
7
  def initialize(category_hash = {})
8
8
  self.doc_count = category_hash[:doc_count].to_i
9
+ self.prior = category_hash[:prior].to_f
9
10
  self.tokens = category_hash[:tokens] || {}
10
11
  self.token_count = category_hash[:token_count].to_i
11
12
  end
@@ -1,3 +1,3 @@
1
1
  module OmniCat
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
@@ -2,7 +2,7 @@ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
2
2
 
3
3
  class TestHash < Test::Unit::TestCase
4
4
  def test_to_hash
5
- categories_hash = { "pos" => { doc_count: 0, tokens: {}, token_count: 0 } }
5
+ categories_hash = { "pos" => { doc_count: 0, prior: 0.0, tokens: {}, token_count: 0 } }
6
6
  categories = OmniCat::Hash.new
7
7
  categories["pos"] = OmniCat::Classifiers::BayesInternals::Category.new(categories_hash["pos"])
8
8
  assert_equal(categories_hash, categories.to_hash)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: omnicat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mustafa Turan