omnicat 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5d04e29b0e2e16e019592553041b62eb9b759d3d
4
+ data.tar.gz: 1a4614e75aef06179e7c9589a73bf9c0b3552d20
5
+ SHA512:
6
+ metadata.gz: d30317305905b877570cc2626665b74b56d3d4278422398ed05bb836d2316b2cb3fe4faee1cb0e26e1068121151fb0ba49b1eac61b79541fd2e44c23c6d19c03
7
+ data.tar.gz: bc998d7a815212af3881fb81bfa18bda74d1f68df1a273f9c17fdbf76340579de26953a384eb30edbdfac51409ddb7279e6195bac1e86d51f6e11bde48e5029e
data/CHANGELOG.txt CHANGED
@@ -1,3 +1,6 @@
1
+ 0.1.2
2
+ # fix the Bayes algorithm (important changes!)
3
+
1
4
  0.1.1
2
5
  # fix Regexp error for ruby version < 2.0.0
3
6
 
data/README.md CHANGED
@@ -57,9 +57,9 @@ Train category with multiple documents.
57
57
  Classify a document.
58
58
 
59
59
  result = bayes.classify('I feel so good and happy')
60
- => #<OmniCat::Result:0x007fe59b97b548 @category={:name=>"negative", :percentage=>99}, @scores={"positive"=>1.749909854122994e-07, "negative"=>0.014084507042253521}, @total_score=0.014084682033238934>
60
+ => #<OmniCat::Result:0x007fd20296aad8 @category={:name=>"positive", :percentage=>73}, @scores={"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, @total_score=7.385426829679431e-09>
61
61
  result.to_hash
62
- => {:category=>{:name=>"negative", :percentage=>99}, :scores=>{"positive"=>1.749909854122994e-07, "negative"=>0.014084507042253521}, :total_score=>0.014084682033238934}
62
+ => {:category=>{:name=>"positive", :percentage=>73}, :scores=>{"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, :total_score=>7.385426829679431e-09}
63
63
 
64
64
  ### Classify batch
65
65
  Classify multiple documents at a time.
@@ -70,19 +70,22 @@ Classify multiple documents at a time.
70
70
  'a good piece of work'
71
71
  ]
72
72
  )
73
- => [#<OmniCat::Result:0x007fe59b949d90 @category={:name=>"negative", :percentage=>75}, @scores={"positive"=>7.962089836259623e-06, "negative"=>2.5145916163515512e-05}, @total_score=3.3108005999775135e-05>, #<OmniCat::Result:0x007fe59c9d7d10 @category={:name=>"positive", :percentage=>100}, @scores={"positive"=>0.0005434126313247192, "negative"=>0}, @total_score=0.0005434126313247192>]
73
+ => [#<OmniCat::Result:0x007fd2029341b8 @category={:name=>"negative", :percentage=>78}, @scores={"positive"=>2.5521869888765736e-14, "negative"=>9.074442627116706e-14}, @total_score=1.162662961599328e-13>, #<OmniCat::Result:0x007fd20292e7e0 @category={:name=>"positive", :percentage=>80}, @scores={"positive"=>2.411265432098765e-07, "negative"=>5.880238822371627e-08}, @total_score=2.999289314335928e-07>]
74
74
 
75
75
  ### Convert to hash
76
76
  Convert full Bayes object to hash.
77
77
 
78
78
  # For storing, restoring model data
79
79
  bayes_hash = bayes.to_hash
80
+ => {:categories=>{"positive"=>{:doc_count=>4, :tokens=>{"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, :token_count=>37}, "negative"=>{:doc_count=>4, :tokens=>{"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, :token_count=>17}}, :category_count=>2, :doc_count=>8, :k_value=>1.0, :token_count=>54, :uniq_token_count=>43}
80
81
 
81
82
  ### Load from hash
82
83
  Load full Bayes object from hash.
83
84
 
84
85
  another_bayes_obj = OmniCat::Classifiers::Bayes.new(bayes_hash)
86
+ => #<OmniCat::Classifiers::Bayes:0x007fd20308cff0 @categories={"positive"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf78 @doc_count=4, @tokens={"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, @token_count=37>, "negative"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf00 @doc_count=4, @tokens={"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, @token_count=17>}, @category_count=2, @doc_count=8, @k_value=1.0, @token_count=54, @uniq_token_count=43>
85
87
  another_bayes_obj.classify('best senses')
88
+ => #<OmniCat::Result:0x007fd203075008 @category={:name=>"positive", :percentage=>57}, @scores={"positive"=>0.0002314814814814815, "negative"=>0.00017146776406035664}, @total_score=0.00040294924554183816>
86
89
 
87
90
  ## Todo
88
91
  * Add more text classification modules such as Support Vector Machine (SVM).
@@ -2,7 +2,7 @@ module OmniCat
2
2
  module Classifiers
3
3
  class Bayes < ::OmniCat::Classifiers::Base
4
4
 
5
- attr_accessor :categories, :category_count, :doc_count, :token_count
5
+ attr_accessor :categories, :category_count, :doc_count, :token_count, :uniq_token_count
6
6
  attr_accessor :k_value # helper val for skipping some Bayes theorem errors
7
7
 
8
8
  def initialize(bayes_hash = {})
@@ -16,6 +16,7 @@ module OmniCat
16
16
  self.doc_count = bayes_hash[:doc_count].to_i
17
17
  self.k_value = bayes_hash[:k_value] || 1.0
18
18
  self.token_count = bayes_hash[:token_count].to_i
19
+ self.uniq_token_count = bayes_hash[:uniq_token_count].to_i
19
20
  end
20
21
 
21
22
  # Allows adding new classification category
@@ -53,18 +54,26 @@ module OmniCat
53
54
  # bayes.train("positive", "good, very well")
54
55
  # bayes.train("negative", "bad dog")
55
56
  # bayes.train("neutral", "how is the management gui")
56
- def train(category, doc)
57
- if category_exists?(category)
57
+ def train(category_name, doc)
58
+ if category_exists?(category_name)
58
59
  self.doc_count += 1
59
- categories[category].doc_count += 1
60
+ categories[category_name].doc_count += 1
60
61
  doc.tokenize_with_counts.each do |token, count|
62
+ uniq_token_addition = 0
63
+ categories.each do |name, category|
64
+ if category.tokens.has_key?(token)
65
+ uniq_token_addition = 1
66
+ break
67
+ end
68
+ end
69
+ self.uniq_token_count += 1 if uniq_token_addition == 0
61
70
  self.token_count += count
62
- self.categories[category].tokens[token] = self.categories[category].tokens[token].to_i + count
63
- self.categories[category].token_count += count
71
+ self.categories[category_name].tokens[token] = self.categories[category_name].tokens[token].to_i + count
72
+ self.categories[category_name].token_count += count
64
73
  end
65
74
  else
66
75
  raise StandardError,
67
- "Category with name '#{category}' does not exist!"
76
+ "Category with name '#{category_name}' does not exist!"
68
77
  end
69
78
  end
70
79
 
@@ -94,14 +103,18 @@ module OmniCat
94
103
  prior = category.doc_count / doc_count.to_f
95
104
  result.scores[name] = k_value
96
105
  doc.tokenize_with_counts.each do |token, count|
97
- result.scores[name] *= (
98
- (category.tokens[token].to_i + k_value) /
99
- (category.token_count + token_count)
100
- ) if category.tokens.has_key?(token)
106
+ if category.tokens[token].to_i == 0
107
+ result.scores[name] *= k_value / token_count
108
+ else
109
+ result.scores[name] *= (
110
+ count * (
111
+ (category.tokens[token].to_i + k_value) /
112
+ (category.token_count + uniq_token_count)
113
+ )
114
+ )
115
+ end
101
116
  end
102
- result.scores[name] = (
103
- result.scores[name].to_f == 1.0 ? 0 : (prior * result.scores[name])
104
- )
117
+ result.scores[name] = prior * result.scores[name]
105
118
  if result.scores[name] > score
106
119
  result.category[:name] = name;
107
120
  score = result.scores[name];
@@ -1,3 +1,3 @@
1
1
  module OmniCat
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
metadata CHANGED
@@ -1,20 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: omnicat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
5
- prerelease:
4
+ version: 0.1.2
6
5
  platform: ruby
7
6
  authors:
8
7
  - Mustafa Turan
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-06-16 00:00:00.000000000 Z
11
+ date: 2013-06-18 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: bundler
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
17
  - - ~>
20
18
  - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
22
20
  type: :development
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
24
  - - ~>
28
25
  - !ruby/object:Gem::Version
@@ -30,17 +27,15 @@ dependencies:
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: rake
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ! '>='
31
+ - - '>='
36
32
  - !ruby/object:Gem::Version
37
33
  version: '0'
38
34
  type: :development
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
- - - ! '>='
38
+ - - '>='
44
39
  - !ruby/object:Gem::Version
45
40
  version: '0'
46
41
  description: A generalized framework for text classifications.
@@ -77,26 +72,25 @@ files:
77
72
  homepage: https://github.com/mustafaturan/omnicat
78
73
  licenses:
79
74
  - MIT
75
+ metadata: {}
80
76
  post_install_message:
81
77
  rdoc_options: []
82
78
  require_paths:
83
79
  - lib
84
80
  required_ruby_version: !ruby/object:Gem::Requirement
85
- none: false
86
81
  requirements:
87
- - - ! '>='
82
+ - - '>='
88
83
  - !ruby/object:Gem::Version
89
84
  version: '0'
90
85
  required_rubygems_version: !ruby/object:Gem::Requirement
91
- none: false
92
86
  requirements:
93
- - - ! '>='
87
+ - - '>='
94
88
  - !ruby/object:Gem::Version
95
89
  version: '0'
96
90
  requirements: []
97
91
  rubyforge_project:
98
- rubygems_version: 1.8.23
92
+ rubygems_version: 2.0.3
99
93
  signing_key:
100
- specification_version: 3
94
+ specification_version: 4
101
95
  summary: A generalized framework for text classifications.
102
96
  test_files: []