omnicat 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5d04e29b0e2e16e019592553041b62eb9b759d3d
4
+ data.tar.gz: 1a4614e75aef06179e7c9589a73bf9c0b3552d20
5
+ SHA512:
6
+ metadata.gz: d30317305905b877570cc2626665b74b56d3d4278422398ed05bb836d2316b2cb3fe4faee1cb0e26e1068121151fb0ba49b1eac61b79541fd2e44c23c6d19c03
7
+ data.tar.gz: bc998d7a815212af3881fb81bfa18bda74d1f68df1a273f9c17fdbf76340579de26953a384eb30edbdfac51409ddb7279e6195bac1e86d51f6e11bde48e5029e
data/CHANGELOG.txt CHANGED
@@ -1,3 +1,6 @@
1
+ 0.1.2
2
+ # fix the Bayes algorithm (important changes!)
3
+
1
4
  0.1.1
2
5
  # fix Regexp error for ruby version < 2.0.0
3
6
 
data/README.md CHANGED
@@ -57,9 +57,9 @@ Train category with multiple documents.
57
57
  Classify a document.
58
58
 
59
59
  result = bayes.classify('I feel so good and happy')
60
- => #<OmniCat::Result:0x007fe59b97b548 @category={:name=>"negative", :percentage=>99}, @scores={"positive"=>1.749909854122994e-07, "negative"=>0.014084507042253521}, @total_score=0.014084682033238934>
60
+ => #<OmniCat::Result:0x007fd20296aad8 @category={:name=>"positive", :percentage=>73}, @scores={"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, @total_score=7.385426829679431e-09>
61
61
  result.to_hash
62
- => {:category=>{:name=>"negative", :percentage=>99}, :scores=>{"positive"=>1.749909854122994e-07, "negative"=>0.014084507042253521}, :total_score=>0.014084682033238934}
62
+ => {:category=>{:name=>"positive", :percentage=>73}, :scores=>{"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, :total_score=>7.385426829679431e-09}
63
63
 
64
64
  ### Classify batch
65
65
  Classify multiple documents at a time.
@@ -70,19 +70,22 @@ Classify multiple documents at a time.
70
70
  'a good piece of work'
71
71
  ]
72
72
  )
73
- => [#<OmniCat::Result:0x007fe59b949d90 @category={:name=>"negative", :percentage=>75}, @scores={"positive"=>7.962089836259623e-06, "negative"=>2.5145916163515512e-05}, @total_score=3.3108005999775135e-05>, #<OmniCat::Result:0x007fe59c9d7d10 @category={:name=>"positive", :percentage=>100}, @scores={"positive"=>0.0005434126313247192, "negative"=>0}, @total_score=0.0005434126313247192>]
73
+ => [#<OmniCat::Result:0x007fd2029341b8 @category={:name=>"negative", :percentage=>78}, @scores={"positive"=>2.5521869888765736e-14, "negative"=>9.074442627116706e-14}, @total_score=1.162662961599328e-13>, #<OmniCat::Result:0x007fd20292e7e0 @category={:name=>"positive", :percentage=>80}, @scores={"positive"=>2.411265432098765e-07, "negative"=>5.880238822371627e-08}, @total_score=2.999289314335928e-07>]
74
74
 
75
75
  ### Convert to hash
76
76
  Convert full Bayes object to hash.
77
77
 
78
78
  # For storing and restoring model data
79
79
  bayes_hash = bayes.to_hash
80
+ => {:categories=>{"positive"=>{:doc_count=>4, :tokens=>{"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, :token_count=>37}, "negative"=>{:doc_count=>4, :tokens=>{"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, :token_count=>17}}, :category_count=>2, :doc_count=>8, :k_value=>1.0, :token_count=>54, :uniq_token_count=>43}
80
81
 
81
82
  ### Load from hash
82
83
  Load full Bayes object from hash.
83
84
 
84
85
  another_bayes_obj = OmniCat::Classifiers::Bayes.new(bayes_hash)
86
+ => #<OmniCat::Classifiers::Bayes:0x007fd20308cff0 @categories={"positive"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf78 @doc_count=4, @tokens={"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, @token_count=37>, "negative"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf00 @doc_count=4, @tokens={"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, @token_count=17>}, @category_count=2, @doc_count=8, @k_value=1.0, @token_count=54, @uniq_token_count=43>
85
87
  another_bayes_obj.classify('best senses')
88
+ => #<OmniCat::Result:0x007fd203075008 @category={:name=>"positive", :percentage=>57}, @scores={"positive"=>0.0002314814814814815, "negative"=>0.00017146776406035664}, @total_score=0.00040294924554183816>
86
89
 
87
90
  ## Todo
88
91
  * Add more text classification modules such as Support Vector Machine (SVM).
@@ -2,7 +2,7 @@ module OmniCat
2
2
  module Classifiers
3
3
  class Bayes < ::OmniCat::Classifiers::Base
4
4
 
5
- attr_accessor :categories, :category_count, :doc_count, :token_count
5
+ attr_accessor :categories, :category_count, :doc_count, :token_count, :uniq_token_count
6
6
  attr_accessor :k_value # helper val for skipping some Bayes theorem errors
7
7
 
8
8
  def initialize(bayes_hash = {})
@@ -16,6 +16,7 @@ module OmniCat
16
16
  self.doc_count = bayes_hash[:doc_count].to_i
17
17
  self.k_value = bayes_hash[:k_value] || 1.0
18
18
  self.token_count = bayes_hash[:token_count].to_i
19
+ self.uniq_token_count = bayes_hash[:uniq_token_count].to_i
19
20
  end
20
21
 
21
22
  # Allows adding new classification category
@@ -53,18 +54,26 @@ module OmniCat
53
54
  # bayes.train("positive", "good, very well")
54
55
  # bayes.train("negative", "bad dog")
55
56
  # bayes.train("neutral", "how is the management gui")
56
- def train(category, doc)
57
- if category_exists?(category)
57
+ def train(category_name, doc)
58
+ if category_exists?(category_name)
58
59
  self.doc_count += 1
59
- categories[category].doc_count += 1
60
+ categories[category_name].doc_count += 1
60
61
  doc.tokenize_with_counts.each do |token, count|
62
+ uniq_token_addition = 0
63
+ categories.each do |name, category|
64
+ if category.tokens.has_key?(token)
65
+ uniq_token_addition = 1
66
+ break
67
+ end
68
+ end
69
+ self.uniq_token_count += 1 if uniq_token_addition == 0
61
70
  self.token_count += count
62
- self.categories[category].tokens[token] = self.categories[category].tokens[token].to_i + count
63
- self.categories[category].token_count += count
71
+ self.categories[category_name].tokens[token] = self.categories[category_name].tokens[token].to_i + count
72
+ self.categories[category_name].token_count += count
64
73
  end
65
74
  else
66
75
  raise StandardError,
67
- "Category with name '#{category}' does not exist!"
76
+ "Category with name '#{category_name}' does not exist!"
68
77
  end
69
78
  end
70
79
 
@@ -94,14 +103,18 @@ module OmniCat
94
103
  prior = category.doc_count / doc_count.to_f
95
104
  result.scores[name] = k_value
96
105
  doc.tokenize_with_counts.each do |token, count|
97
- result.scores[name] *= (
98
- (category.tokens[token].to_i + k_value) /
99
- (category.token_count + token_count)
100
- ) if category.tokens.has_key?(token)
106
+ if category.tokens[token].to_i == 0
107
+ result.scores[name] *= k_value / token_count
108
+ else
109
+ result.scores[name] *= (
110
+ count * (
111
+ (category.tokens[token].to_i + k_value) /
112
+ (category.token_count + uniq_token_count)
113
+ )
114
+ )
115
+ end
101
116
  end
102
- result.scores[name] = (
103
- result.scores[name].to_f == 1.0 ? 0 : (prior * result.scores[name])
104
- )
117
+ result.scores[name] = prior * result.scores[name]
105
118
  if result.scores[name] > score
106
119
  result.category[:name] = name;
107
120
  score = result.scores[name];
@@ -1,3 +1,3 @@
1
1
  module OmniCat
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
metadata CHANGED
@@ -1,20 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: omnicat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
5
- prerelease:
4
+ version: 0.1.2
6
5
  platform: ruby
7
6
  authors:
8
7
  - Mustafa Turan
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-06-16 00:00:00.000000000 Z
11
+ date: 2013-06-18 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: bundler
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
17
  - - ~>
20
18
  - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
22
20
  type: :development
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
24
  - - ~>
28
25
  - !ruby/object:Gem::Version
@@ -30,17 +27,15 @@ dependencies:
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: rake
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ! '>='
31
+ - - '>='
36
32
  - !ruby/object:Gem::Version
37
33
  version: '0'
38
34
  type: :development
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
- - - ! '>='
38
+ - - '>='
44
39
  - !ruby/object:Gem::Version
45
40
  version: '0'
46
41
  description: A generalized framework for text classifications.
@@ -77,26 +72,25 @@ files:
77
72
  homepage: https://github.com/mustafaturan/omnicat
78
73
  licenses:
79
74
  - MIT
75
+ metadata: {}
80
76
  post_install_message:
81
77
  rdoc_options: []
82
78
  require_paths:
83
79
  - lib
84
80
  required_ruby_version: !ruby/object:Gem::Requirement
85
- none: false
86
81
  requirements:
87
- - - ! '>='
82
+ - - '>='
88
83
  - !ruby/object:Gem::Version
89
84
  version: '0'
90
85
  required_rubygems_version: !ruby/object:Gem::Requirement
91
- none: false
92
86
  requirements:
93
- - - ! '>='
87
+ - - '>='
94
88
  - !ruby/object:Gem::Version
95
89
  version: '0'
96
90
  requirements: []
97
91
  rubyforge_project:
98
- rubygems_version: 1.8.23
92
+ rubygems_version: 2.0.3
99
93
  signing_key:
100
- specification_version: 3
94
+ specification_version: 4
101
95
  summary: A generalized framework for text classifications.
102
96
  test_files: []