omnicat 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.txt +3 -0
- data/README.md +6 -3
- data/lib/omnicat/classifiers/bayes.rb +27 -14
- data/lib/omnicat/version.rb +1 -1
- metadata +9 -15
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 5d04e29b0e2e16e019592553041b62eb9b759d3d
|
4
|
+
data.tar.gz: 1a4614e75aef06179e7c9589a73bf9c0b3552d20
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d30317305905b877570cc2626665b74b56d3d4278422398ed05bb836d2316b2cb3fe4faee1cb0e26e1068121151fb0ba49b1eac61b79541fd2e44c23c6d19c03
|
7
|
+
data.tar.gz: bc998d7a815212af3881fb81bfa18bda74d1f68df1a273f9c17fdbf76340579de26953a384eb30edbdfac51409ddb7279e6195bac1e86d51f6e11bde48e5029e
|
data/CHANGELOG.txt
CHANGED
data/README.md
CHANGED
@@ -57,9 +57,9 @@ Train category with multiple documents.
|
|
57
57
|
Classify a document.
|
58
58
|
|
59
59
|
result = bayes.classify('I feel so good and happy')
|
60
|
-
=> #<OmniCat::Result:
|
60
|
+
=> #<OmniCat::Result:0x007fd20296aad8 @category={:name=>"positive", :percentage=>73}, @scores={"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, @total_score=7.385426829679431e-09>
|
61
61
|
result.to_hash
|
62
|
-
=> {:category=>{:name=>"
|
62
|
+
=> {:category=>{:name=>"positive", :percentage=>73}, :scores=>{"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, :total_score=>7.385426829679431e-09}
|
63
63
|
|
64
64
|
### Classify batch
|
65
65
|
Classify multiple documents at a time.
|
@@ -70,19 +70,22 @@ Classify multiple documents at a time.
|
|
70
70
|
'a good piece of work'
|
71
71
|
]
|
72
72
|
)
|
73
|
-
=> [#<OmniCat::Result:
|
73
|
+
=> [#<OmniCat::Result:0x007fd2029341b8 @category={:name=>"negative", :percentage=>78}, @scores={"positive"=>2.5521869888765736e-14, "negative"=>9.074442627116706e-14}, @total_score=1.162662961599328e-13>, #<OmniCat::Result:0x007fd20292e7e0 @category={:name=>"positive", :percentage=>80}, @scores={"positive"=>2.411265432098765e-07, "negative"=>5.880238822371627e-08}, @total_score=2.999289314335928e-07>]
|
74
74
|
|
75
75
|
### Convert to hash
|
76
76
|
Convert full Bayes object to hash.
|
77
77
|
|
78
78
|
# For storing and restoring model data
|
79
79
|
bayes_hash = bayes.to_hash
|
80
|
+
=> {:categories=>{"positive"=>{:doc_count=>4, :tokens=>{"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, :token_count=>37}, "negative"=>{:doc_count=>4, :tokens=>{"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, :token_count=>17}}, :category_count=>2, :doc_count=>8, :k_value=>1.0, :token_count=>54, :uniq_token_count=>43}
|
80
81
|
|
81
82
|
### Load from hash
|
82
83
|
Load full Bayes object from hash.
|
83
84
|
|
84
85
|
another_bayes_obj = OmniCat::Classifiers::Bayes.new(bayes_hash)
|
86
|
+
=> #<OmniCat::Classifiers::Bayes:0x007fd20308cff0 @categories={"positive"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf78 @doc_count=4, @tokens={"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, @token_count=37>, "negative"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf00 @doc_count=4, @tokens={"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, @token_count=17>}, @category_count=2, @doc_count=8, @k_value=1.0, @token_count=54, @uniq_token_count=43>
|
85
87
|
another_bayes_obj.classify('best senses')
|
88
|
+
=> #<OmniCat::Result:0x007fd203075008 @category={:name=>"positive", :percentage=>57}, @scores={"positive"=>0.0002314814814814815, "negative"=>0.00017146776406035664}, @total_score=0.00040294924554183816>
|
86
89
|
|
87
90
|
## Todo
|
88
91
|
* Add more text classification modules such as Support Vector Machine (SVM).
|
@@ -2,7 +2,7 @@ module OmniCat
|
|
2
2
|
module Classifiers
|
3
3
|
class Bayes < ::OmniCat::Classifiers::Base
|
4
4
|
|
5
|
-
attr_accessor :categories, :category_count, :doc_count, :token_count
|
5
|
+
attr_accessor :categories, :category_count, :doc_count, :token_count, :uniq_token_count
|
6
6
|
attr_accessor :k_value # helper val for skipping some Bayes theorem errors
|
7
7
|
|
8
8
|
def initialize(bayes_hash = {})
|
@@ -16,6 +16,7 @@ module OmniCat
|
|
16
16
|
self.doc_count = bayes_hash[:doc_count].to_i
|
17
17
|
self.k_value = bayes_hash[:k_value] || 1.0
|
18
18
|
self.token_count = bayes_hash[:token_count].to_i
|
19
|
+
self.uniq_token_count = bayes_hash[:uniq_token_count].to_i
|
19
20
|
end
|
20
21
|
|
21
22
|
# Allows adding new classification category
|
@@ -53,18 +54,26 @@ module OmniCat
|
|
53
54
|
# bayes.train("positive", "good, very well")
|
54
55
|
# bayes.train("negative", "bad dog")
|
55
56
|
# bayes.train("neutral", "how is the management gui")
|
56
|
-
def train(
|
57
|
-
if category_exists?(
|
57
|
+
def train(category_name, doc)
|
58
|
+
if category_exists?(category_name)
|
58
59
|
self.doc_count += 1
|
59
|
-
categories[
|
60
|
+
categories[category_name].doc_count += 1
|
60
61
|
doc.tokenize_with_counts.each do |token, count|
|
62
|
+
uniq_token_addition = 0
|
63
|
+
categories.each do |name, category|
|
64
|
+
if category.tokens.has_key?(token)
|
65
|
+
uniq_token_addition = 1
|
66
|
+
break
|
67
|
+
end
|
68
|
+
end
|
69
|
+
self.uniq_token_count += 1 if uniq_token_addition == 0
|
61
70
|
self.token_count += count
|
62
|
-
self.categories[
|
63
|
-
self.categories[
|
71
|
+
self.categories[category_name].tokens[token] = self.categories[category_name].tokens[token].to_i + count
|
72
|
+
self.categories[category_name].token_count += count
|
64
73
|
end
|
65
74
|
else
|
66
75
|
raise StandardError,
|
67
|
-
"Category with name '#{
|
76
|
+
"Category with name '#{category_name}' does not exist!"
|
68
77
|
end
|
69
78
|
end
|
70
79
|
|
@@ -94,14 +103,18 @@ module OmniCat
|
|
94
103
|
prior = category.doc_count / doc_count.to_f
|
95
104
|
result.scores[name] = k_value
|
96
105
|
doc.tokenize_with_counts.each do |token, count|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
106
|
+
if category.tokens[token].to_i == 0
|
107
|
+
result.scores[name] *= k_value / token_count
|
108
|
+
else
|
109
|
+
result.scores[name] *= (
|
110
|
+
count * (
|
111
|
+
(category.tokens[token].to_i + k_value) /
|
112
|
+
(category.token_count + uniq_token_count)
|
113
|
+
)
|
114
|
+
)
|
115
|
+
end
|
101
116
|
end
|
102
|
-
result.scores[name] =
|
103
|
-
result.scores[name].to_f == 1.0 ? 0 : (prior * result.scores[name])
|
104
|
-
)
|
117
|
+
result.scores[name] = prior * result.scores[name]
|
105
118
|
if result.scores[name] > score
|
106
119
|
result.category[:name] = name;
|
107
120
|
score = result.scores[name];
|
data/lib/omnicat/version.rb
CHANGED
metadata
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: omnicat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
5
|
-
prerelease:
|
4
|
+
version: 0.1.2
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Mustafa Turan
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2013-06-
|
11
|
+
date: 2013-06-18 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: bundler
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
17
|
- - ~>
|
20
18
|
- !ruby/object:Gem::Version
|
@@ -22,7 +20,6 @@ dependencies:
|
|
22
20
|
type: :development
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
24
|
- - ~>
|
28
25
|
- !ruby/object:Gem::Version
|
@@ -30,17 +27,15 @@ dependencies:
|
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: rake
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
|
-
- -
|
31
|
+
- - '>='
|
36
32
|
- !ruby/object:Gem::Version
|
37
33
|
version: '0'
|
38
34
|
type: :development
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
|
-
- -
|
38
|
+
- - '>='
|
44
39
|
- !ruby/object:Gem::Version
|
45
40
|
version: '0'
|
46
41
|
description: A generalized framework for text classifications.
|
@@ -77,26 +72,25 @@ files:
|
|
77
72
|
homepage: https://github.com/mustafaturan/omnicat
|
78
73
|
licenses:
|
79
74
|
- MIT
|
75
|
+
metadata: {}
|
80
76
|
post_install_message:
|
81
77
|
rdoc_options: []
|
82
78
|
require_paths:
|
83
79
|
- lib
|
84
80
|
required_ruby_version: !ruby/object:Gem::Requirement
|
85
|
-
none: false
|
86
81
|
requirements:
|
87
|
-
- -
|
82
|
+
- - '>='
|
88
83
|
- !ruby/object:Gem::Version
|
89
84
|
version: '0'
|
90
85
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
91
|
-
none: false
|
92
86
|
requirements:
|
93
|
-
- -
|
87
|
+
- - '>='
|
94
88
|
- !ruby/object:Gem::Version
|
95
89
|
version: '0'
|
96
90
|
requirements: []
|
97
91
|
rubyforge_project:
|
98
|
-
rubygems_version:
|
92
|
+
rubygems_version: 2.0.3
|
99
93
|
signing_key:
|
100
|
-
specification_version:
|
94
|
+
specification_version: 4
|
101
95
|
summary: A generalized framework for text classifications.
|
102
96
|
test_files: []
|