omnicat 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.txt +3 -0
- data/README.md +6 -3
- data/lib/omnicat/classifiers/bayes.rb +27 -14
- data/lib/omnicat/version.rb +1 -1
- metadata +9 -15
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 5d04e29b0e2e16e019592553041b62eb9b759d3d
|
4
|
+
data.tar.gz: 1a4614e75aef06179e7c9589a73bf9c0b3552d20
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d30317305905b877570cc2626665b74b56d3d4278422398ed05bb836d2316b2cb3fe4faee1cb0e26e1068121151fb0ba49b1eac61b79541fd2e44c23c6d19c03
|
7
|
+
data.tar.gz: bc998d7a815212af3881fb81bfa18bda74d1f68df1a273f9c17fdbf76340579de26953a384eb30edbdfac51409ddb7279e6195bac1e86d51f6e11bde48e5029e
|
data/CHANGELOG.txt
CHANGED
data/README.md
CHANGED
@@ -57,9 +57,9 @@ Train category with multiple documents.
|
|
57
57
|
Classify a document.
|
58
58
|
|
59
59
|
result = bayes.classify('I feel so good and happy')
|
60
|
-
=> #<OmniCat::Result:
|
60
|
+
=> #<OmniCat::Result:0x007fd20296aad8 @category={:name=>"positive", :percentage=>73}, @scores={"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, @total_score=7.385426829679431e-09>
|
61
61
|
result.to_hash
|
62
|
-
=> {:category=>{:name=>"
|
62
|
+
=> {:category=>{:name=>"positive", :percentage=>73}, :scores=>{"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, :total_score=>7.385426829679431e-09}
|
63
63
|
|
64
64
|
### Classify batch
|
65
65
|
Classify multiple documents at a time.
|
@@ -70,19 +70,22 @@ Classify multiple documents at a time.
|
|
70
70
|
'a good piece of work'
|
71
71
|
]
|
72
72
|
)
|
73
|
-
=> [#<OmniCat::Result:
|
73
|
+
=> [#<OmniCat::Result:0x007fd2029341b8 @category={:name=>"negative", :percentage=>78}, @scores={"positive"=>2.5521869888765736e-14, "negative"=>9.074442627116706e-14}, @total_score=1.162662961599328e-13>, #<OmniCat::Result:0x007fd20292e7e0 @category={:name=>"positive", :percentage=>80}, @scores={"positive"=>2.411265432098765e-07, "negative"=>5.880238822371627e-08}, @total_score=2.999289314335928e-07>]
|
74
74
|
|
75
75
|
### Convert to hash
|
76
76
|
Convert full Bayes object to hash.
|
77
77
|
|
78
78
|
# For storing, restoring modal data
|
79
79
|
bayes_hash = bayes.to_hash
|
80
|
+
=> {:categories=>{"positive"=>{:doc_count=>4, :tokens=>{"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, :token_count=>37}, "negative"=>{:doc_count=>4, :tokens=>{"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, :token_count=>17}}, :category_count=>2, :doc_count=>8, :k_value=>1.0, :token_count=>54, :uniq_token_count=>43}
|
80
81
|
|
81
82
|
### Load from hash
|
82
83
|
Load full Bayes object from hash.
|
83
84
|
|
84
85
|
another_bayes_obj = OmniCat::Classifiers::Bayes.new(bayes_hash)
|
86
|
+
=> #<OmniCat::Classifiers::Bayes:0x007fd20308cff0 @categories={"positive"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf78 @doc_count=4, @tokens={"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, @token_count=37>, "negative"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf00 @doc_count=4, @tokens={"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, @token_count=17>}, @category_count=2, @doc_count=8, @k_value=1.0, @token_count=54, @uniq_token_count=43>
|
85
87
|
another_bayes_obj.classify('best senses')
|
88
|
+
=> #<OmniCat::Result:0x007fd203075008 @category={:name=>"positive", :percentage=>57}, @scores={"positive"=>0.0002314814814814815, "negative"=>0.00017146776406035664}, @total_score=0.00040294924554183816>
|
86
89
|
|
87
90
|
## Todo
|
88
91
|
* Add more text classification modules such as Support Vector Machine (SVM).
|
@@ -2,7 +2,7 @@ module OmniCat
|
|
2
2
|
module Classifiers
|
3
3
|
class Bayes < ::OmniCat::Classifiers::Base
|
4
4
|
|
5
|
-
attr_accessor :categories, :category_count, :doc_count, :token_count
|
5
|
+
attr_accessor :categories, :category_count, :doc_count, :token_count, :uniq_token_count
|
6
6
|
attr_accessor :k_value # helper val for skipping some Bayes theorem errors
|
7
7
|
|
8
8
|
def initialize(bayes_hash = {})
|
@@ -16,6 +16,7 @@ module OmniCat
|
|
16
16
|
self.doc_count = bayes_hash[:doc_count].to_i
|
17
17
|
self.k_value = bayes_hash[:k_value] || 1.0
|
18
18
|
self.token_count = bayes_hash[:token_count].to_i
|
19
|
+
self.uniq_token_count = bayes_hash[:uniq_token_count].to_i
|
19
20
|
end
|
20
21
|
|
21
22
|
# Allows adding new classification category
|
@@ -53,18 +54,26 @@ module OmniCat
|
|
53
54
|
# bayes.train("positive", "good, very well")
|
54
55
|
# bayes.train("negative", "bad dog")
|
55
56
|
# bayes.train("neutral", "how is the management gui")
|
56
|
-
def train(
|
57
|
-
if category_exists?(
|
57
|
+
def train(category_name, doc)
|
58
|
+
if category_exists?(category_name)
|
58
59
|
self.doc_count += 1
|
59
|
-
categories[
|
60
|
+
categories[category_name].doc_count += 1
|
60
61
|
doc.tokenize_with_counts.each do |token, count|
|
62
|
+
uniq_token_addition = 0
|
63
|
+
categories.each do |name, category|
|
64
|
+
if category.tokens.has_key?(token)
|
65
|
+
uniq_token_addition = 1
|
66
|
+
break
|
67
|
+
end
|
68
|
+
end
|
69
|
+
self.uniq_token_count += 1 if uniq_token_addition == 0
|
61
70
|
self.token_count += count
|
62
|
-
self.categories[
|
63
|
-
self.categories[
|
71
|
+
self.categories[category_name].tokens[token] = self.categories[category_name].tokens[token].to_i + count
|
72
|
+
self.categories[category_name].token_count += count
|
64
73
|
end
|
65
74
|
else
|
66
75
|
raise StandardError,
|
67
|
-
"Category with name '#{
|
76
|
+
"Category with name '#{category_name}' does not exist!"
|
68
77
|
end
|
69
78
|
end
|
70
79
|
|
@@ -94,14 +103,18 @@ module OmniCat
|
|
94
103
|
prior = category.doc_count / doc_count.to_f
|
95
104
|
result.scores[name] = k_value
|
96
105
|
doc.tokenize_with_counts.each do |token, count|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
106
|
+
if category.tokens[token].to_i == 0
|
107
|
+
result.scores[name] *= k_value / token_count
|
108
|
+
else
|
109
|
+
result.scores[name] *= (
|
110
|
+
count * (
|
111
|
+
(category.tokens[token].to_i + k_value) /
|
112
|
+
(category.token_count + uniq_token_count)
|
113
|
+
)
|
114
|
+
)
|
115
|
+
end
|
101
116
|
end
|
102
|
-
result.scores[name] =
|
103
|
-
result.scores[name].to_f == 1.0 ? 0 : (prior * result.scores[name])
|
104
|
-
)
|
117
|
+
result.scores[name] = prior * result.scores[name]
|
105
118
|
if result.scores[name] > score
|
106
119
|
result.category[:name] = name;
|
107
120
|
score = result.scores[name];
|
data/lib/omnicat/version.rb
CHANGED
metadata
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: omnicat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
5
|
-
prerelease:
|
4
|
+
version: 0.1.2
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Mustafa Turan
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2013-06-
|
11
|
+
date: 2013-06-18 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: bundler
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
17
|
- - ~>
|
20
18
|
- !ruby/object:Gem::Version
|
@@ -22,7 +20,6 @@ dependencies:
|
|
22
20
|
type: :development
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
24
|
- - ~>
|
28
25
|
- !ruby/object:Gem::Version
|
@@ -30,17 +27,15 @@ dependencies:
|
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: rake
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
|
-
- -
|
31
|
+
- - '>='
|
36
32
|
- !ruby/object:Gem::Version
|
37
33
|
version: '0'
|
38
34
|
type: :development
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
|
-
- -
|
38
|
+
- - '>='
|
44
39
|
- !ruby/object:Gem::Version
|
45
40
|
version: '0'
|
46
41
|
description: A generalized framework for text classifications.
|
@@ -77,26 +72,25 @@ files:
|
|
77
72
|
homepage: https://github.com/mustafaturan/omnicat
|
78
73
|
licenses:
|
79
74
|
- MIT
|
75
|
+
metadata: {}
|
80
76
|
post_install_message:
|
81
77
|
rdoc_options: []
|
82
78
|
require_paths:
|
83
79
|
- lib
|
84
80
|
required_ruby_version: !ruby/object:Gem::Requirement
|
85
|
-
none: false
|
86
81
|
requirements:
|
87
|
-
- -
|
82
|
+
- - '>='
|
88
83
|
- !ruby/object:Gem::Version
|
89
84
|
version: '0'
|
90
85
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
91
|
-
none: false
|
92
86
|
requirements:
|
93
|
-
- -
|
87
|
+
- - '>='
|
94
88
|
- !ruby/object:Gem::Version
|
95
89
|
version: '0'
|
96
90
|
requirements: []
|
97
91
|
rubyforge_project:
|
98
|
-
rubygems_version:
|
92
|
+
rubygems_version: 2.0.3
|
99
93
|
signing_key:
|
100
|
-
specification_version:
|
94
|
+
specification_version: 4
|
101
95
|
summary: A generalized framework for text classifications.
|
102
96
|
test_files: []
|