omnicat-bayes 0.2.2 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +20 -10
- data/lib/omnicat/bayes/version.rb +1 -1
- data/lib/omnicat/classifiers/bayes.rb +84 -62
- data/omnicat-bayes.gemspec +1 -2
- data/test/unit/classifiers/bayes_test.rb +23 -10
- metadata +16 -40
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 9226b652056d418b7f374a46c15e55c146f9f421
|
4
|
+
data.tar.gz: e370533de84b92342b0cc202ecf50c978749c9a4
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ecae56eab0769f6d582114d13326ec5e483431ac8f33ff2479578dff56a759e482e61e120e03619123c65ca78419f303cfb0bd685f8d712142a6e502fc3f6c28
|
7
|
+
data.tar.gz: 3cbf521720c7f804a70aacde837cac67599de3fd8e190540e4272909c085ebcedc3c1f5e322fb7f2160263432f22b17cc19f8d7f769a8b6b8be4cb75e1346ece
|
data/README.md
CHANGED
@@ -27,11 +27,15 @@ See rdoc for detailed usage.
|
|
27
27
|
Optional configuration sample:
|
28
28
|
|
29
29
|
OmniCat.configure do |config|
|
30
|
+
# you can enable auto train mode by :unique or :continues
|
31
|
+
# unique: on prediction, a doc is added to the training docs only if it is not already among them
|
32
|
+
# continues: always add docs to training docs on prediction
|
33
|
+
config.auto_train = :off
|
30
34
|
config.exclude_tokens = ['something', 'anything'] # exclude token list
|
31
35
|
config.token_patterns = {
|
32
|
-
# exclude
|
36
|
+
# exclude tokens with Regex patterns
|
33
37
|
minus: [/[\s\t\n\r]+/, /(@[\w\d]+)/],
|
34
|
-
# include
|
38
|
+
# include tokens with Regex patterns
|
35
39
|
plus: [/[\p{L}\-0-9]{2,}/, /[\!\?]/, /[\:\)\(\;\-\|]{2,3}/]
|
36
40
|
}
|
37
41
|
end
|
@@ -96,9 +100,14 @@ Untrain category with multiple documents.
|
|
96
100
|
Classify a document.
|
97
101
|
|
98
102
|
result = bayes.classify('I feel so good and happy')
|
99
|
-
=> #<OmniCat::Result:
|
103
|
+
=> #<OmniCat::Result:0x007febb152af68 @top_score_key="positive", @scores={"positive"=>#<OmniCat::Score:0x007febb152add8 @key="positive", @value=6.813226744186048e-09, @percentage=58>, "negative"=>#<OmniCat::Score:0x007febb152ac70 @key="negative", @value=4.875003449064939e-09, @percentage=42>}, @total_score=1.1688230193250986e-08>
|
100
104
|
result.to_hash
|
101
|
-
=> {:
|
105
|
+
=> {:top_score_key=>"positive", :scores=>{"positive"=>{:key=>"positive", :value=>6.813226744186048e-09, :percentage=>58}, "negative"=>{:key=>"negative", :value=>4.875003449064939e-09, :percentage=>42}}, :total_score=>1.1688230193250986e-08}
|
106
|
+
result.top_score
|
107
|
+
=> #<OmniCat::Score:0x007febb152add8 @key="positive", @value=6.813226744186048e-09, @percentage=58>
|
108
|
+
result.top_score.to_hash
|
109
|
+
=> {:key=>"positive", :value=>6.813226744186048e-09, :percentage=>58}
|
110
|
+
|
102
111
|
|
103
112
|
### Classify batch
|
104
113
|
Classify multiple documents at a time.
|
@@ -109,25 +118,26 @@ Classify multiple documents at a time.
|
|
109
118
|
'a good piece of work'
|
110
119
|
]
|
111
120
|
)
|
112
|
-
=> [#<OmniCat::Result:
|
121
|
+
=> [#<OmniCat::Result:0x007febb14f3680 @top_score_key="negative", @scores={"positive"=>#<OmniCat::Score:0x007febb14f34a0 @key="positive", @value=7.971480930520432e-14, @percentage=22>, "negative"=>#<OmniCat::Score:0x007febb14f32c0 @key="negative", @value=2.834304330851709e-13, @percentage=78>}, @total_score=3.6314524239037524e-13>, #<OmniCat::Result:0x007febb14f2aa0 @top_score_key="positive", @scores={"positive"=>#<OmniCat::Score:0x007febb14f2960 @key="positive", @value=3.802731206057328e-07, @percentage=72>, "negative"=>#<OmniCat::Score:0x007febb14f2820 @key="negative", @value=1.4625010347194818e-07, @percentage=28>}, @total_score=5.26523224077681e-07>]
|
113
122
|
|
114
123
|
### Convert to hash
|
115
124
|
Convert full Bayes object to hash.
|
116
125
|
|
117
126
|
# For storing and restoring model data
|
118
127
|
bayes_hash = bayes.to_hash
|
119
|
-
=> {:categories=>{"positive"=>{:doc_count=>4, :tokens=>{"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, :token_count=>37}, "negative"=>{:doc_count=>4, :tokens=>{"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, :token_count=>17}}, :category_count=>2, :
|
128
|
+
=> {:categories=>{"positive"=>{:doc_count=>4, :docs=>{"28fd29bbf840c86db65e510ff3cd07a9"=>{:content=>"great if you are in a slap happy mood .", :content_md5=>"28fd29bbf840c86db65e510ff3cd07a9", :count=>1, :tokens=>{"great"=>1, "if"=>1, "you"=>1, "are"=>1, "in"=>1, "slap"=>1, "happy"=>1, "mood"=>1}}, "82b4cd9513f448dea0024f2d0e2ccd44"=>{:content=>"a feel-good picture in the best sense of the term...", :content_md5=>"82b4cd9513f448dea0024f2d0e2ccd44", :count=>1, :tokens=>{"feel-good"=>1, "picture"=>1, "in"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>1, "term"=>1}}, "f917bf1cf1256c78c5436d850dab3104"=>{:content=>"it is a feel-good movie about which you can actually feel good.", :content_md5=>"f917bf1cf1256c78c5436d850dab3104", :count=>1, :tokens=>{"it"=>1, "is"=>1, "feel-good"=>1, "movie"=>1, "about"=>1, "which"=>1, "you"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>1}}, "4343bbe84c035733708c3f58136f321e"=>{:content=>"love and money both of them are good choises", :content_md5=>"4343bbe84c035733708c3f58136f321e", :count=>1, :tokens=>{"love"=>1, "and"=>1, "money"=>1, "both"=>1, "of"=>1, "them"=>1, "are"=>1, "good"=>1, "choises"=>1}}}, :name=>"positive", :tokens=>{"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, :token_count=>37, :prior=>0.5}, "negative"=>{:doc_count=>4, :docs=>{"89b36e774579662591ea21b3283d9b35"=>{:content=>"bad tracking issue", :content_md5=>"89b36e774579662591ea21b3283d9b35", :count=>1, :tokens=>{"bad"=>1, "tracking"=>1, "issue"=>1}}, "b0ec48bc87527e285b26d6cce8e278e7"=>{:content=>"simplistic , silly and tedious .", :content_md5=>"b0ec48bc87527e285b26d6cce8e278e7", :count=>1, :tokens=>{"simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1}}, 
"ae9d4fbaf40906614ca712a888648c5f"=>{:content=>"interesting , but not compelling . ", :content_md5=>"ae9d4fbaf40906614ca712a888648c5f", :count=>1, :tokens=>{"interesting"=>1, "but"=>1, "not"=>1, "compelling"=>1}}, "0e495f5d88d8049746a1b6961bf3cc90"=>{:content=>"seems clever but not especially compelling", :content_md5=>"0e495f5d88d8049746a1b6961bf3cc90", :count=>1, :tokens=>{"seems"=>1, "clever"=>1, "but"=>1, "not"=>1, "especially"=>1, "compelling"=>1}}}, :name=>"negative", :tokens=>{"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, :token_count=>17, :prior=>0.5}}, :category_count=>2, :category_size_limit=>0, :doc_count=>8, :token_count=>54, :unique_token_count=>43, :k_value=>1.0}
|
120
129
|
|
121
130
|
### Load from hash
|
122
131
|
Load full Bayes object from hash.
|
123
132
|
|
124
133
|
another_bayes_obj = OmniCat::Classifiers::Bayes.new(bayes_hash)
|
125
|
-
=> #<OmniCat::Classifiers::Bayes:
|
134
|
+
=> #<OmniCat::Classifiers::Bayes:0x007febb14d15a8 @categories={"positive"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007febb14d1530 @doc_count=4, @docs={"28fd29bbf840c86db65e510ff3cd07a9"=>{:content=>"great if you are in a slap happy mood .", :content_md5=>"28fd29bbf840c86db65e510ff3cd07a9", :count=>1, :tokens=>{"great"=>1, "if"=>1, "you"=>1, "are"=>1, "in"=>1, "slap"=>1, "happy"=>1, "mood"=>1}}, "82b4cd9513f448dea0024f2d0e2ccd44"=>{:content=>"a feel-good picture in the best sense of the term...", :content_md5=>"82b4cd9513f448dea0024f2d0e2ccd44", :count=>1, :tokens=>{"feel-good"=>1, "picture"=>1, "in"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>1, "term"=>1}}, "f917bf1cf1256c78c5436d850dab3104"=>{:content=>"it is a feel-good movie about which you can actually feel good.", :content_md5=>"f917bf1cf1256c78c5436d850dab3104", :count=>1, :tokens=>{"it"=>1, "is"=>1, "feel-good"=>1, "movie"=>1, "about"=>1, "which"=>1, "you"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>1}}, "4343bbe84c035733708c3f58136f321e"=>{:content=>"love and money both of them are good choises", :content_md5=>"4343bbe84c035733708c3f58136f321e", :count=>1, :tokens=>{"love"=>1, "and"=>1, "money"=>1, "both"=>1, "of"=>1, "them"=>1, "are"=>1, "good"=>1, "choises"=>1}}}, @name="positive", @tokens={"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, @token_count=37, @prior=0.5>, "negative"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007febb14d14e0 @doc_count=4, @docs={"89b36e774579662591ea21b3283d9b35"=>{:content=>"bad tracking issue", :content_md5=>"89b36e774579662591ea21b3283d9b35", :count=>1, :tokens=>{"bad"=>1, "tracking"=>1, "issue"=>1}}, "b0ec48bc87527e285b26d6cce8e278e7"=>{:content=>"simplistic , silly 
and tedious .", :content_md5=>"b0ec48bc87527e285b26d6cce8e278e7", :count=>1, :tokens=>{"simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1}}, "ae9d4fbaf40906614ca712a888648c5f"=>{:content=>"interesting , but not compelling . ", :content_md5=>"ae9d4fbaf40906614ca712a888648c5f", :count=>1, :tokens=>{"interesting"=>1, "but"=>1, "not"=>1, "compelling"=>1}}, "0e495f5d88d8049746a1b6961bf3cc90"=>{:content=>"seems clever but not especially compelling", :content_md5=>"0e495f5d88d8049746a1b6961bf3cc90", :count=>1, :tokens=>{"seems"=>1, "clever"=>1, "but"=>1, "not"=>1, "especially"=>1, "compelling"=>1}}}, @name="negative", @tokens={"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, @token_count=17, @prior=0.5>}, @category_count=2, @category_size_limit=0, @doc_count=8, @token_count=54, @unique_token_count=43, @k_value=1.0>
|
126
135
|
another_bayes_obj.classify('best senses')
|
127
|
-
=> #<OmniCat::Result:
|
136
|
+
=> #<OmniCat::Result:0x007febb14c0fc8 @top_score_key="positive", @scores={"positive"=>#<OmniCat::Score:0x007febb14c0ed8 @key="positive", @value=0.00029069767441860465, @percentage=52>, "negative"=>#<OmniCat::Score:0x007febb14c0de8 @key="negative", @value=0.0002704164413196322, @percentage=48>}, @total_score=0.0005611141157382368>
|
128
137
|
|
129
|
-
|
130
|
-
|
138
|
+
### Best practices
|
139
|
+
For Bayes classification, always try to train the same number of documents for each category. For that reason, do not activate auto training mode: it upsets the balance of trained docs between categories and makes the algorithm go crazy :).
|
140
|
+
To get the best results in text classification, you should apply some cleaning steps, such as spellchecking, stemming and stop-word removal, before training and prediction.
|
131
141
|
|
132
142
|
## Contributing
|
133
143
|
|
@@ -33,7 +33,7 @@ module OmniCat
|
|
33
33
|
"Category with name '#{category_name}' is already exists!"
|
34
34
|
else
|
35
35
|
increment_category_count
|
36
|
-
@categories[category_name] = ::OmniCat::Classifiers::BayesInternals::Category.new
|
36
|
+
@categories[category_name] = ::OmniCat::Classifiers::BayesInternals::Category.new(name: category_name)
|
37
37
|
end
|
38
38
|
end
|
39
39
|
|
@@ -52,24 +52,14 @@ module OmniCat
|
|
52
52
|
# bayes.train("negative", "bad dog")
|
53
53
|
# bayes.train("neutral", "how is the management gui")
|
54
54
|
def train(category_name, doc_content)
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
doc.increment_count
|
61
|
-
else
|
62
|
-
doc = OmniCat::Doc.new(content: doc_content)
|
63
|
-
end
|
64
|
-
@categories[category_name].docs[doc_key] = doc
|
65
|
-
doc.tokens.each do |token, count|
|
66
|
-
increment_token_counts(category_name, token, count)
|
67
|
-
@categories[category_name].tokens[token] = @categories[category_name].tokens[token].to_i + count
|
68
|
-
end
|
69
|
-
else
|
70
|
-
raise StandardError,
|
71
|
-
"Category with name '#{category_name}' does not exist!"
|
55
|
+
category_must_exist(category_name)
|
56
|
+
doc = add_doc(category_name, doc_content)
|
57
|
+
doc.tokens.each do |token, count|
|
58
|
+
increment_token_counts(category_name, token, count)
|
59
|
+
@categories[category_name].tokens[token] = @categories[category_name].tokens[token].to_i + count
|
72
60
|
end
|
61
|
+
increment_doc_counts(category_name)
|
62
|
+
update_priors
|
73
63
|
end
|
74
64
|
|
75
65
|
# Untrain the desired category with a document
|
@@ -87,26 +77,15 @@ module OmniCat
|
|
87
77
|
# bayes.untrain("negative", "bad dog")
|
88
78
|
# bayes.untrain("neutral", "how is the management gui")
|
89
79
|
def untrain(category_name, doc_content)
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
"Document is not found in #{category_name} documents!"
|
97
|
-
end
|
98
|
-
doc.tokens.each do |token, count|
|
99
|
-
@categories[category_name].tokens[token] = @categories[category_name].tokens[token].to_i - count
|
100
|
-
@categories[category_name].tokens.delete(token) if @categories[category_name].tokens[token] == 0
|
101
|
-
decrement_token_counts(category_name, token, count)
|
102
|
-
end
|
103
|
-
@categories[category_name].docs.delete(doc_key) if @categories[category_name].docs[doc_key].count == 0
|
104
|
-
decrement_doc_counts(category_name)
|
105
|
-
update_priors
|
106
|
-
else
|
107
|
-
raise StandardError,
|
108
|
-
"Category with name '#{category_name}' does not exist!"
|
80
|
+
category_must_exist(category_name)
|
81
|
+
doc = remove_doc(category_name, doc_content)
|
82
|
+
doc.tokens.each do |token, count|
|
83
|
+
@categories[category_name].tokens[token] = @categories[category_name].tokens[token].to_i - count
|
84
|
+
@categories[category_name].tokens.delete(token) if @categories[category_name].tokens[token] == 0
|
85
|
+
decrement_token_counts(category_name, token, count)
|
109
86
|
end
|
87
|
+
decrement_doc_counts(category_name)
|
88
|
+
update_priors
|
110
89
|
end
|
111
90
|
|
112
91
|
# Classify the given document
|
@@ -126,25 +105,32 @@ module OmniCat
|
|
126
105
|
# =>
|
127
106
|
def classify(doc_content)
|
128
107
|
return unless classifiable?
|
129
|
-
|
108
|
+
doc = ::OmniCat::Doc.new(content: doc_content)
|
130
109
|
result = ::OmniCat::Result.new
|
131
110
|
@categories.each do |category_name, category|
|
132
|
-
result.
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
111
|
+
result.add_score(
|
112
|
+
Score.new(
|
113
|
+
key: category.name,
|
114
|
+
value: doc_probability(category, doc)
|
115
|
+
)
|
116
|
+
)
|
138
117
|
end
|
139
|
-
result.
|
140
|
-
result.
|
141
|
-
result.scores[result.category[:name]] * 100.0 /
|
142
|
-
result.total_score
|
143
|
-
).floor
|
118
|
+
auto_train(@categories[result.top_score.key], doc)
|
119
|
+
result.calculate_percentages
|
144
120
|
result
|
145
121
|
end
|
146
122
|
|
147
123
|
private
|
124
|
+
# nodoc
|
125
|
+
def auto_train(category, doc)
|
126
|
+
case ::OmniCat.config.auto_train
|
127
|
+
when :continues
|
128
|
+
train(category.name, doc.content)
|
129
|
+
when :unique
|
130
|
+
train(category.name, doc.content) unless category.docs.has_key?(doc.content_md5)
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
148
134
|
# nodoc
|
149
135
|
def update_priors
|
150
136
|
@categories.each do |_, category|
|
@@ -164,36 +150,35 @@ module OmniCat
|
|
164
150
|
|
165
151
|
# nodoc
|
166
152
|
def modify_token_counts(category_name, token, count)
|
167
|
-
|
153
|
+
modify_unique_token_count(token, count < 0 ? -1 : 1)
|
168
154
|
@token_count += count
|
169
155
|
@categories[category_name].token_count += count
|
170
156
|
end
|
171
157
|
|
172
158
|
# nodoc
|
173
|
-
def
|
174
|
-
|
159
|
+
def increment_unique_token_count(token)
|
160
|
+
modify_unique_token_count(token, 1)
|
175
161
|
end
|
176
162
|
|
177
163
|
# nodoc
|
178
|
-
def
|
179
|
-
|
164
|
+
def decrement_unique_token_count(token)
|
165
|
+
modify_unique_token_count(token, -1)
|
180
166
|
end
|
181
167
|
|
182
168
|
# nodoc
|
183
|
-
def
|
169
|
+
def modify_unique_token_count(token, uniq_token_addition)
|
184
170
|
@categories.each do |_, category|
|
185
171
|
if category.tokens.has_key?(token)
|
186
172
|
uniq_token_addition = 0
|
187
173
|
break
|
188
174
|
end
|
189
175
|
end
|
190
|
-
@
|
176
|
+
@unique_token_count += uniq_token_addition
|
191
177
|
end
|
192
178
|
|
193
179
|
# nodoc
|
194
|
-
def doc_probability(category,
|
195
|
-
score = k_value
|
196
|
-
doc = OmniCat::Doc.new(content: doc_content)
|
180
|
+
def doc_probability(category, doc)
|
181
|
+
score = @k_value
|
197
182
|
doc.tokens.each do |token, count|
|
198
183
|
score *= token_probability(category, token, count)
|
199
184
|
end
|
@@ -203,14 +188,51 @@ module OmniCat
|
|
203
188
|
# nodoc
|
204
189
|
def token_probability(category, token, count)
|
205
190
|
if category.tokens[token].to_i == 0
|
206
|
-
k_value /
|
191
|
+
@k_value / (@unique_token_count * count)
|
207
192
|
else
|
208
193
|
count * (
|
209
|
-
(category.tokens[token].to_i + k_value) /
|
210
|
-
(category.token_count +
|
194
|
+
(category.tokens[token].to_i + @k_value) /
|
195
|
+
(category.token_count + @unique_token_count)
|
211
196
|
)
|
212
197
|
end
|
213
198
|
end
|
199
|
+
|
200
|
+
# nodoc
|
201
|
+
def add_doc(category_name, doc_content)
|
202
|
+
doc_key = generate_doc_key(doc_content)
|
203
|
+
if doc = @categories[category_name].docs[doc_key]
|
204
|
+
doc.increment_count
|
205
|
+
else
|
206
|
+
@categories[category_name].docs[doc_key] = ::OmniCat::Doc.new(content: doc_content)
|
207
|
+
end
|
208
|
+
@categories[category_name].docs[doc_key]
|
209
|
+
end
|
210
|
+
|
211
|
+
# nodoc
|
212
|
+
def remove_doc(category_name, doc_content)
|
213
|
+
doc_key = generate_doc_key(doc_content)
|
214
|
+
doc = @categories[category_name].docs[doc_key]
|
215
|
+
unless doc
|
216
|
+
raise StandardError,
|
217
|
+
"Document is not found in #{category_name} documents!"
|
218
|
+
end
|
219
|
+
doc.decrement_count
|
220
|
+
@categories[category_name].docs.delete(doc_key) if doc.count == 0
|
221
|
+
doc
|
222
|
+
end
|
223
|
+
|
224
|
+
# nodoc
|
225
|
+
def generate_doc_key(doc_content)
|
226
|
+
Digest::MD5.hexdigest(doc_content)
|
227
|
+
end
|
228
|
+
|
229
|
+
# nodoc
|
230
|
+
def category_must_exist(category_name)
|
231
|
+
unless category_exists?(category_name)
|
232
|
+
raise StandardError,
|
233
|
+
"Category with name '#{category_name}' does not exist!"
|
234
|
+
end
|
235
|
+
end
|
214
236
|
end
|
215
237
|
end
|
216
238
|
end
|
data/omnicat-bayes.gemspec
CHANGED
@@ -18,8 +18,7 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ['lib']
|
20
20
|
|
21
|
-
spec.add_dependency '
|
22
|
-
spec.add_dependency 'omnicat', '~> 0.2.2'
|
21
|
+
spec.add_dependency 'omnicat', '~> 0.3.0'
|
23
22
|
spec.add_development_dependency 'bundler', '~> 1.3'
|
24
23
|
spec.add_development_dependency 'rake'
|
25
24
|
end
|
@@ -4,6 +4,7 @@ require File.expand_path(File.join(File.dirname(__FILE__), '..', '..', 'test_hel
|
|
4
4
|
class TestBayes < Test::Unit::TestCase
|
5
5
|
def setup
|
6
6
|
OmniCat.configure do |config|
|
7
|
+
config.auto_train = :off
|
7
8
|
config.exclude_tokens = ['are', 'at', 'by']
|
8
9
|
config.token_patterns = {
|
9
10
|
minus: [/[\s\t\n\r]+/, /(@[\w\d]+)/],
|
@@ -82,7 +83,7 @@ class TestBayes < Test::Unit::TestCase
|
|
82
83
|
@bayes.categories['neutral'].token_count
|
83
84
|
)
|
84
85
|
end
|
85
|
-
|
86
|
+
|
86
87
|
def test_untrain_with_doc_count_2
|
87
88
|
@bayes.add_category 'neutral'
|
88
89
|
@bayes.train 'neutral', 'how are you?? : :| :) ;-) :('
|
@@ -112,7 +113,8 @@ class TestBayes < Test::Unit::TestCase
|
|
112
113
|
def test_untrain_with_missing_doc
|
113
114
|
@bayes.add_category 'neutral'
|
114
115
|
assert_raise(StandardError) {
|
115
|
-
@bayes.untrain 'neutral', 'how are you?? : :| :) ;-) :('
|
116
|
+
@bayes.untrain 'neutral', 'how are you?? : :| :) ;-) :('
|
117
|
+
}
|
116
118
|
end
|
117
119
|
|
118
120
|
def test_train_batch
|
@@ -128,15 +130,15 @@ class TestBayes < Test::Unit::TestCase
|
|
128
130
|
def test_train_missing_category
|
129
131
|
assert_raise(StandardError) { @bayes.train 'neutral', 'how are you?' }
|
130
132
|
end
|
131
|
-
|
133
|
+
|
132
134
|
def test_unique_token_count
|
133
135
|
@bayes.add_category 'positive'
|
134
136
|
@bayes.train_batch 'positive', ['good job ever', 'valid syntax',
|
135
137
|
'best moments of my good life']
|
136
|
-
assert_equal(10,@bayes.
|
138
|
+
assert_equal(10,@bayes.unique_token_count)
|
137
139
|
@bayes.untrain_batch 'positive', ['good job ever', 'valid syntax',
|
138
140
|
'best moments of my good life']
|
139
|
-
assert_equal(0,@bayes.
|
141
|
+
assert_equal(0,@bayes.unique_token_count)
|
140
142
|
end
|
141
143
|
|
142
144
|
def test_classifiability_error
|
@@ -154,11 +156,12 @@ class TestBayes < Test::Unit::TestCase
|
|
154
156
|
@bayes.train('negative', 'bad work')
|
155
157
|
assert_equal(
|
156
158
|
'positive',
|
157
|
-
@bayes.classify('very good position for this sentence').
|
159
|
+
@bayes.classify('very good position for this sentence').top_score.key
|
158
160
|
)
|
161
|
+
@bayes.train('negative', 'work')
|
159
162
|
assert_equal(
|
160
163
|
'negative',
|
161
|
-
@bayes.classify('bad words').
|
164
|
+
@bayes.classify('bad words').top_score.key
|
162
165
|
)
|
163
166
|
end
|
164
167
|
|
@@ -176,11 +179,11 @@ class TestBayes < Test::Unit::TestCase
|
|
176
179
|
|
177
180
|
assert_equal(
|
178
181
|
'positive',
|
179
|
-
results[0].
|
182
|
+
results[0].top_score.key
|
180
183
|
)
|
181
184
|
assert_equal(
|
182
185
|
'negative',
|
183
|
-
results[1].
|
186
|
+
results[1].top_score.key
|
184
187
|
)
|
185
188
|
end
|
186
189
|
|
@@ -191,11 +194,21 @@ class TestBayes < Test::Unit::TestCase
|
|
191
194
|
bayes1.train('positive', 'good job')
|
192
195
|
bayes1.train('negative', 'bad work')
|
193
196
|
h1 = bayes1.to_hash
|
194
|
-
|
195
197
|
bayes2 = ::OmniCat::Classifiers::Bayes.new(h1)
|
196
198
|
assert_equal(h1, bayes2.to_hash)
|
197
199
|
end
|
198
200
|
|
201
|
+
def test_change_strategy
|
202
|
+
c1 = ::OmniCat::Classifier.new(::OmniCat::Classifiers::Bayes.new)
|
203
|
+
c1.add_category 'positive'
|
204
|
+
c1.add_category 'negative'
|
205
|
+
c1.train('positive', 'good job')
|
206
|
+
c1.train('negative', 'bad work')
|
207
|
+
h1 = c1.to_hash
|
208
|
+
c1.strategy = ::OmniCat::Classifiers::Bayes.new
|
209
|
+
assert_equal(h1, c1.to_hash)
|
210
|
+
end
|
211
|
+
|
199
212
|
def test_classify_with_insufficient_categories
|
200
213
|
assert_raise(StandardError) { @bayes.classify 'blank' }
|
201
214
|
end
|
metadata
CHANGED
@@ -1,78 +1,55 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: omnicat-bayes
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.3.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Mustafa Turan
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2014-02-19 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
|
-
- !ruby/object:Gem::Dependency
|
15
|
-
name: hashable
|
16
|
-
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
|
-
requirements:
|
19
|
-
- - ~>
|
20
|
-
- !ruby/object:Gem::Version
|
21
|
-
version: 0.1.1
|
22
|
-
type: :runtime
|
23
|
-
prerelease: false
|
24
|
-
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
|
-
requirements:
|
27
|
-
- - ~>
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
version: 0.1.1
|
30
13
|
- !ruby/object:Gem::Dependency
|
31
14
|
name: omnicat
|
32
15
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
16
|
requirements:
|
35
|
-
- - ~>
|
17
|
+
- - "~>"
|
36
18
|
- !ruby/object:Gem::Version
|
37
|
-
version: 0.
|
19
|
+
version: 0.3.0
|
38
20
|
type: :runtime
|
39
21
|
prerelease: false
|
40
22
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
23
|
requirements:
|
43
|
-
- - ~>
|
24
|
+
- - "~>"
|
44
25
|
- !ruby/object:Gem::Version
|
45
|
-
version: 0.
|
26
|
+
version: 0.3.0
|
46
27
|
- !ruby/object:Gem::Dependency
|
47
28
|
name: bundler
|
48
29
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
30
|
requirements:
|
51
|
-
- - ~>
|
31
|
+
- - "~>"
|
52
32
|
- !ruby/object:Gem::Version
|
53
33
|
version: '1.3'
|
54
34
|
type: :development
|
55
35
|
prerelease: false
|
56
36
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
37
|
requirements:
|
59
|
-
- - ~>
|
38
|
+
- - "~>"
|
60
39
|
- !ruby/object:Gem::Version
|
61
40
|
version: '1.3'
|
62
41
|
- !ruby/object:Gem::Dependency
|
63
42
|
name: rake
|
64
43
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
44
|
requirements:
|
67
|
-
- -
|
45
|
+
- - ">="
|
68
46
|
- !ruby/object:Gem::Version
|
69
47
|
version: '0'
|
70
48
|
type: :development
|
71
49
|
prerelease: false
|
72
50
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
51
|
requirements:
|
75
|
-
- -
|
52
|
+
- - ">="
|
76
53
|
- !ruby/object:Gem::Version
|
77
54
|
version: '0'
|
78
55
|
description: Naive Bayes classifier strategy for OmniCat
|
@@ -82,7 +59,7 @@ executables: []
|
|
82
59
|
extensions: []
|
83
60
|
extra_rdoc_files: []
|
84
61
|
files:
|
85
|
-
- .gitignore
|
62
|
+
- ".gitignore"
|
86
63
|
- Gemfile
|
87
64
|
- LICENSE.txt
|
88
65
|
- README.md
|
@@ -98,27 +75,26 @@ files:
|
|
98
75
|
homepage: https://github.com/mustafaturan/omnicat-bayes
|
99
76
|
licenses:
|
100
77
|
- MIT
|
78
|
+
metadata: {}
|
101
79
|
post_install_message:
|
102
80
|
rdoc_options: []
|
103
81
|
require_paths:
|
104
82
|
- lib
|
105
83
|
required_ruby_version: !ruby/object:Gem::Requirement
|
106
|
-
none: false
|
107
84
|
requirements:
|
108
|
-
- -
|
85
|
+
- - ">="
|
109
86
|
- !ruby/object:Gem::Version
|
110
87
|
version: '0'
|
111
88
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
112
|
-
none: false
|
113
89
|
requirements:
|
114
|
-
- -
|
90
|
+
- - ">="
|
115
91
|
- !ruby/object:Gem::Version
|
116
92
|
version: '0'
|
117
93
|
requirements: []
|
118
94
|
rubyforge_project:
|
119
|
-
rubygems_version:
|
95
|
+
rubygems_version: 2.2.0
|
120
96
|
signing_key:
|
121
|
-
specification_version:
|
97
|
+
specification_version: 4
|
122
98
|
summary: Naive Bayes text classification implementation as an OmniCat classifier strategy.
|
123
99
|
test_files:
|
124
100
|
- test/test_helper.rb
|