omnicat 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/.travis.yml +1 -3
- data/CHANGELOG.txt +8 -0
- data/README.md +22 -66
- data/lib/omnicat.rb +17 -2
- data/lib/omnicat/classifier.rb +58 -0
- data/lib/omnicat/classifiers/strategy.rb +178 -0
- data/lib/omnicat/classifiers/strategy_internals/category.rb +18 -0
- data/lib/omnicat/configuration.rb +35 -0
- data/lib/omnicat/doc.rb +52 -0
- data/lib/omnicat/result.rb +2 -0
- data/lib/omnicat/version.rb +1 -1
- data/lib/test/unit/classifier_test.rb +1 -0
- data/lib/test/unit/classifiers/strategy_test.rb +46 -0
- data/lib/test/unit/doc_test.rb +40 -0
- data/lib/test/unit/hash_test.rb +4 -2
- metadata +24 -17
- checksums.yaml +0 -7
- data/lib/omnicat/bayes.rb +0 -3
- data/lib/omnicat/classifiers/base.rb +0 -55
- data/lib/omnicat/classifiers/bayes.rb +0 -174
- data/lib/omnicat/classifiers/bayes_internals/category.rb +0 -16
- data/lib/omnicat/string.rb +0 -10
- data/lib/test/unit/base_test.rb +0 -49
- data/lib/test/unit/bayes_test.rb +0 -85
- data/lib/test/unit/string_test.rb +0 -17
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
/omnicat*.gem
|
data/.travis.yml
CHANGED
data/CHANGELOG.txt
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
Master Branch
|
2
|
+
|
3
|
+
0.2.0
|
4
|
+
# bayes classifier moved to another gem which is 'omnicat-bayes'
|
5
|
+
# applied 'Strategy Software Design Pattern' for classifiers
|
6
|
+
# configuration added with Singleton Software Design Pattern
|
7
|
+
# string methods moved to OmniCat::Doc class
|
8
|
+
|
1
9
|
0.1.3
|
2
10
|
# refactoring at bayes algorithm
|
3
11
|
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
[](https://travis-ci.org/mustafaturan/omnicat) [](https://codeclimate.com/github/mustafaturan/omnicat)
|
4
4
|
|
5
|
-
A generalized framework for text classifications.
|
5
|
+
A generalized framework for text classifications.
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -20,76 +20,32 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
-
|
23
|
+
The stand-alone version of omnicat is just a strategy holder for developers. Its aim is to provide a unified set of methods for text classification gems, with lossless conversion from one strategy to another. End users should see the 'Classifier strategies' section and the 'Changing classifier strategy' subsection.
|
24
24
|
|
25
|
-
###
|
26
|
-
Create a Bayes classifier object.
|
25
|
+
### Changing classifier strategy
|
27
26
|
|
28
|
-
|
27
|
+
OmniCat allows you to change the strategy at runtime.
|
29
28
|
|
30
|
-
|
31
|
-
|
29
|
+
# Declare classifier with Naive Bayes classifier
|
30
|
+
classifier = OmniCat::Classifier.new(OmniCat::Classifiers::Bayes.new())
|
31
|
+
...
|
32
|
+
# do some operations like adding category, training, etc...
|
33
|
+
...
|
34
|
+
# make some classification using Bayes
|
35
|
+
classifier.classify('I am happy :)')
|
36
|
+
...
|
37
|
+
# change strategy to Support Vector Machine (SVM) at runtime
|
38
|
+
classifier.strategy = OmniCat::Classifiers::SVM.new()
|
39
|
+
# now you do not need to re-train, add category and so on..
|
40
|
+
# just classify with new strategy
|
41
|
+
classifier.classify('I am happy :)')
|
32
42
|
|
33
|
-
|
34
|
-
|
43
|
+
## Classifier strategies
|
44
|
+
Here is the list of classifiers available for OmniCat.
|
35
45
|
|
36
|
-
###
|
37
|
-
|
38
|
-
|
39
|
-
bayes.train('positive', 'great if you are in a slap happy mood .')
|
40
|
-
bayes.train('negative', 'bad tracking issue')
|
41
|
-
|
42
|
-
### Train batch
|
43
|
-
Train category with multiple documents.
|
44
|
-
|
45
|
-
bayes.train_batch('positive', [
|
46
|
-
'a feel-good picture in the best sense of the term...',
|
47
|
-
'it is a feel-good movie about which you can actually feel good.',
|
48
|
-
'love and money both of them are good choises'
|
49
|
-
])
|
50
|
-
bayes.train_batch('negative', [
|
51
|
-
'simplistic , silly and tedious .',
|
52
|
-
'interesting , but not compelling . ',
|
53
|
-
'seems clever but not especially compelling'
|
54
|
-
])
|
55
|
-
|
56
|
-
### Classify
|
57
|
-
Classify a document.
|
58
|
-
|
59
|
-
result = bayes.classify('I feel so good and happy')
|
60
|
-
=> #<OmniCat::Result:0x007fd20296aad8 @category={:name=>"positive", :percentage=>73}, @scores={"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, @total_score=7.385426829679431e-09>
|
61
|
-
result.to_hash
|
62
|
-
=> {:category=>{:name=>"positive", :percentage=>73}, :scores=>{"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, :total_score=>7.385426829679431e-09}
|
63
|
-
|
64
|
-
### Classify batch
|
65
|
-
Classify multiple documents at a time.
|
66
|
-
|
67
|
-
results = bayes.classify_batch(
|
68
|
-
[
|
69
|
-
'the movie is silly so not compelling enough',
|
70
|
-
'a good piece of work'
|
71
|
-
]
|
72
|
-
)
|
73
|
-
=> [#<OmniCat::Result:0x007fd2029341b8 @category={:name=>"negative", :percentage=>78}, @scores={"positive"=>2.5521869888765736e-14, "negative"=>9.074442627116706e-14}, @total_score=1.162662961599328e-13>, #<OmniCat::Result:0x007fd20292e7e0 @category={:name=>"positive", :percentage=>80}, @scores={"positive"=>2.411265432098765e-07, "negative"=>5.880238822371627e-08}, @total_score=2.999289314335928e-07>]
|
74
|
-
|
75
|
-
### Convert to hash
|
76
|
-
Convert full Bayes object to hash.
|
77
|
-
|
78
|
-
# For storing, restoring modal data
|
79
|
-
bayes_hash = bayes.to_hash
|
80
|
-
=> {:categories=>{"positive"=>{:doc_count=>4, :tokens=>{"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, :token_count=>37}, "negative"=>{:doc_count=>4, :tokens=>{"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, :token_count=>17}}, :category_count=>2, :doc_count=>8, :k_value=>1.0, :token_count=>54, :uniq_token_count=>43}
|
81
|
-
|
82
|
-
### Load from hash
|
83
|
-
Load full Bayes object from hash.
|
84
|
-
|
85
|
-
another_bayes_obj = OmniCat::Classifiers::Bayes.new(bayes_hash)
|
86
|
-
=> #<OmniCat::Classifiers::Bayes:0x007fd20308cff0 @categories={"positive"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf78 @doc_count=4, @tokens={"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, @token_count=37>, "negative"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf00 @doc_count=4, @tokens={"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, @token_count=17>}, @category_count=2, @doc_count=8, @k_value=1.0, @token_count=54, @uniq_token_count=43>
|
87
|
-
another_bayes_obj.classify('best senses')
|
88
|
-
=> #<OmniCat::Result:0x007fd203075008 @category={:name=>"positive", :percentage=>57}, @scores={"positive"=>0.0002314814814814815, "negative"=>0.00017146776406035664}, @total_score=0.00040294924554183816>
|
89
|
-
|
90
|
-
## Todo
|
91
|
-
* Add more text classification modules such as Support Vector Machine (SVM).
|
92
|
-
* Add text cleaning/manipulating extensions such as stopwords cleaner, stemmer, and pos-tagger, etc...
|
46
|
+
### Naive Bayes classifier
|
47
|
+
* gem 'omnicat-bayes'
|
48
|
+
* Details: http://github.com/mustafaturan/omnicat-bayes
|
93
49
|
|
94
50
|
## Contributing
|
95
51
|
|
data/lib/omnicat.rb
CHANGED
@@ -1,7 +1,22 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/omnicat/version'
|
2
|
-
require File.dirname(__FILE__) + '/omnicat/
|
2
|
+
require File.dirname(__FILE__) + '/omnicat/configuration'
|
3
3
|
require File.dirname(__FILE__) + '/omnicat/array'
|
4
4
|
require File.dirname(__FILE__) + '/omnicat/hash'
|
5
5
|
require File.dirname(__FILE__) + '/omnicat/base'
|
6
|
+
require File.dirname(__FILE__) + '/omnicat/doc'
|
6
7
|
require File.dirname(__FILE__) + '/omnicat/result'
|
7
|
-
require File.dirname(__FILE__) + '/omnicat/
|
8
|
+
require File.dirname(__FILE__) + '/omnicat/classifier'
|
9
|
+
|
10
|
+
module OmniCat
|
11
|
+
def self.config
|
12
|
+
OmniCat::Configuration.instance
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.configure
|
16
|
+
yield config
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.logger
|
20
|
+
config.logger
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/classifiers/strategy'
|
2
|
+
require File.dirname(__FILE__) + '/classifiers/strategy_internals/category'
|
3
|
+
require 'forwardable'
|
4
|
+
|
5
|
+
module OmniCat
|
6
|
+
class Classifier
|
7
|
+
extend Forwardable
|
8
|
+
|
9
|
+
# classification strategy
|
10
|
+
attr_accessor :strategy
|
11
|
+
|
12
|
+
# delegate category methods
|
13
|
+
def_delegators :@strategy, :add_category, :add_categories
|
14
|
+
|
15
|
+
# delegate training methods
|
16
|
+
def_delegators :@strategy, :train, :train_batch, :untrain, :untrain_batch
|
17
|
+
|
18
|
+
# delegate classification methods
|
19
|
+
def_delegators :@strategy, :classify, :classify_batch
|
20
|
+
|
21
|
+
# delegate base methods
|
22
|
+
def_delegator :@strategy, :to_hash
|
23
|
+
|
24
|
+
# nodoc
|
25
|
+
def initialize(classifier)
|
26
|
+
@strategy = classifier
|
27
|
+
end
|
28
|
+
|
29
|
+
def strategy=(classifier)
|
30
|
+
is_interchangeable?(classifier)
|
31
|
+
if @strategy && classifier.doc_count == 0
|
32
|
+
previous_strategy = @strategy
|
33
|
+
@strategy = classifier
|
34
|
+
# pass previous strategy contents into the new one
|
35
|
+
previous_strategy.categories.each do |category_name, category|
|
36
|
+
@strategy.add_category(category_name)
|
37
|
+
category.docs.each do |_, doc|
|
38
|
+
doc.count.times do
|
39
|
+
@strategy.train(category_name, doc.content)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
else
|
44
|
+
@strategy = classifier
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
private
|
49
|
+
def is_interchangeable?(classifier)
|
50
|
+
if classifier.category_size_limit
|
51
|
+
if @strategy.category_count > classifier.category_size_limit
|
52
|
+
raise StandardError,
|
53
|
+
'New classifier category size limit is less than the current classifier\'s category count.'
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
@@ -0,0 +1,178 @@
|
|
1
|
+
require 'omnicat'
|
2
|
+
|
3
|
+
module OmniCat
|
4
|
+
module Classifiers
|
5
|
+
#
|
6
|
+
# Author:: Mustafa Turan (mailto:mustafaturan.net@gmail.com)
|
7
|
+
# Copyright:: Copyright (c) 2013 Mustafa Turan
|
8
|
+
# License:: MIT
|
9
|
+
#
|
10
|
+
# The class supplies abstract methods for possible text classifiers
|
11
|
+
class Strategy < ::OmniCat::Base
|
12
|
+
attr_accessor :categories # ::OmniCat::Hash - Hash of categories
|
13
|
+
attr_accessor :category_count # Integer - Total category count
|
14
|
+
attr_accessor :category_size_limit # Integer - Max allowed category count
|
15
|
+
attr_accessor :doc_count # Integer - Total document count
|
16
|
+
attr_accessor :token_count # Integer - Total token count
|
17
|
+
attr_accessor :uniq_token_count # Integer - Total uniq token count
|
18
|
+
|
19
|
+
def initialize(strategy_hash = {})
|
20
|
+
@categories = ::OmniCat::Hash.new
|
21
|
+
@category_count = strategy_hash[:category_count].to_i
|
22
|
+
@category_size_limit = strategy_hash[:category_size_limit].to_i
|
23
|
+
@doc_count = strategy_hash[:doc_count].to_i
|
24
|
+
@token_count = strategy_hash[:token_count].to_i
|
25
|
+
@uniq_token_count = strategy_hash[:uniq_token_count].to_i
|
26
|
+
end
|
27
|
+
|
28
|
+
# Abstract method for adding new classification category
|
29
|
+
#
|
30
|
+
# ==== Parameters
|
31
|
+
#
|
32
|
+
# * +name+ - Name for category
|
33
|
+
#
|
34
|
+
def add_category(name)
|
35
|
+
not_implemented_error(__callee__)
|
36
|
+
end
|
37
|
+
|
38
|
+
# Allows adding multiple classification categories
|
39
|
+
#
|
40
|
+
# ==== Parameters
|
41
|
+
#
|
42
|
+
# * +names+ - Array of categories
|
43
|
+
#
|
44
|
+
def add_categories(names)
|
45
|
+
names.each { |name| add_category(name) }
|
46
|
+
end
|
47
|
+
|
48
|
+
# Abstract method for training the desired category with a document
|
49
|
+
#
|
50
|
+
# ==== Parameters
|
51
|
+
#
|
52
|
+
# * +category+ - Name of the category from added categories list
|
53
|
+
# * +doc+ - Document text
|
54
|
+
#
|
55
|
+
def train(category_name, doc)
|
56
|
+
not_implemented_error(__callee__)
|
57
|
+
end
|
58
|
+
|
59
|
+
# Train the desired category with multiple documents
|
60
|
+
#
|
61
|
+
# ==== Parameters
|
62
|
+
#
|
63
|
+
# * +category+ - Name of the category from added categories list
|
64
|
+
# * +docs+ - Array of documents
|
65
|
+
#
|
66
|
+
def train_batch(category, docs)
|
67
|
+
docs.each { |doc| train(category, doc) }
|
68
|
+
end
|
69
|
+
|
70
|
+
# Abstract method for untraining the desired category with a document
|
71
|
+
#
|
72
|
+
# ==== Parameters
|
73
|
+
#
|
74
|
+
# * +category+ - Name of the category from added categories list
|
75
|
+
# * +doc+ - Document text
|
76
|
+
#
|
77
|
+
def untrain(category_name, doc)
|
78
|
+
not_implemented_error(__callee__)
|
79
|
+
end
|
80
|
+
|
81
|
+
# Untrain the desired category with multiple documents
|
82
|
+
#
|
83
|
+
# ==== Parameters
|
84
|
+
#
|
85
|
+
# * +category+ - Name of the category from added categories list
|
86
|
+
# * +docs+ - Array of documents
|
87
|
+
#
|
88
|
+
def untrain_batch(category, docs)
|
89
|
+
docs.each { |doc| untrain(category, doc) }
|
90
|
+
end
|
91
|
+
|
92
|
+
# Abstract method for classifying the given document
|
93
|
+
#
|
94
|
+
# ==== Parameters
|
95
|
+
#
|
96
|
+
# * +doc+ - The document for classification
|
97
|
+
#
|
98
|
+
# ==== Returns
|
99
|
+
#
|
100
|
+
# * +result+ - OmniCat::Result object
|
101
|
+
#
|
102
|
+
def classify(doc)
|
103
|
+
not_implemented_error(__callee__)
|
104
|
+
end
|
105
|
+
|
106
|
+
# Classify the multiple documents at a time
|
107
|
+
#
|
108
|
+
# ==== Parameters
|
109
|
+
#
|
110
|
+
# * +docs+ - Array of documents
|
111
|
+
#
|
112
|
+
# ==== Returns
|
113
|
+
#
|
114
|
+
# * +result_set+ - Array of OmniCat::Result objects
|
115
|
+
#
|
116
|
+
def classify_batch(docs)
|
117
|
+
docs.collect { |doc| classify(doc) }
|
118
|
+
end
|
119
|
+
|
120
|
+
private
|
121
|
+
# nodoc
|
122
|
+
def not_implemented_error(method_name)
|
123
|
+
raise NotImplementedError.new("#{self.class.name}##{method_name} method is not implemented!")
|
124
|
+
end
|
125
|
+
|
126
|
+
protected
|
127
|
+
# nodoc
|
128
|
+
def category_exists?(category_name)
|
129
|
+
categories.has_key?(category_name)
|
130
|
+
end
|
131
|
+
|
132
|
+
# nodoc
|
133
|
+
def increment_category_count
|
134
|
+
@category_count += 1
|
135
|
+
end
|
136
|
+
|
137
|
+
# nodoc
|
138
|
+
def decrement_category_count
|
139
|
+
@category_count -= 1
|
140
|
+
end
|
141
|
+
|
142
|
+
# nodoc
|
143
|
+
def increment_doc_counts(category_name)
|
144
|
+
@doc_count += 1
|
145
|
+
@categories[category_name].doc_count += 1
|
146
|
+
end
|
147
|
+
|
148
|
+
# nodoc
|
149
|
+
def decrement_doc_counts(category_name)
|
150
|
+
@doc_count -= 1
|
151
|
+
@categories[category_name].doc_count -= 1
|
152
|
+
end
|
153
|
+
|
154
|
+
# nodoc
|
155
|
+
def classifiable?
|
156
|
+
if category_count < 2
|
157
|
+
raise StandardError,
|
158
|
+
'At least 2 categories needed for classification process!'
|
159
|
+
false
|
160
|
+
elsif doc_avability? == false
|
161
|
+
raise StandardError,
|
162
|
+
'Each category must trained with at least one document!'
|
163
|
+
false
|
164
|
+
else
|
165
|
+
true
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
169
|
+
# nodoc
|
170
|
+
def doc_avability?
|
171
|
+
@categories.each do |_, category|
|
172
|
+
return false if category.doc_count == 0
|
173
|
+
end
|
174
|
+
true
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'omnicat'
|
2
|
+
|
3
|
+
module OmniCat
|
4
|
+
module Classifiers
|
5
|
+
module StrategyInternals
|
6
|
+
class Category < ::OmniCat::Base
|
7
|
+
attr_accessor :doc_count, :docs, :tokens, :token_count
|
8
|
+
|
9
|
+
def initialize(category_hash = {})
|
10
|
+
@doc_count = category_hash[:doc_count].to_i
|
11
|
+
@docs = category_hash[:docs] || {}
|
12
|
+
@tokens = category_hash[:tokens] || {}
|
13
|
+
@token_count = category_hash[:token_count].to_i
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'singleton'
|
4
|
+
require 'logger'
|
5
|
+
|
6
|
+
module OmniCat
|
7
|
+
class Configuration
|
8
|
+
include Singleton
|
9
|
+
attr_accessor :logger
|
10
|
+
attr_accessor :exclude_tokens, :logger, :token_patterns
|
11
|
+
|
12
|
+
def self.default_logger
|
13
|
+
logger = Logger.new(STDOUT)
|
14
|
+
logger.progname = 'omnicat'
|
15
|
+
logger
|
16
|
+
end
|
17
|
+
|
18
|
+
@@defaults = {
|
19
|
+
exclude_tokens: ['a','about','across','after','all','almost','also','am','among','an','and','are','as','at','be','because','been','by','did','do','does','else','ever','every','for','from','get','got','had','has','have','he','her','hers','him','his','how','however','i','if','in','into','is','it','its','just','least','let','may','me','might','most','must','my','of','often','on','only','or','other','our','own','rather','said','say','says','she','should','since','so','some','than','that','the','their','them','then','there','these','they','this','tis','to','too','twas','us','wants','was','we','were','what','when','where','which','while','who','whom','will','with','would','yet','you','your'],
|
20
|
+
logger: default_logger,
|
21
|
+
token_patterns: {
|
22
|
+
minus: [/[\s\t\n\r]+/, /(@[\w\d]+)/],
|
23
|
+
plus: [/[\p{L}\-0-9]{2,}/, /[\!\?]/, /[\:\)\(\;\-\|]{2,3}/]
|
24
|
+
}
|
25
|
+
}
|
26
|
+
|
27
|
+
def self.defaults
|
28
|
+
@@defaults
|
29
|
+
end
|
30
|
+
|
31
|
+
def initialize
|
32
|
+
@@defaults.each_pair{|k,v| self.send("#{k}=",v)}
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
data/lib/omnicat/doc.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require File.dirname(__FILE__) + '/base'
|
3
|
+
|
4
|
+
module OmniCat
|
5
|
+
class Doc < ::OmniCat::Base
|
6
|
+
attr_reader :content, :count, :tokens
|
7
|
+
|
8
|
+
def initialize(doc_hash = {})
|
9
|
+
@content = doc_hash[:content]
|
10
|
+
@count = (doc_hash[:count] || 1).to_i
|
11
|
+
@tokens = tokenize_with_counts unless @tokens.is_a?(Hash)
|
12
|
+
end
|
13
|
+
|
14
|
+
def increment_count
|
15
|
+
@count += 1
|
16
|
+
end
|
17
|
+
|
18
|
+
def decrement_count
|
19
|
+
@count -= 1 if @count > 0
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
# nodoc
|
24
|
+
def minus_tokens
|
25
|
+
body = @content
|
26
|
+
OmniCat.config.token_patterns[:minus].each { |p| body.gsub!(p, ' ') }
|
27
|
+
body
|
28
|
+
end
|
29
|
+
|
30
|
+
# nodoc
|
31
|
+
def plus_tokens(body)
|
32
|
+
body_tokens = []
|
33
|
+
OmniCat.config.token_patterns[:plus].each { |p| body_tokens += body.scan(p) }
|
34
|
+
body_tokens
|
35
|
+
end
|
36
|
+
|
37
|
+
# nodoc
|
38
|
+
def exclude_tokens
|
39
|
+
OmniCat.config.exclude_tokens
|
40
|
+
end
|
41
|
+
|
42
|
+
# nodoc
|
43
|
+
def tokenize_with_counts
|
44
|
+
tokenize.hashify_with_counts
|
45
|
+
end
|
46
|
+
|
47
|
+
# nodoc
|
48
|
+
def tokenize
|
49
|
+
plus_tokens(minus_tokens) - exclude_tokens
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
data/lib/omnicat/result.rb
CHANGED
data/lib/omnicat/version.rb
CHANGED
@@ -0,0 +1 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/classifiers/strategy_test'
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', '..', 'test_helper'))
|
2
|
+
|
3
|
+
class TestStrategy < Test::Unit::TestCase
|
4
|
+
def setup
|
5
|
+
@strategy = OmniCat::Classifiers::Strategy.new
|
6
|
+
end
|
7
|
+
|
8
|
+
def test_add_category
|
9
|
+
assert_raise(NotImplementedError) { @strategy.add_category("positive") }
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_add_categories
|
13
|
+
assert_raise(NotImplementedError) { @strategy.add_categories(
|
14
|
+
["neutral", "positive", "negative"]) }
|
15
|
+
end
|
16
|
+
|
17
|
+
def test_train
|
18
|
+
assert_raise(NotImplementedError) { @strategy.train("positive", "good") }
|
19
|
+
end
|
20
|
+
|
21
|
+
def test_train_batch
|
22
|
+
assert_raise(NotImplementedError) {
|
23
|
+
@strategy.train_batch("positive", ["good job ever", "valid syntax",
|
24
|
+
"best moments of my life"])
|
25
|
+
}
|
26
|
+
end
|
27
|
+
|
28
|
+
def test_untrain
|
29
|
+
assert_raise(NotImplementedError) { @strategy.untrain("positive", "good") }
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_untrain_batch
|
33
|
+
assert_raise(NotImplementedError) { @strategy.untrain_batch(
|
34
|
+
"positive", ["good work", "well done"]) }
|
35
|
+
end
|
36
|
+
|
37
|
+
def test_classify
|
38
|
+
assert_raise(NotImplementedError) { @strategy.classify("good job") }
|
39
|
+
end
|
40
|
+
|
41
|
+
def test_classify_batch
|
42
|
+
assert_raise(NotImplementedError) {
|
43
|
+
@strategy.classify_batch(["good job", "you did well"])
|
44
|
+
}
|
45
|
+
end
|
46
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
|
3
|
+
|
4
|
+
class TestDoc < Test::Unit::TestCase
|
5
|
+
def setup
|
6
|
+
OmniCat.configure do |config|
|
7
|
+
config.exclude_tokens = ["was", "at", "by"]
|
8
|
+
config.token_patterns = {
|
9
|
+
minus: [/[\s\t\n\r]+/, /(@[\w\d]+)/],
|
10
|
+
plus: [/[\p{L}\-0-9]{2,}/, /[\!\?]/, /[\:\)\(\;\-\|]{2,3}/]
|
11
|
+
}
|
12
|
+
end
|
13
|
+
@doc = OmniCat::Doc.new(
|
14
|
+
content: "omnicat v-01 was written at 2011, omnicat by @mustafaturan"
|
15
|
+
)
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_omnicat_tokenize
|
19
|
+
assert_equal(
|
20
|
+
{"omnicat" => 2, "v-01" => 1, "written" => 1, "2011" => 1},
|
21
|
+
@doc.tokens
|
22
|
+
)
|
23
|
+
end
|
24
|
+
|
25
|
+
def test_increment_count
|
26
|
+
@doc.increment_count
|
27
|
+
assert_equal(2, @doc.count)
|
28
|
+
end
|
29
|
+
|
30
|
+
def test_decrement_count
|
31
|
+
@doc.decrement_count
|
32
|
+
assert_equal(0, @doc.count)
|
33
|
+
end
|
34
|
+
|
35
|
+
def test_decrement_count_if_zero
|
36
|
+
@doc.decrement_count
|
37
|
+
@doc.decrement_count
|
38
|
+
assert_equal(0, @doc.count)
|
39
|
+
end
|
40
|
+
end
|
data/lib/test/unit/hash_test.rb
CHANGED
@@ -2,9 +2,11 @@ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
|
|
2
2
|
|
3
3
|
class TestHash < Test::Unit::TestCase
|
4
4
|
def test_to_hash
|
5
|
-
categories_hash = {
|
5
|
+
categories_hash = {
|
6
|
+
"pos" => { doc_count: 0, docs: {}, tokens: {}, token_count: 0 }
|
7
|
+
}
|
6
8
|
categories = OmniCat::Hash.new
|
7
|
-
categories["pos"] = OmniCat::Classifiers::
|
9
|
+
categories["pos"] = OmniCat::Classifiers::StrategyInternals::Category.new(categories_hash["pos"])
|
8
10
|
assert_equal(categories_hash, categories.to_hash)
|
9
11
|
end
|
10
12
|
end
|
metadata
CHANGED
@@ -1,18 +1,20 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: omnicat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
7
|
authors:
|
7
8
|
- Mustafa Turan
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date: 2013-06
|
12
|
+
date: 2013-07-06 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
14
|
- !ruby/object:Gem::Dependency
|
14
15
|
name: bundler
|
15
16
|
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
16
18
|
requirements:
|
17
19
|
- - ~>
|
18
20
|
- !ruby/object:Gem::Version
|
@@ -20,6 +22,7 @@ dependencies:
|
|
20
22
|
type: :development
|
21
23
|
prerelease: false
|
22
24
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
23
26
|
requirements:
|
24
27
|
- - ~>
|
25
28
|
- !ruby/object:Gem::Version
|
@@ -27,15 +30,17 @@ dependencies:
|
|
27
30
|
- !ruby/object:Gem::Dependency
|
28
31
|
name: rake
|
29
32
|
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
30
34
|
requirements:
|
31
|
-
- - '>='
|
35
|
+
- - ! '>='
|
32
36
|
- !ruby/object:Gem::Version
|
33
37
|
version: '0'
|
34
38
|
type: :development
|
35
39
|
prerelease: false
|
36
40
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
37
42
|
requirements:
|
38
|
-
- - '>='
|
43
|
+
- - ! '>='
|
39
44
|
- !ruby/object:Gem::Version
|
40
45
|
version: '0'
|
41
46
|
description: A generalized framework for text classifications.
|
@@ -45,6 +50,7 @@ executables: []
|
|
45
50
|
extensions: []
|
46
51
|
extra_rdoc_files: []
|
47
52
|
files:
|
53
|
+
- .gitignore
|
48
54
|
- .travis.yml
|
49
55
|
- CHANGELOG.txt
|
50
56
|
- Gemfile
|
@@ -54,43 +60,44 @@ files:
|
|
54
60
|
- lib/omnicat.rb
|
55
61
|
- lib/omnicat/array.rb
|
56
62
|
- lib/omnicat/base.rb
|
57
|
-
- lib/omnicat/
|
58
|
-
- lib/omnicat/classifiers/
|
59
|
-
- lib/omnicat/classifiers/
|
60
|
-
- lib/omnicat/
|
63
|
+
- lib/omnicat/classifier.rb
|
64
|
+
- lib/omnicat/classifiers/strategy.rb
|
65
|
+
- lib/omnicat/classifiers/strategy_internals/category.rb
|
66
|
+
- lib/omnicat/configuration.rb
|
67
|
+
- lib/omnicat/doc.rb
|
61
68
|
- lib/omnicat/hash.rb
|
62
69
|
- lib/omnicat/result.rb
|
63
|
-
- lib/omnicat/string.rb
|
64
70
|
- lib/omnicat/version.rb
|
65
71
|
- lib/test/test_helper.rb
|
66
72
|
- lib/test/unit/array_test.rb
|
67
|
-
- lib/test/unit/
|
68
|
-
- lib/test/unit/
|
73
|
+
- lib/test/unit/classifier_test.rb
|
74
|
+
- lib/test/unit/classifiers/strategy_test.rb
|
75
|
+
- lib/test/unit/doc_test.rb
|
69
76
|
- lib/test/unit/hash_test.rb
|
70
|
-
- lib/test/unit/string_test.rb
|
71
77
|
- omnicat.gemspec
|
72
78
|
homepage: https://github.com/mustafaturan/omnicat
|
73
79
|
licenses:
|
74
80
|
- MIT
|
75
|
-
metadata: {}
|
76
81
|
post_install_message:
|
77
82
|
rdoc_options: []
|
78
83
|
require_paths:
|
79
84
|
- lib
|
80
85
|
required_ruby_version: !ruby/object:Gem::Requirement
|
86
|
+
none: false
|
81
87
|
requirements:
|
82
|
-
- - '>='
|
88
|
+
- - ! '>='
|
83
89
|
- !ruby/object:Gem::Version
|
84
90
|
version: '0'
|
85
91
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
86
93
|
requirements:
|
87
|
-
- - '>='
|
94
|
+
- - ! '>='
|
88
95
|
- !ruby/object:Gem::Version
|
89
96
|
version: '0'
|
90
97
|
requirements: []
|
91
98
|
rubyforge_project:
|
92
|
-
rubygems_version:
|
99
|
+
rubygems_version: 1.8.23
|
93
100
|
signing_key:
|
94
|
-
specification_version:
|
101
|
+
specification_version: 3
|
95
102
|
summary: A generalized framework for text classifications.
|
96
103
|
test_files: []
|
checksums.yaml
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
---
|
2
|
-
SHA1:
|
3
|
-
metadata.gz: ea920e881bd63f956dd1237f666d008f893668af
|
4
|
-
data.tar.gz: f9d1ec2fe73eb047c5ac661c42600cff033fd35f
|
5
|
-
SHA512:
|
6
|
-
metadata.gz: 4c65cec9bf29fc07b9b0f0eee51da3bfc40f2ba8e443daf287b3e76f499b9084e8526baeb7b7319acd7eeda826ff9a892a0e761848d23e52af2e4545cfbd60ff
|
7
|
-
data.tar.gz: 3f153307273e1c94bea62399a1d1f8d039b4c17956187779f08726429329a84acbce2ede51c7ade3c2ef2b1a778f37da664ae9855144f07a2c906f23d0ee5d80
|
data/lib/omnicat/bayes.rb
DELETED
@@ -1,55 +0,0 @@
|
|
1
|
-
module OmniCat
|
2
|
-
module Classifiers
|
3
|
-
class Base < ::OmniCat::Base
|
4
|
-
# Allows adding multiple classification categories
|
5
|
-
#
|
6
|
-
# ==== Parameters
|
7
|
-
#
|
8
|
-
# * +names+ - Array of categories
|
9
|
-
#
|
10
|
-
# ==== Examples
|
11
|
-
#
|
12
|
-
# # Add multiple categories for classification
|
13
|
-
# bayes.add_categories(["positive", "negative", "neutral"])
|
14
|
-
def add_categories(names)
|
15
|
-
names.each { |name| add_category(name) }
|
16
|
-
end
|
17
|
-
|
18
|
-
# Train the desired category with multiple documents
|
19
|
-
#
|
20
|
-
# ==== Parameters
|
21
|
-
#
|
22
|
-
# * +category+ - Name of the category from added categories list
|
23
|
-
# * +docs+ - Array of documents
|
24
|
-
#
|
25
|
-
# ==== Examples
|
26
|
-
#
|
27
|
-
# # Add multiple docs for training the category
|
28
|
-
# bayes.train("positive", ["clear documentation", "good, very well"])
|
29
|
-
# bayes.train("negative", ["bad interface", "damn"])
|
30
|
-
def train_batch(category, docs)
|
31
|
-
docs.each { |doc| train(category, doc) }
|
32
|
-
end
|
33
|
-
|
34
|
-
# Classify the multiple documents at a time
|
35
|
-
#
|
36
|
-
# ==== Parameters
|
37
|
-
#
|
38
|
-
# * +docs+ - Array of documents
|
39
|
-
#
|
40
|
-
# ==== Returns
|
41
|
-
#
|
42
|
-
# * +result_set+ - Array of OmniCat::Result objects
|
43
|
-
#
|
44
|
-
# ==== Examples
|
45
|
-
#
|
46
|
-
# # Classify multiple documents
|
47
|
-
# bayes.classify_batch(["good documentation", "damn workin again"])
|
48
|
-
# =>
|
49
|
-
def classify_batch(docs)
|
50
|
-
docs.collect { |doc| classify(doc) }
|
51
|
-
end
|
52
|
-
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|
@@ -1,174 +0,0 @@
|
|
1
|
-
module OmniCat
|
2
|
-
module Classifiers
|
3
|
-
class Bayes < ::OmniCat::Classifiers::Base
|
4
|
-
|
5
|
-
attr_accessor :categories # ::OmniCat::Hash - Hash of categories
|
6
|
-
attr_accessor :category_count # Integer - Total category count
|
7
|
-
attr_accessor :doc_count # Integer - Total token count
|
8
|
-
attr_accessor :token_count # Integer - Total token count
|
9
|
-
attr_accessor :uniq_token_count # Integer - Total uniq token count
|
10
|
-
attr_accessor :k_value # Integer - Helper value for skipping some Bayes algorithm errors
|
11
|
-
|
12
|
-
def initialize(bayes_hash = {})
|
13
|
-
self.categories = ::OmniCat::Hash.new
|
14
|
-
if bayes_hash.has_key?(:categories)
|
15
|
-
bayes_hash[:categories].each do |name, category|
|
16
|
-
self.categories[name] = ::OmniCat::Classifiers::BayesInternals::Category.new(category)
|
17
|
-
end
|
18
|
-
end
|
19
|
-
self.category_count = bayes_hash[:category_count].to_i
|
20
|
-
self.doc_count = bayes_hash[:doc_count].to_i
|
21
|
-
self.k_value = bayes_hash[:k_value] || 1.0
|
22
|
-
self.token_count = bayes_hash[:token_count].to_i
|
23
|
-
self.uniq_token_count = bayes_hash[:uniq_token_count].to_i
|
24
|
-
end
|
25
|
-
|
26
|
-
# Allows adding new classification category
|
27
|
-
#
|
28
|
-
# ==== Parameters
|
29
|
-
#
|
30
|
-
# * +name+ - Name for category
|
31
|
-
#
|
32
|
-
# ==== Examples
|
33
|
-
#
|
34
|
-
# # Create a classification category
|
35
|
-
# bayes = Bayes.new
|
36
|
-
# bayes.add_category("positive")
|
37
|
-
def add_category(name)
|
38
|
-
if category_exists?(name)
|
39
|
-
raise StandardError,
|
40
|
-
"Category with name '#{name}' is already exists!"
|
41
|
-
else
|
42
|
-
self.category_count +=1
|
43
|
-
self.categories[name] = ::OmniCat::Classifiers::BayesInternals::Category.new
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
# Train the desired category with a document
|
48
|
-
#
|
49
|
-
# ==== Parameters
|
50
|
-
#
|
51
|
-
# * +category+ - Name of the category from added categories list
|
52
|
-
# * +doc+ - Document text
|
53
|
-
#
|
54
|
-
# ==== Examples
|
55
|
-
#
|
56
|
-
# # Train the desired category
|
57
|
-
# bayes.train("positive", "clear documentation")
|
58
|
-
# bayes.train("positive", "good, very well")
|
59
|
-
# bayes.train("negative", "bad dog")
|
60
|
-
# bayes.train("neutral", "how is the management gui")
|
61
|
-
def train(category_name, doc)
  unless category_exists?(category_name)
    raise StandardError,
          "Category with name '#{category_name}' does not exist!"
  end

  increment_doc_counts(category_name)
  update_priors
  doc.tokenize_with_counts.each do |token, count|
    # Update the global counters first, while the token is not yet stored
    # in this category's table.
    increment_token_counts(category_name, token, count)
    trained_tokens = categories[category_name].tokens
    trained_tokens[token] = trained_tokens[token].to_i + count
  end
end
|
74
|
-
|
75
|
-
# Classify the given document
|
76
|
-
#
|
77
|
-
# ==== Parameters
|
78
|
-
#
|
79
|
-
# * +doc+ - The document for classification
|
80
|
-
#
|
81
|
-
# ==== Returns
|
82
|
-
#
|
83
|
-
# * +result+ - OmniCat::Result object
|
84
|
-
#
|
85
|
-
# ==== Examples
|
86
|
-
#
|
87
|
-
# # Classify a document
|
88
|
-
# bayes.classify("good documentation")
|
89
|
-
# =>
|
90
|
-
def classify(doc)
  if category_count < 2
    raise StandardError,
          "At least 2 categories needed for classification process!"
  end

  result = ::OmniCat::Result.new
  # Sentinel far below the scores doc_probability is expected to produce;
  # a category only wins with a strictly greater score, so ties keep the
  # category that was seen first.
  best_score = -1_000_000
  categories.each do |category_name, category|
    category_score = doc_probability(category, doc)
    result.scores[category_name] = category_score
    if category_score > best_score
      result.category[:name] = category_name
      best_score = category_score
    end
    result.total_score += category_score
  end

  # Guard the percentage computation against a zero denominator.
  result.total_score = 1 if result.total_score == 0
  winning_score = result.scores[result.category[:name]]
  result.category[:percentage] = (winning_score * 100.0 / result.total_score).floor
  result
end
|
112
|
-
|
113
|
-
private
|
114
|
-
# nodoc
|
115
|
-
# True when a category named +category_name+ is present in the category table.
def category_exists?(category_name)
  categories.key?(category_name)
end
|
118
|
-
|
119
|
-
# nodoc
|
120
|
-
# Counts one more trained document, globally first and then for the category.
def increment_doc_counts(category_name)
  self.doc_count = doc_count + 1
  trained_category = categories[category_name]
  trained_category.doc_count = trained_category.doc_count + 1
end
|
124
|
-
|
125
|
-
# nodoc
|
126
|
-
# Recomputes each category's prior as its share of all trained documents.
def update_priors
  total_docs = doc_count.to_f
  categories.each_value do |category|
    category.prior = category.doc_count / total_docs
  end
end
|
131
|
-
|
132
|
-
# nodoc
|
133
|
-
# Adds +count+ occurrences of +token+ to the global and per-category totals,
# after giving the vocabulary counter a chance to register the token.
def increment_token_counts(category_name, token, count)
  increment_uniq_token_count(token)
  self.token_count = token_count + count
  owner = categories[category_name]
  owner.token_count = owner.token_count + count
end
|
138
|
-
|
139
|
-
# nodoc
|
140
|
-
# Increments the unique-token counter when +token+ is not yet present in the
# token table of any category; otherwise leaves the counter untouched.
def increment_uniq_token_count(token)
  already_known = categories.values.any? { |category| category.tokens.key?(token) }
  self.uniq_token_count += 1 unless already_known
end
|
150
|
-
|
151
|
-
# nodoc
|
152
|
-
# Scores +doc+ for +category+: the category prior multiplied by the product
# of the per-token probabilities, with the product seeded by k_value.
def doc_probability(category, doc)
  likelihood = doc.tokenize_with_counts.inject(k_value) do |product, (token, count)|
    product * token_probability(category, token, count)
  end
  category.prior * likelihood
end
|
159
|
-
|
160
|
-
# nodoc
|
161
|
-
# Probability contribution of +token+ (occurring +count+ times in the
# document) for +category+.
#
# Tokens the category has never seen get k_value / token_count; known tokens
# get count * (seen + k_value) / (category token total + unique token total).
# Denominators are converted to Float so an Integer k_value no longer
# triggers integer division, which truncated the result to 0.
def token_probability(category, token, count)
  seen = category.tokens[token].to_i
  if seen == 0
    k_value / token_count.to_f
  else
    count * ((seen + k_value) / (category.token_count + uniq_token_count).to_f)
  end
end
|
171
|
-
|
172
|
-
end
|
173
|
-
end
|
174
|
-
end
|
@@ -1,16 +0,0 @@
|
|
1
|
-
module OmniCat
  module Classifiers
    module BayesInternals
      # Per-category training state used by the Bayes classifier.
      class Category < ::OmniCat::Base
        attr_accessor :doc_count,
                      :prior,
                      :tokens,
                      :token_count

        # Restores a category from +category_hash+. Absent entries fall back
        # to zero counts, a 0.0 prior and an empty token table.
        def initialize(category_hash = {})
          self.doc_count = category_hash[:doc_count].to_i
          self.prior = category_hash[:prior].to_f
          self.tokens = category_hash[:tokens] || {}
          self.token_count = category_hash[:token_count].to_i
        end
      end
    end
  end
end
|
data/lib/omnicat/string.rb
DELETED
data/lib/test/unit/base_test.rb
DELETED
@@ -1,49 +0,0 @@
|
|
1
|
-
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
|
2
|
-
|
3
|
-
class TestBase < Test::Unit::TestCase
  # Every test starts from a fresh Bayes classifier.
  def setup
    @bayes = OmniCat::Classifiers::Bayes.new
  end

  # add_categories registers every given name, in the order given.
  def test_add_categories
    @bayes.add_categories(["neutral", "positive", "negative"])
    assert_not_nil(@bayes.categories["neutral"])
    assert_equal(["neutral", "positive", "negative"], @bayes.categories.keys)
  end

  # train_batch counts each document of the batch once.
  def test_train_batch
    @bayes.add_category("positive")
    @bayes.train_batch(
      "positive",
      ["good job ever", "valid syntax", "best moments of my life"]
    )
    assert_equal(3, @bayes.categories["positive"].doc_count)
  end

  # classify_batch returns one result per document, in input order.
  def test_classify_batch
    @bayes.add_category("positive")
    @bayes.add_category("negative")
    @bayes.train_batch(
      "positive",
      ["good job ever", "valid syntax", "best moments of my life"]
    )
    @bayes.train_batch("negative", ["bad work", "awfull day", "never liked it"])

    results = @bayes.classify_batch(["good sytanx research", "bad words"])

    assert_equal(2, results.count)
    assert_equal("positive", results[0].category[:name])
    assert_equal("negative", results[1].category[:name])
  end
end
|
data/lib/test/unit/bayes_test.rb
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
|
2
|
-
|
3
|
-
class TestBayes < Test::Unit::TestCase
  # Every test starts from a fresh Bayes classifier.
  def setup
    @bayes = OmniCat::Classifiers::Bayes.new
  end

  # A new category starts out empty: no documents and no tokens.
  def test_add_category
    @bayes.add_category("neutral")
    neutral = @bayes.categories["neutral"]
    assert_not_nil(neutral)
    assert_equal(["neutral"], @bayes.categories.keys)
    assert_equal(0, @bayes.categories["neutral"].doc_count)
    assert_equal({}, @bayes.categories["neutral"].tokens)
    assert_equal(0, @bayes.categories["neutral"].token_count)
  end

  # Adding the same category twice is an error.
  def test_add_category_that_already_exists
    @bayes.add_category("neutral")
    assert_raise(StandardError) { @bayes.add_category("neutral") }
  end

  # Training records the document and its token counts on the category.
  def test_train_valid_category
    @bayes.add_category("neutral")
    @bayes.train("neutral", "how are you?")
    assert_equal(1, @bayes.categories["neutral"].doc_count)
    assert_equal(
      {"how" => 1, "are" => 1, "you" => 1},
      @bayes.categories["neutral"].tokens
    )
    assert_equal(3, @bayes.categories["neutral"].token_count)
  end

  # Training an unknown category is an error.
  def test_train_missing_category
    assert_raise(StandardError) { @bayes.train("neutral", "how are you?") }
  end

  # Documents are assigned to the category they share vocabulary with.
  def test_classify
    @bayes.add_category("positive")
    @bayes.add_category("negative")
    @bayes.train("positive", "good job")
    @bayes.train("negative", "bad work")

    positive_result = @bayes.classify("very good position for this sentence")
    assert_equal("positive", positive_result.category[:name])

    negative_result = @bayes.classify("bad words")
    assert_equal("negative", negative_result.category[:name])
  end

  # A classifier rebuilt from to_hash exports the same hash again.
  def test_initialize_with_hash
    bayes1 = ::OmniCat::Classifiers::Bayes.new
    bayes1.add_category("positive")
    bayes1.add_category("negative")
    bayes1.train("positive", "good job")
    bayes1.train("negative", "bad work")
    h1 = bayes1.to_hash

    bayes2 = ::OmniCat::Classifiers::Bayes.new(h1)
    assert_equal(h1, bayes2.to_hash)
  end

  # Classification needs at least two categories.
  def test_classify_with_insufficient_categories
    assert_raise(StandardError) { @bayes.classify("blank") }
  end
end
|
@@ -1,17 +0,0 @@
|
|
1
|
-
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
|
2
|
-
|
3
|
-
class TestString < Test::Unit::TestCase
  # omnicat_tokenize keeps the longer tokens and drops the one-character ones
  # present in this sample.
  def test_omnicat_tokenize
    tokens = "mustafa turan omni-cat-v0 1986 1 a s d".omnicat_tokenize
    assert_equal(["mustafa", "turan", "omni-cat-v0", "1986"], tokens)
  end

  # tokenize_with_counts maps each token to its number of occurrences.
  def test_tokenize_with_counts
    counts = "omnicat written at 2011, omnicat".tokenize_with_counts
    assert_equal(
      {"omnicat" => 2, "written" => 1, "at" => 1, "2011" => 1},
      counts
    )
  end
end
|