omnicat 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ /omnicat*.gem
data/.travis.yml CHANGED
@@ -1,6 +1,4 @@
1
1
  language: ruby
2
2
  rvm:
3
3
  - 1.9.3
4
- - 2.0.0
5
- - jruby-19mode
6
- - rbx-19mode
4
+ - 2.0.0
data/CHANGELOG.txt CHANGED
@@ -1,3 +1,11 @@
1
+ Master Branch
2
+
3
+ 0.2.0
4
+ # bayes classifier moved to another gem which is 'omnicat-bayes'
5
+ # applied 'Strategy Software Design Pattern' for classifiers
6
+ # configuration added with Singleton Software Design Pattern
7
+ # string methods moved to OmniCat::Doc class
8
+
1
9
  0.1.3
2
10
  # refactoring at bayes algorithm
3
11
 
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  [![Build Status](https://travis-ci.org/mustafaturan/omnicat.png)](https://travis-ci.org/mustafaturan/omnicat) [![Code Climate](https://codeclimate.com/github/mustafaturan/omnicat.png)](https://codeclimate.com/github/mustafaturan/omnicat)
4
4
 
5
- A generalized framework for text classifications. For now, it only supports Naive Bayes algorithm for text classification.
5
+ A generalized framework for text classifications.
6
6
 
7
7
  ## Installation
8
8
 
@@ -20,76 +20,32 @@ Or install it yourself as:
20
20
 
21
21
  ## Usage
22
22
 
23
- See rdoc for detailed usage.
23
+ The stand-alone version of omnicat is just a strategy holder for developers. Its aim is to provide a unified set of methods for text classification gems, with lossless conversion from one strategy to another. End-users should see the 'Classifier strategies' section and the 'Changing classifier strategy' subsection.
24
24
 
25
- ### Bayes classifier
26
- Create a Bayes classifier object.
25
+ ### Changing classifier strategy
27
26
 
28
- bayes = OmniCat::Classifiers::Bayes.new
27
+ OmniCat allows you to change the strategy at runtime.
29
28
 
30
- ### Create categories
31
- Create a classification category.
29
+ # Declare classifier with Naive Bayes classifier
30
+ classifier = OmniCat::Classifier.new(OmniCat::Classifiers::Bayes.new())
31
+ ...
32
+ # do some operations like adding category, training, etc...
33
+ ...
34
+ # make some classification using Bayes
35
+ classifier.classify('I am happy :)')
36
+ ...
37
+ # change strategy to Support Vector Machine (SVM) at runtime
38
+ classifier.strategy = OmniCat::Classifiers::SVM.new()
39
+ # now you do not need to re-train, add category and so on..
40
+ # just classify with new strategy
41
+ classifier.classify('I am happy :)')
32
42
 
33
- bayes.add_category('positive')
34
- bayes.add_category('negative')
43
+ ## Classifier strategies
44
+ Here is the list of classifiers available for OmniCat.
35
45
 
36
- ### Train
37
- Train category with a document.
38
-
39
- bayes.train('positive', 'great if you are in a slap happy mood .')
40
- bayes.train('negative', 'bad tracking issue')
41
-
42
- ### Train batch
43
- Train category with multiple documents.
44
-
45
- bayes.train_batch('positive', [
46
- 'a feel-good picture in the best sense of the term...',
47
- 'it is a feel-good movie about which you can actually feel good.',
48
- 'love and money both of them are good choises'
49
- ])
50
- bayes.train_batch('negative', [
51
- 'simplistic , silly and tedious .',
52
- 'interesting , but not compelling . ',
53
- 'seems clever but not especially compelling'
54
- ])
55
-
56
- ### Classify
57
- Classify a document.
58
-
59
- result = bayes.classify('I feel so good and happy')
60
- => #<OmniCat::Result:0x007fd20296aad8 @category={:name=>"positive", :percentage=>73}, @scores={"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, @total_score=7.385426829679431e-09>
61
- result.to_hash
62
- => {:category=>{:name=>"positive", :percentage=>73}, :scores=>{"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, :total_score=>7.385426829679431e-09}
63
-
64
- ### Classify batch
65
- Classify multiple documents at a time.
66
-
67
- results = bayes.classify_batch(
68
- [
69
- 'the movie is silly so not compelling enough',
70
- 'a good piece of work'
71
- ]
72
- )
73
- => [#<OmniCat::Result:0x007fd2029341b8 @category={:name=>"negative", :percentage=>78}, @scores={"positive"=>2.5521869888765736e-14, "negative"=>9.074442627116706e-14}, @total_score=1.162662961599328e-13>, #<OmniCat::Result:0x007fd20292e7e0 @category={:name=>"positive", :percentage=>80}, @scores={"positive"=>2.411265432098765e-07, "negative"=>5.880238822371627e-08}, @total_score=2.999289314335928e-07>]
74
-
75
- ### Convert to hash
76
- Convert full Bayes object to hash.
77
-
78
- # For storing, restoring modal data
79
- bayes_hash = bayes.to_hash
80
- => {:categories=>{"positive"=>{:doc_count=>4, :tokens=>{"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, :token_count=>37}, "negative"=>{:doc_count=>4, :tokens=>{"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, :token_count=>17}}, :category_count=>2, :doc_count=>8, :k_value=>1.0, :token_count=>54, :uniq_token_count=>43}
81
-
82
- ### Load from hash
83
- Load full Bayes object from hash.
84
-
85
- another_bayes_obj = OmniCat::Classifiers::Bayes.new(bayes_hash)
86
- => #<OmniCat::Classifiers::Bayes:0x007fd20308cff0 @categories={"positive"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf78 @doc_count=4, @tokens={"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, @token_count=37>, "negative"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf00 @doc_count=4, @tokens={"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, @token_count=17>}, @category_count=2, @doc_count=8, @k_value=1.0, @token_count=54, @uniq_token_count=43>
87
- another_bayes_obj.classify('best senses')
88
- => #<OmniCat::Result:0x007fd203075008 @category={:name=>"positive", :percentage=>57}, @scores={"positive"=>0.0002314814814814815, "negative"=>0.00017146776406035664}, @total_score=0.00040294924554183816>
89
-
90
- ## Todo
91
- * Add more text classification modules such as Support Vector Machine (SVM).
92
- * Add text cleaning/manipulating extensions such as stopwords cleaner, stemmer, and pos-tagger, etc...
46
+ ### Naive Bayes classifier
47
+ * gem 'omnicat-bayes'
48
+ * Details: http://github.com/mustafaturan/omnicat-bayes
93
49
 
94
50
  ## Contributing
95
51
 
data/lib/omnicat.rb CHANGED
@@ -1,7 +1,22 @@
1
1
  require File.dirname(__FILE__) + '/omnicat/version'
2
- require File.dirname(__FILE__) + '/omnicat/string'
2
+ require File.dirname(__FILE__) + '/omnicat/configuration'
3
3
  require File.dirname(__FILE__) + '/omnicat/array'
4
4
  require File.dirname(__FILE__) + '/omnicat/hash'
5
5
  require File.dirname(__FILE__) + '/omnicat/base'
6
+ require File.dirname(__FILE__) + '/omnicat/doc'
6
7
  require File.dirname(__FILE__) + '/omnicat/result'
7
- require File.dirname(__FILE__) + '/omnicat/bayes'
8
+ require File.dirname(__FILE__) + '/omnicat/classifier'
9
+
10
+ module OmniCat
11
+ def self.config
12
+ OmniCat::Configuration.instance
13
+ end
14
+
15
+ def self.configure
16
+ yield config
17
+ end
18
+
19
+ def self.logger
20
+ config.logger
21
+ end
22
+ end
@@ -0,0 +1,58 @@
1
+ require File.dirname(__FILE__) + '/classifiers/strategy'
2
+ require File.dirname(__FILE__) + '/classifiers/strategy_internals/category'
3
+ require 'forwardable'
4
+
5
+ module OmniCat
6
+ class Classifier
7
+ extend Forwardable
8
+
9
+ # classification strategy
10
+ attr_accessor :strategy
11
+
12
+ # delegate category methods
13
+ def_delegators :@strategy, :add_category, :add_categories
14
+
15
+ # delegate training methods
16
+ def_delegators :@strategy, :train, :train_batch, :untrain, :untrain_batch
17
+
18
+ # delegate classification methods
19
+ def_delegators :@strategy, :classify, :classify_batch
20
+
21
+ # delegate base methods
22
+ def_delegator :@strategy, :to_hash
23
+
24
+ # nodoc
25
+ def initialize(classifier)
26
+ @strategy = classifier
27
+ end
28
+
29
+ def strategy=(classifier)
30
+ is_interchangeable?(classifier)
31
+ if @strategy && classifier.doc_count == 0
32
+ previous_strategy = @strategy
33
+ @strategy = classifier
34
+ # pass previous strategy contents into the new one
35
+ previous_strategy.categories.each do |category_name, category|
36
+ @strategy.add_category(category_name)
37
+ category.docs.each do |_, doc|
38
+ doc.count.times do
39
+ @strategy.train(category_name, doc.content)
40
+ end
41
+ end
42
+ end
43
+ else
44
+ @strategy = classifier
45
+ end
46
+ end
47
+
48
+ private
49
+ def is_interchangeable?(classifier)
50
+ if classifier.category_size_limit
51
+ if @strategy.category_count > classifier.category_size_limit
52
+ raise StandardError,
53
+ 'New classifier category size limit is less than the current classifier\'s category count.'
54
+ end
55
+ end
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,178 @@
1
+ require 'omnicat'
2
+
3
+ module OmniCat
4
+ module Classifiers
5
+ #
6
+ # Author:: Mustafa Turan (mailto:mustafaturan.net@gmail.com)
7
+ # Copyright:: Copyright (c) 2013 Mustafa Turan
8
+ # License:: MIT
9
+ #
10
+ # The class supplies abstract methods for possible text classifiers
11
+ class Strategy < ::OmniCat::Base
12
+ attr_accessor :categories # ::OmniCat::Hash - Hash of categories
13
+ attr_accessor :category_count # Integer - Total category count
14
+ attr_accessor :category_size_limit # Integer - Max allowed category count
15
+ attr_accessor :doc_count # Integer - Total document count
16
+ attr_accessor :token_count # Integer - Total token count
17
+ attr_accessor :uniq_token_count # Integer - Total uniq token count
18
+
19
+ def initialize(strategy_hash = {})
20
+ @categories = ::OmniCat::Hash.new
21
+ @category_count = strategy_hash[:category_count].to_i
22
+ @category_size_limit = strategy_hash[:category_size_limit].to_i
23
+ @doc_count = strategy_hash[:doc_count].to_i
24
+ @token_count = strategy_hash[:token_count].to_i
25
+ @uniq_token_count = strategy_hash[:uniq_token_count].to_i
26
+ end
27
+
28
+ # Abstract method for adding new classification category
29
+ #
30
+ # ==== Parameters
31
+ #
32
+ # * +name+ - Name for category
33
+ #
34
+ def add_category(name)
35
+ not_implemented_error(__callee__)
36
+ end
37
+
38
+ # Allows adding multiple classification categories
39
+ #
40
+ # ==== Parameters
41
+ #
42
+ # * +names+ - Array of categories
43
+ #
44
+ def add_categories(names)
45
+ names.each { |name| add_category(name) }
46
+ end
47
+
48
+ # Abstract method for training the desired category with a document
49
+ #
50
+ # ==== Parameters
51
+ #
52
+ # * +category+ - Name of the category from added categories list
53
+ # * +doc+ - Document text
54
+ #
55
+ def train(category_name, doc)
56
+ not_implemented_error(__callee__)
57
+ end
58
+
59
+ # Train the desired category with multiple documents
60
+ #
61
+ # ==== Parameters
62
+ #
63
+ # * +category+ - Name of the category from added categories list
64
+ # * +docs+ - Array of documents
65
+ #
66
+ def train_batch(category, docs)
67
+ docs.each { |doc| train(category, doc) }
68
+ end
69
+
70
+ # Abstract method for untraining the desired category with a document
71
+ #
72
+ # ==== Parameters
73
+ #
74
+ # * +category+ - Name of the category from added categories list
75
+ # * +doc+ - Document text
76
+ #
77
+ def untrain(category_name, doc)
78
+ not_implemented_error(__callee__)
79
+ end
80
+
81
+ # Untrain the desired category with multiple documents
82
+ #
83
+ # ==== Parameters
84
+ #
85
+ # * +category+ - Name of the category from added categories list
86
+ # * +docs+ - Array of documents
87
+ #
88
+ def untrain_batch(category, docs)
89
+ docs.each { |doc| untrain(category, doc) }
90
+ end
91
+
92
+ # Abstract method for classifying the given document
93
+ #
94
+ # ==== Parameters
95
+ #
96
+ # * +doc+ - The document for classification
97
+ #
98
+ # ==== Returns
99
+ #
100
+ # * +result+ - OmniCat::Result object
101
+ #
102
+ def classify(doc)
103
+ not_implemented_error(__callee__)
104
+ end
105
+
106
+ # Classify the multiple documents at a time
107
+ #
108
+ # ==== Parameters
109
+ #
110
+ # * +docs+ - Array of documents
111
+ #
112
+ # ==== Returns
113
+ #
114
+ # * +result_set+ - Array of OmniCat::Result objects
115
+ #
116
+ def classify_batch(docs)
117
+ docs.collect { |doc| classify(doc) }
118
+ end
119
+
120
+ private
121
+ # nodoc
122
+ def not_implemented_error(method_name)
123
+ raise NotImplementedError.new("#{self.class.name}##{method_name} method is not implemented!")
124
+ end
125
+
126
+ protected
127
+ # nodoc
128
+ def category_exists?(category_name)
129
+ categories.has_key?(category_name)
130
+ end
131
+
132
+ # nodoc
133
+ def increment_category_count
134
+ @category_count += 1
135
+ end
136
+
137
+ # nodoc
138
+ def decrement_category_count
139
+ @category_count -= 1
140
+ end
141
+
142
+ # nodoc
143
+ def increment_doc_counts(category_name)
144
+ @doc_count += 1
145
+ @categories[category_name].doc_count += 1
146
+ end
147
+
148
+ # nodoc
149
+ def decrement_doc_counts(category_name)
150
+ @doc_count -= 1
151
+ @categories[category_name].doc_count -= 1
152
+ end
153
+
154
+ # nodoc
155
+ def classifiable?
156
+ if category_count < 2
157
+ raise StandardError,
158
+ 'At least 2 categories needed for classification process!'
159
+ false
160
+ elsif doc_avability? == false
161
+ raise StandardError,
162
+ 'Each category must trained with at least one document!'
163
+ false
164
+ else
165
+ true
166
+ end
167
+ end
168
+
169
+ # nodoc
170
+ def doc_avability?
171
+ @categories.each do |_, category|
172
+ return false if category.doc_count == 0
173
+ end
174
+ true
175
+ end
176
+ end
177
+ end
178
+ end
@@ -0,0 +1,18 @@
1
+ require 'omnicat'
2
+
3
+ module OmniCat
4
+ module Classifiers
5
+ module StrategyInternals
6
+ class Category < ::OmniCat::Base
7
+ attr_accessor :doc_count, :docs, :tokens, :token_count
8
+
9
+ def initialize(category_hash = {})
10
+ @doc_count = category_hash[:doc_count].to_i
11
+ @docs = category_hash[:docs] || {}
12
+ @tokens = category_hash[:tokens] || {}
13
+ @token_count = category_hash[:token_count].to_i
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,35 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'singleton'
4
+ require 'logger'
5
+
6
+ module OmniCat
7
+ class Configuration
8
+ include Singleton
9
+ attr_accessor :logger
10
+ attr_accessor :exclude_tokens, :logger, :token_patterns
11
+
12
+ def self.default_logger
13
+ logger = Logger.new(STDOUT)
14
+ logger.progname = 'omnicat'
15
+ logger
16
+ end
17
+
18
+ @@defaults = {
19
+ exclude_tokens: ['a','about','across','after','all','almost','also','am','among','an','and','are','as','at','be','because','been','by','did','do','does','else','ever','every','for','from','get','got','had','has','have','he','her','hers','him','his','how','however','i','if','in','into','is','it','its','just','least','let','may','me','might','most','must','my','of','often','on','only','or','other','our','own','rather','said','say','says','she','should','since','so','some','than','that','the','their','them','then','there','these','they','this','tis','to','too','twas','us','wants','was','we','were','what','when','where','which','while','who','whom','will','with','would','yet','you','your'],
20
+ logger: default_logger,
21
+ token_patterns: {
22
+ minus: [/[\s\t\n\r]+/, /(@[\w\d]+)/],
23
+ plus: [/[\p{L}\-0-9]{2,}/, /[\!\?]/, /[\:\)\(\;\-\|]{2,3}/]
24
+ }
25
+ }
26
+
27
+ def self.defaults
28
+ @@defaults
29
+ end
30
+
31
+ def initialize
32
+ @@defaults.each_pair{|k,v| self.send("#{k}=",v)}
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,52 @@
1
+ # encoding: UTF-8
2
+ require File.dirname(__FILE__) + '/base'
3
+
4
+ module OmniCat
5
+ class Doc < ::OmniCat::Base
6
+ attr_reader :content, :count, :tokens
7
+
8
+ def initialize(doc_hash = {})
9
+ @content = doc_hash[:content]
10
+ @count = (doc_hash[:count] || 1).to_i
11
+ @tokens = tokenize_with_counts unless @tokens.is_a?(Hash)
12
+ end
13
+
14
+ def increment_count
15
+ @count += 1
16
+ end
17
+
18
+ def decrement_count
19
+ @count -= 1 if @count > 0
20
+ end
21
+
22
+ private
23
+ # nodoc
24
+ def minus_tokens
25
+ body = @content
26
+ OmniCat.config.token_patterns[:minus].each { |p| body.gsub!(p, ' ') }
27
+ body
28
+ end
29
+
30
+ # nodoc
31
+ def plus_tokens(body)
32
+ body_tokens = []
33
+ OmniCat.config.token_patterns[:plus].each { |p| body_tokens += body.scan(p) }
34
+ body_tokens
35
+ end
36
+
37
+ # nodoc
38
+ def exclude_tokens
39
+ OmniCat.config.exclude_tokens
40
+ end
41
+
42
+ # nodoc
43
+ def tokenize_with_counts
44
+ tokenize.hashify_with_counts
45
+ end
46
+
47
+ # nodoc
48
+ def tokenize
49
+ plus_tokens(minus_tokens) - exclude_tokens
50
+ end
51
+ end
52
+ end
@@ -1,3 +1,5 @@
1
+ require File.dirname(__FILE__) + '/base'
2
+
1
3
  module OmniCat
2
4
  class Result < ::OmniCat::Base
3
5
  attr_accessor :category, :scores, :total_score
@@ -1,3 +1,3 @@
1
1
  module OmniCat
2
- VERSION = "0.1.3"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -0,0 +1 @@
1
+ require File.dirname(__FILE__) + '/classifiers/strategy_test'
@@ -0,0 +1,46 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', '..', 'test_helper'))
2
+
3
+ class TestStrategy < Test::Unit::TestCase
4
+ def setup
5
+ @strategy = OmniCat::Classifiers::Strategy.new
6
+ end
7
+
8
+ def test_add_category
9
+ assert_raise(NotImplementedError) { @strategy.add_category("positive") }
10
+ end
11
+
12
+ def test_add_categories
13
+ assert_raise(NotImplementedError) { @strategy.add_categories(
14
+ ["neutral", "positive", "negative"]) }
15
+ end
16
+
17
+ def test_train
18
+ assert_raise(NotImplementedError) { @strategy.train("positive", "good") }
19
+ end
20
+
21
+ def test_train_batch
22
+ assert_raise(NotImplementedError) {
23
+ @strategy.train_batch("positive", ["good job ever", "valid syntax",
24
+ "best moments of my life"])
25
+ }
26
+ end
27
+
28
+ def test_untrain
29
+ assert_raise(NotImplementedError) { @strategy.untrain("positive", "good") }
30
+ end
31
+
32
+ def test_untrain_batch
33
+ assert_raise(NotImplementedError) { @strategy.untrain_batch(
34
+ "positive", ["good work", "well done"]) }
35
+ end
36
+
37
+ def test_classify
38
+ assert_raise(NotImplementedError) { @strategy.classify("good job") }
39
+ end
40
+
41
+ def test_classify_batch
42
+ assert_raise(NotImplementedError) {
43
+ @strategy.classify_batch(["good job", "you did well"])
44
+ }
45
+ end
46
+ end
@@ -0,0 +1,40 @@
1
+ # encoding: UTF-8
2
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
3
+
4
+ class TestDoc < Test::Unit::TestCase
5
+ def setup
6
+ OmniCat.configure do |config|
7
+ config.exclude_tokens = ["was", "at", "by"]
8
+ config.token_patterns = {
9
+ minus: [/[\s\t\n\r]+/, /(@[\w\d]+)/],
10
+ plus: [/[\p{L}\-0-9]{2,}/, /[\!\?]/, /[\:\)\(\;\-\|]{2,3}/]
11
+ }
12
+ end
13
+ @doc = OmniCat::Doc.new(
14
+ content: "omnicat v-01 was written at 2011, omnicat by @mustafaturan"
15
+ )
16
+ end
17
+
18
+ def test_omnicat_tokenize
19
+ assert_equal(
20
+ {"omnicat" => 2, "v-01" => 1, "written" => 1, "2011" => 1},
21
+ @doc.tokens
22
+ )
23
+ end
24
+
25
+ def test_increment_count
26
+ @doc.increment_count
27
+ assert_equal(2, @doc.count)
28
+ end
29
+
30
+ def test_decrement_count
31
+ @doc.decrement_count
32
+ assert_equal(0, @doc.count)
33
+ end
34
+
35
+ def test_decrement_count_if_zero
36
+ @doc.decrement_count
37
+ @doc.decrement_count
38
+ assert_equal(0, @doc.count)
39
+ end
40
+ end
@@ -2,9 +2,11 @@ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
2
2
 
3
3
  class TestHash < Test::Unit::TestCase
4
4
  def test_to_hash
5
- categories_hash = { "pos" => { doc_count: 0, prior: 0.0, tokens: {}, token_count: 0 } }
5
+ categories_hash = {
6
+ "pos" => { doc_count: 0, docs: {}, tokens: {}, token_count: 0 }
7
+ }
6
8
  categories = OmniCat::Hash.new
7
- categories["pos"] = OmniCat::Classifiers::BayesInternals::Category.new(categories_hash["pos"])
9
+ categories["pos"] = OmniCat::Classifiers::StrategyInternals::Category.new(categories_hash["pos"])
8
10
  assert_equal(categories_hash, categories.to_hash)
9
11
  end
10
12
  end
metadata CHANGED
@@ -1,18 +1,20 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: omnicat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
+ prerelease:
5
6
  platform: ruby
6
7
  authors:
7
8
  - Mustafa Turan
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2013-06-18 00:00:00.000000000 Z
12
+ date: 2013-07-06 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
15
  name: bundler
15
16
  requirement: !ruby/object:Gem::Requirement
17
+ none: false
16
18
  requirements:
17
19
  - - ~>
18
20
  - !ruby/object:Gem::Version
@@ -20,6 +22,7 @@ dependencies:
20
22
  type: :development
21
23
  prerelease: false
22
24
  version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
23
26
  requirements:
24
27
  - - ~>
25
28
  - !ruby/object:Gem::Version
@@ -27,15 +30,17 @@ dependencies:
27
30
  - !ruby/object:Gem::Dependency
28
31
  name: rake
29
32
  requirement: !ruby/object:Gem::Requirement
33
+ none: false
30
34
  requirements:
31
- - - '>='
35
+ - - ! '>='
32
36
  - !ruby/object:Gem::Version
33
37
  version: '0'
34
38
  type: :development
35
39
  prerelease: false
36
40
  version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
37
42
  requirements:
38
- - - '>='
43
+ - - ! '>='
39
44
  - !ruby/object:Gem::Version
40
45
  version: '0'
41
46
  description: A generalized framework for text classifications.
@@ -45,6 +50,7 @@ executables: []
45
50
  extensions: []
46
51
  extra_rdoc_files: []
47
52
  files:
53
+ - .gitignore
48
54
  - .travis.yml
49
55
  - CHANGELOG.txt
50
56
  - Gemfile
@@ -54,43 +60,44 @@ files:
54
60
  - lib/omnicat.rb
55
61
  - lib/omnicat/array.rb
56
62
  - lib/omnicat/base.rb
57
- - lib/omnicat/bayes.rb
58
- - lib/omnicat/classifiers/base.rb
59
- - lib/omnicat/classifiers/bayes.rb
60
- - lib/omnicat/classifiers/bayes_internals/category.rb
63
+ - lib/omnicat/classifier.rb
64
+ - lib/omnicat/classifiers/strategy.rb
65
+ - lib/omnicat/classifiers/strategy_internals/category.rb
66
+ - lib/omnicat/configuration.rb
67
+ - lib/omnicat/doc.rb
61
68
  - lib/omnicat/hash.rb
62
69
  - lib/omnicat/result.rb
63
- - lib/omnicat/string.rb
64
70
  - lib/omnicat/version.rb
65
71
  - lib/test/test_helper.rb
66
72
  - lib/test/unit/array_test.rb
67
- - lib/test/unit/base_test.rb
68
- - lib/test/unit/bayes_test.rb
73
+ - lib/test/unit/classifier_test.rb
74
+ - lib/test/unit/classifiers/strategy_test.rb
75
+ - lib/test/unit/doc_test.rb
69
76
  - lib/test/unit/hash_test.rb
70
- - lib/test/unit/string_test.rb
71
77
  - omnicat.gemspec
72
78
  homepage: https://github.com/mustafaturan/omnicat
73
79
  licenses:
74
80
  - MIT
75
- metadata: {}
76
81
  post_install_message:
77
82
  rdoc_options: []
78
83
  require_paths:
79
84
  - lib
80
85
  required_ruby_version: !ruby/object:Gem::Requirement
86
+ none: false
81
87
  requirements:
82
- - - '>='
88
+ - - ! '>='
83
89
  - !ruby/object:Gem::Version
84
90
  version: '0'
85
91
  required_rubygems_version: !ruby/object:Gem::Requirement
92
+ none: false
86
93
  requirements:
87
- - - '>='
94
+ - - ! '>='
88
95
  - !ruby/object:Gem::Version
89
96
  version: '0'
90
97
  requirements: []
91
98
  rubyforge_project:
92
- rubygems_version: 2.0.3
99
+ rubygems_version: 1.8.23
93
100
  signing_key:
94
- specification_version: 4
101
+ specification_version: 3
95
102
  summary: A generalized framework for text classifications.
96
103
  test_files: []
checksums.yaml DELETED
@@ -1,7 +0,0 @@
1
- ---
2
- SHA1:
3
- metadata.gz: ea920e881bd63f956dd1237f666d008f893668af
4
- data.tar.gz: f9d1ec2fe73eb047c5ac661c42600cff033fd35f
5
- SHA512:
6
- metadata.gz: 4c65cec9bf29fc07b9b0f0eee51da3bfc40f2ba8e443daf287b3e76f499b9084e8526baeb7b7319acd7eeda826ff9a892a0e761848d23e52af2e4545cfbd60ff
7
- data.tar.gz: 3f153307273e1c94bea62399a1d1f8d039b4c17956187779f08726429329a84acbce2ede51c7ade3c2ef2b1a778f37da664ae9855144f07a2c906f23d0ee5d80
data/lib/omnicat/bayes.rb DELETED
@@ -1,3 +0,0 @@
1
- require File.dirname(__FILE__) + '/classifiers/base'
2
- require File.dirname(__FILE__) + '/classifiers/bayes'
3
- require File.dirname(__FILE__) + '/classifiers/bayes_internals/category'
@@ -1,55 +0,0 @@
1
- module OmniCat
2
- module Classifiers
3
- class Base < ::OmniCat::Base
4
- # Allows adding multiple classification categories
5
- #
6
- # ==== Parameters
7
- #
8
- # * +names+ - Array of categories
9
- #
10
- # ==== Examples
11
- #
12
- # # Add multiple categories for classification
13
- # bayes.add_categories(["positive", "negative", "neutral"])
14
- def add_categories(names)
15
- names.each { |name| add_category(name) }
16
- end
17
-
18
- # Train the desired category with multiple documents
19
- #
20
- # ==== Parameters
21
- #
22
- # * +category+ - Name of the category from added categories list
23
- # * +docs+ - Array of documents
24
- #
25
- # ==== Examples
26
- #
27
- # # Add multiple docs for training the category
28
- # bayes.train("positive", ["clear documentation", "good, very well"])
29
- # bayes.train("negative", ["bad interface", "damn"])
30
- def train_batch(category, docs)
31
- docs.each { |doc| train(category, doc) }
32
- end
33
-
34
- # Classify the multiple documents at a time
35
- #
36
- # ==== Parameters
37
- #
38
- # * +docs+ - Array of documents
39
- #
40
- # ==== Returns
41
- #
42
- # * +result_set+ - Array of OmniCat::Result objects
43
- #
44
- # ==== Examples
45
- #
46
- # # Classify multiple documents
47
- # bayes.classify_batch(["good documentation", "damn workin again"])
48
- # =>
49
- def classify_batch(docs)
50
- docs.collect { |doc| classify(doc) }
51
- end
52
-
53
- end
54
- end
55
- end
@@ -1,174 +0,0 @@
1
- module OmniCat
2
- module Classifiers
3
- class Bayes < ::OmniCat::Classifiers::Base
4
-
5
- attr_accessor :categories # ::OmniCat::Hash - Hash of categories
6
- attr_accessor :category_count # Integer - Total category count
7
- attr_accessor :doc_count # Integer - Total token count
8
- attr_accessor :token_count # Integer - Total token count
9
- attr_accessor :uniq_token_count # Integer - Total uniq token count
10
- attr_accessor :k_value # Integer - Helper value for skipping some Bayes algorithm errors
11
-
12
- def initialize(bayes_hash = {})
13
- self.categories = ::OmniCat::Hash.new
14
- if bayes_hash.has_key?(:categories)
15
- bayes_hash[:categories].each do |name, category|
16
- self.categories[name] = ::OmniCat::Classifiers::BayesInternals::Category.new(category)
17
- end
18
- end
19
- self.category_count = bayes_hash[:category_count].to_i
20
- self.doc_count = bayes_hash[:doc_count].to_i
21
- self.k_value = bayes_hash[:k_value] || 1.0
22
- self.token_count = bayes_hash[:token_count].to_i
23
- self.uniq_token_count = bayes_hash[:uniq_token_count].to_i
24
- end
25
-
26
- # Allows adding new classification category
27
- #
28
- # ==== Parameters
29
- #
30
- # * +name+ - Name for category
31
- #
32
- # ==== Examples
33
- #
34
- # # Create a classification category
35
- # bayes = Bayes.new
36
- # bayes.add_category("positive")
37
- def add_category(name)
38
- if category_exists?(name)
39
- raise StandardError,
40
- "Category with name '#{name}' is already exists!"
41
- else
42
- self.category_count +=1
43
- self.categories[name] = ::OmniCat::Classifiers::BayesInternals::Category.new
44
- end
45
- end
46
-
47
- # Train the desired category with a document
48
- #
49
- # ==== Parameters
50
- #
51
- # * +category+ - Name of the category from added categories list
52
- # * +doc+ - Document text
53
- #
54
- # ==== Examples
55
- #
56
- # # Train the desired category
57
- # bayes.train("positive", "clear documentation")
58
- # bayes.train("positive", "good, very well")
59
- # bayes.train("negative", "bad dog")
60
- # bayes.train("neutral", "how is the management gui")
61
- def train(category_name, doc)
62
- if category_exists?(category_name)
63
- increment_doc_counts(category_name)
64
- update_priors
65
- doc.tokenize_with_counts.each do |token, count|
66
- increment_token_counts(category_name, token, count)
67
- self.categories[category_name].tokens[token] = self.categories[category_name].tokens[token].to_i + count
68
- end
69
- else
70
- raise StandardError,
71
- "Category with name '#{category_name}' does not exist!"
72
- end
73
- end
74
-
75
- # Classify the given document
76
- #
77
- # ==== Parameters
78
- #
79
- # * +doc+ - The document for classification
80
- #
81
- # ==== Returns
82
- #
83
- # * +result+ - OmniCat::Result object
84
- #
85
- # ==== Examples
86
- #
87
- # # Classify a document
88
- # bayes.classify("good documentation")
89
- # =>
90
- def classify(doc)
91
- if category_count < 2
92
- return raise StandardError,
93
- "At least 2 categories needed for classification process!"
94
- end
95
- score = -1000000
96
- result = ::OmniCat::Result.new
97
- self.categories.each do |category_name, category|
98
- result.scores[category_name] = doc_probability(category, doc)
99
- if result.scores[category_name] > score
100
- result.category[:name] = category_name
101
- score = result.scores[category_name]
102
- end
103
- result.total_score += result.scores[category_name]
104
- end
105
- result.total_score = 1 if result.total_score == 0
106
- result.category[:percentage] = (
107
- result.scores[result.category[:name]] * 100.0 /
108
- result.total_score
109
- ).floor
110
- result
111
- end
112
-
113
- private
114
- # nodoc
115
- def category_exists?(category_name)
116
- categories.has_key?(category_name)
117
- end
118
-
119
- # nodoc
120
- def increment_doc_counts(category_name)
121
- self.doc_count += 1
122
- self.categories[category_name].doc_count += 1
123
- end
124
-
125
- # nodoc
126
- def update_priors
127
- self.categories.each do |_, category|
128
- category.prior = category.doc_count / doc_count.to_f
129
- end
130
- end
131
-
132
- # nodoc
133
- def increment_token_counts(category_name, token, count)
134
- increment_uniq_token_count(token)
135
- self.token_count += count
136
- self.categories[category_name].token_count += count
137
- end
138
-
139
- # nodoc
140
- def increment_uniq_token_count(token)
141
- uniq_token_addition = 0
142
- categories.each do |_, category|
143
- if category.tokens.has_key?(token)
144
- uniq_token_addition = 1
145
- break
146
- end
147
- end
148
- self.uniq_token_count += 1 if uniq_token_addition == 0
149
- end
150
-
151
- # nodoc
152
- def doc_probability(category, doc)
153
- score = k_value
154
- doc.tokenize_with_counts.each do |token, count|
155
- score *= token_probability(category, token, count)
156
- end
157
- category.prior * score
158
- end
159
-
160
- # nodoc
161
- def token_probability(category, token, count)
162
- if category.tokens[token].to_i == 0
163
- k_value / token_count
164
- else
165
- count * (
166
- (category.tokens[token].to_i + k_value) /
167
- (category.token_count + uniq_token_count)
168
- )
169
- end
170
- end
171
-
172
- end
173
- end
174
- end
@@ -1,16 +0,0 @@
1
- module OmniCat
2
- module Classifiers
3
- module BayesInternals
4
- class Category < ::OmniCat::Base
5
- attr_accessor :doc_count, :prior, :tokens, :token_count
6
-
7
- def initialize(category_hash = {})
8
- self.doc_count = category_hash[:doc_count].to_i
9
- self.prior = category_hash[:prior].to_f
10
- self.tokens = category_hash[:tokens] || {}
11
- self.token_count = category_hash[:token_count].to_i
12
- end
13
- end
14
- end
15
- end
16
- end
@@ -1,10 +0,0 @@
1
- # encoding: UTF-8
2
- class String
3
- def omnicat_tokenize
4
- self.scan(/([\p{L}\-0-9]{2,})/).collect{ |str_arr| str_arr.first }
5
- end
6
-
7
- def tokenize_with_counts
8
- self.omnicat_tokenize.hashify_with_counts
9
- end
10
- end
@@ -1,49 +0,0 @@
1
- require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
2
-
3
- class TestBase < Test::Unit::TestCase
4
- def setup
5
- @bayes = OmniCat::Classifiers::Bayes.new
6
- end
7
-
8
- def test_add_categories
9
- @bayes.add_categories ["neutral", "positive", "negative"]
10
- assert_not_nil(@bayes.categories["neutral"])
11
- assert_equal(
12
- ["neutral", "positive", "negative"],
13
- @bayes.categories.keys
14
- )
15
- end
16
-
17
- def test_train_batch
18
- @bayes.add_category "positive"
19
- @bayes.train_batch "positive", ["good job ever", "valid syntax",
20
- "best moments of my life"]
21
- assert_equal(
22
- 3,
23
- @bayes.categories["positive"].doc_count
24
- )
25
- end
26
-
27
- def test_classify_batch
28
- @bayes.add_category "positive"
29
- @bayes.add_category "negative"
30
- @bayes.train_batch "positive", ["good job ever", "valid syntax",
31
- "best moments of my life"]
32
- @bayes.train_batch("negative", ["bad work", "awfull day", "never liked it"])
33
- results = @bayes.classify_batch(
34
- ["good sytanx research", "bad words"]
35
- )
36
-
37
- assert_equal(2, results.count)
38
-
39
- assert_equal(
40
- "positive",
41
- results[0].category[:name]
42
- )
43
- assert_equal(
44
- "negative",
45
- results[1].category[:name]
46
- )
47
-
48
- end
49
- end
@@ -1,85 +0,0 @@
1
- require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
2
-
3
- class TestBayes < Test::Unit::TestCase
4
- def setup
5
- @bayes = OmniCat::Classifiers::Bayes.new
6
- end
7
-
8
- def test_add_category
9
- @bayes.add_category "neutral"
10
- assert_not_nil(@bayes.categories["neutral"])
11
- assert_equal(
12
- ["neutral"],
13
- @bayes.categories.keys
14
- )
15
- assert_equal(
16
- 0,
17
- @bayes.categories["neutral"].doc_count
18
- )
19
- assert_equal(
20
- {},
21
- @bayes.categories["neutral"].tokens
22
- )
23
- assert_equal(
24
- 0,
25
- @bayes.categories["neutral"].token_count
26
- )
27
- end
28
-
29
- def test_add_category_that_already_exists
30
- @bayes.add_category "neutral"
31
- assert_raise(StandardError) { @bayes.add_category "neutral" }
32
- end
33
-
34
- def test_train_valid_category
35
- @bayes.add_category "neutral"
36
- @bayes.train "neutral", "how are you?"
37
- assert_equal(
38
- 1,
39
- @bayes.categories["neutral"].doc_count
40
- )
41
- assert_equal(
42
- {"how" => 1, "are" => 1, "you" => 1},
43
- @bayes.categories["neutral"].tokens
44
- )
45
- assert_equal(
46
- 3,
47
- @bayes.categories["neutral"].token_count
48
- )
49
- end
50
-
51
- def test_train_missing_category
52
- assert_raise(StandardError) { @bayes.train "neutral", "how are you?" }
53
- end
54
-
55
- def test_classify
56
- @bayes.add_category "positive"
57
- @bayes.add_category "negative"
58
- @bayes.train("positive", "good job")
59
- @bayes.train("negative", "bad work")
60
- assert_equal(
61
- "positive",
62
- @bayes.classify("very good position for this sentence").category[:name]
63
- )
64
- assert_equal(
65
- "negative",
66
- @bayes.classify("bad words").category[:name]
67
- )
68
- end
69
-
70
- def test_initialize_with_hash
71
- bayes1 = ::OmniCat::Classifiers::Bayes.new
72
- bayes1.add_category "positive"
73
- bayes1.add_category "negative"
74
- bayes1.train("positive", "good job")
75
- bayes1.train("negative", "bad work")
76
- h1 = bayes1.to_hash
77
-
78
- bayes2 = ::OmniCat::Classifiers::Bayes.new(h1)
79
- assert_equal(h1, bayes2.to_hash)
80
- end
81
-
82
- def test_classify_with_insufficient_categories
83
- assert_raise(StandardError) { @bayes.classify "blank" }
84
- end
85
- end
@@ -1,17 +0,0 @@
1
- require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
2
-
3
- class TestString < Test::Unit::TestCase
4
- def test_omnicat_tokenize
5
- assert_equal(
6
- ["mustafa", "turan", "omni-cat-v0", "1986"],
7
- "mustafa turan omni-cat-v0 1986 1 a s d".omnicat_tokenize
8
- )
9
- end
10
-
11
- def test_tokenize_with_counts
12
- assert_equal(
13
- {"omnicat" => 2, "written" => 1, "at" => 1, "2011" => 1},
14
- "omnicat written at 2011, omnicat".tokenize_with_counts
15
- )
16
- end
17
- end