omnicat 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +99 -0
- data/Rakefile +1 -0
- data/lib/omnicat.rb +7 -0
- data/lib/omnicat/array.rb +9 -0
- data/lib/omnicat/base.rb +16 -0
- data/lib/omnicat/bayes.rb +3 -0
- data/lib/omnicat/classifiers/base.rb +55 -0
- data/lib/omnicat/classifiers/bayes.rb +127 -0
- data/lib/omnicat/classifiers/bayes_internals/category.rb +15 -0
- data/lib/omnicat/hash.rb +13 -0
- data/lib/omnicat/result.rb +11 -0
- data/lib/omnicat/string.rb +9 -0
- data/lib/omnicat/version.rb +3 -0
- data/lib/test/test_helper.rb +2 -0
- data/lib/test/unit/array_test.rb +10 -0
- data/lib/test/unit/base_test.rb +49 -0
- data/lib/test/unit/bayes_test.rb +85 -0
- data/lib/test/unit/hash_test.rb +10 -0
- data/lib/test/unit/string_test.rb +17 -0
- data/omnicat.gemspec +23 -0
- metadata +95 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: 1468554ebab3a7d69abfb6b806ed2acc474a0fbb
|
|
4
|
+
data.tar.gz: 7cafcd36ce8030ba19dc1f48f3be8cc016f49c1d
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: ce946b646239bfec96ce58bc296d9b866c72603d5113b56fb4c65418b14457fad6072026fe90330250e04dc4467476d874fdd29327253793b10fdf5a90526f37
|
|
7
|
+
data.tar.gz: 1281d217ed03836696b9025e368f274941b481ba42d94af5de045fb7c06a6ee231ae8c6bec4e0758fe6e438f707559322f955f60b4c26b7f0e3c1ad3993328b1
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Copyright (c) 2013 Mustafa Turan
|
|
2
|
+
|
|
3
|
+
MIT License
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
6
|
+
a copy of this software and associated documentation files (the
|
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
11
|
+
the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be
|
|
14
|
+
included in all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# OmniCat
|
|
2
|
+
|
|
3
|
+
A generalized framework for text classification. For now, it supports only the Naive Bayes algorithm.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
Add this line to your application's Gemfile:
|
|
8
|
+
|
|
9
|
+
gem 'omnicat'
|
|
10
|
+
|
|
11
|
+
And then execute:
|
|
12
|
+
|
|
13
|
+
$ bundle
|
|
14
|
+
|
|
15
|
+
Or install it yourself as:
|
|
16
|
+
|
|
17
|
+
$ gem install omnicat
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
|
|
21
|
+
See rdoc for detailed usage.
|
|
22
|
+
|
|
23
|
+
### Bayes classifier
|
|
24
|
+
Create a Bayes classifier object.
|
|
25
|
+
|
|
26
|
+
bayes = OmniCat::Classifiers::Bayes.new
|
|
27
|
+
|
|
28
|
+
### Create categories
|
|
29
|
+
Create a classification category.
|
|
30
|
+
|
|
31
|
+
bayes.add_category('positive')
|
|
32
|
+
bayes.add_category('negative')
|
|
33
|
+
|
|
34
|
+
### Train
|
|
35
|
+
Train category with a document.
|
|
36
|
+
|
|
37
|
+
bayes.train('positive', 'great if you are in a slap happy mood .')
|
|
38
|
+
bayes.train('negative', 'bad tracking issue')
|
|
39
|
+
|
|
40
|
+
### Train batch
|
|
41
|
+
Train category with multiple documents.
|
|
42
|
+
|
|
43
|
+
bayes.train_batch('positive', [
|
|
44
|
+
'a feel-good picture in the best sense of the term...',
|
|
45
|
+
'it is a feel-good movie about which you can actually feel good.',
|
|
46
|
+
'love and money both of them are good choices'
|
|
47
|
+
])
|
|
48
|
+
bayes.train_batch('negative', [
|
|
49
|
+
'simplistic , silly and tedious .',
|
|
50
|
+
'interesting , but not compelling . ',
|
|
51
|
+
'seems clever but not especially compelling'
|
|
52
|
+
])
|
|
53
|
+
|
|
54
|
+
### Classify
|
|
55
|
+
Classify a document.
|
|
56
|
+
|
|
57
|
+
result = bayes.classify('I feel so good and happy')
|
|
58
|
+
=> #<OmniCat::Result:0x007fe59b97b548 @category={:name=>"negative", :percentage=>99}, @scores={"positive"=>1.749909854122994e-07, "negative"=>0.014084507042253521}, @total_score=0.014084682033238934>
|
|
59
|
+
result.to_hash
|
|
60
|
+
=> {:category=>{:name=>"negative", :percentage=>99}, :scores=>{"positive"=>1.749909854122994e-07, "negative"=>0.014084507042253521}, :total_score=>0.014084682033238934}
|
|
61
|
+
|
|
62
|
+
### Classify batch
|
|
63
|
+
Classify multiple documents at a time.
|
|
64
|
+
|
|
65
|
+
results = bayes.classify_batch(
|
|
66
|
+
[
|
|
67
|
+
'the movie is silly so not compelling enough',
|
|
68
|
+
'a good piece of work'
|
|
69
|
+
]
|
|
70
|
+
)
|
|
71
|
+
=> [#<OmniCat::Result:0x007fe59b949d90 @category={:name=>"negative", :percentage=>75}, @scores={"positive"=>7.962089836259623e-06, "negative"=>2.5145916163515512e-05}, @total_score=3.3108005999775135e-05>, #<OmniCat::Result:0x007fe59c9d7d10 @category={:name=>"positive", :percentage=>100}, @scores={"positive"=>0.0005434126313247192, "negative"=>0}, @total_score=0.0005434126313247192>]
|
|
72
|
+
|
|
73
|
+
### Convert to hash
|
|
74
|
+
Convert full Bayes object to hash.
|
|
75
|
+
|
|
76
|
+
# For storing and restoring model data
|
|
77
|
+
bayes_hash = bayes.to_hash
|
|
78
|
+
|
|
79
|
+
### Load from hash
|
|
80
|
+
Load full Bayes object from hash.
|
|
81
|
+
|
|
82
|
+
another_bayes_obj = OmniCat::Classifiers::Bayes.new(bayes_hash)
|
|
83
|
+
another_bayes_obj.classify('best senses')
|
|
84
|
+
|
|
85
|
+
## Todo
|
|
86
|
+
* Add more text classification modules such as Support Vector Machine (SVM).
|
|
87
|
+
* Add text cleaning/manipulation extensions such as a stopword cleaner, a stemmer, and a POS tagger.
|
|
88
|
+
|
|
89
|
+
## Contributing
|
|
90
|
+
|
|
91
|
+
1. Fork it
|
|
92
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
|
93
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
|
94
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
|
95
|
+
5. Create new Pull Request
|
|
96
|
+
|
|
97
|
+
## Copyright
|
|
98
|
+
Copyright © 2013 Mustafa Turan. See LICENSE for details.
|
|
99
|
+
|
data/Rakefile
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
require "bundler/gem_tasks"
|
data/lib/omnicat.rb
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
require File.dirname(__FILE__) + '/omnicat/version'
|
|
2
|
+
require File.dirname(__FILE__) + '/omnicat/string'
|
|
3
|
+
require File.dirname(__FILE__) + '/omnicat/array'
|
|
4
|
+
require File.dirname(__FILE__) + '/omnicat/hash'
|
|
5
|
+
require File.dirname(__FILE__) + '/omnicat/base'
|
|
6
|
+
require File.dirname(__FILE__) + '/omnicat/result'
|
|
7
|
+
require File.dirname(__FILE__) + '/omnicat/bayes'
|
data/lib/omnicat/base.rb
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
module OmniCat
  # Common ancestor of OmniCat objects; supplies Hash serialization.
  class Base
    # Serializes this object's instance variables into a Hash.
    #
    # Keys are the variable names without the leading "@", as symbols.
    # Variables whose value is nil or false are left out. A value whose class
    # name contains "OmniCat" is serialized through its own #to_hash; every
    # other value is stored unchanged.
    #
    # @return [Hash] instance variables keyed by symbolized name
    def to_hash
      instance_variables.each_with_object({}) do |ivar, serialized|
        value = instance_variable_get(ivar)
        next unless value

        omnicat_value = value.class.to_s.include?('OmniCat')
        serialized[ivar.to_s[1..-1].to_sym] = omnicat_value ? value.to_hash : value
      end
    end
  end
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
module OmniCat
  module Classifiers
    # Batch conveniences for classifiers. Every method here loops over its
    # input and delegates each element to a single-item method
    # (+add_category+, +train+, +classify+) that this class does not define.
    class Base < ::OmniCat::Base
      # Registers several classification categories in one call
      #
      # ==== Parameters
      #
      # * +names+ - Array of category names
      #
      # ==== Examples
      #
      #   # Register three categories at once
      #   bayes.add_categories(["positive", "negative", "neutral"])
      def add_categories(names)
        names.each do |category_name|
          add_category(category_name)
        end
      end

      # Trains one category with several documents
      #
      # ==== Parameters
      #
      # * +category+ - Name of a category that has already been added
      # * +docs+ - Array of documents
      #
      # ==== Examples
      #
      #   # Feed several documents to each category
      #   bayes.train_batch("positive", ["clear documentation", "good, very well"])
      #   bayes.train_batch("negative", ["bad interface", "damn"])
      def train_batch(category, docs)
        docs.each do |document|
          train(category, document)
        end
      end

      # Classifies several documents in one call
      #
      # ==== Parameters
      #
      # * +docs+ - Array of documents
      #
      # ==== Returns
      #
      # * +result_set+ - Array holding the +classify+ result of each document,
      #   in the same order as +docs+
      #
      # ==== Examples
      #
      #   # Classify two documents
      #   bayes.classify_batch(["good documentation", "damn workin again"])
      def classify_batch(docs)
        docs.map { |document| classify(document) }
      end
    end
  end
end
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
module OmniCat
  module Classifiers
    # Naive Bayes text classifier. Keeps per-category token statistics that
    # are filled in by #train and consulted by #classify.
    class Bayes < ::OmniCat::Classifiers::Base
      # categories     - category name => BayesInternals::Category
      # category_count - number of categories added so far
      # doc_count      - number of documents trained across all categories
      # token_count    - sum of token counts trained across all categories
      attr_accessor :categories, :category_count, :doc_count, :token_count
      # Constant added to a matched token's count while scoring; it is also the
      # starting value of every category score. Defaults to 1.0.
      attr_accessor :k_value

      # Builds an empty classifier, or restores one from a Hash such as the
      # one produced by #to_hash.
      #
      # ==== Parameters
      #
      # * +bayes_hash+ - Optional Hash with the keys +:categories+,
      #   +:category_count+, +:doc_count+, +:k_value+ and +:token_count+.
      #   Missing counters become 0 and a missing +:k_value+ becomes 1.0.
      def initialize(bayes_hash = {})
        self.categories = ::OmniCat::Hash.new
        if bayes_hash.has_key?(:categories)
          bayes_hash[:categories].each do |name, category|
            self.categories[name] = ::OmniCat::Classifiers::BayesInternals::Category.new(category)
          end
        end
        self.category_count = bayes_hash[:category_count].to_i
        self.doc_count = bayes_hash[:doc_count].to_i
        self.k_value = bayes_hash[:k_value] || 1.0
        self.token_count = bayes_hash[:token_count].to_i
      end

      # Allows adding new classification category
      #
      # ==== Parameters
      #
      # * +name+ - Name for category
      #
      # ==== Raises
      #
      # * +StandardError+ - when a category with that name already exists
      #
      # ==== Examples
      #
      #   # Create a classification category
      #   bayes = Bayes.new
      #   bayes.add_category("positive")
      def add_category(name)
        if category_exists?(name)
          raise StandardError,
                "Category with name '#{name}' is already exists!"
        end

        self.category_count += 1
        self.categories[name] = ::OmniCat::Classifiers::BayesInternals::Category.new
      end

      # Train the desired category with a document
      #
      # ==== Parameters
      #
      # * +category+ - Name of the category from added categories list
      # * +doc+ - Document text
      #
      # ==== Raises
      #
      # * +StandardError+ - when the category has not been added
      #
      # ==== Examples
      #
      #   # Train the desired category
      #   bayes.train("positive", "clear documentation")
      #   bayes.train("positive", "good, very well")
      #   bayes.train("negative", "bad dog")
      #   bayes.train("neutral", "how is the management gui")
      def train(category, doc)
        unless category_exists?(category)
          raise StandardError,
                "Category with name '#{category}' does not exist!"
        end

        self.doc_count += 1
        trained = categories[category]
        trained.doc_count += 1
        doc.tokenize_with_counts.each do |token, count|
          # Keep the classifier-wide and the per-category totals in step.
          self.token_count += count
          trained.tokens[token] = trained.tokens[token].to_i + count
          trained.token_count += count
        end
      end

      # Classify the given document
      #
      # ==== Parameters
      #
      # * +doc+ - The document for classification
      #
      # ==== Returns
      #
      # * +result+ - OmniCat::Result object carrying the score of every
      #   category, the winning category (+:name+, +:percentage+) and the
      #   total score
      #
      # ==== Raises
      #
      # * +StandardError+ - when fewer than 2 categories have been added
      #
      # ==== Examples
      #
      #   # Classify a document
      #   bayes.classify("good documentation")
      def classify(doc)
        if category_count < 2
          raise StandardError,
                "At least 2 categories needed for classification process!"
        end

        best_score = -1000000
        result = ::OmniCat::Result.new
        # The document's tokens do not depend on the category: tokenize once.
        doc_tokens = doc.tokenize_with_counts
        categories.each do |name, category|
          prior = category.doc_count / doc_count.to_f
          result.scores[name] = k_value
          # Float denominator: with an Integer k_value the quotient would
          # otherwise be truncated to 0 and wipe out the score.
          denominator = (category.token_count + token_count).to_f
          # NOTE(review): tokens this category has never seen are skipped
          # instead of being smoothed with k_value, so a category that matches
          # more of the document's tokens is multiplied by more sub-1 factors.
          # Confirm this is the intended scoring.
          doc_tokens.each do |token, _count|
            next unless category.tokens.has_key?(token)

            result.scores[name] *= (category.tokens[token].to_i + k_value) / denominator
          end
          # A score still equal to 1.0 is replaced by 0; otherwise weight it
          # by the category prior.
          result.scores[name] = (
            result.scores[name].to_f == 1.0 ? 0 : (prior * result.scores[name])
          )
          if result.scores[name] > best_score
            result.category[:name] = name
            best_score = result.scores[name]
          end
          result.total_score += result.scores[name]
        end
        # Avoid dividing by zero when every category scored 0.
        result.total_score = 1 if result.total_score == 0
        result.category[:percentage] = (
          result.scores[result.category[:name]] * 100.0 /
          result.total_score
        ).floor
        result
      end

      private

      # nodoc
      def category_exists?(category_name)
        categories.has_key?(category_name)
      end
    end
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
module OmniCat
  module Classifiers
    module BayesInternals
      # Training statistics held for a single category: a document count, a
      # token => count table, and a token total.
      class Category < ::OmniCat::Base
        attr_accessor :doc_count, :tokens, :token_count

        # Builds an empty category, or restores one from a Hash.
        #
        # @param category_hash [Hash] optional +:doc_count+, +:tokens+ and
        #   +:token_count+ entries; missing counters become 0 and a missing
        #   token table becomes an empty Hash
        def initialize(category_hash = {})
          @doc_count   = category_hash[:doc_count].to_i
          @tokens      = category_hash[:tokens] || {}
          @token_count = category_hash[:token_count].to_i
        end
      end
    end
  end
end
|
data/lib/omnicat/hash.rb
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
|
|
2
|
+
|
|
3
|
+
# Checks that Array#hashify_with_counts maps each element to the number of
# times it occurs.
class TestArray < Test::Unit::TestCase
  def test_hashify_with_counts
    expected = {"omnicat" => 2, "written" => 1, "at" => 1, "2011" => 1}
    words = ["omnicat", "written", "at", "2011", "omnicat"]

    assert_equal(expected, words.hashify_with_counts)
  end
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
|
|
2
|
+
|
|
3
|
+
# Covers the batch helpers (add_categories, train_batch, classify_batch),
# exercised through a Bayes classifier.
class TestBase < Test::Unit::TestCase
  def setup
    @bayes = OmniCat::Classifiers::Bayes.new
  end

  # Every name is registered, in the order given.
  def test_add_categories
    @bayes.add_categories(["neutral", "positive", "negative"])

    assert_not_nil(@bayes.categories["neutral"])
    assert_equal(["neutral", "positive", "negative"], @bayes.categories.keys)
  end

  # Each document in the batch counts as one trained document.
  def test_train_batch
    @bayes.add_category("positive")
    @bayes.train_batch(
      "positive",
      ["good job ever", "valid syntax", "best moments of my life"]
    )

    assert_equal(3, @bayes.categories["positive"].doc_count)
  end

  # One result per document, in input order.
  def test_classify_batch
    @bayes.add_category("positive")
    @bayes.add_category("negative")
    @bayes.train_batch(
      "positive",
      ["good job ever", "valid syntax", "best moments of my life"]
    )
    @bayes.train_batch("negative", ["bad work", "awfull day", "never liked it"])

    results = @bayes.classify_batch(["good sytanx research", "bad words"])

    assert_equal(2, results.count)
    assert_equal("positive", results[0].category[:name])
    assert_equal("negative", results[1].category[:name])
  end
end
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
|
|
2
|
+
|
|
3
|
+
# Covers the Bayes classifier: category management, training, classification
# and the to_hash / new(hash) round trip.
class TestBayes < Test::Unit::TestCase
  def setup
    @bayes = OmniCat::Classifiers::Bayes.new
  end

  # A freshly added category exists and starts with empty statistics.
  def test_add_category
    @bayes.add_category("neutral")
    neutral = @bayes.categories["neutral"]

    assert_not_nil(neutral)
    assert_equal(["neutral"], @bayes.categories.keys)
    assert_equal(0, @bayes.categories["neutral"].doc_count)
    assert_equal({}, @bayes.categories["neutral"].tokens)
    assert_equal(0, @bayes.categories["neutral"].token_count)
  end

  # Adding the same name twice is rejected.
  def test_add_category_that_already_exists
    @bayes.add_category("neutral")

    assert_raise(StandardError) { @bayes.add_category("neutral") }
  end

  # Training updates the document count, token table and token total.
  def test_train_valid_category
    @bayes.add_category("neutral")
    @bayes.train("neutral", "how are you?")

    assert_equal(1, @bayes.categories["neutral"].doc_count)
    assert_equal(
      {"how" => 1, "are" => 1, "you" => 1},
      @bayes.categories["neutral"].tokens
    )
    assert_equal(3, @bayes.categories["neutral"].token_count)
  end

  # Training a category that was never added is rejected.
  def test_train_missing_category
    assert_raise(StandardError) { @bayes.train("neutral", "how are you?") }
  end

  def test_classify
    @bayes.add_category("positive")
    @bayes.add_category("negative")
    @bayes.train("positive", "good job")
    @bayes.train("negative", "bad work")

    positive_result = @bayes.classify("very good position for this sentence")
    assert_equal("positive", positive_result.category[:name])

    negative_result = @bayes.classify("bad words")
    assert_equal("negative", negative_result.category[:name])
  end

  # A classifier rebuilt from to_hash serializes to the same Hash.
  def test_initialize_with_hash
    bayes1 = ::OmniCat::Classifiers::Bayes.new
    bayes1.add_category("positive")
    bayes1.add_category("negative")
    bayes1.train("positive", "good job")
    bayes1.train("negative", "bad work")
    h1 = bayes1.to_hash

    bayes2 = ::OmniCat::Classifiers::Bayes.new(h1)

    assert_equal(h1, bayes2.to_hash)
  end

  # Classification needs at least two categories.
  def test_classify_with_insufficient_categories
    assert_raise(StandardError) { @bayes.classify("blank") }
  end
end
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
|
|
2
|
+
|
|
3
|
+
# Checks that OmniCat::Hash#to_hash also serializes the Category objects it
# holds.
class TestHash < Test::Unit::TestCase
  def test_to_hash
    categories_hash = { "pos" => { doc_count: 0, tokens: {}, token_count: 0 } }

    categories = OmniCat::Hash.new
    categories["pos"] =
      OmniCat::Classifiers::BayesInternals::Category.new(categories_hash["pos"])

    assert_equal(categories_hash, categories.to_hash)
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
|
|
2
|
+
|
|
3
|
+
# Covers the String helpers used for tokenizing documents.
class TestString < Test::Unit::TestCase
  # The expected list omits the single-character words of the input.
  def test_omnicat_tokenize
    expected = ["mustafa", "turan", "omni-cat-v0", "1986"]
    text = "mustafa turan omni-cat-v0 1986 1 a s d"

    assert_equal(expected, text.omnicat_tokenize)
  end

  # Each token is mapped to the number of times it occurs.
  def test_tokenize_with_counts
    expected = {"omnicat" => 2, "written" => 1, "at" => 1, "2011" => 1}
    text = "omnicat written at 2011, omnicat"

    assert_equal(expected, text.tokenize_with_counts)
  end
end
|
data/omnicat.gemspec
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# coding: utf-8
|
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
|
+
require 'omnicat/version'
|
|
5
|
+
|
|
6
|
+
# Gem specification for omnicat. The file list is taken from `git ls-files`,
# so this must be evaluated inside the git checkout.
Gem::Specification.new do |gem_spec|
  # Identity
  gem_spec.name     = "omnicat"
  gem_spec.version  = OmniCat::VERSION
  gem_spec.authors  = ["Mustafa Turan"]
  gem_spec.email    = ["mustafaturan.net@gmail.com"]
  gem_spec.homepage = "https://github.com/mustafaturan/omnicat"
  gem_spec.license  = "MIT"

  # The summary reuses the description text.
  gem_spec.description = "A generalized framework for text classifications."
  gem_spec.summary     = gem_spec.description

  # Packaged files, derived from what git tracks
  tracked_files = `git ls-files`.split($/)
  gem_spec.files         = tracked_files
  gem_spec.executables   = gem_spec.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem_spec.test_files    = gem_spec.files.grep(%r{^(test|spec|features)/})
  gem_spec.require_paths = ["lib"]

  # Development-only dependencies
  gem_spec.add_development_dependency("bundler", "~> 1.3")
  gem_spec.add_development_dependency("rake")
end
|
metadata
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: omnicat
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Mustafa Turan
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2013-06-15 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: bundler
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - ~>
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '1.3'
|
|
20
|
+
type: :development
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - ~>
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '1.3'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: rake
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - '>='
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '0'
|
|
34
|
+
type: :development
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - '>='
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '0'
|
|
41
|
+
description: A generalized framework for text classifications.
|
|
42
|
+
email:
|
|
43
|
+
- mustafaturan.net@gmail.com
|
|
44
|
+
executables: []
|
|
45
|
+
extensions: []
|
|
46
|
+
extra_rdoc_files: []
|
|
47
|
+
files:
|
|
48
|
+
- .travis.yml
|
|
49
|
+
- Gemfile
|
|
50
|
+
- LICENSE.txt
|
|
51
|
+
- README.md
|
|
52
|
+
- Rakefile
|
|
53
|
+
- lib/omnicat.rb
|
|
54
|
+
- lib/omnicat/array.rb
|
|
55
|
+
- lib/omnicat/base.rb
|
|
56
|
+
- lib/omnicat/bayes.rb
|
|
57
|
+
- lib/omnicat/classifiers/base.rb
|
|
58
|
+
- lib/omnicat/classifiers/bayes.rb
|
|
59
|
+
- lib/omnicat/classifiers/bayes_internals/category.rb
|
|
60
|
+
- lib/omnicat/hash.rb
|
|
61
|
+
- lib/omnicat/result.rb
|
|
62
|
+
- lib/omnicat/string.rb
|
|
63
|
+
- lib/omnicat/version.rb
|
|
64
|
+
- lib/test/test_helper.rb
|
|
65
|
+
- lib/test/unit/array_test.rb
|
|
66
|
+
- lib/test/unit/base_test.rb
|
|
67
|
+
- lib/test/unit/bayes_test.rb
|
|
68
|
+
- lib/test/unit/hash_test.rb
|
|
69
|
+
- lib/test/unit/string_test.rb
|
|
70
|
+
- omnicat.gemspec
|
|
71
|
+
homepage: https://github.com/mustafaturan/omnicat
|
|
72
|
+
licenses:
|
|
73
|
+
- MIT
|
|
74
|
+
metadata: {}
|
|
75
|
+
post_install_message:
|
|
76
|
+
rdoc_options: []
|
|
77
|
+
require_paths:
|
|
78
|
+
- lib
|
|
79
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
80
|
+
requirements:
|
|
81
|
+
- - '>='
|
|
82
|
+
- !ruby/object:Gem::Version
|
|
83
|
+
version: '0'
|
|
84
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
85
|
+
requirements:
|
|
86
|
+
- - '>='
|
|
87
|
+
- !ruby/object:Gem::Version
|
|
88
|
+
version: '0'
|
|
89
|
+
requirements: []
|
|
90
|
+
rubyforge_project:
|
|
91
|
+
rubygems_version: 2.0.3
|
|
92
|
+
signing_key:
|
|
93
|
+
specification_version: 4
|
|
94
|
+
summary: A generalized framework for text classifications.
|
|
95
|
+
test_files: []
|