omnicat 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 65d6fbb05908315286db625537482191aa4067e4
4
+ data.tar.gz: c79f9470ca00165736ed3bc78b8dd9f6578b682a
5
+ SHA512:
6
+ metadata.gz: 069cefc4f20e453dd59af4fb7ad6e46e0b9529bbb63b8dfe998ea3058a0f8f00c308210a3527fa901e422b20af481586a27fe0fd6eab3bd5133f0026e8cb3fea
7
+ data.tar.gz: 79a279215e8eb6984725c9fe302970c233dec11ef58095e8edefec68bf1596e5fd41f07c3a33826b3c26f3917491c8c067114bd83c05286abec68d498b6a0e33
@@ -3,6 +3,7 @@ require File.dirname(__FILE__) + '/omnicat/configuration'
3
3
  require File.dirname(__FILE__) + '/omnicat/array'
4
4
  require File.dirname(__FILE__) + '/omnicat/base'
5
5
  require File.dirname(__FILE__) + '/omnicat/doc'
6
+ require File.dirname(__FILE__) + '/omnicat/score'
6
7
  require File.dirname(__FILE__) + '/omnicat/result'
7
8
  require File.dirname(__FILE__) + '/omnicat/classifier'
8
9
 
@@ -10,7 +10,7 @@ module OmniCat
10
10
  attr_accessor :strategy
11
11
 
12
12
  # delegate category methods
13
- def_delegators :@strategy, :add_category, :add_categories
13
+ def_delegators :@strategy, :add_category, :add_categories, :category_size_limit
14
14
 
15
15
  # delegate training methods
16
16
  def_delegators :@strategy, :train, :train_batch, :untrain, :untrain_batch
@@ -26,30 +26,79 @@ module OmniCat
26
26
  @strategy = classifier
27
27
  end
28
28
 
29
+ # Changes the classifier strategy and trains the new strategy if needed
30
+ #
29
31
  def strategy=(classifier)
30
32
  is_interchangeable?(classifier)
31
- if @strategy && classifier.doc_count == 0
33
+ if @strategy && classifier.category_count == 0
32
34
  previous_strategy = @strategy
33
35
  @strategy = classifier
34
- # pass previous strategy contents into the new one
35
- previous_strategy.categories.each do |category_name, category|
36
- @strategy.add_category(category_name)
37
- category.docs.each do |_, doc|
38
- doc.count.times do
39
- @strategy.train(category_name, doc.content)
40
- end
41
- end
42
- end
36
+ convert_categories_with_docs(previous_strategy)
43
37
  else
44
38
  @strategy = classifier
45
39
  end
46
40
  end
47
41
 
48
42
  private
43
+ # nodoc
44
+ def convert_categories_with_docs(previous_strategy)
45
+ if previous_strategy.categories.is_a?(Hash)
46
+ convert_categories_hash(previous_strategy.categories)
47
+ else
48
+ convert_categories_array(previous_strategy.categories)
49
+ end
50
+ end
51
+
52
+ # nodoc
53
+ def convert_categories_array(categories)
54
+ categories.each do |category|
55
+ convert_category(category)
56
+ end
57
+ end
58
+
59
+ # nodoc
60
+ def convert_categories_hash(categories)
61
+ categories.each do |_, category|
62
+ convert_category(category)
63
+ end
64
+ end
65
+
66
+ # nodoc
67
+ def convert_category(category)
68
+ @strategy.add_category(category.name)
69
+ if category.docs.is_a?(Hash)
70
+ convert_docs_hash(category.name, category.docs)
71
+ else
72
+ convert_docs_array(category.name, category.docs)
73
+ end
74
+ end
75
+
76
+ # nodoc
77
+ def convert_docs_array(category_name, docs)
78
+ docs.each do |doc|
79
+ convert_doc(category_name, doc)
80
+ end
81
+ end
82
+
83
+ # nodoc
84
+ def convert_docs_hash(category_name, docs)
85
+ docs.each do |_, doc|
86
+ convert_doc(category_name, doc)
87
+ end
88
+ end
89
+
90
+ # nodoc
91
+ def convert_doc(category_name, doc)
92
+ doc.count.times do
93
+ @strategy.train(category_name, doc.content)
94
+ end
95
+ end
96
+
97
+ # nodoc
49
98
  def is_interchangeable?(classifier)
50
- if classifier.category_size_limit
99
+ unless classifier.category_size_limit == 0
51
100
  if @strategy.category_count > classifier.category_size_limit
52
- raise StandardError,
101
+ raise StandardError,
53
102
  'New classifier category size limit is less than the current classifier\'s category count.'
54
103
  end
55
104
  end
@@ -9,12 +9,12 @@ module OmniCat
9
9
  #
10
10
  # The class supplies abstract methods for possible text classifiers
11
11
  class Strategy < ::OmniCat::Base
12
- attr_accessor :categories # ::OmniCat::Hash - Hash of categories
12
+ attr_accessor :categories # Hash - Hash of categories
13
13
  attr_accessor :category_count # Integer - Total category count
14
- attr_accessor :category_size_limit # Integer - Max allowed category
15
- attr_accessor :doc_count # Integer - Total token count
14
+ attr_accessor :category_size_limit # Integer - Max allowed category size
15
+ attr_accessor :doc_count # Integer - Total doc count
16
16
  attr_accessor :token_count # Integer - Total token count
17
- attr_accessor :uniq_token_count # Integer - Total uniq token count
17
+ attr_accessor :unique_token_count # Integer - Total unique token count
18
18
 
19
19
  def initialize(strategy_hash = {})
20
20
  @categories = {}
@@ -22,16 +22,16 @@ module OmniCat
22
22
  @category_size_limit = strategy_hash[:category_size_limit].to_i
23
23
  @doc_count = strategy_hash[:doc_count].to_i
24
24
  @token_count = strategy_hash[:token_count].to_i
25
- @uniq_token_count = strategy_hash[:uniq_token_count].to_i
25
+ @unique_token_count = strategy_hash[:unique_token_count].to_i
26
26
  end
27
27
 
28
28
  # Abstract method for adding new classification category
29
29
  #
30
30
  # ==== Parameters
31
31
  #
32
- # * +name+ - Name for category
32
+ # * +category_name+ - Name for category
33
33
  #
34
- def add_category(name)
34
+ def add_category(category_name)
35
35
  not_implemented_error(__callee__)
36
36
  end
37
37
 
@@ -39,20 +39,20 @@ module OmniCat
39
39
  #
40
40
  # ==== Parameters
41
41
  #
42
- # * +names+ - Array of categories
42
+ # * +category_names+ - Array of categories
43
43
  #
44
- def add_categories(names)
45
- names.each { |name| add_category(name) }
44
+ def add_categories(category_names)
45
+ category_names.each { |category_name| add_category(category_name) }
46
46
  end
47
47
 
48
48
  # Abstract method for training the desired category with a document
49
49
  #
50
50
  # ==== Parameters
51
51
  #
52
- # * +category+ - Name of the category from added categories list
53
- # * +doc+ - Document text
52
+ # * +category_name+ - Name of the category from added categories list
53
+ # * +doc_content+ - Document text
54
54
  #
55
- def train(category_name, doc)
55
+ def train(category_name, doc_content)
56
56
  not_implemented_error(__callee__)
57
57
  end
58
58
 
@@ -60,21 +60,21 @@ module OmniCat
60
60
  #
61
61
  # ==== Parameters
62
62
  #
63
- # * +category+ - Name of the category from added categories list
64
- # * +docs+ - Array of documents
63
+ # * +category_name+ - Name of the category from added categories list
64
+ # * +doc_contents+ - Array of documents
65
65
  #
66
- def train_batch(category, docs)
67
- docs.each { |doc| train(category, doc) }
66
+ def train_batch(category_name, doc_contents)
67
+ doc_contents.each { |doc_content| train(category_name, doc_content) }
68
68
  end
69
69
 
70
70
  # Abstract method for untraining the desired category with a document
71
71
  #
72
72
  # ==== Parameters
73
73
  #
74
- # * +category+ - Name of the category from added categories list
75
- # * +doc+ - Document text
74
+ # * +category_name+ - Name of the category from added categories list
75
+ # * +doc_content+ - Document text
76
76
  #
77
- def untrain(category_name, doc)
77
+ def untrain(category_name, doc_content)
78
78
  not_implemented_error(__callee__)
79
79
  end
80
80
 
@@ -82,24 +82,24 @@ module OmniCat
82
82
  #
83
83
  # ==== Parameters
84
84
  #
85
- # * +category+ - Name of the category from added categories list
86
- # * +docs+ - Array of documents
85
+ # * +category_name+ - Name of the category from added categories list
86
+ # * +doc_contents+ - Array of documents
87
87
  #
88
- def untrain_batch(category, docs)
89
- docs.each { |doc| untrain(category, doc) }
88
+ def untrain_batch(category_name, doc_contents)
89
+ doc_contents.each { |doc_content| untrain(category_name, doc_content) }
90
90
  end
91
91
 
92
92
  # Abstract method for classifying the given document
93
93
  #
94
94
  # ==== Parameters
95
95
  #
96
- # * +doc+ - The document for classification
96
+ # * +doc_content+ - The document for classification
97
97
  #
98
98
  # ==== Returns
99
99
  #
100
100
  # * +result+ - OmniCat::Result object
101
101
  #
102
- def classify(doc)
102
+ def classify(doc_content)
103
103
  not_implemented_error(__callee__)
104
104
  end
105
105
 
@@ -107,14 +107,14 @@ module OmniCat
107
107
  #
108
108
  # ==== Parameters
109
109
  #
110
- # * +docs+ - Array of documents
110
+ # * +doc_contents+ - Array of documents
111
111
  #
112
112
  # ==== Returns
113
113
  #
114
114
  # * +result_set+ - Array of OmniCat::Result objects
115
115
  #
116
- def classify_batch(docs)
117
- docs.collect { |doc| classify(doc) }
116
+ def classify_batch(doc_contents)
117
+ doc_contents.collect { |doc_content| classify(doc_content) }
118
118
  end
119
119
 
120
120
  private
@@ -157,7 +157,7 @@ module OmniCat
157
157
  raise StandardError,
158
158
  'At least 2 categories needed for classification process!'
159
159
  false
160
- elsif doc_avability? == false
160
+ elsif doc_availability? == false
161
161
  raise StandardError,
162
162
  'Each category must trained with at least one document!'
163
163
  false
@@ -167,7 +167,7 @@ module OmniCat
167
167
  end
168
168
 
169
169
  # nodoc
170
- def doc_avability?
170
+ def doc_availability?
171
171
  @categories.each do |_, category|
172
172
  return false if category.doc_count == 0
173
173
  end
@@ -4,11 +4,12 @@ module OmniCat
4
4
  module Classifiers
5
5
  module StrategyInternals
6
6
  class Category < ::OmniCat::Base
7
- attr_accessor :doc_count, :docs, :tokens, :token_count
7
+ attr_accessor :doc_count, :docs, :name, :token_count, :tokens
8
8
 
9
9
  def initialize(category_hash = {})
10
10
  @doc_count = category_hash[:doc_count].to_i
11
11
  @docs = category_hash[:docs] || {}
12
+ @name = category_hash[:name] || nil
12
13
  @tokens = category_hash[:tokens] || {}
13
14
  @token_count = category_hash[:token_count].to_i
14
15
  end
@@ -6,7 +6,7 @@ require 'logger'
6
6
  module OmniCat
7
7
  class Configuration
8
8
  include Singleton
9
- attr_accessor :logger
9
+ attr_accessor :auto_train # [:off, :continues, :unique]
10
10
  attr_accessor :exclude_tokens, :logger, :token_patterns
11
11
 
12
12
  def self.default_logger
@@ -16,6 +16,7 @@ module OmniCat
16
16
  end
17
17
 
18
18
  @@defaults = {
19
+ auto_train: :off,
19
20
  exclude_tokens: ['a','about','across','after','all','almost','also','am','among','an','and','are','as','at','be','because','been','by','did','do','does','else','ever','every','for','from','get','got','had','has','have','he','her','hers','him','his','how','however','i','if','in','into','is','it','its','just','least','let','may','me','might','most','must','my','of','often','on','only','or','other','our','own','rather','said','say','says','she','should','since','so','some','than','that','the','their','them','then','there','these','they','this','tis','to','too','twas','us','wants','was','we','were','what','when','where','which','while','who','whom','will','with','would','yet','you','your'],
20
21
  logger: default_logger,
21
22
  token_patterns: {
@@ -1,12 +1,15 @@
1
1
  # encoding: UTF-8
2
+ require 'digest'
2
3
  require File.dirname(__FILE__) + '/base'
3
4
 
4
5
  module OmniCat
5
6
  class Doc < ::OmniCat::Base
6
- attr_reader :content, :count, :tokens
7
+ attr_reader :content, :content_md5, :count, :tokens
7
8
 
8
9
  def initialize(doc_hash = {})
10
+ @auto_classified = doc_hash[:auto_classified] || false
9
11
  @content = doc_hash[:content]
12
+ @content_md5 = doc_hash[:content_md5] || Digest::MD5.hexdigest("#{@content}")
10
13
  @count = (doc_hash[:count] || 1).to_i
11
14
  @tokens = tokenize_with_counts unless @tokens.is_a?(Hash)
12
15
  end
@@ -2,12 +2,52 @@ require File.dirname(__FILE__) + '/base'
2
2
 
3
3
  module OmniCat
4
4
  class Result < ::OmniCat::Base
5
- attr_accessor :category, :scores, :total_score
5
+ attr_accessor :scores
6
6
 
7
7
  def initialize
8
- self.category = {}
9
- self.scores = {}
10
- self.total_score = 0
8
+ @top_score_key = nil
9
+ @scores = {}
10
+ @total_score = 0
11
11
  end
12
+
13
+ # Method for adding new score to result
14
+ #
15
+ # ==== Parameters
16
+ #
17
+ # * +score+ - OmniCat::Score
18
+ #
19
+ def add_score(score)
20
+ @total_score += score.value
21
+ @scores[score.key] = score
22
+ if @top_score_key.nil? || @scores[@top_score_key].value < score.value
23
+ @top_score_key = score.key
24
+ end
25
+ end
26
+
27
+ # Method for getting highest ranked score
28
+ #
29
+ # ==== Returns
30
+ #
31
+ # * +score+ - OmniCat::Score
32
+ #
33
+ def top_score
34
+ @scores[@top_score_key]
35
+ end
36
+
37
+ # Method for calculating percentages for scores
38
+ #
39
+ def calculate_percentages
40
+ @scores.each do |key, score|
41
+ @scores[key].percentage = percentage(score.value)
42
+ end
43
+ end
44
+
45
+ private
46
+ attr_reader :top_score_key, :total_score
47
+
48
+ # nodoc
49
+ def percentage(value)
50
+ (value * 100.0 / @total_score).round(0)
51
+ end
12
52
  end
13
53
  end
@@ -0,0 +1,15 @@
1
+ require File.dirname(__FILE__) + '/base'
2
+
3
+ module OmniCat
4
+ class Score < ::OmniCat::Base
5
+ attr_accessor :key
6
+ attr_accessor :value
7
+ attr_accessor :percentage
8
+
9
+ def initialize(score_hash = {})
10
+ @key = score_hash[:key]
11
+ @value = score_hash[:value]
12
+ @percentage = score_hash[:percentage] || 0
13
+ end
14
+ end
15
+ end
@@ -1,3 +1,3 @@
1
1
  module OmniCat
2
- VERSION = '0.2.2'
2
+ VERSION = '0.3.0'
3
3
  end
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
19
  spec.require_paths = ['lib']
20
20
 
21
- spec.add_dependency 'hashable', '~> 0.1.0'
21
+ spec.add_runtime_dependency 'hashable', '~> 0.1', '>= 0.1.2'
22
22
  spec.add_development_dependency 'bundler', '~> 1.3'
23
- spec.add_development_dependency 'rake'
23
+ spec.add_development_dependency 'rake', '~> 10.1'
24
24
  end
@@ -0,0 +1,32 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
2
+
3
+ class TestResult < Test::Unit::TestCase
4
+ def setup
5
+ @result = OmniCat::Result.new
6
+ @score1 = OmniCat::Score.new(key: 'pos', value: 0.43)
7
+ @score2 = OmniCat::Score.new(key: 'net', value: 0.76)
8
+ @score3 = OmniCat::Score.new(key: 'neg', value: 0.11)
9
+ end
10
+
11
+ def test_add_score
12
+ @result.add_score(@score1)
13
+ assert_equal(@score1, @result.scores[@score1.key])
14
+ end
15
+
16
+ def test_top_score
17
+ @result.add_score(@score1)
18
+ @result.add_score(@score2)
19
+ @result.add_score(@score3)
20
+ assert_equal(@score2, @result.top_score)
21
+ end
22
+
23
+ def test_percentage
24
+ @result.add_score(@score1)
25
+ @result.add_score(@score2)
26
+ @result.add_score(@score3)
27
+ @result.calculate_percentages
28
+ assert_equal(33, @score1.percentage)
29
+ assert_equal(58, @score2.percentage)
30
+ assert_equal(8, @score3.percentage)
31
+ end
32
+ end
@@ -0,0 +1,11 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
2
+
3
+ class TestScore < Test::Unit::TestCase
4
+ def setup
5
+ @score = OmniCat::Score.new(key: 'pos', value: 0.43)
6
+ end
7
+
8
+ def test_percentage
9
+ assert_equal(0, @score.percentage)
10
+ end
11
+ end
metadata CHANGED
@@ -1,64 +1,63 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: omnicat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
5
- prerelease:
4
+ version: 0.3.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Mustafa Turan
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-07-11 00:00:00.000000000 Z
11
+ date: 2014-02-19 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: hashable
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ~>
17
+ - - "~>"
20
18
  - !ruby/object:Gem::Version
21
- version: 0.1.0
19
+ version: '0.1'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 0.1.2
22
23
  type: :runtime
23
24
  prerelease: false
24
25
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
26
  requirements:
27
- - - ~>
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '0.1'
30
+ - - ">="
28
31
  - !ruby/object:Gem::Version
29
- version: 0.1.0
32
+ version: 0.1.2
30
33
  - !ruby/object:Gem::Dependency
31
34
  name: bundler
32
35
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
36
  requirements:
35
- - - ~>
37
+ - - "~>"
36
38
  - !ruby/object:Gem::Version
37
39
  version: '1.3'
38
40
  type: :development
39
41
  prerelease: false
40
42
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
43
  requirements:
43
- - - ~>
44
+ - - "~>"
44
45
  - !ruby/object:Gem::Version
45
46
  version: '1.3'
46
47
  - !ruby/object:Gem::Dependency
47
48
  name: rake
48
49
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
50
  requirements:
51
- - - ! '>='
51
+ - - "~>"
52
52
  - !ruby/object:Gem::Version
53
- version: '0'
53
+ version: '10.1'
54
54
  type: :development
55
55
  prerelease: false
56
56
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
57
  requirements:
59
- - - ! '>='
58
+ - - "~>"
60
59
  - !ruby/object:Gem::Version
61
- version: '0'
60
+ version: '10.1'
62
61
  description: A generalized framework for text classifications.
63
62
  email:
64
63
  - mustafaturan.net@gmail.com
@@ -66,8 +65,8 @@ executables: []
66
65
  extensions: []
67
66
  extra_rdoc_files: []
68
67
  files:
69
- - .gitignore
70
- - .travis.yml
68
+ - ".gitignore"
69
+ - ".travis.yml"
71
70
  - CHANGELOG.txt
72
71
  - Gemfile
73
72
  - LICENSE.txt
@@ -82,6 +81,7 @@ files:
82
81
  - lib/omnicat/configuration.rb
83
82
  - lib/omnicat/doc.rb
84
83
  - lib/omnicat/result.rb
84
+ - lib/omnicat/score.rb
85
85
  - lib/omnicat/version.rb
86
86
  - omnicat.gemspec
87
87
  - test/test_helper.rb
@@ -89,30 +89,31 @@ files:
89
89
  - test/unit/classifier_test.rb
90
90
  - test/unit/classifiers/strategy_test.rb
91
91
  - test/unit/doc_test.rb
92
+ - test/unit/result_test.rb
93
+ - test/unit/score_test.rb
92
94
  homepage: https://github.com/mustafaturan/omnicat
93
95
  licenses:
94
96
  - MIT
97
+ metadata: {}
95
98
  post_install_message:
96
99
  rdoc_options: []
97
100
  require_paths:
98
101
  - lib
99
102
  required_ruby_version: !ruby/object:Gem::Requirement
100
- none: false
101
103
  requirements:
102
- - - ! '>='
104
+ - - ">="
103
105
  - !ruby/object:Gem::Version
104
106
  version: '0'
105
107
  required_rubygems_version: !ruby/object:Gem::Requirement
106
- none: false
107
108
  requirements:
108
- - - ! '>='
109
+ - - ">="
109
110
  - !ruby/object:Gem::Version
110
111
  version: '0'
111
112
  requirements: []
112
113
  rubyforge_project:
113
- rubygems_version: 1.8.23
114
+ rubygems_version: 2.2.0
114
115
  signing_key:
115
- specification_version: 3
116
+ specification_version: 4
116
117
  summary: A generalized framework for text classifications.
117
118
  test_files:
118
119
  - test/test_helper.rb
@@ -120,3 +121,5 @@ test_files:
120
121
  - test/unit/classifier_test.rb
121
122
  - test/unit/classifiers/strategy_test.rb
122
123
  - test/unit/doc_test.rb
124
+ - test/unit/result_test.rb
125
+ - test/unit/score_test.rb