omnicat 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 65d6fbb05908315286db625537482191aa4067e4
4
+ data.tar.gz: c79f9470ca00165736ed3bc78b8dd9f6578b682a
5
+ SHA512:
6
+ metadata.gz: 069cefc4f20e453dd59af4fb7ad6e46e0b9529bbb63b8dfe998ea3058a0f8f00c308210a3527fa901e422b20af481586a27fe0fd6eab3bd5133f0026e8cb3fea
7
+ data.tar.gz: 79a279215e8eb6984725c9fe302970c233dec11ef58095e8edefec68bf1596e5fd41f07c3a33826b3c26f3917491c8c067114bd83c05286abec68d498b6a0e33
@@ -3,6 +3,7 @@ require File.dirname(__FILE__) + '/omnicat/configuration'
3
3
  require File.dirname(__FILE__) + '/omnicat/array'
4
4
  require File.dirname(__FILE__) + '/omnicat/base'
5
5
  require File.dirname(__FILE__) + '/omnicat/doc'
6
+ require File.dirname(__FILE__) + '/omnicat/score'
6
7
  require File.dirname(__FILE__) + '/omnicat/result'
7
8
  require File.dirname(__FILE__) + '/omnicat/classifier'
8
9
 
@@ -10,7 +10,7 @@ module OmniCat
10
10
  attr_accessor :strategy
11
11
 
12
12
  # delegate category methods
13
- def_delegators :@strategy, :add_category, :add_categories
13
+ def_delegators :@strategy, :add_category, :add_categories, :category_size_limit
14
14
 
15
15
  # delegate training methods
16
16
  def_delegators :@strategy, :train, :train_batch, :untrain, :untrain_batch
@@ -26,30 +26,79 @@ module OmniCat
26
26
  @strategy = classifier
27
27
  end
28
28
 
29
+ # Changes the classifier strategy and trains the new strategy if needed
30
+ #
29
31
  def strategy=(classifier)
30
32
  is_interchangeable?(classifier)
31
- if @strategy && classifier.doc_count == 0
33
+ if @strategy && classifier.category_count == 0
32
34
  previous_strategy = @strategy
33
35
  @strategy = classifier
34
- # pass previous strategy contents into the new one
35
- previous_strategy.categories.each do |category_name, category|
36
- @strategy.add_category(category_name)
37
- category.docs.each do |_, doc|
38
- doc.count.times do
39
- @strategy.train(category_name, doc.content)
40
- end
41
- end
42
- end
36
+ convert_categories_with_docs(previous_strategy)
43
37
  else
44
38
  @strategy = classifier
45
39
  end
46
40
  end
47
41
 
48
42
  private
43
+ # nodoc
44
+ def convert_categories_with_docs(previous_strategy)
45
+ if previous_strategy.categories.is_a?(Hash)
46
+ convert_categories_hash(previous_strategy.categories)
47
+ else
48
+ convert_categories_array(previous_strategy.categories)
49
+ end
50
+ end
51
+
52
+ # nodoc
53
+ def convert_categories_array(categories)
54
+ categories.each do |category|
55
+ convert_category(category)
56
+ end
57
+ end
58
+
59
+ # nodoc
60
+ def convert_categories_hash(categories)
61
+ categories.each do |_, category|
62
+ convert_category(category)
63
+ end
64
+ end
65
+
66
+ # nodoc
67
+ def convert_category(category)
68
+ @strategy.add_category(category.name)
69
+ if category.docs.is_a?(Hash)
70
+ convert_docs_hash(category.name, category.docs)
71
+ else
72
+ convert_docs_array(category.name, category.docs)
73
+ end
74
+ end
75
+
76
+ # nodoc
77
+ def convert_docs_array(category_name, docs)
78
+ docs.each do |doc|
79
+ convert_doc(category_name, doc)
80
+ end
81
+ end
82
+
83
+ # nodoc
84
+ def convert_docs_hash(category_name, docs)
85
+ docs.each do |_, doc|
86
+ convert_doc(category_name, doc)
87
+ end
88
+ end
89
+
90
+ # nodoc
91
+ def convert_doc(category_name, doc)
92
+ doc.count.times do
93
+ @strategy.train(category_name, doc.content)
94
+ end
95
+ end
96
+
97
+ # nodoc
49
98
  def is_interchangeable?(classifier)
50
- if classifier.category_size_limit
99
+ unless classifier.category_size_limit == 0
51
100
  if @strategy.category_count > classifier.category_size_limit
52
- raise StandardError,
101
+ raise StandardError,
53
102
  'New classifier category size limit is less than the current classifier\'s category count.'
54
103
  end
55
104
  end
@@ -9,12 +9,12 @@ module OmniCat
9
9
  #
10
10
  # The class supplies abstract methods for possible text classifiers
11
11
  class Strategy < ::OmniCat::Base
12
- attr_accessor :categories # ::OmniCat::Hash - Hash of categories
12
+ attr_accessor :categories # Hash - Hash of categories
13
13
  attr_accessor :category_count # Integer - Total category count
14
- attr_accessor :category_size_limit # Integer - Max allowed category
15
- attr_accessor :doc_count # Integer - Total token count
14
+ attr_accessor :category_size_limit # Integer - Max allowed category size
15
+ attr_accessor :doc_count # Integer - Total doc count
16
16
  attr_accessor :token_count # Integer - Total token count
17
- attr_accessor :uniq_token_count # Integer - Total uniq token count
17
+ attr_accessor :unique_token_count # Integer - Total unique token count
18
18
 
19
19
  def initialize(strategy_hash = {})
20
20
  @categories = {}
@@ -22,16 +22,16 @@ module OmniCat
22
22
  @category_size_limit = strategy_hash[:category_size_limit].to_i
23
23
  @doc_count = strategy_hash[:doc_count].to_i
24
24
  @token_count = strategy_hash[:token_count].to_i
25
- @uniq_token_count = strategy_hash[:uniq_token_count].to_i
25
+ @unique_token_count = strategy_hash[:unique_token_count].to_i
26
26
  end
27
27
 
28
28
  # Abstract method for adding new classification category
29
29
  #
30
30
  # ==== Parameters
31
31
  #
32
- # * +name+ - Name for category
32
+ # * +category_name+ - Name for category
33
33
  #
34
- def add_category(name)
34
+ def add_category(category_name)
35
35
  not_implemented_error(__callee__)
36
36
  end
37
37
 
@@ -39,20 +39,20 @@ module OmniCat
39
39
  #
40
40
  # ==== Parameters
41
41
  #
42
- # * +names+ - Array of categories
42
+ # * +category_names+ - Array of categories
43
43
  #
44
- def add_categories(names)
45
- names.each { |name| add_category(name) }
44
+ def add_categories(category_names)
45
+ category_names.each { |category_name| add_category(category_name) }
46
46
  end
47
47
 
48
48
  # Abstract method for training the desired category with a document
49
49
  #
50
50
  # ==== Parameters
51
51
  #
52
- # * +category+ - Name of the category from added categories list
53
- # * +doc+ - Document text
52
+ # * +category_name+ - Name of the category from added categories list
53
+ # * +doc_content+ - Document text
54
54
  #
55
- def train(category_name, doc)
55
+ def train(category_name, doc_content)
56
56
  not_implemented_error(__callee__)
57
57
  end
58
58
 
@@ -60,21 +60,21 @@ module OmniCat
60
60
  #
61
61
  # ==== Parameters
62
62
  #
63
- # * +category+ - Name of the category from added categories list
64
- # * +docs+ - Array of documents
63
+ # * +category_name+ - Name of the category from added categories list
64
+ # * +doc_contents+ - Array of documents
65
65
  #
66
- def train_batch(category, docs)
67
- docs.each { |doc| train(category, doc) }
66
+ def train_batch(category_name, doc_contents)
67
+ doc_contents.each { |doc_content| train(category_name, doc_content) }
68
68
  end
69
69
 
70
70
  # Abstract method for untraining the desired category with a document
71
71
  #
72
72
  # ==== Parameters
73
73
  #
74
- # * +category+ - Name of the category from added categories list
75
- # * +doc+ - Document text
74
+ # * +category_name+ - Name of the category from added categories list
75
+ # * +doc_content+ - Document text
76
76
  #
77
- def untrain(category_name, doc)
77
+ def untrain(category_name, doc_content)
78
78
  not_implemented_error(__callee__)
79
79
  end
80
80
 
@@ -82,24 +82,24 @@ module OmniCat
82
82
  #
83
83
  # ==== Parameters
84
84
  #
85
- # * +category+ - Name of the category from added categories list
86
- # * +docs+ - Array of documents
85
+ # * +category_name+ - Name of the category from added categories list
86
+ # * +doc_contents+ - Array of documents
87
87
  #
88
- def untrain_batch(category, docs)
89
- docs.each { |doc| untrain(category, doc) }
88
+ def untrain_batch(category_name, doc_contents)
89
+ doc_contents.each { |doc_content| untrain(category_name, doc_content) }
90
90
  end
91
91
 
92
92
  # Abstract method for classifying the given document
93
93
  #
94
94
  # ==== Parameters
95
95
  #
96
- # * +doc+ - The document for classification
96
+ # * +doc_content+ - The document for classification
97
97
  #
98
98
  # ==== Returns
99
99
  #
100
100
  # * +result+ - OmniCat::Result object
101
101
  #
102
- def classify(doc)
102
+ def classify(doc_content)
103
103
  not_implemented_error(__callee__)
104
104
  end
105
105
 
@@ -107,14 +107,14 @@ module OmniCat
107
107
  #
108
108
  # ==== Parameters
109
109
  #
110
- # * +docs+ - Array of documents
110
+ # * +doc_contents+ - Array of documents
111
111
  #
112
112
  # ==== Returns
113
113
  #
114
114
  # * +result_set+ - Array of OmniCat::Result objects
115
115
  #
116
- def classify_batch(docs)
117
- docs.collect { |doc| classify(doc) }
116
+ def classify_batch(doc_contents)
117
+ doc_contents.collect { |doc_content| classify(doc_content) }
118
118
  end
119
119
 
120
120
  private
@@ -157,7 +157,7 @@ module OmniCat
157
157
  raise StandardError,
158
158
  'At least 2 categories needed for classification process!'
159
159
  false
160
- elsif doc_avability? == false
160
+ elsif doc_availability? == false
161
161
  raise StandardError,
162
162
  'Each category must trained with at least one document!'
163
163
  false
@@ -167,7 +167,7 @@ module OmniCat
167
167
  end
168
168
 
169
169
  # nodoc
170
- def doc_avability?
170
+ def doc_availability?
171
171
  @categories.each do |_, category|
172
172
  return false if category.doc_count == 0
173
173
  end
@@ -4,11 +4,12 @@ module OmniCat
4
4
  module Classifiers
5
5
  module StrategyInternals
6
6
  class Category < ::OmniCat::Base
7
- attr_accessor :doc_count, :docs, :tokens, :token_count
7
+ attr_accessor :doc_count, :docs, :name, :token_count, :tokens
8
8
 
9
9
  def initialize(category_hash = {})
10
10
  @doc_count = category_hash[:doc_count].to_i
11
11
  @docs = category_hash[:docs] || {}
12
+ @name = category_hash[:name] || nil
12
13
  @tokens = category_hash[:tokens] || {}
13
14
  @token_count = category_hash[:token_count].to_i
14
15
  end
@@ -6,7 +6,7 @@ require 'logger'
6
6
  module OmniCat
7
7
  class Configuration
8
8
  include Singleton
9
- attr_accessor :logger
9
+ attr_accessor :auto_train # [:off, :continues, :unique]
10
10
  attr_accessor :exclude_tokens, :logger, :token_patterns
11
11
 
12
12
  def self.default_logger
@@ -16,6 +16,7 @@ module OmniCat
16
16
  end
17
17
 
18
18
  @@defaults = {
19
+ auto_train: :off,
19
20
  exclude_tokens: ['a','about','across','after','all','almost','also','am','among','an','and','are','as','at','be','because','been','by','did','do','does','else','ever','every','for','from','get','got','had','has','have','he','her','hers','him','his','how','however','i','if','in','into','is','it','its','just','least','let','may','me','might','most','must','my','of','often','on','only','or','other','our','own','rather','said','say','says','she','should','since','so','some','than','that','the','their','them','then','there','these','they','this','tis','to','too','twas','us','wants','was','we','were','what','when','where','which','while','who','whom','will','with','would','yet','you','your'],
20
21
  logger: default_logger,
21
22
  token_patterns: {
@@ -1,12 +1,15 @@
1
1
  # encoding: UTF-8
2
+ require 'digest'
2
3
  require File.dirname(__FILE__) + '/base'
3
4
 
4
5
  module OmniCat
5
6
  class Doc < ::OmniCat::Base
6
- attr_reader :content, :count, :tokens
7
+ attr_reader :content, :content_md5, :count, :tokens
7
8
 
8
9
  def initialize(doc_hash = {})
10
+ @auto_classified = doc_hash[:auto_classified] || false
9
11
  @content = doc_hash[:content]
12
+ @content_md5 = doc_hash[:content_md5] || Digest::MD5.hexdigest("#{@content}")
10
13
  @count = (doc_hash[:count] || 1).to_i
11
14
  @tokens = tokenize_with_counts unless @tokens.is_a?(Hash)
12
15
  end
@@ -2,12 +2,52 @@ require File.dirname(__FILE__) + '/base'
2
2
 
3
3
  module OmniCat
4
4
  class Result < ::OmniCat::Base
5
- attr_accessor :category, :scores, :total_score
5
+ attr_accessor :scores
6
6
 
7
7
  def initialize
8
- self.category = {}
9
- self.scores = {}
10
- self.total_score = 0
8
+ @top_score_key = nil
9
+ @scores = {}
10
+ @total_score = 0
11
11
  end
12
+
13
+ # Method for adding new score to result
14
+ #
15
+ # ==== Parameters
16
+ #
17
+ # * +score+ - OmniCat::Score
18
+ #
19
+ def add_score(score)
20
+ @total_score += score.value
21
+ @scores[score.key] = score
22
+ if @top_score_key.nil? || @scores[@top_score_key].value < score.value
23
+ @top_score_key = score.key
24
+ end
25
+ end
26
+
27
+ # Method for getting highest ranked score
28
+ #
29
+ # ==== Returns
30
+ #
31
+ # * +score+ - OmniCat::Score
32
+ #
33
+ def top_score
34
+ @scores[@top_score_key]
35
+ end
36
+
37
+ # Method for calculating percentages for scores
38
+ #
39
+ def calculate_percentages
40
+ @scores.each do |key, score|
41
+ @scores[key].percentage = percentage(score.value)
42
+ end
43
+ end
44
+
45
+ private
46
+ attr_reader :top_score_key, :total_score
47
+
48
+ # nodoc
49
+ def percentage(value)
50
+ (value * 100.0 / @total_score).round(0)
51
+ end
12
52
  end
13
53
  end
@@ -0,0 +1,15 @@
1
+ require File.dirname(__FILE__) + '/base'
2
+
3
+ module OmniCat
4
+ class Score < ::OmniCat::Base
5
+ attr_accessor :key
6
+ attr_accessor :value
7
+ attr_accessor :percentage
8
+
9
+ def initialize(score_hash = {})
10
+ @key = score_hash[:key]
11
+ @value = score_hash[:value]
12
+ @percentage = score_hash[:percentage] || 0
13
+ end
14
+ end
15
+ end
@@ -1,3 +1,3 @@
1
1
  module OmniCat
2
- VERSION = '0.2.2'
2
+ VERSION = '0.3.0'
3
3
  end
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
19
  spec.require_paths = ['lib']
20
20
 
21
- spec.add_dependency 'hashable', '~> 0.1.0'
21
+ spec.add_runtime_dependency 'hashable', '~> 0.1', '>= 0.1.2'
22
22
  spec.add_development_dependency 'bundler', '~> 1.3'
23
- spec.add_development_dependency 'rake'
23
+ spec.add_development_dependency 'rake', '~> 10.1'
24
24
  end
@@ -0,0 +1,32 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
2
+
3
+ class TestResult < Test::Unit::TestCase
4
+ def setup
5
+ @result = OmniCat::Result.new
6
+ @score1 = OmniCat::Score.new(key: 'pos', value: 0.43)
7
+ @score2 = OmniCat::Score.new(key: 'net', value: 0.76)
8
+ @score3 = OmniCat::Score.new(key: 'neg', value: 0.11)
9
+ end
10
+
11
+ def test_add_score
12
+ @result.add_score(@score1)
13
+ assert_equal(@score1, @result.scores[@score1.key])
14
+ end
15
+
16
+ def test_top_score
17
+ @result.add_score(@score1)
18
+ @result.add_score(@score2)
19
+ @result.add_score(@score3)
20
+ assert_equal(@score2, @result.top_score)
21
+ end
22
+
23
+ def test_percentage
24
+ @result.add_score(@score1)
25
+ @result.add_score(@score2)
26
+ @result.add_score(@score3)
27
+ @result.calculate_percentages
28
+ assert_equal(33, @score1.percentage)
29
+ assert_equal(58, @score2.percentage)
30
+ assert_equal(8, @score3.percentage)
31
+ end
32
+ end
@@ -0,0 +1,11 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'test_helper'))
2
+
3
+ class TestScore < Test::Unit::TestCase
4
+ def setup
5
+ @score = OmniCat::Score.new(key: 'pos', value: 0.43)
6
+ end
7
+
8
+ def test_percentage
9
+ assert_equal(0, @score.percentage)
10
+ end
11
+ end
metadata CHANGED
@@ -1,64 +1,63 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: omnicat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
5
- prerelease:
4
+ version: 0.3.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Mustafa Turan
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-07-11 00:00:00.000000000 Z
11
+ date: 2014-02-19 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: hashable
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ~>
17
+ - - "~>"
20
18
  - !ruby/object:Gem::Version
21
- version: 0.1.0
19
+ version: '0.1'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 0.1.2
22
23
  type: :runtime
23
24
  prerelease: false
24
25
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
26
  requirements:
27
- - - ~>
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '0.1'
30
+ - - ">="
28
31
  - !ruby/object:Gem::Version
29
- version: 0.1.0
32
+ version: 0.1.2
30
33
  - !ruby/object:Gem::Dependency
31
34
  name: bundler
32
35
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
36
  requirements:
35
- - - ~>
37
+ - - "~>"
36
38
  - !ruby/object:Gem::Version
37
39
  version: '1.3'
38
40
  type: :development
39
41
  prerelease: false
40
42
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
43
  requirements:
43
- - - ~>
44
+ - - "~>"
44
45
  - !ruby/object:Gem::Version
45
46
  version: '1.3'
46
47
  - !ruby/object:Gem::Dependency
47
48
  name: rake
48
49
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
50
  requirements:
51
- - - ! '>='
51
+ - - "~>"
52
52
  - !ruby/object:Gem::Version
53
- version: '0'
53
+ version: '10.1'
54
54
  type: :development
55
55
  prerelease: false
56
56
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
57
  requirements:
59
- - - ! '>='
58
+ - - "~>"
60
59
  - !ruby/object:Gem::Version
61
- version: '0'
60
+ version: '10.1'
62
61
  description: A generalized framework for text classifications.
63
62
  email:
64
63
  - mustafaturan.net@gmail.com
@@ -66,8 +65,8 @@ executables: []
66
65
  extensions: []
67
66
  extra_rdoc_files: []
68
67
  files:
69
- - .gitignore
70
- - .travis.yml
68
+ - ".gitignore"
69
+ - ".travis.yml"
71
70
  - CHANGELOG.txt
72
71
  - Gemfile
73
72
  - LICENSE.txt
@@ -82,6 +81,7 @@ files:
82
81
  - lib/omnicat/configuration.rb
83
82
  - lib/omnicat/doc.rb
84
83
  - lib/omnicat/result.rb
84
+ - lib/omnicat/score.rb
85
85
  - lib/omnicat/version.rb
86
86
  - omnicat.gemspec
87
87
  - test/test_helper.rb
@@ -89,30 +89,31 @@ files:
89
89
  - test/unit/classifier_test.rb
90
90
  - test/unit/classifiers/strategy_test.rb
91
91
  - test/unit/doc_test.rb
92
+ - test/unit/result_test.rb
93
+ - test/unit/score_test.rb
92
94
  homepage: https://github.com/mustafaturan/omnicat
93
95
  licenses:
94
96
  - MIT
97
+ metadata: {}
95
98
  post_install_message:
96
99
  rdoc_options: []
97
100
  require_paths:
98
101
  - lib
99
102
  required_ruby_version: !ruby/object:Gem::Requirement
100
- none: false
101
103
  requirements:
102
- - - ! '>='
104
+ - - ">="
103
105
  - !ruby/object:Gem::Version
104
106
  version: '0'
105
107
  required_rubygems_version: !ruby/object:Gem::Requirement
106
- none: false
107
108
  requirements:
108
- - - ! '>='
109
+ - - ">="
109
110
  - !ruby/object:Gem::Version
110
111
  version: '0'
111
112
  requirements: []
112
113
  rubyforge_project:
113
- rubygems_version: 1.8.23
114
+ rubygems_version: 2.2.0
114
115
  signing_key:
115
- specification_version: 3
116
+ specification_version: 4
116
117
  summary: A generalized framework for text classifications.
117
118
  test_files:
118
119
  - test/test_helper.rb
@@ -120,3 +121,5 @@ test_files:
120
121
  - test/unit/classifier_test.rb
121
122
  - test/unit/classifiers/strategy_test.rb
122
123
  - test/unit/doc_test.rb
124
+ - test/unit/result_test.rb
125
+ - test/unit/score_test.rb