yury-classifier 1.3.4 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Manifest CHANGED
@@ -1,3 +1,4 @@
1
+ classifier.gemspec
1
2
  lib/classifier/base.rb
2
3
  lib/classifier/bayes.rb
3
4
  lib/classifier/extensions/vector.rb
data/Rakefile CHANGED
@@ -2,14 +2,14 @@ require 'rubygems'
2
2
  require 'rake'
3
3
  require 'echoe'
4
4
 
5
- Echoe.new('classifier', '1.3.3') do |p|
5
+ Echoe.new('classifier', '1.3.5') do |p|
6
6
  p.description = "A general classifier module to allow Bayesian and other types of classifications."
7
7
  p.url = "http://github.com/yury/classifier"
8
8
  p.author = "Yury Korolev"
9
9
  p.email = "yury.korolev@gmail.com"
10
10
  p.ignore_pattern = ["tmp/*", "script/*"]
11
11
  p.development_dependencies = []
12
- p.runtime_dependencies = ["activesupport >= 2.2.2", "ruby-stemmer >= 0.5.1"]
12
+ p.runtime_dependencies = ["activesupport >=2.2.2", "ruby-stemmer >=0.5.1"]
13
13
  end
14
14
 
15
15
  Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each { |ext| load ext }
data/classifier.gemspec CHANGED
@@ -2,15 +2,15 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{classifier}
5
- s.version = "1.3.4"
5
+ s.version = "1.3.5"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Yury Korolev"]
9
- s.date = %q{2009-03-22}
9
+ s.date = %q{2009-04-14}
10
10
  s.description = %q{A general classifier module to allow Bayesian and other types of classifications.}
11
11
  s.email = %q{yury.korolev@gmail.com}
12
12
  s.extra_rdoc_files = ["lib/classifier/base.rb", "lib/classifier/bayes.rb", "lib/classifier/extensions/vector.rb", "lib/classifier/extensions/vector_serialize.rb", "lib/classifier/lsi/content_node.rb", "lib/classifier/lsi/summary.rb", "lib/classifier/lsi/word_list.rb", "lib/classifier/lsi.rb", "lib/classifier.rb", "lib/init.rb", "LICENSE", "README"]
13
- s.files = ["lib/classifier/base.rb", "lib/classifier/bayes.rb", "lib/classifier/extensions/vector.rb", "lib/classifier/extensions/vector_serialize.rb", "lib/classifier/lsi/content_node.rb", "lib/classifier/lsi/summary.rb", "lib/classifier/lsi/word_list.rb", "lib/classifier/lsi.rb", "lib/classifier.rb", "lib/init.rb", "LICENSE", "Manifest", "Rakefile", "README", "test/base_test.rb", "test/bayes/bayesian_test.rb", "test/lsi/lsi_test.rb", "test/test_helper.rb", "classifier.gemspec"]
13
+ s.files = ["classifier.gemspec", "lib/classifier/base.rb", "lib/classifier/bayes.rb", "lib/classifier/extensions/vector.rb", "lib/classifier/extensions/vector_serialize.rb", "lib/classifier/lsi/content_node.rb", "lib/classifier/lsi/summary.rb", "lib/classifier/lsi/word_list.rb", "lib/classifier/lsi.rb", "lib/classifier.rb", "lib/init.rb", "LICENSE", "Manifest", "Rakefile", "README", "test/base_test.rb", "test/bayes/bayesian_test.rb", "test/lsi/lsi_test.rb", "test/test_helper.rb"]
14
14
  s.has_rdoc = true
15
15
  s.homepage = %q{http://github.com/yury/classifier}
16
16
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Classifier", "--main", "README"]
@@ -25,14 +25,14 @@ Gem::Specification.new do |s|
25
25
  s.specification_version = 2
26
26
 
27
27
  if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
28
- s.add_runtime_dependency(%q<activesupport>, [">= 0", "= 2.2.2"])
29
- s.add_runtime_dependency(%q<ruby-stemmer>, [">= 0", "= 0.5.1"])
28
+ s.add_runtime_dependency(%q<activesupport>, [">= 2.2.2"])
29
+ s.add_runtime_dependency(%q<ruby-stemmer>, [">= 0.5.1"])
30
30
  else
31
- s.add_dependency(%q<activesupport>, [">= 0", "= 2.2.2"])
32
- s.add_dependency(%q<ruby-stemmer>, [">= 0", "= 0.5.1"])
31
+ s.add_dependency(%q<activesupport>, [">= 2.2.2"])
32
+ s.add_dependency(%q<ruby-stemmer>, [">= 0.5.1"])
33
33
  end
34
34
  else
35
- s.add_dependency(%q<activesupport>, [">= 0", "= 2.2.2"])
36
- s.add_dependency(%q<ruby-stemmer>, [">= 0", "= 0.5.1"])
35
+ s.add_dependency(%q<activesupport>, [">= 2.2.2"])
36
+ s.add_dependency(%q<ruby-stemmer>, [">= 0.5.1"])
37
37
  end
38
38
  end
@@ -54,7 +54,8 @@ module Classifier
54
54
  if ( term > 0 )
55
55
  weighted_total += (( term / total_words ) * Math.log( term / total_words ))
56
56
  end
57
- end
57
+ end
58
+ weighted_total = -1.0 if weighted_total.zero? # if no word in list is known
58
59
  vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
59
60
  end
60
61
 
@@ -180,6 +180,7 @@ module Classifier
180
180
  content_node = node_for_content( doc, &block )
181
181
  result =
182
182
  @items.keys.collect do |item|
183
+ next if @items[item].search_vector.blank? # not enough data
183
184
  if $GSL
184
185
  val = content_node.search_vector * @items[item].search_vector.col
185
186
  else
@@ -187,7 +188,7 @@ module Classifier
187
188
  end
188
189
  [item, val]
189
190
  end
190
- result.sort_by { |x| x[1] }.reverse
191
+ result.compact.sort_by { |x| x[1] }.reverse
191
192
  end
192
193
 
193
194
  # Similar to proximity_array_for_content, this function takes similar
@@ -201,6 +202,7 @@ module Classifier
201
202
  content_node = node_for_content( doc, &block )
202
203
  result =
203
204
  @items.keys.collect do |item|
205
+ next if @items[item].search_norm.blank? # not enough data
204
206
  if $GSL
205
207
  val = content_node.search_norm * @items[item].search_norm.col
206
208
  else
@@ -208,7 +210,7 @@ module Classifier
208
210
  end
209
211
  [item, val]
210
212
  end
211
- result.sort_by { |x| x[1] }.reverse
213
+ result.compact.sort_by { |x| x[1] }.reverse
212
214
  end
213
215
 
214
216
  # This function allows for text-based search of your index. Unlike other functions
@@ -266,7 +268,23 @@ module Classifier
266
268
  ranking = votes.keys.sort_by { |x| votes[x] }
267
269
  return ranking[-1]
268
270
  end
269
-
271
+
272
+ # Same as previous but returns all results, also more permissive in default cut-off
273
+ def classify_multiple( doc, cutoff=0.50, &block )
274
+ icutoff = (@items.size * cutoff).round
275
+ carry = proximity_array_for_content( doc, &block )
276
+ carry = carry[0..icutoff-1]
277
+ votes = {}
278
+ carry.each do |pair|
279
+ categories = @items[pair[0]].categories
280
+ categories.each do |category|
281
+ votes[category] ||= 0.0
282
+ votes[category] += pair[1]
283
+ end
284
+ end
285
+ votes.delete_if{|key, value| value<1 }.keys.sort_by { |x| -votes[x] }
286
+ end
287
+
270
288
  # Prototype, only works on indexed documents.
271
289
  # I have no clue if this is going to work, but in theory
272
290
  # it's supposed to.
data/test/lsi/lsi_test.rb CHANGED
@@ -8,6 +8,9 @@ class LSITest < Test::Unit::TestCase
8
8
  @str3 = "This text revolves around cats. Cats."
9
9
  @str4 = "This text also involves cats. Cats!"
10
10
  @str5 = "This text involves birds. Birds."
11
+ @str6 = "Is it about dogs or birds?"
12
+ @str7 = "Is it about birds or cats?"
13
+ @str8 = "I would prefer a bird over thousand cats or dogs because birds are smaller."
11
14
  end
12
15
 
13
16
  def test_basic_indexing
@@ -29,6 +32,14 @@ class LSITest < Test::Unit::TestCase
29
32
  assert ! lsi.needs_rebuild?
30
33
  end
31
34
 
35
+ def test_basic_categorizing_with_too_small_dataset
36
+ lsi = Classifier::LSI.new
37
+ lsi.add_item @str2, "Dog"
38
+
39
+ assert_equal nil, lsi.classify( @str1 )
40
+ assert_equal [], lsi.classify_multiple( @str3 )
41
+ end
42
+
32
43
  def test_basic_categorizing
33
44
  lsi = Classifier::LSI.new
34
45
  lsi.add_item @str2, "Dog"
@@ -38,9 +49,42 @@ class LSITest < Test::Unit::TestCase
38
49
 
39
50
  assert_equal "Dog", lsi.classify( @str1 )
40
51
  assert_equal "Cat", lsi.classify( @str3 )
41
- assert_equal "Bird", lsi.classify( @str5 )
52
+ assert_equal "Bird", lsi.classify( @str5 )
53
+ assert_equal "Dog", lsi.classify( @str6 )
54
+ assert_equal "Bird", lsi.classify( @str7 )
55
+ assert_equal "Bird", lsi.classify( @str8 )
42
56
  end
43
-
57
+
58
+ def test_multiple_categorizing
59
+ lsi = Classifier::LSI.new
60
+ lsi.add_item @str1, "Dog"
61
+ lsi.add_item @str2, "Dog"
62
+ lsi.add_item @str3, "Cat"
63
+ lsi.add_item @str4, "Cat"
64
+ lsi.add_item @str5, "Bird"
65
+
66
+ assert_equal ["Dog", "Bird"], lsi.classify_multiple( @str6 )
67
+ assert_equal ["Cat", "Bird"], lsi.classify_multiple( @str7 )
68
+ assert_equal ["Bird"], lsi.classify_multiple( @str8 )
69
+ end
70
+
71
+ def test_multiple_categorizing_reverse
72
+ lsi = Classifier::LSI.new
73
+ lsi.add_item @str1, "Dog"
74
+ lsi.add_item @str3, "Cat"
75
+ lsi.add_item @str4, "Cat"
76
+ lsi.add_item @str6, "Dog", "Bird", "Flying"
77
+ lsi.add_item @str7, "Cat", "Bird"
78
+ lsi.add_item @str8, "Bird", "Dog", "Cat"
79
+
80
+ assert_equal ["Dog"], lsi.classify_multiple( @str2 )
81
+ assert_equal ["Cat", "Bird"], lsi.classify_multiple( @str5 )
82
+
83
+ # test with a word unknown alone
84
+ assert_equal "Bird", lsi.classify( "Bird!" )
85
+ assert_equal ["Bird", "Dog", "Cat"], lsi.classify_multiple( "Bird!" )
86
+ end
87
+
44
88
  def test_external_classifying
45
89
  lsi = Classifier::LSI.new
46
90
  bayes = Classifier::Bayes.new :categories => ['Dog', 'Cat', 'Bird']
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yury-classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.4
4
+ version: 1.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yury Korolev
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-03-22 00:00:00 -07:00
12
+ date: 2009-04-14 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -19,9 +19,6 @@ dependencies:
19
19
  version_requirements: !ruby/object:Gem::Requirement
20
20
  requirements:
21
21
  - - ">="
22
- - !ruby/object:Gem::Version
23
- version: "0"
24
- - - "="
25
22
  - !ruby/object:Gem::Version
26
23
  version: 2.2.2
27
24
  version:
@@ -32,9 +29,6 @@ dependencies:
32
29
  version_requirements: !ruby/object:Gem::Requirement
33
30
  requirements:
34
31
  - - ">="
35
- - !ruby/object:Gem::Version
36
- version: "0"
37
- - - "="
38
32
  - !ruby/object:Gem::Version
39
33
  version: 0.5.1
40
34
  version:
@@ -58,6 +52,7 @@ extra_rdoc_files:
58
52
  - LICENSE
59
53
  - README
60
54
  files:
55
+ - classifier.gemspec
61
56
  - lib/classifier/base.rb
62
57
  - lib/classifier/bayes.rb
63
58
  - lib/classifier/extensions/vector.rb
@@ -76,7 +71,6 @@ files:
76
71
  - test/bayes/bayesian_test.rb
77
72
  - test/lsi/lsi_test.rb
78
73
  - test/test_helper.rb
79
- - classifier.gemspec
80
74
  has_rdoc: true
81
75
  homepage: http://github.com/yury/classifier
82
76
  post_install_message: