yury-classifier 1.3.4 → 1.3.5

Sign up to get free protection for your applications and to get access to all the features.
data/Manifest CHANGED
@@ -1,3 +1,4 @@
1
+ classifier.gemspec
1
2
  lib/classifier/base.rb
2
3
  lib/classifier/bayes.rb
3
4
  lib/classifier/extensions/vector.rb
data/Rakefile CHANGED
@@ -2,14 +2,14 @@ require 'rubygems'
2
2
  require 'rake'
3
3
  require 'echoe'
4
4
 
5
- Echoe.new('classifier', '1.3.3') do |p|
5
+ Echoe.new('classifier', '1.3.5') do |p|
6
6
  p.description = "A general classifier module to allow Bayesian and other types of classifications."
7
7
  p.url = "http://github.com/yury/classifier"
8
8
  p.author = "Yury Korolev"
9
9
  p.email = "yury.korolev@gmail.com"
10
10
  p.ignore_pattern = ["tmp/*", "script/*"]
11
11
  p.development_dependencies = []
12
- p.runtime_dependencies = ["activesupport >= 2.2.2", "ruby-stemmer >= 0.5.1"]
12
+ p.runtime_dependencies = ["activesupport >=2.2.2", "ruby-stemmer >=0.5.1"]
13
13
  end
14
14
 
15
15
  Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each { |ext| load ext }
data/classifier.gemspec CHANGED
@@ -2,15 +2,15 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{classifier}
5
- s.version = "1.3.4"
5
+ s.version = "1.3.5"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Yury Korolev"]
9
- s.date = %q{2009-03-22}
9
+ s.date = %q{2009-04-14}
10
10
  s.description = %q{A general classifier module to allow Bayesian and other types of classifications.}
11
11
  s.email = %q{yury.korolev@gmail.com}
12
12
  s.extra_rdoc_files = ["lib/classifier/base.rb", "lib/classifier/bayes.rb", "lib/classifier/extensions/vector.rb", "lib/classifier/extensions/vector_serialize.rb", "lib/classifier/lsi/content_node.rb", "lib/classifier/lsi/summary.rb", "lib/classifier/lsi/word_list.rb", "lib/classifier/lsi.rb", "lib/classifier.rb", "lib/init.rb", "LICENSE", "README"]
13
- s.files = ["lib/classifier/base.rb", "lib/classifier/bayes.rb", "lib/classifier/extensions/vector.rb", "lib/classifier/extensions/vector_serialize.rb", "lib/classifier/lsi/content_node.rb", "lib/classifier/lsi/summary.rb", "lib/classifier/lsi/word_list.rb", "lib/classifier/lsi.rb", "lib/classifier.rb", "lib/init.rb", "LICENSE", "Manifest", "Rakefile", "README", "test/base_test.rb", "test/bayes/bayesian_test.rb", "test/lsi/lsi_test.rb", "test/test_helper.rb", "classifier.gemspec"]
13
+ s.files = ["classifier.gemspec", "lib/classifier/base.rb", "lib/classifier/bayes.rb", "lib/classifier/extensions/vector.rb", "lib/classifier/extensions/vector_serialize.rb", "lib/classifier/lsi/content_node.rb", "lib/classifier/lsi/summary.rb", "lib/classifier/lsi/word_list.rb", "lib/classifier/lsi.rb", "lib/classifier.rb", "lib/init.rb", "LICENSE", "Manifest", "Rakefile", "README", "test/base_test.rb", "test/bayes/bayesian_test.rb", "test/lsi/lsi_test.rb", "test/test_helper.rb"]
14
14
  s.has_rdoc = true
15
15
  s.homepage = %q{http://github.com/yury/classifier}
16
16
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Classifier", "--main", "README"]
@@ -25,14 +25,14 @@ Gem::Specification.new do |s|
25
25
  s.specification_version = 2
26
26
 
27
27
  if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
28
- s.add_runtime_dependency(%q<activesupport>, [">= 0", "= 2.2.2"])
29
- s.add_runtime_dependency(%q<ruby-stemmer>, [">= 0", "= 0.5.1"])
28
+ s.add_runtime_dependency(%q<activesupport>, [">= 2.2.2"])
29
+ s.add_runtime_dependency(%q<ruby-stemmer>, [">= 0.5.1"])
30
30
  else
31
- s.add_dependency(%q<activesupport>, [">= 0", "= 2.2.2"])
32
- s.add_dependency(%q<ruby-stemmer>, [">= 0", "= 0.5.1"])
31
+ s.add_dependency(%q<activesupport>, [">= 2.2.2"])
32
+ s.add_dependency(%q<ruby-stemmer>, [">= 0.5.1"])
33
33
  end
34
34
  else
35
- s.add_dependency(%q<activesupport>, [">= 0", "= 2.2.2"])
36
- s.add_dependency(%q<ruby-stemmer>, [">= 0", "= 0.5.1"])
35
+ s.add_dependency(%q<activesupport>, [">= 2.2.2"])
36
+ s.add_dependency(%q<ruby-stemmer>, [">= 0.5.1"])
37
37
  end
38
38
  end
@@ -54,7 +54,8 @@ module Classifier
54
54
  if ( term > 0 )
55
55
  weighted_total += (( term / total_words ) * Math.log( term / total_words ))
56
56
  end
57
- end
57
+ end
58
+ weighted_total = -1.0 if weighted_total.zero? # if no word in list is known
58
59
  vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
59
60
  end
60
61
 
@@ -180,6 +180,7 @@ module Classifier
180
180
  content_node = node_for_content( doc, &block )
181
181
  result =
182
182
  @items.keys.collect do |item|
183
+ next if @items[item].search_vector.blank? # not enough data
183
184
  if $GSL
184
185
  val = content_node.search_vector * @items[item].search_vector.col
185
186
  else
@@ -187,7 +188,7 @@ module Classifier
187
188
  end
188
189
  [item, val]
189
190
  end
190
- result.sort_by { |x| x[1] }.reverse
191
+ result.compact.sort_by { |x| x[1] }.reverse
191
192
  end
192
193
 
193
194
  # Similar to proximity_array_for_content, this function takes similar
@@ -201,6 +202,7 @@ module Classifier
201
202
  content_node = node_for_content( doc, &block )
202
203
  result =
203
204
  @items.keys.collect do |item|
205
+ next if @items[item].search_norm.blank? # not enough data
204
206
  if $GSL
205
207
  val = content_node.search_norm * @items[item].search_norm.col
206
208
  else
@@ -208,7 +210,7 @@ module Classifier
208
210
  end
209
211
  [item, val]
210
212
  end
211
- result.sort_by { |x| x[1] }.reverse
213
+ result.compact.sort_by { |x| x[1] }.reverse
212
214
  end
213
215
 
214
216
  # This function allows for text-based search of your index. Unlike other functions
@@ -266,7 +268,23 @@ module Classifier
266
268
  ranking = votes.keys.sort_by { |x| votes[x] }
267
269
  return ranking[-1]
268
270
  end
269
-
271
+
272
+ # Same as previous but returns all results, also more permissive in default cut-off
273
+ def classify_multiple( doc, cutoff=0.50, &block )
274
+ icutoff = (@items.size * cutoff).round
275
+ carry = proximity_array_for_content( doc, &block )
276
+ carry = carry[0..icutoff-1]
277
+ votes = {}
278
+ carry.each do |pair|
279
+ categories = @items[pair[0]].categories
280
+ categories.each do |category|
281
+ votes[category] ||= 0.0
282
+ votes[category] += pair[1]
283
+ end
284
+ end
285
+ votes.delete_if{|key, value| value<1 }.keys.sort_by { |x| -votes[x] }
286
+ end
287
+
270
288
  # Prototype, only works on indexed documents.
271
289
  # I have no clue if this is going to work, but in theory
272
290
  # it's supposed to.
data/test/lsi/lsi_test.rb CHANGED
@@ -8,6 +8,9 @@ class LSITest < Test::Unit::TestCase
8
8
  @str3 = "This text revolves around cats. Cats."
9
9
  @str4 = "This text also involves cats. Cats!"
10
10
  @str5 = "This text involves birds. Birds."
11
+ @str6 = "Is it about dogs or birds?"
12
+ @str7 = "Is it about birds or cats?"
13
+ @str8 = "I would prefer a bird over thousand cats or dogs because birds are smaller."
11
14
  end
12
15
 
13
16
  def test_basic_indexing
@@ -29,6 +32,14 @@ class LSITest < Test::Unit::TestCase
29
32
  assert ! lsi.needs_rebuild?
30
33
  end
31
34
 
35
+ def test_basic_categorizing_with_too_small_dataset
36
+ lsi = Classifier::LSI.new
37
+ lsi.add_item @str2, "Dog"
38
+
39
+ assert_equal nil, lsi.classify( @str1 )
40
+ assert_equal [], lsi.classify_multiple( @str3 )
41
+ end
42
+
32
43
  def test_basic_categorizing
33
44
  lsi = Classifier::LSI.new
34
45
  lsi.add_item @str2, "Dog"
@@ -38,9 +49,42 @@ class LSITest < Test::Unit::TestCase
38
49
 
39
50
  assert_equal "Dog", lsi.classify( @str1 )
40
51
  assert_equal "Cat", lsi.classify( @str3 )
41
- assert_equal "Bird", lsi.classify( @str5 )
52
+ assert_equal "Bird", lsi.classify( @str5 )
53
+ assert_equal "Dog", lsi.classify( @str6 )
54
+ assert_equal "Bird", lsi.classify( @str7 )
55
+ assert_equal "Bird", lsi.classify( @str8 )
42
56
  end
43
-
57
+
58
+ def test_multiple_categorizing
59
+ lsi = Classifier::LSI.new
60
+ lsi.add_item @str1, "Dog"
61
+ lsi.add_item @str2, "Dog"
62
+ lsi.add_item @str3, "Cat"
63
+ lsi.add_item @str4, "Cat"
64
+ lsi.add_item @str5, "Bird"
65
+
66
+ assert_equal ["Dog", "Bird"], lsi.classify_multiple( @str6 )
67
+ assert_equal ["Cat", "Bird"], lsi.classify_multiple( @str7 )
68
+ assert_equal ["Bird"], lsi.classify_multiple( @str8 )
69
+ end
70
+
71
+ def test_multiple_categorizing_reverse
72
+ lsi = Classifier::LSI.new
73
+ lsi.add_item @str1, "Dog"
74
+ lsi.add_item @str3, "Cat"
75
+ lsi.add_item @str4, "Cat"
76
+ lsi.add_item @str6, "Dog", "Bird", "Flying"
77
+ lsi.add_item @str7, "Cat", "Bird"
78
+ lsi.add_item @str8, "Bird", "Dog", "Cat"
79
+
80
+ assert_equal ["Dog"], lsi.classify_multiple( @str2 )
81
+ assert_equal ["Cat", "Bird"], lsi.classify_multiple( @str5 )
82
+
83
+ # test with a word unknown alone
84
+ assert_equal "Bird", lsi.classify( "Bird!" )
85
+ assert_equal ["Bird", "Dog", "Cat"], lsi.classify_multiple( "Bird!" )
86
+ end
87
+
44
88
  def test_external_classifying
45
89
  lsi = Classifier::LSI.new
46
90
  bayes = Classifier::Bayes.new :categories => ['Dog', 'Cat', 'Bird']
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yury-classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.4
4
+ version: 1.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yury Korolev
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-03-22 00:00:00 -07:00
12
+ date: 2009-04-14 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -19,9 +19,6 @@ dependencies:
19
19
  version_requirements: !ruby/object:Gem::Requirement
20
20
  requirements:
21
21
  - - ">="
22
- - !ruby/object:Gem::Version
23
- version: "0"
24
- - - "="
25
22
  - !ruby/object:Gem::Version
26
23
  version: 2.2.2
27
24
  version:
@@ -32,9 +29,6 @@ dependencies:
32
29
  version_requirements: !ruby/object:Gem::Requirement
33
30
  requirements:
34
31
  - - ">="
35
- - !ruby/object:Gem::Version
36
- version: "0"
37
- - - "="
38
32
  - !ruby/object:Gem::Version
39
33
  version: 0.5.1
40
34
  version:
@@ -58,6 +52,7 @@ extra_rdoc_files:
58
52
  - LICENSE
59
53
  - README
60
54
  files:
55
+ - classifier.gemspec
61
56
  - lib/classifier/base.rb
62
57
  - lib/classifier/bayes.rb
63
58
  - lib/classifier/extensions/vector.rb
@@ -76,7 +71,6 @@ files:
76
71
  - test/bayes/bayesian_test.rb
77
72
  - test/lsi/lsi_test.rb
78
73
  - test/test_helper.rb
79
- - classifier.gemspec
80
74
  has_rdoc: true
81
75
  homepage: http://github.com/yury/classifier
82
76
  post_install_message: