yury-classifier 1.3.4 → 1.3.5
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest +1 -0
- data/Rakefile +2 -2
- data/classifier.gemspec +9 -9
- data/lib/classifier/lsi/content_node.rb +2 -1
- data/lib/classifier/lsi.rb +21 -3
- data/test/lsi/lsi_test.rb +46 -2
- metadata +3 -9
data/Manifest
CHANGED
data/Rakefile
CHANGED
@@ -2,14 +2,14 @@ require 'rubygems'
|
|
2
2
|
require 'rake'
|
3
3
|
require 'echoe'
|
4
4
|
|
5
|
-
Echoe.new('classifier', '1.3.4') do |p|
|
5
|
+
Echoe.new('classifier', '1.3.5') do |p|
|
6
6
|
p.description = "A general classifier module to allow Bayesian and other types of classifications."
|
7
7
|
p.url = "http://github.com/yury/classifier"
|
8
8
|
p.author = "Yury Korolev"
|
9
9
|
p.email = "yury.korolev@gmail.com"
|
10
10
|
p.ignore_pattern = ["tmp/*", "script/*"]
|
11
11
|
p.development_dependencies = []
|
12
|
-
p.runtime_dependencies = ["activesupport >=
|
12
|
+
p.runtime_dependencies = ["activesupport >=2.2.2", "ruby-stemmer >=0.5.1"]
|
13
13
|
end
|
14
14
|
|
15
15
|
Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each { |ext| load ext }
|
data/classifier.gemspec
CHANGED
@@ -2,15 +2,15 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{classifier}
|
5
|
-
s.version = "1.3.4"
|
5
|
+
s.version = "1.3.5"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Yury Korolev"]
|
9
|
-
s.date = %q{2009-
|
9
|
+
s.date = %q{2009-04-14}
|
10
10
|
s.description = %q{A general classifier module to allow Bayesian and other types of classifications.}
|
11
11
|
s.email = %q{yury.korolev@gmail.com}
|
12
12
|
s.extra_rdoc_files = ["lib/classifier/base.rb", "lib/classifier/bayes.rb", "lib/classifier/extensions/vector.rb", "lib/classifier/extensions/vector_serialize.rb", "lib/classifier/lsi/content_node.rb", "lib/classifier/lsi/summary.rb", "lib/classifier/lsi/word_list.rb", "lib/classifier/lsi.rb", "lib/classifier.rb", "lib/init.rb", "LICENSE", "README"]
|
13
|
-
s.files = ["lib/classifier/base.rb", "lib/classifier/bayes.rb", "lib/classifier/extensions/vector.rb", "lib/classifier/extensions/vector_serialize.rb", "lib/classifier/lsi/content_node.rb", "lib/classifier/lsi/summary.rb", "lib/classifier/lsi/word_list.rb", "lib/classifier/lsi.rb", "lib/classifier.rb", "lib/init.rb", "LICENSE", "Manifest", "Rakefile", "README", "test/base_test.rb", "test/bayes/bayesian_test.rb", "test/lsi/lsi_test.rb", "test/test_helper.rb", "classifier.gemspec"]
|
13
|
+
s.files = ["classifier.gemspec", "lib/classifier/base.rb", "lib/classifier/bayes.rb", "lib/classifier/extensions/vector.rb", "lib/classifier/extensions/vector_serialize.rb", "lib/classifier/lsi/content_node.rb", "lib/classifier/lsi/summary.rb", "lib/classifier/lsi/word_list.rb", "lib/classifier/lsi.rb", "lib/classifier.rb", "lib/init.rb", "LICENSE", "Manifest", "Rakefile", "README", "test/base_test.rb", "test/bayes/bayesian_test.rb", "test/lsi/lsi_test.rb", "test/test_helper.rb"]
|
14
14
|
s.has_rdoc = true
|
15
15
|
s.homepage = %q{http://github.com/yury/classifier}
|
16
16
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Classifier", "--main", "README"]
|
@@ -25,14 +25,14 @@ Gem::Specification.new do |s|
|
|
25
25
|
s.specification_version = 2
|
26
26
|
|
27
27
|
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
28
|
-
s.add_runtime_dependency(%q<activesupport>, [">= 0", "= 2.2.2"])
|
29
|
-
s.add_runtime_dependency(%q<ruby-stemmer>, [">= 0", "= 0.5.1"])
|
28
|
+
s.add_runtime_dependency(%q<activesupport>, [">= 2.2.2"])
|
29
|
+
s.add_runtime_dependency(%q<ruby-stemmer>, [">= 0.5.1"])
|
30
30
|
else
|
31
|
-
s.add_dependency(%q<activesupport>, [">= 0", "= 2.2.2"])
|
32
|
-
s.add_dependency(%q<ruby-stemmer>, [">= 0", "= 0.5.1"])
|
31
|
+
s.add_dependency(%q<activesupport>, [">= 2.2.2"])
|
32
|
+
s.add_dependency(%q<ruby-stemmer>, [">= 0.5.1"])
|
33
33
|
end
|
34
34
|
else
|
35
|
-
s.add_dependency(%q<activesupport>, [">= 0", "= 2.2.2"])
|
36
|
-
s.add_dependency(%q<ruby-stemmer>, [">= 0", "= 0.5.1"])
|
35
|
+
s.add_dependency(%q<activesupport>, [">= 2.2.2"])
|
36
|
+
s.add_dependency(%q<ruby-stemmer>, [">= 0.5.1"])
|
37
37
|
end
|
38
38
|
end
|
@@ -54,7 +54,8 @@ module Classifier
|
|
54
54
|
if ( term > 0 )
|
55
55
|
weighted_total += (( term / total_words ) * Math.log( term / total_words ))
|
56
56
|
end
|
57
|
-
end
|
57
|
+
end
|
58
|
+
weighted_total = -1.0 if weighted_total.zero? # if no word in list is known
|
58
59
|
vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
|
59
60
|
end
|
60
61
|
|
data/lib/classifier/lsi.rb
CHANGED
@@ -180,6 +180,7 @@ module Classifier
|
|
180
180
|
content_node = node_for_content( doc, &block )
|
181
181
|
result =
|
182
182
|
@items.keys.collect do |item|
|
183
|
+
next if @items[item].search_vector.blank? # not enough data
|
183
184
|
if $GSL
|
184
185
|
val = content_node.search_vector * @items[item].search_vector.col
|
185
186
|
else
|
@@ -187,7 +188,7 @@ module Classifier
|
|
187
188
|
end
|
188
189
|
[item, val]
|
189
190
|
end
|
190
|
-
result.sort_by { |x| x[1] }.reverse
|
191
|
+
result.compact.sort_by { |x| x[1] }.reverse
|
191
192
|
end
|
192
193
|
|
193
194
|
# Similar to proximity_array_for_content, this function takes similar
|
@@ -201,6 +202,7 @@ module Classifier
|
|
201
202
|
content_node = node_for_content( doc, &block )
|
202
203
|
result =
|
203
204
|
@items.keys.collect do |item|
|
205
|
+
next if @items[item].search_norm.blank? # not enough data
|
204
206
|
if $GSL
|
205
207
|
val = content_node.search_norm * @items[item].search_norm.col
|
206
208
|
else
|
@@ -208,7 +210,7 @@ module Classifier
|
|
208
210
|
end
|
209
211
|
[item, val]
|
210
212
|
end
|
211
|
-
result.sort_by { |x| x[1] }.reverse
|
213
|
+
result.compact.sort_by { |x| x[1] }.reverse
|
212
214
|
end
|
213
215
|
|
214
216
|
# This function allows for text-based search of your index. Unlike other functions
|
@@ -266,7 +268,23 @@ module Classifier
|
|
266
268
|
ranking = votes.keys.sort_by { |x| votes[x] }
|
267
269
|
return ranking[-1]
|
268
270
|
end
|
269
|
-
|
271
|
+
|
272
|
+
# Same as previous but returns all results, also more permissive in default cut-off
|
273
|
+
def classify_multiple( doc, cutoff=0.50, &block )
|
274
|
+
icutoff = (@items.size * cutoff).round
|
275
|
+
carry = proximity_array_for_content( doc, &block )
|
276
|
+
carry = carry[0..icutoff-1]
|
277
|
+
votes = {}
|
278
|
+
carry.each do |pair|
|
279
|
+
categories = @items[pair[0]].categories
|
280
|
+
categories.each do |category|
|
281
|
+
votes[category] ||= 0.0
|
282
|
+
votes[category] += pair[1]
|
283
|
+
end
|
284
|
+
end
|
285
|
+
votes.delete_if{|key, value| value<1 }.keys.sort_by { |x| -votes[x] }
|
286
|
+
end
|
287
|
+
|
270
288
|
# Prototype, only works on indexed documents.
|
271
289
|
# I have no clue if this is going to work, but in theory
|
272
290
|
# it's supposed to.
|
data/test/lsi/lsi_test.rb
CHANGED
@@ -8,6 +8,9 @@ class LSITest < Test::Unit::TestCase
|
|
8
8
|
@str3 = "This text revolves around cats. Cats."
|
9
9
|
@str4 = "This text also involves cats. Cats!"
|
10
10
|
@str5 = "This text involves birds. Birds."
|
11
|
+
@str6 = "Is it about dogs or birds?"
|
12
|
+
@str7 = "Is it about birds or cats?"
|
13
|
+
@str8 = "I would prefer a bird over thousand cats or dogs because birds are smaller."
|
11
14
|
end
|
12
15
|
|
13
16
|
def test_basic_indexing
|
@@ -29,6 +32,14 @@ class LSITest < Test::Unit::TestCase
|
|
29
32
|
assert ! lsi.needs_rebuild?
|
30
33
|
end
|
31
34
|
|
35
|
+
def test_basic_categorizing_with_too_small_dataset
|
36
|
+
lsi = Classifier::LSI.new
|
37
|
+
lsi.add_item @str2, "Dog"
|
38
|
+
|
39
|
+
assert_equal nil, lsi.classify( @str1 )
|
40
|
+
assert_equal [], lsi.classify_multiple( @str3 )
|
41
|
+
end
|
42
|
+
|
32
43
|
def test_basic_categorizing
|
33
44
|
lsi = Classifier::LSI.new
|
34
45
|
lsi.add_item @str2, "Dog"
|
@@ -38,9 +49,42 @@ class LSITest < Test::Unit::TestCase
|
|
38
49
|
|
39
50
|
assert_equal "Dog", lsi.classify( @str1 )
|
40
51
|
assert_equal "Cat", lsi.classify( @str3 )
|
41
|
-
assert_equal "Bird", lsi.classify( @str5 )
|
52
|
+
assert_equal "Bird", lsi.classify( @str5 )
|
53
|
+
assert_equal "Dog", lsi.classify( @str6 )
|
54
|
+
assert_equal "Bird", lsi.classify( @str7 )
|
55
|
+
assert_equal "Bird", lsi.classify( @str8 )
|
42
56
|
end
|
43
|
-
|
57
|
+
|
58
|
+
def test_multiple_categorizing
|
59
|
+
lsi = Classifier::LSI.new
|
60
|
+
lsi.add_item @str1, "Dog"
|
61
|
+
lsi.add_item @str2, "Dog"
|
62
|
+
lsi.add_item @str3, "Cat"
|
63
|
+
lsi.add_item @str4, "Cat"
|
64
|
+
lsi.add_item @str5, "Bird"
|
65
|
+
|
66
|
+
assert_equal ["Dog", "Bird"], lsi.classify_multiple( @str6 )
|
67
|
+
assert_equal ["Cat", "Bird"], lsi.classify_multiple( @str7 )
|
68
|
+
assert_equal ["Bird"], lsi.classify_multiple( @str8 )
|
69
|
+
end
|
70
|
+
|
71
|
+
def test_multiple_categorizing_reverse
|
72
|
+
lsi = Classifier::LSI.new
|
73
|
+
lsi.add_item @str1, "Dog"
|
74
|
+
lsi.add_item @str3, "Cat"
|
75
|
+
lsi.add_item @str4, "Cat"
|
76
|
+
lsi.add_item @str6, "Dog", "Bird", "Flying"
|
77
|
+
lsi.add_item @str7, "Cat", "Bird"
|
78
|
+
lsi.add_item @str8, "Bird", "Dog", "Cat"
|
79
|
+
|
80
|
+
assert_equal ["Dog"], lsi.classify_multiple( @str2 )
|
81
|
+
assert_equal ["Cat", "Bird"], lsi.classify_multiple( @str5 )
|
82
|
+
|
83
|
+
# test with a word unknown alone
|
84
|
+
assert_equal "Bird", lsi.classify( "Bird!" )
|
85
|
+
assert_equal ["Bird", "Dog", "Cat"], lsi.classify_multiple( "Bird!" )
|
86
|
+
end
|
87
|
+
|
44
88
|
def test_external_classifying
|
45
89
|
lsi = Classifier::LSI.new
|
46
90
|
bayes = Classifier::Bayes.new :categories => ['Dog', 'Cat', 'Bird']
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yury-classifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.4
|
4
|
+
version: 1.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yury Korolev
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-04-14 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -19,9 +19,6 @@ dependencies:
|
|
19
19
|
version_requirements: !ruby/object:Gem::Requirement
|
20
20
|
requirements:
|
21
21
|
- - ">="
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: "0"
|
24
|
-
- - "="
|
25
22
|
- !ruby/object:Gem::Version
|
26
23
|
version: 2.2.2
|
27
24
|
version:
|
@@ -32,9 +29,6 @@ dependencies:
|
|
32
29
|
version_requirements: !ruby/object:Gem::Requirement
|
33
30
|
requirements:
|
34
31
|
- - ">="
|
35
|
-
- !ruby/object:Gem::Version
|
36
|
-
version: "0"
|
37
|
-
- - "="
|
38
32
|
- !ruby/object:Gem::Version
|
39
33
|
version: 0.5.1
|
40
34
|
version:
|
@@ -58,6 +52,7 @@ extra_rdoc_files:
|
|
58
52
|
- LICENSE
|
59
53
|
- README
|
60
54
|
files:
|
55
|
+
- classifier.gemspec
|
61
56
|
- lib/classifier/base.rb
|
62
57
|
- lib/classifier/bayes.rb
|
63
58
|
- lib/classifier/extensions/vector.rb
|
@@ -76,7 +71,6 @@ files:
|
|
76
71
|
- test/bayes/bayesian_test.rb
|
77
72
|
- test/lsi/lsi_test.rb
|
78
73
|
- test/test_helper.rb
|
79
|
-
- classifier.gemspec
|
80
74
|
has_rdoc: true
|
81
75
|
homepage: http://github.com/yury/classifier
|
82
76
|
post_install_message:
|