yury-classifier 1.3.4 → 1.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest +1 -0
- data/Rakefile +2 -2
- data/classifier.gemspec +9 -9
- data/lib/classifier/lsi/content_node.rb +2 -1
- data/lib/classifier/lsi.rb +21 -3
- data/test/lsi/lsi_test.rb +46 -2
- metadata +3 -9
data/Manifest
CHANGED
data/Rakefile
CHANGED
@@ -2,14 +2,14 @@ require 'rubygems'
|
|
2
2
|
require 'rake'
|
3
3
|
require 'echoe'
|
4
4
|
|
5
|
-
Echoe.new('classifier', '1.3.
|
5
|
+
Echoe.new('classifier', '1.3.5') do |p|
|
6
6
|
p.description = "A general classifier module to allow Bayesian and other types of classifications."
|
7
7
|
p.url = "http://github.com/yury/classifier"
|
8
8
|
p.author = "Yury Korolev"
|
9
9
|
p.email = "yury.korolev@gmail.com"
|
10
10
|
p.ignore_pattern = ["tmp/*", "script/*"]
|
11
11
|
p.development_dependencies = []
|
12
|
-
p.runtime_dependencies = ["activesupport >=
|
12
|
+
p.runtime_dependencies = ["activesupport >=2.2.2", "ruby-stemmer >=0.5.1"]
|
13
13
|
end
|
14
14
|
|
15
15
|
Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each { |ext| load ext }
|
data/classifier.gemspec
CHANGED
@@ -2,15 +2,15 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{classifier}
|
5
|
-
s.version = "1.3.
|
5
|
+
s.version = "1.3.5"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Yury Korolev"]
|
9
|
-
s.date = %q{2009-
|
9
|
+
s.date = %q{2009-04-14}
|
10
10
|
s.description = %q{A general classifier module to allow Bayesian and other types of classifications.}
|
11
11
|
s.email = %q{yury.korolev@gmail.com}
|
12
12
|
s.extra_rdoc_files = ["lib/classifier/base.rb", "lib/classifier/bayes.rb", "lib/classifier/extensions/vector.rb", "lib/classifier/extensions/vector_serialize.rb", "lib/classifier/lsi/content_node.rb", "lib/classifier/lsi/summary.rb", "lib/classifier/lsi/word_list.rb", "lib/classifier/lsi.rb", "lib/classifier.rb", "lib/init.rb", "LICENSE", "README"]
|
13
|
-
s.files = ["lib/classifier/base.rb", "lib/classifier/bayes.rb", "lib/classifier/extensions/vector.rb", "lib/classifier/extensions/vector_serialize.rb", "lib/classifier/lsi/content_node.rb", "lib/classifier/lsi/summary.rb", "lib/classifier/lsi/word_list.rb", "lib/classifier/lsi.rb", "lib/classifier.rb", "lib/init.rb", "LICENSE", "Manifest", "Rakefile", "README", "test/base_test.rb", "test/bayes/bayesian_test.rb", "test/lsi/lsi_test.rb", "test/test_helper.rb"
|
13
|
+
s.files = ["classifier.gemspec", "lib/classifier/base.rb", "lib/classifier/bayes.rb", "lib/classifier/extensions/vector.rb", "lib/classifier/extensions/vector_serialize.rb", "lib/classifier/lsi/content_node.rb", "lib/classifier/lsi/summary.rb", "lib/classifier/lsi/word_list.rb", "lib/classifier/lsi.rb", "lib/classifier.rb", "lib/init.rb", "LICENSE", "Manifest", "Rakefile", "README", "test/base_test.rb", "test/bayes/bayesian_test.rb", "test/lsi/lsi_test.rb", "test/test_helper.rb"]
|
14
14
|
s.has_rdoc = true
|
15
15
|
s.homepage = %q{http://github.com/yury/classifier}
|
16
16
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Classifier", "--main", "README"]
|
@@ -25,14 +25,14 @@ Gem::Specification.new do |s|
|
|
25
25
|
s.specification_version = 2
|
26
26
|
|
27
27
|
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
28
|
-
s.add_runtime_dependency(%q<activesupport>, [">=
|
29
|
-
s.add_runtime_dependency(%q<ruby-stemmer>, [">= 0
|
28
|
+
s.add_runtime_dependency(%q<activesupport>, [">= 2.2.2"])
|
29
|
+
s.add_runtime_dependency(%q<ruby-stemmer>, [">= 0.5.1"])
|
30
30
|
else
|
31
|
-
s.add_dependency(%q<activesupport>, [">=
|
32
|
-
s.add_dependency(%q<ruby-stemmer>, [">= 0
|
31
|
+
s.add_dependency(%q<activesupport>, [">= 2.2.2"])
|
32
|
+
s.add_dependency(%q<ruby-stemmer>, [">= 0.5.1"])
|
33
33
|
end
|
34
34
|
else
|
35
|
-
s.add_dependency(%q<activesupport>, [">=
|
36
|
-
s.add_dependency(%q<ruby-stemmer>, [">= 0
|
35
|
+
s.add_dependency(%q<activesupport>, [">= 2.2.2"])
|
36
|
+
s.add_dependency(%q<ruby-stemmer>, [">= 0.5.1"])
|
37
37
|
end
|
38
38
|
end
|
@@ -54,7 +54,8 @@ module Classifier
|
|
54
54
|
if ( term > 0 )
|
55
55
|
weighted_total += (( term / total_words ) * Math.log( term / total_words ))
|
56
56
|
end
|
57
|
-
end
|
57
|
+
end
|
58
|
+
weighted_total = -1.0 if weighted_total.zero? # if no word in list is known
|
58
59
|
vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
|
59
60
|
end
|
60
61
|
|
data/lib/classifier/lsi.rb
CHANGED
@@ -180,6 +180,7 @@ module Classifier
|
|
180
180
|
content_node = node_for_content( doc, &block )
|
181
181
|
result =
|
182
182
|
@items.keys.collect do |item|
|
183
|
+
next if @items[item].search_vector.blank? # not enough data
|
183
184
|
if $GSL
|
184
185
|
val = content_node.search_vector * @items[item].search_vector.col
|
185
186
|
else
|
@@ -187,7 +188,7 @@ module Classifier
|
|
187
188
|
end
|
188
189
|
[item, val]
|
189
190
|
end
|
190
|
-
result.sort_by { |x| x[1] }.reverse
|
191
|
+
result.compact.sort_by { |x| x[1] }.reverse
|
191
192
|
end
|
192
193
|
|
193
194
|
# Similar to proximity_array_for_content, this function takes similar
|
@@ -201,6 +202,7 @@ module Classifier
|
|
201
202
|
content_node = node_for_content( doc, &block )
|
202
203
|
result =
|
203
204
|
@items.keys.collect do |item|
|
205
|
+
next if @items[item].search_norm.blank? # not enough data
|
204
206
|
if $GSL
|
205
207
|
val = content_node.search_norm * @items[item].search_norm.col
|
206
208
|
else
|
@@ -208,7 +210,7 @@ module Classifier
|
|
208
210
|
end
|
209
211
|
[item, val]
|
210
212
|
end
|
211
|
-
result.sort_by { |x| x[1] }.reverse
|
213
|
+
result.compact.sort_by { |x| x[1] }.reverse
|
212
214
|
end
|
213
215
|
|
214
216
|
# This function allows for text-based search of your index. Unlike other functions
|
@@ -266,7 +268,23 @@ module Classifier
|
|
266
268
|
ranking = votes.keys.sort_by { |x| votes[x] }
|
267
269
|
return ranking[-1]
|
268
270
|
end
|
269
|
-
|
271
|
+
|
272
|
+
# Same as previous but returns all results, also more permissive in default cut-off
|
273
|
+
def classify_multiple( doc, cutoff=0.50, &block )
|
274
|
+
icutoff = (@items.size * cutoff).round
|
275
|
+
carry = proximity_array_for_content( doc, &block )
|
276
|
+
carry = carry[0..icutoff-1]
|
277
|
+
votes = {}
|
278
|
+
carry.each do |pair|
|
279
|
+
categories = @items[pair[0]].categories
|
280
|
+
categories.each do |category|
|
281
|
+
votes[category] ||= 0.0
|
282
|
+
votes[category] += pair[1]
|
283
|
+
end
|
284
|
+
end
|
285
|
+
votes.delete_if{|key, value| value<1 }.keys.sort_by { |x| -votes[x] }
|
286
|
+
end
|
287
|
+
|
270
288
|
# Prototype, only works on indexed documents.
|
271
289
|
# I have no clue if this is going to work, but in theory
|
272
290
|
# it's supposed to.
|
data/test/lsi/lsi_test.rb
CHANGED
@@ -8,6 +8,9 @@ class LSITest < Test::Unit::TestCase
|
|
8
8
|
@str3 = "This text revolves around cats. Cats."
|
9
9
|
@str4 = "This text also involves cats. Cats!"
|
10
10
|
@str5 = "This text involves birds. Birds."
|
11
|
+
@str6 = "Is it about dogs or birds?"
|
12
|
+
@str7 = "Is it about birds or cats?"
|
13
|
+
@str8 = "I would prefer a bird over thousand cats or dogs because birds are smaller."
|
11
14
|
end
|
12
15
|
|
13
16
|
def test_basic_indexing
|
@@ -29,6 +32,14 @@ class LSITest < Test::Unit::TestCase
|
|
29
32
|
assert ! lsi.needs_rebuild?
|
30
33
|
end
|
31
34
|
|
35
|
+
def test_basic_categorizing_with_too_small_dataset
|
36
|
+
lsi = Classifier::LSI.new
|
37
|
+
lsi.add_item @str2, "Dog"
|
38
|
+
|
39
|
+
assert_equal nil, lsi.classify( @str1 )
|
40
|
+
assert_equal [], lsi.classify_multiple( @str3 )
|
41
|
+
end
|
42
|
+
|
32
43
|
def test_basic_categorizing
|
33
44
|
lsi = Classifier::LSI.new
|
34
45
|
lsi.add_item @str2, "Dog"
|
@@ -38,9 +49,42 @@ class LSITest < Test::Unit::TestCase
|
|
38
49
|
|
39
50
|
assert_equal "Dog", lsi.classify( @str1 )
|
40
51
|
assert_equal "Cat", lsi.classify( @str3 )
|
41
|
-
assert_equal "Bird", lsi.classify( @str5 )
|
52
|
+
assert_equal "Bird", lsi.classify( @str5 )
|
53
|
+
assert_equal "Dog", lsi.classify( @str6 )
|
54
|
+
assert_equal "Bird", lsi.classify( @str7 )
|
55
|
+
assert_equal "Bird", lsi.classify( @str8 )
|
42
56
|
end
|
43
|
-
|
57
|
+
|
58
|
+
def test_multiple_categorizing
|
59
|
+
lsi = Classifier::LSI.new
|
60
|
+
lsi.add_item @str1, "Dog"
|
61
|
+
lsi.add_item @str2, "Dog"
|
62
|
+
lsi.add_item @str3, "Cat"
|
63
|
+
lsi.add_item @str4, "Cat"
|
64
|
+
lsi.add_item @str5, "Bird"
|
65
|
+
|
66
|
+
assert_equal ["Dog", "Bird"], lsi.classify_multiple( @str6 )
|
67
|
+
assert_equal ["Cat", "Bird"], lsi.classify_multiple( @str7 )
|
68
|
+
assert_equal ["Bird"], lsi.classify_multiple( @str8 )
|
69
|
+
end
|
70
|
+
|
71
|
+
def test_multiple_categorizing_reverse
|
72
|
+
lsi = Classifier::LSI.new
|
73
|
+
lsi.add_item @str1, "Dog"
|
74
|
+
lsi.add_item @str3, "Cat"
|
75
|
+
lsi.add_item @str4, "Cat"
|
76
|
+
lsi.add_item @str6, "Dog", "Bird", "Flying"
|
77
|
+
lsi.add_item @str7, "Cat", "Bird"
|
78
|
+
lsi.add_item @str8, "Bird", "Dog", "Cat"
|
79
|
+
|
80
|
+
assert_equal ["Dog"], lsi.classify_multiple( @str2 )
|
81
|
+
assert_equal ["Cat", "Bird"], lsi.classify_multiple( @str5 )
|
82
|
+
|
83
|
+
# test with a word unknown alone
|
84
|
+
assert_equal "Bird", lsi.classify( "Bird!" )
|
85
|
+
assert_equal ["Bird", "Dog", "Cat"], lsi.classify_multiple( "Bird!" )
|
86
|
+
end
|
87
|
+
|
44
88
|
def test_external_classifying
|
45
89
|
lsi = Classifier::LSI.new
|
46
90
|
bayes = Classifier::Bayes.new :categories => ['Dog', 'Cat', 'Bird']
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yury-classifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.
|
4
|
+
version: 1.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yury Korolev
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-04-14 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -19,9 +19,6 @@ dependencies:
|
|
19
19
|
version_requirements: !ruby/object:Gem::Requirement
|
20
20
|
requirements:
|
21
21
|
- - ">="
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: "0"
|
24
|
-
- - "="
|
25
22
|
- !ruby/object:Gem::Version
|
26
23
|
version: 2.2.2
|
27
24
|
version:
|
@@ -32,9 +29,6 @@ dependencies:
|
|
32
29
|
version_requirements: !ruby/object:Gem::Requirement
|
33
30
|
requirements:
|
34
31
|
- - ">="
|
35
|
-
- !ruby/object:Gem::Version
|
36
|
-
version: "0"
|
37
|
-
- - "="
|
38
32
|
- !ruby/object:Gem::Version
|
39
33
|
version: 0.5.1
|
40
34
|
version:
|
@@ -58,6 +52,7 @@ extra_rdoc_files:
|
|
58
52
|
- LICENSE
|
59
53
|
- README
|
60
54
|
files:
|
55
|
+
- classifier.gemspec
|
61
56
|
- lib/classifier/base.rb
|
62
57
|
- lib/classifier/bayes.rb
|
63
58
|
- lib/classifier/extensions/vector.rb
|
@@ -76,7 +71,6 @@ files:
|
|
76
71
|
- test/bayes/bayesian_test.rb
|
77
72
|
- test/lsi/lsi_test.rb
|
78
73
|
- test/test_helper.rb
|
79
|
-
- classifier.gemspec
|
80
74
|
has_rdoc: true
|
81
75
|
homepage: http://github.com/yury/classifier
|
82
76
|
post_install_message:
|