classifier 1.3.4 → 1.3.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/LICENSE +2 -2
- data/lib/classifier.rb +1 -1
- data/lib/classifier/bayes.rb +7 -7
- data/lib/classifier/extensions/vector.rb +9 -9
- data/lib/classifier/extensions/vector_serialize.rb +4 -4
- data/lib/classifier/extensions/word_hash.rb +8 -8
- data/lib/classifier/lsi.rb +68 -68
- data/lib/classifier/lsi/content_node.rb +17 -17
- data/lib/classifier/lsi/summary.rb +4 -4
- data/lib/classifier/lsi/word_list.rb +7 -7
- metadata +72 -25
- data/Gemfile +0 -5
- data/Gemfile.lock +0 -26
- data/README.markdown +0 -97
- data/Rakefile +0 -84
- data/test/bayes/bayesian_test.rb +0 -33
- data/test/extensions/word_hash_test.rb +0 -35
- data/test/lsi/lsi_test.rb +0 -123
- data/test/test_helper.rb +0 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 1d453b4ca9e0a0c44a2b8d3c9ef55db5b55a17efe5ff0cfbcab93f24965cd536
|
4
|
+
data.tar.gz: 26f5d9595ddd35c8d7c239f946afa8251a4b68f24f5b8bc41e30be13ded60547
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c13c1666c7d2fe92d47ab7ced0a885fec7b13719aea25f283a013d5015744d6aea7d473706af54042972ba687b214c2a3a619f58d75560d90e57c3569d38f957
|
7
|
+
data.tar.gz: 23261ba6708307ecf6faac636d8572c50720fbf9a2e6db6ee736a07ce3de445daa431afb3bbf2c49dd4d2bd699327ca1c419029b4972236c52a9a5c1f00ab5a2
|
data/LICENSE
CHANGED
@@ -146,7 +146,7 @@ such a program is covered only if its contents constitute a work based
|
|
146
146
|
on the Library (independent of the use of the Library in a tool for
|
147
147
|
writing it). Whether that is true depends on what the Library does
|
148
148
|
and what the program that uses the Library does.
|
149
|
-
|
149
|
+
|
150
150
|
1. You may copy and distribute verbatim copies of the Library's
|
151
151
|
complete source code as you receive it, in any medium, provided that
|
152
152
|
you conspicuously and appropriately publish on each copy an
|
@@ -426,4 +426,4 @@ the Free Software Foundation.
|
|
426
426
|
14. If you wish to incorporate parts of the Library into other free
|
427
427
|
programs whose distribution conditions are incompatible with these,
|
428
428
|
write to the author to ask for permission. For software which is
|
429
|
-
copyrighted by
|
429
|
+
copyrighted by
|
data/lib/classifier.rb
CHANGED
data/lib/classifier/bayes.rb
CHANGED
@@ -6,7 +6,7 @@ module Classifier
|
|
6
6
|
|
7
7
|
class Bayes
|
8
8
|
# The class can be created with one or more categories, each of which will be
|
9
|
-
# initialized and given a training method. E.g.,
|
9
|
+
# initialized and given a training method. E.g.,
|
10
10
|
# b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
11
11
|
def initialize(*categories)
|
12
12
|
@categories = Hash.new
|
@@ -56,7 +56,7 @@ class Bayes
|
|
56
56
|
end
|
57
57
|
end
|
58
58
|
end
|
59
|
-
|
59
|
+
|
60
60
|
#
|
61
61
|
# Returns the scores in each category the provided +text+. E.g.,
|
62
62
|
# b.classifications "I hate bad words and you"
|
@@ -80,14 +80,14 @@ class Bayes
|
|
80
80
|
end
|
81
81
|
|
82
82
|
#
|
83
|
-
# Returns the classification of the provided +text+, which is one of the
|
83
|
+
# Returns the classification of the provided +text+, which is one of the
|
84
84
|
# categories given in the initializer. E.g.,
|
85
85
|
# b.classify "I hate bad words and you"
|
86
86
|
# => 'Uninteresting'
|
87
87
|
def classify(text)
|
88
88
|
(classifications(text).sort_by { |a| -a[1] })[0][0]
|
89
89
|
end
|
90
|
-
|
90
|
+
|
91
91
|
#
|
92
92
|
# Provides training and untraining methods for the categories specified in Bayes#new
|
93
93
|
# For example:
|
@@ -106,7 +106,7 @@ class Bayes
|
|
106
106
|
super #raise StandardError, "No such method: #{name}"
|
107
107
|
end
|
108
108
|
end
|
109
|
-
|
109
|
+
|
110
110
|
#
|
111
111
|
# Provides a list of category names
|
112
112
|
# For example:
|
@@ -115,7 +115,7 @@ class Bayes
|
|
115
115
|
def categories # :nodoc:
|
116
116
|
@categories.keys.collect {|c| c.to_s}
|
117
117
|
end
|
118
|
-
|
118
|
+
|
119
119
|
#
|
120
120
|
# Allows you to add categories to the classifier.
|
121
121
|
# For example:
|
@@ -128,7 +128,7 @@ class Bayes
|
|
128
128
|
def add_category(category)
|
129
129
|
@categories[category.prepare_category_name] = Hash.new
|
130
130
|
end
|
131
|
-
|
131
|
+
|
132
132
|
alias append_category add_category
|
133
133
|
end
|
134
134
|
|
@@ -1,5 +1,5 @@
|
|
1
1
|
# Author:: Ernest Ellingson
|
2
|
-
# Copyright:: Copyright (c) 2005
|
2
|
+
# Copyright:: Copyright (c) 2005
|
3
3
|
|
4
4
|
# These are extensions to the std-lib 'matrix' to allow an all ruby SVD
|
5
5
|
|
@@ -9,7 +9,7 @@ require 'mathn'
|
|
9
9
|
class Array
|
10
10
|
def sum(identity = 0, &block)
|
11
11
|
return identity unless size > 0
|
12
|
-
|
12
|
+
|
13
13
|
if block_given?
|
14
14
|
map(&block).sum
|
15
15
|
else
|
@@ -22,7 +22,7 @@ class Vector
|
|
22
22
|
def magnitude
|
23
23
|
sumsqs = 0.0
|
24
24
|
self.size.times do |i|
|
25
|
-
sumsqs += self[i] ** 2.0
|
25
|
+
sumsqs += self[i] ** 2.0
|
26
26
|
end
|
27
27
|
Math.sqrt(sumsqs)
|
28
28
|
end
|
@@ -42,7 +42,7 @@ class Matrix
|
|
42
42
|
def Matrix.diag(s)
|
43
43
|
Matrix.diagonal(*s)
|
44
44
|
end
|
45
|
-
|
45
|
+
|
46
46
|
alias :trans :transpose
|
47
47
|
|
48
48
|
def SV_decomp(maxSweeps = 20)
|
@@ -51,7 +51,7 @@ class Matrix
|
|
51
51
|
else
|
52
52
|
q = self * self.trans
|
53
53
|
end
|
54
|
-
|
54
|
+
|
55
55
|
qrot = q.dup
|
56
56
|
v = Matrix.identity(q.row_size)
|
57
57
|
azrot = nil
|
@@ -75,16 +75,16 @@ class Matrix
|
|
75
75
|
mzrot[col,col] = hcos
|
76
76
|
qrot = mzrot.trans * qrot * mzrot
|
77
77
|
v = v * mzrot
|
78
|
-
end
|
78
|
+
end
|
79
79
|
end
|
80
80
|
s_old = qrot.dup if cnt == 1
|
81
|
-
sum_qrot = 0.0
|
81
|
+
sum_qrot = 0.0
|
82
82
|
if cnt > 1
|
83
83
|
qrot.row_size.times do |r|
|
84
84
|
sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
|
85
85
|
end
|
86
86
|
s_old = qrot.dup
|
87
|
-
end
|
87
|
+
end
|
88
88
|
break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
|
89
89
|
end # of do while true
|
90
90
|
s = []
|
@@ -93,7 +93,7 @@ class Matrix
|
|
93
93
|
end
|
94
94
|
#puts "cnt = #{cnt}"
|
95
95
|
if self.row_size >= self.column_size
|
96
|
-
mu = self * v * Matrix.diagonal(*s).inverse
|
96
|
+
mu = self * v * Matrix.diagonal(*s).inverse
|
97
97
|
return [mu, v, s]
|
98
98
|
else
|
99
99
|
puts v.row_size
|
@@ -1,17 +1,17 @@
|
|
1
1
|
module GSL
|
2
|
-
|
2
|
+
|
3
3
|
class Vector
|
4
4
|
def _dump(v)
|
5
5
|
Marshal.dump( self.to_a )
|
6
6
|
end
|
7
|
-
|
7
|
+
|
8
8
|
def self._load(arr)
|
9
9
|
arry = Marshal.load(arr)
|
10
10
|
return GSL::Vector.alloc(arry)
|
11
11
|
end
|
12
|
-
|
12
|
+
|
13
13
|
end
|
14
|
-
|
14
|
+
|
15
15
|
class Matrix
|
16
16
|
class <<self
|
17
17
|
alias :diag :diagonal
|
@@ -4,20 +4,20 @@
|
|
4
4
|
|
5
5
|
require "set"
|
6
6
|
|
7
|
-
# These are extensions to the String class to provide convenience
|
7
|
+
# These are extensions to the String class to provide convenience
|
8
8
|
# methods for the Classifier package.
|
9
9
|
class String
|
10
|
-
|
11
|
-
# Removes common punctuation symbols, returning a new string.
|
10
|
+
|
11
|
+
# Removes common punctuation symbols, returning a new string.
|
12
12
|
# E.g.,
|
13
13
|
# "Hello (greeting's), with {braces} < >...?".without_punctuation
|
14
14
|
# => "Hello greetings with braces "
|
15
15
|
def without_punctuation
|
16
16
|
tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
|
17
17
|
end
|
18
|
-
|
18
|
+
|
19
19
|
# Return a Hash of strings => ints. Each word in the string is stemmed,
|
20
|
-
# interned, and indexes to its frequency in the document.
|
20
|
+
# interned, and indexes to its frequency in the document.
|
21
21
|
def word_hash
|
22
22
|
word_hash = clean_word_hash()
|
23
23
|
symbol_hash = word_hash_for_symbols(gsub(/[\w]/," ").split)
|
@@ -28,9 +28,9 @@ class String
|
|
28
28
|
def clean_word_hash
|
29
29
|
word_hash_for_words gsub(/[^\w\s]/,"").split
|
30
30
|
end
|
31
|
-
|
31
|
+
|
32
32
|
private
|
33
|
-
|
33
|
+
|
34
34
|
def word_hash_for_words(words)
|
35
35
|
d = Hash.new(0)
|
36
36
|
words.each do |word|
|
@@ -50,7 +50,7 @@ class String
|
|
50
50
|
end
|
51
51
|
return d
|
52
52
|
end
|
53
|
-
|
53
|
+
|
54
54
|
CORPUS_SKIP_WORDS = Set.new([
|
55
55
|
"a",
|
56
56
|
"again",
|
data/lib/classifier/lsi.rb
CHANGED
@@ -4,30 +4,30 @@
|
|
4
4
|
|
5
5
|
begin
|
6
6
|
raise LoadError if ENV['NATIVE_VECTOR'] == "true" # to test the native vector class, try `rake test NATIVE_VECTOR=true`
|
7
|
-
|
7
|
+
|
8
8
|
require 'gsl' # requires http://rb-gsl.rubyforge.org/
|
9
9
|
require 'classifier/extensions/vector_serialize'
|
10
10
|
$GSL = true
|
11
|
-
|
11
|
+
|
12
12
|
rescue LoadError
|
13
13
|
warn "Notice: for 10x faster LSI support, please install http://rb-gsl.rubyforge.org/"
|
14
|
-
require 'classifier/extensions/vector'
|
14
|
+
require 'classifier/extensions/vector'
|
15
15
|
end
|
16
|
-
|
16
|
+
|
17
17
|
require 'classifier/lsi/word_list'
|
18
18
|
require 'classifier/lsi/content_node'
|
19
19
|
require 'classifier/lsi/summary'
|
20
20
|
|
21
21
|
module Classifier
|
22
|
-
|
22
|
+
|
23
23
|
# This class implements a Latent Semantic Indexer, which can search, classify and cluster
|
24
24
|
# data based on underlying semantic relations. For more information on the algorithms used,
|
25
25
|
# please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
|
26
26
|
class LSI
|
27
|
-
|
27
|
+
|
28
28
|
attr_reader :word_list
|
29
29
|
attr_accessor :auto_rebuild
|
30
|
-
|
30
|
+
|
31
31
|
# Create a fresh index.
|
32
32
|
# If you want to call #build_index manually, use
|
33
33
|
# Classifier::LSI.new :auto_rebuild => false
|
@@ -37,20 +37,20 @@ module Classifier
|
|
37
37
|
@word_list, @items = WordList.new, {}
|
38
38
|
@version, @built_at_version = 0, -1
|
39
39
|
end
|
40
|
-
|
40
|
+
|
41
41
|
# Returns true if the index needs to be rebuilt. The index needs
|
42
42
|
# to be built after all informaton is added, but before you start
|
43
43
|
# using it for search, classification and cluster detection.
|
44
44
|
def needs_rebuild?
|
45
45
|
(@items.keys.size > 1) && (@version != @built_at_version)
|
46
46
|
end
|
47
|
-
|
48
|
-
# Adds an item to the index. item is assumed to be a string, but
|
47
|
+
|
48
|
+
# Adds an item to the index. item is assumed to be a string, but
|
49
49
|
# any item may be indexed so long as it responds to #to_s or if
|
50
|
-
# you provide an optional block explaining how the indexer can
|
50
|
+
# you provide an optional block explaining how the indexer can
|
51
51
|
# fetch fresh string data. This optional block is passed the item,
|
52
52
|
# so the item may only be a reference to a URL or file name.
|
53
|
-
#
|
53
|
+
#
|
54
54
|
# For example:
|
55
55
|
# lsi = Classifier::LSI.new
|
56
56
|
# lsi.add_item "This is just plain text"
|
@@ -65,14 +65,14 @@ module Classifier
|
|
65
65
|
build_index if @auto_rebuild
|
66
66
|
end
|
67
67
|
|
68
|
-
# A less flexible shorthand for add_item that assumes
|
68
|
+
# A less flexible shorthand for add_item that assumes
|
69
69
|
# you are passing in a string with no categorries. item
|
70
|
-
# will be duck typed via to_s .
|
70
|
+
# will be duck typed via to_s .
|
71
71
|
#
|
72
72
|
def <<( item )
|
73
73
|
add_item item
|
74
74
|
end
|
75
|
-
|
75
|
+
|
76
76
|
# Returns the categories for a given indexed items. You are free to add and remove
|
77
77
|
# items from this as you see fit. It does not invalide an index to change its categories.
|
78
78
|
def categories_for(item)
|
@@ -80,7 +80,7 @@ module Classifier
|
|
80
80
|
return @items[item].categories
|
81
81
|
end
|
82
82
|
|
83
|
-
# Removes an item from the database, if it is indexed.
|
83
|
+
# Removes an item from the database, if it is indexed.
|
84
84
|
#
|
85
85
|
def remove_item( item )
|
86
86
|
if @items.keys.contain? item
|
@@ -88,12 +88,12 @@ module Classifier
|
|
88
88
|
@version += 1
|
89
89
|
end
|
90
90
|
end
|
91
|
-
|
92
|
-
# Returns an array of items that are indexed.
|
91
|
+
|
92
|
+
# Returns an array of items that are indexed.
|
93
93
|
def items
|
94
94
|
@items.keys
|
95
95
|
end
|
96
|
-
|
96
|
+
|
97
97
|
# Returns the categories for a given indexed items. You are free to add and remove
|
98
98
|
# items from this as you see fit. It does not invalide an index to change its categories.
|
99
99
|
def categories_for(item)
|
@@ -103,30 +103,30 @@ module Classifier
|
|
103
103
|
|
104
104
|
# This function rebuilds the index if needs_rebuild? returns true.
|
105
105
|
# For very large document spaces, this indexing operation may take some
|
106
|
-
# time to complete, so it may be wise to place the operation in another
|
107
|
-
# thread.
|
106
|
+
# time to complete, so it may be wise to place the operation in another
|
107
|
+
# thread.
|
108
108
|
#
|
109
109
|
# As a rule, indexing will be fairly swift on modern machines until
|
110
|
-
# you have well over 500 documents indexed, or have an incredibly diverse
|
111
|
-
# vocabulary for your documents.
|
110
|
+
# you have well over 500 documents indexed, or have an incredibly diverse
|
111
|
+
# vocabulary for your documents.
|
112
112
|
#
|
113
113
|
# The optional parameter "cutoff" is a tuning parameter. When the index is
|
114
|
-
# built, a certain number of s-values are discarded from the system. The
|
114
|
+
# built, a certain number of s-values are discarded from the system. The
|
115
115
|
# cutoff parameter tells the indexer how many of these values to keep.
|
116
116
|
# A value of 1 for cutoff means that no semantic analysis will take place,
|
117
117
|
# turning the LSI class into a simple vector search engine.
|
118
118
|
def build_index( cutoff=0.75 )
|
119
119
|
return unless needs_rebuild?
|
120
120
|
make_word_list
|
121
|
-
|
121
|
+
|
122
122
|
doc_list = @items.values
|
123
123
|
tda = doc_list.collect { |node| node.raw_vector_with( @word_list ) }
|
124
|
-
|
124
|
+
|
125
125
|
if $GSL
|
126
126
|
tdm = GSL::Matrix.alloc(*tda).trans
|
127
127
|
ntdm = build_reduced_matrix(tdm, cutoff)
|
128
128
|
|
129
|
-
ntdm.size[1].times do |col|
|
129
|
+
ntdm.size[1].times do |col|
|
130
130
|
vec = GSL::Vector.alloc( ntdm.column(col) ).row
|
131
131
|
doc_list[col].lsi_vector = vec
|
132
132
|
doc_list[col].lsi_norm = vec.normalize
|
@@ -134,50 +134,50 @@ module Classifier
|
|
134
134
|
else
|
135
135
|
tdm = Matrix.rows(tda).trans
|
136
136
|
ntdm = build_reduced_matrix(tdm, cutoff)
|
137
|
-
|
137
|
+
|
138
138
|
ntdm.row_size.times do |col|
|
139
139
|
doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
|
140
140
|
doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
|
141
141
|
end
|
142
142
|
end
|
143
|
-
|
143
|
+
|
144
144
|
@built_at_version = @version
|
145
145
|
end
|
146
|
-
|
146
|
+
|
147
147
|
# This method returns max_chunks entries, ordered by their average semantic rating.
|
148
148
|
# Essentially, the average distance of each entry from all other entries is calculated,
|
149
149
|
# the highest are returned.
|
150
150
|
#
|
151
151
|
# This can be used to build a summary service, or to provide more information about
|
152
152
|
# your dataset's general content. For example, if you were to use categorize on the
|
153
|
-
# results of this data, you could gather information on what your dataset is generally
|
153
|
+
# results of this data, you could gather information on what your dataset is generally
|
154
154
|
# about.
|
155
155
|
def highest_relative_content( max_chunks=10 )
|
156
156
|
return [] if needs_rebuild?
|
157
|
-
|
157
|
+
|
158
158
|
avg_density = Hash.new
|
159
159
|
@items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
|
160
|
-
|
160
|
+
|
161
161
|
avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
|
162
162
|
end
|
163
163
|
|
164
|
-
# This function is the primitive that find_related and classify
|
164
|
+
# This function is the primitive that find_related and classify
|
165
165
|
# build upon. It returns an array of 2-element arrays. The first element
|
166
166
|
# of this array is a document, and the second is its "score", defining
|
167
167
|
# how "close" it is to other indexed items.
|
168
|
-
#
|
168
|
+
#
|
169
169
|
# These values are somewhat arbitrary, having to do with the vector space
|
170
170
|
# created by your content, so the magnitude is interpretable but not always
|
171
|
-
# meaningful between indexes.
|
171
|
+
# meaningful between indexes.
|
172
172
|
#
|
173
173
|
# The parameter doc is the content to compare. If that content is not
|
174
|
-
# indexed, you can pass an optional block to define how to create the
|
175
|
-
# text data. See add_item for examples of how this works.
|
174
|
+
# indexed, you can pass an optional block to define how to create the
|
175
|
+
# text data. See add_item for examples of how this works.
|
176
176
|
def proximity_array_for_content( doc, &block )
|
177
177
|
return [] if needs_rebuild?
|
178
|
-
|
178
|
+
|
179
179
|
content_node = node_for_content( doc, &block )
|
180
|
-
result =
|
180
|
+
result =
|
181
181
|
@items.keys.collect do |item|
|
182
182
|
if $GSL
|
183
183
|
val = content_node.search_vector * @items[item].search_vector.col
|
@@ -187,18 +187,18 @@ module Classifier
|
|
187
187
|
[item, val]
|
188
188
|
end
|
189
189
|
result.sort_by { |x| x[1] }.reverse
|
190
|
-
end
|
191
|
-
|
190
|
+
end
|
191
|
+
|
192
192
|
# Similar to proximity_array_for_content, this function takes similar
|
193
193
|
# arguments and returns a similar array. However, it uses the normalized
|
194
|
-
# calculated vectors instead of their full versions. This is useful when
|
194
|
+
# calculated vectors instead of their full versions. This is useful when
|
195
195
|
# you're trying to perform operations on content that is much smaller than
|
196
196
|
# the text you're working with. search uses this primitive.
|
197
197
|
def proximity_norms_for_content( doc, &block )
|
198
198
|
return [] if needs_rebuild?
|
199
|
-
|
199
|
+
|
200
200
|
content_node = node_for_content( doc, &block )
|
201
|
-
result =
|
201
|
+
result =
|
202
202
|
@items.keys.collect do |item|
|
203
203
|
if $GSL
|
204
204
|
val = content_node.search_norm * @items[item].search_norm.col
|
@@ -208,12 +208,12 @@ module Classifier
|
|
208
208
|
[item, val]
|
209
209
|
end
|
210
210
|
result.sort_by { |x| x[1] }.reverse
|
211
|
-
end
|
212
|
-
|
211
|
+
end
|
212
|
+
|
213
213
|
# This function allows for text-based search of your index. Unlike other functions
|
214
214
|
# like find_related and classify, search only takes short strings. It will also ignore
|
215
|
-
# factors like repeated words. It is best for short, google-like search terms.
|
216
|
-
# A search will first priortize lexical relationships, then semantic ones.
|
215
|
+
# factors like repeated words. It is best for short, google-like search terms.
|
216
|
+
# A search will first priortize lexical relationships, then semantic ones.
|
217
217
|
#
|
218
218
|
# While this may seem backwards compared to the other functions that LSI supports,
|
219
219
|
# it is actually the same algorithm, just applied on a smaller document.
|
@@ -223,30 +223,30 @@ module Classifier
|
|
223
223
|
result = carry.collect { |x| x[0] }
|
224
224
|
return result[0..max_nearest-1]
|
225
225
|
end
|
226
|
-
|
226
|
+
|
227
227
|
# This function takes content and finds other documents
|
228
228
|
# that are semantically "close", returning an array of documents sorted
|
229
229
|
# from most to least relavant.
|
230
|
-
# max_nearest specifies the number of documents to return. A value of
|
231
|
-
# 0 means that it returns all the indexed documents, sorted by relavence.
|
230
|
+
# max_nearest specifies the number of documents to return. A value of
|
231
|
+
# 0 means that it returns all the indexed documents, sorted by relavence.
|
232
232
|
#
|
233
|
-
# This is particularly useful for identifing clusters in your document space.
|
233
|
+
# This is particularly useful for identifing clusters in your document space.
|
234
234
|
# For example you may want to identify several "What's Related" items for weblog
|
235
235
|
# articles, or find paragraphs that relate to each other in an essay.
|
236
236
|
def find_related( doc, max_nearest=3, &block )
|
237
|
-
carry =
|
237
|
+
carry =
|
238
238
|
proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
|
239
239
|
result = carry.collect { |x| x[0] }
|
240
240
|
return result[0..max_nearest-1]
|
241
241
|
end
|
242
|
-
|
243
|
-
# This function uses a voting system to categorize documents, based on
|
244
|
-
# the categories of other documents. It uses the same logic as the
|
242
|
+
|
243
|
+
# This function uses a voting system to categorize documents, based on
|
244
|
+
# the categories of other documents. It uses the same logic as the
|
245
245
|
# find_related function to find related documents, then returns the
|
246
|
-
# most obvious category from this list.
|
246
|
+
# most obvious category from this list.
|
247
247
|
#
|
248
|
-
# cutoff signifies the number of documents to consider when clasifying
|
249
|
-
# text. A cutoff of 1 means that every document in the index votes on
|
248
|
+
# cutoff signifies the number of documents to consider when clasifying
|
249
|
+
# text. A cutoff of 1 means that every document in the index votes on
|
250
250
|
# what category the document is in. This may not always make sense.
|
251
251
|
#
|
252
252
|
def classify( doc, cutoff=0.30, &block )
|
@@ -256,16 +256,16 @@ module Classifier
|
|
256
256
|
votes = {}
|
257
257
|
carry.each do |pair|
|
258
258
|
categories = @items[pair[0]].categories
|
259
|
-
categories.each do |category|
|
259
|
+
categories.each do |category|
|
260
260
|
votes[category] ||= 0.0
|
261
|
-
votes[category] += pair[1]
|
261
|
+
votes[category] += pair[1]
|
262
262
|
end
|
263
263
|
end
|
264
|
-
|
264
|
+
|
265
265
|
ranking = votes.keys.sort_by { |x| votes[x] }
|
266
266
|
return ranking[-1]
|
267
267
|
end
|
268
|
-
|
268
|
+
|
269
269
|
# Prototype, only works on indexed documents.
|
270
270
|
# I have no clue if this is going to work, but in theory
|
271
271
|
# it's supposed to.
|
@@ -289,8 +289,8 @@ module Classifier
|
|
289
289
|
# Reconstruct the term document matrix, only with reduced rank
|
290
290
|
u * ($GSL ? GSL::Matrix : ::Matrix).diag( s ) * v.trans
|
291
291
|
end
|
292
|
-
|
293
|
-
def node_for_content(item, &block)
|
292
|
+
|
293
|
+
def node_for_content(item, &block)
|
294
294
|
if @items[item]
|
295
295
|
return @items[item]
|
296
296
|
else
|
@@ -302,10 +302,10 @@ module Classifier
|
|
302
302
|
cn.raw_vector_with( @word_list ) # make the lsi raw and norm vectors
|
303
303
|
end
|
304
304
|
end
|
305
|
-
|
305
|
+
|
306
306
|
return cn
|
307
307
|
end
|
308
|
-
|
308
|
+
|
309
309
|
def make_word_list
|
310
310
|
@word_list = WordList.new
|
311
311
|
@items.each_value do |node|
|
@@ -4,14 +4,14 @@
|
|
4
4
|
|
5
5
|
module Classifier
|
6
6
|
|
7
|
-
# This is an internal data structure class for the LSI node. Save for
|
7
|
+
# This is an internal data structure class for the LSI node. Save for
|
8
8
|
# raw_vector_with, it should be fairly straightforward to understand.
|
9
9
|
# You should never have to use it directly.
|
10
10
|
class ContentNode
|
11
|
-
attr_accessor :raw_vector, :raw_norm,
|
11
|
+
attr_accessor :raw_vector, :raw_norm,
|
12
12
|
:lsi_vector, :lsi_norm,
|
13
|
-
:categories
|
14
|
-
|
13
|
+
:categories
|
14
|
+
|
15
15
|
attr_reader :word_hash
|
16
16
|
# If text_proc is not specified, the source will be duck-typed
|
17
17
|
# via source.to_s
|
@@ -19,17 +19,17 @@ module Classifier
|
|
19
19
|
@categories = categories || []
|
20
20
|
@word_hash = word_hash
|
21
21
|
end
|
22
|
-
|
22
|
+
|
23
23
|
# Use this to fetch the appropriate search vector.
|
24
24
|
def search_vector
|
25
25
|
@lsi_vector || @raw_vector
|
26
26
|
end
|
27
|
-
|
27
|
+
|
28
28
|
# Use this to fetch the appropriate search vector in normalized form.
|
29
29
|
def search_norm
|
30
30
|
@lsi_norm || @raw_norm
|
31
31
|
end
|
32
|
-
|
32
|
+
|
33
33
|
# Creates the raw vector out of word_hash using word_list as the
|
34
34
|
# key for mapping the vector space.
|
35
35
|
def raw_vector_with( word_list )
|
@@ -42,22 +42,22 @@ module Classifier
|
|
42
42
|
@word_hash.each_key do |word|
|
43
43
|
vec[word_list[word]] = @word_hash[word] if word_list[word]
|
44
44
|
end
|
45
|
-
|
45
|
+
|
46
46
|
# Perform the scaling transform
|
47
47
|
total_words = vec.sum
|
48
|
-
|
48
|
+
|
49
49
|
# Perform first-order association transform if this vector has more
|
50
|
-
# than one word in it.
|
51
|
-
if total_words > 1.0
|
50
|
+
# than one word in it.
|
51
|
+
if total_words > 1.0
|
52
52
|
weighted_total = 0.0
|
53
53
|
vec.each do |term|
|
54
54
|
if ( term > 0 )
|
55
55
|
weighted_total += (( term / total_words ) * Math.log( term / total_words ))
|
56
56
|
end
|
57
|
-
end
|
57
|
+
end
|
58
58
|
vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
|
59
59
|
end
|
60
|
-
|
60
|
+
|
61
61
|
if $GSL
|
62
62
|
@raw_norm = vec.normalize
|
63
63
|
@raw_vector = vec
|
@@ -65,8 +65,8 @@ module Classifier
|
|
65
65
|
@raw_norm = Vector[*vec].normalize
|
66
66
|
@raw_vector = Vector[*vec]
|
67
67
|
end
|
68
|
-
end
|
69
|
-
|
70
|
-
end
|
71
|
-
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
71
|
+
|
72
72
|
end
|
@@ -14,13 +14,13 @@ class String
|
|
14
14
|
def split_sentences
|
15
15
|
split /(\.|\!|\?)/ # TODO: make this less primitive
|
16
16
|
end
|
17
|
-
|
17
|
+
|
18
18
|
def split_paragraphs
|
19
19
|
split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
|
20
20
|
end
|
21
|
-
|
21
|
+
|
22
22
|
private
|
23
|
-
|
23
|
+
|
24
24
|
def perform_lsi(chunks, count, separator)
|
25
25
|
lsi = Classifier::LSI.new :auto_rebuild => false
|
26
26
|
chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
|
@@ -28,4 +28,4 @@ class String
|
|
28
28
|
summaries = lsi.highest_relative_content count
|
29
29
|
return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
|
30
30
|
end
|
31
|
-
end
|
31
|
+
end
|
@@ -2,35 +2,35 @@
|
|
2
2
|
# Copyright:: Copyright (c) 2005 David Fayram II
|
3
3
|
# License:: LGPL
|
4
4
|
|
5
|
-
module Classifier
|
5
|
+
module Classifier
|
6
6
|
# This class keeps a word => index mapping. It is used to map stemmed words
|
7
7
|
# to dimensions of a vector.
|
8
|
-
|
8
|
+
|
9
9
|
class WordList
|
10
10
|
def initialize
|
11
11
|
@location_table = Hash.new
|
12
12
|
end
|
13
|
-
|
13
|
+
|
14
14
|
# Adds a word (if it is new) and assigns it a unique dimension.
|
15
15
|
def add_word(word)
|
16
16
|
term = word
|
17
17
|
@location_table[term] = @location_table.size unless @location_table[term]
|
18
18
|
end
|
19
|
-
|
19
|
+
|
20
20
|
# Returns the dimension of the word or nil if the word is not in the space.
|
21
21
|
def [](lookup)
|
22
22
|
term = lookup
|
23
23
|
@location_table[term]
|
24
24
|
end
|
25
|
-
|
25
|
+
|
26
26
|
def word_for_index(ind)
|
27
27
|
@location_table.invert[ind]
|
28
28
|
end
|
29
|
-
|
29
|
+
|
30
30
|
# Returns the number of words mapped.
|
31
31
|
def size
|
32
32
|
@location_table.size
|
33
33
|
end
|
34
|
-
|
34
|
+
|
35
35
|
end
|
36
36
|
end
|
metadata
CHANGED
@@ -1,36 +1,94 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: classifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.4
|
4
|
+
version: 1.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lucas Carlson
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-04-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: fast-stemmer
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: 1.0.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 1.0.0
|
27
|
-
|
28
|
-
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: mathn
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rdoc
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description: A general classifier module to allow Bayesian and other types of classifications.
|
29
84
|
email: lucas@rufy.com
|
30
85
|
executables: []
|
31
86
|
extensions: []
|
32
87
|
extra_rdoc_files: []
|
33
88
|
files:
|
89
|
+
- LICENSE
|
90
|
+
- bin/bayes.rb
|
91
|
+
- bin/summarize.rb
|
34
92
|
- lib/classifier.rb
|
35
93
|
- lib/classifier/bayes.rb
|
36
94
|
- lib/classifier/extensions/string.rb
|
@@ -41,19 +99,9 @@ files:
|
|
41
99
|
- lib/classifier/lsi/content_node.rb
|
42
100
|
- lib/classifier/lsi/summary.rb
|
43
101
|
- lib/classifier/lsi/word_list.rb
|
44
|
-
|
45
|
-
|
46
|
-
- test/bayes/bayesian_test.rb
|
47
|
-
- test/extensions/word_hash_test.rb
|
48
|
-
- test/lsi/lsi_test.rb
|
49
|
-
- test/test_helper.rb
|
50
|
-
- Gemfile
|
51
|
-
- Gemfile.lock
|
52
|
-
- LICENSE
|
53
|
-
- README.markdown
|
54
|
-
- Rakefile
|
55
|
-
homepage: http://classifier.rufy.com/
|
56
|
-
licenses: []
|
102
|
+
homepage: https://github.com/cardmagic/classifier
|
103
|
+
licenses:
|
104
|
+
- LGPL
|
57
105
|
metadata: {}
|
58
106
|
post_install_message:
|
59
107
|
rdoc_options: []
|
@@ -61,18 +109,17 @@ require_paths:
|
|
61
109
|
- lib
|
62
110
|
required_ruby_version: !ruby/object:Gem::Requirement
|
63
111
|
requirements:
|
64
|
-
- -
|
112
|
+
- - ">="
|
65
113
|
- !ruby/object:Gem::Version
|
66
114
|
version: '0'
|
67
115
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
116
|
requirements:
|
69
|
-
- -
|
117
|
+
- - ">="
|
70
118
|
- !ruby/object:Gem::Version
|
71
119
|
version: '0'
|
72
|
-
requirements:
|
73
|
-
- A porter-stemmer module to split word stems.
|
120
|
+
requirements: []
|
74
121
|
rubyforge_project:
|
75
|
-
rubygems_version: 2.
|
122
|
+
rubygems_version: 2.7.6
|
76
123
|
signing_key:
|
77
124
|
specification_version: 4
|
78
125
|
summary: A general classifier module to allow Bayesian and other types of classifications.
|
data/Gemfile
DELETED
data/Gemfile.lock
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
GEM
|
2
|
-
remote: https://rubygems.org/
|
3
|
-
specs:
|
4
|
-
diff-lcs (1.2.5)
|
5
|
-
fast-stemmer (1.0.2)
|
6
|
-
json (1.8.1)
|
7
|
-
rake (10.1.1)
|
8
|
-
rdoc (4.1.0)
|
9
|
-
json (~> 1.4)
|
10
|
-
rspec (2.14.1)
|
11
|
-
rspec-core (~> 2.14.0)
|
12
|
-
rspec-expectations (~> 2.14.0)
|
13
|
-
rspec-mocks (~> 2.14.0)
|
14
|
-
rspec-core (2.14.7)
|
15
|
-
rspec-expectations (2.14.4)
|
16
|
-
diff-lcs (>= 1.1.3, < 2.0)
|
17
|
-
rspec-mocks (2.14.4)
|
18
|
-
|
19
|
-
PLATFORMS
|
20
|
-
ruby
|
21
|
-
|
22
|
-
DEPENDENCIES
|
23
|
-
fast-stemmer
|
24
|
-
rake
|
25
|
-
rdoc
|
26
|
-
rspec
|
data/README.markdown
DELETED
@@ -1,97 +0,0 @@
|
|
1
|
-
## Welcome to Classifier
|
2
|
-
|
3
|
-
Classifier is a general module to allow Bayesian and other types of classifications.
|
4
|
-
|
5
|
-
## Download
|
6
|
-
|
7
|
-
* https://github.com/cardmagic/classifier
|
8
|
-
* gem install classifier
|
9
|
-
* git clone https://github.com/cardmagic/classifier.git
|
10
|
-
|
11
|
-
## Dependencies
|
12
|
-
|
13
|
-
If you install Classifier from source, you'll need to install Roman Shterenzon's fast-stemmer gem with RubyGems as follows:
|
14
|
-
|
15
|
-
gem install fast-stemmer
|
16
|
-
|
17
|
-
If you would like to speed up LSI classification by at least 10x, please install the following libraries:
|
18
|
-
GNU GSL:: http://www.gnu.org/software/gsl
|
19
|
-
rb-gsl:: http://rb-gsl.rubyforge.org
|
20
|
-
|
21
|
-
Notice that LSI will work without these libraries, but as soon as they are installed, Classifier will make use of them. No configuration changes are needed, we like to keep things ridiculously easy for you.
|
22
|
-
|
23
|
-
## Bayes
|
24
|
-
|
25
|
-
A Bayesian classifier by Lucas Carlson. Bayesian Classifiers are accurate, fast, and have modest memory requirements.
|
26
|
-
|
27
|
-
### Usage
|
28
|
-
|
29
|
-
require 'classifier'
|
30
|
-
b = Classifier::Bayes.new 'Interesting', 'Uninteresting'
|
31
|
-
b.train_interesting "here are some good words. I hope you love them"
|
32
|
-
b.train_uninteresting "here are some bad words, I hate you"
|
33
|
-
b.classify "I hate bad words and you" # returns 'Uninteresting'
|
34
|
-
|
35
|
-
require 'madeleine'
|
36
|
-
m = SnapshotMadeleine.new("bayes_data") {
|
37
|
-
Classifier::Bayes.new 'Interesting', 'Uninteresting'
|
38
|
-
}
|
39
|
-
m.system.train_interesting "here are some good words. I hope you love them"
|
40
|
-
m.system.train_uninteresting "here are some bad words, I hate you"
|
41
|
-
m.take_snapshot
|
42
|
-
m.system.classify "I love you" # returns 'Interesting'
|
43
|
-
|
44
|
-
Using Madeleine, your application can persist the learned data over time.
|
45
|
-
|
46
|
-
### Bayesian Classification
|
47
|
-
|
48
|
-
* http://www.process.com/precisemail/bayesian_filtering.htm
|
49
|
-
* http://en.wikipedia.org/wiki/Bayesian_filtering
|
50
|
-
* http://www.paulgraham.com/spam.html
|
51
|
-
|
52
|
-
## LSI
|
53
|
-
|
54
|
-
A Latent Semantic Indexer by David Fayram. Latent Semantic Indexing engines
|
55
|
-
are not as fast or as small as Bayesian classifiers, but are more flexible, providing
|
56
|
-
fast search and clustering detection as well as semantic analysis of the text that
|
57
|
-
theoretically simulates human learning.
|
58
|
-
|
59
|
-
### Usage
|
60
|
-
|
61
|
-
require 'classifier'
|
62
|
-
lsi = Classifier::LSI.new
|
63
|
-
strings = [ ["This text deals with dogs. Dogs.", :dog],
|
64
|
-
["This text involves dogs too. Dogs! ", :dog],
|
65
|
-
["This text revolves around cats. Cats.", :cat],
|
66
|
-
["This text also involves cats. Cats!", :cat],
|
67
|
-
["This text involves birds. Birds.",:bird ]]
|
68
|
-
strings.each {|x| lsi.add_item x.first, x.last}
|
69
|
-
|
70
|
-
lsi.search("dog", 3)
|
71
|
-
# returns => ["This text deals with dogs. Dogs.", "This text involves dogs too. Dogs! ",
|
72
|
-
# "This text also involves cats. Cats!"]
|
73
|
-
|
74
|
-
lsi.find_related(strings[2], 2)
|
75
|
-
# returns => ["This text revolves around cats. Cats.", "This text also involves cats. Cats!"]
|
76
|
-
|
77
|
-
lsi.classify "This text is also about dogs!"
|
78
|
-
# returns => :dog
|
79
|
-
|
80
|
-
Please see the Classifier::LSI documentation for more information. It is possible to index, search and classify
|
81
|
-
with more than just simple strings.
|
82
|
-
|
83
|
-
### Latent Semantic Indexing
|
84
|
-
|
85
|
-
* http://www.c2.com/cgi/wiki?LatentSemanticIndexing
|
86
|
-
* http://www.chadfowler.com/index.cgi/Computing/LatentSemanticIndexing.rdoc
|
87
|
-
* http://en.wikipedia.org/wiki/Latent_semantic_analysis
|
88
|
-
|
89
|
-
## Authors
|
90
|
-
|
91
|
-
* Lucas Carlson (lucas@rufy.com)
|
92
|
-
* David Fayram II (dfayram@gmail.com)
|
93
|
-
* Cameron McBride (cameron.mcbride@gmail.com)
|
94
|
-
* Ivan Acosta-Rubio (ivan@softwarecriollo.com)
|
95
|
-
|
96
|
-
This library is released under the terms of the GNU LGPL. See LICENSE for more details.
|
97
|
-
|
data/Rakefile
DELETED
@@ -1,84 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'rake'
|
3
|
-
require 'rake/testtask'
|
4
|
-
require 'rdoc/task'
|
5
|
-
require 'rake/contrib/rubyforgepublisher'
|
6
|
-
|
7
|
-
desc "Default Task"
|
8
|
-
task :default => [ :test ]
|
9
|
-
|
10
|
-
# Run the unit tests
|
11
|
-
desc "Run all unit tests"
|
12
|
-
Rake::TestTask.new("test") { |t|
|
13
|
-
t.libs << "lib"
|
14
|
-
t.pattern = 'test/*/*_test.rb'
|
15
|
-
t.verbose = true
|
16
|
-
}
|
17
|
-
|
18
|
-
# Make a console, useful when working on tests
|
19
|
-
desc "Generate a test console"
|
20
|
-
task :console do
|
21
|
-
verbose( false ) { sh "irb -I lib/ -r 'classifier'" }
|
22
|
-
end
|
23
|
-
|
24
|
-
# Genereate the RDoc documentation
|
25
|
-
desc "Create documentation"
|
26
|
-
Rake::RDocTask.new("doc") { |rdoc|
|
27
|
-
rdoc.title = "Ruby Classifier - Bayesian and LSI classification library"
|
28
|
-
rdoc.rdoc_dir = 'html'
|
29
|
-
rdoc.rdoc_files.include('README')
|
30
|
-
rdoc.rdoc_files.include('lib/**/*.rb')
|
31
|
-
}
|
32
|
-
|
33
|
-
# Genereate the package
|
34
|
-
spec = Gem::Specification.new do |s|
|
35
|
-
|
36
|
-
#### Basic information.
|
37
|
-
|
38
|
-
s.name = 'classifier'
|
39
|
-
s.version = PKG_VERSION
|
40
|
-
s.summary = <<-EOF
|
41
|
-
A general classifier module to allow Bayesian and other types of classifications.
|
42
|
-
EOF
|
43
|
-
s.description = <<-EOF
|
44
|
-
A general classifier module to allow Bayesian and other types of classifications.
|
45
|
-
EOF
|
46
|
-
|
47
|
-
#### Which files are to be included in this gem? Everything! (Except CVS directories.)
|
48
|
-
|
49
|
-
s.files = PKG_FILES
|
50
|
-
|
51
|
-
#### Load-time details: library and application (you will need one or both).
|
52
|
-
|
53
|
-
s.require_path = 'lib'
|
54
|
-
s.autorequire = 'classifier'
|
55
|
-
|
56
|
-
#### Documentation and testing.
|
57
|
-
|
58
|
-
s.has_rdoc = true
|
59
|
-
|
60
|
-
#### Dependencies and requirements.
|
61
|
-
|
62
|
-
s.add_dependency('fast-stemmer', '>= 1.0.0')
|
63
|
-
s.requirements << "A porter-stemmer module to split word stems."
|
64
|
-
|
65
|
-
#### Author and project details.
|
66
|
-
s.author = "Lucas Carlson"
|
67
|
-
s.email = "lucas@rufy.com"
|
68
|
-
s.homepage = "http://classifier.rufy.com/"
|
69
|
-
end
|
70
|
-
|
71
|
-
desc "Report code statistics (KLOCs, etc) from the application"
|
72
|
-
task :stats do
|
73
|
-
require 'code_statistics'
|
74
|
-
CodeStatistics.new(
|
75
|
-
["Library", "lib"],
|
76
|
-
["Units", "test"]
|
77
|
-
).to_s
|
78
|
-
end
|
79
|
-
|
80
|
-
desc "Publish new documentation"
|
81
|
-
task :publish do
|
82
|
-
`ssh rufy update-classifier-doc`
|
83
|
-
Rake::RubyForgePublisher.new('classifier', 'cardmagic').upload
|
84
|
-
end
|
data/test/bayes/bayesian_test.rb
DELETED
@@ -1,33 +0,0 @@
|
|
1
|
-
require File.dirname(__FILE__) + '/../test_helper'
|
2
|
-
class BayesianTest < Test::Unit::TestCase
|
3
|
-
def setup
|
4
|
-
@classifier = Classifier::Bayes.new 'Interesting', 'Uninteresting'
|
5
|
-
end
|
6
|
-
|
7
|
-
def test_good_training
|
8
|
-
assert_nothing_raised { @classifier.train_interesting "love" }
|
9
|
-
end
|
10
|
-
|
11
|
-
def test_bad_training
|
12
|
-
assert_raise(StandardError) { @classifier.train_no_category "words" }
|
13
|
-
end
|
14
|
-
|
15
|
-
def test_bad_method
|
16
|
-
assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
|
17
|
-
end
|
18
|
-
|
19
|
-
def test_categories
|
20
|
-
assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
|
21
|
-
end
|
22
|
-
|
23
|
-
def test_add_category
|
24
|
-
@classifier.add_category 'Test'
|
25
|
-
assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
|
26
|
-
end
|
27
|
-
|
28
|
-
def test_classification
|
29
|
-
@classifier.train_interesting "here are some good words. I hope you love them"
|
30
|
-
@classifier.train_uninteresting "here are some bad words, I hate you"
|
31
|
-
assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
|
32
|
-
end
|
33
|
-
end
|
@@ -1,35 +0,0 @@
|
|
1
|
-
require File.dirname(__FILE__) + '/../test_helper'
|
2
|
-
class StringExtensionsTest < Test::Unit::TestCase
|
3
|
-
def test_word_hash
|
4
|
-
hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
|
5
|
-
assert_equal hash, "here are some good words of test's. I hope you love them!".word_hash
|
6
|
-
end
|
7
|
-
|
8
|
-
|
9
|
-
def test_clean_word_hash
|
10
|
-
hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
|
11
|
-
assert_equal hash, "here are some good words of test's. I hope you love them!".clean_word_hash
|
12
|
-
end
|
13
|
-
|
14
|
-
end
|
15
|
-
|
16
|
-
|
17
|
-
class ArrayExtensionsTest < Test::Unit::TestCase
|
18
|
-
|
19
|
-
def test_plays_nicely_with_any_array
|
20
|
-
assert_equal [Array].sum, Array
|
21
|
-
end
|
22
|
-
|
23
|
-
def test_monkey_path_array_sum
|
24
|
-
assert_equal [1,2,3].sum, 6
|
25
|
-
end
|
26
|
-
|
27
|
-
def test_summing_an_empty_array
|
28
|
-
assert_equal [nil].sum, 0
|
29
|
-
end
|
30
|
-
|
31
|
-
def test_summing_an_empty_array
|
32
|
-
assert_equal Array[].sum, 0
|
33
|
-
end
|
34
|
-
|
35
|
-
end
|
data/test/lsi/lsi_test.rb
DELETED
@@ -1,123 +0,0 @@
|
|
1
|
-
require File.dirname(__FILE__) + '/../test_helper'
|
2
|
-
class LSITest < Test::Unit::TestCase
|
3
|
-
def setup
|
4
|
-
# we repeat principle words to help weight them.
|
5
|
-
# This test is rather delicate, since this system is mostly noise.
|
6
|
-
@str1 = "This text deals with dogs. Dogs."
|
7
|
-
@str2 = "This text involves dogs too. Dogs! "
|
8
|
-
@str3 = "This text revolves around cats. Cats."
|
9
|
-
@str4 = "This text also involves cats. Cats!"
|
10
|
-
@str5 = "This text involves birds. Birds."
|
11
|
-
end
|
12
|
-
|
13
|
-
def test_basic_indexing
|
14
|
-
lsi = Classifier::LSI.new
|
15
|
-
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
16
|
-
assert ! lsi.needs_rebuild?
|
17
|
-
|
18
|
-
# note that the closest match to str1 is str2, even though it is not
|
19
|
-
# the closest text match.
|
20
|
-
assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
|
21
|
-
end
|
22
|
-
|
23
|
-
def test_not_auto_rebuild
|
24
|
-
lsi = Classifier::LSI.new :auto_rebuild => false
|
25
|
-
lsi.add_item @str1, "Dog"
|
26
|
-
lsi.add_item @str2, "Dog"
|
27
|
-
assert lsi.needs_rebuild?
|
28
|
-
lsi.build_index
|
29
|
-
assert ! lsi.needs_rebuild?
|
30
|
-
end
|
31
|
-
|
32
|
-
def test_basic_categorizing
|
33
|
-
lsi = Classifier::LSI.new
|
34
|
-
lsi.add_item @str2, "Dog"
|
35
|
-
lsi.add_item @str3, "Cat"
|
36
|
-
lsi.add_item @str4, "Cat"
|
37
|
-
lsi.add_item @str5, "Bird"
|
38
|
-
|
39
|
-
assert_equal "Dog", lsi.classify( @str1 )
|
40
|
-
assert_equal "Cat", lsi.classify( @str3 )
|
41
|
-
assert_equal "Bird", lsi.classify( @str5 )
|
42
|
-
end
|
43
|
-
|
44
|
-
def test_external_classifying
|
45
|
-
lsi = Classifier::LSI.new
|
46
|
-
bayes = Classifier::Bayes.new 'Dog', 'Cat', 'Bird'
|
47
|
-
lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
|
48
|
-
lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
|
49
|
-
lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
|
50
|
-
lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
|
51
|
-
lsi.add_item @str5, "Bird" ; bayes.train_bird @str5
|
52
|
-
|
53
|
-
# We're talking about dogs. Even though the text matches the corpus on
|
54
|
-
# cats better. Dogs have more semantic weight than cats. So bayes
|
55
|
-
# will fail here, but the LSI recognizes content.
|
56
|
-
tricky_case = "This text revolves around dogs."
|
57
|
-
assert_equal "Dog", lsi.classify( tricky_case )
|
58
|
-
assert_not_equal "Dog", bayes.classify( tricky_case )
|
59
|
-
end
|
60
|
-
|
61
|
-
def test_recategorize_interface
|
62
|
-
lsi = Classifier::LSI.new
|
63
|
-
lsi.add_item @str1, "Dog"
|
64
|
-
lsi.add_item @str2, "Dog"
|
65
|
-
lsi.add_item @str3, "Cat"
|
66
|
-
lsi.add_item @str4, "Cat"
|
67
|
-
lsi.add_item @str5, "Bird"
|
68
|
-
|
69
|
-
tricky_case = "This text revolves around dogs."
|
70
|
-
assert_equal "Dog", lsi.classify( tricky_case )
|
71
|
-
|
72
|
-
# Recategorize as needed.
|
73
|
-
lsi.categories_for(@str1).clear.push "Cow"
|
74
|
-
lsi.categories_for(@str2).clear.push "Cow"
|
75
|
-
|
76
|
-
assert !lsi.needs_rebuild?
|
77
|
-
assert_equal "Cow", lsi.classify( tricky_case )
|
78
|
-
end
|
79
|
-
|
80
|
-
def test_search
|
81
|
-
lsi = Classifier::LSI.new
|
82
|
-
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
83
|
-
|
84
|
-
# Searching by content and text, note that @str2 comes up first, because
|
85
|
-
# both "dog" and "involve" are present. But, the next match is @str1 instead
|
86
|
-
# of @str4, because "dog" carries more weight than involves.
|
87
|
-
assert_equal( [@str2, @str1, @str4, @str5, @str3],
|
88
|
-
lsi.search("dog involves", 100) )
|
89
|
-
|
90
|
-
# Keyword search shows how the space is mapped out in relation to
|
91
|
-
# dog when magnitude is remove. Note the relations. We move from dog
|
92
|
-
# through involve and then finally to other words.
|
93
|
-
assert_equal( [@str1, @str2, @str4, @str5, @str3],
|
94
|
-
lsi.search("dog", 5) )
|
95
|
-
end
|
96
|
-
|
97
|
-
def test_serialize_safe
|
98
|
-
lsi = Classifier::LSI.new
|
99
|
-
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
|
100
|
-
|
101
|
-
lsi_md = Marshal.dump lsi
|
102
|
-
lsi_m = Marshal.load lsi_md
|
103
|
-
|
104
|
-
assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
|
105
|
-
assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
|
106
|
-
end
|
107
|
-
|
108
|
-
def test_keyword_search
|
109
|
-
lsi = Classifier::LSI.new
|
110
|
-
lsi.add_item @str1, "Dog"
|
111
|
-
lsi.add_item @str2, "Dog"
|
112
|
-
lsi.add_item @str3, "Cat"
|
113
|
-
lsi.add_item @str4, "Cat"
|
114
|
-
lsi.add_item @str5, "Bird"
|
115
|
-
|
116
|
-
assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
|
117
|
-
end
|
118
|
-
|
119
|
-
def test_summary
|
120
|
-
assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
|
121
|
-
end
|
122
|
-
|
123
|
-
end
|
data/test/test_helper.rb
DELETED