classifier 1.3.5 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/classifier/bayes.rb +128 -120
- data/lib/classifier/extensions/string.rb +1 -1
- data/lib/classifier/extensions/vector.rb +66 -72
- data/lib/classifier/extensions/vector_serialize.rb +6 -8
- data/lib/classifier/extensions/word_hash.rb +108 -114
- data/lib/classifier/lsi/content_node.rb +25 -23
- data/lib/classifier/lsi/summary.rb +20 -20
- data/lib/classifier/lsi/word_list.rb +1 -2
- data/lib/classifier/lsi.rb +112 -89
- data/lib/classifier.rb +1 -0
- data/test/test_helper.rb +5 -0
- metadata +7 -21
data/lib/classifier/lsi.rb
CHANGED
@@ -3,15 +3,16 @@
|
|
3
3
|
# License:: LGPL
|
4
4
|
|
5
5
|
begin
|
6
|
-
|
7
|
-
|
8
|
-
require 'gsl' # requires http://rb-gsl.rubyforge.org/
|
9
|
-
require 'classifier/extensions/vector_serialize'
|
10
|
-
$GSL = true
|
6
|
+
# to test the native vector class, try `rake test NATIVE_VECTOR=true`
|
7
|
+
raise LoadError if ENV['NATIVE_VECTOR'] == 'true'
|
11
8
|
|
9
|
+
require 'gsl' # requires https://github.com/SciRuby/rb-gsl/
|
10
|
+
require 'classifier/extensions/vector_serialize'
|
11
|
+
$GSL = true
|
12
12
|
rescue LoadError
|
13
|
-
|
14
|
-
|
13
|
+
warn 'Notice: for 10x faster LSI support, please install https://github.com/SciRuby/rb-gsl/'
|
14
|
+
$GSL = false
|
15
|
+
require 'classifier/extensions/vector'
|
15
16
|
end
|
16
17
|
|
17
18
|
require 'classifier/lsi/word_list'
|
@@ -19,12 +20,10 @@ require 'classifier/lsi/content_node'
|
|
19
20
|
require 'classifier/lsi/summary'
|
20
21
|
|
21
22
|
module Classifier
|
22
|
-
|
23
23
|
# This class implements a Latent Semantic Indexer, which can search, classify and cluster
|
24
24
|
# data based on underlying semantic relations. For more information on the algorithms used,
|
25
25
|
# please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
|
26
26
|
class LSI
|
27
|
-
|
28
27
|
attr_reader :word_list
|
29
28
|
attr_accessor :auto_rebuild
|
30
29
|
|
@@ -34,8 +33,10 @@ module Classifier
|
|
34
33
|
#
|
35
34
|
def initialize(options = {})
|
36
35
|
@auto_rebuild = true unless options[:auto_rebuild] == false
|
37
|
-
@word_list
|
38
|
-
@
|
36
|
+
@word_list = WordList.new
|
37
|
+
@items = {}
|
38
|
+
@version = 0
|
39
|
+
@built_at_version = -1
|
39
40
|
end
|
40
41
|
|
41
42
|
# Returns true if the index needs to be rebuilt. The index needs
|
@@ -58,7 +59,7 @@ module Classifier
|
|
58
59
|
# ar = ActiveRecordObject.find( :all )
|
59
60
|
# lsi.add_item ar, *ar.categories { |x| ar.content }
|
60
61
|
#
|
61
|
-
def add_item(
|
62
|
+
def add_item(item, *categories, &block)
|
62
63
|
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
|
63
64
|
@items[item] = ContentNode.new(clean_word_hash, *categories)
|
64
65
|
@version += 1
|
@@ -69,24 +70,25 @@ module Classifier
|
|
69
70
|
# you are passing in a string with no categories. item
|
70
71
|
# will be duck typed via to_s .
|
71
72
|
#
|
72
|
-
def <<(
|
73
|
-
add_item
|
73
|
+
def <<(item)
|
74
|
+
add_item(item)
|
74
75
|
end
|
75
76
|
|
76
77
|
# Returns the categories for a given indexed item. You are free to add and remove
|
77
78
|
# items from this as you see fit. It does not invalidate an index to change its categories.
|
78
79
|
def categories_for(item)
|
79
80
|
return [] unless @items[item]
|
80
|
-
|
81
|
+
|
82
|
+
@items[item].categories
|
81
83
|
end
|
82
84
|
|
83
85
|
# Removes an item from the database, if it is indexed.
|
84
86
|
#
|
85
|
-
def remove_item(
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
87
|
+
def remove_item(item)
|
88
|
+
return unless @items.key?(item)
|
89
|
+
|
90
|
+
@items.delete(item)
|
91
|
+
@version += 1
|
90
92
|
end
|
91
93
|
|
92
94
|
# Returns an array of items that are indexed.
|
@@ -94,13 +96,6 @@ module Classifier
|
|
94
96
|
@items.keys
|
95
97
|
end
|
96
98
|
|
97
|
-
# Returns the categories for a given indexed items. You are free to add and remove
|
98
|
-
# items from this as you see fit. It does not invalide an index to change its categories.
|
99
|
-
def categories_for(item)
|
100
|
-
return [] unless @items[item]
|
101
|
-
return @items[item].categories
|
102
|
-
end
|
103
|
-
|
104
99
|
# This function rebuilds the index if needs_rebuild? returns true.
|
105
100
|
# For very large document spaces, this indexing operation may take some
|
106
101
|
# time to complete, so it may be wise to place the operation in another
|
@@ -115,30 +110,31 @@ module Classifier
|
|
115
110
|
# cutoff parameter tells the indexer how many of these values to keep.
|
116
111
|
# A value of 1 for cutoff means that no semantic analysis will take place,
|
117
112
|
# turning the LSI class into a simple vector search engine.
|
118
|
-
def build_index(
|
113
|
+
def build_index(cutoff = 0.75)
|
119
114
|
return unless needs_rebuild?
|
115
|
+
|
120
116
|
make_word_list
|
121
117
|
|
122
118
|
doc_list = @items.values
|
123
|
-
tda = doc_list.collect { |node| node.raw_vector_with(
|
119
|
+
tda = doc_list.collect { |node| node.raw_vector_with(@word_list) }
|
124
120
|
|
125
121
|
if $GSL
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
122
|
+
tdm = GSL::Matrix.alloc(*tda).trans
|
123
|
+
ntdm = build_reduced_matrix(tdm, cutoff)
|
124
|
+
|
125
|
+
ntdm.size[1].times do |col|
|
126
|
+
vec = GSL::Vector.alloc(ntdm.column(col)).row
|
127
|
+
doc_list[col].lsi_vector = vec
|
128
|
+
doc_list[col].lsi_norm = vec.normalize
|
129
|
+
end
|
134
130
|
else
|
135
|
-
|
136
|
-
|
131
|
+
tdm = Matrix.rows(tda).trans
|
132
|
+
ntdm = build_reduced_matrix(tdm, cutoff)
|
137
133
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
134
|
+
ntdm.row_size.times do |col|
|
135
|
+
doc_list[col].lsi_vector = ntdm.column(col) if doc_list[col]
|
136
|
+
doc_list[col].lsi_norm = ntdm.column(col).normalize if doc_list[col]
|
137
|
+
end
|
142
138
|
end
|
143
139
|
|
144
140
|
@built_at_version = @version
|
@@ -152,13 +148,13 @@ module Classifier
|
|
152
148
|
# your dataset's general content. For example, if you were to use categorize on the
|
153
149
|
# results of this data, you could gather information on what your dataset is generally
|
154
150
|
# about.
|
155
|
-
def highest_relative_content(
|
156
|
-
|
151
|
+
def highest_relative_content(max_chunks = 10)
|
152
|
+
return [] if needs_rebuild?
|
157
153
|
|
158
|
-
|
159
|
-
|
154
|
+
avg_density = {}
|
155
|
+
@items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x, y| x + y[1] } }
|
160
156
|
|
161
|
-
|
157
|
+
avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks - 1].map
|
162
158
|
end
|
163
159
|
|
164
160
|
# This function is the primitive that find_related and classify
|
@@ -173,17 +169,17 @@ module Classifier
|
|
173
169
|
# The parameter doc is the content to compare. If that content is not
|
174
170
|
# indexed, you can pass an optional block to define how to create the
|
175
171
|
# text data. See add_item for examples of how this works.
|
176
|
-
def proximity_array_for_content(
|
172
|
+
def proximity_array_for_content(doc, &block)
|
177
173
|
return [] if needs_rebuild?
|
178
174
|
|
179
|
-
content_node = node_for_content(
|
175
|
+
content_node = node_for_content(doc, &block)
|
180
176
|
result =
|
181
177
|
@items.keys.collect do |item|
|
182
|
-
if $GSL
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
178
|
+
val = if $GSL
|
179
|
+
content_node.search_vector * @items[item].search_vector.col
|
180
|
+
else
|
181
|
+
(Matrix[content_node.search_vector] * @items[item].search_vector)[0]
|
182
|
+
end
|
187
183
|
[item, val]
|
188
184
|
end
|
189
185
|
result.sort_by { |x| x[1] }.reverse
|
@@ -194,17 +190,17 @@ module Classifier
|
|
194
190
|
# calculated vectors instead of their full versions. This is useful when
|
195
191
|
# you're trying to perform operations on content that is much smaller than
|
196
192
|
# the text you're working with. search uses this primitive.
|
197
|
-
def proximity_norms_for_content(
|
193
|
+
def proximity_norms_for_content(doc, &block)
|
198
194
|
return [] if needs_rebuild?
|
199
195
|
|
200
|
-
content_node = node_for_content(
|
196
|
+
content_node = node_for_content(doc, &block)
|
201
197
|
result =
|
202
198
|
@items.keys.collect do |item|
|
203
|
-
if $GSL
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
199
|
+
val = if $GSL
|
200
|
+
content_node.search_norm * @items[item].search_norm.col
|
201
|
+
else
|
202
|
+
(Matrix[content_node.search_norm] * @items[item].search_norm)[0]
|
203
|
+
end
|
208
204
|
[item, val]
|
209
205
|
end
|
210
206
|
result.sort_by { |x| x[1] }.reverse
|
@@ -217,11 +213,12 @@ module Classifier
|
|
217
213
|
#
|
218
214
|
# While this may seem backwards compared to the other functions that LSI supports,
|
219
215
|
# it is actually the same algorithm, just applied on a smaller document.
|
220
|
-
def search(
|
216
|
+
def search(string, max_nearest = 3)
|
221
217
|
return [] if needs_rebuild?
|
222
|
-
|
218
|
+
|
219
|
+
carry = proximity_norms_for_content(string)
|
223
220
|
result = carry.collect { |x| x[0] }
|
224
|
-
|
221
|
+
result[0..max_nearest - 1]
|
225
222
|
end
|
226
223
|
|
227
224
|
# This function takes content and finds other documents
|
@@ -233,11 +230,11 @@ module Classifier
|
|
233
230
|
# This is particularly useful for identifying clusters in your document space.
|
234
231
|
# For example you may want to identify several "What's Related" items for weblog
|
235
232
|
# articles, or find paragraphs that relate to each other in an essay.
|
236
|
-
def find_related(
|
233
|
+
def find_related(doc, max_nearest = 3, &block)
|
237
234
|
carry =
|
238
|
-
proximity_array_for_content(
|
235
|
+
proximity_array_for_content(doc, &block).reject { |pair| pair[0] == doc }
|
239
236
|
result = carry.collect { |x| x[0] }
|
240
|
-
|
237
|
+
result[0..max_nearest - 1]
|
241
238
|
end
|
242
239
|
|
243
240
|
# This function uses a voting system to categorize documents, based on
|
@@ -249,10 +246,17 @@ module Classifier
|
|
249
246
|
# text. A cutoff of 1 means that every document in the index votes on
|
250
247
|
# what category the document is in. This may not always make sense.
|
251
248
|
#
|
252
|
-
def classify(
|
249
|
+
def classify(doc, cutoff = 0.30, &block)
|
250
|
+
votes = vote(doc, cutoff, &block)
|
251
|
+
|
252
|
+
ranking = votes.keys.sort_by { |x| votes[x] }
|
253
|
+
ranking[-1]
|
254
|
+
end
|
255
|
+
|
256
|
+
def vote(doc, cutoff = 0.30, &block)
|
253
257
|
icutoff = (@items.size * cutoff).round
|
254
|
-
carry = proximity_array_for_content(
|
255
|
-
carry = carry[0..icutoff-1]
|
258
|
+
carry = proximity_array_for_content(doc, &block)
|
259
|
+
carry = carry[0..icutoff - 1]
|
256
260
|
votes = {}
|
257
261
|
carry.each do |pair|
|
258
262
|
categories = @items[pair[0]].categories
|
@@ -261,23 +265,46 @@ module Classifier
|
|
261
265
|
votes[category] += pair[1]
|
262
266
|
end
|
263
267
|
end
|
268
|
+
votes
|
269
|
+
end
|
270
|
+
|
271
|
+
# Returns the same category as classify() but also returns
|
272
|
+
# a confidence value derived from the vote share that the
|
273
|
+
# winning category got.
|
274
|
+
#
|
275
|
+
# e.g.
|
276
|
+
# category,confidence = classify_with_confidence(doc)
|
277
|
+
# if confidence < 0.3
|
278
|
+
# category = nil
|
279
|
+
# end
|
280
|
+
#
|
281
|
+
#
|
282
|
+
# See classify() for argument docs
|
283
|
+
def classify_with_confidence(doc, cutoff = 0.30, &block)
|
284
|
+
votes = vote(doc, cutoff, &block)
|
285
|
+
votes_sum = votes.values.inject(0.0) { |sum, v| sum + v }
|
286
|
+
return [nil, nil] if votes_sum.zero?
|
264
287
|
|
265
288
|
ranking = votes.keys.sort_by { |x| votes[x] }
|
266
|
-
|
289
|
+
winner = ranking[-1]
|
290
|
+
vote_share = votes[winner] / votes_sum.to_f
|
291
|
+
[winner, vote_share]
|
267
292
|
end
|
268
293
|
|
269
294
|
# Prototype, only works on indexed documents.
|
270
295
|
# I have no clue if this is going to work, but in theory
|
271
296
|
# it's supposed to.
|
272
|
-
def highest_ranked_stems(
|
273
|
-
raise
|
297
|
+
def highest_ranked_stems(doc, count = 3)
|
298
|
+
raise 'Requested stem ranking on non-indexed content!' unless @items[doc]
|
299
|
+
|
274
300
|
arr = node_for_content(doc).lsi_vector.to_a
|
275
|
-
top_n = arr.sort.reverse[0..count-1]
|
276
|
-
|
301
|
+
top_n = arr.sort.reverse[0..count - 1]
|
302
|
+
top_n.collect { |x| @word_list.word_for_index(arr.index(x)) }
|
277
303
|
end
|
278
304
|
|
279
305
|
private
|
280
|
-
|
306
|
+
|
307
|
+
def build_reduced_matrix(matrix, cutoff = 0.75)
|
281
308
|
# TODO: Check that M>=N on these dimensions! Transpose helps assure this
|
282
309
|
u, v, s = matrix.SV_decomp
|
283
310
|
|
@@ -287,23 +314,21 @@ module Classifier
|
|
287
314
|
s[ord] = 0.0 if s[ord] < s_cutoff
|
288
315
|
end
|
289
316
|
# Reconstruct the term document matrix, only with reduced rank
|
290
|
-
u * ($GSL ? GSL::Matrix : ::Matrix).diag(
|
317
|
+
u * ($GSL ? GSL::Matrix : ::Matrix).diag(s) * v.trans
|
291
318
|
end
|
292
319
|
|
293
320
|
def node_for_content(item, &block)
|
294
|
-
if @items[item]
|
295
|
-
return @items[item]
|
296
|
-
else
|
297
|
-
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
|
321
|
+
return @items[item] if @items[item]
|
298
322
|
|
299
|
-
|
323
|
+
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
|
300
324
|
|
301
|
-
|
302
|
-
|
303
|
-
|
325
|
+
cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
|
326
|
+
|
327
|
+
unless needs_rebuild?
|
328
|
+
cn.raw_vector_with(@word_list) # make the lsi raw and norm vectors
|
304
329
|
end
|
305
330
|
|
306
|
-
|
331
|
+
cn
|
307
332
|
end
|
308
333
|
|
309
334
|
def make_word_list
|
@@ -312,7 +337,5 @@ module Classifier
|
|
312
337
|
node.word_hash.each_key { |key| @word_list.add_word key }
|
313
338
|
end
|
314
339
|
end
|
315
|
-
|
316
340
|
end
|
317
341
|
end
|
318
|
-
|
data/lib/classifier.rb
CHANGED
data/test/test_helper.rb
ADDED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: classifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lucas Carlson
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-07-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: fast-stemmer
|
@@ -24,20 +24,6 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 1.0.0
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: mathn
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - ">="
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
34
|
-
type: :runtime
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - ">="
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '0'
|
41
27
|
- !ruby/object:Gem::Dependency
|
42
28
|
name: rake
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -99,11 +85,12 @@ files:
|
|
99
85
|
- lib/classifier/lsi/content_node.rb
|
100
86
|
- lib/classifier/lsi/summary.rb
|
101
87
|
- lib/classifier/lsi/word_list.rb
|
88
|
+
- test/test_helper.rb
|
102
89
|
homepage: https://github.com/cardmagic/classifier
|
103
90
|
licenses:
|
104
91
|
- LGPL
|
105
92
|
metadata: {}
|
106
|
-
post_install_message:
|
93
|
+
post_install_message:
|
107
94
|
rdoc_options: []
|
108
95
|
require_paths:
|
109
96
|
- lib
|
@@ -118,9 +105,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
118
105
|
- !ruby/object:Gem::Version
|
119
106
|
version: '0'
|
120
107
|
requirements: []
|
121
|
-
|
122
|
-
|
123
|
-
signing_key:
|
108
|
+
rubygems_version: 3.5.9
|
109
|
+
signing_key:
|
124
110
|
specification_version: 4
|
125
111
|
summary: A general classifier module to allow Bayesian and other types of classifications.
|
126
112
|
test_files: []
|