josephwilk-semantic 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,35 @@
1
module Semantic
  # Entry point for querying a set of documents.
  #
  # On construction the documents are turned into a vector space model by
  # VectorSpace::Builder and then passed through MatrixTransformer. Each
  # column of the resulting model is treated as one document.
  class Search

    # documents - the collection of documents handed to
    #             VectorSpace::Builder#build_document_matrix.
    # options   - Hash forwarded to the builder and the matrix transformer.
    #             When options[:verbose] is truthy the Semantic logger level
    #             is set to Logger::INFO.
    def initialize(documents, options={})
      Semantic.logger.level = Logger::INFO if options[:verbose]

      @builder = VectorSpace::Builder.new(options)
      @matrix_transformer = MatrixTransformer.new(options)

      @vector_space_model = @builder.build_document_matrix(documents)

      # Log the model as built, before any transform is applied.
      Semantic.logger.info(@vector_space_model)

      @vector_space_model = @matrix_transformer.apply_transforms(@vector_space_model)
    end

    # Rates every document in the model against the document in column
    # documentId (the document itself is included in the result).
    #
    # Returns an Array with one Compare.similarity rating per column, in
    # column order.
    def related(documentId)
      (0...@vector_space_model.ncol).map do |index|
        Compare.similarity(@vector_space_model.column(documentId), @vector_space_model.column(index))
      end
    end

    # Rates every document in the model against a query.
    #
    # searchList - the query terms, handed to Builder#build_query_vector.
    #
    # Returns an Array with one Compare.similarity rating per column, in
    # column order.
    def search(searchList)
      query_vector = @builder.build_query_vector(searchList)
      (0...@vector_space_model.ncol).map do |index|
        Compare.similarity(query_vector, @vector_space_model.column(index))
      end
    end

  end
end
@@ -0,0 +1 @@
1
+ %w{tf_idf lsa}.each{|f| require "semantic/transform/#{f}_transform.rb"}
@@ -0,0 +1,34 @@
1
module Semantic
  module Transform
    # Dimension reduction of a matrix via singular value decomposition.
    class LSA

      class << self

        # Calculate SVD of objects matrix: U . SIGMA . VT = MATRIX
        # Reduce the dimension of sigma by specified factor producing sigma'.
        # Then dot product the matrices: U . SIGMA' . VT = MATRIX'
        #
        # matrix     - object responding to num_columns and
        #              singular_value_decomposition (returning u, sigma, vt).
        # dimensions - how many trailing diagonal entries of sigma to zero
        #              (default 1).
        #
        # Returns MATRIX' (the product u * sigma' * vt).
        # Raises ArgumentError when dimensions is greater than the number of
        # columns of matrix.
        def transform(matrix, dimensions=1)
          columns = matrix.num_columns

          unless dimensions <= columns
            raise ArgumentError, "dimension reduction cannot be greater than %s" % columns
          end

          u, sigma, vt = matrix.singular_value_decomposition

          # Dimension reduction, build SIGMA'. The diagonal is only as long as
          # the shorter side of sigma, so index from there instead of from the
          # column count; otherwise a matrix with fewer rows than columns
          # would be indexed out of bounds.
          diagonal_length = sigma.dimensions.min
          first_dropped = [diagonal_length - dimensions, 0].max
          (first_dropped...diagonal_length).each do |index|
            sigma[index, index] = 0
          end

          # Reconstruct MATRIX'
          u * sigma * vt
        end
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
module Semantic
  module Transform
    # Term frequency / inverse document frequency weighting.
    # Rows of the matrix are terms, columns are documents.
    class TFIDF

      # Re-weights every non-zero entry of matrix in place as
      #
      #   (entry / sum of the entry's column) *
      #     log(|number of columns / number of columns containing the term|)
      #
      # matrix - object responding to num_columns, columns, dimensions,
      #          [] and []=.
      #
      # Returns the same matrix object, mutated.
      def self.transform(matrix)
        number_of_documents = matrix.num_columns

        # Per-call memo of "documents containing term", keyed by row index.
        # Kept local so counts can never leak from one matrix to another.
        document_frequencies = {}

        matrix.columns.each_with_index do |document, column_index|
          document_term_total = document.rows.inject(0.0) { |word_sum, word_count| word_sum + word_count.to_f }

          document.rows.each_with_index do |term_weight, row_index|
            next if term_weight.to_f == 0.0

            # Counted the first time a non-zero entry of this row is met, i.e.
            # before any entry of the row has been overwritten with a weight.
            document_frequencies[row_index] ||= number_of_documents_with_term(row_index, matrix)

            inverse_document_frequency =
              Math.log((number_of_documents / document_frequencies[row_index].to_f).abs)
            matrix[row_index, column_index] = (term_weight / document_term_total) * inverse_document_frequency
          end
        end
        matrix
      end

      # Counts the columns of matrix whose entry in row_index is greater
      # than zero, i.e. the documents in which the term appears.
      #
      # Returns an Integer computed from the given matrix on every call.
      def self.number_of_documents_with_term(row_index, matrix)
        _rows, cols = matrix.dimensions
        (0...cols).count { |n| matrix[row_index, n] > 0 }
      end

    end
  end
end
@@ -0,0 +1 @@
1
+ %w{model builder}.each{|f| require "semantic/vector_space/#{f}"}
@@ -0,0 +1,69 @@
1
module Semantic
  module VectorSpace
    # An algebraic model for representing text documents as vectors of identifiers.
    # A document is represented as a vector. Each dimension of the vector corresponds to a
    # separate term. If a term occurs in the document, then the value in the vector is non-zero.
    class Builder

      # options - Hash stored on the builder (not read anywhere in this class).
      def initialize(options={})
        @parser = Parser.new
        @options = options
        @parsed_document_cache = []
      end

      # Builds the term/document matrix for documents: one column per
      # document, one row per vocabulary word, each cell counting how often
      # the word occurs in the parsed document.
      #
      # Returns a Model wrapping the matrix together with the
      # word => row-position index.
      def build_document_matrix(documents)
        @vector_keyword_index = build_vector_keyword_index(documents)

        document_vectors = documents.each_with_index.map do |document, document_id|
          build_vector(document, document_id)
        end
        document_matrix = Linalg::DMatrix.join_columns(document_vectors)

        Model.new(document_matrix, @vector_keyword_index)
      end

      # Builds a single-column vector for a query given as a list of terms,
      # using the vocabulary of the most recently built document matrix.
      def build_query_vector(term_list)
        build_vector(term_list.join(" "))
      end

      private

      # Parses the documents and maps every distinct word to a row position.
      def build_vector_keyword_index(documents)
        parse_and_cache(documents)
        vocabulary_list = find_unique_vocabulary
        map_vocabulary_to_vector_positions(vocabulary_list)
      end

      # Tokenises each document once and remembers the result by document
      # position. The cache is reset first so that words from a previous,
      # longer document set cannot leak into this vocabulary.
      def parse_and_cache(documents)
        @parsed_document_cache = []
        documents.each_with_index do |document, index|
          @parsed_document_cache[index] = @parser.tokenise_and_filter(document)
        end
      end

      # Distinct words over all cached documents. Words of later documents
      # come first; this keeps the row order the index has always had.
      def find_unique_vocabulary
        @parsed_document_cache.reverse.flatten(1).uniq
      end

      # Returns a Hash of word => position, numbered from 0 in list order.
      def map_vocabulary_to_vector_positions(vocabulary_list)
        vector_index = {}
        vocabulary_list.each_with_index do |word, position|
          vector_index[word] = position
        end
        vector_index
      end

      # Builds a (vocabulary size x 1) count vector.
      #
      # word_string - text to tokenise; only used when document_id is nil.
      # document_id - position of an already parsed document in the cache.
      #
      # Words that are not in the vocabulary are ignored.
      def build_vector(word_string, document_id=nil)
        word_list = if document_id.nil?
          @parser.tokenise_and_filter(word_string)
        else
          @parsed_document_cache[document_id]
        end

        vector = Linalg::DMatrix.new(@vector_keyword_index.length, 1)
        word_list.each do |word|
          vector[@vector_keyword_index[word], 0] += 1 if @vector_keyword_index.has_key?(word)
        end
        vector
      end

    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'linalg'
2
+ require 'delegate'
3
+
4
module Semantic
  module VectorSpace

    # A Linalg::DMatrix together with the word => row-position index it was
    # built from. Every method not defined here is delegated to the wrapped
    # matrix.
    class Model < DelegateClass(::Linalg::DMatrix)

      # matrix   - the matrix to wrap.
      # keywords - Hash of word => row position; nil is treated as empty.
      def initialize(matrix, keywords)
        @keywords = keywords || {}
        super(matrix)
      end

      # Replaces the wrapped matrix. Goes through the delegate API rather
      # than the delegate's private instance variable, whose name is an
      # implementation detail of the delegate library.
      def matrix=(matrix)
        __setobj__(matrix)
      end

      # Returns the wrapped matrix.
      def matrix
        __getobj__
      end

      # Renders the matrix as text: a header line naming the documents
      # (D1, D2, ...) followed by one line per row, prefixed with the row's
      # keyword when the index has one.
      def to_s
        out = " " * 9

        matrix.ncol.times do |id|
          out << " D#{id+1} "
        end
        out << "\n"

        matrix.rows.each_with_index do |terms, index|
          out << @keywords.key(index).ljust(6) if @keywords.has_value?(index)
          out << "[ "
          terms.columns.each do |document|
            out << ("%+0.2f " % document)
          end
          out << "]\n"
        end
        out
      end

    end
  end
end
@@ -0,0 +1,9 @@
1
module Semantic #:nodoc:
  class VERSION #:nodoc:
    MAJOR = 0
    MINOR = 1
    TINY  = 0

    # Dotted "MAJOR.MINOR.TINY" form; frozen so the shared constant cannot
    # be mutated in place by a caller.
    STRING = [MAJOR, MINOR, TINY].join('.').freeze
  end
end
@@ -0,0 +1,571 @@
1
+ a
2
+ a's
3
+ able
4
+ about
5
+ above
6
+ according
7
+ accordingly
8
+ across
9
+ actually
10
+ after
11
+ afterwards
12
+ again
13
+ against
14
+ ain't
15
+ all
16
+ allow
17
+ allows
18
+ almost
19
+ alone
20
+ along
21
+ already
22
+ also
23
+ although
24
+ always
25
+ am
26
+ among
27
+ amongst
28
+ an
29
+ and
30
+ another
31
+ any
32
+ anybody
33
+ anyhow
34
+ anyone
35
+ anything
36
+ anyway
37
+ anyways
38
+ anywhere
39
+ apart
40
+ appear
41
+ appreciate
42
+ appropriate
43
+ are
44
+ aren't
45
+ around
46
+ as
47
+ aside
48
+ ask
49
+ asking
50
+ associated
51
+ at
52
+ available
53
+ away
54
+ awfully
55
+ b
56
+ be
57
+ became
58
+ because
59
+ become
60
+ becomes
61
+ becoming
62
+ been
63
+ before
64
+ beforehand
65
+ behind
66
+ being
67
+ believe
68
+ below
69
+ beside
70
+ besides
71
+ best
72
+ better
73
+ between
74
+ beyond
75
+ both
76
+ brief
77
+ but
78
+ by
79
+ c
80
+ c'mon
81
+ c's
82
+ came
83
+ can
84
+ can't
85
+ cannot
86
+ cant
87
+ cause
88
+ causes
89
+ certain
90
+ certainly
91
+ changes
92
+ clearly
93
+ co
94
+ com
95
+ come
96
+ comes
97
+ concerning
98
+ consequently
99
+ consider
100
+ considering
101
+ contain
102
+ containing
103
+ contains
104
+ corresponding
105
+ could
106
+ couldn't
107
+ course
108
+ currently
109
+ d
110
+ definitely
111
+ described
112
+ despite
113
+ did
114
+ didn't
115
+ different
116
+ do
117
+ does
118
+ doesn't
119
+ doing
120
+ don't
121
+ done
122
+ down
123
+ downwards
124
+ during
125
+ e
126
+ each
127
+ edu
128
+ eg
129
+ eight
130
+ either
131
+ else
132
+ elsewhere
133
+ enough
134
+ entirely
135
+ especially
136
+ et
137
+ etc
138
+ even
139
+ ever
140
+ every
141
+ everybody
142
+ everyone
143
+ everything
144
+ everywhere
145
+ ex
146
+ exactly
147
+ example
148
+ except
149
+ f
150
+ far
151
+ few
152
+ fifth
153
+ first
154
+ five
155
+ followed
156
+ following
157
+ follows
158
+ for
159
+ former
160
+ formerly
161
+ forth
162
+ four
163
+ from
164
+ further
165
+ furthermore
166
+ g
167
+ get
168
+ gets
169
+ getting
170
+ given
171
+ gives
172
+ go
173
+ goes
174
+ going
175
+ gone
176
+ got
177
+ gotten
178
+ greetings
179
+ h
180
+ had
181
+ hadn't
182
+ happens
183
+ hardly
184
+ has
185
+ hasn't
186
+ have
187
+ haven't
188
+ having
189
+ he
190
+ he's
191
+ hello
192
+ help
193
+ hence
194
+ her
195
+ here
196
+ here's
197
+ hereafter
198
+ hereby
199
+ herein
200
+ hereupon
201
+ hers
202
+ herself
203
+ hi
204
+ him
205
+ himself
206
+ his
207
+ hither
208
+ hopefully
209
+ how
210
+ howbeit
211
+ however
212
+ i
213
+ i'd
214
+ i'll
215
+ i'm
216
+ i've
217
+ ie
218
+ if
219
+ ignored
220
+ immediate
221
+ in
222
+ inasmuch
223
+ inc
224
+ indeed
225
+ indicate
226
+ indicated
227
+ indicates
228
+ inner
229
+ insofar
230
+ instead
231
+ into
232
+ inward
233
+ is
234
+ isn't
235
+ it
236
+ it'd
237
+ it'll
238
+ it's
239
+ its
240
+ itself
241
+ j
242
+ just
243
+ k
244
+ keep
245
+ keeps
246
+ kept
247
+ know
248
+ knows
249
+ known
250
+ l
251
+ last
252
+ lately
253
+ later
254
+ latter
255
+ latterly
256
+ least
257
+ less
258
+ lest
259
+ let
260
+ let's
261
+ like
262
+ liked
263
+ likely
264
+ little
265
+ look
266
+ looking
267
+ looks
268
+ ltd
269
+ m
270
+ mainly
271
+ many
272
+ may
273
+ maybe
274
+ me
275
+ mean
276
+ meanwhile
277
+ merely
278
+ might
279
+ more
280
+ moreover
281
+ most
282
+ mostly
283
+ much
284
+ must
285
+ my
286
+ myself
287
+ n
288
+ name
289
+ namely
290
+ nd
291
+ near
292
+ nearly
293
+ necessary
294
+ need
295
+ needs
296
+ neither
297
+ never
298
+ nevertheless
299
+ new
300
+ next
301
+ nine
302
+ no
303
+ nobody
304
+ non
305
+ none
306
+ noone
307
+ nor
308
+ normally
309
+ not
310
+ nothing
311
+ novel
312
+ now
313
+ nowhere
314
+ o
315
+ obviously
316
+ of
317
+ off
318
+ often
319
+ oh
320
+ ok
321
+ okay
322
+ old
323
+ on
324
+ once
325
+ one
326
+ ones
327
+ only
328
+ onto
329
+ or
330
+ other
331
+ others
332
+ otherwise
333
+ ought
334
+ our
335
+ ours
336
+ ourselves
337
+ out
338
+ outside
339
+ over
340
+ overall
341
+ own
342
+ p
343
+ particular
344
+ particularly
345
+ per
346
+ perhaps
347
+ placed
348
+ please
349
+ plus
350
+ possible
351
+ presumably
352
+ probably
353
+ provides
354
+ q
355
+ que
356
+ quite
357
+ qv
358
+ r
359
+ rather
360
+ rd
361
+ re
362
+ really
363
+ reasonably
364
+ regarding
365
+ regardless
366
+ regards
367
+ relatively
368
+ respectively
369
+ right
370
+ s
371
+ said
372
+ same
373
+ saw
374
+ say
375
+ saying
376
+ says
377
+ second
378
+ secondly
379
+ see
380
+ seeing
381
+ seem
382
+ seemed
383
+ seeming
384
+ seems
385
+ seen
386
+ self
387
+ selves
388
+ sensible
389
+ sent
390
+ serious
391
+ seriously
392
+ seven
393
+ several
394
+ shall
395
+ she
396
+ should
397
+ shouldn't
398
+ since
399
+ six
400
+ so
401
+ some
402
+ somebody
403
+ somehow
404
+ someone
405
+ something
406
+ sometime
407
+ sometimes
408
+ somewhat
409
+ somewhere
410
+ soon
411
+ sorry
412
+ specified
413
+ specify
414
+ specifying
415
+ still
416
+ sub
417
+ such
418
+ sup
419
+ sure
420
+ t
421
+ t's
422
+ take
423
+ taken
424
+ tell
425
+ tends
426
+ th
427
+ than
428
+ thank
429
+ thanks
430
+ thanx
431
+ that
432
+ that's
433
+ thats
434
+ the
435
+ their
436
+ theirs
437
+ them
438
+ themselves
439
+ then
440
+ thence
441
+ there
442
+ there's
443
+ thereafter
444
+ thereby
445
+ therefore
446
+ therein
447
+ theres
448
+ thereupon
449
+ these
450
+ they
451
+ they'd
452
+ they'll
453
+ they're
454
+ they've
455
+ think
456
+ third
457
+ this
458
+ thorough
459
+ thoroughly
460
+ those
461
+ though
462
+ three
463
+ through
464
+ throughout
465
+ thru
466
+ thus
467
+ to
468
+ together
469
+ too
470
+ took
471
+ toward
472
+ towards
473
+ tried
474
+ tries
475
+ truly
476
+ try
477
+ trying
478
+ twice
479
+ two
480
+ u
481
+ un
482
+ under
483
+ unfortunately
484
+ unless
485
+ unlikely
486
+ until
487
+ unto
488
+ up
489
+ upon
490
+ us
491
+ use
492
+ used
493
+ useful
494
+ uses
495
+ using
496
+ usually
497
+ uucp
498
+ v
499
+ value
500
+ various
501
+ very
502
+ via
503
+ viz
504
+ vs
505
+ w
506
+ want
507
+ wants
508
+ was
509
+ wasn't
510
+ way
511
+ we
512
+ we'd
513
+ we'll
514
+ we're
515
+ we've
516
+ welcome
517
+ well
518
+ went
519
+ were
520
+ weren't
521
+ what
522
+ what's
523
+ whatever
524
+ when
525
+ whence
526
+ whenever
527
+ where
528
+ where's
529
+ whereafter
530
+ whereas
531
+ whereby
532
+ wherein
533
+ whereupon
534
+ wherever
535
+ whether
536
+ which
537
+ while
538
+ whither
539
+ who
540
+ who's
541
+ whoever
542
+ whole
543
+ whom
544
+ whose
545
+ why
546
+ will
547
+ willing
548
+ wish
549
+ with
550
+ within
551
+ without
552
+ won't
553
+ wonder
554
+ would
555
+ would
556
+ wouldn't
557
+ x
558
+ y
559
+ yes
560
+ yet
561
+ you
562
+ you'd
563
+ you'll
564
+ you're
565
+ you've
566
+ your
567
+ yours
568
+ yourself
569
+ yourselves
570
+ z
571
+ zero