josephwilk-rsemantic 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/semantic.rb ADDED
@@ -0,0 +1,29 @@
1
+ $:.unshift(File.dirname(__FILE__)) unless
2
+ $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
+
4
+ require "semantic/vector_space"
5
+ require "semantic/compare"
6
+ require "semantic/parser"
7
+ require "semantic/matrix_transformer"
8
+ require "semantic/search"
9
+ require "semantic/transform"
10
+ require "semantic/version"
11
+
12
+ require 'rubygems'
13
+ require 'linalg'
14
+ #http://rubyforge.org/projects/stemmer/
15
+ #A processor for removing the commoner morphological and inflexional endings from words in English
16
+ require 'stemmer'
17
+ require 'logger'
18
+
19
+ module Semantic # Top-level namespace; this file only adds the shared logger accessor to it.
20
+
21
+ def self.logger # Returns the module-wide Logger, building it on first use.
22
+ return @logger if @logger # Reuse the instance already cached on the module.
23
+ @logger = Logger.new(STDOUT) # Log output is written to standard output.
24
+ @logger.formatter = proc { |severity, time, progname, msg| "#{msg}\n" } # Emit only the message text plus a newline; severity, time and progname are ignored.
25
+ @logger.level = Logger::ERROR # Messages below ERROR severity are suppressed.
26
+ @logger
27
+ end
28
+
29
+ end
@@ -0,0 +1,571 @@
1
+ a
2
+ a's
3
+ able
4
+ about
5
+ above
6
+ according
7
+ accordingly
8
+ across
9
+ actually
10
+ after
11
+ afterwards
12
+ again
13
+ against
14
+ ain't
15
+ all
16
+ allow
17
+ allows
18
+ almost
19
+ alone
20
+ along
21
+ already
22
+ also
23
+ although
24
+ always
25
+ am
26
+ among
27
+ amongst
28
+ an
29
+ and
30
+ another
31
+ any
32
+ anybody
33
+ anyhow
34
+ anyone
35
+ anything
36
+ anyway
37
+ anyways
38
+ anywhere
39
+ apart
40
+ appear
41
+ appreciate
42
+ appropriate
43
+ are
44
+ aren't
45
+ around
46
+ as
47
+ aside
48
+ ask
49
+ asking
50
+ associated
51
+ at
52
+ available
53
+ away
54
+ awfully
55
+ b
56
+ be
57
+ became
58
+ because
59
+ become
60
+ becomes
61
+ becoming
62
+ been
63
+ before
64
+ beforehand
65
+ behind
66
+ being
67
+ believe
68
+ below
69
+ beside
70
+ besides
71
+ best
72
+ better
73
+ between
74
+ beyond
75
+ both
76
+ brief
77
+ but
78
+ by
79
+ c
80
+ c'mon
81
+ c's
82
+ came
83
+ can
84
+ can't
85
+ cannot
86
+ cant
87
+ cause
88
+ causes
89
+ certain
90
+ certainly
91
+ changes
92
+ clearly
93
+ co
94
+ com
95
+ come
96
+ comes
97
+ concerning
98
+ consequently
99
+ consider
100
+ considering
101
+ contain
102
+ containing
103
+ contains
104
+ corresponding
105
+ could
106
+ couldn't
107
+ course
108
+ currently
109
+ d
110
+ definitely
111
+ described
112
+ despite
113
+ did
114
+ didn't
115
+ different
116
+ do
117
+ does
118
+ doesn't
119
+ doing
120
+ don't
121
+ done
122
+ down
123
+ downwards
124
+ during
125
+ e
126
+ each
127
+ edu
128
+ eg
129
+ eight
130
+ either
131
+ else
132
+ elsewhere
133
+ enough
134
+ entirely
135
+ especially
136
+ et
137
+ etc
138
+ even
139
+ ever
140
+ every
141
+ everybody
142
+ everyone
143
+ everything
144
+ everywhere
145
+ ex
146
+ exactly
147
+ example
148
+ except
149
+ f
150
+ far
151
+ few
152
+ fifth
153
+ first
154
+ five
155
+ followed
156
+ following
157
+ follows
158
+ for
159
+ former
160
+ formerly
161
+ forth
162
+ four
163
+ from
164
+ further
165
+ furthermore
166
+ g
167
+ get
168
+ gets
169
+ getting
170
+ given
171
+ gives
172
+ go
173
+ goes
174
+ going
175
+ gone
176
+ got
177
+ gotten
178
+ greetings
179
+ h
180
+ had
181
+ hadn't
182
+ happens
183
+ hardly
184
+ has
185
+ hasn't
186
+ have
187
+ haven't
188
+ having
189
+ he
190
+ he's
191
+ hello
192
+ help
193
+ hence
194
+ her
195
+ here
196
+ here's
197
+ hereafter
198
+ hereby
199
+ herein
200
+ hereupon
201
+ hers
202
+ herself
203
+ hi
204
+ him
205
+ himself
206
+ his
207
+ hither
208
+ hopefully
209
+ how
210
+ howbeit
211
+ however
212
+ i
213
+ i'd
214
+ i'll
215
+ i'm
216
+ i've
217
+ ie
218
+ if
219
+ ignored
220
+ immediate
221
+ in
222
+ inasmuch
223
+ inc
224
+ indeed
225
+ indicate
226
+ indicated
227
+ indicates
228
+ inner
229
+ insofar
230
+ instead
231
+ into
232
+ inward
233
+ is
234
+ isn't
235
+ it
236
+ it'd
237
+ it'll
238
+ it's
239
+ its
240
+ itself
241
+ j
242
+ just
243
+ k
244
+ keep
245
+ keeps
246
+ kept
247
+ know
248
+ knows
249
+ known
250
+ l
251
+ last
252
+ lately
253
+ later
254
+ latter
255
+ latterly
256
+ least
257
+ less
258
+ lest
259
+ let
260
+ let's
261
+ like
262
+ liked
263
+ likely
264
+ little
265
+ look
266
+ looking
267
+ looks
268
+ ltd
269
+ m
270
+ mainly
271
+ many
272
+ may
273
+ maybe
274
+ me
275
+ mean
276
+ meanwhile
277
+ merely
278
+ might
279
+ more
280
+ moreover
281
+ most
282
+ mostly
283
+ much
284
+ must
285
+ my
286
+ myself
287
+ n
288
+ name
289
+ namely
290
+ nd
291
+ near
292
+ nearly
293
+ necessary
294
+ need
295
+ needs
296
+ neither
297
+ never
298
+ nevertheless
299
+ new
300
+ next
301
+ nine
302
+ no
303
+ nobody
304
+ non
305
+ none
306
+ noone
307
+ nor
308
+ normally
309
+ not
310
+ nothing
311
+ novel
312
+ now
313
+ nowhere
314
+ o
315
+ obviously
316
+ of
317
+ off
318
+ often
319
+ oh
320
+ ok
321
+ okay
322
+ old
323
+ on
324
+ once
325
+ one
326
+ ones
327
+ only
328
+ onto
329
+ or
330
+ other
331
+ others
332
+ otherwise
333
+ ought
334
+ our
335
+ ours
336
+ ourselves
337
+ out
338
+ outside
339
+ over
340
+ overall
341
+ own
342
+ p
343
+ particular
344
+ particularly
345
+ per
346
+ perhaps
347
+ placed
348
+ please
349
+ plus
350
+ possible
351
+ presumably
352
+ probably
353
+ provides
354
+ q
355
+ que
356
+ quite
357
+ qv
358
+ r
359
+ rather
360
+ rd
361
+ re
362
+ really
363
+ reasonably
364
+ regarding
365
+ regardless
366
+ regards
367
+ relatively
368
+ respectively
369
+ right
370
+ s
371
+ said
372
+ same
373
+ saw
374
+ say
375
+ saying
376
+ says
377
+ second
378
+ secondly
379
+ see
380
+ seeing
381
+ seem
382
+ seemed
383
+ seeming
384
+ seems
385
+ seen
386
+ self
387
+ selves
388
+ sensible
389
+ sent
390
+ serious
391
+ seriously
392
+ seven
393
+ several
394
+ shall
395
+ she
396
+ should
397
+ shouldn't
398
+ since
399
+ six
400
+ so
401
+ some
402
+ somebody
403
+ somehow
404
+ someone
405
+ something
406
+ sometime
407
+ sometimes
408
+ somewhat
409
+ somewhere
410
+ soon
411
+ sorry
412
+ specified
413
+ specify
414
+ specifying
415
+ still
416
+ sub
417
+ such
418
+ sup
419
+ sure
420
+ t
421
+ t's
422
+ take
423
+ taken
424
+ tell
425
+ tends
426
+ th
427
+ than
428
+ thank
429
+ thanks
430
+ thanx
431
+ that
432
+ that's
433
+ thats
434
+ the
435
+ their
436
+ theirs
437
+ them
438
+ themselves
439
+ then
440
+ thence
441
+ there
442
+ there's
443
+ thereafter
444
+ thereby
445
+ therefore
446
+ therein
447
+ theres
448
+ thereupon
449
+ these
450
+ they
451
+ they'd
452
+ they'll
453
+ they're
454
+ they've
455
+ think
456
+ third
457
+ this
458
+ thorough
459
+ thoroughly
460
+ those
461
+ though
462
+ three
463
+ through
464
+ throughout
465
+ thru
466
+ thus
467
+ to
468
+ together
469
+ too
470
+ took
471
+ toward
472
+ towards
473
+ tried
474
+ tries
475
+ truly
476
+ try
477
+ trying
478
+ twice
479
+ two
480
+ u
481
+ un
482
+ under
483
+ unfortunately
484
+ unless
485
+ unlikely
486
+ until
487
+ unto
488
+ up
489
+ upon
490
+ us
491
+ use
492
+ used
493
+ useful
494
+ uses
495
+ using
496
+ usually
497
+ uucp
498
+ v
499
+ value
500
+ various
501
+ very
502
+ via
503
+ viz
504
+ vs
505
+ w
506
+ want
507
+ wants
508
+ was
509
+ wasn't
510
+ way
511
+ we
512
+ we'd
513
+ we'll
514
+ we're
515
+ we've
516
+ welcome
517
+ well
518
+ went
519
+ were
520
+ weren't
521
+ what
522
+ what's
523
+ whatever
524
+ when
525
+ whence
526
+ whenever
527
+ where
528
+ where's
529
+ whereafter
530
+ whereas
531
+ whereby
532
+ wherein
533
+ whereupon
534
+ wherever
535
+ whether
536
+ which
537
+ while
538
+ whither
539
+ who
540
+ who's
541
+ whoever
542
+ whole
543
+ whom
544
+ whose
545
+ why
546
+ will
547
+ willing
548
+ wish
549
+ with
550
+ within
551
+ without
552
+ won't
553
+ wonder
554
+ would
555
+ would
556
+ wouldn't
557
+ x
558
+ y
559
+ yes
560
+ yet
561
+ you
562
+ you'd
563
+ you'll
564
+ you're
565
+ you've
566
+ your
567
+ yours
568
+ yourself
569
+ yourselves
570
+ z
571
+ zero
data/rsemantic.gemspec ADDED
@@ -0,0 +1,41 @@
1
+ Gem::Specification.new do |s| # Gem metadata for rsemantic 0.1.0.
2
+ s.name = %q{rsemantic}
3
+ s.version = "0.1.0"
4
+
5
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version= # Only set when this RubyGems exposes the setter.
6
+ s.authors = ["Joseph Wilk"]
7
+ s.date = %q{2008-11-13}
8
+ s.description = %q{A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency}
9
+ s.email = ["josephwilk@joesniff.co.uk"]
10
+ s.extra_rdoc_files = ["History.txt", "Manifest.txt", "README.txt", "TODO.txt"]
11
+ s.files = ["History.txt", "Manifest.txt", "README.txt", "Rakefile", "TODO.txt", "config/hoe.rb", "config/requirements.rb", "gem_tasks/deployment.rake", "gem_tasks/environment.rake", "gem_tasks/examples.rake", "gem_tasks/fix_cr_lf.rake", "gem_tasks/gemspec.rake", "gem_tasks/rspec.rake", "gem_tasks/website.rake", "lib/semantic.rb", "lib/semantic/compare.rb", "lib/semantic/matrix_transformer.rb", "lib/semantic/parser.rb", "lib/semantic/search.rb", "lib/semantic/transform.rb", "lib/semantic/transform/lsa_transform.rb", "lib/semantic/transform/tf_idf_transform.rb", "lib/semantic/vector_space.rb", "lib/semantic/vector_space/builder.rb", "lib/semantic/vector_space/model.rb", "lib/semantic/version.rb", "resources/english.stop", "rsemantic.gemspec", "spec/semantic/compare_spec.rb", "spec/semantic/matrix_transformer_spec.rb", "spec/semantic/parser_spec.rb", "spec/semantic/search_spec.rb", "spec/semantic/transform/lsa_transform_spec.rb", "spec/semantic/transform/tf_idf_transform_spec.rb", "spec/semantic/vector_space/builder_spec.rb", "spec/semantic/vector_space/model_spec.rb", "spec/spec.opts", "spec/spec_helper.rb"]
12
+ s.has_rdoc = true
13
+ s.homepage = %q{http://github.com/josephwilk/rsemantic}
14
+ s.rdoc_options = ["--main", "README.txt"]
15
+ s.require_paths = ["lib"]
16
+ s.rubyforge_project = %q{rsemantic}
17
+ s.rubygems_version = %q{1.2.0}
18
+ s.summary = %q{A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency}
19
+
20
+ if s.respond_to? :specification_version then # NOTE(review): lib/semantic.rb requires 'linalg' and 'stemmer', but neither is declared in any branch below - confirm.
21
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
22
+ s.specification_version = 2
23
+
24
+ if current_version >= 3 then # This branch separates runtime from development dependencies.
25
+ s.add_runtime_dependency(%q<term-ansicolor>, [">= 1.0.3"])
26
+ s.add_runtime_dependency(%q<rspec>, [">= 1.1.5"]) # NOTE(review): rspec is declared as a runtime dependency - confirm this is intended.
27
+ s.add_runtime_dependency(%q<diff-lcs>, [">= 1.1.2"])
28
+ s.add_development_dependency(%q<hoe>, [">= 1.8.2"])
29
+ else # Same four gems, all declared through the generic add_dependency.
30
+ s.add_dependency(%q<term-ansicolor>, [">= 1.0.3"])
31
+ s.add_dependency(%q<rspec>, [">= 1.1.5"])
32
+ s.add_dependency(%q<diff-lcs>, [">= 1.1.2"])
33
+ s.add_dependency(%q<hoe>, [">= 1.8.2"])
34
+ end
35
+ else # No specification_version support: same flat dependency list.
36
+ s.add_dependency(%q<term-ansicolor>, [">= 1.0.3"])
37
+ s.add_dependency(%q<rspec>, [">= 1.1.5"])
38
+ s.add_dependency(%q<diff-lcs>, [">= 1.1.2"])
39
+ s.add_dependency(%q<hoe>, [">= 1.8.2"])
40
+ end
41
+ end
@@ -0,0 +1,16 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ module Semantic # Spec for Semantic::Compare.
4
+ describe Compare do
5
+
6
+ def vector(values) # Helper: wraps a plain array of numbers in a Linalg::DMatrix.
7
+ Linalg::DMatrix.columns([values]) # presumably yields a single-column matrix - confirm against the Linalg API.
8
+ end
9
+
10
+ it "should calculate cosine" do
11
+ cosine = Compare.cosine( vector([0.1,0.5]), vector([0.9, 0.3]) )
12
+ cosine.should be_close(0.4961, 0.0001) # 0.24 / (sqrt(0.26) * sqrt(0.90)) ~= 0.4961, assuming cosine similarity.
13
+ end
14
+
15
+ end
16
+ end
@@ -0,0 +1,51 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ module Semantic
4
+ describe MatrixTransformer do
5
+
6
+ def mock_transform
7
+ @transform ||= mock(Transform)
8
+ end
9
+
10
+ def mock_vector_space
11
+ mock("vector space", :matrix => Linalg::DMatrix.rows([[1,0],[0,1]]), :matrix= => nil )
12
+ end
13
+
14
+
15
+ describe "transforming matrix" do
16
+
17
+ it "should ignore invalid transform class" do
18
+ matrix_transformer = MatrixTransformer.new(:transforms => [:FAKE])
19
+ lambda {
20
+ matrix_transformer.apply_transforms(mock_vector_space)
21
+ }.should_not raise_error
22
+ end
23
+
24
+ it "should use defaults transforms in none are specified" do
25
+ matrix_transformer = MatrixTransformer.new
26
+ Transform.should_receive(:const_get).with(:LSA).and_return(mock_transform)
27
+ Transform.should_receive(:const_get).with(:TFIDF).and_return(mock_transform)
28
+
29
+ matrix_transformer.apply_transforms(mock_vector_space)
30
+ end
31
+
32
+ it "should send transform message to class to transform matrix" do
33
+ matrix_transformer = MatrixTransformer.new(:transforms => [:LSA])
34
+ Transform.stub!(:const_get).and_return(mock_transform)
35
+
36
+ mock_transform.should_receive(:transform)
37
+
38
+ matrix_transformer.apply_transforms(mock_vector_space)
39
+ end
40
+
41
+ it "should check that transform class is capable of transforming" do
42
+ matrix_transformer = MatrixTransformer.new(:transforms => [:LSA])
43
+ Transform.stub!(:const_get).and_return(mock_transform)
44
+ mock_transform.should_receive(:respond_to?).with(:transform)
45
+
46
+ matrix_transformer.apply_transforms(mock_vector_space)
47
+ end
48
+
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,34 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ module Semantic # Spec for Semantic::Parser.
4
+ describe Parser do
5
+
6
+ it "should remove stop words" do
7
+ file = mock("file")
8
+ file.stub!(:read).and_return("a to be") # Stubbed file content; presumably consumed as the stop-word list - confirm in Parser.
9
+ File.stub!(:open).and_yield(file) # Any File.open call yields the mock instead of touching disk.
10
+ parser = Parser.new
11
+
12
+ parser.remove_stop_words(['a','house']).should == ['house']
13
+ end
14
+
15
+ it "should remove any non characters" do
16
+ file = mock("file")
17
+ file.stub!(:read).and_return("a to be")
18
+ File.stub!(:open).and_yield(file)
19
+
20
+ parser = Parser.new
21
+ parser.tokenise_and_stem("dragon.").should == ["dragon"] # The trailing "." must not survive into the token.
22
+ end
23
+
24
+ it "should tokenise the string" do
25
+ parser = Parser.new # NOTE(review): File.open is not stubbed here, unlike the examples above - confirm whether the real stop-word file is read.
26
+
27
+ parser.stub!(:remove_stop_words).and_return(['mouse','trap'])
28
+ parser.should_receive(:tokenise_and_stem).and_return(['mouse','trap'])
29
+
30
+ parser.tokenise_and_filter(['the mouse trap']) # NOTE(review): an Array is passed although the example name says "string" - confirm against Parser#tokenise_and_filter.
31
+ end
32
+
33
+ end
34
+ end