lda-ruby 0.3.5 → 0.3.6
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +10 -1
- data/Rakefile +1 -0
- data/VERSION.yml +1 -1
- data/lda-ruby.gemspec +5 -2
- data/lib/lda-ruby/config/stopwords.yml +571 -0
- data/lib/lda-ruby/corpus/corpus.rb +8 -3
- data/lib/lda-ruby/corpus/text_corpus.rb +1 -1
- data/lib/lda-ruby/document/document.rb +5 -2
- data/lib/lda-ruby/document/text_document.rb +1 -0
- data/test/lda_ruby_test.rb +10 -3
- data/test/simple_test.rb +26 -0
- data/test/simple_yaml.rb +23 -0
- metadata +7 -4
data/CHANGELOG
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
version 0.3.6
|
2
|
+
=============
|
3
|
+
|
4
|
+
- added stopwords list and included downcasing to improve performance
|
5
|
+
|
6
|
+
version 0.3.5
|
7
|
+
=============
|
8
|
+
|
9
|
+
- Bug fix for text documents by Rio Akasaka
|
10
|
+
|
1
11
|
Version 0.3.4
|
2
12
|
=============
|
3
13
|
|
@@ -20,7 +30,6 @@ Version 0.2.3
|
|
20
30
|
|
21
31
|
- Bug fixes by Todd Foster
|
22
32
|
|
23
|
-
|
24
33
|
Version 0.2.2
|
25
34
|
=============
|
26
35
|
|
data/Rakefile
CHANGED
@@ -12,6 +12,7 @@ begin
|
|
12
12
|
gem.homepage = "http://github.com/ealdent/lda-ruby"
|
13
13
|
gem.authors = ['David Blei', 'Jason Adams', 'Rio Akasaka']
|
14
14
|
gem.extensions = ['ext/lda-ruby/extconf.rb']
|
15
|
+
gem.files.include 'stopwords.txt'
|
15
16
|
gem.require_paths = ['lib', 'ext']
|
16
17
|
gem.add_dependency 'shoulda'
|
17
18
|
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
data/VERSION.yml
CHANGED
data/lda-ruby.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{lda-ruby}
|
8
|
-
s.version = "0.3.5"
|
8
|
+
s.version = "0.3.6"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["David Blei", "Jason Adams", "Rio Akasaka"]
|
12
|
-
s.date = %q{2011-08-
|
12
|
+
s.date = %q{2011-08-05}
|
13
13
|
s.description = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
|
14
14
|
s.email = %q{jasonmadams@gmail.com}
|
15
15
|
s.extensions = ["ext/lda-ruby/extconf.rb"]
|
@@ -40,6 +40,7 @@ Gem::Specification.new do |s|
|
|
40
40
|
"ext/lda-ruby/utils.h",
|
41
41
|
"lda-ruby.gemspec",
|
42
42
|
"lib/lda-ruby.rb",
|
43
|
+
"lib/lda-ruby/config/stopwords.yml",
|
43
44
|
"lib/lda-ruby/corpus/corpus.rb",
|
44
45
|
"lib/lda-ruby/corpus/data_corpus.rb",
|
45
46
|
"lib/lda-ruby/corpus/directory_corpus.rb",
|
@@ -54,6 +55,8 @@ Gem::Specification.new do |s|
|
|
54
55
|
"test/data/sample.rb",
|
55
56
|
"test/data/wiki-test-docs.yml",
|
56
57
|
"test/lda_ruby_test.rb",
|
58
|
+
"test/simple_test.rb",
|
59
|
+
"test/simple_yaml.rb",
|
57
60
|
"test/test_helper.rb"
|
58
61
|
]
|
59
62
|
s.homepage = %q{http://github.com/ealdent/lda-ruby}
|
@@ -0,0 +1,571 @@
|
|
1
|
+
---
|
2
|
+
- a
|
3
|
+
- a's
|
4
|
+
- able
|
5
|
+
- about
|
6
|
+
- above
|
7
|
+
- according
|
8
|
+
- accordingly
|
9
|
+
- across
|
10
|
+
- actually
|
11
|
+
- after
|
12
|
+
- afterwards
|
13
|
+
- again
|
14
|
+
- against
|
15
|
+
- ain't
|
16
|
+
- all
|
17
|
+
- allow
|
18
|
+
- allows
|
19
|
+
- almost
|
20
|
+
- alone
|
21
|
+
- along
|
22
|
+
- already
|
23
|
+
- also
|
24
|
+
- although
|
25
|
+
- always
|
26
|
+
- am
|
27
|
+
- among
|
28
|
+
- amongst
|
29
|
+
- an
|
30
|
+
- and
|
31
|
+
- another
|
32
|
+
- any
|
33
|
+
- anybody
|
34
|
+
- anyhow
|
35
|
+
- anyone
|
36
|
+
- anything
|
37
|
+
- anyway
|
38
|
+
- anyways
|
39
|
+
- anywhere
|
40
|
+
- apart
|
41
|
+
- appear
|
42
|
+
- appreciate
|
43
|
+
- appropriate
|
44
|
+
- are
|
45
|
+
- aren't
|
46
|
+
- around
|
47
|
+
- as
|
48
|
+
- aside
|
49
|
+
- ask
|
50
|
+
- asking
|
51
|
+
- associated
|
52
|
+
- at
|
53
|
+
- available
|
54
|
+
- away
|
55
|
+
- awfully
|
56
|
+
- b
|
57
|
+
- be
|
58
|
+
- became
|
59
|
+
- because
|
60
|
+
- become
|
61
|
+
- becomes
|
62
|
+
- becoming
|
63
|
+
- been
|
64
|
+
- before
|
65
|
+
- beforehand
|
66
|
+
- behind
|
67
|
+
- being
|
68
|
+
- believe
|
69
|
+
- below
|
70
|
+
- beside
|
71
|
+
- besides
|
72
|
+
- best
|
73
|
+
- better
|
74
|
+
- between
|
75
|
+
- beyond
|
76
|
+
- both
|
77
|
+
- brief
|
78
|
+
- but
|
79
|
+
- by
|
80
|
+
- c
|
81
|
+
- c'mon
|
82
|
+
- c's
|
83
|
+
- came
|
84
|
+
- can
|
85
|
+
- can't
|
86
|
+
- cannot
|
87
|
+
- cant
|
88
|
+
- cause
|
89
|
+
- causes
|
90
|
+
- certain
|
91
|
+
- certainly
|
92
|
+
- changes
|
93
|
+
- clearly
|
94
|
+
- co
|
95
|
+
- com
|
96
|
+
- come
|
97
|
+
- comes
|
98
|
+
- concerning
|
99
|
+
- consequently
|
100
|
+
- consider
|
101
|
+
- considering
|
102
|
+
- contain
|
103
|
+
- containing
|
104
|
+
- contains
|
105
|
+
- corresponding
|
106
|
+
- could
|
107
|
+
- couldn't
|
108
|
+
- course
|
109
|
+
- currently
|
110
|
+
- d
|
111
|
+
- definitely
|
112
|
+
- described
|
113
|
+
- despite
|
114
|
+
- did
|
115
|
+
- didn't
|
116
|
+
- different
|
117
|
+
- do
|
118
|
+
- does
|
119
|
+
- doesn't
|
120
|
+
- doing
|
121
|
+
- don't
|
122
|
+
- done
|
123
|
+
- down
|
124
|
+
- downwards
|
125
|
+
- during
|
126
|
+
- e
|
127
|
+
- each
|
128
|
+
- edu
|
129
|
+
- eg
|
130
|
+
- eight
|
131
|
+
- either
|
132
|
+
- else
|
133
|
+
- elsewhere
|
134
|
+
- enough
|
135
|
+
- entirely
|
136
|
+
- especially
|
137
|
+
- et
|
138
|
+
- etc
|
139
|
+
- even
|
140
|
+
- ever
|
141
|
+
- every
|
142
|
+
- everybody
|
143
|
+
- everyone
|
144
|
+
- everything
|
145
|
+
- everywhere
|
146
|
+
- ex
|
147
|
+
- exactly
|
148
|
+
- example
|
149
|
+
- except
|
150
|
+
- f
|
151
|
+
- far
|
152
|
+
- few
|
153
|
+
- fifth
|
154
|
+
- first
|
155
|
+
- five
|
156
|
+
- followed
|
157
|
+
- following
|
158
|
+
- follows
|
159
|
+
- for
|
160
|
+
- former
|
161
|
+
- formerly
|
162
|
+
- forth
|
163
|
+
- four
|
164
|
+
- from
|
165
|
+
- further
|
166
|
+
- furthermore
|
167
|
+
- g
|
168
|
+
- get
|
169
|
+
- gets
|
170
|
+
- getting
|
171
|
+
- given
|
172
|
+
- gives
|
173
|
+
- go
|
174
|
+
- goes
|
175
|
+
- going
|
176
|
+
- gone
|
177
|
+
- got
|
178
|
+
- gotten
|
179
|
+
- greetings
|
180
|
+
- h
|
181
|
+
- had
|
182
|
+
- hadn't
|
183
|
+
- happens
|
184
|
+
- hardly
|
185
|
+
- has
|
186
|
+
- hasn't
|
187
|
+
- have
|
188
|
+
- haven't
|
189
|
+
- having
|
190
|
+
- he
|
191
|
+
- he's
|
192
|
+
- hello
|
193
|
+
- help
|
194
|
+
- hence
|
195
|
+
- her
|
196
|
+
- here
|
197
|
+
- here's
|
198
|
+
- hereafter
|
199
|
+
- hereby
|
200
|
+
- herein
|
201
|
+
- hereupon
|
202
|
+
- hers
|
203
|
+
- herself
|
204
|
+
- hi
|
205
|
+
- him
|
206
|
+
- himself
|
207
|
+
- his
|
208
|
+
- hither
|
209
|
+
- hopefully
|
210
|
+
- how
|
211
|
+
- howbeit
|
212
|
+
- however
|
213
|
+
- i
|
214
|
+
- i'd
|
215
|
+
- i'll
|
216
|
+
- i'm
|
217
|
+
- i've
|
218
|
+
- ie
|
219
|
+
- if
|
220
|
+
- ignored
|
221
|
+
- immediate
|
222
|
+
- in
|
223
|
+
- inasmuch
|
224
|
+
- inc
|
225
|
+
- indeed
|
226
|
+
- indicate
|
227
|
+
- indicated
|
228
|
+
- indicates
|
229
|
+
- inner
|
230
|
+
- insofar
|
231
|
+
- instead
|
232
|
+
- into
|
233
|
+
- inward
|
234
|
+
- is
|
235
|
+
- isn't
|
236
|
+
- it
|
237
|
+
- it'd
|
238
|
+
- it'll
|
239
|
+
- it's
|
240
|
+
- its
|
241
|
+
- itself
|
242
|
+
- j
|
243
|
+
- just
|
244
|
+
- k
|
245
|
+
- keep
|
246
|
+
- keeps
|
247
|
+
- kept
|
248
|
+
- know
|
249
|
+
- knows
|
250
|
+
- known
|
251
|
+
- l
|
252
|
+
- last
|
253
|
+
- lately
|
254
|
+
- later
|
255
|
+
- latter
|
256
|
+
- latterly
|
257
|
+
- least
|
258
|
+
- less
|
259
|
+
- lest
|
260
|
+
- let
|
261
|
+
- let's
|
262
|
+
- like
|
263
|
+
- liked
|
264
|
+
- likely
|
265
|
+
- little
|
266
|
+
- look
|
267
|
+
- looking
|
268
|
+
- looks
|
269
|
+
- ltd
|
270
|
+
- m
|
271
|
+
- mainly
|
272
|
+
- many
|
273
|
+
- may
|
274
|
+
- maybe
|
275
|
+
- me
|
276
|
+
- mean
|
277
|
+
- meanwhile
|
278
|
+
- merely
|
279
|
+
- might
|
280
|
+
- more
|
281
|
+
- moreover
|
282
|
+
- most
|
283
|
+
- mostly
|
284
|
+
- much
|
285
|
+
- must
|
286
|
+
- my
|
287
|
+
- myself
|
288
|
+
- n
|
289
|
+
- name
|
290
|
+
- namely
|
291
|
+
- nd
|
292
|
+
- near
|
293
|
+
- nearly
|
294
|
+
- necessary
|
295
|
+
- need
|
296
|
+
- needs
|
297
|
+
- neither
|
298
|
+
- never
|
299
|
+
- nevertheless
|
300
|
+
- new
|
301
|
+
- next
|
302
|
+
- nine
|
303
|
+
- "no"
|
304
|
+
- nobody
|
305
|
+
- non
|
306
|
+
- none
|
307
|
+
- noone
|
308
|
+
- nor
|
309
|
+
- normally
|
310
|
+
- not
|
311
|
+
- nothing
|
312
|
+
- novel
|
313
|
+
- now
|
314
|
+
- nowhere
|
315
|
+
- o
|
316
|
+
- obviously
|
317
|
+
- of
|
318
|
+
- "off"
|
319
|
+
- often
|
320
|
+
- oh
|
321
|
+
- ok
|
322
|
+
- okay
|
323
|
+
- old
|
324
|
+
- "on"
|
325
|
+
- once
|
326
|
+
- one
|
327
|
+
- ones
|
328
|
+
- only
|
329
|
+
- onto
|
330
|
+
- or
|
331
|
+
- other
|
332
|
+
- others
|
333
|
+
- otherwise
|
334
|
+
- ought
|
335
|
+
- our
|
336
|
+
- ours
|
337
|
+
- ourselves
|
338
|
+
- out
|
339
|
+
- outside
|
340
|
+
- over
|
341
|
+
- overall
|
342
|
+
- own
|
343
|
+
- p
|
344
|
+
- particular
|
345
|
+
- particularly
|
346
|
+
- per
|
347
|
+
- perhaps
|
348
|
+
- placed
|
349
|
+
- please
|
350
|
+
- plus
|
351
|
+
- possible
|
352
|
+
- presumably
|
353
|
+
- probably
|
354
|
+
- provides
|
355
|
+
- q
|
356
|
+
- que
|
357
|
+
- quite
|
358
|
+
- qv
|
359
|
+
- r
|
360
|
+
- rather
|
361
|
+
- rd
|
362
|
+
- re
|
363
|
+
- really
|
364
|
+
- reasonably
|
365
|
+
- regarding
|
366
|
+
- regardless
|
367
|
+
- regards
|
368
|
+
- relatively
|
369
|
+
- respectively
|
370
|
+
- right
|
371
|
+
- s
|
372
|
+
- said
|
373
|
+
- same
|
374
|
+
- saw
|
375
|
+
- say
|
376
|
+
- saying
|
377
|
+
- says
|
378
|
+
- second
|
379
|
+
- secondly
|
380
|
+
- see
|
381
|
+
- seeing
|
382
|
+
- seem
|
383
|
+
- seemed
|
384
|
+
- seeming
|
385
|
+
- seems
|
386
|
+
- seen
|
387
|
+
- self
|
388
|
+
- selves
|
389
|
+
- sensible
|
390
|
+
- sent
|
391
|
+
- serious
|
392
|
+
- seriously
|
393
|
+
- seven
|
394
|
+
- several
|
395
|
+
- shall
|
396
|
+
- she
|
397
|
+
- should
|
398
|
+
- shouldn't
|
399
|
+
- since
|
400
|
+
- six
|
401
|
+
- so
|
402
|
+
- some
|
403
|
+
- somebody
|
404
|
+
- somehow
|
405
|
+
- someone
|
406
|
+
- something
|
407
|
+
- sometime
|
408
|
+
- sometimes
|
409
|
+
- somewhat
|
410
|
+
- somewhere
|
411
|
+
- soon
|
412
|
+
- sorry
|
413
|
+
- specified
|
414
|
+
- specify
|
415
|
+
- specifying
|
416
|
+
- still
|
417
|
+
- sub
|
418
|
+
- such
|
419
|
+
- sup
|
420
|
+
- sure
|
421
|
+
- t
|
422
|
+
- t's
|
423
|
+
- take
|
424
|
+
- taken
|
425
|
+
- tell
|
426
|
+
- tends
|
427
|
+
- th
|
428
|
+
- than
|
429
|
+
- thank
|
430
|
+
- thanks
|
431
|
+
- thanx
|
432
|
+
- that
|
433
|
+
- that's
|
434
|
+
- thats
|
435
|
+
- the
|
436
|
+
- their
|
437
|
+
- theirs
|
438
|
+
- them
|
439
|
+
- themselves
|
440
|
+
- then
|
441
|
+
- thence
|
442
|
+
- there
|
443
|
+
- there's
|
444
|
+
- thereafter
|
445
|
+
- thereby
|
446
|
+
- therefore
|
447
|
+
- therein
|
448
|
+
- theres
|
449
|
+
- thereupon
|
450
|
+
- these
|
451
|
+
- they
|
452
|
+
- they'd
|
453
|
+
- they'll
|
454
|
+
- they're
|
455
|
+
- they've
|
456
|
+
- think
|
457
|
+
- third
|
458
|
+
- this
|
459
|
+
- thorough
|
460
|
+
- thoroughly
|
461
|
+
- those
|
462
|
+
- though
|
463
|
+
- three
|
464
|
+
- through
|
465
|
+
- throughout
|
466
|
+
- thru
|
467
|
+
- thus
|
468
|
+
- to
|
469
|
+
- together
|
470
|
+
- too
|
471
|
+
- took
|
472
|
+
- toward
|
473
|
+
- towards
|
474
|
+
- tried
|
475
|
+
- tries
|
476
|
+
- truly
|
477
|
+
- try
|
478
|
+
- trying
|
479
|
+
- twice
|
480
|
+
- two
|
481
|
+
- u
|
482
|
+
- un
|
483
|
+
- under
|
484
|
+
- unfortunately
|
485
|
+
- unless
|
486
|
+
- unlikely
|
487
|
+
- until
|
488
|
+
- unto
|
489
|
+
- up
|
490
|
+
- upon
|
491
|
+
- us
|
492
|
+
- use
|
493
|
+
- used
|
494
|
+
- useful
|
495
|
+
- uses
|
496
|
+
- using
|
497
|
+
- usually
|
498
|
+
- v
|
499
|
+
- value
|
500
|
+
- various
|
501
|
+
- very
|
502
|
+
- via
|
503
|
+
- viz
|
504
|
+
- vs
|
505
|
+
- w
|
506
|
+
- want
|
507
|
+
- wants
|
508
|
+
- was
|
509
|
+
- wasn't
|
510
|
+
- way
|
511
|
+
- we
|
512
|
+
- we'd
|
513
|
+
- we'll
|
514
|
+
- we're
|
515
|
+
- we've
|
516
|
+
- welcome
|
517
|
+
- well
|
518
|
+
- went
|
519
|
+
- were
|
520
|
+
- weren't
|
521
|
+
- what
|
522
|
+
- what's
|
523
|
+
- whatever
|
524
|
+
- when
|
525
|
+
- whence
|
526
|
+
- whenever
|
527
|
+
- where
|
528
|
+
- where's
|
529
|
+
- whereafter
|
530
|
+
- whereas
|
531
|
+
- whereby
|
532
|
+
- wherein
|
533
|
+
- whereupon
|
534
|
+
- wherever
|
535
|
+
- whether
|
536
|
+
- which
|
537
|
+
- while
|
538
|
+
- whither
|
539
|
+
- who
|
540
|
+
- who's
|
541
|
+
- whoever
|
542
|
+
- whole
|
543
|
+
- whom
|
544
|
+
- whose
|
545
|
+
- why
|
546
|
+
- will
|
547
|
+
- willing
|
548
|
+
- wish
|
549
|
+
- with
|
550
|
+
- within
|
551
|
+
- without
|
552
|
+
- won't
|
553
|
+
- wonder
|
554
|
+
- would
|
555
|
+
- would
|
556
|
+
- wouldn't
|
557
|
+
- x
|
558
|
+
- y
|
559
|
+
- "yes"
|
560
|
+
- yet
|
561
|
+
- you
|
562
|
+
- you'd
|
563
|
+
- you'll
|
564
|
+
- you're
|
565
|
+
- you've
|
566
|
+
- your
|
567
|
+
- yours
|
568
|
+
- yourself
|
569
|
+
- yourselves
|
570
|
+
- z
|
571
|
+
- zero
|
@@ -2,13 +2,15 @@ require 'set'
|
|
2
2
|
|
3
3
|
module Lda
|
4
4
|
class Corpus
|
5
|
-
attr_reader :documents, :num_docs, :num_terms, :vocabulary
|
5
|
+
attr_reader :documents, :num_docs, :num_terms, :vocabulary, :stopwords
|
6
6
|
|
7
7
|
def initialize
|
8
8
|
@documents = Array.new
|
9
9
|
@all_terms = Set.new
|
10
10
|
@num_terms = @num_docs = 0
|
11
11
|
@vocabulary = Vocabulary.new
|
12
|
+
@stopwords = YAML.load_file(File.join(File.dirname(__FILE__), '..', 'config', 'stopwords.yml'))
|
13
|
+
@stopwords.map! { |w| w.strip }
|
12
14
|
end
|
13
15
|
|
14
16
|
def add_document(doc)
|
@@ -21,10 +23,13 @@ module Lda
|
|
21
23
|
@num_terms = @all_terms.size
|
22
24
|
|
23
25
|
update_vocabulary(doc)
|
24
|
-
|
25
26
|
nil
|
26
27
|
end
|
27
|
-
|
28
|
+
|
29
|
+
def remove_word(word)
|
30
|
+
@vocabulary.words.delete word
|
31
|
+
end
|
32
|
+
|
28
33
|
protected
|
29
34
|
|
30
35
|
def update_vocabulary(doc)
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
|
1
3
|
module Lda
|
2
4
|
class Document
|
3
5
|
attr_reader :corpus, :words, :counts, :length, :total, :tokens
|
@@ -29,8 +31,9 @@ module Lda
|
|
29
31
|
end
|
30
32
|
|
31
33
|
def tokenize(text)
|
32
|
-
clean_text = text.gsub(/[^A-Za-z'\s]+/, ' ').gsub(/\s+/, ' ') # remove everything but letters and ' and leave only single spaces
|
34
|
+
clean_text = text.gsub(/[^A-Za-z'\s]+/, ' ').gsub(/\s+/, ' ').downcase # remove everything but letters and ' and leave only single spaces
|
33
35
|
@tokens = handle(clean_text.split(' '))
|
36
|
+
nil
|
34
37
|
end
|
35
38
|
end
|
36
|
-
end
|
39
|
+
end
|
data/test/lda_ruby_test.rb
CHANGED
@@ -1,4 +1,11 @@
|
|
1
|
-
require '
|
1
|
+
require 'rubygems'
|
2
|
+
require 'test/unit'
|
3
|
+
require 'shoulda'
|
4
|
+
require 'yaml'
|
5
|
+
|
6
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
7
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
8
|
+
require 'lda-ruby'
|
2
9
|
|
3
10
|
class LdaRubyTest < Test::Unit::TestCase
|
4
11
|
context "A Document instance" do
|
@@ -66,7 +73,7 @@ class LdaRubyTest < Test::Unit::TestCase
|
|
66
73
|
|
67
74
|
context "A typical TextDocument" do
|
68
75
|
setup do
|
69
|
-
@text = '
|
76
|
+
@text = 'stop words stop stop masterful stoppage buffalo buffalo buffalo'
|
70
77
|
@document = Lda::TextDocument.new(@corpus, @text)
|
71
78
|
end
|
72
79
|
|
@@ -104,7 +111,7 @@ class LdaRubyTest < Test::Unit::TestCase
|
|
104
111
|
|
105
112
|
should "update vocabulary with words in the document" do
|
106
113
|
@corpus.add_document(@document2)
|
107
|
-
assert_equal @corpus.vocabulary.words.member?('
|
114
|
+
assert_equal @corpus.vocabulary.words.member?('lame'), true
|
108
115
|
end
|
109
116
|
end
|
110
117
|
|
data/test/simple_test.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'shoulda'
|
3
|
+
require 'yaml'
|
4
|
+
require 'lda-ruby'
|
5
|
+
|
6
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
7
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
8
|
+
|
9
|
+
class Test::Unit::TestCase
|
10
|
+
|
11
|
+
@corpus = Lda::Corpus.new
|
12
|
+
@document1 = Lda::TextDocument.new(@corpus, 'Dom Cobb is a skilled thief, the absolute best in the dangerous art of extraction, stealing valuable secrets from deep within the subconscious during the dream state, when the mind is at its most vulnerable. Cobb\'s rare ability has made him a coveted player in this treacherous new world of corporate espionage, but it has also made him an international fugitive and cost him everything he has ever loved. Now Cobb is being offered a chance at redemption. One last job could give him his life back but only if he can accomplish the impossible-inception. Instead of the perfect heist, Cobb and his team of specialists have to pull off the reverse: their task is not to steal an idea but to plant one. If they succeed, it could be the perfect crime. But no amount of careful planning or expertise can prepare the team for the dangerous enemy that seems to predict their every move. An enemy that only Cobb could have seen coming.')
|
13
|
+
@document2 = Lda::TextDocument.new(@corpus, 'When his brother is killed in a robbery, paraplegic Marine Jake Sully decides to take his place in a mission on the distant world of Pandora. There he learns of greedy corporate figurehead Parker Selfridge\'s intentions of driving off the native humanoid \"Na\'vi\" in order to mine for the precious material scattered throughout their rich woodland. In exchange for the spinal surgery that will fix his legs, Jake gathers intel for the cooperating military unit spearheaded by gung-ho Colonel Quaritch, while simultaneously attempting to infiltrate the Na\'vi people with the use of an \"avatar\" identity. While Jake begins to bond with the native tribe and quickly falls in love with the beautiful alien Neytiri, the restless Colonel moves forward with his ruthless extermination tactics, forcing the soldier to take a stand - and fight back in an epic battle for the fate of Pandora.')
|
14
|
+
|
15
|
+
@corpus.add_document(@document1)
|
16
|
+
@corpus.add_document(@document2)
|
17
|
+
@corpus.remove_word("cobb")
|
18
|
+
@lda = Lda::Lda.new(@corpus)
|
19
|
+
|
20
|
+
@lda.verbose = false
|
21
|
+
@lda.num_topics = 2
|
22
|
+
@lda.em('random')
|
23
|
+
topics = @lda.top_words(5)
|
24
|
+
puts topics
|
25
|
+
|
26
|
+
end
|
data/test/simple_yaml.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'shoulda'
|
3
|
+
require 'yaml'
|
4
|
+
require 'lda-ruby'
|
5
|
+
|
6
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
7
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
8
|
+
|
9
|
+
class Test::Unit::TestCase
|
10
|
+
|
11
|
+
@filename = File.join(File.dirname(__FILE__), 'data', 'wiki-test-docs.yml')
|
12
|
+
@filedocs = YAML::load_file(@filename)
|
13
|
+
@corpus = Lda::TextCorpus.new(@filename)
|
14
|
+
|
15
|
+
@lda = Lda::Lda.new(@corpus)
|
16
|
+
|
17
|
+
@lda.verbose = false
|
18
|
+
@lda.num_topics = 20
|
19
|
+
@lda.em('random')
|
20
|
+
@lda.print_topics(20)
|
21
|
+
|
22
|
+
|
23
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lda-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.5
|
4
|
+
version: 0.3.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,12 +11,12 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2011-08-
|
14
|
+
date: 2011-08-05 00:00:00.000000000 -04:00
|
15
15
|
default_executable:
|
16
16
|
dependencies:
|
17
17
|
- !ruby/object:Gem::Dependency
|
18
18
|
name: shoulda
|
19
|
-
requirement: &
|
19
|
+
requirement: &2153224820 !ruby/object:Gem::Requirement
|
20
20
|
none: false
|
21
21
|
requirements:
|
22
22
|
- - ! '>='
|
@@ -24,7 +24,7 @@ dependencies:
|
|
24
24
|
version: '0'
|
25
25
|
type: :runtime
|
26
26
|
prerelease: false
|
27
|
-
version_requirements: *
|
27
|
+
version_requirements: *2153224820
|
28
28
|
description: Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.
|
29
29
|
email: jasonmadams@gmail.com
|
30
30
|
executables: []
|
@@ -56,6 +56,7 @@ files:
|
|
56
56
|
- ext/lda-ruby/utils.h
|
57
57
|
- lda-ruby.gemspec
|
58
58
|
- lib/lda-ruby.rb
|
59
|
+
- lib/lda-ruby/config/stopwords.yml
|
59
60
|
- lib/lda-ruby/corpus/corpus.rb
|
60
61
|
- lib/lda-ruby/corpus/data_corpus.rb
|
61
62
|
- lib/lda-ruby/corpus/directory_corpus.rb
|
@@ -70,6 +71,8 @@ files:
|
|
70
71
|
- test/data/sample.rb
|
71
72
|
- test/data/wiki-test-docs.yml
|
72
73
|
- test/lda_ruby_test.rb
|
74
|
+
- test/simple_test.rb
|
75
|
+
- test/simple_yaml.rb
|
73
76
|
- test/test_helper.rb
|
74
77
|
has_rdoc: true
|
75
78
|
homepage: http://github.com/ealdent/lda-ruby
|