lda-ruby 0.3.5 → 0.3.6

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,13 @@
1
+ version 0.3.6
2
+ =============
3
+
4
+ - added stopwords list and included downcasing to improve performance
5
+
6
+ version 0.3.5
7
+ =============
8
+
9
+ - Bug fix for text documents by Rio Akasaka
10
+
1
11
  Version 0.3.4
2
12
  =============
3
13
 
@@ -20,7 +30,6 @@ Version 0.2.3
20
30
 
21
31
  - Bug fixes by Todd Foster
22
32
 
23
-
24
33
  Version 0.2.2
25
34
  =============
26
35
 
data/Rakefile CHANGED
@@ -12,6 +12,7 @@ begin
12
12
  gem.homepage = "http://github.com/ealdent/lda-ruby"
13
13
  gem.authors = ['David Blei', 'Jason Adams', 'Rio Akasaka']
14
14
  gem.extensions = ['ext/lda-ruby/extconf.rb']
15
+ gem.files.include 'stopwords.txt'
15
16
  gem.require_paths = ['lib', 'ext']
16
17
  gem.add_dependency 'shoulda'
17
18
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
@@ -1,5 +1,5 @@
1
1
  ---
2
2
  :major: 0
3
3
  :minor: 3
4
- :patch: 5
4
+ :patch: 6
5
5
  :build:
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{lda-ruby}
8
- s.version = "0.3.5"
8
+ s.version = "0.3.6"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["David Blei", "Jason Adams", "Rio Akasaka"]
12
- s.date = %q{2011-08-03}
12
+ s.date = %q{2011-08-05}
13
13
  s.description = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
14
14
  s.email = %q{jasonmadams@gmail.com}
15
15
  s.extensions = ["ext/lda-ruby/extconf.rb"]
@@ -40,6 +40,7 @@ Gem::Specification.new do |s|
40
40
  "ext/lda-ruby/utils.h",
41
41
  "lda-ruby.gemspec",
42
42
  "lib/lda-ruby.rb",
43
+ "lib/lda-ruby/config/stopwords.yml",
43
44
  "lib/lda-ruby/corpus/corpus.rb",
44
45
  "lib/lda-ruby/corpus/data_corpus.rb",
45
46
  "lib/lda-ruby/corpus/directory_corpus.rb",
@@ -54,6 +55,8 @@ Gem::Specification.new do |s|
54
55
  "test/data/sample.rb",
55
56
  "test/data/wiki-test-docs.yml",
56
57
  "test/lda_ruby_test.rb",
58
+ "test/simple_test.rb",
59
+ "test/simple_yaml.rb",
57
60
  "test/test_helper.rb"
58
61
  ]
59
62
  s.homepage = %q{http://github.com/ealdent/lda-ruby}
@@ -0,0 +1,571 @@
1
+ ---
2
+ - a
3
+ - a's
4
+ - able
5
+ - about
6
+ - above
7
+ - according
8
+ - accordingly
9
+ - across
10
+ - actually
11
+ - after
12
+ - afterwards
13
+ - again
14
+ - against
15
+ - ain't
16
+ - all
17
+ - allow
18
+ - allows
19
+ - almost
20
+ - alone
21
+ - along
22
+ - already
23
+ - also
24
+ - although
25
+ - always
26
+ - am
27
+ - among
28
+ - amongst
29
+ - an
30
+ - and
31
+ - another
32
+ - any
33
+ - anybody
34
+ - anyhow
35
+ - anyone
36
+ - anything
37
+ - anyway
38
+ - anyways
39
+ - anywhere
40
+ - apart
41
+ - appear
42
+ - appreciate
43
+ - appropriate
44
+ - are
45
+ - aren't
46
+ - around
47
+ - as
48
+ - aside
49
+ - ask
50
+ - asking
51
+ - associated
52
+ - at
53
+ - available
54
+ - away
55
+ - awfully
56
+ - b
57
+ - be
58
+ - became
59
+ - because
60
+ - become
61
+ - becomes
62
+ - becoming
63
+ - been
64
+ - before
65
+ - beforehand
66
+ - behind
67
+ - being
68
+ - believe
69
+ - below
70
+ - beside
71
+ - besides
72
+ - best
73
+ - better
74
+ - between
75
+ - beyond
76
+ - both
77
+ - brief
78
+ - but
79
+ - by
80
+ - c
81
+ - c'mon
82
+ - c's
83
+ - came
84
+ - can
85
+ - can't
86
+ - cannot
87
+ - cant
88
+ - cause
89
+ - causes
90
+ - certain
91
+ - certainly
92
+ - changes
93
+ - clearly
94
+ - co
95
+ - com
96
+ - come
97
+ - comes
98
+ - concerning
99
+ - consequently
100
+ - consider
101
+ - considering
102
+ - contain
103
+ - containing
104
+ - contains
105
+ - corresponding
106
+ - could
107
+ - couldn't
108
+ - course
109
+ - currently
110
+ - d
111
+ - definitely
112
+ - described
113
+ - despite
114
+ - did
115
+ - didn't
116
+ - different
117
+ - do
118
+ - does
119
+ - doesn't
120
+ - doing
121
+ - don't
122
+ - done
123
+ - down
124
+ - downwards
125
+ - during
126
+ - e
127
+ - each
128
+ - edu
129
+ - eg
130
+ - eight
131
+ - either
132
+ - else
133
+ - elsewhere
134
+ - enough
135
+ - entirely
136
+ - especially
137
+ - et
138
+ - etc
139
+ - even
140
+ - ever
141
+ - every
142
+ - everybody
143
+ - everyone
144
+ - everything
145
+ - everywhere
146
+ - ex
147
+ - exactly
148
+ - example
149
+ - except
150
+ - f
151
+ - far
152
+ - few
153
+ - fifth
154
+ - first
155
+ - five
156
+ - followed
157
+ - following
158
+ - follows
159
+ - for
160
+ - former
161
+ - formerly
162
+ - forth
163
+ - four
164
+ - from
165
+ - further
166
+ - furthermore
167
+ - g
168
+ - get
169
+ - gets
170
+ - getting
171
+ - given
172
+ - gives
173
+ - go
174
+ - goes
175
+ - going
176
+ - gone
177
+ - got
178
+ - gotten
179
+ - greetings
180
+ - h
181
+ - had
182
+ - hadn't
183
+ - happens
184
+ - hardly
185
+ - has
186
+ - hasn't
187
+ - have
188
+ - haven't
189
+ - having
190
+ - he
191
+ - he's
192
+ - hello
193
+ - help
194
+ - hence
195
+ - her
196
+ - here
197
+ - here's
198
+ - hereafter
199
+ - hereby
200
+ - herein
201
+ - hereupon
202
+ - hers
203
+ - herself
204
+ - hi
205
+ - him
206
+ - himself
207
+ - his
208
+ - hither
209
+ - hopefully
210
+ - how
211
+ - howbeit
212
+ - however
213
+ - i
214
+ - i'd
215
+ - i'll
216
+ - i'm
217
+ - i've
218
+ - ie
219
+ - if
220
+ - ignored
221
+ - immediate
222
+ - in
223
+ - inasmuch
224
+ - inc
225
+ - indeed
226
+ - indicate
227
+ - indicated
228
+ - indicates
229
+ - inner
230
+ - insofar
231
+ - instead
232
+ - into
233
+ - inward
234
+ - is
235
+ - isn't
236
+ - it
237
+ - it'd
238
+ - it'll
239
+ - it's
240
+ - its
241
+ - itself
242
+ - j
243
+ - just
244
+ - k
245
+ - keep
246
+ - keeps
247
+ - kept
248
+ - know
249
+ - knows
250
+ - known
251
+ - l
252
+ - last
253
+ - lately
254
+ - later
255
+ - latter
256
+ - latterly
257
+ - least
258
+ - less
259
+ - lest
260
+ - let
261
+ - let's
262
+ - like
263
+ - liked
264
+ - likely
265
+ - little
266
+ - look
267
+ - looking
268
+ - looks
269
+ - ltd
270
+ - m
271
+ - mainly
272
+ - many
273
+ - may
274
+ - maybe
275
+ - me
276
+ - mean
277
+ - meanwhile
278
+ - merely
279
+ - might
280
+ - more
281
+ - moreover
282
+ - most
283
+ - mostly
284
+ - much
285
+ - must
286
+ - my
287
+ - myself
288
+ - n
289
+ - name
290
+ - namely
291
+ - nd
292
+ - near
293
+ - nearly
294
+ - necessary
295
+ - need
296
+ - needs
297
+ - neither
298
+ - never
299
+ - nevertheless
300
+ - new
301
+ - next
302
+ - nine
303
+ - "no"
304
+ - nobody
305
+ - non
306
+ - none
307
+ - noone
308
+ - nor
309
+ - normally
310
+ - not
311
+ - nothing
312
+ - novel
313
+ - now
314
+ - nowhere
315
+ - o
316
+ - obviously
317
+ - of
318
+ - "off"
319
+ - often
320
+ - oh
321
+ - ok
322
+ - okay
323
+ - old
324
+ - "on"
325
+ - once
326
+ - one
327
+ - ones
328
+ - only
329
+ - onto
330
+ - or
331
+ - other
332
+ - others
333
+ - otherwise
334
+ - ought
335
+ - our
336
+ - ours
337
+ - ourselves
338
+ - out
339
+ - outside
340
+ - over
341
+ - overall
342
+ - own
343
+ - p
344
+ - particular
345
+ - particularly
346
+ - per
347
+ - perhaps
348
+ - placed
349
+ - please
350
+ - plus
351
+ - possible
352
+ - presumably
353
+ - probably
354
+ - provides
355
+ - q
356
+ - que
357
+ - quite
358
+ - qv
359
+ - r
360
+ - rather
361
+ - rd
362
+ - re
363
+ - really
364
+ - reasonably
365
+ - regarding
366
+ - regardless
367
+ - regards
368
+ - relatively
369
+ - respectively
370
+ - right
371
+ - s
372
+ - said
373
+ - same
374
+ - saw
375
+ - say
376
+ - saying
377
+ - says
378
+ - second
379
+ - secondly
380
+ - see
381
+ - seeing
382
+ - seem
383
+ - seemed
384
+ - seeming
385
+ - seems
386
+ - seen
387
+ - self
388
+ - selves
389
+ - sensible
390
+ - sent
391
+ - serious
392
+ - seriously
393
+ - seven
394
+ - several
395
+ - shall
396
+ - she
397
+ - should
398
+ - shouldn't
399
+ - since
400
+ - six
401
+ - so
402
+ - some
403
+ - somebody
404
+ - somehow
405
+ - someone
406
+ - something
407
+ - sometime
408
+ - sometimes
409
+ - somewhat
410
+ - somewhere
411
+ - soon
412
+ - sorry
413
+ - specified
414
+ - specify
415
+ - specifying
416
+ - still
417
+ - sub
418
+ - such
419
+ - sup
420
+ - sure
421
+ - t
422
+ - t's
423
+ - take
424
+ - taken
425
+ - tell
426
+ - tends
427
+ - th
428
+ - than
429
+ - thank
430
+ - thanks
431
+ - thanx
432
+ - that
433
+ - that's
434
+ - thats
435
+ - the
436
+ - their
437
+ - theirs
438
+ - them
439
+ - themselves
440
+ - then
441
+ - thence
442
+ - there
443
+ - there's
444
+ - thereafter
445
+ - thereby
446
+ - therefore
447
+ - therein
448
+ - theres
449
+ - thereupon
450
+ - these
451
+ - they
452
+ - they'd
453
+ - they'll
454
+ - they're
455
+ - they've
456
+ - think
457
+ - third
458
+ - this
459
+ - thorough
460
+ - thoroughly
461
+ - those
462
+ - though
463
+ - three
464
+ - through
465
+ - throughout
466
+ - thru
467
+ - thus
468
+ - to
469
+ - together
470
+ - too
471
+ - took
472
+ - toward
473
+ - towards
474
+ - tried
475
+ - tries
476
+ - truly
477
+ - try
478
+ - trying
479
+ - twice
480
+ - two
481
+ - u
482
+ - un
483
+ - under
484
+ - unfortunately
485
+ - unless
486
+ - unlikely
487
+ - until
488
+ - unto
489
+ - up
490
+ - upon
491
+ - us
492
+ - use
493
+ - used
494
+ - useful
495
+ - uses
496
+ - using
497
+ - usually
498
+ - v
499
+ - value
500
+ - various
501
+ - very
502
+ - via
503
+ - viz
504
+ - vs
505
+ - w
506
+ - want
507
+ - wants
508
+ - was
509
+ - wasn't
510
+ - way
511
+ - we
512
+ - we'd
513
+ - we'll
514
+ - we're
515
+ - we've
516
+ - welcome
517
+ - well
518
+ - went
519
+ - were
520
+ - weren't
521
+ - what
522
+ - what's
523
+ - whatever
524
+ - when
525
+ - whence
526
+ - whenever
527
+ - where
528
+ - where's
529
+ - whereafter
530
+ - whereas
531
+ - whereby
532
+ - wherein
533
+ - whereupon
534
+ - wherever
535
+ - whether
536
+ - which
537
+ - while
538
+ - whither
539
+ - who
540
+ - who's
541
+ - whoever
542
+ - whole
543
+ - whom
544
+ - whose
545
+ - why
546
+ - will
547
+ - willing
548
+ - wish
549
+ - with
550
+ - within
551
+ - without
552
+ - won't
553
+ - wonder
554
+ - would
555
+ - would
556
+ - wouldn't
557
+ - x
558
+ - y
559
+ - "yes"
560
+ - yet
561
+ - you
562
+ - you'd
563
+ - you'll
564
+ - you're
565
+ - you've
566
+ - your
567
+ - yours
568
+ - yourself
569
+ - yourselves
570
+ - z
571
+ - zero
@@ -2,13 +2,15 @@ require 'set'
2
2
 
3
3
  module Lda
4
4
  class Corpus
5
- attr_reader :documents, :num_docs, :num_terms, :vocabulary
5
+ attr_reader :documents, :num_docs, :num_terms, :vocabulary, :stopwords
6
6
 
7
7
  def initialize
8
8
  @documents = Array.new
9
9
  @all_terms = Set.new
10
10
  @num_terms = @num_docs = 0
11
11
  @vocabulary = Vocabulary.new
12
+ @stopwords = YAML.load_file(File.join(File.dirname(__FILE__), '..', 'config', 'stopwords.yml'))
13
+ @stopwords.map! { |w| w.strip }
12
14
  end
13
15
 
14
16
  def add_document(doc)
@@ -21,10 +23,13 @@ module Lda
21
23
  @num_terms = @all_terms.size
22
24
 
23
25
  update_vocabulary(doc)
24
-
25
26
  nil
26
27
  end
27
-
28
+
29
+ def remove_word(word)
30
+ @vocabulary.words.delete word
31
+ end
32
+
28
33
  protected
29
34
 
30
35
  def update_vocabulary(doc)
@@ -19,4 +19,4 @@ module Lda
19
19
  end
20
20
  end
21
21
  end
22
- end
22
+ end
@@ -1,3 +1,5 @@
1
+ require 'yaml'
2
+
1
3
  module Lda
2
4
  class Document
3
5
  attr_reader :corpus, :words, :counts, :length, :total, :tokens
@@ -29,8 +31,9 @@ module Lda
29
31
  end
30
32
 
31
33
  def tokenize(text)
32
- clean_text = text.gsub(/[^A-Za-z'\s]+/, ' ').gsub(/\s+/, ' ') # remove everything but letters and ' and leave only single spaces
34
+ clean_text = text.gsub(/[^A-Za-z'\s]+/, ' ').gsub(/\s+/, ' ').downcase # remove everything but letters and ' and leave only single spaces
33
35
  @tokens = handle(clean_text.split(' '))
36
+ nil
34
37
  end
35
38
  end
36
- end
39
+ end
@@ -7,6 +7,7 @@ module Lda
7
7
  @filename = nil
8
8
 
9
9
  tokenize(text)
10
+ @tokens.reject! { |w| @corpus.stopwords.include?(w) }
10
11
  build_from_tokens
11
12
  end
12
13
 
@@ -1,4 +1,11 @@
1
- require 'test_helper'
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'yaml'
5
+
6
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
7
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
8
+ require 'lda-ruby'
2
9
 
3
10
  class LdaRubyTest < Test::Unit::TestCase
4
11
  context "A Document instance" do
@@ -66,7 +73,7 @@ class LdaRubyTest < Test::Unit::TestCase
66
73
 
67
74
  context "A typical TextDocument" do
68
75
  setup do
69
- @text = 'what is that which is what is else what is'
76
+ @text = 'stop words stop stop masterful stoppage buffalo buffalo buffalo'
70
77
  @document = Lda::TextDocument.new(@corpus, @text)
71
78
  end
72
79
 
@@ -104,7 +111,7 @@ class LdaRubyTest < Test::Unit::TestCase
104
111
 
105
112
  should "update vocabulary with words in the document" do
106
113
  @corpus.add_document(@document2)
107
- assert_equal @corpus.vocabulary.words.member?('second'), true
114
+ assert_equal @corpus.vocabulary.words.member?('lame'), true
108
115
  end
109
116
  end
110
117
 
@@ -0,0 +1,26 @@
1
+ require 'rubygems'
2
+ require 'shoulda'
3
+ require 'yaml'
4
+ require 'lda-ruby'
5
+
6
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
7
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
8
+
9
+ class Test::Unit::TestCase
10
+
11
+ @corpus = Lda::Corpus.new
12
+ @document1 = Lda::TextDocument.new(@corpus, 'Dom Cobb is a skilled thief, the absolute best in the dangerous art of extraction, stealing valuable secrets from deep within the subconscious during the dream state, when the mind is at its most vulnerable. Cobb\'s rare ability has made him a coveted player in this treacherous new world of corporate espionage, but it has also made him an international fugitive and cost him everything he has ever loved. Now Cobb is being offered a chance at redemption. One last job could give him his life back but only if he can accomplish the impossible-inception. Instead of the perfect heist, Cobb and his team of specialists have to pull off the reverse: their task is not to steal an idea but to plant one. If they succeed, it could be the perfect crime. But no amount of careful planning or expertise can prepare the team for the dangerous enemy that seems to predict their every move. An enemy that only Cobb could have seen coming.')
13
+ @document2 = Lda::TextDocument.new(@corpus, 'When his brother is killed in a robbery, paraplegic Marine Jake Sully decides to take his place in a mission on the distant world of Pandora. There he learns of greedy corporate figurehead Parker Selfridge\'s intentions of driving off the native humanoid \"Na\'vi\" in order to mine for the precious material scattered throughout their rich woodland. In exchange for the spinal surgery that will fix his legs, Jake gathers intel for the cooperating military unit spearheaded by gung-ho Colonel Quaritch, while simultaneously attempting to infiltrate the Na\'vi people with the use of an \"avatar\" identity. While Jake begins to bond with the native tribe and quickly falls in love with the beautiful alien Neytiri, the restless Colonel moves forward with his ruthless extermination tactics, forcing the soldier to take a stand - and fight back in an epic battle for the fate of Pandora.')
14
+
15
+ @corpus.add_document(@document1)
16
+ @corpus.add_document(@document2)
17
+ @corpus.remove_word("cobb")
18
+ @lda = Lda::Lda.new(@corpus)
19
+
20
+ @lda.verbose = false
21
+ @lda.num_topics = 2
22
+ @lda.em('random')
23
+ topics = @lda.top_words(5)
24
+ puts topics
25
+
26
+ end
@@ -0,0 +1,23 @@
1
+ require 'rubygems'
2
+ require 'shoulda'
3
+ require 'yaml'
4
+ require 'lda-ruby'
5
+
6
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
7
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
8
+
9
+ class Test::Unit::TestCase
10
+
11
+ @filename = File.join(File.dirname(__FILE__), 'data', 'wiki-test-docs.yml')
12
+ @filedocs = YAML::load_file(@filename)
13
+ @corpus = Lda::TextCorpus.new(@filename)
14
+
15
+ @lda = Lda::Lda.new(@corpus)
16
+
17
+ @lda.verbose = false
18
+ @lda.num_topics = 20
19
+ @lda.em('random')
20
+ @lda.print_topics(20)
21
+
22
+
23
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lda-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.5
4
+ version: 0.3.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,12 +11,12 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2011-08-03 00:00:00.000000000 -04:00
14
+ date: 2011-08-05 00:00:00.000000000 -04:00
15
15
  default_executable:
16
16
  dependencies:
17
17
  - !ruby/object:Gem::Dependency
18
18
  name: shoulda
19
- requirement: &2153196880 !ruby/object:Gem::Requirement
19
+ requirement: &2153224820 !ruby/object:Gem::Requirement
20
20
  none: false
21
21
  requirements:
22
22
  - - ! '>='
@@ -24,7 +24,7 @@ dependencies:
24
24
  version: '0'
25
25
  type: :runtime
26
26
  prerelease: false
27
- version_requirements: *2153196880
27
+ version_requirements: *2153224820
28
28
  description: Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.
29
29
  email: jasonmadams@gmail.com
30
30
  executables: []
@@ -56,6 +56,7 @@ files:
56
56
  - ext/lda-ruby/utils.h
57
57
  - lda-ruby.gemspec
58
58
  - lib/lda-ruby.rb
59
+ - lib/lda-ruby/config/stopwords.yml
59
60
  - lib/lda-ruby/corpus/corpus.rb
60
61
  - lib/lda-ruby/corpus/data_corpus.rb
61
62
  - lib/lda-ruby/corpus/directory_corpus.rb
@@ -70,6 +71,8 @@ files:
70
71
  - test/data/sample.rb
71
72
  - test/data/wiki-test-docs.yml
72
73
  - test/lda_ruby_test.rb
74
+ - test/simple_test.rb
75
+ - test/simple_yaml.rb
73
76
  - test/test_helper.rb
74
77
  has_rdoc: true
75
78
  homepage: http://github.com/ealdent/lda-ruby