lda-ruby 0.4.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +61 -0
  3. data/Gemfile +9 -0
  4. data/README.md +157 -0
  5. data/VERSION.yml +5 -0
  6. data/docs/modernization-handoff.md +190 -0
  7. data/docs/porting-strategy.md +127 -0
  8. data/docs/precompiled-platform-policy.md +68 -0
  9. data/docs/release-runbook.md +157 -0
  10. data/ext/lda-ruby/cokus.c +145 -0
  11. data/ext/lda-ruby/cokus.h +27 -0
  12. data/ext/lda-ruby/extconf.rb +13 -0
  13. data/ext/lda-ruby/lda-alpha.c +96 -0
  14. data/ext/lda-ruby/lda-alpha.h +21 -0
  15. data/ext/lda-ruby/lda-data.c +67 -0
  16. data/ext/lda-ruby/lda-data.h +14 -0
  17. data/ext/lda-ruby/lda-inference.c +1023 -0
  18. data/ext/lda-ruby/lda-inference.h +63 -0
  19. data/ext/lda-ruby/lda-model.c +345 -0
  20. data/ext/lda-ruby/lda-model.h +31 -0
  21. data/ext/lda-ruby/lda.h +54 -0
  22. data/ext/lda-ruby/utils.c +111 -0
  23. data/ext/lda-ruby/utils.h +18 -0
  24. data/ext/lda-ruby-rust/Cargo.toml +12 -0
  25. data/ext/lda-ruby-rust/README.md +48 -0
  26. data/ext/lda-ruby-rust/extconf.rb +123 -0
  27. data/ext/lda-ruby-rust/src/lib.rs +456 -0
  28. data/lda-ruby.gemspec +78 -0
  29. data/lib/lda-ruby/backends/base.rb +129 -0
  30. data/lib/lda-ruby/backends/native.rb +158 -0
  31. data/lib/lda-ruby/backends/pure_ruby.rb +613 -0
  32. data/lib/lda-ruby/backends/rust.rb +226 -0
  33. data/lib/lda-ruby/backends.rb +58 -0
  34. data/lib/lda-ruby/config/stopwords.yml +571 -0
  35. data/lib/lda-ruby/corpus/corpus.rb +45 -0
  36. data/lib/lda-ruby/corpus/data_corpus.rb +22 -0
  37. data/lib/lda-ruby/corpus/directory_corpus.rb +25 -0
  38. data/lib/lda-ruby/corpus/text_corpus.rb +27 -0
  39. data/lib/lda-ruby/document/data_document.rb +30 -0
  40. data/lib/lda-ruby/document/document.rb +40 -0
  41. data/lib/lda-ruby/document/text_document.rb +39 -0
  42. data/lib/lda-ruby/lda.so +0 -0
  43. data/lib/lda-ruby/rust_build_policy.rb +21 -0
  44. data/lib/lda-ruby/version.rb +5 -0
  45. data/lib/lda-ruby/vocabulary.rb +46 -0
  46. data/lib/lda-ruby.rb +413 -0
  47. data/lib/lda_ruby_rust.so +0 -0
  48. data/license.txt +504 -0
  49. data/test/backend_compatibility_test.rb +146 -0
  50. data/test/backends_selection_test.rb +100 -0
  51. data/test/data/docs.dat +46 -0
  52. data/test/data/sample.rb +20 -0
  53. data/test/data/wiki-test-docs.yml +123 -0
  54. data/test/gemspec_test.rb +27 -0
  55. data/test/lda_ruby_test.rb +319 -0
  56. data/test/packaged_gem_smoke_test.rb +33 -0
  57. data/test/release_scripts_test.rb +54 -0
  58. data/test/rust_build_policy_test.rb +23 -0
  59. data/test/simple_pipeline_test.rb +22 -0
  60. data/test/simple_yaml.rb +17 -0
  61. data/test/test_helper.rb +10 -0
  62. metadata +111 -0
@@ -0,0 +1,571 @@
1
+ ---
2
+ - a
3
+ - a's
4
+ - able
5
+ - about
6
+ - above
7
+ - according
8
+ - accordingly
9
+ - across
10
+ - actually
11
+ - after
12
+ - afterwards
13
+ - again
14
+ - against
15
+ - ain't
16
+ - all
17
+ - allow
18
+ - allows
19
+ - almost
20
+ - alone
21
+ - along
22
+ - already
23
+ - also
24
+ - although
25
+ - always
26
+ - am
27
+ - among
28
+ - amongst
29
+ - an
30
+ - and
31
+ - another
32
+ - any
33
+ - anybody
34
+ - anyhow
35
+ - anyone
36
+ - anything
37
+ - anyway
38
+ - anyways
39
+ - anywhere
40
+ - apart
41
+ - appear
42
+ - appreciate
43
+ - appropriate
44
+ - are
45
+ - aren't
46
+ - around
47
+ - as
48
+ - aside
49
+ - ask
50
+ - asking
51
+ - associated
52
+ - at
53
+ - available
54
+ - away
55
+ - awfully
56
+ - b
57
+ - be
58
+ - became
59
+ - because
60
+ - become
61
+ - becomes
62
+ - becoming
63
+ - been
64
+ - before
65
+ - beforehand
66
+ - behind
67
+ - being
68
+ - believe
69
+ - below
70
+ - beside
71
+ - besides
72
+ - best
73
+ - better
74
+ - between
75
+ - beyond
76
+ - both
77
+ - brief
78
+ - but
79
+ - by
80
+ - c
81
+ - c'mon
82
+ - c's
83
+ - came
84
+ - can
85
+ - can't
86
+ - cannot
87
+ - cant
88
+ - cause
89
+ - causes
90
+ - certain
91
+ - certainly
92
+ - changes
93
+ - clearly
94
+ - co
95
+ - com
96
+ - come
97
+ - comes
98
+ - concerning
99
+ - consequently
100
+ - consider
101
+ - considering
102
+ - contain
103
+ - containing
104
+ - contains
105
+ - corresponding
106
+ - could
107
+ - couldn't
108
+ - course
109
+ - currently
110
+ - d
111
+ - definitely
112
+ - described
113
+ - despite
114
+ - did
115
+ - didn't
116
+ - different
117
+ - do
118
+ - does
119
+ - doesn't
120
+ - doing
121
+ - don't
122
+ - done
123
+ - down
124
+ - downwards
125
+ - during
126
+ - e
127
+ - each
128
+ - edu
129
+ - eg
130
+ - eight
131
+ - either
132
+ - else
133
+ - elsewhere
134
+ - enough
135
+ - entirely
136
+ - especially
137
+ - et
138
+ - etc
139
+ - even
140
+ - ever
141
+ - every
142
+ - everybody
143
+ - everyone
144
+ - everything
145
+ - everywhere
146
+ - ex
147
+ - exactly
148
+ - example
149
+ - except
150
+ - f
151
+ - far
152
+ - few
153
+ - fifth
154
+ - first
155
+ - five
156
+ - followed
157
+ - following
158
+ - follows
159
+ - for
160
+ - former
161
+ - formerly
162
+ - forth
163
+ - four
164
+ - from
165
+ - further
166
+ - furthermore
167
+ - g
168
+ - get
169
+ - gets
170
+ - getting
171
+ - given
172
+ - gives
173
+ - go
174
+ - goes
175
+ - going
176
+ - gone
177
+ - got
178
+ - gotten
179
+ - greetings
180
+ - h
181
+ - had
182
+ - hadn't
183
+ - happens
184
+ - hardly
185
+ - has
186
+ - hasn't
187
+ - have
188
+ - haven't
189
+ - having
190
+ - he
191
+ - he's
192
+ - hello
193
+ - help
194
+ - hence
195
+ - her
196
+ - here
197
+ - here's
198
+ - hereafter
199
+ - hereby
200
+ - herein
201
+ - hereupon
202
+ - hers
203
+ - herself
204
+ - hi
205
+ - him
206
+ - himself
207
+ - his
208
+ - hither
209
+ - hopefully
210
+ - how
211
+ - howbeit
212
+ - however
213
+ - i
214
+ - i'd
215
+ - i'll
216
+ - i'm
217
+ - i've
218
+ - ie
219
+ - if
220
+ - ignored
221
+ - immediate
222
+ - in
223
+ - inasmuch
224
+ - inc
225
+ - indeed
226
+ - indicate
227
+ - indicated
228
+ - indicates
229
+ - inner
230
+ - insofar
231
+ - instead
232
+ - into
233
+ - inward
234
+ - is
235
+ - isn't
236
+ - it
237
+ - it'd
238
+ - it'll
239
+ - it's
240
+ - its
241
+ - itself
242
+ - j
243
+ - just
244
+ - k
245
+ - keep
246
+ - keeps
247
+ - kept
248
+ - know
249
+ - knows
250
+ - known
251
+ - l
252
+ - last
253
+ - lately
254
+ - later
255
+ - latter
256
+ - latterly
257
+ - least
258
+ - less
259
+ - lest
260
+ - let
261
+ - let's
262
+ - like
263
+ - liked
264
+ - likely
265
+ - little
266
+ - look
267
+ - looking
268
+ - looks
269
+ - ltd
270
+ - m
271
+ - mainly
272
+ - many
273
+ - may
274
+ - maybe
275
+ - me
276
+ - mean
277
+ - meanwhile
278
+ - merely
279
+ - might
280
+ - more
281
+ - moreover
282
+ - most
283
+ - mostly
284
+ - much
285
+ - must
286
+ - my
287
+ - myself
288
+ - n
289
+ - name
290
+ - namely
291
+ - nd
292
+ - near
293
+ - nearly
294
+ - necessary
295
+ - need
296
+ - needs
297
+ - neither
298
+ - never
299
+ - nevertheless
300
+ - new
301
+ - next
302
+ - nine
303
+ - "no"
304
+ - nobody
305
+ - non
306
+ - none
307
+ - noone
308
+ - nor
309
+ - normally
310
+ - not
311
+ - nothing
312
+ - novel
313
+ - now
314
+ - nowhere
315
+ - o
316
+ - obviously
317
+ - of
318
+ - "off"
319
+ - often
320
+ - oh
321
+ - ok
322
+ - okay
323
+ - old
324
+ - "on"
325
+ - once
326
+ - one
327
+ - ones
328
+ - only
329
+ - onto
330
+ - or
331
+ - other
332
+ - others
333
+ - otherwise
334
+ - ought
335
+ - our
336
+ - ours
337
+ - ourselves
338
+ - out
339
+ - outside
340
+ - over
341
+ - overall
342
+ - own
343
+ - p
344
+ - particular
345
+ - particularly
346
+ - per
347
+ - perhaps
348
+ - placed
349
+ - please
350
+ - plus
351
+ - possible
352
+ - presumably
353
+ - probably
354
+ - provides
355
+ - q
356
+ - que
357
+ - quite
358
+ - qv
359
+ - r
360
+ - rather
361
+ - rd
362
+ - re
363
+ - really
364
+ - reasonably
365
+ - regarding
366
+ - regardless
367
+ - regards
368
+ - relatively
369
+ - respectively
370
+ - right
371
+ - s
372
+ - said
373
+ - same
374
+ - saw
375
+ - say
376
+ - saying
377
+ - says
378
+ - second
379
+ - secondly
380
+ - see
381
+ - seeing
382
+ - seem
383
+ - seemed
384
+ - seeming
385
+ - seems
386
+ - seen
387
+ - self
388
+ - selves
389
+ - sensible
390
+ - sent
391
+ - serious
392
+ - seriously
393
+ - seven
394
+ - several
395
+ - shall
396
+ - she
397
+ - should
398
+ - shouldn't
399
+ - since
400
+ - six
401
+ - so
402
+ - some
403
+ - somebody
404
+ - somehow
405
+ - someone
406
+ - something
407
+ - sometime
408
+ - sometimes
409
+ - somewhat
410
+ - somewhere
411
+ - soon
412
+ - sorry
413
+ - specified
414
+ - specify
415
+ - specifying
416
+ - still
417
+ - sub
418
+ - such
419
+ - sup
420
+ - sure
421
+ - t
422
+ - t's
423
+ - take
424
+ - taken
425
+ - tell
426
+ - tends
427
+ - th
428
+ - than
429
+ - thank
430
+ - thanks
431
+ - thanx
432
+ - that
433
+ - that's
434
+ - thats
435
+ - the
436
+ - their
437
+ - theirs
438
+ - them
439
+ - themselves
440
+ - then
441
+ - thence
442
+ - there
443
+ - there's
444
+ - thereafter
445
+ - thereby
446
+ - therefore
447
+ - therein
448
+ - theres
449
+ - thereupon
450
+ - these
451
+ - they
452
+ - they'd
453
+ - they'll
454
+ - they're
455
+ - they've
456
+ - think
457
+ - third
458
+ - this
459
+ - thorough
460
+ - thoroughly
461
+ - those
462
+ - though
463
+ - three
464
+ - through
465
+ - throughout
466
+ - thru
467
+ - thus
468
+ - to
469
+ - together
470
+ - too
471
+ - took
472
+ - toward
473
+ - towards
474
+ - tried
475
+ - tries
476
+ - truly
477
+ - try
478
+ - trying
479
+ - twice
480
+ - two
481
+ - u
482
+ - un
483
+ - under
484
+ - unfortunately
485
+ - unless
486
+ - unlikely
487
+ - until
488
+ - unto
489
+ - up
490
+ - upon
491
+ - us
492
+ - use
493
+ - used
494
+ - useful
495
+ - uses
496
+ - using
497
+ - usually
498
+ - v
499
+ - value
500
+ - various
501
+ - very
502
+ - via
503
+ - viz
504
+ - vs
505
+ - w
506
+ - want
507
+ - wants
508
+ - was
509
+ - wasn't
510
+ - way
511
+ - we
512
+ - we'd
513
+ - we'll
514
+ - we're
515
+ - we've
516
+ - welcome
517
+ - well
518
+ - went
519
+ - were
520
+ - weren't
521
+ - what
522
+ - what's
523
+ - whatever
524
+ - when
525
+ - whence
526
+ - whenever
527
+ - where
528
+ - where's
529
+ - whereafter
530
+ - whereas
531
+ - whereby
532
+ - wherein
533
+ - whereupon
534
+ - wherever
535
+ - whether
536
+ - which
537
+ - while
538
+ - whither
539
+ - who
540
+ - who's
541
+ - whoever
542
+ - whole
543
+ - whom
544
+ - whose
545
+ - why
546
+ - will
547
+ - willing
548
+ - wish
549
+ - with
550
+ - within
551
+ - without
552
+ - won't
553
+ - wonder
554
+ - would
555
+ - would
556
+ - wouldn't
557
+ - x
558
+ - y
559
+ - "yes"
560
+ - yet
561
+ - you
562
+ - you'd
563
+ - you'll
564
+ - you're
565
+ - you've
566
+ - your
567
+ - yours
568
+ - yourself
569
+ - yourselves
570
+ - z
571
+ - zero
@@ -0,0 +1,45 @@
1
+ require "set"
2
+ require "yaml"
3
+
4
module Lda
  # Base corpus: holds the documents added to it, the set of distinct terms
  # seen across those documents, a Vocabulary, and a stop word list.
  class Corpus
    attr_reader :documents, :num_docs, :num_terms, :vocabulary, :stopwords

    # stop_word_list - one of:
    #   nil    -> load the stopwords.yml shipped under ../config
    #   Array  -> use the given words directly (the array is not modified)
    #   other  -> treated as the path of a YAML file containing the words
    #
    # Every stop word is stripped of surrounding whitespace.
    def initialize(stop_word_list = nil)
      @documents = []
      @all_terms = Set.new
      @num_terms = @num_docs = 0
      @vocabulary = Vocabulary.new
      @stopwords = load_stopwords(stop_word_list)
    end

    # Appends +doc+ to the corpus and refreshes the document/term counters
    # and the vocabulary.
    #
    # Raises RuntimeError when +doc+ is not a Document.
    # Always returns nil.
    def add_document(doc)
      raise 'Parameter +doc+ must be of type Document' unless doc.is_a?(Document)

      @documents << doc

      # Merge in place: `@all_terms += doc.words` would copy the whole set
      # again for every document that is added.
      @all_terms.merge(doc.words)
      @num_docs += 1
      @num_terms = @all_terms.size

      update_vocabulary(doc)
      nil
    end

    # Deletes +word+ from the vocabulary's word collection and returns
    # whatever that collection's #delete returns.
    def remove_word(word)
      @vocabulary.words.delete word
    end

    protected

    # Passes every token of +doc+ to the vocabulary's #check_word.
    def update_vocabulary(doc)
      doc.tokens.each { |w| @vocabulary.check_word(w) }
    end

    # Resolves +stop_word_list+ (see #initialize) into an array of stripped
    # stop words.
    def load_stopwords(stop_word_list)
      words = if stop_word_list.is_a?(Array)
                stop_word_list
              else
                path = if stop_word_list.nil?
                         File.join(File.dirname(__FILE__), '..', 'config', 'stopwords.yml')
                       else
                         stop_word_list
                       end
                # An empty YAML file loads as false/nil; treat it as "no stop words".
                YAML.load_file(path) || []
              end

      words.map(&:strip)
    end
  end
end
@@ -0,0 +1,22 @@
1
module Lda
  # Corpus populated from a data file: every non-blank line of the file is
  # turned into one DataDocument.
  class DataCorpus < Corpus
    attr_reader :filename

    # filename - path of the data file to read. The file is loaded
    #            immediately.
    def initialize(filename)
      super()

      @filename = filename
      load_from_file
    end

    protected

    # Reads the whole file, splits it on runs of CR/LF and adds one
    # DataDocument per line.
    def load_from_file
      File.read(@filename).split(/[\r\n]+/).each do |line|
        # A file that starts with a newline yields a leading "" from split,
        # and whitespace-only lines carry no data; neither is a document.
        next if line.strip.empty?

        add_document(DataDocument.new(self, line))
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
module Lda
  # Corpus populated from the files found directly inside a directory.
  class DirectoryCorpus < Corpus
    attr_reader :path, :extension

    # path      - directory to scan; stored as a frozen copy.
    # extension - optional file extension (without the dot). When given only
    #             "*.<extension>" entries are loaded, otherwise every entry
    #             matching "*" is considered.
    def initialize(path, extension = nil)
      super()

      @path = path.dup.freeze
      @extension = extension ? extension.dup.freeze : nil

      load_from_directory
    end

    protected

    # Adds one TextDocument (via TextDocument.build_from_file) per regular
    # file matching the glob built from @path and @extension.
    def load_from_directory
      dir_glob = File.join(@path, (@extension ? "*.#{@extension}" : '*'))

      # Sorted so the document order does not depend on the file system or on
      # the Ruby version's Dir.glob ordering.
      Dir.glob(dir_glob).sort.each do |filename|
        # "*" also matches sub-directories; only regular files are loaded.
        # NOTE(review): assumes build_from_file expects a readable file - confirm.
        next unless File.file?(filename)

        add_document(TextDocument.build_from_file(self, filename))
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module Lda
  # Corpus built from raw text: one TextDocument is created per input string.
  class TextCorpus < Corpus
    # Path of the YAML file the documents were loaded from, or nil when the
    # corpus was built from in-memory strings.
    attr_reader :filename

    # input_data - one of:
    #   String naming an existing file -> loaded with YAML.load_file; the
    #                                     result is iterated as the documents
    #   Array                          -> each element is one document
    #   any other String               -> a single document
    #
    # Raises RuntimeError for any other kind of input.
    def initialize(input_data)
      super()

      docs = if input_data.is_a?(String) && File.exist?(input_data)
               # yaml file containing an array of strings representing each document
               @filename = input_data
               YAML.load_file(input_data)
             elsif input_data.is_a?(Array)
               # an array of strings representing each document
               input_data.dup
             elsif input_data.is_a?(String)
               # a single string representing one document
               [input_data]
             else
               raise 'Unknown input type: please pass in a valid filename or an array of strings.'
             end

      docs.each do |doc|
        add_document(TextDocument.new(self, doc))
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
+ #
2
+ # Create the Document using the svmlight-style text line:
3
+ #
4
+ # num_words w1:freq1 w2:freq2 ... w_n:freq_n
5
+ #
6
+ # Ex.
7
+ # 5 1:2 3:1 4:2 7:3 12:1
8
+ #
9
+ # The value for the number of words should equal the number of pairs
10
+ # following it, though this isn't at all enforced. Order of word-pair
11
+ # indices is not important.
12
+ #
13
+
14
module Lda
  # Document parsed from one whitespace-separated line of the form
  #
  #   num_words w1:freq1 w2:freq2 ... w_n:freq_n
  #
  # The first field (num_words) is skipped; it is not validated against the
  # number of pairs that follow.
  class DataDocument < Document
    # corpus - forwarded unchanged to Document#initialize.
    # data   - the text line to parse. Leading/trailing whitespace is
    #          ignored, and an empty line produces a document with no words.
    def initialize(corpus, data)
      super(corpus)

      # String#split without a pattern ignores leading whitespace, so the
      # first field is always num_words (split(/\s+/) would yield a leading
      # "" and shift every field by one). drop(1) is safe on an empty array,
      # unlike items[1..items.size], which returns nil there.
      pairs = data.split.drop(1).map { |item| item.split(':') }

      # NOTE(review): @words and @counts are presumably arrays set up by
      # Document#initialize - confirm against the superclass.
      pairs.each do |feature_identifier, feature_weight|
        @words << feature_identifier.to_i
        @counts << feature_weight.to_i
      end

      recompute
    end
  end
end