lda-ruby 0.3.1 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.markdown +1 -2
- data/Rakefile +3 -1
- data/VERSION.yml +2 -1
- data/ext/lda-ruby/lda-inference.c +3 -3
- data/ext/lda-ruby/lda-model.h +2 -0
- data/lda-ruby.gemspec +49 -51
- data/test/data/sample.rb +20 -0
- data/test/lda_ruby_test.rb +8 -8
- data/test/test_helper.rb +1 -0
- metadata +39 -34
- data/.gitignore +0 -5
data/README.markdown
CHANGED
@@ -15,7 +15,7 @@ The original C code relied on files for the input and output. We felt it was nec
|
|
15
15
|
lda.load_vocabulary("data/vocab.txt")
|
16
16
|
lda.print_topics(20) # print the topic 20 words per topic
|
17
17
|
|
18
|
-
|
18
|
+
If you have general questions about Latent Dirichlet Allocation, I urge you to use the [topic models mailing list][topic-models], since the people who monitor that are very knowledgeable. If you encounter bugs specific to lda-ruby, please post an issue on the Github project.
|
19
19
|
|
20
20
|
## Resources
|
21
21
|
|
@@ -24,7 +24,6 @@ You can check out the mailing list for this project if you have any questions or
|
|
24
24
|
+ [Wikipedia article on LDA][wikipedia]
|
25
25
|
+ [Sample AP data][ap-data]
|
26
26
|
|
27
|
-
|
28
27
|
## References
|
29
28
|
|
30
29
|
Blei, David M., Ng, Andrew Y., and Jordan, Michael I. 2003. Latent dirichlet allocation. Journal of Machine Learning Research. 3 (Mar. 2003), 993-1022 [[pdf][pdf]].
|
data/Rakefile
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'rake'
|
3
|
+
require 'yaml'
|
3
4
|
|
4
5
|
begin
|
5
6
|
require 'jeweler'
|
@@ -9,9 +10,10 @@ begin
|
|
9
10
|
gem.description = %Q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
|
10
11
|
gem.email = "jasonmadams@gmail.com"
|
11
12
|
gem.homepage = "http://github.com/ealdent/lda-ruby"
|
12
|
-
gem.authors = ['David Blei', 'Jason Adams']
|
13
|
+
gem.authors = ['David Blei', 'Jason Adams', 'Rio Akasaka']
|
13
14
|
gem.extensions = ['ext/lda-ruby/extconf.rb']
|
14
15
|
gem.require_paths = ['lib', 'ext']
|
16
|
+
gem.add_dependency 'shoulda'
|
15
17
|
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
16
18
|
end
|
17
19
|
|
data/VERSION.yml
CHANGED
data/ext/lda-ruby/lda-inference.c
CHANGED
@@ -778,7 +778,7 @@ static VALUE wrap_em(VALUE self, VALUE start) {
|
|
778
778
|
if (!corpus_loaded)
|
779
779
|
return Qnil;
|
780
780
|
|
781
|
-
run_quiet_em(
|
781
|
+
run_quiet_em(StringValuePtr(start), last_corpus);
|
782
782
|
|
783
783
|
return Qnil;
|
784
784
|
}
|
@@ -788,7 +788,7 @@ static VALUE wrap_em(VALUE self, VALUE start) {
|
|
788
788
|
* Load settings from the given file.
|
789
789
|
*/
|
790
790
|
static VALUE wrap_load_settings(VALUE self, VALUE settings_file) {
|
791
|
-
read_settings(
|
791
|
+
read_settings(StringValuePtr(settings_file));
|
792
792
|
|
793
793
|
return Qtrue;
|
794
794
|
}
|
@@ -800,7 +800,7 @@ static VALUE wrap_load_settings(VALUE self, VALUE settings_file) {
|
|
800
800
|
*/
|
801
801
|
static VALUE wrap_load_corpus(VALUE self, VALUE filename) {
|
802
802
|
if (!corpus_loaded) {
|
803
|
-
last_corpus = read_data(
|
803
|
+
last_corpus = read_data(StringValuePtr(filename));
|
804
804
|
corpus_loaded = TRUE;
|
805
805
|
return Qtrue;
|
806
806
|
} else {
|
data/ext/lda-ruby/lda-model.h
CHANGED
@@ -15,6 +15,8 @@
|
|
15
15
|
void free_lda_model(lda_model*);
|
16
16
|
void save_lda_model(lda_model*, char*);
|
17
17
|
lda_model* new_lda_model(int, int);
|
18
|
+
lda_model* quiet_new_lda_model(int num_terms, int num_topics);
|
19
|
+
lda_model* new_lda_model(int num_terms, int num_topics);
|
18
20
|
lda_suffstats* new_lda_suffstats(lda_model* model);
|
19
21
|
void free_lda_suffstats(lda_model* model, lda_suffstats* ss);
|
20
22
|
void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
|
data/lda-ruby.gemspec
CHANGED
@@ -1,78 +1,76 @@
|
|
1
1
|
# Generated by jeweler
|
2
|
-
# DO NOT EDIT THIS FILE
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{lda-ruby}
|
8
|
-
s.version = "0.3.1"
|
8
|
+
s.version = "0.3.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
-
s.authors = ["David Blei", "Jason Adams"]
|
12
|
-
s.date = %q{
|
11
|
+
s.authors = ["David Blei", "Jason Adams", "Rio Akasaka"]
|
12
|
+
s.date = %q{2011-07-29}
|
13
13
|
s.description = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
|
14
14
|
s.email = %q{jasonmadams@gmail.com}
|
15
15
|
s.extensions = ["ext/lda-ruby/extconf.rb"]
|
16
16
|
s.extra_rdoc_files = [
|
17
17
|
"README",
|
18
|
-
|
18
|
+
"README.markdown"
|
19
19
|
]
|
20
20
|
s.files = [
|
21
|
-
"
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
21
|
+
"CHANGELOG",
|
22
|
+
"README",
|
23
|
+
"README.markdown",
|
24
|
+
"Rakefile",
|
25
|
+
"VERSION.yml",
|
26
|
+
"ext/lda-ruby/Makefile",
|
27
|
+
"ext/lda-ruby/cokus.c",
|
28
|
+
"ext/lda-ruby/cokus.h",
|
29
|
+
"ext/lda-ruby/extconf.rb",
|
30
|
+
"ext/lda-ruby/lda-alpha.c",
|
31
|
+
"ext/lda-ruby/lda-alpha.h",
|
32
|
+
"ext/lda-ruby/lda-data.c",
|
33
|
+
"ext/lda-ruby/lda-data.h",
|
34
|
+
"ext/lda-ruby/lda-inference.c",
|
35
|
+
"ext/lda-ruby/lda-inference.h",
|
36
|
+
"ext/lda-ruby/lda-model.c",
|
37
|
+
"ext/lda-ruby/lda-model.h",
|
38
|
+
"ext/lda-ruby/lda.h",
|
39
|
+
"ext/lda-ruby/utils.c",
|
40
|
+
"ext/lda-ruby/utils.h",
|
41
|
+
"lda-ruby.gemspec",
|
42
|
+
"lib/lda-ruby.rb",
|
43
|
+
"lib/lda-ruby/corpus/corpus.rb",
|
44
|
+
"lib/lda-ruby/corpus/data_corpus.rb",
|
45
|
+
"lib/lda-ruby/corpus/directory_corpus.rb",
|
46
|
+
"lib/lda-ruby/corpus/text_corpus.rb",
|
47
|
+
"lib/lda-ruby/document/data_document.rb",
|
48
|
+
"lib/lda-ruby/document/document.rb",
|
49
|
+
"lib/lda-ruby/document/text_document.rb",
|
50
|
+
"lib/lda-ruby/vocabulary.rb",
|
51
|
+
"license.txt",
|
52
|
+
"test/data/.gitignore",
|
53
|
+
"test/data/docs.dat",
|
54
|
+
"test/data/sample.rb",
|
55
|
+
"test/data/wiki-test-docs.yml",
|
56
|
+
"test/lda_ruby_test.rb",
|
57
|
+
"test/test_helper.rb"
|
58
58
|
]
|
59
59
|
s.homepage = %q{http://github.com/ealdent/lda-ruby}
|
60
|
-
s.rdoc_options = ["--charset=UTF-8"]
|
61
60
|
s.require_paths = ["lib", "ext"]
|
62
|
-
s.rubygems_version = %q{1.
|
61
|
+
s.rubygems_version = %q{1.6.2}
|
63
62
|
s.summary = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei.}
|
64
|
-
s.test_files = [
|
65
|
-
"test/lda_ruby_test.rb",
|
66
|
-
"test/test_helper.rb"
|
67
|
-
]
|
68
63
|
|
69
64
|
if s.respond_to? :specification_version then
|
70
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
71
65
|
s.specification_version = 3
|
72
66
|
|
73
|
-
if Gem::Version.new(Gem::
|
67
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
68
|
+
s.add_runtime_dependency(%q<shoulda>, [">= 0"])
|
74
69
|
else
|
70
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
75
71
|
end
|
76
72
|
else
|
73
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
77
74
|
end
|
78
75
|
end
|
76
|
+
|
data/test/data/sample.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'lda-ruby'
|
5
|
+
|
6
|
+
# Load the Corpus. The AP data from David Blei's website is in the "DataCorpus" format
|
7
|
+
corpus = Lda::DataCorpus.new("ap/ap.dat")
|
8
|
+
|
9
|
+
# Initialize the Lda instance with the corpus
|
10
|
+
lda = Lda::Lda.new(corpus)
|
11
|
+
|
12
|
+
# Run the EM algorithm using random starting points. Fixed starting points will use the first n documents
|
13
|
+
# to initialize the topics, where n is the number of topics.
|
14
|
+
lda.em("random") # run EM algorithm using random starting points
|
15
|
+
|
16
|
+
# Load the vocabulary file necessary with DataCorpus objects
|
17
|
+
lda.load_vocabulary("ap/vocab.txt")
|
18
|
+
|
19
|
+
# Print the top 20 words per topic
|
20
|
+
lda.print_topics(20)
|
data/test/lda_ruby_test.rb
CHANGED
@@ -110,7 +110,7 @@ class LdaRubyTest < Test::Unit::TestCase
|
|
110
110
|
|
111
111
|
context "An Lda::DataCorpus instance loaded from a file" do
|
112
112
|
setup do
|
113
|
-
@filename = 'data
|
113
|
+
@filename = File.join(File.dirname(__FILE__), 'data', 'docs.dat')
|
114
114
|
@filetext = File.open(@filename, 'r') { |f| f.read }
|
115
115
|
@corpus = Lda::DataCorpus.new(@filename)
|
116
116
|
end
|
@@ -126,7 +126,7 @@ class LdaRubyTest < Test::Unit::TestCase
|
|
126
126
|
|
127
127
|
context "An Lda::TextCorpus instance loaded from a file" do
|
128
128
|
setup do
|
129
|
-
@filename = 'data
|
129
|
+
@filename = File.join(File.dirname(__FILE__), 'data', 'wiki-test-docs.yml')
|
130
130
|
@filedocs = YAML::load_file(@filename)
|
131
131
|
@corpus = Lda::TextCorpus.new(@filename)
|
132
132
|
end
|
@@ -142,13 +142,13 @@ class LdaRubyTest < Test::Unit::TestCase
|
|
142
142
|
|
143
143
|
context "An Lda::DirectoryCorpus instance loaded from a directory" do
|
144
144
|
setup do
|
145
|
-
@path = 'data
|
145
|
+
@path = File.join(File.dirname(__FILE__), 'data', 'tmp')
|
146
146
|
@extension = 'txt'
|
147
147
|
Dir.mkdir(@path)
|
148
|
-
@original_filename = 'data
|
148
|
+
@original_filename = File.join(File.dirname(__FILE__), 'data', 'wiki-test-docs.yml')
|
149
149
|
@filedocs = YAML::load_file(@original_filename)
|
150
150
|
@filedocs.each_with_index do |doc, idx|
|
151
|
-
File.open("
|
151
|
+
File.open(File.join(@path, "doc_#{idx + 1}.txt"), 'w') { |f| f.write(doc) }
|
152
152
|
end
|
153
153
|
|
154
154
|
@corpus = Lda::DirectoryCorpus.new(@path, @extension)
|
@@ -173,11 +173,11 @@ class LdaRubyTest < Test::Unit::TestCase
|
|
173
173
|
setup do
|
174
174
|
@vocab = Lda::Vocabulary.new
|
175
175
|
@words = ['word1', 'word2', 'word3', 'word4', 'word5', 'word6']
|
176
|
-
@filename1 = 'data
|
176
|
+
@filename1 = File.join(File.dirname(__FILE__), 'data', 'tmp_file.txt')
|
177
177
|
File.open(@filename1, 'w') do |f|
|
178
178
|
@words.each { |w| f.write("#{w}\n") }
|
179
179
|
end
|
180
|
-
@filename2 = 'data
|
180
|
+
@filename2 = File.join(File.dirname(__FILE__), 'data', 'tmp_file.yml')
|
181
181
|
File.open(@filename2, 'w') { |f| YAML::dump(@words, f) }
|
182
182
|
end
|
183
183
|
|
@@ -208,7 +208,7 @@ class LdaRubyTest < Test::Unit::TestCase
|
|
208
208
|
|
209
209
|
context "An Lda::Lda instance" do
|
210
210
|
setup do
|
211
|
-
@filename = 'data
|
211
|
+
@filename = File.join(File.dirname(__FILE__), 'data', 'wiki-test-docs.yml')
|
212
212
|
@filedocs = YAML::load_file(@filename)
|
213
213
|
@corpus = Lda::TextCorpus.new(@filename)
|
214
214
|
|
data/test/test_helper.rb
CHANGED
metadata
CHANGED
@@ -1,30 +1,39 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: lda-ruby
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.3.4
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
|
-
authors:
|
7
|
+
authors:
|
7
8
|
- David Blei
|
8
9
|
- Jason Adams
|
10
|
+
- Rio Akasaka
|
9
11
|
autorequire:
|
10
12
|
bindir: bin
|
11
13
|
cert_chain: []
|
12
|
-
|
13
|
-
date: 2009-08-11 00:00:00 -04:00
|
14
|
+
date: 2011-07-29 00:00:00.000000000 -04:00
|
14
15
|
default_executable:
|
15
|
-
dependencies:
|
16
|
-
|
16
|
+
dependencies:
|
17
|
+
- !ruby/object:Gem::Dependency
|
18
|
+
name: shoulda
|
19
|
+
requirement: &2161566540 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ! '>='
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: '0'
|
25
|
+
type: :runtime
|
26
|
+
prerelease: false
|
27
|
+
version_requirements: *2161566540
|
17
28
|
description: Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.
|
18
29
|
email: jasonmadams@gmail.com
|
19
30
|
executables: []
|
20
|
-
|
21
|
-
extensions:
|
31
|
+
extensions:
|
22
32
|
- ext/lda-ruby/extconf.rb
|
23
|
-
extra_rdoc_files:
|
33
|
+
extra_rdoc_files:
|
24
34
|
- README
|
25
35
|
- README.markdown
|
26
|
-
files:
|
27
|
-
- .gitignore
|
36
|
+
files:
|
28
37
|
- CHANGELOG
|
29
38
|
- README
|
30
39
|
- README.markdown
|
@@ -58,38 +67,34 @@ files:
|
|
58
67
|
- license.txt
|
59
68
|
- test/data/.gitignore
|
60
69
|
- test/data/docs.dat
|
70
|
+
- test/data/sample.rb
|
61
71
|
- test/data/wiki-test-docs.yml
|
62
72
|
- test/lda_ruby_test.rb
|
63
73
|
- test/test_helper.rb
|
64
74
|
has_rdoc: true
|
65
75
|
homepage: http://github.com/ealdent/lda-ruby
|
66
76
|
licenses: []
|
67
|
-
|
68
77
|
post_install_message:
|
69
|
-
rdoc_options:
|
70
|
-
|
71
|
-
require_paths:
|
78
|
+
rdoc_options: []
|
79
|
+
require_paths:
|
72
80
|
- lib
|
73
81
|
- ext
|
74
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
82
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ! '>='
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
86
94
|
requirements: []
|
87
|
-
|
88
95
|
rubyforge_project:
|
89
|
-
rubygems_version: 1.
|
96
|
+
rubygems_version: 1.6.2
|
90
97
|
signing_key:
|
91
98
|
specification_version: 3
|
92
99
|
summary: Ruby port of Latent Dirichlet Allocation by David M. Blei.
|
93
|
-
test_files:
|
94
|
-
- test/lda_ruby_test.rb
|
95
|
-
- test/test_helper.rb
|
100
|
+
test_files: []
|