lda-ruby 0.3.1 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +1 -2
- data/Rakefile +3 -1
- data/VERSION.yml +2 -1
- data/ext/lda-ruby/lda-inference.c +3 -3
- data/ext/lda-ruby/lda-model.h +2 -0
- data/lda-ruby.gemspec +49 -51
- data/test/data/sample.rb +20 -0
- data/test/lda_ruby_test.rb +8 -8
- data/test/test_helper.rb +1 -0
- metadata +39 -34
- data/.gitignore +0 -5
data/README.markdown
CHANGED
@@ -15,7 +15,7 @@ The original C code relied on files for the input and output. We felt it was nec
|
|
15
15
|
lda.load_vocabulary("data/vocab.txt")
|
16
16
|
lda.print_topics(20) # print the top 20 words per topic
|
17
17
|
|
18
|
-
|
18
|
+
If you have general questions about Latent Dirichlet Allocation, I urge you to use the [topic models mailing list][topic-models], since the people who monitor that are very knowledgeable. If you encounter bugs specific to lda-ruby, please post an issue on the Github project.
|
19
19
|
|
20
20
|
## Resources
|
21
21
|
|
@@ -24,7 +24,6 @@ You can check out the mailing list for this project if you have any questions or
|
|
24
24
|
+ [Wikipedia article on LDA][wikipedia]
|
25
25
|
+ [Sample AP data][ap-data]
|
26
26
|
|
27
|
-
|
28
27
|
## References
|
29
28
|
|
30
29
|
Blei, David M., Ng, Andrew Y., and Jordan, Michael I. 2003. Latent dirichlet allocation. Journal of Machine Learning Research. 3 (Mar. 2003), 993-1022 [[pdf][pdf]].
|
data/Rakefile
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'rake'
|
3
|
+
require 'yaml'
|
3
4
|
|
4
5
|
begin
|
5
6
|
require 'jeweler'
|
@@ -9,9 +10,10 @@ begin
|
|
9
10
|
gem.description = %Q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
|
10
11
|
gem.email = "jasonmadams@gmail.com"
|
11
12
|
gem.homepage = "http://github.com/ealdent/lda-ruby"
|
12
|
-
gem.authors = ['David Blei', 'Jason Adams']
|
13
|
+
gem.authors = ['David Blei', 'Jason Adams', 'Rio Akasaka']
|
13
14
|
gem.extensions = ['ext/lda-ruby/extconf.rb']
|
14
15
|
gem.require_paths = ['lib', 'ext']
|
16
|
+
gem.add_dependency 'shoulda'
|
15
17
|
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
16
18
|
end
|
17
19
|
|
data/VERSION.yml
CHANGED
data/ext/lda-ruby/lda-inference.c
CHANGED
@@ -778,7 +778,7 @@ static VALUE wrap_em(VALUE self, VALUE start) {
|
|
778
778
|
if (!corpus_loaded)
|
779
779
|
return Qnil;
|
780
780
|
|
781
|
-
run_quiet_em(
|
781
|
+
run_quiet_em(StringValuePtr(start), last_corpus);
|
782
782
|
|
783
783
|
return Qnil;
|
784
784
|
}
|
@@ -788,7 +788,7 @@ static VALUE wrap_em(VALUE self, VALUE start) {
|
|
788
788
|
* Load settings from the given file.
|
789
789
|
*/
|
790
790
|
static VALUE wrap_load_settings(VALUE self, VALUE settings_file) {
|
791
|
-
read_settings(
|
791
|
+
read_settings(StringValuePtr(settings_file));
|
792
792
|
|
793
793
|
return Qtrue;
|
794
794
|
}
|
@@ -800,7 +800,7 @@ static VALUE wrap_load_settings(VALUE self, VALUE settings_file) {
|
|
800
800
|
*/
|
801
801
|
static VALUE wrap_load_corpus(VALUE self, VALUE filename) {
|
802
802
|
if (!corpus_loaded) {
|
803
|
-
last_corpus = read_data(
|
803
|
+
last_corpus = read_data(StringValuePtr(filename));
|
804
804
|
corpus_loaded = TRUE;
|
805
805
|
return Qtrue;
|
806
806
|
} else {
|
data/ext/lda-ruby/lda-model.h
CHANGED
@@ -15,6 +15,8 @@
|
|
15
15
|
void free_lda_model(lda_model*);
|
16
16
|
void save_lda_model(lda_model*, char*);
|
17
17
|
lda_model* new_lda_model(int, int);
|
18
|
+
lda_model* quiet_new_lda_model(int num_terms, int num_topics);
|
19
|
+
lda_model* new_lda_model(int num_terms, int num_topics);
|
18
20
|
lda_suffstats* new_lda_suffstats(lda_model* model);
|
19
21
|
void free_lda_suffstats(lda_model* model, lda_suffstats* ss);
|
20
22
|
void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
|
data/lda-ruby.gemspec
CHANGED
@@ -1,78 +1,76 @@
|
|
1
1
|
# Generated by jeweler
|
2
|
-
# DO NOT EDIT THIS FILE
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{lda-ruby}
|
8
|
-
s.version = "0.3.
|
8
|
+
s.version = "0.3.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
-
s.authors = ["David Blei", "Jason Adams"]
|
12
|
-
s.date = %q{
|
11
|
+
s.authors = ["David Blei", "Jason Adams", "Rio Akasaka"]
|
12
|
+
s.date = %q{2011-07-29}
|
13
13
|
s.description = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
|
14
14
|
s.email = %q{jasonmadams@gmail.com}
|
15
15
|
s.extensions = ["ext/lda-ruby/extconf.rb"]
|
16
16
|
s.extra_rdoc_files = [
|
17
17
|
"README",
|
18
|
-
|
18
|
+
"README.markdown"
|
19
19
|
]
|
20
20
|
s.files = [
|
21
|
-
"
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
21
|
+
"CHANGELOG",
|
22
|
+
"README",
|
23
|
+
"README.markdown",
|
24
|
+
"Rakefile",
|
25
|
+
"VERSION.yml",
|
26
|
+
"ext/lda-ruby/Makefile",
|
27
|
+
"ext/lda-ruby/cokus.c",
|
28
|
+
"ext/lda-ruby/cokus.h",
|
29
|
+
"ext/lda-ruby/extconf.rb",
|
30
|
+
"ext/lda-ruby/lda-alpha.c",
|
31
|
+
"ext/lda-ruby/lda-alpha.h",
|
32
|
+
"ext/lda-ruby/lda-data.c",
|
33
|
+
"ext/lda-ruby/lda-data.h",
|
34
|
+
"ext/lda-ruby/lda-inference.c",
|
35
|
+
"ext/lda-ruby/lda-inference.h",
|
36
|
+
"ext/lda-ruby/lda-model.c",
|
37
|
+
"ext/lda-ruby/lda-model.h",
|
38
|
+
"ext/lda-ruby/lda.h",
|
39
|
+
"ext/lda-ruby/utils.c",
|
40
|
+
"ext/lda-ruby/utils.h",
|
41
|
+
"lda-ruby.gemspec",
|
42
|
+
"lib/lda-ruby.rb",
|
43
|
+
"lib/lda-ruby/corpus/corpus.rb",
|
44
|
+
"lib/lda-ruby/corpus/data_corpus.rb",
|
45
|
+
"lib/lda-ruby/corpus/directory_corpus.rb",
|
46
|
+
"lib/lda-ruby/corpus/text_corpus.rb",
|
47
|
+
"lib/lda-ruby/document/data_document.rb",
|
48
|
+
"lib/lda-ruby/document/document.rb",
|
49
|
+
"lib/lda-ruby/document/text_document.rb",
|
50
|
+
"lib/lda-ruby/vocabulary.rb",
|
51
|
+
"license.txt",
|
52
|
+
"test/data/.gitignore",
|
53
|
+
"test/data/docs.dat",
|
54
|
+
"test/data/sample.rb",
|
55
|
+
"test/data/wiki-test-docs.yml",
|
56
|
+
"test/lda_ruby_test.rb",
|
57
|
+
"test/test_helper.rb"
|
58
58
|
]
|
59
59
|
s.homepage = %q{http://github.com/ealdent/lda-ruby}
|
60
|
-
s.rdoc_options = ["--charset=UTF-8"]
|
61
60
|
s.require_paths = ["lib", "ext"]
|
62
|
-
s.rubygems_version = %q{1.
|
61
|
+
s.rubygems_version = %q{1.6.2}
|
63
62
|
s.summary = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei.}
|
64
|
-
s.test_files = [
|
65
|
-
"test/lda_ruby_test.rb",
|
66
|
-
"test/test_helper.rb"
|
67
|
-
]
|
68
63
|
|
69
64
|
if s.respond_to? :specification_version then
|
70
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
71
65
|
s.specification_version = 3
|
72
66
|
|
73
|
-
if Gem::Version.new(Gem::
|
67
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
68
|
+
s.add_runtime_dependency(%q<shoulda>, [">= 0"])
|
74
69
|
else
|
70
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
75
71
|
end
|
76
72
|
else
|
73
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
77
74
|
end
|
78
75
|
end
|
76
|
+
|
data/test/data/sample.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'lda-ruby'
|
5
|
+
|
6
|
+
# Load the Corpus. The AP data from David Blei's website is in the "DataCorpus" format
|
7
|
+
corpus = Lda::DataCorpus.new("ap/ap.dat")
|
8
|
+
|
9
|
+
# Initialize the Lda instance with the corpus
|
10
|
+
lda = Lda::Lda.new(corpus)
|
11
|
+
|
12
|
+
# Run the EM algorithm using random starting points. Fixed starting points will use the first n documents
|
13
|
+
# to initialize the topics, where n is the number of topics.
|
14
|
+
lda.em("random") # run EM algorithm using random starting points
|
15
|
+
|
16
|
+
# Load the vocabulary file necessary with DataCorpus objects
|
17
|
+
lda.load_vocabulary("ap/vocab.txt")
|
18
|
+
|
19
|
+
# Print the top 20 words per topic
|
20
|
+
lda.print_topics(20)
|
data/test/lda_ruby_test.rb
CHANGED
@@ -110,7 +110,7 @@ class LdaRubyTest < Test::Unit::TestCase
|
|
110
110
|
|
111
111
|
context "An Lda::DataCorpus instance loaded from a file" do
|
112
112
|
setup do
|
113
|
-
@filename = 'data
|
113
|
+
@filename = File.join(File.dirname(__FILE__), 'data', 'docs.dat')
|
114
114
|
@filetext = File.open(@filename, 'r') { |f| f.read }
|
115
115
|
@corpus = Lda::DataCorpus.new(@filename)
|
116
116
|
end
|
@@ -126,7 +126,7 @@ class LdaRubyTest < Test::Unit::TestCase
|
|
126
126
|
|
127
127
|
context "An Lda::TextCorpus instance loaded from a file" do
|
128
128
|
setup do
|
129
|
-
@filename = 'data
|
129
|
+
@filename = File.join(File.dirname(__FILE__), 'data', 'wiki-test-docs.yml')
|
130
130
|
@filedocs = YAML::load_file(@filename)
|
131
131
|
@corpus = Lda::TextCorpus.new(@filename)
|
132
132
|
end
|
@@ -142,13 +142,13 @@ class LdaRubyTest < Test::Unit::TestCase
|
|
142
142
|
|
143
143
|
context "An Lda::DirectoryCorpus instance loaded from a directory" do
|
144
144
|
setup do
|
145
|
-
@path = 'data
|
145
|
+
@path = File.join(File.dirname(__FILE__), 'data', 'tmp')
|
146
146
|
@extension = 'txt'
|
147
147
|
Dir.mkdir(@path)
|
148
|
-
@original_filename = 'data
|
148
|
+
@original_filename = File.join(File.dirname(__FILE__), 'data', 'wiki-test-docs.yml')
|
149
149
|
@filedocs = YAML::load_file(@original_filename)
|
150
150
|
@filedocs.each_with_index do |doc, idx|
|
151
|
-
File.open("
|
151
|
+
File.open(File.join(@path, "doc_#{idx + 1}.txt"), 'w') { |f| f.write(doc) }
|
152
152
|
end
|
153
153
|
|
154
154
|
@corpus = Lda::DirectoryCorpus.new(@path, @extension)
|
@@ -173,11 +173,11 @@ class LdaRubyTest < Test::Unit::TestCase
|
|
173
173
|
setup do
|
174
174
|
@vocab = Lda::Vocabulary.new
|
175
175
|
@words = ['word1', 'word2', 'word3', 'word4', 'word5', 'word6']
|
176
|
-
@filename1 = 'data
|
176
|
+
@filename1 = File.join(File.dirname(__FILE__), 'data', 'tmp_file.txt')
|
177
177
|
File.open(@filename1, 'w') do |f|
|
178
178
|
@words.each { |w| f.write("#{w}\n") }
|
179
179
|
end
|
180
|
-
@filename2 = 'data
|
180
|
+
@filename2 = File.join(File.dirname(__FILE__), 'data', 'tmp_file.yml')
|
181
181
|
File.open(@filename2, 'w') { |f| YAML::dump(@words, f) }
|
182
182
|
end
|
183
183
|
|
@@ -208,7 +208,7 @@ class LdaRubyTest < Test::Unit::TestCase
|
|
208
208
|
|
209
209
|
context "An Lda::Lda instance" do
|
210
210
|
setup do
|
211
|
-
@filename = 'data
|
211
|
+
@filename = File.join(File.dirname(__FILE__), 'data', 'wiki-test-docs.yml')
|
212
212
|
@filedocs = YAML::load_file(@filename)
|
213
213
|
@corpus = Lda::TextCorpus.new(@filename)
|
214
214
|
|
data/test/test_helper.rb
CHANGED
metadata
CHANGED
@@ -1,30 +1,39 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: lda-ruby
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.3.4
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
|
-
authors:
|
7
|
+
authors:
|
7
8
|
- David Blei
|
8
9
|
- Jason Adams
|
10
|
+
- Rio Akasaka
|
9
11
|
autorequire:
|
10
12
|
bindir: bin
|
11
13
|
cert_chain: []
|
12
|
-
|
13
|
-
date: 2009-08-11 00:00:00 -04:00
|
14
|
+
date: 2011-07-29 00:00:00.000000000 -04:00
|
14
15
|
default_executable:
|
15
|
-
dependencies:
|
16
|
-
|
16
|
+
dependencies:
|
17
|
+
- !ruby/object:Gem::Dependency
|
18
|
+
name: shoulda
|
19
|
+
requirement: &2161566540 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ! '>='
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: '0'
|
25
|
+
type: :runtime
|
26
|
+
prerelease: false
|
27
|
+
version_requirements: *2161566540
|
17
28
|
description: Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.
|
18
29
|
email: jasonmadams@gmail.com
|
19
30
|
executables: []
|
20
|
-
|
21
|
-
extensions:
|
31
|
+
extensions:
|
22
32
|
- ext/lda-ruby/extconf.rb
|
23
|
-
extra_rdoc_files:
|
33
|
+
extra_rdoc_files:
|
24
34
|
- README
|
25
35
|
- README.markdown
|
26
|
-
files:
|
27
|
-
- .gitignore
|
36
|
+
files:
|
28
37
|
- CHANGELOG
|
29
38
|
- README
|
30
39
|
- README.markdown
|
@@ -58,38 +67,34 @@ files:
|
|
58
67
|
- license.txt
|
59
68
|
- test/data/.gitignore
|
60
69
|
- test/data/docs.dat
|
70
|
+
- test/data/sample.rb
|
61
71
|
- test/data/wiki-test-docs.yml
|
62
72
|
- test/lda_ruby_test.rb
|
63
73
|
- test/test_helper.rb
|
64
74
|
has_rdoc: true
|
65
75
|
homepage: http://github.com/ealdent/lda-ruby
|
66
76
|
licenses: []
|
67
|
-
|
68
77
|
post_install_message:
|
69
|
-
rdoc_options:
|
70
|
-
|
71
|
-
require_paths:
|
78
|
+
rdoc_options: []
|
79
|
+
require_paths:
|
72
80
|
- lib
|
73
81
|
- ext
|
74
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
82
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ! '>='
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
86
94
|
requirements: []
|
87
|
-
|
88
95
|
rubyforge_project:
|
89
|
-
rubygems_version: 1.
|
96
|
+
rubygems_version: 1.6.2
|
90
97
|
signing_key:
|
91
98
|
specification_version: 3
|
92
99
|
summary: Ruby port of Latent Dirichlet Allocation by David M. Blei.
|
93
|
-
test_files:
|
94
|
-
- test/lda_ruby_test.rb
|
95
|
-
- test/test_helper.rb
|
100
|
+
test_files: []
|