lda-ruby 0.3.7 → 0.3.8
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION.yml +2 -2
- data/lda-ruby.gemspec +3 -3
- data/lib/lda-ruby/corpus/corpus.rb +7 -3
- data/lib/lda-ruby/document/document.rb +4 -1
- metadata +49 -29
data/VERSION.yml
CHANGED
data/lda-ruby.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{lda-ruby}
|
8
|
-
s.version = "0.3.7"
|
8
|
+
s.version = "0.3.8"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["David Blei", "Jason Adams", "Rio Akasaka"]
|
12
|
-
s.date = %q{2011-
|
12
|
+
s.date = %q{2011-10-18}
|
13
13
|
s.description = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
|
14
14
|
s.email = %q{jasonmadams@gmail.com}
|
15
15
|
s.extensions = ["ext/lda-ruby/extconf.rb"]
|
@@ -61,7 +61,7 @@ Gem::Specification.new do |s|
|
|
61
61
|
]
|
62
62
|
s.homepage = %q{http://github.com/ealdent/lda-ruby}
|
63
63
|
s.require_paths = ["lib", "ext"]
|
64
|
-
s.rubygems_version = %q{1.
|
64
|
+
s.rubygems_version = %q{1.5.2}
|
65
65
|
s.summary = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei.}
|
66
66
|
|
67
67
|
if s.respond_to? :specification_version then
|
@@ -4,15 +4,19 @@ module Lda
|
|
4
4
|
class Corpus
|
5
5
|
attr_reader :documents, :num_docs, :num_terms, :vocabulary, :stopwords
|
6
6
|
|
7
|
-
def initialize
|
7
|
+
def initialize(stop_word_list = nil)
|
8
8
|
@documents = Array.new
|
9
9
|
@all_terms = Set.new
|
10
10
|
@num_terms = @num_docs = 0
|
11
11
|
@vocabulary = Vocabulary.new
|
12
|
-
|
12
|
+
if stop_word_list.nil?
|
13
|
+
@stopwords = YAML.load_file(File.join(File.dirname(__FILE__), '..', 'config', 'stopwords.yml'))
|
14
|
+
else
|
15
|
+
@stopwords = YAML.load_file(stop_word_list)
|
16
|
+
end
|
13
17
|
@stopwords.map! { |w| w.strip }
|
14
18
|
end
|
15
|
-
|
19
|
+
|
16
20
|
def add_document(doc)
|
17
21
|
raise 'Parameter +doc+ must be of type Document' unless doc.kind_of?(Document)
|
18
22
|
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# coding: utf-8
|
1
2
|
require 'yaml'
|
2
3
|
|
3
4
|
module Lda
|
@@ -31,7 +32,9 @@ module Lda
|
|
31
32
|
end
|
32
33
|
|
33
34
|
def tokenize(text)
|
34
|
-
|
35
|
+
# now respects Umlaute
|
36
|
+
clean_text = text.gsub(/[^a-zäöüß'-]+/i, ' ').gsub(/\s+/, ' ').downcase # remove everything but letters and ' and leave only single spaces
|
37
|
+
# clean_text = text.gsub(/[^A-Za-z'\s]+/, ' ').gsub(/\s+/, ' ').downcase # remove everything but letters and ' and leave only single spaces
|
35
38
|
@tokens = handle(clean_text.split(' '))
|
36
39
|
nil
|
37
40
|
end
|
metadata
CHANGED
@@ -1,39 +1,49 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: lda-ruby
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 3
|
5
5
|
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 3
|
9
|
+
- 8
|
10
|
+
version: 0.3.8
|
6
11
|
platform: ruby
|
7
|
-
authors:
|
12
|
+
authors:
|
8
13
|
- David Blei
|
9
14
|
- Jason Adams
|
10
15
|
- Rio Akasaka
|
11
16
|
autorequire:
|
12
17
|
bindir: bin
|
13
18
|
cert_chain: []
|
14
|
-
|
19
|
+
|
20
|
+
date: 2011-10-18 00:00:00 -04:00
|
15
21
|
default_executable:
|
16
|
-
dependencies:
|
17
|
-
- !ruby/object:Gem::Dependency
|
22
|
+
dependencies:
|
23
|
+
- !ruby/object:Gem::Dependency
|
18
24
|
name: shoulda
|
19
|
-
|
25
|
+
prerelease: false
|
26
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
27
|
none: false
|
21
|
-
requirements:
|
22
|
-
- -
|
23
|
-
- !ruby/object:Gem::Version
|
24
|
-
|
28
|
+
requirements:
|
29
|
+
- - ">="
|
30
|
+
- !ruby/object:Gem::Version
|
31
|
+
hash: 3
|
32
|
+
segments:
|
33
|
+
- 0
|
34
|
+
version: "0"
|
25
35
|
type: :runtime
|
26
|
-
|
27
|
-
version_requirements: *2153174960
|
36
|
+
version_requirements: *id001
|
28
37
|
description: Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.
|
29
38
|
email: jasonmadams@gmail.com
|
30
39
|
executables: []
|
31
|
-
|
40
|
+
|
41
|
+
extensions:
|
32
42
|
- ext/lda-ruby/extconf.rb
|
33
|
-
extra_rdoc_files:
|
43
|
+
extra_rdoc_files:
|
34
44
|
- README
|
35
45
|
- README.markdown
|
36
|
-
files:
|
46
|
+
files:
|
37
47
|
- CHANGELOG
|
38
48
|
- README
|
39
49
|
- README.markdown
|
@@ -77,27 +87,37 @@ files:
|
|
77
87
|
has_rdoc: true
|
78
88
|
homepage: http://github.com/ealdent/lda-ruby
|
79
89
|
licenses: []
|
90
|
+
|
80
91
|
post_install_message:
|
81
92
|
rdoc_options: []
|
82
|
-
|
93
|
+
|
94
|
+
require_paths:
|
83
95
|
- lib
|
84
96
|
- ext
|
85
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
86
98
|
none: false
|
87
|
-
requirements:
|
88
|
-
- -
|
89
|
-
- !ruby/object:Gem::Version
|
90
|
-
|
91
|
-
|
99
|
+
requirements:
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
hash: 3
|
103
|
+
segments:
|
104
|
+
- 0
|
105
|
+
version: "0"
|
106
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
92
107
|
none: false
|
93
|
-
requirements:
|
94
|
-
- -
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
|
108
|
+
requirements:
|
109
|
+
- - ">="
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
hash: 3
|
112
|
+
segments:
|
113
|
+
- 0
|
114
|
+
version: "0"
|
97
115
|
requirements: []
|
116
|
+
|
98
117
|
rubyforge_project:
|
99
|
-
rubygems_version: 1.
|
118
|
+
rubygems_version: 1.5.2
|
100
119
|
signing_key:
|
101
120
|
specification_version: 3
|
102
121
|
summary: Ruby port of Latent Dirichlet Allocation by David M. Blei.
|
103
122
|
test_files: []
|
123
|
+
|