lda-ruby 0.3.7 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  ---
2
+ :build:
2
3
  :major: 0
3
4
  :minor: 3
4
- :patch: 7
5
- :build:
5
+ :patch: 8
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{lda-ruby}
8
- s.version = "0.3.7"
8
+ s.version = "0.3.8"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["David Blei", "Jason Adams", "Rio Akasaka"]
12
- s.date = %q{2011-08-06}
12
+ s.date = %q{2011-10-18}
13
13
  s.description = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
14
14
  s.email = %q{jasonmadams@gmail.com}
15
15
  s.extensions = ["ext/lda-ruby/extconf.rb"]
@@ -61,7 +61,7 @@ Gem::Specification.new do |s|
61
61
  ]
62
62
  s.homepage = %q{http://github.com/ealdent/lda-ruby}
63
63
  s.require_paths = ["lib", "ext"]
64
- s.rubygems_version = %q{1.6.2}
64
+ s.rubygems_version = %q{1.5.2}
65
65
  s.summary = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei.}
66
66
 
67
67
  if s.respond_to? :specification_version then
@@ -4,15 +4,19 @@ module Lda
4
4
  class Corpus
5
5
  attr_reader :documents, :num_docs, :num_terms, :vocabulary, :stopwords
6
6
 
7
- def initialize
7
+ def initialize(stop_word_list = nil)
8
8
  @documents = Array.new
9
9
  @all_terms = Set.new
10
10
  @num_terms = @num_docs = 0
11
11
  @vocabulary = Vocabulary.new
12
- @stopwords = YAML.load_file(File.join(File.dirname(__FILE__), '..', 'config', 'stopwords.yml'))
12
+ if stop_word_list.nil?
13
+ @stopwords = YAML.load_file(File.join(File.dirname(__FILE__), '..', 'config', 'stopwords.yml'))
14
+ else
15
+ @stopwords = YAML.load_file(stop_word_list)
16
+ end
13
17
  @stopwords.map! { |w| w.strip }
14
18
  end
15
-
19
+
16
20
  def add_document(doc)
17
21
  raise 'Parameter +doc+ must be of type Document' unless doc.kind_of?(Document)
18
22
 
@@ -1,3 +1,4 @@
1
+ # coding: utf-8
1
2
  require 'yaml'
2
3
 
3
4
  module Lda
@@ -31,7 +32,9 @@ module Lda
31
32
  end
32
33
 
33
34
  def tokenize(text)
34
- clean_text = text.gsub(/[^A-Za-z'\s]+/, ' ').gsub(/\s+/, ' ').downcase # remove everything but letters and ' and leave only single spaces
35
+ # now respects Umlaute
36
+ clean_text = text.gsub(/[^a-zäöüß'-]+/i, ' ').gsub(/\s+/, ' ').downcase # remove everything but letters and ' and leave only single spaces
37
+ # clean_text = text.gsub(/[^A-Za-z'\s]+/, ' ').gsub(/\s+/, ' ').downcase # remove everything but letters and ' and leave only single spaces
35
38
  @tokens = handle(clean_text.split(' '))
36
39
  nil
37
40
  end
metadata CHANGED
@@ -1,39 +1,49 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: lda-ruby
3
- version: !ruby/object:Gem::Version
4
- version: 0.3.7
3
+ version: !ruby/object:Gem::Version
4
+ hash: 3
5
5
  prerelease:
6
+ segments:
7
+ - 0
8
+ - 3
9
+ - 8
10
+ version: 0.3.8
6
11
  platform: ruby
7
- authors:
12
+ authors:
8
13
  - David Blei
9
14
  - Jason Adams
10
15
  - Rio Akasaka
11
16
  autorequire:
12
17
  bindir: bin
13
18
  cert_chain: []
14
- date: 2011-08-06 00:00:00.000000000 -04:00
19
+
20
+ date: 2011-10-18 00:00:00 -04:00
15
21
  default_executable:
16
- dependencies:
17
- - !ruby/object:Gem::Dependency
22
+ dependencies:
23
+ - !ruby/object:Gem::Dependency
18
24
  name: shoulda
19
- requirement: &2153174960 !ruby/object:Gem::Requirement
25
+ prerelease: false
26
+ requirement: &id001 !ruby/object:Gem::Requirement
20
27
  none: false
21
- requirements:
22
- - - ! '>='
23
- - !ruby/object:Gem::Version
24
- version: '0'
28
+ requirements:
29
+ - - ">="
30
+ - !ruby/object:Gem::Version
31
+ hash: 3
32
+ segments:
33
+ - 0
34
+ version: "0"
25
35
  type: :runtime
26
- prerelease: false
27
- version_requirements: *2153174960
36
+ version_requirements: *id001
28
37
  description: Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.
29
38
  email: jasonmadams@gmail.com
30
39
  executables: []
31
- extensions:
40
+
41
+ extensions:
32
42
  - ext/lda-ruby/extconf.rb
33
- extra_rdoc_files:
43
+ extra_rdoc_files:
34
44
  - README
35
45
  - README.markdown
36
- files:
46
+ files:
37
47
  - CHANGELOG
38
48
  - README
39
49
  - README.markdown
@@ -77,27 +87,37 @@ files:
77
87
  has_rdoc: true
78
88
  homepage: http://github.com/ealdent/lda-ruby
79
89
  licenses: []
90
+
80
91
  post_install_message:
81
92
  rdoc_options: []
82
- require_paths:
93
+
94
+ require_paths:
83
95
  - lib
84
96
  - ext
85
- required_ruby_version: !ruby/object:Gem::Requirement
97
+ required_ruby_version: !ruby/object:Gem::Requirement
86
98
  none: false
87
- requirements:
88
- - - ! '>='
89
- - !ruby/object:Gem::Version
90
- version: '0'
91
- required_rubygems_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ hash: 3
103
+ segments:
104
+ - 0
105
+ version: "0"
106
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
107
  none: false
93
- requirements:
94
- - - ! '>='
95
- - !ruby/object:Gem::Version
96
- version: '0'
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ hash: 3
112
+ segments:
113
+ - 0
114
+ version: "0"
97
115
  requirements: []
116
+
98
117
  rubyforge_project:
99
- rubygems_version: 1.6.2
118
+ rubygems_version: 1.5.2
100
119
  signing_key:
101
120
  specification_version: 3
102
121
  summary: Ruby port of Latent Dirichlet Allocation by David M. Blei.
103
122
  test_files: []
123
+