lda-ruby 0.3.7 → 0.3.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
  ---
2
+ :build:
2
3
  :major: 0
3
4
  :minor: 3
4
- :patch: 7
5
- :build:
5
+ :patch: 8
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{lda-ruby}
8
- s.version = "0.3.7"
8
+ s.version = "0.3.8"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["David Blei", "Jason Adams", "Rio Akasaka"]
12
- s.date = %q{2011-08-06}
12
+ s.date = %q{2011-10-18}
13
13
  s.description = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.}
14
14
  s.email = %q{jasonmadams@gmail.com}
15
15
  s.extensions = ["ext/lda-ruby/extconf.rb"]
@@ -61,7 +61,7 @@ Gem::Specification.new do |s|
61
61
  ]
62
62
  s.homepage = %q{http://github.com/ealdent/lda-ruby}
63
63
  s.require_paths = ["lib", "ext"]
64
- s.rubygems_version = %q{1.6.2}
64
+ s.rubygems_version = %q{1.5.2}
65
65
  s.summary = %q{Ruby port of Latent Dirichlet Allocation by David M. Blei.}
66
66
 
67
67
  if s.respond_to? :specification_version then
@@ -4,15 +4,19 @@ module Lda
4
4
  class Corpus
5
5
  attr_reader :documents, :num_docs, :num_terms, :vocabulary, :stopwords
6
6
 
7
- def initialize
7
+ def initialize(stop_word_list = nil)
8
8
  @documents = Array.new
9
9
  @all_terms = Set.new
10
10
  @num_terms = @num_docs = 0
11
11
  @vocabulary = Vocabulary.new
12
- @stopwords = YAML.load_file(File.join(File.dirname(__FILE__), '..', 'config', 'stopwords.yml'))
12
+ if stop_word_list.nil?
13
+ @stopwords = YAML.load_file(File.join(File.dirname(__FILE__), '..', 'config', 'stopwords.yml'))
14
+ else
15
+ @stopwords = YAML.load_file(stop_word_list)
16
+ end
13
17
  @stopwords.map! { |w| w.strip }
14
18
  end
15
-
19
+
16
20
  def add_document(doc)
17
21
  raise 'Parameter +doc+ must be of type Document' unless doc.kind_of?(Document)
18
22
 
@@ -1,3 +1,4 @@
1
+ # coding: utf-8
1
2
  require 'yaml'
2
3
 
3
4
  module Lda
@@ -31,7 +32,9 @@ module Lda
31
32
  end
32
33
 
33
34
  def tokenize(text)
34
- clean_text = text.gsub(/[^A-Za-z'\s]+/, ' ').gsub(/\s+/, ' ').downcase # remove everything but letters and ' and leave only single spaces
35
+ # now respects German umlauts (ä, ö, ü) and ß
36
+ clean_text = text.gsub(/[^a-zäöüß'-]+/i, ' ').gsub(/\s+/, ' ').downcase # replace everything but letters (incl. ä, ö, ü, ß), ' and - with a space, then collapse runs of whitespace into single spaces
37
+ # clean_text = text.gsub(/[^A-Za-z'\s]+/, ' ').gsub(/\s+/, ' ').downcase # remove everything but letters and ' and leave only single spaces
35
38
  @tokens = handle(clean_text.split(' '))
36
39
  nil
37
40
  end
metadata CHANGED
@@ -1,39 +1,49 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: lda-ruby
3
- version: !ruby/object:Gem::Version
4
- version: 0.3.7
3
+ version: !ruby/object:Gem::Version
4
+ hash: 3
5
5
  prerelease:
6
+ segments:
7
+ - 0
8
+ - 3
9
+ - 8
10
+ version: 0.3.8
6
11
  platform: ruby
7
- authors:
12
+ authors:
8
13
  - David Blei
9
14
  - Jason Adams
10
15
  - Rio Akasaka
11
16
  autorequire:
12
17
  bindir: bin
13
18
  cert_chain: []
14
- date: 2011-08-06 00:00:00.000000000 -04:00
19
+
20
+ date: 2011-10-18 00:00:00 -04:00
15
21
  default_executable:
16
- dependencies:
17
- - !ruby/object:Gem::Dependency
22
+ dependencies:
23
+ - !ruby/object:Gem::Dependency
18
24
  name: shoulda
19
- requirement: &2153174960 !ruby/object:Gem::Requirement
25
+ prerelease: false
26
+ requirement: &id001 !ruby/object:Gem::Requirement
20
27
  none: false
21
- requirements:
22
- - - ! '>='
23
- - !ruby/object:Gem::Version
24
- version: '0'
28
+ requirements:
29
+ - - ">="
30
+ - !ruby/object:Gem::Version
31
+ hash: 3
32
+ segments:
33
+ - 0
34
+ version: "0"
25
35
  type: :runtime
26
- prerelease: false
27
- version_requirements: *2153174960
36
+ version_requirements: *id001
28
37
  description: Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.
29
38
  email: jasonmadams@gmail.com
30
39
  executables: []
31
- extensions:
40
+
41
+ extensions:
32
42
  - ext/lda-ruby/extconf.rb
33
- extra_rdoc_files:
43
+ extra_rdoc_files:
34
44
  - README
35
45
  - README.markdown
36
- files:
46
+ files:
37
47
  - CHANGELOG
38
48
  - README
39
49
  - README.markdown
@@ -77,27 +87,37 @@ files:
77
87
  has_rdoc: true
78
88
  homepage: http://github.com/ealdent/lda-ruby
79
89
  licenses: []
90
+
80
91
  post_install_message:
81
92
  rdoc_options: []
82
- require_paths:
93
+
94
+ require_paths:
83
95
  - lib
84
96
  - ext
85
- required_ruby_version: !ruby/object:Gem::Requirement
97
+ required_ruby_version: !ruby/object:Gem::Requirement
86
98
  none: false
87
- requirements:
88
- - - ! '>='
89
- - !ruby/object:Gem::Version
90
- version: '0'
91
- required_rubygems_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ hash: 3
103
+ segments:
104
+ - 0
105
+ version: "0"
106
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
107
  none: false
93
- requirements:
94
- - - ! '>='
95
- - !ruby/object:Gem::Version
96
- version: '0'
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ hash: 3
112
+ segments:
113
+ - 0
114
+ version: "0"
97
115
  requirements: []
116
+
98
117
  rubyforge_project:
99
- rubygems_version: 1.6.2
118
+ rubygems_version: 1.5.2
100
119
  signing_key:
101
120
  specification_version: 3
102
121
  summary: Ruby port of Latent Dirichlet Allocation by David M. Blei.
103
122
  test_files: []
123
+