sclust 1.0.0 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
metadata CHANGED
@@ -1,38 +1,58 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sclust
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 2.1.0
5
5
  platform: ruby
6
6
  authors:
7
- - Sam Baskinger
7
+ - Sam Baskinger
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-12-01 00:00:00 -06:00
12
+ date: 2010-10-02 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
- - !ruby/object:Gem::Dependency
16
- name: log4r
17
- type: :runtime
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
20
- requirements:
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: 1.0.5
24
- version:
25
- - !ruby/object:Gem::Dependency
26
- name: mechanize
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: 0.9.3
34
- version:
35
- description: A k-mean text clustering library for ruby.
15
+ - !ruby/object:Gem::Dependency
16
+ name: log4r
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.0.5
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: mechanize
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 1.0.0
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: stemmer
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 1.0.1
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: nokogiri
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 1.4.1
54
+ version:
55
+ description: A k-mean and LDA text clustering library for ruby.
36
56
  email: basking2@rubyforge.org.com
37
57
  executables: []
38
58
 
@@ -41,10 +61,18 @@ extensions: []
41
61
  extra_rdoc_files: []
42
62
 
43
63
  files:
44
- - lib/sclust/cluster.rb
45
- - lib/sclust/doc.rb
46
- - lib/sclust/doccluster.rb
47
- - lib/sclust/doccol.rb
64
+ - lib/sclust/kmean/doccluster.rb
65
+ - lib/sclust/kmean/cluster.rb
66
+ - lib/sclust/lda/lda2.rb
67
+ - lib/sclust/lda/lda.rb
68
+ - lib/sclust/util/weightedmovingaverage.rb
69
+ - lib/sclust/util/doc.rb
70
+ - lib/sclust/util/sparse_vector.rb
71
+ - lib/sclust/util/rss.rb
72
+ - lib/sclust/util/word.rb
73
+ - lib/sclust/util/filters.rb
74
+ - lib/sclust/util/doccol.rb
75
+ - lib/sclust/util/stopwords.rb
48
76
  has_rdoc: true
49
77
  homepage: http://sclust.rubyforge.org
50
78
  licenses: []
@@ -53,18 +81,18 @@ post_install_message:
53
81
  rdoc_options: []
54
82
 
55
83
  require_paths:
56
- - lib
84
+ - lib
57
85
  required_ruby_version: !ruby/object:Gem::Requirement
58
86
  requirements:
59
- - - ">="
60
- - !ruby/object:Gem::Version
61
- version: 1.6.8
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 1.6.8
62
90
  version:
63
91
  required_rubygems_version: !ruby/object:Gem::Requirement
64
92
  requirements:
65
- - - ">="
66
- - !ruby/object:Gem::Version
67
- version: "0"
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: "0"
68
96
  version:
69
97
  requirements: []
70
98
 
@@ -72,9 +100,15 @@ rubyforge_project: http://sclust.rubyforge.org/
72
100
  rubygems_version: 1.3.5
73
101
  signing_key:
74
102
  specification_version: 3
75
- summary: k-mean clustering.
103
+ summary: k-mean/lda clustering.
76
104
  test_files:
77
- - tests/clustertest.rb
78
- - tests/test001.rb
79
- - tests/clustertest.rb
80
- - tests/test001.rb
105
+ - tests/filters_test.rb
106
+ - tests/test001.rb
107
+ - tests/ldatest.rb
108
+ - tests/clustertest.rb
109
+ - tests/sparse_vector_test.rb
110
+ - tests/filters_test.rb
111
+ - tests/test001.rb
112
+ - tests/ldatest.rb
113
+ - tests/clustertest.rb
114
+ - tests/sparse_vector_test.rb
@@ -1,197 +0,0 @@
1
- require 'rubygems'
2
- require 'log4r'
3
-
4
- module SClust
5
-
6
# Cosine distance between two numeric vectors.
class CosineDistance

  # Given two vectors, compute the cosine distance between them.
  #
  # a:: indexable collection of numbers; its length drives the iteration.
  # b:: indexable collection of numbers, expected to be at least as long as +a+.
  #
  # Returns 1 - cos(angle between a and b): 0.0 for vectors pointing the same
  # way, 1.0 for orthogonal vectors, 2.0 for opposite vectors. When either
  # vector has zero magnitude the angle is undefined; 1.0 ("no similarity")
  # is returned instead of NaN so that callers can still order distances.
  def self.distance(a, b)
    dot    = 0.0
    norm_a = 0.0
    norm_b = 0.0

    0.upto(a.length - 1) do |i|
      dot    += a[i] * b[i]
      # The squared magnitudes are sums, not products: multiplying into an
      # accumulator that starts at 0.0 would leave it at zero forever.
      norm_a += a[i] * a[i]
      norm_b += b[i] * b[i]
    end

    magnitude = Math.sqrt(norm_a) * Math.sqrt(norm_b)

    return 1.0 if magnitude == 0.0

    1 - (dot / magnitude)
  end
end
24
-
25
# A point in term space: a list of terms with a parallel list of numeric
# values, optionally tagged with the cluster it belongs to and the object it
# was built from.
class ClusterPoint

  attr_reader :terms, :values
  attr_accessor :cluster, :source_object

  # Initialize the ClusterPoint with a list of terms (labels, objects, whatever) and numeric values.
  #
  # terms::         list of labels; terms[i] labels values[i].
  # values::        list of numbers, parallel to +terms+.
  # source_object:: optional object this point was derived from.
  def initialize(terms, values, source_object = nil)
    @terms         = terms
    @values        = values
    @cluster       = nil
    @source_object = source_object
  end

  # Called by #clone / #dup. Gives the copy its own values array so that
  # #add and #sub on the copy (e.g. a cluster center cloned from a seed point)
  # do not rewrite the vector of the point it was copied from.
  def initialize_copy(source)
    super
    @values = @values.dup if @values
  end

  # Distance from this point to +clusterPoint+, as computed by
  # CosineDistance.distance over the two value lists.
  def distance(clusterPoint)
    CosineDistance.distance(@values, clusterPoint.values)
  end

  # Add each item in the cluster point to this cluster point adjusting the values per the given weight.
  # Weight is a value from 0.0 - 1.0, inclusive. A value of 1 means that this point becomes a copy of
  # +clusterPoint+ while a weight value of 0 will have no effect.
  # The values are updated in place; the results are not checked for NaN or infinity.
  def add(clusterPoint, weight)
    0.upto(@values.length - 1) do |i|
      @values[i] = (@values[i] * (1 - weight)) + (clusterPoint.values[i] * weight)
    end
  end

  # Similar to add, but subtract: undoes an #add of +clusterPoint+ made with the same weight.
  # A weight of 1 divides by zero and yields NaN or infinite values; the results are not validated.
  def sub(clusterPoint, weight)
    0.upto(@values.length - 1) do |i|
      @values[i] = (@values[i] - (clusterPoint.values[i] * weight)) / (1 - weight)
    end
  end

  # Return up to +n+ terms with the largest values, largest first. Terms with
  # equal values keep their original relative order.
  def get_max_terms(n = 3)
    ranked = (0...@terms.length).sort do |i, j|
      order = @values[j] <=> @values[i]
      # Fall back to position on ties (or incomparable values) so that the
      # ordering is stable and tied terms are not dropped.
      (order.nil? || order.zero?) ? (i <=> j) : order
    end

    ranked.first(n).map { |i| @terms[i] }
  end

  # Return the value stored for +term+, or nil when the term is not present.
  def get_term_value(term)
    position = @terms.index(term)

    position.nil? ? nil : @values[position]
  end

end
106
-
107
# A cluster: a running-mean center point plus a count of the points that
# have contributed to it.
class Cluster

  attr_reader :center, :size

  # Seed the cluster from +centerPoint+. The center is a clone of the seed and
  # the seed counts as the first member, so +size+ starts at 1.
  def initialize(centerPoint)
    @fixed  = false # not read anywhere in this class
    @center = centerPoint.clone
    @size   = 1
  end

  # Assign +point+ to this cluster and fold it into the center as a running
  # mean (the point is weighted by 1/size after the size is incremented).
  def +(point)
    point.cluster = self

    @size += 1

    @center.add(point, 1.0 / @size)
  end

  # Detach +point+ from this cluster and remove its contribution from the
  # center. Returns the new size.
  #
  # Removing the last member would call sub with a weight of 1.0, which
  # divides by zero and leaves NaN/infinite values in the center for good.
  # In that case the center is left untouched and the size drops to 0; the
  # next point added is then weighted 1.0 and replaces the center outright.
  def -(point)
    point.cluster = nil

    if @size > 1
      @center.sub(point, 1.0 / @size)
      @size -= 1
    else
      @size = 0
    end
  end

  # The +n+ highest-valued terms of the center point.
  def get_max_terms(n = 3)
    @center.get_max_terms(n)
  end

end
138
-
139
# Assigns a list of points to a fixed number of clusters by repeatedly moving
# every point to the cluster whose center is nearest.
class Clusterer

  attr_reader :clusters, :points, :cluster_count, :iterations, :logger
  attr_writer :clusters, :points, :cluster_count, :iterations

  # Build a clusterer over +points+.
  #
  # Starts with 2 iterations and 10 clusters, and logs through a Log4r logger
  # named 'Clusterer'. The clusters are seeded here, each from a randomly
  # chosen point (the same point may be picked more than once), so changing
  # +cluster_count+ afterwards does not re-seed them.
  def initialize(points)
    @iterations    = 2
    @cluster_count = 10
    @points        = points
    @clusters      = []
    @logger        = Log4r::Logger.new('Clusterer')

    # Randomly select a few starting documents.
    @cluster_count.times do
      seed = @points[rand(points.length)]
      @clusters << Cluster.new(seed)
    end
  end

  # Append +point+ to the list of points to be clustered.
  def +(point)
    @points.push(point)
  end

  # Yield every cluster in turn to the caller's block.
  def each_cluster(&c)
    @clusters.each do |cluster|
      yield cluster
    end
  end

  # Move every point to its nearest cluster. A point that already belongs to
  # a cluster is first removed from it; ties go to the earliest cluster.
  def assign_all_points
    @points.each do |pt|
      @logger.debug("Assigning point #{pt}.")

      nearest     = @clusters[0]
      nearest_dst = nearest.center.distance(pt)

      @clusters.each do |candidate|
        candidate_dst = candidate.center.distance(pt)

        # Only a strictly smaller distance replaces the current best.
        next unless candidate_dst < nearest_dst

        nearest     = candidate
        nearest_dst = candidate_dst
      end

      pt.cluster - pt if pt.cluster

      nearest + pt
    end
  end

  # Run +iterations+ rounds of point assignment, logging the start of each.
  def cluster
    iterations.times do |i|
      round = i + 1
      @logger.info("Starting iteration #{round} of #{iterations}.")
      assign_all_points
    end
  end
end
196
-
197
- end
@@ -1,92 +0,0 @@
1
- module SClust
2
-
3
# Filters a document term: drops terms that are missing, shorter than two
# characters, or made up only of digits and dots, and lower-cases the rest.
class DocumentTermFilter
  # Return nil if the term should be excluded. Otherwise the version of the term
  # that should be included is returned.
  #
  # term:: a String, or nil.
  #
  # The term passed in is never modified; a new lower-cased String is returned.
  def filter(term)
    return nil if term.nil? || term.size < 2
    return nil if term =~ /\A[\d\.]+\z/

    # downcase, not downcase!: the bang form returns nil when the string is
    # already lower case, which would silently exclude every such term.
    term.downcase
  end
end
19
-
20
# A filter that excludes nothing and changes nothing.
class NullFilter
  # Hand +term+ straight back to the caller, untouched.
  def filter(term)
    return term
  end
end
25
-
26
# A text document reduced to a table of weighted n-gram terms.
class Document

  attr_reader :terms, :userData, :filter

  # Misspelled reader kept so existing callers continue to work.
  alias userDate userData

  # Takes { :userData, :ngrams => [1,2,3], :filter }
  #
  # text:: the String to break into terms.
  # opts:: optional settings; the hash passed in is not modified.
  #        :userData:: arbitrary caller data, exposed through #userData.
  #        :ngrams::   list of n-gram sizes to build (:ngramrange is accepted
  #                    as a fallback key). Defaults to [1, 2, 3].
  #        :filter::   object whose filter(term) returns the term to record or
  #                    nil to skip it. Defaults to a DocumentTermFilter.
  def initialize(text, opts = {})
    @text     = text
    @userData = opts[:userData]
    @filter   = opts[:filter] || DocumentTermFilter.new

    ngrams = opts[:ngrams] || opts[:ngramrange] || [1, 2, 3]

    # Splitting text that starts with a delimiter yields a leading empty
    # string; drop it so it cannot become part of an n-gram.
    word_arr = text.split(/[ ,\.\t!\?\(\)\{\}\[\]\t\r\n]+/m).reject { |word| word.empty? }

    @terms = Hash.new(0)

    # Build a set of n-grams from our requested ngram range.
    ngrams.each do |n|
      next if n < 1

      # The last n-gram of size n starts at index length - n.
      0.upto(word_arr.length - n) do |j|
        term = @filter.filter(word_arr[j, n].join(' '))

        @terms[term] += 1.0 if term
      end
    end

    # Scale every count by the number of distinct terms.
    # NOTE(review): this divides by the distinct-term count, not the total
    # number of n-grams built -- confirm that is the intended normalization.
    distinct = @terms.length
    @terms.each_key { |k| @terms[k] /= distinct }
  end

  # The weight recorded for +term+; 0 when the document does not contain it.
  def term_frequency(term)
    @terms[term]
  end

  alias tf term_frequency

  # Yield each distinct term of the document to the caller's block.
  def each_term(&call)
    terms.each_key { |k| yield k }
  end

  # True when +term+ was recorded for this document.
  def has_term?(term)
    @terms.has_key?(term)
  end

end
91
-
92
- end
@@ -1,39 +0,0 @@
1
- require 'sclust/doc'
2
- require 'sclust/doccol'
3
- require 'sclust/cluster'
4
-
5
- module SClust
6
-
7
# A Clusterer whose points are built from the documents of a document
# collection: one ClusterPoint per document, spanning every term of the
# collection.
class DocumentClusterer < Clusterer

  # documentCollection:: must respond to terms (a hash keyed by term),
  #                      doclist (the documents) and idf(term).
  def initialize(documentCollection)

    # Every term of the collection, in sorted order. All points use this
    # same ordering, so their value lists line up index by index.
    term_list  = documentCollection.terms.keys.sort
    point_list = []

    documentCollection.doclist.each do |doc|

      # One value per collection term for this document.
      # NOTE(review): the weight is tf minus idf, not the usual product --
      # confirm against the collection's idf definition.
      doc_term_values = term_list.map do |term|
        doc.tf(term) - documentCollection.idf(term)
      end

      # Each point gets its own copy of the term list.
      point_list << ClusterPoint.new(term_list.dup, doc_term_values, doc)
    end

    super(point_list)

  end

end
38
-
39
- end