sclust 1.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata CHANGED
@@ -1,38 +1,58 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sclust
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 2.1.0
5
5
  platform: ruby
6
6
  authors:
7
- - Sam Baskinger
7
+ - Sam Baskinger
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-12-01 00:00:00 -06:00
12
+ date: 2010-10-02 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
- - !ruby/object:Gem::Dependency
16
- name: log4r
17
- type: :runtime
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
20
- requirements:
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: 1.0.5
24
- version:
25
- - !ruby/object:Gem::Dependency
26
- name: mechanize
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: 0.9.3
34
- version:
35
- description: A k-mean text clustering library for ruby.
15
+ - !ruby/object:Gem::Dependency
16
+ name: log4r
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.0.5
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: mechanize
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 1.0.0
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: stemmer
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 1.0.1
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: nokogiri
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 1.4.1
54
+ version:
55
+ description: A k-mean and LDA text clustering library for ruby.
36
56
  email: basking2@rubyforge.org.com
37
57
  executables: []
38
58
 
@@ -41,10 +61,18 @@ extensions: []
41
61
  extra_rdoc_files: []
42
62
 
43
63
  files:
44
- - lib/sclust/cluster.rb
45
- - lib/sclust/doc.rb
46
- - lib/sclust/doccluster.rb
47
- - lib/sclust/doccol.rb
64
+ - lib/sclust/kmean/doccluster.rb
65
+ - lib/sclust/kmean/cluster.rb
66
+ - lib/sclust/lda/lda2.rb
67
+ - lib/sclust/lda/lda.rb
68
+ - lib/sclust/util/weightedmovingaverage.rb
69
+ - lib/sclust/util/doc.rb
70
+ - lib/sclust/util/sparse_vector.rb
71
+ - lib/sclust/util/rss.rb
72
+ - lib/sclust/util/word.rb
73
+ - lib/sclust/util/filters.rb
74
+ - lib/sclust/util/doccol.rb
75
+ - lib/sclust/util/stopwords.rb
48
76
  has_rdoc: true
49
77
  homepage: http://sclust.rubyforge.org
50
78
  licenses: []
@@ -53,18 +81,18 @@ post_install_message:
53
81
  rdoc_options: []
54
82
 
55
83
  require_paths:
56
- - lib
84
+ - lib
57
85
  required_ruby_version: !ruby/object:Gem::Requirement
58
86
  requirements:
59
- - - ">="
60
- - !ruby/object:Gem::Version
61
- version: 1.6.8
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 1.6.8
62
90
  version:
63
91
  required_rubygems_version: !ruby/object:Gem::Requirement
64
92
  requirements:
65
- - - ">="
66
- - !ruby/object:Gem::Version
67
- version: "0"
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: "0"
68
96
  version:
69
97
  requirements: []
70
98
 
@@ -72,9 +100,15 @@ rubyforge_project: http://sclust.rubyforge.org/
72
100
  rubygems_version: 1.3.5
73
101
  signing_key:
74
102
  specification_version: 3
75
- summary: k-mean clustering.
103
+ summary: k-mean/lda clustering.
76
104
  test_files:
77
- - tests/clustertest.rb
78
- - tests/test001.rb
79
- - tests/clustertest.rb
80
- - tests/test001.rb
105
+ - tests/filters_test.rb
106
+ - tests/test001.rb
107
+ - tests/ldatest.rb
108
+ - tests/clustertest.rb
109
+ - tests/sparse_vector_test.rb
110
+ - tests/filters_test.rb
111
+ - tests/test001.rb
112
+ - tests/ldatest.rb
113
+ - tests/clustertest.rb
114
+ - tests/sparse_vector_test.rb
@@ -1,197 +0,0 @@
1
- require 'rubygems'
2
- require 'log4r'
3
-
4
- module SClust
5
-
6
- class CosineDistance
7
-
8
- # Given two vectors, compute the distance
9
- def self.distance(a,b)
10
-
11
- acc1 = 0.0
12
- acc2 = 0.0
13
- acc3 = 0.0
14
-
15
- 0.upto(a.length-1) do |i|
16
- acc1 += a[i]*b[i]
17
- acc2 *= a[i]*a[i]
18
- acc3 *= b[i]*b[i]
19
- end
20
-
21
- 1 - ( acc1 / (Math.sqrt(acc2) * Math.sqrt(acc3)) )
22
- end
23
- end
24
-
25
- class ClusterPoint
26
-
27
- attr_reader :terms, :values, :cluster, :source_object
28
- attr_writer :cluster, :source_object
29
-
30
- # Initialize the ClusterPoint with a list of terms (labels, objects, whatever) and numeric values.
31
- def initialize(terms, values, source_object = nil)
32
- @terms = terms
33
- @values = values
34
- @cluster = nil
35
- @source_object = source_object
36
- end
37
-
38
- def distance(clusterPoint)
39
- CosineDistance.distance(@values, clusterPoint.values)
40
- end
41
-
42
- # Add each item in the cluster point to this cluster point adjusting the values per the given weight.
43
- # Weght is a value from 0.0 - 1.0, inclusive. A value of 1 means that this clusterPoint is 100% assigned to
44
- # this cluster point while a weight value of 0 will have no effect.
45
- def add(clusterPoint, weight)
46
- 0.upto(@values.length-1) { |i| @values[i] = ( @values[i] * (1-weight)) + (clusterPoint.values[i] * weight) }
47
-
48
- # Validation code
49
- #0.upto(@values.length-1) do |i|
50
- # if ( @values[i].nan? || ! @values[i].finite? )
51
- # throw Exception.new("Cluster has invalid number #{@values[i]}")
52
- # end
53
- #end
54
- end
55
-
56
-
57
- # Similar to add, but subtract.
58
- def sub(clusterPoint, weight)
59
- 0.upto(@values.length-1) { |i| @values[i] = ( @values[i] - (clusterPoint.values[i] * weight) ) / (1 - weight) }
60
-
61
- # Validation code
62
- #0.upto(@values.length-1) do |i|
63
- # if ( @values[i].nan? || ! @values[i].finite? )
64
- # throw Exception.new("Cluster has invalid number #{@values[i]} w:#{weight} and #{clusterPoint.values[i]}")
65
- # end
66
- #end
67
- end
68
-
69
- def get_max_terms(n=3)
70
-
71
- values = {}
72
-
73
- @terms.length.times do |i|
74
- t = @terms[i]
75
- v = @values[i]
76
- values[v] = [] unless values.has_key?(v)
77
- values[v] << t
78
- end
79
-
80
- vlist = values.keys.sort { |x,y| ( x > y ) ? -1 : 1 }
81
-
82
- result = []
83
-
84
- n = vlist.length if ( n > vlist.length )
85
-
86
- n.times { |i| result += values[vlist[i]] }
87
-
88
- result.slice(0,n)
89
-
90
- end
91
-
92
- def get_term_value(term)
93
- i=0
94
-
95
- catch(:found) do
96
- @terms.each do |t|
97
- throw :found if ( t == term )
98
- i+=1
99
- end
100
- end
101
-
102
- @values[i]
103
- end
104
-
105
- end
106
-
107
- class Cluster
108
-
109
- attr_reader :center, :size
110
-
111
- def initialize(centerPoint)
112
- @fixed = false
113
- @center = centerPoint.clone
114
- @size = 1
115
- end
116
-
117
- def +(point)
118
- point.cluster = self
119
-
120
- @size+=1
121
-
122
- @center.add(point, 1.0/@size.to_f)
123
- end
124
-
125
- def -(point)
126
- point.cluster = nil
127
-
128
- @center.sub(point, 1.0/@size.to_f)
129
-
130
- @size-=1
131
- end
132
-
133
- def get_max_terms(n=3)
134
- @center.get_max_terms(n)
135
- end
136
-
137
- end
138
-
139
- class Clusterer
140
-
141
- attr_reader :clusters, :points, :cluster_count, :iterations, :logger
142
- attr_writer :clusters, :points, :cluster_count, :iterations
143
-
144
- # Optionally takes a notifier.
145
- def initialize(points)
146
- @iterations = 2
147
- @cluster_count = 10
148
- @points = points
149
- @clusters = []
150
- @logger = Log4r::Logger.new('Clusterer')
151
-
152
- # Randomly select a few starting documents.
153
- @cluster_count.times { @clusters << Cluster.new(@points[rand(points.length)]) }
154
- end
155
-
156
- def +(point)
157
- @points << point
158
- end
159
-
160
- def each_cluster(&c)
161
- @clusters.each { |cluster| yield cluster }
162
- end
163
-
164
- def assign_all_points
165
-
166
- @points.each do |pt|
167
-
168
- @logger.debug("Assigning point #{pt}.")
169
-
170
- min_cluster = @clusters[0]
171
- min_dst = min_cluster.center.distance(pt)
172
-
173
- @clusters.each do |cluster|
174
-
175
- tmp_distance = cluster.center.distance(pt)
176
-
177
- if ( tmp_distance < min_dst )
178
- min_cluster = cluster
179
- min_dst = tmp_distance
180
- end
181
- end
182
-
183
- pt.cluster - pt if pt.cluster
184
-
185
- min_cluster + pt
186
- end
187
- end
188
-
189
- def cluster
190
- iterations.times do |i|
191
- @logger.info("Starting iteration #{i+1} of #{iterations}.")
192
- assign_all_points
193
- end
194
- end
195
- end
196
-
197
- end
@@ -1,92 +0,0 @@
1
- module SClust
2
-
3
- # Filters a document term
4
- class DocumentTermFilter
5
- # Return nil if the term should be excluded. Otherwise the version of the term
6
- # that should be included is returned.
7
- def filter(term)
8
- if ( term.nil? )
9
- nil
10
- elsif (term.size < 2)
11
- nil
12
- elsif ( term =~ /^[\d\.]+$/ )
13
- nil
14
- else
15
- term.downcase!
16
- end
17
- end
18
- end
19
-
20
- class NullFilter
21
- def filter(term)
22
- term
23
- end
24
- end
25
-
26
- class Document
27
-
28
- attr_reader :terms, :userDate, :filter
29
-
30
- # Takes { :userData, :ngrams => [1,2,3], :filter }
31
- def initialize(text, opts={})
32
-
33
- @text = text
34
- @userData = opts[:userData]
35
-
36
- opts[:ngramrange] ||= [ 1, 2, 3 ]
37
- opts[:filter] ||= DocumentTermFilter.new()
38
-
39
- word_arr = text.split(/[ ,\.\t!\?\(\)\{\}\[\]\t\r\n]+/m)
40
-
41
- @terms = Hash.new(0)
42
-
43
- # Array of counts of grams built.
44
- builtGramCounts = []
45
-
46
- # Build a set of n-grams from our requested ngram range.
47
- opts[:ngrams].each do |n|
48
-
49
- builtGramCounts[n] = 0
50
-
51
- # For each word in our list...
52
- 0.upto(word_arr.length-1) do |j|
53
-
54
- if ( n + j < word_arr.length )
55
-
56
- term = word_arr[j]
57
-
58
- (n-1).times { |ngram| term += " #{word_arr[j+ngram+1]}" }
59
-
60
- end
61
-
62
- term = opts[:filter].filter(term)
63
-
64
- @terms[term] += 1.0 if term
65
-
66
- builtGramCounts[n] += 1
67
-
68
- end
69
-
70
- end
71
-
72
- @terms.each { |k,v| @terms[k] /= @terms.length }
73
-
74
- end
75
-
76
- def term_frequency(term)
77
- @terms[term]
78
- end
79
-
80
- alias tf term_frequency
81
-
82
- def each_term(&call)
83
- terms.each_key { |k| yield k }
84
- end
85
-
86
- def has_term?(term)
87
- @terms.has_key?(term)
88
- end
89
-
90
- end
91
-
92
- end
@@ -1,39 +0,0 @@
1
- require 'sclust/doc'
2
- require 'sclust/doccol'
3
- require 'sclust/cluster'
4
-
5
- module SClust
6
-
7
- # A document clusterer that overrides the + operator
8
- # to allow for adding Document objects.
9
- class DocumentClusterer < Clusterer
10
-
11
- def initialize(documentCollection)
12
-
13
- # List of all terms
14
- term_list = documentCollection.terms.keys.sort
15
- point_list = []
16
-
17
- documentCollection.doclist.each do |doc|
18
-
19
- doc_terms = [] # Sorted list of terms.
20
- doc_term_values = [] # Corosponding values.
21
-
22
-
23
- # Buid a BIG term vector list for this document.
24
- term_list.each do |term|
25
- doc_terms << term
26
- doc_term_values << doc.tf(term) - documentCollection.idf(term)
27
- end
28
-
29
- # def initialize(terms, values, source_object = nil)
30
- point_list << ClusterPoint.new(doc_terms, doc_term_values, doc)
31
- end
32
-
33
- super(point_list)
34
-
35
- end
36
-
37
- end
38
-
39
- end