sclust 1.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/sclust/kmean/cluster.rb +294 -0
- data/lib/sclust/kmean/doccluster.rb +83 -0
- data/lib/sclust/lda/lda.rb +243 -0
- data/lib/sclust/lda/lda2.rb +328 -0
- data/lib/sclust/util/doc.rb +134 -0
- data/lib/sclust/util/doccol.rb +187 -0
- data/lib/sclust/util/filters.rb +210 -0
- data/lib/sclust/util/rss.rb +96 -0
- data/lib/sclust/util/sparse_vector.rb +96 -0
- data/lib/sclust/util/stopwords.rb +1149 -0
- data/lib/sclust/util/weightedmovingaverage.rb +25 -0
- data/lib/sclust/util/word.rb +53 -0
- data/tests/clustertest.rb +56 -29
- data/tests/filters_test.rb +48 -0
- data/tests/ldatest.rb +75 -0
- data/tests/sparse_vector_test.rb +61 -0
- data/tests/test001.rb +49 -19
- metadata +74 -40
- data/lib/sclust/cluster.rb +0 -197
- data/lib/sclust/doc.rb +0 -92
- data/lib/sclust/doccluster.rb +0 -39
- data/lib/sclust/doccol.rb +0 -75
metadata
CHANGED
@@ -1,38 +1,58 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sclust
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
|
7
|
+
- Sam Baskinger
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-10-02 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: log4r
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.0.5
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: mechanize
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.0.0
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: stemmer
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.0.1
|
44
|
+
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: nokogiri
|
47
|
+
type: :runtime
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.4.1
|
54
|
+
version:
|
55
|
+
description: A k-mean and LDA text clustering library for ruby.
|
36
56
|
email: basking2@rubyforge.org.com
|
37
57
|
executables: []
|
38
58
|
|
@@ -41,10 +61,18 @@ extensions: []
|
|
41
61
|
extra_rdoc_files: []
|
42
62
|
|
43
63
|
files:
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
64
|
+
- lib/sclust/kmean/doccluster.rb
|
65
|
+
- lib/sclust/kmean/cluster.rb
|
66
|
+
- lib/sclust/lda/lda2.rb
|
67
|
+
- lib/sclust/lda/lda.rb
|
68
|
+
- lib/sclust/util/weightedmovingaverage.rb
|
69
|
+
- lib/sclust/util/doc.rb
|
70
|
+
- lib/sclust/util/sparse_vector.rb
|
71
|
+
- lib/sclust/util/rss.rb
|
72
|
+
- lib/sclust/util/word.rb
|
73
|
+
- lib/sclust/util/filters.rb
|
74
|
+
- lib/sclust/util/doccol.rb
|
75
|
+
- lib/sclust/util/stopwords.rb
|
48
76
|
has_rdoc: true
|
49
77
|
homepage: http://sclust.rubyforge.org
|
50
78
|
licenses: []
|
@@ -53,18 +81,18 @@ post_install_message:
|
|
53
81
|
rdoc_options: []
|
54
82
|
|
55
83
|
require_paths:
|
56
|
-
|
84
|
+
- lib
|
57
85
|
required_ruby_version: !ruby/object:Gem::Requirement
|
58
86
|
requirements:
|
59
|
-
|
60
|
-
|
61
|
-
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 1.6.8
|
62
90
|
version:
|
63
91
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
64
92
|
requirements:
|
65
|
-
|
66
|
-
|
67
|
-
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: "0"
|
68
96
|
version:
|
69
97
|
requirements: []
|
70
98
|
|
@@ -72,9 +100,15 @@ rubyforge_project: http://sclust.rubyforge.org/
|
|
72
100
|
rubygems_version: 1.3.5
|
73
101
|
signing_key:
|
74
102
|
specification_version: 3
|
75
|
-
summary: k-mean clustering.
|
103
|
+
summary: k-mean/lda clustering.
|
76
104
|
test_files:
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
105
|
+
- tests/filters_test.rb
|
106
|
+
- tests/test001.rb
|
107
|
+
- tests/ldatest.rb
|
108
|
+
- tests/clustertest.rb
|
109
|
+
- tests/sparse_vector_test.rb
|
110
|
+
- tests/filters_test.rb
|
111
|
+
- tests/test001.rb
|
112
|
+
- tests/ldatest.rb
|
113
|
+
- tests/clustertest.rb
|
114
|
+
- tests/sparse_vector_test.rb
|
data/lib/sclust/cluster.rb
DELETED
@@ -1,197 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'log4r'
|
3
|
-
|
4
|
-
module SClust
|
5
|
-
|
6
|
-
class CosineDistance
|
7
|
-
|
8
|
-
# Given two vectors, compute the distance
|
9
|
-
def self.distance(a,b)
|
10
|
-
|
11
|
-
acc1 = 0.0
|
12
|
-
acc2 = 0.0
|
13
|
-
acc3 = 0.0
|
14
|
-
|
15
|
-
0.upto(a.length-1) do |i|
|
16
|
-
acc1 += a[i]*b[i]
|
17
|
-
acc2 *= a[i]*a[i]
|
18
|
-
acc3 *= b[i]*b[i]
|
19
|
-
end
|
20
|
-
|
21
|
-
1 - ( acc1 / (Math.sqrt(acc2) * Math.sqrt(acc3)) )
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
class ClusterPoint
|
26
|
-
|
27
|
-
attr_reader :terms, :values, :cluster, :source_object
|
28
|
-
attr_writer :cluster, :source_object
|
29
|
-
|
30
|
-
# Initialize the ClusterPoint with a list of terms (labels, objects, whatever) and numeric values.
|
31
|
-
def initialize(terms, values, source_object = nil)
|
32
|
-
@terms = terms
|
33
|
-
@values = values
|
34
|
-
@cluster = nil
|
35
|
-
@source_object = source_object
|
36
|
-
end
|
37
|
-
|
38
|
-
def distance(clusterPoint)
|
39
|
-
CosineDistance.distance(@values, clusterPoint.values)
|
40
|
-
end
|
41
|
-
|
42
|
-
# Add each item in the cluster point to this cluster point adjusting the values per the given weight.
|
43
|
-
# Weight is a value from 0.0 - 1.0, inclusive. A value of 1 means that this clusterPoint is 100% assigned to
|
44
|
-
# this cluster point while a weight value of 0 will have no effect.
|
45
|
-
def add(clusterPoint, weight)
|
46
|
-
0.upto(@values.length-1) { |i| @values[i] = ( @values[i] * (1-weight)) + (clusterPoint.values[i] * weight) }
|
47
|
-
|
48
|
-
# Validation code
|
49
|
-
#0.upto(@values.length-1) do |i|
|
50
|
-
# if ( @values[i].nan? || ! @values[i].finite? )
|
51
|
-
# throw Exception.new("Cluster has invalid number #{@values[i]}")
|
52
|
-
# end
|
53
|
-
#end
|
54
|
-
end
|
55
|
-
|
56
|
-
|
57
|
-
# Similar to add, but subtract.
|
58
|
-
def sub(clusterPoint, weight)
|
59
|
-
0.upto(@values.length-1) { |i| @values[i] = ( @values[i] - (clusterPoint.values[i] * weight) ) / (1 - weight) }
|
60
|
-
|
61
|
-
# Validation code
|
62
|
-
#0.upto(@values.length-1) do |i|
|
63
|
-
# if ( @values[i].nan? || ! @values[i].finite? )
|
64
|
-
# throw Exception.new("Cluster has invalid number #{@values[i]} w:#{weight} and #{clusterPoint.values[i]}")
|
65
|
-
# end
|
66
|
-
#end
|
67
|
-
end
|
68
|
-
|
69
|
-
def get_max_terms(n=3)
|
70
|
-
|
71
|
-
values = {}
|
72
|
-
|
73
|
-
@terms.length.times do |i|
|
74
|
-
t = @terms[i]
|
75
|
-
v = @values[i]
|
76
|
-
values[v] = [] unless values.has_key?(v)
|
77
|
-
values[v] << t
|
78
|
-
end
|
79
|
-
|
80
|
-
vlist = values.keys.sort { |x,y| ( x > y ) ? -1 : 1 }
|
81
|
-
|
82
|
-
result = []
|
83
|
-
|
84
|
-
n = vlist.length if ( n > vlist.length )
|
85
|
-
|
86
|
-
n.times { |i| result += values[vlist[i]] }
|
87
|
-
|
88
|
-
result.slice(0,n)
|
89
|
-
|
90
|
-
end
|
91
|
-
|
92
|
-
def get_term_value(term)
|
93
|
-
i=0
|
94
|
-
|
95
|
-
catch(:found) do
|
96
|
-
@terms.each do |t|
|
97
|
-
throw :found if ( t == term )
|
98
|
-
i+=1
|
99
|
-
end
|
100
|
-
end
|
101
|
-
|
102
|
-
@values[i]
|
103
|
-
end
|
104
|
-
|
105
|
-
end
|
106
|
-
|
107
|
-
class Cluster
|
108
|
-
|
109
|
-
attr_reader :center, :size
|
110
|
-
|
111
|
-
def initialize(centerPoint)
|
112
|
-
@fixed = false
|
113
|
-
@center = centerPoint.clone
|
114
|
-
@size = 1
|
115
|
-
end
|
116
|
-
|
117
|
-
def +(point)
|
118
|
-
point.cluster = self
|
119
|
-
|
120
|
-
@size+=1
|
121
|
-
|
122
|
-
@center.add(point, 1.0/@size.to_f)
|
123
|
-
end
|
124
|
-
|
125
|
-
def -(point)
|
126
|
-
point.cluster = nil
|
127
|
-
|
128
|
-
@center.sub(point, 1.0/@size.to_f)
|
129
|
-
|
130
|
-
@size-=1
|
131
|
-
end
|
132
|
-
|
133
|
-
def get_max_terms(n=3)
|
134
|
-
@center.get_max_terms(n)
|
135
|
-
end
|
136
|
-
|
137
|
-
end
|
138
|
-
|
139
|
-
class Clusterer
|
140
|
-
|
141
|
-
attr_reader :clusters, :points, :cluster_count, :iterations, :logger
|
142
|
-
attr_writer :clusters, :points, :cluster_count, :iterations
|
143
|
-
|
144
|
-
# Optionally takes a notifier.
|
145
|
-
def initialize(points)
|
146
|
-
@iterations = 2
|
147
|
-
@cluster_count = 10
|
148
|
-
@points = points
|
149
|
-
@clusters = []
|
150
|
-
@logger = Log4r::Logger.new('Clusterer')
|
151
|
-
|
152
|
-
# Randomly select a few starting documents.
|
153
|
-
@cluster_count.times { @clusters << Cluster.new(@points[rand(points.length)]) }
|
154
|
-
end
|
155
|
-
|
156
|
-
def +(point)
|
157
|
-
@points << point
|
158
|
-
end
|
159
|
-
|
160
|
-
def each_cluster(&c)
|
161
|
-
@clusters.each { |cluster| yield cluster }
|
162
|
-
end
|
163
|
-
|
164
|
-
def assign_all_points
|
165
|
-
|
166
|
-
@points.each do |pt|
|
167
|
-
|
168
|
-
@logger.debug("Assigning point #{pt}.")
|
169
|
-
|
170
|
-
min_cluster = @clusters[0]
|
171
|
-
min_dst = min_cluster.center.distance(pt)
|
172
|
-
|
173
|
-
@clusters.each do |cluster|
|
174
|
-
|
175
|
-
tmp_distance = cluster.center.distance(pt)
|
176
|
-
|
177
|
-
if ( tmp_distance < min_dst )
|
178
|
-
min_cluster = cluster
|
179
|
-
min_dst = tmp_distance
|
180
|
-
end
|
181
|
-
end
|
182
|
-
|
183
|
-
pt.cluster - pt if pt.cluster
|
184
|
-
|
185
|
-
min_cluster + pt
|
186
|
-
end
|
187
|
-
end
|
188
|
-
|
189
|
-
def cluster
|
190
|
-
iterations.times do |i|
|
191
|
-
@logger.info("Starting iteration #{i+1} of #{iterations}.")
|
192
|
-
assign_all_points
|
193
|
-
end
|
194
|
-
end
|
195
|
-
end
|
196
|
-
|
197
|
-
end
|
data/lib/sclust/doc.rb
DELETED
@@ -1,92 +0,0 @@
|
|
1
|
-
module SClust
|
2
|
-
|
3
|
-
# Filters a document term
|
4
|
-
class DocumentTermFilter
|
5
|
-
# Return nil if the term should be excluded. Otherwise the version of the term
|
6
|
-
# that should be included is returned.
|
7
|
-
def filter(term)
|
8
|
-
if ( term.nil? )
|
9
|
-
nil
|
10
|
-
elsif (term.size < 2)
|
11
|
-
nil
|
12
|
-
elsif ( term =~ /^[\d\.]+$/ )
|
13
|
-
nil
|
14
|
-
else
|
15
|
-
term.downcase!
|
16
|
-
end
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
class NullFilter
|
21
|
-
def filter(term)
|
22
|
-
term
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
class Document
|
27
|
-
|
28
|
-
attr_reader :terms, :userDate, :filter
|
29
|
-
|
30
|
-
# Takes { :userData, :ngrams => [1,2,3], :filter }
|
31
|
-
def initialize(text, opts={})
|
32
|
-
|
33
|
-
@text = text
|
34
|
-
@userData = opts[:userData]
|
35
|
-
|
36
|
-
opts[:ngramrange] ||= [ 1, 2, 3 ]
|
37
|
-
opts[:filter] ||= DocumentTermFilter.new()
|
38
|
-
|
39
|
-
word_arr = text.split(/[ ,\.\t!\?\(\)\{\}\[\]\t\r\n]+/m)
|
40
|
-
|
41
|
-
@terms = Hash.new(0)
|
42
|
-
|
43
|
-
# Array of counts of grams built.
|
44
|
-
builtGramCounts = []
|
45
|
-
|
46
|
-
# Build a set of n-grams from our requested ngram range.
|
47
|
-
opts[:ngrams].each do |n|
|
48
|
-
|
49
|
-
builtGramCounts[n] = 0
|
50
|
-
|
51
|
-
# For each word in our list...
|
52
|
-
0.upto(word_arr.length-1) do |j|
|
53
|
-
|
54
|
-
if ( n + j < word_arr.length )
|
55
|
-
|
56
|
-
term = word_arr[j]
|
57
|
-
|
58
|
-
(n-1).times { |ngram| term += " #{word_arr[j+ngram+1]}" }
|
59
|
-
|
60
|
-
end
|
61
|
-
|
62
|
-
term = opts[:filter].filter(term)
|
63
|
-
|
64
|
-
@terms[term] += 1.0 if term
|
65
|
-
|
66
|
-
builtGramCounts[n] += 1
|
67
|
-
|
68
|
-
end
|
69
|
-
|
70
|
-
end
|
71
|
-
|
72
|
-
@terms.each { |k,v| @terms[k] /= @terms.length }
|
73
|
-
|
74
|
-
end
|
75
|
-
|
76
|
-
def term_frequency(term)
|
77
|
-
@terms[term]
|
78
|
-
end
|
79
|
-
|
80
|
-
alias tf term_frequency
|
81
|
-
|
82
|
-
def each_term(&call)
|
83
|
-
terms.each_key { |k| yield k }
|
84
|
-
end
|
85
|
-
|
86
|
-
def has_term?(term)
|
87
|
-
@terms.has_key?(term)
|
88
|
-
end
|
89
|
-
|
90
|
-
end
|
91
|
-
|
92
|
-
end
|
data/lib/sclust/doccluster.rb
DELETED
@@ -1,39 +0,0 @@
|
|
1
|
-
require 'sclust/doc'
|
2
|
-
require 'sclust/doccol'
|
3
|
-
require 'sclust/cluster'
|
4
|
-
|
5
|
-
module SClust
|
6
|
-
|
7
|
-
# A document clusterer that overrides the + operator
|
8
|
-
# to allow for adding Document objects.
|
9
|
-
class DocumentClusterer < Clusterer
|
10
|
-
|
11
|
-
def initialize(documentCollection)
|
12
|
-
|
13
|
-
# List of all terms
|
14
|
-
term_list = documentCollection.terms.keys.sort
|
15
|
-
point_list = []
|
16
|
-
|
17
|
-
documentCollection.doclist.each do |doc|
|
18
|
-
|
19
|
-
doc_terms = [] # Sorted list of terms.
|
20
|
-
doc_term_values = [] # Corosponding values.
|
21
|
-
|
22
|
-
|
23
|
-
# Build a BIG term vector list for this document.
|
24
|
-
term_list.each do |term|
|
25
|
-
doc_terms << term
|
26
|
-
doc_term_values << doc.tf(term) - documentCollection.idf(term)
|
27
|
-
end
|
28
|
-
|
29
|
-
# def initialize(terms, values, source_object = nil)
|
30
|
-
point_list << ClusterPoint.new(doc_terms, doc_term_values, doc)
|
31
|
-
end
|
32
|
-
|
33
|
-
super(point_list)
|
34
|
-
|
35
|
-
end
|
36
|
-
|
37
|
-
end
|
38
|
-
|
39
|
-
end
|