sclust 1.0.0 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/sclust/kmean/cluster.rb +294 -0
- data/lib/sclust/kmean/doccluster.rb +83 -0
- data/lib/sclust/lda/lda.rb +243 -0
- data/lib/sclust/lda/lda2.rb +328 -0
- data/lib/sclust/util/doc.rb +134 -0
- data/lib/sclust/util/doccol.rb +187 -0
- data/lib/sclust/util/filters.rb +210 -0
- data/lib/sclust/util/rss.rb +96 -0
- data/lib/sclust/util/sparse_vector.rb +96 -0
- data/lib/sclust/util/stopwords.rb +1149 -0
- data/lib/sclust/util/weightedmovingaverage.rb +25 -0
- data/lib/sclust/util/word.rb +53 -0
- data/tests/clustertest.rb +56 -29
- data/tests/filters_test.rb +48 -0
- data/tests/ldatest.rb +75 -0
- data/tests/sparse_vector_test.rb +61 -0
- data/tests/test001.rb +49 -19
- metadata +74 -40
- data/lib/sclust/cluster.rb +0 -197
- data/lib/sclust/doc.rb +0 -92
- data/lib/sclust/doccluster.rb +0 -39
- data/lib/sclust/doccol.rb +0 -75
metadata
CHANGED
@@ -1,38 +1,58 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sclust
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
|
7
|
+
- Sam Baskinger
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-10-02 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: log4r
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.0.5
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: mechanize
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.0.0
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: stemmer
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.0.1
|
44
|
+
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: nokogiri
|
47
|
+
type: :runtime
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.4.1
|
54
|
+
version:
|
55
|
+
description: A k-mean and LDA text clustering library for ruby.
|
36
56
|
email: basking2@rubyforge.org.com
|
37
57
|
executables: []
|
38
58
|
|
@@ -41,10 +61,18 @@ extensions: []
|
|
41
61
|
extra_rdoc_files: []
|
42
62
|
|
43
63
|
files:
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
64
|
+
- lib/sclust/kmean/doccluster.rb
|
65
|
+
- lib/sclust/kmean/cluster.rb
|
66
|
+
- lib/sclust/lda/lda2.rb
|
67
|
+
- lib/sclust/lda/lda.rb
|
68
|
+
- lib/sclust/util/weightedmovingaverage.rb
|
69
|
+
- lib/sclust/util/doc.rb
|
70
|
+
- lib/sclust/util/sparse_vector.rb
|
71
|
+
- lib/sclust/util/rss.rb
|
72
|
+
- lib/sclust/util/word.rb
|
73
|
+
- lib/sclust/util/filters.rb
|
74
|
+
- lib/sclust/util/doccol.rb
|
75
|
+
- lib/sclust/util/stopwords.rb
|
48
76
|
has_rdoc: true
|
49
77
|
homepage: http://sclust.rubyforge.org
|
50
78
|
licenses: []
|
@@ -53,18 +81,18 @@ post_install_message:
|
|
53
81
|
rdoc_options: []
|
54
82
|
|
55
83
|
require_paths:
|
56
|
-
|
84
|
+
- lib
|
57
85
|
required_ruby_version: !ruby/object:Gem::Requirement
|
58
86
|
requirements:
|
59
|
-
|
60
|
-
|
61
|
-
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 1.6.8
|
62
90
|
version:
|
63
91
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
64
92
|
requirements:
|
65
|
-
|
66
|
-
|
67
|
-
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: "0"
|
68
96
|
version:
|
69
97
|
requirements: []
|
70
98
|
|
@@ -72,9 +100,15 @@ rubyforge_project: http://sclust.rubyforge.org/
|
|
72
100
|
rubygems_version: 1.3.5
|
73
101
|
signing_key:
|
74
102
|
specification_version: 3
|
75
|
-
summary: k-mean clustering.
|
103
|
+
summary: k-mean/lda clustering.
|
76
104
|
test_files:
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
105
|
+
- tests/filters_test.rb
|
106
|
+
- tests/test001.rb
|
107
|
+
- tests/ldatest.rb
|
108
|
+
- tests/clustertest.rb
|
109
|
+
- tests/sparse_vector_test.rb
|
110
|
+
- tests/filters_test.rb
|
111
|
+
- tests/test001.rb
|
112
|
+
- tests/ldatest.rb
|
113
|
+
- tests/clustertest.rb
|
114
|
+
- tests/sparse_vector_test.rb
|
data/lib/sclust/cluster.rb
DELETED
@@ -1,197 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'log4r'
|
3
|
-
|
4
|
-
module SClust
|
5
|
-
|
6
|
-
class CosineDistance
|
7
|
-
|
8
|
-
# Given two vectors, compute the distance
|
9
|
-
def self.distance(a,b)
|
10
|
-
|
11
|
-
acc1 = 0.0
|
12
|
-
acc2 = 0.0
|
13
|
-
acc3 = 0.0
|
14
|
-
|
15
|
-
0.upto(a.length-1) do |i|
|
16
|
-
acc1 += a[i]*b[i]
|
17
|
-
acc2 *= a[i]*a[i]
|
18
|
-
acc3 *= b[i]*b[i]
|
19
|
-
end
|
20
|
-
|
21
|
-
1 - ( acc1 / (Math.sqrt(acc2) * Math.sqrt(acc3)) )
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
class ClusterPoint
|
26
|
-
|
27
|
-
attr_reader :terms, :values, :cluster, :source_object
|
28
|
-
attr_writer :cluster, :source_object
|
29
|
-
|
30
|
-
# Initialize the ClusterPoint with a list of terms (labels, objects, whatever) and numeric values.
|
31
|
-
def initialize(terms, values, source_object = nil)
|
32
|
-
@terms = terms
|
33
|
-
@values = values
|
34
|
-
@cluster = nil
|
35
|
-
@source_object = source_object
|
36
|
-
end
|
37
|
-
|
38
|
-
def distance(clusterPoint)
|
39
|
-
CosineDistance.distance(@values, clusterPoint.values)
|
40
|
-
end
|
41
|
-
|
42
|
-
# Add each item in the cluster point to this cluster point adjusting the values per the given weight.
|
43
|
-
# Weight is a value from 0.0 - 1.0, inclusive. A value of 1 means that this clusterPoint is 100% assigned to
|
44
|
-
# this cluster point while a weight value of 0 will have no effect.
|
45
|
-
def add(clusterPoint, weight)
|
46
|
-
0.upto(@values.length-1) { |i| @values[i] = ( @values[i] * (1-weight)) + (clusterPoint.values[i] * weight) }
|
47
|
-
|
48
|
-
# Validation code
|
49
|
-
#0.upto(@values.length-1) do |i|
|
50
|
-
# if ( @values[i].nan? || ! @values[i].finite? )
|
51
|
-
# throw Exception.new("Cluster has invalid number #{@values[i]}")
|
52
|
-
# end
|
53
|
-
#end
|
54
|
-
end
|
55
|
-
|
56
|
-
|
57
|
-
# Similar to add, but subtract.
|
58
|
-
def sub(clusterPoint, weight)
|
59
|
-
0.upto(@values.length-1) { |i| @values[i] = ( @values[i] - (clusterPoint.values[i] * weight) ) / (1 - weight) }
|
60
|
-
|
61
|
-
# Validation code
|
62
|
-
#0.upto(@values.length-1) do |i|
|
63
|
-
# if ( @values[i].nan? || ! @values[i].finite? )
|
64
|
-
# throw Exception.new("Cluster has invalid number #{@values[i]} w:#{weight} and #{clusterPoint.values[i]}")
|
65
|
-
# end
|
66
|
-
#end
|
67
|
-
end
|
68
|
-
|
69
|
-
def get_max_terms(n=3)
|
70
|
-
|
71
|
-
values = {}
|
72
|
-
|
73
|
-
@terms.length.times do |i|
|
74
|
-
t = @terms[i]
|
75
|
-
v = @values[i]
|
76
|
-
values[v] = [] unless values.has_key?(v)
|
77
|
-
values[v] << t
|
78
|
-
end
|
79
|
-
|
80
|
-
vlist = values.keys.sort { |x,y| ( x > y ) ? -1 : 1 }
|
81
|
-
|
82
|
-
result = []
|
83
|
-
|
84
|
-
n = vlist.length if ( n > vlist.length )
|
85
|
-
|
86
|
-
n.times { |i| result += values[vlist[i]] }
|
87
|
-
|
88
|
-
result.slice(0,n)
|
89
|
-
|
90
|
-
end
|
91
|
-
|
92
|
-
def get_term_value(term)
|
93
|
-
i=0
|
94
|
-
|
95
|
-
catch(:found) do
|
96
|
-
@terms.each do |t|
|
97
|
-
throw :found if ( t == term )
|
98
|
-
i+=1
|
99
|
-
end
|
100
|
-
end
|
101
|
-
|
102
|
-
@values[i]
|
103
|
-
end
|
104
|
-
|
105
|
-
end
|
106
|
-
|
107
|
-
class Cluster
|
108
|
-
|
109
|
-
attr_reader :center, :size
|
110
|
-
|
111
|
-
def initialize(centerPoint)
|
112
|
-
@fixed = false
|
113
|
-
@center = centerPoint.clone
|
114
|
-
@size = 1
|
115
|
-
end
|
116
|
-
|
117
|
-
def +(point)
|
118
|
-
point.cluster = self
|
119
|
-
|
120
|
-
@size+=1
|
121
|
-
|
122
|
-
@center.add(point, 1.0/@size.to_f)
|
123
|
-
end
|
124
|
-
|
125
|
-
def -(point)
|
126
|
-
point.cluster = nil
|
127
|
-
|
128
|
-
@center.sub(point, 1.0/@size.to_f)
|
129
|
-
|
130
|
-
@size-=1
|
131
|
-
end
|
132
|
-
|
133
|
-
def get_max_terms(n=3)
|
134
|
-
@center.get_max_terms(n)
|
135
|
-
end
|
136
|
-
|
137
|
-
end
|
138
|
-
|
139
|
-
class Clusterer
|
140
|
-
|
141
|
-
attr_reader :clusters, :points, :cluster_count, :iterations, :logger
|
142
|
-
attr_writer :clusters, :points, :cluster_count, :iterations
|
143
|
-
|
144
|
-
# Optionally takes a notifier.
|
145
|
-
def initialize(points)
|
146
|
-
@iterations = 2
|
147
|
-
@cluster_count = 10
|
148
|
-
@points = points
|
149
|
-
@clusters = []
|
150
|
-
@logger = Log4r::Logger.new('Clusterer')
|
151
|
-
|
152
|
-
# Randomly select a few starting documents.
|
153
|
-
@cluster_count.times { @clusters << Cluster.new(@points[rand(points.length)]) }
|
154
|
-
end
|
155
|
-
|
156
|
-
def +(point)
|
157
|
-
@points << point
|
158
|
-
end
|
159
|
-
|
160
|
-
def each_cluster(&c)
|
161
|
-
@clusters.each { |cluster| yield cluster }
|
162
|
-
end
|
163
|
-
|
164
|
-
def assign_all_points
|
165
|
-
|
166
|
-
@points.each do |pt|
|
167
|
-
|
168
|
-
@logger.debug("Assigning point #{pt}.")
|
169
|
-
|
170
|
-
min_cluster = @clusters[0]
|
171
|
-
min_dst = min_cluster.center.distance(pt)
|
172
|
-
|
173
|
-
@clusters.each do |cluster|
|
174
|
-
|
175
|
-
tmp_distance = cluster.center.distance(pt)
|
176
|
-
|
177
|
-
if ( tmp_distance < min_dst )
|
178
|
-
min_cluster = cluster
|
179
|
-
min_dst = tmp_distance
|
180
|
-
end
|
181
|
-
end
|
182
|
-
|
183
|
-
pt.cluster - pt if pt.cluster
|
184
|
-
|
185
|
-
min_cluster + pt
|
186
|
-
end
|
187
|
-
end
|
188
|
-
|
189
|
-
def cluster
|
190
|
-
iterations.times do |i|
|
191
|
-
@logger.info("Starting iteration #{i+1} of #{iterations}.")
|
192
|
-
assign_all_points
|
193
|
-
end
|
194
|
-
end
|
195
|
-
end
|
196
|
-
|
197
|
-
end
|
data/lib/sclust/doc.rb
DELETED
@@ -1,92 +0,0 @@
|
|
1
|
-
module SClust
|
2
|
-
|
3
|
-
# Filters a document term
|
4
|
-
class DocumentTermFilter
|
5
|
-
# Return nil if the term should be excluded. Otherwise the version of the term
|
6
|
-
# that should be included is returned.
|
7
|
-
def filter(term)
|
8
|
-
if ( term.nil? )
|
9
|
-
nil
|
10
|
-
elsif (term.size < 2)
|
11
|
-
nil
|
12
|
-
elsif ( term =~ /^[\d\.]+$/ )
|
13
|
-
nil
|
14
|
-
else
|
15
|
-
term.downcase!
|
16
|
-
end
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
class NullFilter
|
21
|
-
def filter(term)
|
22
|
-
term
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
class Document
|
27
|
-
|
28
|
-
attr_reader :terms, :userDate, :filter
|
29
|
-
|
30
|
-
# Takes { :userData, :ngrams => [1,2,3], :filter }
|
31
|
-
def initialize(text, opts={})
|
32
|
-
|
33
|
-
@text = text
|
34
|
-
@userData = opts[:userData]
|
35
|
-
|
36
|
-
opts[:ngramrange] ||= [ 1, 2, 3 ]
|
37
|
-
opts[:filter] ||= DocumentTermFilter.new()
|
38
|
-
|
39
|
-
word_arr = text.split(/[ ,\.\t!\?\(\)\{\}\[\]\t\r\n]+/m)
|
40
|
-
|
41
|
-
@terms = Hash.new(0)
|
42
|
-
|
43
|
-
# Array of counts of grams built.
|
44
|
-
builtGramCounts = []
|
45
|
-
|
46
|
-
# Build a set of n-grams from our requested ngram range.
|
47
|
-
opts[:ngrams].each do |n|
|
48
|
-
|
49
|
-
builtGramCounts[n] = 0
|
50
|
-
|
51
|
-
# For each word in our list...
|
52
|
-
0.upto(word_arr.length-1) do |j|
|
53
|
-
|
54
|
-
if ( n + j < word_arr.length )
|
55
|
-
|
56
|
-
term = word_arr[j]
|
57
|
-
|
58
|
-
(n-1).times { |ngram| term += " #{word_arr[j+ngram+1]}" }
|
59
|
-
|
60
|
-
end
|
61
|
-
|
62
|
-
term = opts[:filter].filter(term)
|
63
|
-
|
64
|
-
@terms[term] += 1.0 if term
|
65
|
-
|
66
|
-
builtGramCounts[n] += 1
|
67
|
-
|
68
|
-
end
|
69
|
-
|
70
|
-
end
|
71
|
-
|
72
|
-
@terms.each { |k,v| @terms[k] /= @terms.length }
|
73
|
-
|
74
|
-
end
|
75
|
-
|
76
|
-
def term_frequency(term)
|
77
|
-
@terms[term]
|
78
|
-
end
|
79
|
-
|
80
|
-
alias tf term_frequency
|
81
|
-
|
82
|
-
def each_term(&call)
|
83
|
-
terms.each_key { |k| yield k }
|
84
|
-
end
|
85
|
-
|
86
|
-
def has_term?(term)
|
87
|
-
@terms.has_key?(term)
|
88
|
-
end
|
89
|
-
|
90
|
-
end
|
91
|
-
|
92
|
-
end
|
data/lib/sclust/doccluster.rb
DELETED
@@ -1,39 +0,0 @@
|
|
1
|
-
require 'sclust/doc'
|
2
|
-
require 'sclust/doccol'
|
3
|
-
require 'sclust/cluster'
|
4
|
-
|
5
|
-
module SClust
|
6
|
-
|
7
|
-
# A document clusterer that overrides the + operator
|
8
|
-
# to allow for adding Document objects.
|
9
|
-
class DocumentClusterer < Clusterer
|
10
|
-
|
11
|
-
def initialize(documentCollection)
|
12
|
-
|
13
|
-
# List of all terms
|
14
|
-
term_list = documentCollection.terms.keys.sort
|
15
|
-
point_list = []
|
16
|
-
|
17
|
-
documentCollection.doclist.each do |doc|
|
18
|
-
|
19
|
-
doc_terms = [] # Sorted list of terms.
|
20
|
-
doc_term_values = [] # Corosponding values.
|
21
|
-
|
22
|
-
|
23
|
-
# Build a BIG term vector list for this document.
|
24
|
-
term_list.each do |term|
|
25
|
-
doc_terms << term
|
26
|
-
doc_term_values << doc.tf(term) - documentCollection.idf(term)
|
27
|
-
end
|
28
|
-
|
29
|
-
# def initialize(terms, values, source_object = nil)
|
30
|
-
point_list << ClusterPoint.new(doc_terms, doc_term_values, doc)
|
31
|
-
end
|
32
|
-
|
33
|
-
super(point_list)
|
34
|
-
|
35
|
-
end
|
36
|
-
|
37
|
-
end
|
38
|
-
|
39
|
-
end
|