sclust 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/sclust/cluster.rb +197 -0
- data/lib/sclust/doc.rb +92 -0
- data/lib/sclust/doccluster.rb +39 -0
- data/lib/sclust/doccol.rb +75 -0
- data/tests/clustertest.rb +51 -0
- data/tests/test001.rb +49 -0
- metadata +80 -0
@@ -0,0 +1,197 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'log4r'
|
3
|
+
|
4
|
+
module SClust
|
5
|
+
|
6
|
+
class CosineDistance

  # Given two numeric vectors, compute the cosine distance between them,
  # i.e. 1 - (a . b) / (|a| * |b|).
  #
  # a:: Array of numbers.
  # b:: Array of numbers, indexed in parallel with +a+ (every index of +a+ is read from +b+).
  #
  # Returns 0.0 for vectors pointing the same way, 1.0 for orthogonal vectors
  # and 2.0 for opposite vectors. When either vector has zero magnitude the
  # angle is undefined; 1.0 is returned so that callers comparing distances
  # never receive NaN.
  def self.distance(a, b)
    dot_product = 0.0
    norm_a_squared = 0.0
    norm_b_squared = 0.0

    a.each_index do |i|
      dot_product += a[i] * b[i]
      # The squared magnitudes are sums of squares. They must be accumulated
      # with +=; multiplying into a 0.0 accumulator would leave them at zero.
      norm_a_squared += a[i] * a[i]
      norm_b_squared += b[i] * b[i]
    end

    denominator = Math.sqrt(norm_a_squared) * Math.sqrt(norm_b_squared)

    return 1.0 if denominator == 0.0

    1 - (dot_product / denominator)
  end
end
|
24
|
+
|
25
|
+
class ClusterPoint

  attr_reader :terms, :values
  attr_accessor :cluster, :source_object

  # Initialize the ClusterPoint with a list of terms (labels, objects, whatever) and numeric values.
  # values[i] is the number stored for terms[i].
  #
  # terms::         Array of labels.
  # values::        Array of numbers, parallel to +terms+.
  # source_object:: Optional object this point was built from; stored as given.
  def initialize(terms, values, source_object = nil)
    @terms = terms
    @values = values
    @cluster = nil
    @source_object = source_object
  end

  # Invoked by clone/dup. add and sub rewrite @values in place, so a copy
  # needs its own array; otherwise changing the copy would silently change
  # the point it was copied from.
  def initialize_copy(other)
    super
    @values = other.values.dup
  end

  # Cosine distance between this point's values and those of clusterPoint.
  def distance(clusterPoint)
    CosineDistance.distance(@values, clusterPoint.values)
  end

  # Add each item in the cluster point to this cluster point adjusting the values per the given weight.
  # Weight is a value from 0.0 - 1.0, inclusive. A value of 1 means that this point's values are
  # replaced by those of clusterPoint, while a weight value of 0 will have no effect.
  def add(clusterPoint, weight)
    @values.each_index do |i|
      @values[i] = (@values[i] * (1 - weight)) + (clusterPoint.values[i] * weight)
    end
  end

  # Similar to add, but subtract: undoes an earlier add of clusterPoint made
  # with the same weight. The result is divided by (1 - weight), so a weight
  # of 1 divides by zero and leaves non-finite values behind.
  def sub(clusterPoint, weight)
    @values.each_index do |i|
      @values[i] = (@values[i] - (clusterPoint.values[i] * weight)) / (1 - weight)
    end
  end

  # Return up to n terms, largest value first. Terms with equal values keep
  # their original relative order, and every term counts towards n (ties do
  # not shrink the result).
  def get_max_terms(n=3)
    return [] if n < 1

    ranked = (0...@terms.length).sort do |x, y|
      by_value = @values[y] <=> @values[x]
      # <=> yields nil for incomparable values such as NaN; fall back to the
      # original position instead of raising.
      (by_value.nil? || by_value == 0) ? (x <=> y) : by_value
    end

    ranked.first(n).map { |i| @terms[i] }
  end

  # Return the value stored for the first occurrence of term, or nil when
  # the term is not present.
  def get_term_value(term)
    position = @terms.index(term)

    position.nil? ? nil : @values[position]
  end

end
|
106
|
+
|
107
|
+
# A cluster, represented by a center point that is maintained as the running
# mean of the seed point and every point added with +.
class Cluster

  attr_reader :center, :size

  # Start a cluster centered on a copy of centerPoint. The seed counts
  # towards size, so size starts at 1.
  #
  # The center receives its own values array: + and - rewrite the center's
  # values in place, and sharing the array with centerPoint would corrupt
  # the seed point itself. The center is not marked as a member of any cluster.
  def initialize(centerPoint)
    @center = ClusterPoint.new(centerPoint.terms, centerPoint.values.dup, centerPoint.source_object)
    @size = 1
  end

  # Add point to this cluster: mark the point as belonging to this cluster
  # and fold it into the center with weight 1/size (size already counting
  # the new point), which keeps the center equal to the mean.
  def +(point)
    point.cluster = self

    @size += 1

    @center.add(point, 1.0 / @size.to_f)
  end

  # Remove point from this cluster, reversing an earlier +. The weight is
  # 1/size as it was when the point was added, so the center must be updated
  # before size is decremented.
  #
  # Only call this for a point that was previously added: with size == 1 the
  # weight is 1.0 and ClusterPoint#sub divides by (1 - weight).
  def -(point)
    point.cluster = nil

    @center.sub(point, 1.0 / @size.to_f)

    @size -= 1
  end

  # The n terms with the largest values in the center point.
  def get_max_terms(n=3)
    @center.get_max_terms(n)
  end

end
|
138
|
+
|
139
|
+
# Assigns a list of points to clusters by repeatedly moving every point to
# the cluster whose center is nearest to it.
class Clusterer

  attr_accessor :clusters, :points, :cluster_count, :iterations
  attr_reader :logger

  # points::        Array of points to cluster.
  # cluster_count:: Number of clusters to seed (defaults to 10, the previously
  #                 fixed value).
  #
  # The clusters are seeded here, each from a randomly chosen point (the same
  # point may be chosen more than once). Assigning cluster_count afterwards
  # does not create or remove clusters. No clusters are seeded when points
  # is empty.
  def initialize(points, cluster_count = 10)
    @iterations = 2
    @cluster_count = cluster_count
    @points = points
    @clusters = []
    @logger = Log4r::Logger.new('Clusterer')

    # Randomly select a few starting documents.
    unless @points.empty?
      @cluster_count.times { @clusters << Cluster.new(@points[rand(@points.length)]) }
    end
  end

  # Append a point to the list of points to be clustered.
  def +(point)
    @points << point
  end

  # Yield each cluster in turn.
  def each_cluster(&c)
    @clusters.each(&c)
  end

  # Move every point into the cluster whose center is nearest to it.
  # Does nothing when there are no clusters.
  def assign_all_points
    return if @clusters.empty?

    @points.each do |pt|

      @logger.debug("Assigning point #{pt}.")

      nearest = @clusters[0]
      nearest_distance = nearest.center.distance(pt)

      # Plain < comparison: an earlier cluster wins ties, and a non-finite
      # distance never displaces the current choice.
      @clusters.each do |candidate|
        candidate_distance = candidate.center.distance(pt)

        if candidate_distance < nearest_distance
          nearest = candidate
          nearest_distance = candidate_distance
        end
      end

      # Removing a point and adding it straight back to the same cluster
      # leaves the center mathematically unchanged, so skip the round trip.
      next if nearest.equal?(pt.cluster)

      pt.cluster - pt if pt.cluster

      nearest + pt
    end
  end

  # Run +iterations+ rounds of point assignment.
  def cluster
    iterations.times do |i|
      @logger.info("Starting iteration #{i+1} of #{iterations}.")
      assign_all_points
    end
  end

end
|
196
|
+
|
197
|
+
end
|
data/lib/sclust/doc.rb
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
module SClust
|
2
|
+
|
3
|
+
# Filters a document term
|
4
|
+
class DocumentTermFilter
  # Return nil if the term should be excluded. Otherwise the version of the term
  # that should be included is returned.
  #
  # Excluded: nil, terms shorter than two characters, and terms made up only
  # of digits and periods. Every other term is returned lower-cased as a new
  # string; the argument itself is never modified.
  def filter(term)
    return nil if term.nil?
    return nil if term.size < 2
    return nil if term =~ /\A[\d\.]+\z/

    # downcase, not downcase!: the bang form returns nil when the string is
    # already lower case, which would wrongly exclude such terms.
    term.downcase
  end
end
|
19
|
+
|
20
|
+
class NullFilter
  # A filter that excludes nothing: whatever is passed in (nil included)
  # is handed back unchanged.
  def filter(term)
    term
  end
end
|
25
|
+
|
26
|
+
# A text document reduced to a table of n-gram terms and their frequencies.
class Document

  attr_reader :terms, :userData, :filter

  # Kept for callers that used the original, misspelled reader name.
  alias userDate userData

  # Takes { :userData, :ngrams => [1,2,3], :filter }
  #
  # text:: String to tokenize. It is split on runs of spaces, tabs, line
  #        breaks and the characters , . ! ? ( ) { } [ ].
  # opts:: Hash of options; it is not modified.
  #        :userData - any object, stored as given.
  #        :ngrams   - Array of n-gram sizes to build (default [1, 2, 3]);
  #                    :ngramrange is accepted as a fallback key.
  #        :filter   - object responding to filter(term), returning nil to
  #                    drop a term or the string to record instead
  #                    (default: a new DocumentTermFilter).
  #
  # After construction, terms maps each accepted term to its occurrence
  # count divided by the number of distinct terms in the document.
  def initialize(text, opts={})

    @text = text
    @userData = opts[:userData]
    @filter = opts[:filter] || DocumentTermFilter.new

    ngram_sizes = opts[:ngrams] || opts[:ngramrange] || [1, 2, 3]

    # A leading delimiter makes split produce an empty first token; drop it
    # so that empty strings never become terms.
    words = text.split(/[ ,\.\t!\?\(\)\{\}\[\]\r\n]+/).reject { |word| word.empty? }

    @terms = Hash.new(0)

    # Build a set of n-grams from our requested ngram range.
    ngram_sizes.each do |n|
      next if n < 1

      # each_cons yields every run of n consecutive words, including the
      # run that ends on the last word of the document.
      words.each_cons(n) do |gram|
        term = @filter.filter(gram.join(" "))

        @terms[term] += 1.0 if term
      end
    end

    distinct_terms = @terms.length

    @terms.each_key { |term| @terms[term] /= distinct_terms }

  end

  # Frequency recorded for term; 0 when the document does not contain it.
  def term_frequency(term)
    @terms[term]
  end

  alias tf term_frequency

  # Yield every term in this document.
  def each_term(&call)
    @terms.each_key(&call)
  end

  # True when term was recorded for this document.
  def has_term?(term)
    @terms.has_key?(term)
  end

end
|
91
|
+
|
92
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'sclust/doc'
|
2
|
+
require 'sclust/doccol'
|
3
|
+
require 'sclust/cluster'
|
4
|
+
|
5
|
+
module SClust
|
6
|
+
|
7
|
+
# A document clusterer that overrides the + operator
|
8
|
+
# to allow for adding Document objects.
|
9
|
+
class DocumentClusterer < Clusterer

  # Turn every document of documentCollection into a ClusterPoint and hand
  # the resulting list of points to Clusterer.
  #
  # All points are laid out over the same sorted list of the collection's
  # terms. The value stored for a term is the document's tf for that term
  # minus the collection's idf for it.
  #
  # NOTE(review): TF-IDF weighting is conventionally tf * idf; the
  # subtraction is reproduced from the existing code - confirm it is intended.
  def initialize(documentCollection)
    vocabulary = documentCollection.terms.keys.sort

    points = documentCollection.doclist.map do |document|
      weights = vocabulary.map do |word|
        document.tf(word) - documentCollection.idf(word)
      end

      # Each point receives its own copy of the term list.
      ClusterPoint.new(vocabulary.dup, weights, document)
    end

    super(points)
  end

end
|
38
|
+
|
39
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'log4r'
|
3
|
+
|
4
|
+
module SClust
|
5
|
+
class DocumentCollection

  # terms - a hash where the keys are the terms in the documents and the values
  # stored are the number of documents containing the term.
  attr_reader :terms

  # A list of documents
  attr_reader :doclist

  # Log4r::Logger for this document collection.
  attr_reader :logger

  def initialize()
    @logger = Log4r::Logger.new("SClust::DocumentCollection")
    @terms = Hash.new(0)
    @doclist = []
  end

  # Add a document to the collection and adjust the @terms attribute to store any new terms in the document.
  # The document is also added to the @doclist attribute.
  # Returns the collection itself.
  def +(d)

    d.each_term do |term|
      @terms[term] += 1.0
    end

    @doclist << d

    @logger.info("There are #{@doclist.size} documents and #{@terms.size} terms.")

    self
  end

  # Remove every term that occurs in fewer than min_frequency or in more than
  # max_frequency of the documents (both are fractions of the document count,
  # 0.0 - 1.0). The terms are removed from the collection and from the terms
  # hash of every document in it.
  def drop_terms(min_frequency=0.10, max_frequency=0.80)

    min_docs = @doclist.length * min_frequency
    max_docs = @doclist.length * max_frequency

    @logger.info("Analyzing #{@terms.length} terms for removal.")
    @logger.info("Upper/lower boundary are #{max_frequency}/#{min_frequency}% document frequency or #{max_docs}/#{min_docs} documents.")

    remove_list = []

    # First pass only collects; the hash is not changed while it is being
    # iterated.
    @terms.each do |term, frequency|
      if frequency < min_docs || frequency > max_docs
        @logger.info("Removing term #{term} occuring in #{frequency} documents out of #{@doclist.length}")
        remove_list << term
      end
    end

    remove_list.each { |term| @terms.delete(term) }

    @logger.info("Removed #{remove_list.length} of #{@terms.length + remove_list.length} terms. Updating #{doclist.length} documents.")

    @doclist.each do |doc|
      remove_list.each do |term|
        doc.terms.delete(term)
      end
    end
  end

  # Natural log of (number of distinct terms in the collection / number of
  # documents containing term).
  #
  # NOTE(review): the numerator is the term count, not the document count
  # that the usual IDF definition uses - confirm this is intended.
  def inverse_document_frequency(term)
    Math.log( @terms.length / @terms[term] )
  end

  alias idf inverse_document_frequency

  # Yield every term known to the collection.
  def each_term(&c)
    @terms.each_key(&c)
  end
end
|
75
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
|
3
|
+
require 'sclust/doccluster'
|
4
|
+
|
5
|
+
class ClusterTest < Test::Unit::TestCase

  # Build a collection of four unigram-only documents, using NullFilter so
  # that no term is filtered out.
  def setup
    @dc = SClust::DocumentCollection.new
    pass_through = SClust::NullFilter.new

    documents = ["a b c d d e a q a b", "a b d e a", "bob", "frank a"].map do |text|
      SClust::Document.new(text, :filter=>pass_through, :ngrams=>[1])
    end

    documents.each { |document| @dc + document }
  end

  def teardown
  end

  # Cluster the collection, then check for every cluster that the term
  # reported first by get_max_terms carries the largest value in the center.
  def test_makecluster
    clusterer = SClust::DocumentClusterer.new(@dc)

    clusterer.cluster

    clusterer.each_cluster do |cl|
      center = cl.center

      # Index of the first largest value in the center point.
      max = 0
      center.terms.each_index do |i|
        max = i if center.values[i] > center.values[max]
      end

      puts("Cluster: #{center.terms[max]} #{center.values[max]}")

      center.get_max_terms(3).each do |t|
        puts("Got Term: #{t} with value #{center.get_term_value(t)}")
      end

      assert(center.values[max] == center.get_term_value(center.get_max_terms(1)[0]), "Max value was not found.")
    end
  end

end
|
data/tests/test001.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'sclust/doc'
|
2
|
+
require 'sclust/doccol'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
|
6
|
+
class DocTests < Test::Unit::TestCase

  # Tokenizing a punctuated sentence must never produce a bare period or an
  # empty string as a term.
  def test_builddoc
    document = SClust::Document.new("hi, this is a nice doc! Yup. Oh? A very nice doc, indeed.")

    document.terms.each_key do |term|
      assert(term != ".", "Period found")
      assert(term != "", "Empty term found")
    end
  end

end
|
26
|
+
|
27
|
+
class DocCollectionTests < Test::Unit::TestCase

  # Add four documents to a collection and, if the term "a" was recorded,
  # check its document count and its idf against known values.
  def test_collectionadd
    collection = SClust::DocumentCollection.new

    documents = ["a b c d d e a q a b", "a b d e a", "bob", "frank a"].map do |text|
      SClust::Document.new(text)
    end

    documents.each { |document| collection + document }

    collection.terms.each do |term, doc_count|
      next unless term == "a"

      assert(doc_count == 3, "A appers in 3 documents out of 4.")
      assert(collection.idf("a") > 2.2, "Known value for a")
      assert(collection.idf("a") < 2.3, "Known value for a")
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sclust
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Sam Baskinger
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-12-01 00:00:00 -06:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: log4r
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.0.5
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: mechanize
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.9.3
|
34
|
+
version:
|
35
|
+
description: A k-mean text clustering library for ruby.
|
36
|
+
email: basking2@rubyforge.org.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files: []
|
42
|
+
|
43
|
+
files:
|
44
|
+
- lib/sclust/cluster.rb
|
45
|
+
- lib/sclust/doc.rb
|
46
|
+
- lib/sclust/doccluster.rb
|
47
|
+
- lib/sclust/doccol.rb
|
48
|
+
has_rdoc: true
|
49
|
+
homepage: http://sclust.rubyforge.org
|
50
|
+
licenses: []
|
51
|
+
|
52
|
+
post_install_message:
|
53
|
+
rdoc_options: []
|
54
|
+
|
55
|
+
require_paths:
|
56
|
+
- lib
|
57
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.6.8
|
62
|
+
version:
|
63
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: "0"
|
68
|
+
version:
|
69
|
+
requirements: []
|
70
|
+
|
71
|
+
rubyforge_project: http://sclust.rubyforge.org/
|
72
|
+
rubygems_version: 1.3.5
|
73
|
+
signing_key:
|
74
|
+
specification_version: 3
|
75
|
+
summary: k-mean clustering.
|
76
|
+
test_files:
|
77
|
+
- tests/clustertest.rb
|
78
|
+
- tests/test001.rb
|
79
|
+
- tests/clustertest.rb
|
80
|
+
- tests/test001.rb
|