sclust 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/sclust/cluster.rb +197 -0
- data/lib/sclust/doc.rb +92 -0
- data/lib/sclust/doccluster.rb +39 -0
- data/lib/sclust/doccol.rb +75 -0
- data/tests/clustertest.rb +51 -0
- data/tests/test001.rb +49 -0
- metadata +80 -0
@@ -0,0 +1,197 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'log4r'
|
3
|
+
|
4
|
+
module SClust
|
5
|
+
|
6
|
+
# Cosine distance between two numeric vectors: 1 - cos(angle between them).
class CosineDistance

  # Given two vectors, compute the distance.
  #
  # a, b - Arrays of numbers. Positions 0...a.length are read from both, so
  #        b must be at least as long as a.
  #
  # Returns a Float: 0.0 for vectors pointing the same way, 1.0 for
  # orthogonal vectors, up to 2.0 for opposite vectors. When either vector
  # has zero magnitude the cosine is undefined; 1.0 is returned in that case
  # so callers comparing distances never see NaN.
  def self.distance(a, b)
    dot = 0.0
    norm_a = 0.0
    norm_b = 0.0

    a.each_with_index do |x, i|
      y = b[i]
      dot += x * y
      # Sums of squares. These must be accumulated with +=; multiplying into
      # a 0.0 accumulator leaves it at 0.0 forever.
      norm_a += x * x
      norm_b += y * y
    end

    magnitude = Math.sqrt(norm_a) * Math.sqrt(norm_b)
    return 1.0 if magnitude == 0.0

    1 - (dot / magnitude)
  end
end
|
24
|
+
|
25
|
+
# A point in term space: a list of terms (labels) and a parallel list of
# numeric values, optionally tied to the object it was built from and to the
# cluster it is currently assigned to.
class ClusterPoint

  attr_reader :terms, :values
  attr_accessor :cluster, :source_object

  # Initialize the ClusterPoint with a list of terms (labels, objects, whatever) and numeric values.
  #
  # terms         - Array of labels; terms[i] names values[i].
  # values        - Array of numbers, parallel to terms.
  # source_object - optional object this point was derived from.
  def initialize(terms, values, source_object = nil)
    @terms = terms
    @values = values
    @cluster = nil
    @source_object = source_object
  end

  # Hook used by clone/dup. The values array is updated in place by add and
  # sub, so a copy needs its own array; otherwise a cluster center created
  # via clone would overwrite the values of the point it was cloned from.
  def initialize_copy(other)
    super
    @values = other.values.dup
  end

  # Cosine distance between this point's values and clusterPoint's values.
  def distance(clusterPoint)
    CosineDistance.distance(@values, clusterPoint.values)
  end

  # Add each item in the cluster point to this cluster point adjusting the values per the given weight.
  # Weight is a value from 0.0 - 1.0, inclusive. A value of 1 means that this clusterPoint is 100% assigned to
  # this cluster point while a weight value of 0 will have no effect.
  #
  # The values are modified in place.
  def add(clusterPoint, weight)
    0.upto(@values.length - 1) do |i|
      @values[i] = (@values[i] * (1 - weight)) + (clusterPoint.values[i] * weight)
    end
  end

  # Similar to add, but subtract: the inverse of add for the same weight.
  #
  # The result is divided by (1 - weight), so a weight of exactly 1 divides
  # by zero and yields NaN/Infinity values for Float inputs.
  def sub(clusterPoint, weight)
    0.upto(@values.length - 1) do |i|
      @values[i] = (@values[i] - (clusterPoint.values[i] * weight)) / (1 - weight)
    end
  end

  # Return up to n terms with the largest values, largest first. Terms with
  # equal values keep their original relative order. Fewer than n terms are
  # returned only when the point has fewer than n terms.
  def get_max_terms(n=3)
    limit = [[n, 0].max, @terms.length].min

    ranked = (0...@terms.length).sort do |i, j|
      by_value = @values[j] <=> @values[i]
      # Fall back to position for ties and for incomparable values (NaN, nil)
      # so the sort never raises and ties stay in term order.
      if by_value.nil? || by_value == 0
        i <=> j
      else
        by_value
      end
    end

    ranked.first(limit).map { |i| @terms[i] }
  end

  # Return the value stored for the first occurrence of term, or nil when
  # the term is not present.
  def get_term_value(term)
    position = @terms.index(term)
    position.nil? ? nil : @values[position]
  end

end
|
106
|
+
|
107
|
+
# A cluster: a running-average center point plus a member count.
class Cluster

  attr_reader :center, :size

  # Start a cluster whose center is a clone of centerPoint. The seed point
  # counts as one member, so size begins at 1.
  def initialize(centerPoint)
    @fixed = false
    @size = 1
    @center = centerPoint.clone
  end

  # Assign point to this cluster and fold it into the center with a weight
  # of 1/size (size taken after the point is counted).
  def +(point)
    @size += 1
    point.cluster = self

    share = 1.0 / @size.to_f
    @center.add(point, share)
  end

  # Unassign point from this cluster and remove its 1/size contribution from
  # the center (size taken before the point is uncounted).
  def -(point)
    share = 1.0 / @size.to_f
    point.cluster = nil

    @center.sub(point, share)
    @size -= 1
  end

  # The n highest-valued terms of the cluster center.
  def get_max_terms(n=3)
    @center.get_max_terms(n)
  end

end
|
138
|
+
|
139
|
+
# Assigns a list of points to clusters by repeatedly moving each point to the
# cluster whose center is nearest to it.
class Clusterer

  attr_accessor :clusters, :points, :cluster_count, :iterations
  attr_reader :logger

  # points - Array of point objects to cluster.
  #
  # Defaults to 2 iterations and 10 clusters. The clusters are seeded right
  # here, each from an independently chosen random point, so the same point
  # may seed more than one cluster.
  def initialize(points)
    @iterations = 2
    @cluster_count = 10
    @points = points
    @clusters = []
    @logger = Log4r::Logger.new('Clusterer')

    # Randomly select a few starting documents.
    @cluster_count.times do
      seed = @points[rand(points.length)]
      @clusters << Cluster.new(seed)
    end
  end

  # Append a point to the list of points to be clustered.
  def +(point)
    @points << point
  end

  # Yield every cluster in turn.
  def each_cluster(&c)
    @clusters.each do |current|
      yield current
    end
  end

  # Move every point to the cluster with the smallest center distance. On
  # equal distances the earliest cluster in the list wins.
  def assign_all_points
    @points.each do |pt|
      @logger.debug("Assigning point #{pt}.")

      nearest = @clusters[0]
      nearest_distance = nearest.center.distance(pt)

      @clusters.each do |candidate|
        candidate_distance = candidate.center.distance(pt)
        next unless candidate_distance < nearest_distance

        nearest = candidate
        nearest_distance = candidate_distance
      end

      # Detach from the previous cluster (if any) before attaching.
      previous = pt.cluster
      previous - pt if previous

      nearest + pt
    end
  end

  # Run assign_all_points once per configured iteration.
  def cluster
    iterations.times do |i|
      round = i + 1
      @logger.info("Starting iteration #{round} of #{iterations}.")
      assign_all_points
    end
  end
end
|
196
|
+
|
197
|
+
end
|
data/lib/sclust/doc.rb
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
module SClust
|
2
|
+
|
3
|
+
# Filters a document term: drops nil, one-character and purely numeric terms
# and lower-cases everything else.
class DocumentTermFilter
  # Return nil if the term should be excluded. Otherwise the version of the term
  # that should be included is returned.
  #
  # The argument is never modified; a lower-cased copy is returned.
  def filter(term)
    return nil if term.nil?
    return nil if term.size < 2
    # Only digits and dots (e.g. "42", "3.14"). \A and \z anchor the whole string.
    return nil if term =~ /\A[\d\.]+\z/

    # downcase, not downcase!: the bang form returns nil when the term is
    # already lower case, which would wrongly exclude it.
    term.downcase
  end
end
|
19
|
+
|
20
|
+
# A term filter that excludes nothing and changes nothing.
class NullFilter
  # Hand the term back exactly as it was received.
  def filter(term)
    unchanged = term
    unchanged
  end
end
|
25
|
+
|
26
|
+
# A text document reduced to a table of n-gram terms and their weights.
class Document

  # n-gram sizes used when the caller does not pass :ngrams.
  DEFAULT_NGRAMS = [ 1, 2, 3 ].freeze

  attr_reader :terms, :userData, :filter

  # Misspelled name of userData, kept so existing callers do not break.
  alias userDate userData

  # Takes { :userData, :ngrams => [1,2,3], :filter }
  #
  # text - the String to tokenize.
  # opts - Hash of options (it is not modified):
  #        :userData - arbitrary object stored on the document.
  #        :ngrams   - Array of n-gram sizes to build (:ngramrange is also
  #                    accepted); defaults to [1, 2, 3].
  #        :filter   - object responding to filter(term) that returns the
  #                    term to record or nil to skip it; defaults to a
  #                    DocumentTermFilter.
  #
  # Each term's occurrence count is divided by the number of distinct terms
  # in the document.
  def initialize(text, opts={})

    @text = text
    @userData = opts[:userData]
    @filter = opts[:filter] || DocumentTermFilter.new()

    ngram_sizes = opts[:ngrams] || opts[:ngramrange] || DEFAULT_NGRAMS

    # A leading separator makes split produce an empty first element; drop it.
    words = text.split(/[ ,\.\t!\?\(\)\{\}\[\]\t\r\n]+/m).reject { |word| word.empty? }

    @terms = Hash.new(0)

    # Build a set of n-grams from our requested ngram range.
    ngram_sizes.each do |n|
      next if n < 1

      # The last n-gram starts at index length - n (inclusive). When n
      # exceeds the word count the upper bound is negative and nothing runs.
      0.upto(words.length - n) do |start|
        term = @filter.filter(words[start, n].join(' '))

        @terms[term] += 1.0 if term
      end
    end

    distinct_terms = @terms.length
    @terms.each_key { |k| @terms[k] /= distinct_terms }

  end

  # Weight recorded for term; 0 when the document does not contain it.
  def term_frequency(term)
    @terms[term]
  end

  alias tf term_frequency

  # Yield each distinct term of the document.
  def each_term(&call)
    terms.each_key { |k| yield k }
  end

  # True when the document contains term.
  def has_term?(term)
    @terms.has_key?(term)
  end

end
|
91
|
+
|
92
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'sclust/doc'
|
2
|
+
require 'sclust/doccol'
|
3
|
+
require 'sclust/cluster'
|
4
|
+
|
5
|
+
module SClust
|
6
|
+
|
7
|
+
# A Clusterer whose points are built from the documents of a
# DocumentCollection, one ClusterPoint per document.
class DocumentClusterer < Clusterer

  # documentCollection - object providing terms (Hash keyed by term),
  #                      doclist (the documents) and idf(term).
  #
  # Every document becomes a vector over the sorted list of all collection
  # terms. Each component is the tf-idf weight tf(term) * idf(term), so a
  # term the document does not contain (tf of 0) contributes 0.
  def initialize(documentCollection)

    # List of all terms
    term_list = documentCollection.terms.keys.sort

    # idf depends only on the collection, so compute it once per term
    # instead of once per term per document.
    idf_values = term_list.map { |term| documentCollection.idf(term) }

    point_list = documentCollection.doclist.map do |doc|
      # Build a BIG term vector for this document.
      doc_term_values = (0...term_list.length).map do |i|
        doc.tf(term_list[i]) * idf_values[i]
      end

      # Each point gets its own copy of the term list.
      ClusterPoint.new(term_list.dup, doc_term_values, doc)
    end

    super(point_list)

  end

end
|
38
|
+
|
39
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'log4r'
|
3
|
+
|
4
|
+
module SClust
|
5
|
+
# A set of documents plus, for every term, the number of documents that
# contain it.
class DocumentCollection

  # terms - a hash where the keys are the terms in the documents and the values stored are the number of documents containing the term.
  attr_reader :terms

  # A list of documents
  attr_reader :doclist

  # Log4r::Logger for this document collection.
  attr_reader :logger

  def initialize()
    @logger = Log4r::Logger.new("SClust::DocumentCollection")
    @terms = Hash.new(0)
    @doclist = []
  end

  # Add a document to the collection and adjust the @terms attribute to store any new terms in the document.
  # The document is also added to the @doclist attribute.
  #
  # d - object responding to each_term (yielding each distinct term).
  #
  # Returns self.
  def +(d)

    d.each_term { |term| @terms[term] += 1.0 }

    @doclist << d

    @logger.info("There are #{@doclist.size} documents and #{@terms.size} terms.")

    self
  end

  # Remove terms that occur in too few or too many documents, both from the
  # collection and from every document's own terms hash.
  #
  # min_frequency, max_frequency - fractions of the document count; a term
  # is removed when its document count is below the lower or above the upper
  # bound.
  def drop_terms(min_frequency=0.10, max_frequency=0.80)

    min_docs = @doclist.length * min_frequency
    max_docs = @doclist.length * max_frequency

    @logger.info("Analyzing #{@terms.length} terms for removal.")
    @logger.info("Upper/lower boundary are #{max_frequency}/#{min_frequency}% document frequency or #{max_docs}/#{min_docs} documents.")

    remove_list = []

    # delete_if removes entries as part of the traversal itself, rather than
    # calling delete on the hash from inside an each loop over that same hash.
    @terms.delete_if do |term, frequency|
      out_of_range = frequency < min_docs || frequency > max_docs

      if out_of_range
        @logger.info("Removing term #{term} occuring in #{frequency} documents out of #{@doclist.length}")
        remove_list << term
      end

      out_of_range
    end

    @logger.info("Removed #{remove_list.length} of #{@terms.length + remove_list.length} terms. Updating #{doclist.length} documents.")

    @doclist.each do |doc|
      remove_list.each { |term| doc.terms.delete(term) }
    end
  end

  # Natural log of (number of distinct terms in the collection / number of
  # documents containing term).
  #
  # NOTE(review): the numerator is the term count, not the document count
  # that the textbook idf uses - confirm this is intended before changing
  # it, since it rescales every weight.
  #
  # Raises ZeroDivisionError for a term that is not in the collection (its
  # count is the Integer default 0).
  def inverse_document_frequency(term)
    Math.log( @terms.length / @terms[term] )
  end

  alias idf inverse_document_frequency

  # Yield each term known to the collection.
  def each_term(&c)
    @terms.each_key { |k| yield k }
  end
end
|
75
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
|
3
|
+
require 'sclust/doccluster'
|
4
|
+
|
5
|
+
# Smoke test: cluster a tiny unigram corpus and check that get_max_terms
# agrees with a manual scan for the largest center value.
class ClusterTest < Test::Unit::TestCase

  # Four short documents, unigrams only, no term filtering.
  def setup()
    @dc = SClust::DocumentCollection.new()
    filter = SClust::NullFilter.new()

    texts = [ "a b c d d e a q a b", "a b d e a", "bob", "frank a" ]
    documents = texts.map { |text| SClust::Document.new(text, :filter=>filter, :ngrams=>[1]) }

    documents.each { |document| @dc + document }
  end

  def teardown()
  end

  def test_makecluster()
    c = SClust::DocumentClusterer.new(@dc)

    c.cluster

    c.each_cluster do |cl|
      center = cl.center

      # Index of the first largest value in the cluster center.
      max = 0
      center.terms.length.times do |i|
        max = i if ( center.values[i] > center.values[max] )
      end

      puts("Cluster: #{center.terms[max]} #{center.values[max]}")

      center.get_max_terms(3).each do |t|
        puts("Got Term: #{t} with value #{center.get_term_value(t)}")
      end

      top_term = center.get_max_terms(1)[0]
      assert(center.values[max] == center.get_term_value(top_term), "Max value was not found.")
    end
  end

end
|
data/tests/test001.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'sclust/doc'
|
2
|
+
require 'sclust/doccol'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
|
6
|
+
# Checks that tokenizing a document never produces punctuation-only or empty
# terms.
class DocTests < Test::Unit::TestCase

  def test_builddoc
    d = SClust::Document.new("hi, this is a nice doc! Yup. Oh? A very nice doc, indeed.")

    d.terms.each_key do |term|
      assert(term != ".", "Period found")
      assert(term != "", "Empty term found")
    end
  end

end
|
26
|
+
|
27
|
+
# Checks the per-term document counts and idf of a small collection.
class DocCollectionTests < Test::Unit::TestCase

  def test_collectionadd()
    dc = SClust::DocumentCollection.new()

    d1 = SClust::Document.new("a b c d d e a q a b")
    d2 = SClust::Document.new("a b d e a")
    d3 = SClust::Document.new("bob")
    d4 = SClust::Document.new("frank a")

    [ d1, d2, d3, d4 ].each { |document| dc + document }

    # NOTE(review): the assertions only run when the collection contains the
    # term "a" - confirm the default term filter lets it through.
    dc.terms.each do |k,v|
      next unless k == "a"

      assert(v == 3, "A appers in 3 documents out of 4.")
      assert(dc.idf("a") > 2.2, "Known value for a")
      assert(dc.idf("a") < 2.3, "Known value for a")
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sclust
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Sam Baskinger
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-12-01 00:00:00 -06:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: log4r
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.0.5
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: mechanize
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.9.3
|
34
|
+
version:
|
35
|
+
description: A k-mean text clustering library for ruby.
|
36
|
+
email: basking2@rubyforge.org.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files: []
|
42
|
+
|
43
|
+
files:
|
44
|
+
- lib/sclust/cluster.rb
|
45
|
+
- lib/sclust/doc.rb
|
46
|
+
- lib/sclust/doccluster.rb
|
47
|
+
- lib/sclust/doccol.rb
|
48
|
+
has_rdoc: true
|
49
|
+
homepage: http://sclust.rubyforge.org
|
50
|
+
licenses: []
|
51
|
+
|
52
|
+
post_install_message:
|
53
|
+
rdoc_options: []
|
54
|
+
|
55
|
+
require_paths:
|
56
|
+
- lib
|
57
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.6.8
|
62
|
+
version:
|
63
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: "0"
|
68
|
+
version:
|
69
|
+
requirements: []
|
70
|
+
|
71
|
+
rubyforge_project: http://sclust.rubyforge.org/
|
72
|
+
rubygems_version: 1.3.5
|
73
|
+
signing_key:
|
74
|
+
specification_version: 3
|
75
|
+
summary: k-mean clustering.
|
76
|
+
test_files:
|
77
|
+
- tests/clustertest.rb
|
78
|
+
- tests/test001.rb
|
79
|
+
- tests/clustertest.rb
|
80
|
+
- tests/test001.rb
|