sclust 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,197 @@
1
+ require 'rubygems'
2
+ require 'log4r'
3
+
4
+ module SClust
5
+
6
# Cosine distance between two numeric vectors.
class CosineDistance

  # Given two vectors, compute the cosine distance between them:
  #
  #   1 - (a . b) / (|a| * |b|)
  #
  # a:: Array of numbers.
  # b:: Array of numbers; indexed with the same indices as +a+.
  #
  # Returns a Float: 0.0 for vectors pointing the same way, 1.0 for
  # orthogonal vectors and 2.0 for opposite vectors. If either vector has
  # zero magnitude the cosine is undefined; 1.0 is returned in that case so
  # that callers comparing distances with "<" never have to deal with NaN.
  def self.distance(a, b)
    dot    = 0.0
    norm_a = 0.0
    norm_b = 0.0

    a.each_index do |i|
      dot    += a[i] * b[i]
      # The squared magnitudes are sums of squares, so they must be
      # accumulated with +=. Multiplying into a 0.0 start value keeps them 0.
      norm_a += a[i] * a[i]
      norm_b += b[i] * b[i]
    end

    magnitude = Math.sqrt(norm_a) * Math.sqrt(norm_b)

    return 1.0 if magnitude == 0.0

    1 - (dot / magnitude)
  end
end
24
+
25
# A point in term space: a list of terms and a parallel list of numeric
# values (values[i] is the weight of terms[i]).
class ClusterPoint

  attr_reader :terms, :values
  attr_accessor :cluster, :source_object

  # Initialize the ClusterPoint with a list of terms (labels, objects, whatever) and numeric values.
  #
  # terms::         Array of terms.
  # values::        Array of numbers, parallel to +terms+.
  # source_object:: Optional object this point was built from.
  def initialize(terms, values, source_object = nil)
    @terms         = terms
    @values        = values
    @cluster       = nil
    @source_object = source_object
  end

  # Called by clone/dup. Gives the copy its own values array so that
  # add/sub on the copy do not modify the point it was copied from.
  # The terms array is never modified by this class and stays shared.
  def initialize_copy(source)
    super
    @values = @values.dup if @values
  end

  # Cosine distance between this point and +clusterPoint+.
  def distance(clusterPoint)
    CosineDistance.distance(@values, clusterPoint.values)
  end

  # Add each item in the cluster point to this cluster point adjusting the values per the given weight.
  # Weight is a value from 0.0 - 1.0, inclusive. A value of 1 means that this clusterPoint is 100% assigned to
  # this cluster point while a weight value of 0 will have no effect.
  #
  # The values are updated in place; the result is not checked for NaN or
  # infinite numbers.
  def add(clusterPoint, weight)
    0.upto(@values.length-1) { |i| @values[i] = ( @values[i] * (1-weight)) + (clusterPoint.values[i] * weight) }
  end

  # Similar to add, but subtract: the inverse of add for the same point and weight.
  #
  # Divides by (1 - weight), so a weight of 1 produces NaN or infinite
  # values; the result is not checked.
  def sub(clusterPoint, weight)
    0.upto(@values.length-1) { |i| @values[i] = ( @values[i] - (clusterPoint.values[i] * weight) ) / (1 - weight) }
  end

  # Return up to +n+ terms with the largest values, largest first.
  # Terms with equal values keep their order from the terms list.
  def get_max_terms(n=3)
    limit = [n, @terms.length].min

    return [] if limit <= 0

    # Rank term indices by descending value. Ties, and values that cannot
    # be compared (e.g. NaN), fall back to index order.
    ranked = (0...@terms.length).sort do |i, j|
      by_value = @values[j] <=> @values[i]
      by_value.nil? || by_value.zero? ? (i <=> j) : by_value
    end

    ranked.first(limit).map { |i| @terms[i] }
  end

  # Return the value stored for +term+, or nil if the term is not present.
  def get_term_value(term)
    i = @terms.index(term)

    i.nil? ? nil : @values[i]
  end

end
106
+
107
# A cluster: a center point plus a count of the points folded into it.
class Cluster

  attr_reader :center, :size

  # Start a cluster whose center is a clone of +centerPoint+.
  # The size starts at 1.
  def initialize(centerPoint)
    @fixed  = false
    @center = centerPoint.clone
    @size   = 1
  end

  # Assign +point+ to this cluster and fold it into the center with a
  # weight of 1/size, using the size after the point has been counted.
  # Returns whatever the center's add returns.
  def +(point)
    point.cluster = self

    @size = @size + 1
    share = 1.0 / @size.to_f

    @center.add(point, share)
  end

  # Unassign +point+ from this cluster and remove it from the center with a
  # weight of 1/size, using the size before the point is discounted.
  # Returns the new size.
  def -(point)
    point.cluster = nil

    share = 1.0 / @size.to_f
    @center.sub(point, share)

    @size = @size - 1
  end

  # The +n+ highest valued terms of the center point.
  def get_max_terms(n=3)
    @center.get_max_terms(n)
  end

end
138
+
139
# Assigns points to clusters by repeatedly moving every point to the
# cluster whose center is nearest.
class Clusterer

  attr_accessor :clusters, :points, :cluster_count, :iterations
  attr_reader :logger

  # Build a clusterer over +points+.
  #
  # Sets iterations to 2 and cluster_count to 10, then immediately creates
  # cluster_count clusters, each seeded from a randomly chosen point (the
  # same point may be chosen more than once). Changing cluster_count later
  # does not add or remove clusters.
  #
  # NOTE(review): an empty +points+ list is not handled here - confirm
  # callers never pass one.
  def initialize(points)
    @iterations    = 2
    @cluster_count = 10
    @points        = points
    @clusters      = []
    @logger        = Log4r::Logger.new('Clusterer')

    # Randomly select a few starting documents.
    @cluster_count.times do
      seed = @points[rand(points.length)]
      @clusters << Cluster.new(seed)
    end
  end

  # Append +point+ to the list of points. Returns the points list.
  def +(point)
    @points << point
  end

  # Yield every cluster in turn.
  def each_cluster(&c)
    @clusters.each do |cl|
      yield cl
    end
  end

  # Move every point to the cluster with the nearest center.
  #
  # The first cluster is the initial candidate; another cluster replaces it
  # only when its center is strictly closer. A point that already belongs
  # to a cluster is removed from it before being added to the chosen one.
  def assign_all_points

    @points.each do |pt|

      @logger.debug("Assigning point #{pt}.")

      nearest          = @clusters[0]
      nearest_distance = nearest.center.distance(pt)

      @clusters.each do |candidate|
        candidate_distance = candidate.center.distance(pt)

        next unless candidate_distance < nearest_distance

        nearest          = candidate
        nearest_distance = candidate_distance
      end

      pt.cluster - pt if pt.cluster

      nearest + pt
    end
  end

  # Run assign_all_points once per configured iteration.
  def cluster
    iterations.times do |i|
      @logger.info("Starting iteration #{i+1} of #{iterations}.")
      assign_all_points
    end
  end
end
196
+
197
+ end
data/lib/sclust/doc.rb ADDED
@@ -0,0 +1,92 @@
1
+ module SClust
2
+
3
# Filters a document term: drops terms that carry no information and
# lowercases the rest.
class DocumentTermFilter
  # Return nil if the term should be excluded. Otherwise the version of the term
  # that should be included is returned.
  #
  # Excluded are: nil, terms shorter than two characters, and terms made up
  # only of digits and periods. Every other term is returned lowercased.
  # The argument itself is not modified.
  def filter(term)
    return nil if term.nil? || term.size < 2
    return nil if term =~ /\A[\d\.]+\z/

    # downcase (not downcase!): the bang form returns nil when the term is
    # already lowercase, which would exclude every lowercase term.
    term.downcase
  end
end
19
+
20
# A term filter that excludes nothing and changes nothing.
class NullFilter
  # Hand +term+ back exactly as it was given.
  def filter(term); term; end
end
25
+
26
# A piece of text broken into word n-grams ("terms"), each with a
# normalized frequency.
class Document

  attr_reader :terms, :userData, :filter

  # Older spelling of the userData reader, kept so existing callers still work.
  alias userDate userData

  # Takes { :userData, :ngrams => [1,2,3], :filter }
  #
  # text:: The String to break into terms.
  # opts:: Options hash (not modified):
  #        :userData - any object to carry along with the document.
  #        :ngrams   - list of n-gram sizes (positive integers) to build.
  #                    :ngramrange is accepted as an alternative key.
  #                    Defaults to [1, 2, 3].
  #        :filter   - object whose filter(term) returns the term to store
  #                    or nil to drop it. Defaults to a DocumentTermFilter.
  #
  # After construction, terms maps each term to its occurrence count
  # divided by the number of distinct terms in the document.
  def initialize(text, opts={})

    @text     = text
    @userData = opts[:userData]
    @filter   = opts[:filter] || DocumentTermFilter.new()

    ngrams = opts[:ngrams] || opts[:ngramrange] || [ 1, 2, 3 ]

    # A leading delimiter makes split return an empty first element; drop it
    # so it cannot become a term or part of one.
    word_arr = text.split(/[ ,\.\t!\?\(\)\{\}\[\]\t\r\n]+/m).reject { |w| w.empty? }

    @terms = Hash.new(0)

    # Build a set of n-grams from our requested ngram range.
    ngrams.each do |n|

      # Every run of n consecutive words, including the one that ends on
      # the last word of the text.
      word_arr.each_cons(n) do |gram|

        term = @filter.filter(gram.join(' '))

        @terms[term] += 1.0 if term

      end

    end

    distinct = @terms.length

    @terms.each_key { |k| @terms[k] /= distinct }

  end

  # The normalized frequency of +term+; 0 when the term is not in the document.
  def term_frequency(term)
    @terms[term]
  end

  alias tf term_frequency

  # Yield each term of the document.
  def each_term(&call)
    terms.each_key { |k| yield k }
  end

  # True if +term+ is one of the document's terms.
  def has_term?(term)
    @terms.has_key?(term)
  end

end
91
+
92
+ end
@@ -0,0 +1,39 @@
1
+ require 'sclust/doc'
2
+ require 'sclust/doccol'
3
+ require 'sclust/cluster'
4
+
5
+ module SClust
6
+
7
# A document clusterer: turns every document of a document collection into
# a ClusterPoint and hands the points to Clusterer.
class DocumentClusterer < Clusterer

  # documentCollection:: Object providing terms (a hash keyed by term),
  #                      doclist (the documents) and idf(term).
  #
  # Every document becomes one point over the sorted list of all terms in
  # the collection. The value for a term is its TF-IDF weight: the
  # document's term frequency multiplied by the collection's inverse
  # document frequency.
  def initialize(documentCollection)

    # List of all terms
    term_list = documentCollection.terms.keys.sort

    point_list = documentCollection.doclist.map do |doc|

      # Build a BIG term vector for this document, one value per term.
      # TF-IDF is the product of the two factors, not their difference.
      doc_term_values = term_list.map do |term|
        doc.tf(term) * documentCollection.idf(term)
      end

      # Each point gets its own copy of the term list, as before.
      ClusterPoint.new(term_list.dup, doc_term_values, doc)
    end

    super(point_list)

  end

end
38
+
39
+ end
@@ -0,0 +1,75 @@
1
+ require 'rubygems'
2
+ require 'log4r'
3
+
4
+ module SClust
5
# A set of documents plus, for every term, the number of documents that
# contain it.
class DocumentCollection

  # terms - a hash where the keys are the terms in the documents and the values stored are the number of documents containing the term.
  attr_reader :terms

  # A list of documents
  attr_reader :doclist

  # Log4r::Logger for this document collection.
  attr_reader :logger

  def initialize()
    @logger  = Log4r::Logger.new("SClust::DocumentCollection")
    @terms   = Hash.new(0)
    @doclist = []
  end

  # Add a document to the collection and adjust the @terms attribute to store any new terms in the document.
  # The document is also added to the @doclist attribute.
  #
  # Returns self.
  def +(d)

    d.each_term { |term| @terms[term] += 1.0 }

    @doclist << d

    @logger.info("There are #{@doclist.size} documents and #{@terms.size} terms.")

    self
  end

  # Remove every term that occurs in fewer than min_frequency or more than
  # max_frequency of the documents (both are fractions of the document
  # count). The terms are removed from the collection and from the terms
  # hash of every document.
  def drop_terms(min_frequency=0.10, max_frequency=0.80)

    min_docs = @doclist.length * min_frequency
    max_docs = @doclist.length * max_frequency

    @logger.info("Analyzing #{@terms.length} terms for removal.")
    @logger.info("Upper/lower boundary are #{max_frequency}/#{min_frequency}% document frequency or #{max_docs}/#{min_docs} documents.")

    # Decide what to remove first, then remove it, so that @terms is not
    # changed while it is being iterated.
    remove_list = @terms.keys.select do |term|
      frequency = @terms[term]
      frequency < min_docs || frequency > max_docs
    end

    remove_list.each do |term|
      @logger.info("Removing term #{term} occuring in #{@terms[term]} documents out of #{@doclist.length}")
      @terms.delete(term)
    end

    @logger.info("Removed #{remove_list.length} of #{@terms.length + remove_list.length} terms. Updating #{doclist.length} documents.")

    @doclist.each do |doc|
      remove_list.each { |term| doc.terms.delete(term) }
    end
  end

  # Natural log of (number of distinct terms / number of documents
  # containing +term+). Raises ZeroDivisionError for a term that is not in
  # the collection.
  #
  # NOTE(review): the textbook IDF numerator is the number of documents,
  # not the number of distinct terms - confirm which is intended before
  # changing it; the gem's own tests expect the current value.
  def inverse_document_frequency(term)
    Math.log( @terms.length / @terms[term] )
  end

  alias idf inverse_document_frequency

  # Yield each term of the collection.
  def each_term(&c)
    @terms.each_key { |k| yield k }
  end
end
75
+ end
@@ -0,0 +1,51 @@
1
+ require 'test/unit'
2
+
3
+ require 'sclust/doccluster'
4
+
5
# Exercises DocumentClusterer end to end on a four document collection.
class ClusterTest < Test::Unit::TestCase

  # Build a collection of four unigram-only documents with no term filtering.
  def setup()
    @dc = SClust::DocumentCollection.new()
    filter = SClust::NullFilter.new()

    texts = ["a b c d d e a q a b", "a b d e a", "bob", "frank a"]

    texts.each do |text|
      @dc + SClust::Document.new(text, :filter=>filter, :ngrams=>[1])
    end
  end

  def teardown()
  end

  # For every cluster, the value found by a linear scan for the largest
  # center value must equal the value of the first term reported by
  # get_max_terms.
  def test_makecluster()
    c = SClust::DocumentClusterer.new(@dc)

    c.cluster

    c.each_cluster do |cl|

      center = cl.center

      # Index of the first largest center value.
      max = 0

      center.terms.each_index do |i|
        max = i if ( center.values[i] > center.values[max] )
      end

      puts("Cluster: #{center.terms[max]} #{center.values[max]}")

      center.get_max_terms(3).each do |t|
        puts("Got Term: #{t} with value #{center.get_term_value(t)}")
      end

      top_term = center.get_max_terms(1)[0]

      assert(center.values[max] == center.get_term_value(top_term), "Max value was not found.")
    end
  end

end
data/tests/test001.rb ADDED
@@ -0,0 +1,49 @@
1
+ require 'sclust/doc'
2
+ require 'sclust/doccol'
3
+ require 'test/unit'
4
+
5
+
6
# Checks the terms produced by SClust::Document with its default options.
class DocTests < Test::Unit::TestCase

  # Punctuation must act as a separator: no term may be a lone period or
  # the empty string.
  def test_builddoc
    d = SClust::Document.new("hi, this is a nice doc! Yup. Oh? A very nice doc, indeed.")

    d.terms.each_key do |k|
      assert(k != ".", "Period found")
      assert(k != "", "Empty term found")
    end

  end

end
26
+
27
# Checks the document counts and IDF kept by SClust::DocumentCollection.
class DocCollectionTests < Test::Unit::TestCase

  # Add four documents, then check the document count and IDF of the term "a".
  #
  # NOTE(review): the assertions only run if "a" is a key of dc.terms. The
  # documents here use the default term filter, which appears to drop terms
  # shorter than two characters - confirm these assertions are reached.
  def test_collectionadd()
    dc = SClust::DocumentCollection.new()

    ["a b c d d e a q a b", "a b d e a", "bob", "frank a"].each do |text|
      dc + SClust::Document.new(text)
    end

    dc.terms.each do |k,v|
      next unless k == "a"

      assert(v == 3, "A appers in 3 documents out of 4.")
      assert(dc.idf("a") > 2.2, "Known value for a")
      assert(dc.idf("a") < 2.3, "Known value for a")
    end
  end
end
metadata ADDED
@@ -0,0 +1,80 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sclust
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Sam Baskinger
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-12-01 00:00:00 -06:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: log4r
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.0.5
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: mechanize
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.9.3
34
+ version:
35
+ description: A k-mean text clustering library for ruby.
36
+ email: basking2@rubyforge.org.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files: []
42
+
43
+ files:
44
+ - lib/sclust/cluster.rb
45
+ - lib/sclust/doc.rb
46
+ - lib/sclust/doccluster.rb
47
+ - lib/sclust/doccol.rb
48
+ has_rdoc: true
49
+ homepage: http://sclust.rubyforge.org
50
+ licenses: []
51
+
52
+ post_install_message:
53
+ rdoc_options: []
54
+
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: 1.6.8
62
+ version:
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: "0"
68
+ version:
69
+ requirements: []
70
+
71
+ rubyforge_project: http://sclust.rubyforge.org/
72
+ rubygems_version: 1.3.5
73
+ signing_key:
74
+ specification_version: 3
75
+ summary: k-mean clustering.
76
+ test_files:
77
+ - tests/clustertest.rb
78
+ - tests/test001.rb
79
+ - tests/clustertest.rb
80
+ - tests/test001.rb