sclust 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,197 @@
1
+ require 'rubygems'
2
+ require 'log4r'
3
+
4
+ module SClust
5
+
6
# Cosine distance between numeric vectors.
class CosineDistance

  # Given two vectors, compute the cosine distance (1 - cosine similarity).
  #
  # a:: first vector, an indexable list of numbers.
  # b:: second vector; it is read at every index of +a+.
  #
  # Returns a Float. If either vector has zero magnitude the similarity is
  # undefined, so 1.0 (no similarity) is returned instead of NaN.
  def self.distance(a,b)
    dot_product = 0.0
    a_squared   = 0.0
    b_squared   = 0.0

    a.each_index do |i|
      dot_product += a[i] * b[i]
      # These are sums of squares; multiplying into a 0.0 seed would leave
      # them stuck at zero.
      a_squared   += a[i] * a[i]
      b_squared   += b[i] * b[i]
    end

    magnitude = Math.sqrt(a_squared) * Math.sqrt(b_squared)
    return 1.0 if magnitude == 0.0

    1 - (dot_product / magnitude)
  end
end
24
+
25
# A labelled numeric vector: position i of +terms+ labels position i of +values+.
class ClusterPoint

  attr_reader :terms, :values
  attr_accessor :cluster, :source_object

  # Initialize the ClusterPoint with a list of terms (labels, objects, whatever) and numeric values.
  #
  # terms::         one label per dimension.
  # values::        numeric values, parallel to +terms+.
  # source_object:: optional object this point was built from.
  def initialize(terms, values, source_object = nil)
    @terms         = terms
    @values        = values
    @cluster       = nil
    @source_object = source_object
  end

  # Distance between this point's values and those of +clusterPoint+,
  # as computed by CosineDistance.distance.
  def distance(clusterPoint)
    CosineDistance.distance(@values, clusterPoint.values)
  end

  # Add each item in the cluster point to this cluster point, adjusting the values per the given weight.
  # Weight is a value from 0.0 - 1.0, inclusive. A value of 1 means that clusterPoint completely
  # replaces this point's values while a weight value of 0 will have no effect.
  #
  # The values are updated in place. Returns self.
  def add(clusterPoint, weight)
    @values.each_index do |i|
      @values[i] = (@values[i] * (1 - weight)) + (clusterPoint.values[i] * weight)
    end
    self
  end

  # Similar to add, but subtract: undoes an earlier add of +clusterPoint+ made with the same weight.
  #
  # The result is divided by (1 - weight), so weight must be less than 1.
  # The values are updated in place. Returns self.
  def sub(clusterPoint, weight)
    @values.each_index do |i|
      @values[i] = (@values[i] - (clusterPoint.values[i] * weight)) / (1 - weight)
    end
    self
  end

  # Return up to n terms with the largest values, largest first. Terms with equal
  # values keep their original relative order.
  def get_max_terms(n=3)
    # Rank term positions rather than distinct values, so that ties cannot
    # reduce the number of terms returned.
    ranked = (0...@terms.length).sort do |x, y|
      order = @values[y] <=> @values[x]
      # <=> yields nil for incomparable values such as NaN; fall back to position.
      (order.nil? || order.zero?) ? (x <=> y) : order
    end

    ranked.first([n, 0].max).map { |i| @terms[i] }
  end

  # Return the value stored for +term+, or nil when the term is not present.
  def get_term_value(term)
    position = @terms.index(term)
    position && @values[position]
  end

end
106
+
107
# A cluster described by a running-average centre point and a member count.
class Cluster

  attr_reader :center, :size

  # Seed the cluster from +centerPoint+.
  #
  # The centre gets its own copy of the seed's values. A plain clone would share
  # the values array, so every later + or - would also rewrite the seed point.
  # The size starts at 1: the seed counts as the first member.
  def initialize(centerPoint)
    @fixed  = false
    @center = ClusterPoint.new(centerPoint.terms, centerPoint.values.dup, centerPoint.source_object)
    @size   = 1
  end

  # Assign +point+ to this cluster and fold it into the centre with weight 1/size.
  def +(point)
    point.cluster = self

    @size += 1

    @center.add(point, 1.0 / @size)
  end

  # Remove +point+ from this cluster, reversing the weighting applied by +.
  # Returns the new size.
  def -(point)
    point.cluster = nil

    # With one member left the weight would be 1 and the centre update would
    # divide by zero, so the centre is left as it is.
    @center.sub(point, 1.0 / @size) if @size > 1

    @size -= 1
  end

  # The n highest-valued terms of the centre point.
  def get_max_terms(n=3)
    @center.get_max_terms(n)
  end

end
138
+
139
# Assigns points to the nearest of a set of clusters seeded from randomly chosen points.
class Clusterer

  attr_accessor :clusters, :points, :cluster_count, :iterations
  attr_reader :logger

  # points::        the points to cluster. Each is handed to Cluster.new as a seed
  #                 and to the cluster centre's distance method.
  # cluster_count:: how many clusters to seed (default 10).
  # iterations::    how many assignment passes #cluster runs (default 2).
  #
  # Seeds are drawn without repetition, so no two clusters start from the same
  # point. When there are fewer points than cluster_count, one cluster is created
  # per point; an empty point list yields no clusters.
  def initialize(points, cluster_count = 10, iterations = 2)
    @iterations    = iterations
    @cluster_count = cluster_count
    @points        = points
    @logger        = Log4r::Logger.new('Clusterer')

    # Randomly select a few starting documents.
    @clusters = @points.sample(@cluster_count).map { |seed| Cluster.new(seed) }
  end

  # Append a point to the list of points. It is not assigned to a cluster until
  # the next assignment pass. Returns the point list.
  def +(point)
    @points << point
  end

  # Yield each cluster in turn.
  def each_cluster(&c)
    @clusters.each(&c)
  end

  # Move every point into the cluster whose centre is nearest to it.
  def assign_all_points
    return if @clusters.empty?

    @points.each do |pt|

      @logger.debug("Assigning point #{pt}.")

      nearest          = @clusters.first
      nearest_distance = nearest.center.distance(pt)

      # A strict < keeps the earliest cluster on ties and also when a distance
      # is not comparable (NaN).
      @clusters.drop(1).each do |candidate|
        candidate_distance = candidate.center.distance(pt)

        if candidate_distance < nearest_distance
          nearest          = candidate
          nearest_distance = candidate_distance
        end
      end

      # Already where it belongs: removing and re-adding would only disturb the
      # centre through rounding.
      next if nearest.equal?(pt.cluster)

      pt.cluster - pt if pt.cluster

      nearest + pt
    end
  end

  # Run the configured number of assignment passes.
  def cluster
    iterations.times do |i|
      @logger.info("Starting iteration #{i+1} of #{iterations}.")
      assign_all_points
    end
  end
end
196
+
197
+ end
data/lib/sclust/doc.rb ADDED
@@ -0,0 +1,92 @@
1
+ module SClust
2
+
3
# Filters a document term: drops nil terms, terms shorter than two characters and
# terms made up only of digits and periods; lower-cases everything else.
class DocumentTermFilter
  # Return nil if the term should be excluded. Otherwise the version of the term
  # that should be included is returned. The argument itself is never modified.
  def filter(term)
    return nil if term.nil? || term.size < 2

    # \A and \z anchor the whole string; ^ and $ would match a single line of it.
    return nil if term =~ /\A[\d\.]+\z/

    # downcase, not downcase!: the bang form returns nil when the term is already
    # lower-case, which silently excluded every lower-case term.
    term.downcase
  end
end
19
+
20
# A filter that lets every term through untouched.
class NullFilter
  # Hand back +term+ exactly as given (nil included).
  def filter(term); term; end
end
25
+
26
# A text document reduced to a table of n-gram terms and their frequencies.
class Document

  attr_reader :terms, :userData, :filter

  # Historical misspelling of userData, kept so existing callers keep working.
  alias_method :userDate, :userData

  # Takes { :userData, :ngrams => [1,2,3], :filter }
  #
  # text::            the raw text. It is split on spaces, tabs, line breaks and
  #                   the characters , . ! ? ( ) { } [ ].
  # opts[:userData]:: arbitrary caller data, exposed through userData.
  # opts[:ngrams]::   n-gram sizes to build. :ngramrange is accepted as a synonym;
  #                   the default is [1, 2, 3].
  # opts[:filter]::   object responding to filter(term), returning the term to keep
  #                   or nil to discard it. Defaults to a DocumentTermFilter.
  #
  # The opts hash passed in is not modified.
  def initialize(text, opts={})

    @text     = text
    @userData = opts[:userData]
    @filter   = opts[:filter] || DocumentTermFilter.new()

    ngram_sizes = opts[:ngrams] || opts[:ngramrange] || [ 1, 2, 3 ]

    # Text that starts with a delimiter makes split produce an empty first token.
    words = text.split(/[ ,\.\t!\?\(\)\{\}\[\]\t\r\n]+/m).reject { |word| word.empty? }

    @terms = Hash.new(0)

    # Build a set of n-grams from our requested ngram range.
    ngram_sizes.each do |n|

      # each_cons yields every run of n consecutive words, the last one included.
      words.each_cons(n) do |gram|

        term = @filter.filter(gram.join(' '))

        @terms[term] += 1.0 if term

      end

    end

    # Each count is divided by the number of distinct terms in the document.
    distinct_terms = @terms.length

    @terms.each_key { |term| @terms[term] /= distinct_terms }

  end

  # The stored frequency of +term+, or 0 when the document does not contain it.
  def term_frequency(term)
    @terms[term]
  end

  alias tf term_frequency

  # Yield every term of the document.
  def each_term(&call)
    @terms.each_key(&call)
  end

  # True when the document contains +term+.
  def has_term?(term)
    @terms.has_key?(term)
  end

end
91
+
92
+ end
@@ -0,0 +1,39 @@
1
+ require 'sclust/doc'
2
+ require 'sclust/doccol'
3
+ require 'sclust/cluster'
4
+
5
+ module SClust
6
+
7
# A Clusterer built from a document collection: every document in the collection
# becomes one ClusterPoint whose dimensions are the collection's terms.
class DocumentClusterer < Clusterer

  # documentCollection:: supplies terms (a hash keyed by term), doclist (the
  #                      documents) and idf(term). Each document supplies tf(term).
  def initialize(documentCollection)

    # List of all terms, sorted so every point uses the same dimension order.
    term_list = documentCollection.terms.keys.sort

    point_list = documentCollection.doclist.map do |doc|

      # TF-IDF weight: the term frequency scaled by (multiplied with, not reduced
      # by) the inverse document frequency.
      weights = term_list.map { |term| doc.tf(term) * documentCollection.idf(term) }

      # Each point gets its own copy of the term list.
      ClusterPoint.new(term_list.dup, weights, doc)
    end

    super(point_list)

  end

end
38
+
39
+ end
@@ -0,0 +1,75 @@
1
+ require 'rubygems'
2
+ require 'log4r'
3
+
4
+ module SClust
5
# A set of documents together with, for every term, the number of documents containing it.
class DocumentCollection

  # terms - a hash where the keys are the terms in the documents and the values stored are the number of documents containing the term.
  attr_reader :terms

  # A list of documents
  attr_reader :doclist

  # Log4r::Logger for this document collection.
  attr_reader :logger

  def initialize()
    @logger  = Log4r::Logger.new("SClust::DocumentCollection")
    @terms   = Hash.new(0)
    @doclist = []
  end

  # Add a document to the collection and adjust the @terms attribute to store any new terms in the document.
  # The document is also added to the @doclist attribute. Returns self.
  def +(d)

    d.each_term { |term| @terms[term] += 1.0 }

    @doclist << d

    @logger.info("There are #{@doclist.size} documents and #{@terms.size} terms.")

    self
  end

  # Remove every term found in fewer than min_frequency or more than max_frequency
  # of the documents (both are fractions of the document count). The terms are
  # removed from the collection and from each document's terms.
  def drop_terms(min_frequency=0.10, max_frequency=0.80)

    min_docs = @doclist.length * min_frequency
    max_docs = @doclist.length * max_frequency

    @logger.info("Analyzing #{@terms.length} terms for removal.")
    @logger.info("Upper/lower boundary are #{max_frequency}/#{min_frequency}% document frequency or #{max_docs}/#{min_docs} documents.")

    # Decide what to drop before deleting anything, so the hash is never modified
    # while it is being walked.
    remove_list = @terms.keys.select do |term|
      @terms[term] < min_docs || @terms[term] > max_docs
    end

    remove_list.each do |term|
      @logger.info("Removing term #{term} occuring in #{@terms[term]} documents out of #{@doclist.length}")
      @terms.delete(term)
    end

    @logger.info("Removed #{remove_list.length} of #{@terms.length + remove_list.length} terms. Updating #{doclist.length} documents.")

    @doclist.each do |doc|
      remove_list.each { |term| doc.terms.delete(term) }
    end
  end

  # Natural logarithm of (number of distinct terms in the collection / number of
  # documents containing +term+). Returns 0.0 for a term no document contains,
  # where the ratio would otherwise divide by zero.
  #
  # NOTE(review): the conventional definition divides the number of documents,
  # not the number of terms. The numerator is left unchanged here; confirm the
  # intent before altering it.
  def inverse_document_frequency(term)
    containing = @terms.fetch(term, 0)
    return 0.0 if containing == 0

    Math.log( @terms.length / containing )
  end

  alias idf inverse_document_frequency

  # Yield every term in the collection.
  def each_term(&c)
    @terms.each_key(&c)
  end
end
75
+ end
@@ -0,0 +1,51 @@
1
+ require 'test/unit'
2
+
3
+ require 'sclust/doccluster'
4
+
5
# Builds a four-document collection, clusters it, and checks that the largest
# centre value of every cluster is the value of the first term returned by
# get_max_terms.
class ClusterTest < Test::Unit::TestCase

  # Fixture: four documents reduced to single-word terms with no filtering.
  def setup()
    @dc = SClust::DocumentCollection.new()
    filter = SClust::NullFilter.new()

    ["a b c d d e a q a b", "a b d e a", "bob", "frank a"].each do |text|
      @dc + SClust::Document.new(text, :filter=>filter, :ngrams=>[1])
    end
  end

  def teardown()
  end

  def test_makecluster()
    c = SClust::DocumentClusterer.new(@dc)

    c.cluster

    c.each_cluster do |cl|

      # Position of the largest centre value; the earliest position wins ties.
      max = 0

      cl.center.terms.each_index do |i|
        max = i if ( cl.center.values[i] > cl.center.values[max] )
      end

      puts("Cluster: #{cl.center.terms[max]} #{cl.center.values[max]}")

      cl.center.get_max_terms(3).each do |t|
        puts("Got Term: #{t} with value #{cl.center.get_term_value(t)}")
      end

      # assert_equal reports both values when the check fails.
      assert_equal(cl.center.values[max], cl.center.get_term_value(cl.center.get_max_terms(1)[0]), "Max value was not found.")
    end
  end

end
data/tests/test001.rb ADDED
@@ -0,0 +1,49 @@
1
+ require 'sclust/doc'
2
+ require 'sclust/doccol'
3
+ require 'test/unit'
4
+
5
+
6
# Checks that building a Document from punctuated text leaves no stray terms behind.
class DocTests < Test::Unit::TestCase

  # No term may be a bare period or an empty string.
  def test_builddoc
    text = "hi, this is a nice doc! Yup. Oh? A very nice doc, indeed."
    doc  = SClust::Document.new(text)

    doc.terms.each_key do |term|
      assert(term != ".", "Period found")
      assert(term != "", "Empty term found")
    end
  end

end
26
+
27
# Checks the document count and inverse document frequency recorded for the term "a".
class DocCollectionTests < Test::Unit::TestCase

  # NOTE(review): the assertions run only if "a" is among the collection's terms.
  # The documents are built with the default filter, which appears to discard
  # terms shorter than two characters - confirm this test is not vacuous.
  def test_collectionadd()
    dc = SClust::DocumentCollection.new()

    texts     = ["a b c d d e a q a b", "a b d e a", "bob", "frank a"]
    documents = texts.map { |text| SClust::Document.new(text) }

    documents.each { |document| dc + document }

    dc.terms.each do |term, doc_count|
      next unless term == "a"

      assert(doc_count == 3, "A appers in 3 documents out of 4.")
      assert(dc.idf("a") > 2.2, "Known value for a")
      assert(dc.idf("a") < 2.3, "Known value for a")
    end
  end
end
metadata ADDED
@@ -0,0 +1,80 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sclust
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Sam Baskinger
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-12-01 00:00:00 -06:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: log4r
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.0.5
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: mechanize
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.9.3
34
+ version:
35
+ description: A k-mean text clustering library for ruby.
36
+ email: basking2@rubyforge.org.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files: []
42
+
43
+ files:
44
+ - lib/sclust/cluster.rb
45
+ - lib/sclust/doc.rb
46
+ - lib/sclust/doccluster.rb
47
+ - lib/sclust/doccol.rb
48
+ has_rdoc: true
49
+ homepage: http://sclust.rubyforge.org
50
+ licenses: []
51
+
52
+ post_install_message:
53
+ rdoc_options: []
54
+
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: 1.6.8
62
+ version:
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: "0"
68
+ version:
69
+ requirements: []
70
+
71
+ rubyforge_project: http://sclust.rubyforge.org/
72
+ rubygems_version: 1.3.5
73
+ signing_key:
74
+ specification_version: 3
75
+ summary: k-mean clustering.
76
+ test_files:
77
+ - tests/clustertest.rb
78
+ - tests/test001.rb
79
+ - tests/clustertest.rb
80
+ - tests/test001.rb