clusterer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,21 @@
1
+ A ruby library which implements clustering algorithms for text
2
+ mining.
3
+
4
+ Currently implemented algorithms are K-Means, and Hierarchical
5
+ clustering.
6
+
7
+ Hierarchical gives better results, but its complexity is roughly O(n*n).
8
+
9
+ K-means is very fast, O(k*n*i), i is number of iterations.
10
+
11
+ The examples need Google/Yahoo API keys, and the Yahoo example requires
12
+ ysearch-rb from
13
+
14
+ http://developer.yahoo.com/download/download.html
15
+
16
+
17
+ Hybrid clustering algorithms + more similarity metrics + semi-supervised
18
+ clustering... coming soon ... (or submit patches/show keen interest, if
19
+ you want faster results)
20
+
21
+ Happy hacking......
@@ -0,0 +1,49 @@
1
+ #The MIT License
2
+
3
+ #Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
+
5
+ require 'soap/wsdlDriver'
6
+ require 'clusterer'
7
+
8
+ ## try using HTML stripping to get better results
9
+
10
# Example: fetch up to 100 Google results for a query (10 per request),
# cluster them on their tag-stripped title + snippet with k-means, and write
# the clusters to temp.html as a nested HTML list.
WSDL_URL = "http://api.google.com/GoogleSearch.wsdl"
driver = SOAP::WSDLDriverFactory.new(WSDL_URL).create_rpc_driver
query = 'kreeti'
key = "" # API key passed to every doGoogleSearch call; fill in before running

first_page = driver.doGoogleSearch(key, query, 0, 10, true, "", 1, "lang_en", "", "")
max_count = first_page.estimatedTotalResultsCount.to_i
results = first_page.resultElements
count = results.size

# Keep paging until 100 results are collected or the estimated total is
# reached. Stop as soon as a page comes back empty: otherwise count would
# never advance and the loop would not terminate.
while count < 100 && count < max_count
  more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
  batch = more_results.resultElements
  break if batch.size == 0
  results.concat(batch)
  count += batch.size
end

# Removes HTML tags so that markup does not end up in the word vectors.
strip_tags = lambda { |text| text.to_s.gsub(/<\/?[^>]*>/, "") }

clusters = Clusterer::Clustering.kmeans_clustering(
  results.collect { |r| strip_tags.call(r.title) + " " + strip_tags.call(r.snippet) })

# Writing the output: one outer <li> per cluster, each holding an inner list
# with one <li> per result (clusters contain indexes into results).
File.open("temp.html", "w") do |f|
  f.write("<ul>")
  clusters.each do |clus|
    f.write("<li>")
    f.write("<ul>")
    clus.each do |d|
      f.write("<li>")
      f.write("<span class='title'>")
      f.write(results[d].title)
      f.write("</span>")
      f.write("<span class='snippet'>")
      f.write(results[d].snippet)
      f.write("</span>")
      f.write("</li>")
    end
    f.write("</ul>")
    f.write("</li>") # close the cluster item inside the loop
  end
  f.write("</ul>")
end
@@ -0,0 +1,59 @@
1
+ #The MIT License
2
+
3
+ #Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
+
5
+ require 'clusterer'
6
+ require 'ysearch-rb/lib/ysearch'
7
+
8
+ ## try using HTML stripping to get better results
9
+
10
+ # get the query parameter
11
+ query = "kreeti"
12
+
13
+ ##
14
+ # create a web search object:
15
+ # Arguments:
16
+ # 1. App ID (You can get one at http://developer.yahoo.net)
17
+ # 2. The query
18
+ # 3. type can be one of: 'all', 'any' or 'phrase'
19
+ # 4. The no. of results
20
+ ##
21
# Example: run a Yahoo web search, cluster the results on their tag-stripped
# title + summary with hierarchical clustering, and write the clusters to
# temp.html as a nested HTML list.
obj = WebSearch.new('YahooDemo', query, 'all', 100)

results = obj.parse_results

# Paging logic carried over from the Google example, left disabled here:
# count= results.resultElements.size
# max_count = results.estimatedTotalResultsCount.to_i
# results = results.resultElements

# while (count < 100 && count <= max_count)
#   more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
#   results.concat(more_results.resultElements)
#   count += more_results.resultElements.size
# end

# Removes HTML tags so that markup does not end up in the word vectors.
strip_tags = lambda { |text| text.to_s.gsub(/<\/?[^>]*>/, "") }

# Alternative: Clusterer::Clustering.kmeans_clustering
clusters = Clusterer::Clustering.hierarchical_clustering(
  results.collect { |r| strip_tags.call(r['Title']) + " " + strip_tags.call(r['Summary']) })

# Writing the output: one outer <li> per cluster, each holding an inner list
# with one <li> per result (clusters contain indexes into results).
File.open("temp.html", "w") do |f|
  f.write("<ul>")
  clusters.each do |clus|
    f.write("<li>")
    f.write("<ul>")
    clus.each do |d|
      f.write("<li>")
      f.write("<span class='title'>")
      f.write(results[d]['Title'])
      f.write("</span>")
      f.write("<span class='snippet'>")
      f.write(results[d]['Summary'])
      f.write("</span>")
      f.write("</li>")
    end
    f.write("</ul>")
    f.write("</li>") # close the cluster item inside the loop
  end
  f.write("</ul>")
end
data/lib/clusterer.rb ADDED
@@ -0,0 +1,100 @@
1
+ #The MIT License
2
+
3
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
+
5
+ require 'word_hash'
6
+ require 'similarity'
7
+
8
module Clusterer
  # Clustering algorithms for text documents.
  #
  # Documents are turned into word-count hashes with String#clean_word_hash.
  # Both algorithms return an Array of clusters, each cluster being an Array
  # of indexes into the docs argument.
  class Clustering
    class << self
      # K-means clustering.
      #
      # docs     - Array of Strings to cluster.
      # k        - number of clusters; defaults to the integer part of
      #            sqrt(docs.size).
      # max_iter - maximum number of iterations; nil or false means "iterate
      #            until the centers stop changing".
      # similarity_function - optional block taking (doc_hash, center_hash)
      #            and returning a number where larger means more similar;
      #            defaults to Similarity.vector_similarity.
      #
      # The initial centers are copies of randomly chosen documents, so the
      # result can differ between runs. Progress is printed with puts on
      # every iteration. Returns k clusters (some may be empty).
      def kmeans_clustering(docs, k = nil, max_iter = 10, &similarity_function)
        similarity_function ||= Proc.new { |*args| Similarity.vector_similarity(*args) }
        # Math.sqrt returns a Float; the cluster count is used as an array
        # size and loop bound, so make it an Integer explicitly.
        k = (k || Math.sqrt(docs.size)).to_i
        docs_hash = docs.map { |d| d.clean_word_hash }
        clusters = Array.new(k)
        cluster_centers = Array.new(k)
        old_cluster_centers = Array.new(k)
        k.times do |i|
          x = rand(docs.size)
          clusters[i] = [x]
          cluster_centers[i] = docs_hash[x].clone
        end
        iter = 0
        while (!max_iter || iter < max_iter) && !convergence(cluster_centers, old_cluster_centers)
          puts "Iteration ....#{iter}...#{clusters.inspect}"
          k.times do |i|
            clusters[i] = []
            old_cluster_centers[i] = cluster_centers[i]
          end
          # Assign every document to its most similar center.
          docs_hash.each_with_index do |doc, i|
            clusters[most_similar_index(doc, cluster_centers, similarity_function)] << i
          end
          recalculate_centers(cluster_centers, clusters, docs_hash)
          iter += 1
        end
        clusters
      end

      # Agglomerative clustering: starts with one cluster per document and
      # repeatedly merges the smallest cluster into the cluster whose center
      # is most similar to it, until at most k clusters remain.
      #
      # docs - Array of Strings to cluster.
      # k    - target number of clusters; defaults to the integer part of
      #        sqrt(docs.size).
      # similarity_function - optional block taking two center hashes and
      #        returning a number where larger means more similar; defaults
      #        to Similarity.vector_similarity.
      #
      # Progress is printed with puts on every iteration.
      def hierarchical_clustering(docs, k = nil, &similarity_function)
        similarity_function ||= Proc.new { |*args| Similarity.vector_similarity(*args) }
        k = (k || Math.sqrt(docs.size)).to_i
        clusters = []
        cluster_centers = []
        docs.each_with_index do |d, i|
          cluster_centers << d.clean_word_hash
          clusters << [i]
        end
        iter = 0
        while clusters.size > k
          puts "Iteration ....#{iter}...#{clusters.inspect}"
          min_index = smallest_cluster_index(clusters)
          center = cluster_centers.delete_at(min_index)
          members = clusters.delete_at(min_index)
          target = most_similar_index(center, cluster_centers, similarity_function)
          merge_clusters(clusters[target], cluster_centers[target], members, center)
          iter += 1
        end
        clusters
      end

      private

      # Index of the cluster with the fewest members; ties go to the last
      # such cluster. The seed is the size of the first cluster
      # (clusters[0].size) -- clusters.size[0] would instead read bit 0 of
      # the cluster count.
      def smallest_cluster_index(clusters)
        min_value, min_index = clusters[0].size, 0
        clusters.each_with_index do |members, i|
          min_value, min_index = members.size, i if members.size <= min_value
        end
        min_index
      end

      # Index of the center most similar to vector. Similarities below zero
      # never win, and ties go to the last center (>= comparison); index 0
      # is returned when centers is empty.
      def most_similar_index(vector, centers, similarity_function)
        max_value, max_index = 0, 0
        centers.each_with_index do |cen, j|
          sim = similarity_function.call(vector, cen)
          max_value, max_index = sim, j if sim >= max_value
        end
        max_index
      end

      # Merge cluster 2 into cluster 1: cluster1 receives the members of
      # cluster2 and cluster_center1 becomes the member-weighted average of
      # the two centers. cluster_center2 is scaled in place as a side
      # effect; callers are expected to discard it afterwards.
      def merge_clusters(cluster1, cluster_center1, cluster2, cluster_center2)
        cluster_center1.each_key { |k| cluster_center1[k] *= cluster1.size }
        cluster_center2.each_key { |k| cluster_center2[k] *= cluster2.size }
        cluster_center1.merge!(cluster_center2) { |_key, old, new| old + new }
        cluster1.concat(cluster2)
        cluster_center1.each_key { |k| cluster_center1[k] /= cluster1.size.to_f }
        # Drop the cached :total so it is not reused for the changed center.
        cluster_center1.delete(:total)
      end

      # Replace every entry of cluster_centers with the mean word vector of
      # the documents assigned to that cluster, and store the sum of squared
      # weights under :total. An empty cluster yields a center whose only
      # entry is :total => 1.
      def recalculate_centers(cluster_centers, clusters, docs_hash)
        clusters.each_with_index do |cluster, i|
          center = {}
          cluster.each do |d|
            docs_hash[d].each { |w, v| center[w] = (center[w] || 0) + v }
          end
          count = cluster.size
          count = 1 if count == 0
          total = 0
          center.each_key do |w|
            # Only String keys are words; anything else (such as a :total
            # copied over from a document hash) is skipped and overwritten
            # below.
            next unless w.class == String
            center[w] /= count.to_f
            total += center[w]**2
          end
          total = 1 if total == 0
          center[:total] = total
          cluster_centers[i] = center
        end
      end

      # True when every new center equals the old center at the same index.
      def convergence(new_centers, old_centers)
        new_centers.each_with_index { |c, i| return false unless c == old_centers[i] }
        true
      end
    end
  end
end
data/lib/similarity.rb ADDED
@@ -0,0 +1,27 @@
1
+ #The MIT License
2
+
3
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
+
5
module Clusterer
  module Similarity
    # Find the similarity between two clusters, or two points.
    #
    # cluster1, cluster2 - word vectors: hashes mapping String words to
    # numeric weights. Only String keys are treated as words.
    #
    # Returns the dot product of the two vectors divided by the square root
    # of the product of their :total values (sum of squared weights), as a
    # Float. A missing :total is computed and stored in the hash, so both
    # arguments may be modified; a :total of 0 is replaced by 1 so that
    # empty vectors give 0.0 instead of dividing by zero.
    def Similarity.vector_similarity(cluster1, cluster2)
      dot = 0
      cluster1.each do |w, value|
        next unless w.class == String
        dot += value * (cluster2[w] || 0)
      end
      dot / Math.sqrt(cached_total(cluster1) * cached_total(cluster2)).to_f
    end

    # Returns vector[:total], computing and caching it first when absent.
    # The total is the sum of squared weights over String keys only, the
    # same keys the dot product above uses, for either argument.
    def Similarity.cached_total(vector)
      unless vector[:total]
        total = 0
        vector.each { |w, value| total += value * value if w.class == String }
        vector[:total] = total
      end
      vector[:total] = 1 if vector[:total] == 0
      vector[:total]
    end
    private_class_method :cached_total
  end
end
data/lib/word_hash.rb ADDED
@@ -0,0 +1,93 @@
1
+ #The MIT License
2
+
3
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
+
5
+ begin
6
+ require 'stemmer'
7
+ rescue LoadError
8
+ puts "Please install stemmer from http://rubyforge.org/projects/stemmer or 'gem install stemmer'"
9
+ exit(-1)
10
+ end
11
+
12
class String
  # Builds a word-frequency hash for this string: characters that are
  # neither word characters nor whitespace are removed, the text is split on
  # whitespace, and every word is lower-cased and stemmed (String#stem,
  # provided by the stemmer library required by this file).
  #
  # Returns a Hash mapping stemmed word => number of occurrences. Words
  # whose stem has two characters or fewer, and stop words, are left out.
  def clean_word_hash
    word_hash(gsub(/[^\w\s]/, "").split)
  end

  private

  # Counts the stems of words (an Array of Strings).
  #
  # A word is dropped when either its lower-cased form or its stem appears
  # in STOP_WORDS. Checking only the stem lets through listed words whose
  # stem differs from the listed form (for instance a stemmer that reduces
  # "since" to "sinc").
  def word_hash(words)
    counts = Hash.new
    words.each do |word|
      raw = word.downcase
      stemmed = raw.stem
      next unless stemmed.size > 2
      next if STOP_WORDS.include?(raw) || STOP_WORDS.include?(stemmed)
      counts[stemmed] = (counts[stemmed] || 0) + 1
    end
    counts
  end

  # Words ignored when building word hashes. Holds plain forms as well as
  # some already-stemmed forms ("sai", "thi", "thei").
  STOP_WORDS = %w[
    and but came can cant com couldnt did didn didnt doesnt dont ever first
    for from have her here him how into isnt itll just last least like most
    new not now sai said she should since some than thi that the thei their
    then those told too true try until url wasnt were when who whether while
    will with within would www yes you youll
  ]
end
@@ -0,0 +1,20 @@
1
+ #The MIT License
2
+
3
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
+
5
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
6
+
7
+ require 'test/unit'
8
+ require 'clusterer'
9
+
10
# Smoke tests for both clustering entry points on a three-document corpus.
class TestClusterer < Test::Unit::TestCase
  # With the default k the result must not be empty; with k = 2 exactly two
  # clusters must come back.
  def test_simple_kmeans
    default_k = Clusterer::Clustering.kmeans_clustering(["hello world", "mea culpa", "goodbye world"])
    assert_not_equal [], default_k

    two_clusters = Clusterer::Clustering.kmeans_clustering(["hello world", "mea culpa", "goodbye world"], 2)
    assert_equal 2, two_clusters.size
  end

  # Same expectations for the hierarchical algorithm.
  def test_simple_hierarchical_clustering
    default_k = Clusterer::Clustering.hierarchical_clustering(["hello world", "mea culpa", "goodbye world"])
    assert_not_equal [], default_k

    two_clusters = Clusterer::Clustering.hierarchical_clustering(["hello world", "mea culpa", "goodbye world"], 2)
    assert_equal 2, two_clusters.size
  end
end
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.11
3
+ specification_version: 1
4
+ name: clusterer
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2006-08-22 00:00:00 +05:30
8
+ summary: A library of clustering algorithms for text data.
9
+ require_paths:
10
+ - lib
11
+ email: ssinghi@kreeti.com
12
+ homepage: http://rubyforge.org/projects/clusterer/
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: clusterer
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ authors:
29
+ - Surendra K Singhi
30
+ files:
31
+ - tests/clusterer_test.rb
32
+ - lib/clusterer.rb
33
+ - lib/similarity.rb
34
+ - lib/word_hash.rb
35
+ - examples/google_search_cluster.rb
36
+ - examples/yahoo_search_cluster.rb
37
+ - README
38
+ test_files:
39
+ - tests/clusterer_test.rb
40
+ rdoc_options: []
41
+
42
+ extra_rdoc_files:
43
+ - README
44
+ executables: []
45
+
46
+ extensions: []
47
+
48
+ requirements: []
49
+
50
+ dependencies:
51
+ - !ruby/object:Gem::Dependency
52
+ name: stemmer
53
+ version_requirement:
54
+ version_requirements: !ruby/object:Gem::Version::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: 0.0.0
59
+ version: