clusterer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,21 @@
1
+ A ruby library which implements clustering algorithms for text
2
+ mining.
3
+
4
+ Currently implemented algorithms are K-Means, and Hierarchical
5
+ clustering.
6
+
7
+ Hierarchical gives better results, but its complexity is roughly O(n*n).
8
+
9
+ K-means is very fast, O(k*n*i), i is number of iterations.
10
+
11
+ The examples need Google/Yahoo API keys, and the Yahoo example requires
12
+ ysearch-rb from
13
+
14
+ http://developer.yahoo.com/download/download.html
15
+
16
+
17
+ Hybrid clustering algorithms + more similarity metrics + semi-supervised
18
+ clustering... coming soon ... (or submit patches/show keen interest, if
19
+ you want faster results)
20
+
21
+ Happy hacking......
@@ -0,0 +1,49 @@
1
+ #The MIT License
2
+
3
+ #Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
+
5
+ require 'soap/wsdlDriver'
6
+ require 'clusterer'
7
+
8
+ ## try using HTML stripping to get better results
9
+
10
# Example: fetch up to 100 Google results for a query, cluster them with
# k-means on "title + snippet" (HTML tags stripped), and write the clusters
# to temp.html as a nested list.

WSDL_URL = "http://api.google.com/GoogleSearch.wsdl"
driver = SOAP::WSDLDriverFactory.new(WSDL_URL).create_rpc_driver
query = 'kreeti'
# First argument of doGoogleSearch; per the README the examples need an API
# key, so this presumably has to be filled in before running.
key = ""

first_page = driver.doGoogleSearch(key, query, 0, 10, true, "", 1, "lang_en", "", "")
max_count = first_page.estimatedTotalResultsCount.to_i
results = first_page.resultElements
count = results.size

# Results are requested 10 at a time; keep paging until we hold 100 results
# or have everything the service estimates it has.
while count < 100 && count < max_count
  more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
  page = more_results.resultElements
  # The total is only an estimate: an empty page means there is nothing
  # more to fetch, and without this guard the loop would never terminate.
  break if page.size == 0
  results.concat(page)
  count += page.size
end

# Each document is the tag-stripped title followed by the tag-stripped snippet.
documents = results.collect do |r|
  r.title.to_s.gsub(/<\/?[^>]*>/, "") + " " + r.snippet.to_s.gsub(/<\/?[^>]*>/, "")
end
clusters = Clusterer::Clustering.kmeans_clustering(documents)

#writing the output
File.open("temp.html", "w") do |f|
  f.write("<ul>")
  clusters.each do |clus|
    # One outer <li> per cluster, holding an inner list of its results.
    f.write("<li>")
    f.write("<ul>")
    clus.each do |d|
      f.write("<li>")
      f.write("<span class='title'>")
      f.write(results[d].title)
      f.write("</span>")
      f.write("<span class='snippet'>")
      f.write(results[d].snippet)
      f.write("</span>")
      f.write("</li>")
    end
    f.write("</ul>")
    f.write("</li>")
  end
  f.write("</ul>")
end
@@ -0,0 +1,59 @@
1
+ #The MIT License
2
+
3
+ #Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
+
5
+ require 'clusterer'
6
+ require 'ysearch-rb/lib/ysearch'
7
+
8
+ ## try using HTML stripping to get better results
9
+
10
+ # get the query parameter
11
# Example: fetch Yahoo results for a query, cluster them hierarchically on
# "Title + Summary" (HTML tags stripped), and write the clusters to temp.html
# as a nested list.
query = "kreeti"

##
# create a web search object:
# Arguments:
# 1. App ID (You can get one at http://developer.yahoo.net)
# 2. The query
# 3. type can be one of: 'all', 'any' or 'phrase'
# 4. The no. of results
##
obj = WebSearch.new('YahooDemo', query, 'all', 100)

# All 100 results are requested in the single call above, so no paging loop
# is needed here (unlike the Google example, which fetches 10 at a time).
results = obj.parse_results

# Each document is the tag-stripped title followed by the tag-stripped summary.
documents = results.collect do |r|
  r['Title'].to_s.gsub(/<\/?[^>]*>/, "") + " " + r['Summary'].to_s.gsub(/<\/?[^>]*>/, "")
end

# Swap in Clusterer::Clustering.kmeans_clustering to compare the algorithms.
clusters = Clusterer::Clustering.hierarchical_clustering(documents)

#writing the output
File.open("temp.html", "w") do |f|
  f.write("<ul>")
  clusters.each do |clus|
    # One outer <li> per cluster, holding an inner list of its results.
    f.write("<li>")
    f.write("<ul>")
    clus.each do |d|
      f.write("<li>")
      f.write("<span class='title'>")
      f.write(results[d]['Title'])
      f.write("</span>")
      f.write("<span class='snippet'>")
      f.write(results[d]['Summary'])
      f.write("</span>")
      f.write("</li>")
    end
    f.write("</ul>")
    f.write("</li>")
  end
  f.write("</ul>")
end
data/lib/clusterer.rb ADDED
@@ -0,0 +1,100 @@
1
+ #The MIT License
2
+
3
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
+
5
+ require 'word_hash'
6
+ require 'similarity'
7
+
8
module Clusterer
  # Clustering algorithms for text documents.
  #
  # Documents are turned into word => weight hashes via String#clean_word_hash
  # and compared with a similarity function (larger value = more similar).
  # Both public methods return an Array of clusters, each cluster being an
  # Array of indices into the +docs+ argument.
  class Clustering
    class << self
      # K-means clustering.
      #
      # docs::     indexable collection of Strings (must respond to +size+ and
      #            +each_with_index+; elements must respond to +clean_word_hash+)
      # k::        number of clusters; defaults to floor(sqrt(docs.size))
      # max_iter:: maximum number of iterations; nil/false iterates until the
      #            cluster centers stop changing
      # similarity_function:: optional block called as (doc_vector, center);
      #            defaults to Similarity.vector_similarity
      #
      # Initial centers are picked with +rand+ (the same document may be picked
      # more than once), so results differ between runs. Progress is printed to
      # stdout on every iteration.
      #
      # Returns the clusters as arrays of indices into +docs+.
      def kmeans_clustering(docs, k = nil, max_iter = 10, &similarity_function)
        similarity_function ||= Proc.new { |*args| Similarity.vector_similarity(*args) }
        # Explicit integer instead of relying on Float -> Integer truncation
        # inside Array.new / upto.
        k = Math.sqrt(docs.size).floor unless k

        docs_hash = Array.new(docs.size)
        docs.each_with_index { |d, i| docs_hash[i] = d.clean_word_hash }

        clusters = Array.new(k)
        cluster_centers = Array.new(k)
        old_cluster_centers = Array.new(k)
        0.upto(k - 1) do |i|
          seed = rand(docs.size)
          clusters[i] = [seed]
          # clone: the similarity function may cache data in the center hash,
          # which must not leak into the document's own hash.
          cluster_centers[i] = docs_hash[seed].clone
        end

        iter = 0
        while (!max_iter || iter < max_iter) && !convergence(cluster_centers, old_cluster_centers)
          puts "Iteration ....#{iter}...#{clusters.inspect}"
          0.upto(k - 1) do |i|
            clusters[i] = []
            old_cluster_centers[i] = cluster_centers[i]
          end
          docs_hash.each_with_index do |doc, i|
            clusters[most_similar_index(doc, cluster_centers, similarity_function)] << i
          end
          recalculate_centers(cluster_centers, clusters, docs_hash)
          iter += 1
        end
        clusters
      end

      # Agglomerative clustering: starts with one cluster per document and
      # repeatedly merges the smallest cluster into the cluster whose center is
      # most similar to it, until at most +k+ clusters remain.
      #
      # docs:: collection of Strings (must respond to +size+ and
      #        +each_with_index+; elements must respond to +clean_word_hash+)
      # k::    target number of clusters; defaults to floor(sqrt(docs.size)).
      #        Values below 1 yield a single cluster.
      # similarity_function:: optional block called as (center, other_center);
      #        defaults to Similarity.vector_similarity
      #
      # Progress is printed to stdout on every iteration.
      #
      # Returns the clusters as arrays of indices into +docs+.
      def hierarchical_clustering(docs, k = nil, &similarity_function)
        similarity_function ||= Proc.new { |*args| Similarity.vector_similarity(*args) }
        k = Math.sqrt(docs.size).floor unless k

        clusters = []
        cluster_centers = []
        docs.each_with_index do |d, i|
          cluster_centers << d.clean_word_hash
          clusters << [i]
        end

        iter = 0
        # A merge needs two clusters, so never go below one cluster even when
        # k < 1 (previously this raised NoMethodError on nil).
        while clusters.size > k && clusters.size > 1
          puts "Iteration ....#{iter}...#{clusters.inspect}"
          smallest = smallest_cluster_index(clusters)
          center = cluster_centers.delete_at(smallest)
          members = clusters.delete_at(smallest)
          target = most_similar_index(center, cluster_centers, similarity_function)
          merge_clusters(clusters[target], cluster_centers[target], members, center)
          iter += 1
        end
        clusters
      end

      private

      # Index of the cluster with the fewest members; ties go to the cluster
      # with the highest index.
      #
      # The search is seeded with the size of the first cluster. (The original
      # code used clusters.size[0], i.e. bit 0 of the cluster count, which made
      # the selection depend on the parity of the number of clusters.)
      def smallest_cluster_index(clusters)
        min_value, min_index = clusters[0].size, 0
        clusters.each_with_index do |members, i|
          min_value, min_index = members.size, i if members.size <= min_value
        end
        min_index
      end

      # Index of the center most similar to +vector+. Ties go to the later
      # center; returns 0 when +centers+ is empty or every similarity is
      # negative.
      def most_similar_index(vector, centers, similarity_function)
        max_value, max_index = 0, 0
        centers.each_with_index do |cen, j|
          sim = similarity_function.call(vector, cen)
          max_value, max_index = sim, j if sim >= max_value
        end
        max_index
      end

      #merge cluster 2 into cluster 1
      #
      # Both centers are scaled by their cluster sizes, summed key by key, and
      # divided by the merged size, giving the size-weighted mean. cluster1 and
      # cluster_center1 are modified in place; cluster_center2 is modified too
      # (scaled) and is expected to be discarded by the caller.
      def merge_clusters(cluster1, cluster_center1, cluster2, cluster_center2)
        cluster_center1.each_key { |k| cluster_center1[k] *= cluster1.size }
        cluster_center2.each_key { |k| cluster_center2[k] *= cluster2.size }
        cluster_center1.merge!(cluster_center2) { |_key, old, new| old + new }
        cluster1.concat(cluster2)
        cluster_center1.each_key { |k| cluster_center1[k] /= cluster1.size.to_f }
        # Drop any cached :total so it is recomputed for the merged center.
        cluster_center1.delete(:total)
      end

      # Replaces every entry of +cluster_centers+ with the mean vector of the
      # documents currently assigned to the corresponding cluster, and stores
      # the sum of squared mean weights under :total (1 when that sum is 0,
      # e.g. for an empty cluster).
      def recalculate_centers(cluster_centers, clusters, docs_hash)
        clusters.each_with_index do |cluster, i|
          center = {}
          cluster.each do |d|
            # Non-String keys of a document (such as a cached :total) are
            # summed here as well but are skipped below and then overwritten.
            docs_hash[d].each { |w, v| center[w] = (center[w] || 0) + v }
          end
          count = cluster.size
          count = 1 if count == 0
          total = 0
          center.each_key do |w|
            next unless w.class == String
            center[w] /= count.to_f
            total += center[w]**2
          end
          total = 1 if total == 0
          center[:total] = total
          cluster_centers[i] = center
        end
      end

      # True when every new center equals the old center at the same index.
      def convergence(new_centers, old_centers)
        new_centers.each_with_index { |c, i| return false unless c == old_centers[i] }
        true
      end
    end
  end
end
data/lib/similarity.rb ADDED
@@ -0,0 +1,27 @@
1
+ #The MIT License
2
+
3
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
+
5
module Clusterer
  module Similarity
    # Similarity between two clusters, or two points, given as
    # word => weight hashes: the dot product over the String keys of
    # +cluster1+ divided by sqrt(cluster1[:total] * cluster2[:total]).
    #
    # Side effect: each argument gets its sum of squared weights stored
    # under the :total key when that key is missing (a sum of 0 is stored
    # as 1), so later calls reuse the value. An existing :total is trusted
    # as is.
    #
    # Returns a Float.
    def self.vector_similarity(cluster1, cluster2)
      cached = cluster1[:total]
      dot_product = 0
      squared_norm = 0
      cluster1.each_pair do |word, weight|
        if word.class == String
          squared_norm += weight * weight unless cached
          dot_product += weight * (cluster2[word] || 0)
        end
      end
      cluster1[:total] = squared_norm unless cached

      unless cluster2[:total]
        other_norm = cluster2.values.inject(0) { |sum, v| sum + v * v }
        cluster2[:total] = other_norm == 0 ? 1 : other_norm
      end

      # Avoid dividing by zero for an empty cluster1.
      cluster1[:total] = 1 if cluster1[:total] == 0
      dot_product / Math.sqrt(cluster1[:total] * cluster2[:total]).to_f
    end
  end
end
data/lib/word_hash.rb ADDED
@@ -0,0 +1,93 @@
1
+ #The MIT License
2
+
3
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
+
5
+ begin
6
+ require 'stemmer'
7
+ rescue LoadError
8
+ puts "Please install stemmer from http://rubyforge.org/projects/stemmer or 'gem install stemmer'"
9
+ exit(-1)
10
+ end
11
+
12
class String
  # Word-frequency hash of this string: characters other than word
  # characters and whitespace are removed, the rest is split on whitespace,
  # and each token is lower-cased and stemmed (String#stem). Tokens of two
  # characters or fewer and tokens listed in STOP_WORDS are dropped.
  #
  # Returns a Hash mapping stemmed word => number of occurrences.
  def clean_word_hash
    tokens = gsub(/[^\w\s]/, "").split
    word_hash(tokens)
  end

  private

  # Counts the normalised (lower-cased, stemmed) forms of +words+, skipping
  # short tokens and stop words.
  def word_hash(words)
    words.inject(Hash.new) do |counts, raw|
      token = raw.downcase.stem
      if token.size > 2 && !STOP_WORDS.include?(token)
        counts[token] = (counts[token] || 0) + 1
      end
      counts
    end
  end

  # Words excluded from the hash. They are matched against tokens after
  # stemming; entries such as "thi", "sai" and "thei" are presumably the
  # stemmed forms of common words.
  STOP_WORDS = %w[
    and but came can cant com couldnt did didn didnt doesnt dont ever first
    for from have her here him how into isnt itll just last least like most
    new not now sai said she should since some than thi that the thei their
    then those told too true try until url wasnt were when who whether while
    will with within would www yes you youll
  ]
end
@@ -0,0 +1,20 @@
1
+ #The MIT License
2
+
3
+ ###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
4
+
5
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
6
+
7
+ require 'test/unit'
8
+ require 'clusterer'
9
+
10
# Smoke tests: each algorithm returns a non-empty result with the default k
# and exactly two clusters when k = 2 is requested.
class TestClusterer < Test::Unit::TestCase
  # Fresh copy of the three sample documents for every call.
  def sample_docs
    ["hello world", "mea culpa", "goodbye world"]
  end

  def test_simple_kmeans
    default_k = Clusterer::Clustering.kmeans_clustering(sample_docs)
    assert_not_equal [], default_k

    two = Clusterer::Clustering.kmeans_clustering(sample_docs, 2)
    assert_equal 2, two.size
  end

  def test_simple_hierarchical_clustering
    default_k = Clusterer::Clustering.hierarchical_clustering(sample_docs)
    assert_not_equal [], default_k

    two = Clusterer::Clustering.hierarchical_clustering(sample_docs, 2)
    assert_equal 2, two.size
  end
end
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.11
3
+ specification_version: 1
4
+ name: clusterer
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2006-08-22 00:00:00 +05:30
8
+ summary: A library of clustering algorithms for text data.
9
+ require_paths:
10
+ - lib
11
+ email: ssinghi@kreeti.com
12
+ homepage: http://rubyforge.org/projects/clusterer/
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: clusterer
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ authors:
29
+ - Surendra K Singhi
30
+ files:
31
+ - tests/clusterer_test.rb
32
+ - lib/clusterer.rb
33
+ - lib/similarity.rb
34
+ - lib/word_hash.rb
35
+ - examples/google_search_cluster.rb
36
+ - examples/yahoo_search_cluster.rb
37
+ - README
38
+ test_files:
39
+ - tests/clusterer_test.rb
40
+ rdoc_options: []
41
+
42
+ extra_rdoc_files:
43
+ - README
44
+ executables: []
45
+
46
+ extensions: []
47
+
48
+ requirements: []
49
+
50
+ dependencies:
51
+ - !ruby/object:Gem::Dependency
52
+ name: stemmer
53
+ version_requirement:
54
+ version_requirements: !ruby/object:Gem::Version::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: 0.0.0
59
+ version: