clusterer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +21 -0
- data/examples/google_search_cluster.rb +49 -0
- data/examples/yahoo_search_cluster.rb +59 -0
- data/lib/clusterer.rb +100 -0
- data/lib/similarity.rb +27 -0
- data/lib/word_hash.rb +93 -0
- data/tests/clusterer_test.rb +20 -0
- metadata +59 -0
data/README
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
A ruby library which implements clustering algorithms for text
|
2
|
+
mining.
|
3
|
+
|
4
|
+
Currently implemented algorithms are K-Means, and Hierarchical
|
5
|
+
clustering.
|
6
|
+
|
7
|
+
Hierarchical gives better results, but its complexity is roughly O(n*n).
|
8
|
+
|
9
|
+
K-means is very fast, O(k*n*i), i is number of iterations.
|
10
|
+
|
11
|
+
The examples need Google/Yahoo API keys, and the Yahoo example requires
|
12
|
+
ysearch-rb from
|
13
|
+
|
14
|
+
http://developer.yahoo.com/download/download.html
|
15
|
+
|
16
|
+
|
17
|
+
Hybrid clustering algorithms + more similarity metrics + semi-supervised
|
18
|
+
clustering... coming soon ... (or submit patches/show keen interest, if
|
19
|
+
you want faster results)
|
20
|
+
|
21
|
+
Happy hacking......
|
@@ -0,0 +1,49 @@
|
|
1
|
+
#The MIT License
|
2
|
+
|
3
|
+
#Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
|
+
|
5
|
+
require 'soap/wsdlDriver'
|
6
|
+
require 'clusterer'
|
7
|
+
|
8
|
+
## try using HTML stripping to get better results
|
9
|
+
|
10
|
+
# Example: fetch up to 100 Google results for a query, cluster them with
# k-means and write the clusters to temp.html as nested HTML lists.

WSDL_URL = "http://api.google.com/GoogleSearch.wsdl"
driver = SOAP::WSDLDriverFactory.new(WSDL_URL).create_rpc_driver
query = 'kreeti'
key = "" # the Google API key goes here

results = driver.doGoogleSearch(key, query, 0, 10, true, "", 1, "lang_en", "", "")
count = results.resultElements.size
max_count = results.estimatedTotalResultsCount.to_i
results = results.resultElements

# Page through the remaining results, 10 at a time.
while count < 100 && count < max_count
  more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
  page = more_results.resultElements
  # An empty page would leave count unchanged and loop forever, so stop here.
  break if page.size.zero?
  results.concat(page)
  count += page.size
end

# Cluster on title + snippet with the HTML tags stripped out.
clusters = Clusterer::Clustering.kmeans_clustering(results.collect { |r|
  r.title.to_s.gsub(/<\/?[^>]*>/, "") + " " + r.snippet.to_s.gsub(/<\/?[^>]*>/, "")
})

#writing the output
File.open("temp.html", "w") do |f|
  f.write("<ul>")
  clusters.each do |clus|
    # One outer list item per cluster, holding an inner list of its results.
    f.write("<li>")
    f.write("<ul>")
    clus.each do |d|
      f.write("<li>")
      f.write("<span class='title'>")
      f.write(results[d].title)
      f.write("</span>")
      f.write("<span class='snippet'>")
      f.write(results[d].snippet)
      f.write("</span>")
      f.write("</li>")
    end
    f.write("</ul>")
    f.write("</li>")
  end
  f.write("</ul>")
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
#The MIT License
|
2
|
+
|
3
|
+
#Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
|
+
|
5
|
+
require 'clusterer'
|
6
|
+
require 'ysearch-rb/lib/ysearch'
|
7
|
+
|
8
|
+
## try using HTML stripping to get better results
|
9
|
+
|
10
|
+
# Example: fetch 100 Yahoo results for a query, cluster them hierarchically
# and write the clusters to temp.html as nested HTML lists.

# get the query parameter
query = "kreeti"

##
# create a web search object:
# Arguments:
# 1. App ID (You can get one at http://developer.yahoo.net)
# 2. The query
# 3. type can be one of: 'all', 'any' or 'phrase'
# 4. The no. of results
##
obj = WebSearch.new('YahooDemo', query, 'all', 100)

results = obj.parse_results

# Cluster on title + summary with the HTML tags stripped out.
# Clusterer::Clustering.kmeans_clustering takes the same documents argument.
clusters = Clusterer::Clustering.hierarchical_clustering(results.collect { |r|
  r['Title'].to_s.gsub(/<\/?[^>]*>/, "") + " " + r['Summary'].to_s.gsub(/<\/?[^>]*>/, "")
})

#writing the output
File.open("temp.html", "w") do |f|
  f.write("<ul>")
  clusters.each do |clus|
    # One outer list item per cluster, holding an inner list of its results.
    f.write("<li>")
    f.write("<ul>")
    clus.each do |d|
      f.write("<li>")
      f.write("<span class='title'>")
      f.write(results[d]['Title'])
      f.write("</span>")
      f.write("<span class='snippet'>")
      f.write(results[d]['Summary'])
      f.write("</span>")
      f.write("</li>")
    end
    f.write("</ul>")
    f.write("</li>")
  end
  f.write("</ul>")
end
|
data/lib/clusterer.rb
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
#The MIT License
|
2
|
+
|
3
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
|
+
|
5
|
+
require 'word_hash'
|
6
|
+
require 'similarity'
|
7
|
+
|
8
|
+
module Clusterer
  # Clustering algorithms for text documents. Both public methods return the
  # clusters as an Array of Arrays holding indices into the +docs+ argument.
  class Clustering
    class << self
      # K-means clustering.
      #
      # docs     - collection whose elements respond to +clean_word_hash+
      #            (a word => count Hash).
      # k        - number of clusters; defaults to floor(sqrt(docs.size)).
      # max_iter - iteration cap; nil or false runs until the centers stop
      #            changing.
      # similarity_function - optional block called as (document_hash,
      #            center_hash); a larger result means "more similar".
      #            Defaults to Similarity.vector_similarity.
      #
      # The initial centers are copies of randomly chosen documents, so the
      # result can differ between runs. A progress line is printed to stdout
      # on every iteration.
      #
      # Returns the clusters containing the index of the elements in docs.
      def kmeans_clustering(docs, k = nil, max_iter = 10, &similarity_function)
        similarity_function ||= proc { |*args| Similarity.vector_similarity(*args) }
        k ||= Math.sqrt(docs.size).floor
        docs_hash = docs.map { |d| d.clean_word_hash }
        clusters = Array.new(k)
        cluster_centers = Array.new(k)
        old_cluster_centers = Array.new(k)
        0.upto(k - 1) do |i|
          seed = rand(docs.size)
          clusters[i] = [seed]
          # Clone so that later changes to the center never touch the document.
          cluster_centers[i] = docs_hash[seed].clone
        end
        iter = 0
        while (!max_iter || iter < max_iter) && !convergence(cluster_centers, old_cluster_centers)
          puts "Iteration ....#{iter}...#{clusters.inspect}"
          0.upto(k - 1) do |i|
            clusters[i] = []
            old_cluster_centers[i] = cluster_centers[i]
          end
          docs_hash.each_with_index do |doc, i|
            max_value = 0
            max_index = 0
            # ">=" means ties (including all-zero similarity) go to the last center.
            cluster_centers.each_with_index do |cen, j|
              sim = similarity_function.call(doc, cen)
              max_value, max_index = sim, j if sim >= max_value
            end
            clusters[max_index] << i
          end
          recalculate_centers(cluster_centers, clusters, docs_hash)
          iter += 1
        end
        clusters
      end

      # Agglomerative clustering: starts with one cluster per document and
      # repeatedly merges the smallest cluster into the cluster whose center
      # is most similar to it, until only k clusters remain.
      #
      # docs - collection whose elements respond to +clean_word_hash+.
      # k    - number of clusters to stop at; defaults to floor(sqrt(docs.size)).
      # similarity_function - optional block called as (center_hash,
      #        center_hash); defaults to Similarity.vector_similarity.
      #
      # A progress line is printed to stdout on every merge.
      #
      # Returns the clusters containing the index of the elements in docs.
      def hierarchical_clustering(docs, k = nil, &similarity_function)
        similarity_function ||= proc { |*args| Similarity.vector_similarity(*args) }
        k ||= Math.sqrt(docs.size).floor
        cluster_centers = docs.map { |d| d.clean_word_hash }
        clusters = Array.new(docs.size) { |i| [i] }
        iter = 0
        # A lone cluster has nothing to merge into, so stop there even if k < 1.
        while clusters.size > k && clusters.size > 1
          puts "Iteration ....#{iter}...#{clusters.inspect}"
          # Pick the smallest cluster; "<=" keeps the last one on ties.
          min_index = 0
          clusters.each_with_index do |members, i|
            min_index = i if members.size <= clusters[min_index].size
          end
          center = cluster_centers.delete_at(min_index)
          members = clusters.delete_at(min_index)
          max_value = 0
          max_index = 0
          cluster_centers.each_with_index do |cen, j|
            sim = similarity_function.call(center, cen)
            max_value, max_index = sim, j if sim >= max_value
          end
          merge_clusters(clusters[max_index], cluster_centers[max_index], members, center)
          iter += 1
        end
        clusters
      end

      private

      # Merges cluster 2 into cluster 1 (both arguments of cluster 1 are
      # modified in place). The new center is the average of the two centers
      # weighted by the number of members each cluster had.
      def merge_clusters(cluster1, cluster_center1, cluster2, cluster_center2)
        size1 = cluster1.size
        size2 = cluster2.size
        # Drop the cached squared norm; it is stale once the center changes.
        cluster_center1.delete(:total)
        cluster_center1.each_key { |w| cluster_center1[w] *= size1 }
        cluster_center2.each do |w, v|
          next if w == :total
          cluster_center1[w] = (cluster_center1[w] || 0) + v * size2
        end
        cluster1.concat(cluster2)
        merged_size = cluster1.size.to_f
        cluster_center1.each_key { |w| cluster_center1[w] /= merged_size }
      end

      # Replaces every entry of cluster_centers with the mean vector of the
      # documents currently assigned to that cluster, and stores the squared
      # norm of the new center under :total (1 when the norm would be 0).
      def recalculate_centers(cluster_centers, clusters, docs_hash)
        clusters.each_with_index do |cluster, i|
          center = {}
          cluster.each { |d| docs_hash[d].each { |w, v| center[w] = (center[w] || 0) + v } }
          total = 0
          count = cluster.size
          count = 1 if count == 0 # an empty cluster yields an empty center
          center.each_key do |w|
            # Only word entries are averaged; a :total copied over from a
            # document hash is overwritten below.
            next unless w.class == String
            center[w] /= count.to_f
            total += center[w]**2
          end
          total = 1 if total == 0
          center[:total] = total
          cluster_centers[i] = center
        end
      end

      # True when every new center equals the old center at the same index.
      def convergence(new_centers, old_centers)
        new_centers.each_with_index { |c, i| return false unless c == old_centers[i] }
        true
      end
    end
  end
end
|
data/lib/similarity.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
#The MIT License
|
2
|
+
|
3
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
|
+
|
5
|
+
module Clusterer
  module Similarity
    # Cosine similarity between two clusters, or two points.
    #
    # cluster1, cluster2 - Hashes mapping String words to numeric weights.
    #
    # Only String keys count as vector components. The squared norm of each
    # vector is cached in the vector itself under the :total key, so both
    # arguments may be modified.
    #
    # Returns a Float: the dot product divided by the product of the norms.
    # A vector whose norm is 0 is treated as having norm 1, so the result is
    # 0.0 instead of a division by zero.
    def self.vector_similarity(cluster1, cluster2)
      similarity = 0
      cluster1.each do |w, value|
        next unless w.is_a?(String)
        similarity += value * (cluster2[w] || 0)
      end
      similarity / Math.sqrt(squared_norm(cluster1) * squared_norm(cluster2)).to_f
    end

    # Returns the squared norm cached under :total, computing and caching it
    # first when it is missing or 0. A norm of 0 is stored as 1.
    def self.squared_norm(vector)
      total = vector[:total]
      if !total || total == 0
        total = 0
        vector.each { |w, v| total += v * v if w.is_a?(String) }
        total = 1 if total == 0
        vector[:total] = total
      end
      total
    end
    private_class_method :squared_norm
  end
end
|
data/lib/word_hash.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
#The MIT License
|
2
|
+
|
3
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
|
+
|
5
|
+
begin
|
6
|
+
require 'stemmer'
|
7
|
+
rescue LoadError
|
8
|
+
puts "Please install stemmer from http://rubyforge.org/projects/stemmer or 'gem install stemmer'"
|
9
|
+
exit(-1)
|
10
|
+
end
|
11
|
+
|
12
|
+
class String
  # Words ignored by clean_word_hash. Entries are compared against the
  # lower-cased, stemmed form of each word.
  STOP_WORDS = %w[
    and but came can cant com couldnt did didn didnt doesnt dont ever first
    for from have her here him how into isnt itll just last least like most
    new not now sai said she should since some than thi that the thei their
    then those told too true try until url wasnt were when who whether while
    will with within would www yes you youll
  ].freeze

  # Returns a Hash mapping each stemmed word of the string to the number of
  # times it occurs. Characters that are neither word characters nor
  # whitespace are removed before the string is split into words.
  def clean_word_hash
    word_hash(gsub(/[^\w\s]/, "").split)
  end

  private

  # Counts the lower-cased, stemmed form of every word in +words+, skipping
  # results of two characters or fewer and anything listed in STOP_WORDS.
  def word_hash(words)
    counts = {}
    words.each do |word|
      stemmed = word.downcase.stem
      next if stemmed.size <= 2 || STOP_WORDS.include?(stemmed)
      counts[stemmed] = (counts[stemmed] || 0) + 1
    end
    counts
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#The MIT License
|
2
|
+
|
3
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
|
+
|
5
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
6
|
+
|
7
|
+
require 'test/unit'
|
8
|
+
require 'clusterer'
|
9
|
+
|
10
|
+
# Smoke tests for both clustering entry points: the result must be non-empty,
# honour an explicit k, and assign every document index exactly once.
class TestClusterer < Test::Unit::TestCase
  DOCS = ["hello world", "mea culpa", "goodbye world"]

  def test_simple_kmeans
    clusters = Clusterer::Clustering.kmeans_clustering(DOCS)
    assert_not_equal [], clusters
    assert_equal [0, 1, 2], clusters.flatten.sort

    clusters = Clusterer::Clustering.kmeans_clustering(DOCS, 2)
    assert_equal 2, clusters.size
    assert_equal [0, 1, 2], clusters.flatten.sort
  end

  def test_simple_hierarchical_clustering
    clusters = Clusterer::Clustering.hierarchical_clustering(DOCS)
    assert_not_equal [], clusters
    assert_equal [0, 1, 2], clusters.flatten.sort

    clusters = Clusterer::Clustering.hierarchical_clustering(DOCS, 2)
    assert_equal 2, clusters.size
    assert_equal [0, 1, 2], clusters.flatten.sort
  end
end
|
metadata
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.11
|
3
|
+
specification_version: 1
|
4
|
+
name: clusterer
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.0
|
7
|
+
date: 2006-08-22 00:00:00 +05:30
|
8
|
+
summary: A library of clustering algorithms for text data.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: ssinghi@kreeti.com
|
12
|
+
homepage: http://rubyforge.org/projects/clusterer/
|
13
|
+
rubyforge_project:
|
14
|
+
description:
|
15
|
+
autorequire: clusterer
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
authors:
|
29
|
+
- Surendra K Singhi
|
30
|
+
files:
|
31
|
+
- tests/clusterer_test.rb
|
32
|
+
- lib/clusterer.rb
|
33
|
+
- lib/similarity.rb
|
34
|
+
- lib/word_hash.rb
|
35
|
+
- examples/google_search_cluster.rb
|
36
|
+
- examples/yahoo_search_cluster.rb
|
37
|
+
- README
|
38
|
+
test_files:
|
39
|
+
- tests/clusterer_test.rb
|
40
|
+
rdoc_options: []
|
41
|
+
|
42
|
+
extra_rdoc_files:
|
43
|
+
- README
|
44
|
+
executables: []
|
45
|
+
|
46
|
+
extensions: []
|
47
|
+
|
48
|
+
requirements: []
|
49
|
+
|
50
|
+
dependencies:
|
51
|
+
- !ruby/object:Gem::Dependency
|
52
|
+
name: stemmer
|
53
|
+
version_requirement:
|
54
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: 0.0.0
|
59
|
+
version:
|