clusterer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +21 -0
- data/examples/google_search_cluster.rb +49 -0
- data/examples/yahoo_search_cluster.rb +59 -0
- data/lib/clusterer.rb +100 -0
- data/lib/similarity.rb +27 -0
- data/lib/word_hash.rb +93 -0
- data/tests/clusterer_test.rb +20 -0
- metadata +59 -0
data/README
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
A ruby library which implements clustering algorithms for text
|
2
|
+
mining.
|
3
|
+
|
4
|
+
The currently implemented algorithms are K-Means and Hierarchical
|
5
|
+
clustering.
|
6
|
+
|
7
|
+
Hierarchical clustering gives better results, but its complexity is roughly O(n*n).
|
8
|
+
|
9
|
+
K-means is very fast, O(k*n*i), where k is the number of clusters, n the number of documents, and i the number of iterations.
|
10
|
+
|
11
|
+
The examples need Google/Yahoo API keys, and the Yahoo example requires
|
12
|
+
ysearch-rb from
|
13
|
+
|
14
|
+
http://developer.yahoo.com/download/download.html
|
15
|
+
|
16
|
+
|
17
|
+
Hybrid clustering algorithms + more similarity metrics + semi-supervised
|
18
|
+
clustering... coming soon ... (or submit patches/show keen interest, if
|
19
|
+
you want faster results)
|
20
|
+
|
21
|
+
Happy hacking......
|
@@ -0,0 +1,49 @@
|
|
1
|
+
#The MIT License
|
2
|
+
|
3
|
+
#Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
|
+
|
5
|
+
require 'soap/wsdlDriver'
|
6
|
+
require 'clusterer'
|
7
|
+
|
8
|
+
## try using HTML stripping to get better results
|
9
|
+
|
10
|
+
# Example: fetch Google search results over SOAP, cluster them with k-means and
# write the clusters to temp.html as a nested HTML list.
# `key` must be set to a valid Google API key before running.
WSDL_URL = "http://api.google.com/GoogleSearch.wsdl"
driver = SOAP::WSDLDriverFactory.new(WSDL_URL).create_rpc_driver
query = 'kreeti'
key = ""

# First request; the response also carries the estimated total hit count.
results = driver.doGoogleSearch(key, query, 0, 10, true, "", 1, "lang_en", "", "")
count = results.resultElements.size
max_count = results.estimatedTotalResultsCount.to_i
results = results.resultElements

# Keep requesting further pages (offset = number of results collected so far)
# until about 100 results are gathered or the estimated total is exhausted.
while count < 100 && count <= max_count
  more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
  page = more_results.resultElements
  # An empty page means the service has nothing more to give; without this
  # check `count` would never grow and the loop would spin forever.
  break if page.nil? || page.empty?
  results.concat(page)
  count += page.size
end

# Cluster on "title snippet" with the markup tags stripped out.
clusters = Clusterer::Clustering.kmeans_clustering(
  results.collect do |r|
    r.title.to_s.gsub(/<\/?[^>]*>/, "") + " " + r.snippet.to_s.gsub(/<\/?[^>]*>/, "")
  end
)

#writing the output
# One outer <li> per cluster, each holding an inner <ul> of its documents.
File.open("temp.html", "w") do |f|
  f.write("<ul>")
  clusters.each do |clus|
    f.write("<li>")
    f.write("<ul>")
    clus.each do |d|
      f.write("<li>")
      f.write("<span class='title'>")
      f.write(results[d].title)
      f.write("</span>")
      f.write("<span class='snippet'>")
      f.write(results[d].snippet)
      f.write("</span>")
      f.write("</li>")
    end
    f.write("</ul>")
    f.write("</li>") # close this cluster's item inside the loop
  end
  f.write("</ul>")
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
#The MIT License
|
2
|
+
|
3
|
+
#Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
|
+
|
5
|
+
require 'clusterer'
|
6
|
+
require 'ysearch-rb/lib/ysearch'
|
7
|
+
|
8
|
+
## try using HTML stripping to get better results
|
9
|
+
|
10
|
+
# Example: fetch Yahoo search results, cluster them hierarchically and write
# the clusters to temp.html as a nested HTML list.

# get the query parameter
query = "kreeti"

##
# create a web search object:
# Arguments:
# 1. App ID (You can get one at http://developer.yahoo.net)
# 2. The query
# 3. type can be one of: 'all', 'any' or 'phrase'
# 4. The no. of results
##
obj = WebSearch.new('YahooDemo', query, 'all', 100)

results = obj.parse_results

# Paging loop carried over from the Google example; unused here and kept only
# for reference.
# count= results.resultElements.size
# max_count = results.estimatedTotalResultsCount.to_i
# results = results.resultElements

# while (count < 100 && count <= max_count)
# more_results = driver.doGoogleSearch(key, query, count, 10, true, "", 1, "lang_en", "", "")
# results.concat(more_results.resultElements)
# count += more_results.resultElements.size
# end

# Cluster on "Title Summary" with the markup tags stripped out.
# (kmeans_clustering can be used here instead.)
clusters = Clusterer::Clustering.hierarchical_clustering(
  results.collect do |r|
    r['Title'].to_s.gsub(/<\/?[^>]*>/, "") + " " + r['Summary'].to_s.gsub(/<\/?[^>]*>/, "")
  end
)

#writing the output
# One outer <li> per cluster, each holding an inner <ul> of its documents.
File.open("temp.html", "w") do |f|
  f.write("<ul>")
  clusters.each do |clus|
    f.write("<li>")
    f.write("<ul>")
    clus.each do |d|
      f.write("<li>")
      f.write("<span class='title'>")
      f.write(results[d]['Title'])
      f.write("</span>")
      f.write("<span class='snippet'>")
      f.write(results[d]['Summary'])
      f.write("</span>")
      f.write("</li>")
    end
    f.write("</ul>")
    f.write("</li>") # close this cluster's item inside the loop
  end
  f.write("</ul>")
end
|
data/lib/clusterer.rb
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
#The MIT License
|
2
|
+
|
3
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
|
+
|
5
|
+
require 'word_hash'
|
6
|
+
require 'similarity'
|
7
|
+
|
8
|
+
module Clusterer
  # Clustering algorithms over text documents.
  #
  # Each document is turned into a word => weight Hash via
  # String#clean_word_hash. Both algorithms return an Array of clusters, each
  # cluster being an Array of indexes into the +docs+ argument.
  class Clustering
    class << self
      # K-means clustering.
      #
      # docs     - Array of Strings to cluster.
      # k        - number of clusters; defaults to sqrt(docs.size), truncated.
      # max_iter - maximum number of iterations; nil/false means "run until
      #            the centers stop changing".
      # similarity_function - optional block called as (doc_vector,
      #            center_vector); a larger value means "more similar". Values
      #            are expected to be non-negative. Defaults to
      #            Similarity.vector_similarity.
      #
      # returns clusters containing index of the elements in doc
      def kmeans_clustering(docs, k = nil, max_iter = 10, &similarity_function)
        return [] if docs.empty?

        similarity_function ||= Proc.new { |*args| Similarity.vector_similarity(*args) }
        # Array.new and upto already truncated a fractional k; do it explicitly.
        k = (k || Math.sqrt(docs.size)).to_i
        docs_hash = docs.collect { |d| d.clean_word_hash }

        # Seed every cluster with a random document. Distinct documents are
        # used whenever there are enough of them, so that two clusters do not
        # start from the same center.
        seeds = if k <= docs.size
                  (0...docs.size).to_a.sample(k)
                else
                  Array.new(k) { rand(docs.size) }
                end
        clusters = seeds.collect { |x| [x] }
        cluster_centers = seeds.collect { |x| docs_hash[x].clone }
        old_cluster_centers = Array.new(k)

        iter = 0
        while (!max_iter || iter < max_iter) && !convergence(cluster_centers, old_cluster_centers)
          puts "Iteration ....#{iter}...#{clusters.inspect}"
          # Shallow copy: recalculate_centers replaces the entries of
          # cluster_centers, so the old Hashes stay reachable from here.
          old_cluster_centers = cluster_centers.dup
          clusters = Array.new(k) { [] }
          docs_hash.each_with_index do |doc, i|
            clusters[closest_center(doc, cluster_centers, similarity_function)] << i
          end
          recalculate_centers(cluster_centers, clusters, docs_hash)
          iter += 1
        end
        clusters
      end

      # Agglomerative clustering: start with one cluster per document and
      # repeatedly merge the smallest cluster into the cluster whose center is
      # most similar to it, until only +k+ clusters remain.
      #
      # docs - Array of Strings to cluster.
      # k    - number of clusters to stop at; defaults to sqrt(docs.size),
      #        truncated. At least one cluster is always kept.
      # similarity_function - optional block, see kmeans_clustering.
      #
      # returns clusters containing index of the elements in doc
      def hierarchical_clustering(docs, k = nil, &similarity_function)
        return [] if docs.empty?

        similarity_function ||= Proc.new { |*args| Similarity.vector_similarity(*args) }
        k = (k || Math.sqrt(docs.size)).to_i
        cluster_centers = docs.collect { |d| d.clean_word_hash }
        clusters = Array.new(docs.size) { |i| [i] }

        iter = 0
        # The size > 1 guard keeps a k below 1 from removing the last cluster
        # and then trying to merge it into nothing.
        while clusters.size > k && clusters.size > 1
          puts "Iteration ....#{iter}...#{clusters.inspect}"
          # Find the smallest cluster (the last one wins a tie). The starting
          # value must be the size of the first cluster, not a bit of the
          # cluster count.
          min_value, min_index = clusters.first.size, 0
          clusters.each_with_index do |a, i|
            min_value, min_index = a.size, i if a.size <= min_value
          end
          p = cluster_centers.delete_at(min_index)
          c = clusters.delete_at(min_index)
          max_index = closest_center(p, cluster_centers, similarity_function)
          merge_clusters(clusters[max_index], cluster_centers[max_index], c, p)
          iter += 1
        end
        clusters
      end

      private

      # Index of the center most similar to +vector+. The last center wins a
      # tie; index 0 is returned when no similarity reaches 0.
      def closest_center(vector, centers, similarity_function)
        best_value, best_index = 0, 0
        centers.each_with_index do |center, j|
          sim = similarity_function.call(vector, center)
          best_value, best_index = sim, j if sim >= best_value
        end
        best_index
      end

      #merge cluster 2 into cluster 1
      # The centers are averages, so each is scaled back up by its member
      # count, the sums are added, and the result is divided by the merged
      # member count. Both cluster1 and cluster_center1 are updated in place;
      # cluster_center2 is scaled in place as well.
      def merge_clusters(cluster1, cluster_center1, cluster2, cluster_center2)
        cluster_center1.each_key { |w| cluster_center1[w] *= cluster1.size }
        cluster_center2.each_key { |w| cluster_center2[w] *= cluster2.size }
        cluster_center1.merge!(cluster_center2) { |_w, mine, theirs| mine + theirs }
        cluster1.concat(cluster2)
        cluster_center1.each_key { |w| cluster_center1[w] /= cluster1.size.to_f }
        # Drop the :total entry so a stale value is not kept for the new center.
        cluster_center1.delete(:total)
      end

      # Replace every entry of +cluster_centers+ with the mean vector of the
      # documents currently assigned to that cluster. The sum of squared
      # weights is stored under :total (1 when the center is empty).
      def recalculate_centers(cluster_centers, clusters, docs_hash)
        clusters.each_with_index do |cluster, i|
          center = {}
          cluster.each do |d|
            docs_hash[d].each { |w, v| center[w] = (center[w] || 0) + v }
          end
          count = cluster.size
          count = 1 if count == 0
          total = 0
          center.each_key do |w|
            # Non-String keys (such as :total) are bookkeeping, not words.
            next unless w.is_a?(String)
            center[w] /= count.to_f
            total += center[w]**2
          end
          total = 1 if total == 0
          center[:total] = total
          cluster_centers[i] = center
        end
      end

      # True when every center equals its counterpart from the previous
      # iteration.
      def convergence(new_centers, old_centers)
        new_centers.each_with_index { |c, i| return false unless c == old_centers[i] }
        true
      end
    end
  end
end
|
data/lib/similarity.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
#The MIT License
|
2
|
+
|
3
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
|
+
|
5
|
+
module Clusterer
  module Similarity
    #find similarity between two clusters, or two points
    #
    # Both arguments are Hashes mapping word (String) => weight. The result is
    # the dot product of the two vectors divided by the square root of the
    # product of their squared norms (cosine similarity).
    #
    # Each vector's squared norm is cached in the Hash itself under the
    # :total key, so BOTH arguments may be modified by this call. Non-String
    # keys are never treated as words, for either vector.
    def self.vector_similarity(cluster1, cluster2)
      dot = 0
      cluster1.each do |word, weight|
        next unless word.is_a?(String)
        dot += weight * (cluster2[word] || 0)
      end
      cluster1[:total] ||= squared_norm(cluster1)
      cluster2[:total] ||= squared_norm(cluster2)
      # A cached zero norm would make the division below undefined.
      cluster1[:total] = 1 if cluster1[:total] == 0
      dot / Math.sqrt(cluster1[:total] * cluster2[:total]).to_f
    end

    # Sum of squared weights over the String keys of +vector+; 1 for an empty
    # or all-zero vector so that it can safely be used as a divisor.
    def self.squared_norm(vector)
      total = 0
      vector.each { |word, weight| total += weight * weight if word.is_a?(String) }
      total == 0 ? 1 : total
    end
    private_class_method :squared_norm
  end
end
|
data/lib/word_hash.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
#The MIT License
|
2
|
+
|
3
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
|
+
|
5
|
+
begin
|
6
|
+
require 'stemmer'
|
7
|
+
rescue LoadError
|
8
|
+
puts "Please install stemmer from http://rubyforge.org/projects/stemmer or 'gem install stemmer'"
|
9
|
+
exit(-1)
|
10
|
+
end
|
11
|
+
|
12
|
+
class String
  # Turn the string into a bag of words: punctuation is removed, the text is
  # split on whitespace, and every remaining word is downcased and stemmed.
  #
  # Returns a Hash mapping stem (String) => number of occurrences. Stems of
  # two characters or fewer and stop words are left out.
  def clean_word_hash
    word_hash(gsub(/[^\w\s]/, "").split)
  end

  private

  # Count the stems of +words+ (an Array of Strings).
  #
  # A word is dropped when either its downcased form or its stem is listed in
  # STOP_WORDS. Checking both matters because the list mixes plain words and
  # stems, and a listed word never matched when only its stem was compared
  # and the stem differed from the listed spelling.
  def word_hash(words)
    counts = Hash.new
    words.each do |raw|
      word = raw.downcase
      stemmed = word.stem
      next unless stemmed.size > 2
      next if STOP_WORDS.include?(word) || STOP_WORDS.include?(stemmed)
      counts[stemmed] = (counts[stemmed] || 0) + 1
    end
    counts
  end

  # Words (and a few stems such as "sai", "thi", "thei") that carry no
  # meaning for clustering.
  STOP_WORDS = ["and",
                "but",
                "came",
                "can",
                "cant",
                "com",
                "couldnt",
                "did",
                "didn",
                "didnt",
                "doesnt",
                "dont",
                "ever",
                "first",
                "for",
                "from",
                "have",
                "her",
                "here",
                "him",
                "how",
                "into",
                "isnt",
                "itll",
                "just",
                "last",
                "least",
                "like",
                "most",
                "new",
                "not",
                "now",
                "sai",
                "said",
                "she",
                "should",
                "since",
                "some",
                "than",
                "thi",
                "that",
                "the",
                "thei",
                "their",
                "then",
                "those",
                "told",
                "too",
                "true",
                "try",
                "until",
                "url",
                "wasnt",
                "were",
                "when",
                "who",
                "whether",
                "while",
                "will",
                "with",
                "within",
                "would",
                "www",
                "yes",
                "you",
                "youll",
               ]
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#The MIT License
|
2
|
+
|
3
|
+
###Copyright (c) 2006 Surendra K Singhi <ssinghi AT kreeti DOT com>
|
4
|
+
|
5
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
6
|
+
|
7
|
+
require 'test/unit'
|
8
|
+
require 'clusterer'
|
9
|
+
|
10
|
+
# Smoke tests for the two public clustering entry points: each must return a
# non-empty result with the default k, and exactly k clusters when k is given.
class TestClusterer < Test::Unit::TestCase
  def test_simple_kmeans
    with_default_k = Clusterer::Clustering.kmeans_clustering(["hello world", "mea culpa", "goodbye world"])
    assert_not_equal [], with_default_k

    two_clusters = Clusterer::Clustering.kmeans_clustering(["hello world", "mea culpa", "goodbye world"], 2)
    assert_equal 2, two_clusters.size
  end

  def test_simple_hierarchical_clustering
    with_default_k = Clusterer::Clustering.hierarchical_clustering(["hello world", "mea culpa", "goodbye world"])
    assert_not_equal [], with_default_k

    two_clusters = Clusterer::Clustering.hierarchical_clustering(["hello world", "mea culpa", "goodbye world"], 2)
    assert_equal 2, two_clusters.size
  end
end
|
metadata
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.11
|
3
|
+
specification_version: 1
|
4
|
+
name: clusterer
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.0
|
7
|
+
date: 2006-08-22 00:00:00 +05:30
|
8
|
+
summary: A library of clustering algorithms for text data.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: ssinghi@kreeti.com
|
12
|
+
homepage: http://rubyforge.org/projects/clusterer/
|
13
|
+
rubyforge_project:
|
14
|
+
description:
|
15
|
+
autorequire: clusterer
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
authors:
|
29
|
+
- Surendra K Singhi
|
30
|
+
files:
|
31
|
+
- tests/clusterer_test.rb
|
32
|
+
- lib/clusterer.rb
|
33
|
+
- lib/similarity.rb
|
34
|
+
- lib/word_hash.rb
|
35
|
+
- examples/google_search_cluster.rb
|
36
|
+
- examples/yahoo_search_cluster.rb
|
37
|
+
- README
|
38
|
+
test_files:
|
39
|
+
- tests/clusterer_test.rb
|
40
|
+
rdoc_options: []
|
41
|
+
|
42
|
+
extra_rdoc_files:
|
43
|
+
- README
|
44
|
+
executables: []
|
45
|
+
|
46
|
+
extensions: []
|
47
|
+
|
48
|
+
requirements: []
|
49
|
+
|
50
|
+
dependencies:
|
51
|
+
- !ruby/object:Gem::Dependency
|
52
|
+
name: stemmer
|
53
|
+
version_requirement:
|
54
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: 0.0.0
|
59
|
+
version:
|