same_same 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +18 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +44 -0
- data/LICENSE.txt +22 -0
- data/README.md +39 -0
- data/Rakefile +1 -0
- data/examples/dbscan_digg.rb +25 -0
- data/examples/dbscan_lines.rb +35 -0
- data/examples/rock_digg.rb +20 -0
- data/examples/rock_lines.rb +31 -0
- data/lib/same_same.rb +15 -0
- data/lib/same_same/cluster.rb +27 -0
- data/lib/same_same/cluster_similarity.rb +10 -0
- data/lib/same_same/cosine_distance.rb +27 -0
- data/lib/same_same/cosine_similarity.rb +22 -0
- data/lib/same_same/data_point.rb +12 -0
- data/lib/same_same/dbscan_algorithm.rb +135 -0
- data/lib/same_same/dbscan_clusters.rb +88 -0
- data/lib/same_same/dbscan_neighborhood.rb +68 -0
- data/lib/same_same/dbscan_numeric_vectors.rb +7 -0
- data/lib/same_same/dbscan_term_frequency_vectors.rb +7 -0
- data/lib/same_same/dendrogram.rb +28 -0
- data/lib/same_same/dendrogram_printer.rb +74 -0
- data/lib/same_same/jaquard_coefficient.rb +9 -0
- data/lib/same_same/link_matrix.rb +62 -0
- data/lib/same_same/merge_goodness_measure.rb +30 -0
- data/lib/same_same/rock_algorithm.rb +51 -0
- data/lib/same_same/rock_clusters.rb +68 -0
- data/lib/same_same/similarity_matrix.rb +20 -0
- data/lib/same_same/symmetrical_matrix.rb +39 -0
- data/lib/same_same/term_frequency_builder.rb +20 -0
- data/lib/same_same/version.rb +3 -0
- data/same_same.gemspec +23 -0
- data/spec/fixtures/digg_stories.csv +49 -0
- data/spec/fixtures/lines.csv +899 -0
- data/spec/same_same/dbscan_algorithm_spec.rb +72 -0
- data/spec/same_same/jaquard_coefficient_spec.rb +24 -0
- data/spec/same_same/link_matrix_spec.rb +29 -0
- data/spec/same_same/merge_goodness_measure_spec.rb +34 -0
- data/spec/same_same/rock_algorithm_spec.rb +71 -0
- data/spec/same_same/similarity_matrix_spec.rb +20 -0
- data/spec/same_same/symmetrical_matrix_spec.rb +69 -0
- metadata +144 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
same_same (0.0.1)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
coderay (1.0.9)
|
10
|
+
colored (1.2)
|
11
|
+
columnize (0.3.6)
|
12
|
+
debugger (1.6.0)
|
13
|
+
columnize (>= 0.3.1)
|
14
|
+
debugger-linecache (~> 1.2.0)
|
15
|
+
debugger-ruby_core_source (~> 1.2.1)
|
16
|
+
debugger-linecache (1.2.0)
|
17
|
+
debugger-ruby_core_source (1.2.2)
|
18
|
+
diff-lcs (1.2.4)
|
19
|
+
method_source (0.8.1)
|
20
|
+
pry (0.9.12.2)
|
21
|
+
coderay (~> 1.0.5)
|
22
|
+
method_source (~> 0.8)
|
23
|
+
slop (~> 3.4)
|
24
|
+
pry-debugger (0.2.2)
|
25
|
+
debugger (~> 1.3)
|
26
|
+
pry (~> 0.9.10)
|
27
|
+
rspec (2.13.0)
|
28
|
+
rspec-core (~> 2.13.0)
|
29
|
+
rspec-expectations (~> 2.13.0)
|
30
|
+
rspec-mocks (~> 2.13.0)
|
31
|
+
rspec-core (2.13.1)
|
32
|
+
rspec-expectations (2.13.0)
|
33
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
34
|
+
rspec-mocks (2.13.1)
|
35
|
+
slop (3.4.5)
|
36
|
+
|
37
|
+
PLATFORMS
|
38
|
+
ruby
|
39
|
+
|
40
|
+
DEPENDENCIES
|
41
|
+
colored
|
42
|
+
pry-debugger
|
43
|
+
rspec
|
44
|
+
same_same!
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Julian Russell
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
samesame
|
2
|
+
========
|
3
|
+
|
4
|
+
Ruby version of clustering algorithms from "Algorithms of the Intelligent Web"
|
5
|
+
|
6
|
+
## Status
|
7
|
+
|
8
|
+
Pretty much a direct port of the sample code from the book (Java).
|
9
|
+
|
10
|
+
### Todo
|
11
|
+
* **Expand specs**. The basics have specs, but some of the computation specs only test that the thing doesn't blow up, NOT that the calculation is right. Lots of higher-level code doesn't have any specs.
|
12
|
+
* **Refactor**. Some of the classes and methods are filthy. Things like `Cluster` are just thin wrappers that delegate to arrays.
|
13
|
+
* **Push in more data and see what happens**
|
14
|
+
|
15
|
+
## Installation
|
16
|
+
|
17
|
+
Add this line to your application's Gemfile:
|
18
|
+
|
19
|
+
gem 'same_same'
|
20
|
+
|
21
|
+
And then execute:
|
22
|
+
|
23
|
+
$ bundle
|
24
|
+
|
25
|
+
Or install it yourself as:
|
26
|
+
|
27
|
+
$ gem install same_same
|
28
|
+
|
29
|
+
## Usage
|
30
|
+
|
31
|
+
TODO: Write usage instructions here
|
32
|
+
|
33
|
+
## Contributing
|
34
|
+
|
35
|
+
1. Fork it
|
36
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
37
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
38
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
39
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Example: cluster the Digg stories fixture with DBSCAN and print the
# resulting clusters.
require 'same_same'
require 'same_same/dendrogram_printer'
require 'csv'

# Resolve the fixture relative to this file (not the current working
# directory) so the example can be launched from anywhere.
fixture_path = File.expand_path("../../spec/fixtures/digg_stories.csv", __FILE__)

digg_rows = CSV.read(fixture_path, headers: true)

# One data point per story: labelled by its title, described by the
# lower-cased words of its category, topic and description columns.
digg_data = digg_rows.map do |row|
  terms = %w(category topic description).map { |key| row[key] }
                                        .join(" ").downcase.split(/\s+/)
  SameSame::DataPoint.new(row["title"], terms)
end

distance = SameSame::CosineDistance.new
vector_builder = SameSame::DbscanTermFrequencyVectors.new
algo = SameSame::DbscanAlgorithm.new(
  points: digg_data,
  eps: 0.7,
  min_points: 2,
  vector_calculator: vector_builder,
  distance: distance)

clusters = algo.cluster

SameSame::DendrogramPrinter.new.print_clusters(clusters)
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# Example: run DBSCAN separately over each category of the "lines" fixture
# and print every cluster that is not the "Noise" cluster.
require 'same_same'
require 'same_same/dendrogram_printer'
require 'csv'

# Resolve the fixture relative to this file (not the current working
# directory) so the example can be launched from anywhere.
fixture_path = File.expand_path("../../spec/fixtures/lines.csv", __FILE__)

csv = CSV.read(fixture_path, headers: true)

# Rows are grouped by category only.
# NOTE(review): the original carried a commented-out `row['price']` here,
# presumably an alternative grouping key - confirm before re-enabling.
groups = csv.group_by { |row| [row['categories']].join("-") }

# Turn every group into [group_key, data_points]. A data point is labelled
# "id: name" (whitespace runs collapsed) and described by the lower-cased
# words of its name and price columns.
fragments = groups.map do |group_key, group|
  data_points = group.map do |row|
    label = [row["id"], row["name"]].map { |t| t.gsub(/\s+/, ' ') }.join(": ")
    terms = %w(name price).map { |key| row[key] }.join(" ").downcase.split(/\s+/)
    SameSame::DataPoint.new(label, terms)
  end
  [group_key, data_points]
end

distance = SameSame::CosineDistance.new
vector_builder = SameSame::DbscanTermFrequencyVectors.new

fragments.each do |_group_key, group|
  # Groups with a single row are skipped.
  next unless group.size > 1

  algo = SameSame::DbscanAlgorithm.new(
    points: group,
    eps: 0.3,
    min_points: 2,
    vector_calculator: vector_builder,
    distance: distance)

  clusters = algo.cluster
  SameSame::DendrogramPrinter.new.print_clusters(clusters.select { |c| c.name != "Noise" })
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# Example: cluster the Digg stories fixture with the ROCK algorithm and
# print the last level of the resulting dendrogram.
require 'same_same'
require 'same_same/dendrogram_printer'
require 'csv'

# Resolve the fixture relative to this file (not the current working
# directory) so the example can be launched from anywhere.
fixture_path = File.expand_path("../../spec/fixtures/digg_stories.csv", __FILE__)

digg_rows = CSV.read(fixture_path, headers: true)

# One data point per story: labelled by its title, described by the
# lower-cased words of its category, topic and description columns.
digg_data = digg_rows.map do |row|
  terms = %w(category topic description).map { |key| row[key] }
                                        .join(" ").downcase.split(/\s+/)
  SameSame::DataPoint.new(row["title"], terms)
end

# Parameters handed straight to RockAlgorithm.
# NOTE(review): presumably k is the target cluster count and th the
# similarity threshold - confirm against RockAlgorithm.
k = 2
th = 0.2
algo = SameSame::RockAlgorithm.new(datapoints: digg_data, k: k, th: th)
dnd = algo.cluster

SameSame::DendrogramPrinter.new.print_last(dnd)
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# Example: run the ROCK algorithm separately over each (category, price)
# group of the "lines" fixture and print the last dendrogram level of every
# group that produced at least one non-singleton leaf.
require 'same_same'
require 'same_same/dendrogram_printer'
require 'csv'

# Resolve the fixture relative to this file (not the current working
# directory) so the example can be launched from anywhere.
fixture_path = File.expand_path("../../spec/fixtures/lines.csv", __FILE__)

csv = CSV.read(fixture_path, headers: true)

groups = csv.group_by { |row| [row['categories'], row['price']].join("-") }

# Turn every group into [group_key, data_points]. A data point is labelled
# "id: name" (whitespace runs collapsed) and described by the lower-cased
# words of its name, price and categories columns plus the whole lower-cased
# name as one extra term.
fragments = groups.map do |group_key, group|
  data_points = group.map do |row|
    label = [row["id"], row["name"]].map { |t| t.gsub(/\s+/, ' ') }.join(": ")
    words = %w(name price categories).map { |key| row[key] }
                                     .join(" ").downcase.split(/\s+/)
    SameSame::DataPoint.new(label, words + [row["name"].downcase.gsub(/\s+/, ' ')])
  end
  [group_key, data_points]
end

# Parameters handed straight to RockAlgorithm.
# NOTE(review): presumably k is the target cluster count and th the
# similarity threshold - confirm against RockAlgorithm.
k = 4
th = 0.4
fragments.each do |_group_key, group|
  # Groups with a single row are skipped.
  next unless group.size > 1

  algo = SameSame::RockAlgorithm.new(datapoints: group, k: k, th: th)
  dnd = algo.cluster
  SameSame::DendrogramPrinter.new.print_last(dnd) if dnd.non_singelton_leaves?
end
|
30
|
+
|
31
|
+
|
data/lib/same_same.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'same_same/cluster'
|
2
|
+
require 'same_same/cosine_distance'
|
3
|
+
require 'same_same/data_point'
|
4
|
+
require 'same_same/dbscan_algorithm'
|
5
|
+
require 'same_same/dbscan_numeric_vectors'
|
6
|
+
require 'same_same/dbscan_term_frequency_vectors'
|
7
|
+
require 'same_same/jaquard_coefficient'
|
8
|
+
require 'same_same/rock_algorithm'
|
9
|
+
require 'same_same/symmetrical_matrix'
|
10
|
+
require 'same_same/term_frequency_builder'
|
11
|
+
require 'same_same/version'
|
12
|
+
|
13
|
+
# Top-level namespace of the same_same gem. The library's classes
# (Cluster, CosineDistance, DbscanAlgorithm, ...) are declared under
# +SameSame+, so this file opens the same, correctly cased module.
module SameSame
end

# Keep the mis-cased constant this file originally declared resolvable, so
# code that referenced +Samesame+ keeps working.
Samesame = SameSame unless defined?(Samesame)
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module SameSame
  # A named group of datapoints.
  #
  # This is a thin wrapper that delegates to the wrapped +datapoints+
  # collection.
  class Cluster
    # datapoints - the wrapped collection of points
    # name       - optional label for the cluster (may be nil)
    attr_accessor :datapoints, :name

    # dp   - collection of datapoints belonging to the cluster
    # name - optional label (defaults to nil)
    def initialize(dp, name = nil)
      self.datapoints = dp
      self.name = name
    end

    # Merges two clusters into a new Cluster; neither operand is modified.
    #
    # The merged name is the non-nil names joined with "+", or nil when
    # neither cluster is named.
    def +(other)
      names = [name, other.name].compact
      new_name = names.empty? ? nil : names.join("+")
      Cluster.new(datapoints + other.datapoints, new_name)
    end

    # Number of datapoints in the cluster.
    def size
      datapoints.size
    end

    # Folds over the datapoints. All arguments are forwarded to the wrapped
    # collection's +inject+, so besides the original
    # <tt>inject(initial) { ... }</tt> form, <tt>inject(:+)</tt> and
    # <tt>inject { ... }</tt> work as well.
    def inject(*args, &block)
      datapoints.inject(*args, &block)
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'same_same/cosine_similarity'
|
2
|
+
|
3
|
+
module SameSame
  # Distance measure derived from cosine similarity:
  # <tt>distance = 1.0 - similarity</tt>.
  class CosineDistance

    # Object used to compute the similarity; it must respond to
    # +sim(x, y)+. Defaults to a CosineSimilarity instance.
    attr_accessor :cosin

    def initialize
      self.cosin = CosineSimilarity.new
    end

    # Returns the cosine distance between the vectors +x+ and +y+.
    #
    # Raises ArgumentError when the computed similarity is negative, since
    # such a value cannot be turned into a distance by this measure.
    def distance(x, y)
      sim = cosin.sim(x, y)

      if sim < 0.0
        # Interpolation is required here: +sim+ is numeric and cannot be
        # appended to a String with +.
        raise ArgumentError,
              "Can't use this value to calculate distance. " \
              "x[]=#{x.inspect}, y[]=#{y.inspect}, cosin.sim(x,y)=#{sim}"
      end

      1.0 - sim
    end
  end

end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module SameSame

  # Cosine similarity between two numeric vectors.
  class CosineSimilarity

    # Similarity of two datapoints: their +data+ is converted into a pair
    # of vectors by TermFrequencyBuilder.build_vectors and then compared
    # with #sim.
    def similarity(x, y)
      sim(*TermFrequencyBuilder.build_vectors( x.data, y.data ))
    end

    # Cosine of the angle between +v1+ and +v2+:
    # <tt>dot_product / (norm(v1) * norm(v2))</tt>.
    #
    # Returns 0.0 when either vector has zero magnitude (including empty
    # vectors), where the quotient would otherwise be NaN.
    def sim(v1, v2)
      denominator = norm(v1) * norm(v2)
      return 0.0 if denominator.zero?

      dot_product(v1, v2) / denominator
    end

    # Sum of the pairwise products of the elements of +v1+ and +v2+;
    # 0 for empty vectors.
    def dot_product(v1, v2)
      v1.zip(v2).inject(0) { |sum, (val1, val2)| sum + val1 * val2 }
    end

    # Euclidean length of +vector+; 0.0 for an empty vector.
    def norm(vector)
      Math.sqrt( vector.inject(0) { |sum, val| sum + val ** 2 } )
    end
  end

end
|
@@ -0,0 +1,135 @@
|
|
1
|
+
require 'same_same/dbscan_neighborhood'
|
2
|
+
require 'same_same/dbscan_clusters'
|
3
|
+
|
4
|
+
module SameSame

  # Implementation of the DBSCAN clustering algorithm.
  #
  # Algorithm parameters:
  #
  # * Eps - threshold value to determine point neighbors. Two points are
  #   neighbors if the distance between them does not exceed this threshold
  #   value.
  # * MinPts - minimum number of points in any cluster.
  #
  # Choice of parameter values depends on the data.
  #
  # Point types:
  #
  # * Core point - point that belongs to the core of the cluster. It has at
  #   least MinPts neighboring points.
  # * Border point - is a neighbor to at least one core point but it doesn't
  #   have enough neighbors to be a core point.
  # * Noise point - is a point that doesn't belong to any cluster because it
  #   is not close to any of the core points.
  #
  class DbscanAlgorithm
    # The points to cluster.
    attr_accessor :points

    # Bookkeeping of which point belongs to which set. Initially all points
    # are in the unclassified set.
    attr_accessor :dbscan_clusters

    # Number of points that should exist in the neighborhood for a point
    # to be a core point.
    #
    # Best value for this parameter depends on the data set.
    attr_accessor :min_points

    # Answers +neighbors_of(point)+ queries.
    attr_accessor :neighborhood

    # Initializes the algorithm with all data that it needs.
    #
    # * points - points to cluster (required)
    # * eps - distance threshold value (required)
    # * min_points - number of neighbors for a point to be considered a
    #   core point (required)
    # * distance - distance measure to use (defaults to CosineDistance)
    # * vector_calculator - calculates the vectors to use for distance
    #   comparison. Defaults to DbscanNumericVectors; alternatively pass a
    #   DbscanTermFrequencyVectors.
    #
    # Raises KeyError when a required attribute is missing.
    def initialize(attrs = {})
      self.points     = attrs.fetch(:points)
      self.min_points = attrs.fetch(:min_points)
      distance          = attrs[:distance] || CosineDistance.new
      vector_calculator = attrs[:vector_calculator] || DbscanNumericVectors.new

      self.neighborhood = DbscanNeighborhood.new( distance: distance,
                                                  eps: attrs.fetch(:eps),
                                                  points: points,
                                                  vector_calculator: vector_calculator )

      # all points start as unclassified
      self.dbscan_clusters = DbscanClusters.new( points )
    end

    # Runs the clustering over all points and returns the result of
    # +dbscan_clusters.to_clusters+.
    def cluster
      cluster_id = dbscan_clusters.get_next_cluster_id
      points.each do |p|
        next unless dbscan_clusters.unclassified?(p)

        # Only ask for a fresh id once the current one has been used.
        cluster_id = dbscan_clusters.get_next_cluster_id if create_cluster(p, cluster_id)
      end

      dbscan_clusters.to_clusters
    end

    # Tries to grow a cluster with id +cluster_id+ starting from +p+.
    #
    # Returns true when a cluster was created, false when +p+ has fewer
    # than +min_points+ neighbors and was assigned to noise instead.
    #
    # The collection returned by +neighborhood.neighbors_of+ is used as the
    # work list and is modified in place (delete/add).
    def create_cluster( p, cluster_id)
      neighbors = neighborhood.neighbors_of(p)

      if neighbors.size < min_points
        # Assign point into "Noise" group.
        # It will have a chance to become a border point later on.
        dbscan_clusters.assign_to_noise(p)
        return false
      end

      # All points are reachable from the core point...
      dbscan_clusters.assign_points(neighbors, cluster_id)

      # Remove point itself.
      neighbors.delete(p)

      # Process the rest of the neighbors; the work list may grow while
      # core points are discovered.
      until neighbors.empty?
        neighbor = neighbors.first
        expand_from(neighbor, neighbors, cluster_id)
        neighbors.delete(neighbor)
      end

      true
    end

    private

    # Expands the cluster through +candidate+ when it is a core point:
    # noise points in its neighborhood join the cluster as border points,
    # unclassified ones join the cluster and are queued in +pending+ for
    # their own core-point check.
    def expand_from(candidate, pending, cluster_id)
      reachable = neighborhood.neighbors_of(candidate)

      # Not enough neighbors: candidate is just a border point.
      return if reachable.size < min_points

      reachable.each do |point|
        if dbscan_clusters.noise?(point)
          # Previously visited and found not to be a core point, so it is
          # a border point; just add it to the cluster.
          dbscan_clusters.assign_point(point, cluster_id)
        elsif dbscan_clusters.unclassified?(point)
          # We don't know yet whether it is a core point: queue it for
          # checking and assign it to the cluster.
          pending.add(point)
          dbscan_clusters.assign_point(point, cluster_id)
        end
      end
    end

  end
end
|