same_same 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/.gitignore +18 -0
  2. data/Gemfile +4 -0
  3. data/Gemfile.lock +44 -0
  4. data/LICENSE.txt +22 -0
  5. data/README.md +39 -0
  6. data/Rakefile +1 -0
  7. data/examples/dbscan_digg.rb +25 -0
  8. data/examples/dbscan_lines.rb +35 -0
  9. data/examples/rock_digg.rb +20 -0
  10. data/examples/rock_lines.rb +31 -0
  11. data/lib/same_same.rb +15 -0
  12. data/lib/same_same/cluster.rb +27 -0
  13. data/lib/same_same/cluster_similarity.rb +10 -0
  14. data/lib/same_same/cosine_distance.rb +27 -0
  15. data/lib/same_same/cosine_similarity.rb +22 -0
  16. data/lib/same_same/data_point.rb +12 -0
  17. data/lib/same_same/dbscan_algorithm.rb +135 -0
  18. data/lib/same_same/dbscan_clusters.rb +88 -0
  19. data/lib/same_same/dbscan_neighborhood.rb +68 -0
  20. data/lib/same_same/dbscan_numeric_vectors.rb +7 -0
  21. data/lib/same_same/dbscan_term_frequency_vectors.rb +7 -0
  22. data/lib/same_same/dendrogram.rb +28 -0
  23. data/lib/same_same/dendrogram_printer.rb +74 -0
  24. data/lib/same_same/jaquard_coefficient.rb +9 -0
  25. data/lib/same_same/link_matrix.rb +62 -0
  26. data/lib/same_same/merge_goodness_measure.rb +30 -0
  27. data/lib/same_same/rock_algorithm.rb +51 -0
  28. data/lib/same_same/rock_clusters.rb +68 -0
  29. data/lib/same_same/similarity_matrix.rb +20 -0
  30. data/lib/same_same/symmetrical_matrix.rb +39 -0
  31. data/lib/same_same/term_frequency_builder.rb +20 -0
  32. data/lib/same_same/version.rb +3 -0
  33. data/same_same.gemspec +23 -0
  34. data/spec/fixtures/digg_stories.csv +49 -0
  35. data/spec/fixtures/lines.csv +899 -0
  36. data/spec/same_same/dbscan_algorithm_spec.rb +72 -0
  37. data/spec/same_same/jaquard_coefficient_spec.rb +24 -0
  38. data/spec/same_same/link_matrix_spec.rb +29 -0
  39. data/spec/same_same/merge_goodness_measure_spec.rb +34 -0
  40. data/spec/same_same/rock_algorithm_spec.rb +71 -0
  41. data/spec/same_same/similarity_matrix_spec.rb +20 -0
  42. data/spec/same_same/symmetrical_matrix_spec.rb +69 -0
  43. metadata +144 -0
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ coverage
6
+ InstalledFiles
7
+ lib/bundler/man
8
+ pkg
9
+ rdoc
10
+ spec/reports
11
+ test/tmp
12
+ test/version_tmp
13
+ tmp
14
+
15
+ # YARD artifacts
16
+ .yardoc
17
+ _yardoc
18
+ doc/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in same_same.gemspec
4
+ gemspec
@@ -0,0 +1,44 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ same_same (0.0.1)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ coderay (1.0.9)
10
+ colored (1.2)
11
+ columnize (0.3.6)
12
+ debugger (1.6.0)
13
+ columnize (>= 0.3.1)
14
+ debugger-linecache (~> 1.2.0)
15
+ debugger-ruby_core_source (~> 1.2.1)
16
+ debugger-linecache (1.2.0)
17
+ debugger-ruby_core_source (1.2.2)
18
+ diff-lcs (1.2.4)
19
+ method_source (0.8.1)
20
+ pry (0.9.12.2)
21
+ coderay (~> 1.0.5)
22
+ method_source (~> 0.8)
23
+ slop (~> 3.4)
24
+ pry-debugger (0.2.2)
25
+ debugger (~> 1.3)
26
+ pry (~> 0.9.10)
27
+ rspec (2.13.0)
28
+ rspec-core (~> 2.13.0)
29
+ rspec-expectations (~> 2.13.0)
30
+ rspec-mocks (~> 2.13.0)
31
+ rspec-core (2.13.1)
32
+ rspec-expectations (2.13.0)
33
+ diff-lcs (>= 1.1.3, < 2.0)
34
+ rspec-mocks (2.13.1)
35
+ slop (3.4.5)
36
+
37
+ PLATFORMS
38
+ ruby
39
+
40
+ DEPENDENCIES
41
+ colored
42
+ pry-debugger
43
+ rspec
44
+ same_same!
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Julian Russell
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,39 @@
1
+ samesame
2
+ ========
3
+
4
+ Ruby version of clustering algorithms from "Algorithms of the Intelligent Web"
5
+
6
+ ## Status
7
+
8
+ Pretty much a direct port of the sample code from the book (Java).
9
+
10
+ ### Todo
11
+ * **Expand specs**. The basics have specs, but some of the computation specs only check that the code doesn't blow up, NOT that the calculation is right. Lots of higher-level code doesn't have any specs.
12
+ * **Refactor**. Some of the classes and methods are filthy. Things like `Cluster` are just thin wrappers that delegate to arrays.
13
+ * **Push in more data and see what happens**
14
+
15
+ ## Installation
16
+
17
+ Add this line to your application's Gemfile:
18
+
19
+ gem 'same_same'
20
+
21
+ And then execute:
22
+
23
+ $ bundle
24
+
25
+ Or install it yourself as:
26
+
27
+ $ gem install same_same
28
+
29
+ ## Usage
30
+
31
+ TODO: Write usage instructions here
32
+
33
+ ## Contributing
34
+
35
+ 1. Fork it
36
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
37
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
38
+ 4. Push to the branch (`git push origin my-new-feature`)
39
+ 5. Create new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,25 @@
1
require 'same_same'
require 'same_same/dendrogram_printer'
require 'csv'

# Example: cluster the Digg story fixtures with DBSCAN over term-frequency
# vectors and print the resulting clusters.

# Resolve the fixture relative to this script so the example can be run from
# any working directory, not only from inside examples/.
fixture = File.expand_path("../../spec/fixtures/digg_stories.csv", __FILE__)

# Each story becomes a data point identified by its title; its terms are the
# lower-cased, whitespace-separated words of category, topic and description.
digg_rows = CSV.read(fixture, headers: true)
digg_data = digg_rows.map do |row|
  terms = %w(category topic description).map { |key| row[key] }.join(" ").downcase.split(/\s+/)
  SameSame::DataPoint.new(row["title"], terms)
end

distance = SameSame::CosineDistance.new
vector_builder = SameSame::DbscanTermFrequencyVectors.new
algo = SameSame::DbscanAlgorithm.new(
  points: digg_data,
  eps: 0.7,
  min_points: 2,
  vector_calculator: vector_builder,
  distance: distance)

clusters = algo.cluster

SameSame::DendrogramPrinter.new.print_clusters(clusters)
@@ -0,0 +1,35 @@
1
require 'same_same'
require 'same_same/dendrogram_printer'
require 'csv'

# Example: group the "lines" fixture rows by category, run DBSCAN inside each
# group and print every cluster except the "Noise" one.

# Resolve the fixture relative to this script so the example can be run from
# any working directory, not only from inside examples/.
fixture = File.expand_path("../../spec/fixtures/lines.csv", __FILE__)
csv = CSV.read(fixture, headers: true)

# Rows are grouped by category only; add row['price'] to the key array to
# split the groups by price as well.
groups = csv.group_by { |row| [row['categories']].join("-") }

# Build [group_key, data_points] pairs. A point is labelled "id: name" (with
# whitespace runs collapsed) and carries the lower-cased words of name and price.
fragments = groups.map do |group_key, group|
  data_points = group.map do |row|
    label = [row["id"], row["name"]].map { |t| t.gsub(/\s+/, ' ') }.join(": ")
    terms = %w(name price).map { |key| row[key] }.join(" ").downcase.split(/\s+/)
    SameSame::DataPoint.new(label, terms)
  end
  [group_key, data_points]
end

distance = SameSame::CosineDistance.new
vector_builder = SameSame::DbscanTermFrequencyVectors.new

fragments.each do |_key, group|
  # A single point cannot form a cluster, so skip such groups.
  next unless group.size > 1

  algo = SameSame::DbscanAlgorithm.new(
    points: group,
    eps: 0.3,
    min_points: 2,
    vector_calculator: vector_builder,
    distance: distance)

  clusters = algo.cluster
  SameSame::DendrogramPrinter.new.print_clusters(clusters.select { |c| c.name != "Noise" })
end
@@ -0,0 +1,20 @@
1
require 'same_same'
require 'same_same/dendrogram_printer'
require 'csv'

# Example: cluster the Digg story fixtures with the ROCK algorithm and print
# the last level of the resulting dendrogram.

# Resolve the fixture relative to this script so the example can be run from
# any working directory, not only from inside examples/.
fixture = File.expand_path("../../spec/fixtures/digg_stories.csv", __FILE__)

# Each story becomes a data point identified by its title; its terms are the
# lower-cased, whitespace-separated words of category, topic and description.
digg_rows = CSV.read(fixture, headers: true)
digg_data = digg_rows.map do |row|
  terms = %w(category topic description).map { |key| row[key] }.join(" ").downcase.split(/\s+/)
  SameSame::DataPoint.new(row["title"], terms)
end

# Parameters handed to RockAlgorithm as :k and :th.
k = 2
th = 0.2
algo = SameSame::RockAlgorithm.new(datapoints: digg_data, k: k, th: th)
dnd = algo.cluster

SameSame::DendrogramPrinter.new.print_last(dnd)
@@ -0,0 +1,31 @@
1
require 'same_same'
require 'same_same/dendrogram_printer'
require 'csv'

# Example: group the "lines" fixture rows by category and price, run ROCK
# inside each group and print the dendrograms that report non-singleton leaves.

# Resolve the fixture relative to this script so the example can be run from
# any working directory, not only from inside examples/.
fixture = File.expand_path("../../spec/fixtures/lines.csv", __FILE__)
csv = CSV.read(fixture, headers: true)

groups = csv.group_by { |row| [row['categories'], row['price']].join("-") }

# Build [group_key, data_points] pairs. A point is labelled "id: name" (with
# whitespace runs collapsed). Its terms are the lower-cased words of name,
# price and categories plus the whole normalised name as one extra term.
fragments = groups.map do |group_key, group|
  data_points = group.map do |row|
    label = [row["id"], row["name"]].map { |t| t.gsub(/\s+/, ' ') }.join(": ")
    words = %w(name price categories).map { |key| row[key] }.join(" ").downcase.split(/\s+/)
    terms = words + [row["name"].downcase.gsub(/\s+/, ' ')]
    SameSame::DataPoint.new(label, terms)
  end
  [group_key, data_points]
end

# Parameters handed to RockAlgorithm as :k and :th.
k = 4
th = 0.4
fragments.each do |_key, group|
  # A single point cannot form a cluster, so skip such groups.
  next unless group.size > 1

  algo = SameSame::RockAlgorithm.new(datapoints: group, k: k, th: th)
  dnd = algo.cluster
  SameSame::DendrogramPrinter.new.print_last(dnd) if dnd.non_singelton_leaves?
end
30
+
31
+
@@ -0,0 +1,15 @@
1
+ require 'same_same/cluster'
2
+ require 'same_same/cosine_distance'
3
+ require 'same_same/data_point'
4
+ require 'same_same/dbscan_algorithm'
5
+ require 'same_same/dbscan_numeric_vectors'
6
+ require 'same_same/dbscan_term_frequency_vectors'
7
+ require 'same_same/jaquard_coefficient'
8
+ require 'same_same/rock_algorithm'
9
+ require 'same_same/symmetrical_matrix'
10
+ require 'same_same/term_frequency_builder'
11
+ require 'same_same/version'
12
+
13
# Top-level namespace of the gem. Every class in lib/same_same is defined
# inside SameSame, so the namespace is declared here with that spelling.
module SameSame
end

# Backward-compatible alias for the mis-cased constant this file used to define.
Samesame = SameSame unless defined?(Samesame)
@@ -0,0 +1,27 @@
1
module SameSame
  # A (optionally named) group of data points.
  #
  # This is deliberately a thin wrapper: it stores the collection it is given
  # and forwards +size+ and +inject+ straight to it.
  class Cluster
    attr_accessor :datapoints, :name

    # dp   - the collection of data points held by this cluster
    # name - optional label for the cluster (nil when omitted)
    def initialize(dp, name = nil)
      self.datapoints = dp
      self.name = name
    end

    # Merges two clusters into a new Cluster.
    #
    # The result holds this cluster's points followed by +other+'s. Its name
    # is the non-nil names joined with "+", or nil when neither is named.
    def +(other)
      labels = [name, other.name].compact
      merged_name = labels.join("+") unless labels.empty?
      Cluster.new(datapoints + other.datapoints, merged_name)
    end

    # Number of data points in the cluster.
    def size
      datapoints.size
    end

    # Folds over the data points, starting from +i+, using the given block.
    def inject(i, &block)
      datapoints.inject(i, &block)
    end
  end
end
@@ -0,0 +1,10 @@
1
module SameSame

  # Pairs a cluster key with a goodness score.
  #
  # Comparable is mixed in and <=> looks at +goodness+ only, so instances are
  # ordered by their goodness regardless of +cluster_key+.
  class ClusterSimilarity < Struct.new(:cluster_key, :goodness)
    include Comparable

    # Orders two similarities by their goodness values.
    def <=>(other)
      goodness.<=>(other.goodness)
    end
  end

end
@@ -0,0 +1,27 @@
1
+ require 'same_same/cosine_similarity'
2
+
3
module SameSame
  # Distance measure derived from cosine similarity: distance = 1 - similarity.
  class CosineDistance

    # The CosineSimilarity instance used to compute similarities.
    attr_accessor :cosin

    def initialize
      self.cosin = CosineSimilarity.new
    end

    # Computes the cosine distance between two vectors.
    #
    # x, y - vectors in the form accepted by CosineSimilarity#sim
    #
    # Returns 1.0 minus the cosine similarity of x and y.
    # Raises ArgumentError when the similarity is negative, since such a
    # value cannot be turned into a distance here.
    def distance(x, y)
      sim = cosin.sim(x, y)

      if sim < 0.0
        # Interpolation is required: sim is numeric, and String#+ would
        # raise TypeError before the intended error could be built.
        raise ArgumentError,
              "Can't use this value to calculate distance." \
              "x[]=#{x.inspect}, y[]=#{y.inspect}, cosin.sim(x,y)=#{sim}"
      end

      1.0 - sim
    end
  end

end
@@ -0,0 +1,22 @@
1
module SameSame

  # Cosine similarity between numeric vectors, or between data points whose
  # +data+ is first turned into vectors by TermFrequencyBuilder.
  class CosineSimilarity

    # Similarity of two data points: builds vectors from x.data and y.data
    # via TermFrequencyBuilder.build_vectors and compares them with #sim.
    def similarity(x, y)
      vectors = TermFrequencyBuilder.build_vectors(x.data, y.data)
      sim(*vectors)
    end

    # Cosine of the angle between v1 and v2: dot product over product of norms.
    def sim(v1, v2)
      numerator = dot_product(v1, v2)
      denominator = norm(v1) * norm(v2)
      numerator / denominator
    end

    # Sum of the pairwise products of the components of v1 and v2.
    def dot_product(v1, v2)
      products = v1.zip(v2).map { |left, right| left * right }
      products.reduce(:+)
    end

    # Euclidean length of the vector.
    def norm(vector)
      squares = vector.map { |component| component ** 2 }
      Math.sqrt(squares.reduce(:+))
    end
  end

end
@@ -0,0 +1,12 @@
1
module SameSame
  # A single item to cluster: an +id+ plus its +data+.
  # +empty?+ and +size+ are answered by the +data+ collection.
  class DataPoint < Struct.new(:id, :data)
    # True when the point carries no data.
    def empty?
      data.empty?
    end

    # Number of entries in the point's data.
    def size
      data.size
    end
  end
end
@@ -0,0 +1,135 @@
1
+ require 'same_same/dbscan_neighborhood'
2
+ require 'same_same/dbscan_clusters'
3
+
4
module SameSame

  # DBSCAN (density-based) clustering.
  #
  # Parameters:
  #
  # * eps        - neighbor threshold: two points are neighbors when the
  #                distance between them does not exceed this value.
  # * min_points - minimum number of points in any cluster.
  #
  # Good values for both depend on the data.
  #
  # Kinds of point:
  #
  # * Core point   - has at least min_points neighbors and so belongs to the
  #                  core of a cluster.
  # * Border point - neighbors at least one core point but has too few
  #                  neighbors to be a core point itself.
  # * Noise point  - is not close to any core point and so belongs to no
  #                  cluster.
  #
  class DbscanAlgorithm
    # The points being clustered.
    attr_accessor :points

    # Bookkeeping of which set each point is in. Every point starts out in
    # the unclassified set.
    attr_accessor :dbscan_clusters

    # How many points a neighborhood must contain for its point to count as
    # a core point. The best value depends on the data set.
    attr_accessor :min_points

    # Answers "which points are neighbors of this one?".
    attr_accessor :neighborhood

    # Sets up the algorithm from an attributes hash.
    #
    # * points            - points to cluster (required)
    # * eps               - distance threshold value (required)
    # * min_points        - number of neighbors for a point to be considered
    #                       a core point (required)
    # * distance          - distance measure to use (defaults to Cosine)
    # * vector_calculator - calculates the vectors used for distance
    #                       comparison. Defaults to DbscanNumericVectors,
    #                       which compares just the numeric attributes of the
    #                       datapoint. Alternatively use
    #                       DbscanTermFrequencyVectors.
    def initialize(attrs = {})
      self.points     = attrs.fetch(:points)
      self.min_points = attrs.fetch(:min_points)
      measure    = attrs[:distance] || CosineDistance.new
      vectorizer = attrs[:vector_calculator] || DbscanNumericVectors.new

      self.neighborhood = DbscanNeighborhood.new(
        distance: measure,
        eps: attrs.fetch(:eps),
        points: points,
        vector_calculator: vectorizer)

      # all points start as unclassified
      self.dbscan_clusters = DbscanClusters.new(points)
    end

    # Runs the clustering and returns dbscan_clusters.to_clusters.
    def cluster
      current_id = dbscan_clusters.get_next_cluster_id
      points.each do |point|
        next unless dbscan_clusters.unclassified?(point)

        # A fresh id is only drawn once the current one has actually been used.
        current_id = dbscan_clusters.get_next_cluster_id if create_cluster(point, current_id)
      end

      dbscan_clusters.to_clusters
    end

    # Tries to grow a cluster with id +cluster_id+ starting from point +p+.
    #
    # Returns true when a cluster was created, false when +p+ has too few
    # neighbors and was filed as noise instead.
    def create_cluster(p, cluster_id)
      pending = neighborhood.neighbors_of(p)

      if pending.size < min_points
        # Too sparse: file the point under "Noise" for now. It can still be
        # picked up later as a border point of some cluster.
        dbscan_clusters.assign_to_noise(p)
        return false
      end

      # p is a core point, so everything in its neighborhood joins the cluster.
      dbscan_clusters.assign_points(pending, cluster_id)

      # The seed itself needs no further processing.
      pending.delete(p)

      # Work through the remaining neighbors until none are left to examine.
      until pending.empty?
        candidate = pending.first
        reachable = neighborhood.neighbors_of(candidate)

        # A candidate with too few neighbors is just a border point and is
        # not expanded; otherwise it is another core point.
        unless reachable.size < min_points
          reachable.each { |point| claim_reachable(point, pending, cluster_id) }
        end

        pending.delete(candidate)
      end

      true
    end

    private

    # Pulls one point reachable from a core point into the cluster, if it is
    # still up for grabs.
    def claim_reachable(point, pending, cluster_id)
      if dbscan_clusters.noise?(point)
        # Known to have too few neighbors to be a core point, so it is a
        # border point: just add it to the cluster.
        dbscan_clusters.assign_point(point, cluster_id)
      elsif dbscan_clusters.unclassified?(point)
        # Not yet known whether this is a core point: queue it for checking
        # and add it to the cluster.
        pending.add(point)
        dbscan_clusters.assign_point(point, cluster_id)
      end
    end

  end
end