same_same 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/.gitignore +18 -0
  2. data/Gemfile +4 -0
  3. data/Gemfile.lock +44 -0
  4. data/LICENSE.txt +22 -0
  5. data/README.md +39 -0
  6. data/Rakefile +1 -0
  7. data/examples/dbscan_digg.rb +25 -0
  8. data/examples/dbscan_lines.rb +35 -0
  9. data/examples/rock_digg.rb +20 -0
  10. data/examples/rock_lines.rb +31 -0
  11. data/lib/same_same.rb +15 -0
  12. data/lib/same_same/cluster.rb +27 -0
  13. data/lib/same_same/cluster_similarity.rb +10 -0
  14. data/lib/same_same/cosine_distance.rb +27 -0
  15. data/lib/same_same/cosine_similarity.rb +22 -0
  16. data/lib/same_same/data_point.rb +12 -0
  17. data/lib/same_same/dbscan_algorithm.rb +135 -0
  18. data/lib/same_same/dbscan_clusters.rb +88 -0
  19. data/lib/same_same/dbscan_neighborhood.rb +68 -0
  20. data/lib/same_same/dbscan_numeric_vectors.rb +7 -0
  21. data/lib/same_same/dbscan_term_frequency_vectors.rb +7 -0
  22. data/lib/same_same/dendrogram.rb +28 -0
  23. data/lib/same_same/dendrogram_printer.rb +74 -0
  24. data/lib/same_same/jaquard_coefficient.rb +9 -0
  25. data/lib/same_same/link_matrix.rb +62 -0
  26. data/lib/same_same/merge_goodness_measure.rb +30 -0
  27. data/lib/same_same/rock_algorithm.rb +51 -0
  28. data/lib/same_same/rock_clusters.rb +68 -0
  29. data/lib/same_same/similarity_matrix.rb +20 -0
  30. data/lib/same_same/symmetrical_matrix.rb +39 -0
  31. data/lib/same_same/term_frequency_builder.rb +20 -0
  32. data/lib/same_same/version.rb +3 -0
  33. data/same_same.gemspec +23 -0
  34. data/spec/fixtures/digg_stories.csv +49 -0
  35. data/spec/fixtures/lines.csv +899 -0
  36. data/spec/same_same/dbscan_algorithm_spec.rb +72 -0
  37. data/spec/same_same/jaquard_coefficient_spec.rb +24 -0
  38. data/spec/same_same/link_matrix_spec.rb +29 -0
  39. data/spec/same_same/merge_goodness_measure_spec.rb +34 -0
  40. data/spec/same_same/rock_algorithm_spec.rb +71 -0
  41. data/spec/same_same/similarity_matrix_spec.rb +20 -0
  42. data/spec/same_same/symmetrical_matrix_spec.rb +69 -0
  43. metadata +144 -0
@@ -0,0 +1,72 @@
1
+ require 'csv'
2
+ require 'same_same'
3
+ require 'same_same/dendrogram_printer'
4
+
5
+ # WARNING: These specs only check that the algorithm runs without blowing up.
6
+ describe SameSame::RockAlgorithm do
7
+ let(:digg_data) {
8
+ digg_data = CSV.read("spec/fixtures/digg_stories.csv", headers: true)
9
+ i = 0
10
+ digg_data.map {|row|
11
+ i = i + 1
12
+ SameSame::DataPoint.new( "#{i}: #{row["title"]}",
13
+ %w(title category topic description).map {|key|
14
+ row[key]
15
+ }.join(" ").downcase.split(/\s+/).select {|w| w.size > 3}
16
+ )
17
+ }
18
+ }
19
+
20
+ let(:line_data) {
21
+ csv = CSV.read("spec/fixtures/lines.csv", headers: true)
22
+
23
+ # , csv['price']
24
+ groups = csv.group_by {|csv| [csv['categories']].join("-")}
25
+
26
+ groups.map {|key, group|
27
+ [key, group.map {|row|
28
+ SameSame::DataPoint.new( [row["id"], row["name"]].map {|t| t.gsub(/\s+/, ' ')}.join(": "),
29
+ %w(name price).map {|key|
30
+ row[key]
31
+ }.join(" ").downcase.split(/\s+/)
32
+ )
33
+ }]
34
+ }
35
+ }
36
+
37
+ it "works on the digg data" do
38
+ distance = SameSame::CosineDistance.new
39
+ vector_builder = SameSame::DbscanTermFrequencyVectors.new
40
+
41
+ algo = SameSame::DbscanAlgorithm.new(
42
+ points: digg_data,
43
+ eps: 0.7,
44
+ min_points: 2,
45
+ vector_calculator: vector_builder,
46
+ distance: distance)
47
+
48
+ clusters = algo.cluster
49
+
50
+ #SameSame::DendrogramPrinter.new.print_clusters( clusters )
51
+ end
52
+
53
+ it "works on lines" do
54
+ distance = SameSame::CosineDistance.new
55
+ vector_builder = SameSame::DbscanTermFrequencyVectors.new
56
+
57
+ line_data.each do |key, group|
58
+ if group.size > 1
59
+ algo = SameSame::DbscanAlgorithm.new(
60
+ points: group,
61
+ eps: 0.3,
62
+ min_points: 2,
63
+ vector_calculator: vector_builder,
64
+ distance: distance)
65
+
66
+ clusters = algo.cluster
67
+ #expect(clusters.size).to be > 1
68
+ #SameSame::DendrogramPrinter.new.print_clusters( clusters.select {|c| c.name != "Noise"} )
69
+ end
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,24 @@
1
+ require 'same_same'
2
+
3
+ describe SameSame::JaquardCoefficient do
4
+
5
+ describe '#similarity' do
6
+ it "raises ArgumentError if both sets are empty" do
7
+ expect {subject.similarity([],[])}.to raise_error(ArgumentError)
8
+ end
9
+
10
+ it "is 0 if either set is empty" do
11
+ expect(subject.similarity([],["yolo"])).to eq(0.0)
12
+ expect(subject.similarity(["yolo"],[])).to eq(0.0)
13
+ end
14
+
15
+ it "is 1.0 for identical sets" do
16
+ expect( subject.similarity(["yolo"], ["yolo"]) ).to eq(1.0)
17
+ expect( subject.similarity(["yolo", "oloy"], ["yolo", "oloy"]) ).to eq(1.0)
18
+ end
19
+
20
+ it "is 1/3 for 1 common element and 3 total" do
21
+ expect( subject.similarity(["yolo", "polo"], ["yolo", "oloy"]) ).to eq(1.0 / 3.0)
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,29 @@
1
+ require 'same_same'
2
+
3
+ describe SameSame::LinkMatrix do
4
+ let( :datapoints ) {(0..19).to_a}
5
+ let( :similarity_matrix ) {
6
+ SameSame::SymmetricalMatrix.new(datapoints.size).tap do |m|
7
+ datapoints.each do |i|
8
+ m.set(i,i,1.0)
9
+ end
10
+ (0..datapoints.size-1).each do |i|
11
+ (0..i-1).each do |j|
12
+ m.set(i,j,(i+j)/40.0)
13
+ end
14
+ end
15
+ end
16
+ }
17
+ let( :th ) {0.2}
18
+
19
+ subject {SameSame::LinkMatrix.new(similarity_matrix: similarity_matrix, datapoints: datapoints, th: th)}
20
+
21
+ describe "#number_of_links_between_points" do
22
+ it "returns a number" do
23
+ # WARNING: I don't know if these numbers are right!!!!
24
+ expect( subject.number_of_links_between_points(0,1) ).to eq(12)
25
+ expect( subject.number_of_links_between_points(1,2) ).to eq(13)
26
+ expect( subject.number_of_links_between_points(19,19) ).to eq(20)
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,34 @@
1
+ require 'same_same'
2
+
3
+ describe SameSame::MergeGoodnessMeasure do
4
+
5
+ context "0 proximity (all are neighbours)" do
6
+ subject {SameSame::MergeGoodnessMeasure.new(0.0)}
7
+
8
+ describe '#g' do
9
+ it "should do something I'm not quite sure of" do
10
+ # these 'specs' aren't specs - I'm not quite in a position
11
+ # to write anything specific here. They just ensure that
12
+ # the methods don't fail
13
+ expect( subject.g( 10, 10, 10 ) ).to eq(0.0016666666666666668)
14
+ expect( subject.g( 2, 10, 10 ) ).to eq(0.0003333333333333333)
15
+ end
16
+ end
17
+ end
18
+
19
+ context "1.0 proximity (no neighbours)" do
20
+ subject {SameSame::MergeGoodnessMeasure.new(1.0)}
21
+
22
+ describe '#g' do
23
+ it "should always return infinity" do
24
+ # note to self: this makes sense. If no datapoints are
25
+ # considered neighbours the goodness for any cluster will
26
+ # be infinite (where infinite means bad). Why isn't it called
27
+ # a BADNESS measure then?
28
+ expect( subject.g( 10, 10, 10 ) ).to eq(Float::INFINITY)
29
+ expect( subject.g( 2, 10, 10 ) ).to eq(Float::INFINITY)
30
+ end
31
+ end
32
+ end
33
+
34
+ end
@@ -0,0 +1,71 @@
1
+ require 'csv'
2
+ require 'same_same'
3
+ require 'same_same/dendrogram_printer'
4
+
5
+ describe SameSame::RockAlgorithm do
6
+ let(:dp1) {["book"]}
7
+ let(:dp2) {["water", "sun", "sand", "swim"]}
8
+ let(:dp3) {["water", "sun", "swim", "read"]}
9
+ let(:dp4) {["read", "sand"]}
10
+ let(:points) {[dp1, dp2, dp3, dp4].map {|data| SameSame::DataPoint.new(data.join("-"), data)}}
11
+ let(:digg_data) {
12
+ digg_data = CSV.read("spec/fixtures/digg_stories.csv", headers: true)
13
+ digg_data.map {|row|
14
+ SameSame::DataPoint.new( row["title"],
15
+ %w(category topic description).map {|key|
16
+ row[key]
17
+ }.join(" ").downcase.split(/\s+/)
18
+ )
19
+ }
20
+ }
21
+
22
+ let(:line_data) {
23
+ csv = CSV.read("spec/fixtures/lines.csv", headers: true)
24
+
25
+ groups = csv.group_by {|csv| [csv['categories'], csv['price']].join("-")}
26
+
27
+ groups.map {|key, group|
28
+ [key, group.map {|row|
29
+ SameSame::DataPoint.new( [row["id"], row["name"]].map {|t| t.gsub(/\s+/, ' ')}.join(": "),
30
+ %w(name price price price price categories).map {|key|
31
+ row[key]
32
+ }.join(" ").downcase.split(/\s+/) + [row["name"].downcase.gsub(/\s+/, ' ')]
33
+ )
34
+ }]
35
+ }
36
+ }
37
+
38
+ # it "works" do
39
+ # k = 2
40
+ # th = 0.2
41
+ # algo = SameSame::RockAlgorithm.new(datapoints: points, k: k, th: th)
42
+ # dnd = algo.cluster
43
+
44
+ # SameSame::DendrogramPrinter.new.print(dnd)
45
+ # end
46
+
47
+
48
+
49
+ it "works on the digg data" do
50
+ k = 2
51
+ th = 0.2
52
+ algo = SameSame::RockAlgorithm.new(datapoints: digg_data, k: k, th: th)
53
+ dnd = algo.cluster
54
+
55
+ #SameSame::DendrogramPrinter.new.print_last(dnd)
56
+ end
57
+
58
+ it "works on lines" do
59
+ k = 4
60
+ th = 0.4
61
+ line_data.each do |key, group|
62
+ if group.size > 1
63
+ algo = SameSame::RockAlgorithm.new(datapoints: group, k: k, th: th)
64
+ dnd = algo.cluster
65
+ if dnd.non_singelton_leaves?
66
+ #SameSame::DendrogramPrinter.new.print_last(dnd)
67
+ end
68
+ end
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,20 @@
1
+ require 'same_same'
2
+
3
+ describe SameSame::SimilarityMatrix do
4
+ let(:measure) {double("Measure")}
5
+ let(:datapoints) {[1,2,3]}
6
+
7
+ subject {SameSame::SimilarityMatrix.new( measure, datapoints )}
8
+
9
+ describe "#[]" do
10
+ it "returns 1.0 without calculation for self similarity" do
11
+ expect( subject.lookup 1,1 ).to eq(1.0)
12
+ end
13
+
14
+ it "calculates simlarity for different datapoints" do
15
+ measure.should_receive( :similarity ).with( 2, 3 ).and_return 2.0
16
+ expect( subject.lookup 1,2 ).to eq(2.0)
17
+ end
18
+
19
+ end
20
+ end
@@ -0,0 +1,69 @@
1
+ require 'same_same'
2
+
3
+ describe SameSame::SymmetricalMatrix do
4
+
5
+ describe "#lookup" do
6
+
7
+ context "with no lookup" do
8
+ subject {SameSame::SymmetricalMatrix.new(5)}
9
+
10
+ it "is nil for unspecified elements" do
11
+ expect( subject.lookup(1,2) ).to be_nil
12
+ end
13
+
14
+ it "returns a set value" do
15
+ subject.set(1,2,"value")
16
+ expect( subject.lookup(1,2) ).to eq("value")
17
+ end
18
+
19
+ it "returns the inverse of a set value" do
20
+ subject.set(1,2,"value")
21
+ expect( subject.lookup(2,1) ).to eq("value")
22
+ end
23
+
24
+ it "raises argument error for out of bounds parameters" do
25
+ expect { subject.lookup(5,1) }.to raise_error( ArgumentError)
26
+ expect { subject.lookup(1,5) }.to raise_error( ArgumentError)
27
+ expect { subject.lookup(-1,1) }.to raise_error( ArgumentError)
28
+ expect { subject.lookup(1,-1) }.to raise_error( ArgumentError)
29
+ end
30
+
31
+ it "uses blocks to calulate missing values" do
32
+ expect( subject.lookup(1,2) {|x,y| "#{x}-#{y}"} ).to eq("1-2")
33
+ end
34
+
35
+ it "ignores blocks when value set" do
36
+ subject.set(1,2,"value")
37
+ expect( subject.lookup(1,2) {|x,y| "#{x}-#{y}"} ).to eq("value")
38
+ end
39
+
40
+ end
41
+
42
+ context "with a lookup proc" do
43
+ let( :proc ) {double("proc")}
44
+ subject {SameSame::SymmetricalMatrix.new(5, proc)}
45
+
46
+ it "is calculated for unspecified elements" do
47
+ proc.should_receive( :call ).with(1,2).and_return "1-2"
48
+ expect( subject.lookup(1,2) ).to eq("1-2")
49
+ end
50
+
51
+ it "is only calls the proc once" do
52
+ proc.should_receive( :call ).with(1,2).and_return "1-2"
53
+ expect( subject.lookup(1,2) ).to eq("1-2")
54
+ expect( subject.lookup(1,2) ).to eq("1-2")
55
+ end
56
+
57
+ it "is only calls the proc once if the inverse is looked up" do
58
+ proc.should_receive( :call ).with(1,2).and_return "1-2"
59
+ expect( subject.lookup(1,2) ).to eq("1-2")
60
+ expect( subject.lookup(2,1) ).to eq("1-2")
61
+ end
62
+
63
+ it "is prefers set values to the lookup" do
64
+ subject.set(1,2,"value")
65
+ expect( subject.lookup(1,2) ).to eq("value")
66
+ end
67
+ end
68
+ end
69
+ end
metadata ADDED
@@ -0,0 +1,144 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: same_same
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: ruby
7
+ authors:
8
+ - Julian Russell
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-06-24 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ version_requirements: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ! '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ none: false
21
+ name: rspec
22
+ type: :development
23
+ prerelease: false
24
+ requirement: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - ! '>='
27
+ - !ruby/object:Gem::Version
28
+ version: '0'
29
+ none: false
30
+ - !ruby/object:Gem::Dependency
31
+ version_requirements: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - ! '>='
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ none: false
37
+ name: pry-debugger
38
+ type: :development
39
+ prerelease: false
40
+ requirement: !ruby/object:Gem::Requirement
41
+ requirements:
42
+ - - ! '>='
43
+ - !ruby/object:Gem::Version
44
+ version: '0'
45
+ none: false
46
+ - !ruby/object:Gem::Dependency
47
+ version_requirements: !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ! '>='
50
+ - !ruby/object:Gem::Version
51
+ version: '0'
52
+ none: false
53
+ name: colored
54
+ type: :development
55
+ prerelease: false
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ! '>='
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ none: false
62
+ description: Implementation of ROCK and DBSCAN clustering algorithms
63
+ email:
64
+ - julian@myfoodlink.com
65
+ executables: []
66
+ extensions: []
67
+ extra_rdoc_files: []
68
+ files:
69
+ - .gitignore
70
+ - Gemfile
71
+ - Gemfile.lock
72
+ - LICENSE.txt
73
+ - README.md
74
+ - Rakefile
75
+ - examples/dbscan_digg.rb
76
+ - examples/dbscan_lines.rb
77
+ - examples/rock_digg.rb
78
+ - examples/rock_lines.rb
79
+ - lib/same_same.rb
80
+ - lib/same_same/cluster.rb
81
+ - lib/same_same/cluster_similarity.rb
82
+ - lib/same_same/cosine_distance.rb
83
+ - lib/same_same/cosine_similarity.rb
84
+ - lib/same_same/data_point.rb
85
+ - lib/same_same/dbscan_algorithm.rb
86
+ - lib/same_same/dbscan_clusters.rb
87
+ - lib/same_same/dbscan_neighborhood.rb
88
+ - lib/same_same/dbscan_numeric_vectors.rb
89
+ - lib/same_same/dbscan_term_frequency_vectors.rb
90
+ - lib/same_same/dendrogram.rb
91
+ - lib/same_same/dendrogram_printer.rb
92
+ - lib/same_same/jaquard_coefficient.rb
93
+ - lib/same_same/link_matrix.rb
94
+ - lib/same_same/merge_goodness_measure.rb
95
+ - lib/same_same/rock_algorithm.rb
96
+ - lib/same_same/rock_clusters.rb
97
+ - lib/same_same/similarity_matrix.rb
98
+ - lib/same_same/symmetrical_matrix.rb
99
+ - lib/same_same/term_frequency_builder.rb
100
+ - lib/same_same/version.rb
101
+ - same_same.gemspec
102
+ - spec/fixtures/digg_stories.csv
103
+ - spec/fixtures/lines.csv
104
+ - spec/same_same/dbscan_algorithm_spec.rb
105
+ - spec/same_same/jaquard_coefficient_spec.rb
106
+ - spec/same_same/link_matrix_spec.rb
107
+ - spec/same_same/merge_goodness_measure_spec.rb
108
+ - spec/same_same/rock_algorithm_spec.rb
109
+ - spec/same_same/similarity_matrix_spec.rb
110
+ - spec/same_same/symmetrical_matrix_spec.rb
111
+ homepage: https://github.com/plusplus/same_same
112
+ licenses: []
113
+ post_install_message:
114
+ rdoc_options: []
115
+ require_paths:
116
+ - lib
117
+ required_ruby_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - ! '>='
120
+ - !ruby/object:Gem::Version
121
+ version: '0'
122
+ none: false
123
+ required_rubygems_version: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - ! '>='
126
+ - !ruby/object:Gem::Version
127
+ version: '0'
128
+ none: false
129
+ requirements: []
130
+ rubyforge_project:
131
+ rubygems_version: 1.8.23
132
+ signing_key:
133
+ specification_version: 3
134
+ summary: Implementation of ROCK and DBSCAN clustering algorithms
135
+ test_files:
136
+ - spec/fixtures/digg_stories.csv
137
+ - spec/fixtures/lines.csv
138
+ - spec/same_same/dbscan_algorithm_spec.rb
139
+ - spec/same_same/jaquard_coefficient_spec.rb
140
+ - spec/same_same/link_matrix_spec.rb
141
+ - spec/same_same/merge_goodness_measure_spec.rb
142
+ - spec/same_same/rock_algorithm_spec.rb
143
+ - spec/same_same/similarity_matrix_spec.rb
144
+ - spec/same_same/symmetrical_matrix_spec.rb