same_same 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. data/.gitignore +18 -0
  2. data/Gemfile +4 -0
  3. data/Gemfile.lock +44 -0
  4. data/LICENSE.txt +22 -0
  5. data/README.md +39 -0
  6. data/Rakefile +1 -0
  7. data/examples/dbscan_digg.rb +25 -0
  8. data/examples/dbscan_lines.rb +35 -0
  9. data/examples/rock_digg.rb +20 -0
  10. data/examples/rock_lines.rb +31 -0
  11. data/lib/same_same.rb +15 -0
  12. data/lib/same_same/cluster.rb +27 -0
  13. data/lib/same_same/cluster_similarity.rb +10 -0
  14. data/lib/same_same/cosine_distance.rb +27 -0
  15. data/lib/same_same/cosine_similarity.rb +22 -0
  16. data/lib/same_same/data_point.rb +12 -0
  17. data/lib/same_same/dbscan_algorithm.rb +135 -0
  18. data/lib/same_same/dbscan_clusters.rb +88 -0
  19. data/lib/same_same/dbscan_neighborhood.rb +68 -0
  20. data/lib/same_same/dbscan_numeric_vectors.rb +7 -0
  21. data/lib/same_same/dbscan_term_frequency_vectors.rb +7 -0
  22. data/lib/same_same/dendrogram.rb +28 -0
  23. data/lib/same_same/dendrogram_printer.rb +74 -0
  24. data/lib/same_same/jaquard_coefficient.rb +9 -0
  25. data/lib/same_same/link_matrix.rb +62 -0
  26. data/lib/same_same/merge_goodness_measure.rb +30 -0
  27. data/lib/same_same/rock_algorithm.rb +51 -0
  28. data/lib/same_same/rock_clusters.rb +68 -0
  29. data/lib/same_same/similarity_matrix.rb +20 -0
  30. data/lib/same_same/symmetrical_matrix.rb +39 -0
  31. data/lib/same_same/term_frequency_builder.rb +20 -0
  32. data/lib/same_same/version.rb +3 -0
  33. data/same_same.gemspec +23 -0
  34. data/spec/fixtures/digg_stories.csv +49 -0
  35. data/spec/fixtures/lines.csv +899 -0
  36. data/spec/same_same/dbscan_algorithm_spec.rb +72 -0
  37. data/spec/same_same/jaquard_coefficient_spec.rb +24 -0
  38. data/spec/same_same/link_matrix_spec.rb +29 -0
  39. data/spec/same_same/merge_goodness_measure_spec.rb +34 -0
  40. data/spec/same_same/rock_algorithm_spec.rb +71 -0
  41. data/spec/same_same/similarity_matrix_spec.rb +20 -0
  42. data/spec/same_same/symmetrical_matrix_spec.rb +69 -0
  43. metadata +144 -0
@@ -0,0 +1,72 @@
1
+ require 'csv'
2
+ require 'same_same'
3
+ require 'same_same/dendrogram_printer'
4
+
5
+ # WARNING: These examples only check that clustering runs without raising an error; they make no assertions about the resulting clusters.
6
+ describe SameSame::RockAlgorithm do
7
+ let(:digg_data) {
8
+ digg_data = CSV.read("spec/fixtures/digg_stories.csv", headers: true)
9
+ i = 0
10
+ digg_data.map {|row|
11
+ i = i + 1
12
+ SameSame::DataPoint.new( "#{i}: #{row["title"]}",
13
+ %w(title category topic description).map {|key|
14
+ row[key]
15
+ }.join(" ").downcase.split(/\s+/).select {|w| w.size > 3}
16
+ )
17
+ }
18
+ }
19
+
20
+ let(:line_data) {
21
+ csv = CSV.read("spec/fixtures/lines.csv", headers: true)
22
+
23
+ # , csv['price']
24
+ groups = csv.group_by {|csv| [csv['categories']].join("-")}
25
+
26
+ groups.map {|key, group|
27
+ [key, group.map {|row|
28
+ SameSame::DataPoint.new( [row["id"], row["name"]].map {|t| t.gsub(/\s+/, ' ')}.join(": "),
29
+ %w(name price).map {|key|
30
+ row[key]
31
+ }.join(" ").downcase.split(/\s+/)
32
+ )
33
+ }]
34
+ }
35
+ }
36
+
37
+ it "works on the digg data" do
38
+ distance = SameSame::CosineDistance.new
39
+ vector_builder = SameSame::DbscanTermFrequencyVectors.new
40
+
41
+ algo = SameSame::DbscanAlgorithm.new(
42
+ points: digg_data,
43
+ eps: 0.7,
44
+ min_points: 2,
45
+ vector_calculator: vector_builder,
46
+ distance: distance)
47
+
48
+ clusters = algo.cluster
49
+
50
+ #SameSame::DendrogramPrinter.new.print_clusters( clusters )
51
+ end
52
+
53
+ it "works on lines" do
54
+ distance = SameSame::CosineDistance.new
55
+ vector_builder = SameSame::DbscanTermFrequencyVectors.new
56
+
57
+ line_data.each do |key, group|
58
+ if group.size > 1
59
+ algo = SameSame::DbscanAlgorithm.new(
60
+ points: group,
61
+ eps: 0.3,
62
+ min_points: 2,
63
+ vector_calculator: vector_builder,
64
+ distance: distance)
65
+
66
+ clusters = algo.cluster
67
+ #expect(clusters.size).to be > 1
68
+ #SameSame::DendrogramPrinter.new.print_clusters( clusters.select {|c| c.name != "Noise"} )
69
+ end
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,24 @@
1
+ require 'same_same'
2
+
3
+ describe SameSame::JaquardCoefficient do
4
+
5
+ describe '#similarity' do
6
+ it "raises ArgumentError if both sets are empty" do
7
+ expect {subject.similarity([],[])}.to raise_error(ArgumentError)
8
+ end
9
+
10
+ it "is 0 if either set is empty" do
11
+ expect(subject.similarity([],["yolo"])).to eq(0.0)
12
+ expect(subject.similarity(["yolo"],[])).to eq(0.0)
13
+ end
14
+
15
+ it "is 1.0 for identical sets" do
16
+ expect( subject.similarity(["yolo"], ["yolo"]) ).to eq(1.0)
17
+ expect( subject.similarity(["yolo", "oloy"], ["yolo", "oloy"]) ).to eq(1.0)
18
+ end
19
+
20
+ it "is 1/3 for 1 common element and 3 total" do
21
+ expect( subject.similarity(["yolo", "polo"], ["yolo", "oloy"]) ).to eq(1.0 / 3.0)
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,29 @@
1
+ require 'same_same'
2
+
3
+ describe SameSame::LinkMatrix do
4
+ let( :datapoints ) {(0..19).to_a}
5
+ let( :similarity_matrix ) {
6
+ SameSame::SymmetricalMatrix.new(datapoints.size).tap do |m|
7
+ datapoints.each do |i|
8
+ m.set(i,i,1.0)
9
+ end
10
+ (0..datapoints.size-1).each do |i|
11
+ (0..i-1).each do |j|
12
+ m.set(i,j,(i+j)/40.0)
13
+ end
14
+ end
15
+ end
16
+ }
17
+ let( :th ) {0.2}
18
+
19
+ subject {SameSame::LinkMatrix.new(similarity_matrix: similarity_matrix, datapoints: datapoints, th: th)}
20
+
21
+ describe "#number_of_links_between_points" do
22
+ it "returns a number" do
23
+ # WARNING: The expected values below have not been verified independently; I don't know if these numbers are right.
24
+ expect( subject.number_of_links_between_points(0,1) ).to eq(12)
25
+ expect( subject.number_of_links_between_points(1,2) ).to eq(13)
26
+ expect( subject.number_of_links_between_points(19,19) ).to eq(20)
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,34 @@
1
+ require 'same_same'
2
+
3
+ describe SameSame::MergeGoodnessMeasure do
4
+
5
+ context "0 proximity (all are neighbours)" do
6
+ subject {SameSame::MergeGoodnessMeasure.new(0.0)}
7
+
8
+ describe '#g' do
9
+ it "should do something I'm not quite sure of" do
10
+ # These 'specs' aren't really specs - I'm not yet in a position
11
+ # to write anything specific here. They just ensure that
12
+ # the methods don't fail.
13
+ expect( subject.g( 10, 10, 10 ) ).to eq(0.0016666666666666668)
14
+ expect( subject.g( 2, 10, 10 ) ).to eq(0.0003333333333333333)
15
+ end
16
+ end
17
+ end
18
+
19
+ context "1.0 proximity (no neighbours)" do
20
+ subject {SameSame::MergeGoodnessMeasure.new(1.0)}
21
+
22
+ describe '#g' do
23
+ it "should always return infinity" do
24
+ # note to self: this makes sense. If no datapoints are
25
+ # considered neighbours the goodness for any cluster will
26
+ # be infinite (where infinite means bad). Why isn't it called
27
+ # a BADNESS measure then?
28
+ expect( subject.g( 10, 10, 10 ) ).to eq(Float::INFINITY)
29
+ expect( subject.g( 2, 10, 10 ) ).to eq(Float::INFINITY)
30
+ end
31
+ end
32
+ end
33
+
34
+ end
@@ -0,0 +1,71 @@
1
+ require 'csv'
2
+ require 'same_same'
3
+ require 'same_same/dendrogram_printer'
4
+
5
+ describe SameSame::RockAlgorithm do
6
+ let(:dp1) {["book"]}
7
+ let(:dp2) {["water", "sun", "sand", "swim"]}
8
+ let(:dp3) {["water", "sun", "swim", "read"]}
9
+ let(:dp4) {["read", "sand"]}
10
+ let(:points) {[dp1, dp2, dp3, dp4].map {|data| SameSame::DataPoint.new(data.join("-"), data)}}
11
+ let(:digg_data) {
12
+ digg_data = CSV.read("spec/fixtures/digg_stories.csv", headers: true)
13
+ digg_data.map {|row|
14
+ SameSame::DataPoint.new( row["title"],
15
+ %w(category topic description).map {|key|
16
+ row[key]
17
+ }.join(" ").downcase.split(/\s+/)
18
+ )
19
+ }
20
+ }
21
+
22
+ let(:line_data) {
23
+ csv = CSV.read("spec/fixtures/lines.csv", headers: true)
24
+
25
+ groups = csv.group_by {|csv| [csv['categories'], csv['price']].join("-")}
26
+
27
+ groups.map {|key, group|
28
+ [key, group.map {|row|
29
+ SameSame::DataPoint.new( [row["id"], row["name"]].map {|t| t.gsub(/\s+/, ' ')}.join(": "),
30
+ %w(name price price price price categories).map {|key|
31
+ row[key]
32
+ }.join(" ").downcase.split(/\s+/) + [row["name"].downcase.gsub(/\s+/, ' ')]
33
+ )
34
+ }]
35
+ }
36
+ }
37
+
38
+ # it "works" do
39
+ # k = 2
40
+ # th = 0.2
41
+ # algo = SameSame::RockAlgorithm.new(datapoints: points, k: k, th: th)
42
+ # dnd = algo.cluster
43
+
44
+ # SameSame::DendrogramPrinter.new.print(dnd)
45
+ # end
46
+
47
+
48
+
49
+ it "works on the digg data" do
50
+ k = 2
51
+ th = 0.2
52
+ algo = SameSame::RockAlgorithm.new(datapoints: digg_data, k: k, th: th)
53
+ dnd = algo.cluster
54
+
55
+ #SameSame::DendrogramPrinter.new.print_last(dnd)
56
+ end
57
+
58
+ it "works on lines" do
59
+ k = 4
60
+ th = 0.4
61
+ line_data.each do |key, group|
62
+ if group.size > 1
63
+ algo = SameSame::RockAlgorithm.new(datapoints: group, k: k, th: th)
64
+ dnd = algo.cluster
65
+ if dnd.non_singelton_leaves?
66
+ #SameSame::DendrogramPrinter.new.print_last(dnd)
67
+ end
68
+ end
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,20 @@
1
+ require 'same_same'
2
+
3
+ describe SameSame::SimilarityMatrix do
4
+ let(:measure) {double("Measure")}
5
+ let(:datapoints) {[1,2,3]}
6
+
7
+ subject {SameSame::SimilarityMatrix.new( measure, datapoints )}
8
+
9
+ describe "#[]" do
10
+ it "returns 1.0 without calculation for self similarity" do
11
+ expect( subject.lookup 1,1 ).to eq(1.0)
12
+ end
13
+
14
+ it "calculates simlarity for different datapoints" do
15
+ measure.should_receive( :similarity ).with( 2, 3 ).and_return 2.0
16
+ expect( subject.lookup 1,2 ).to eq(2.0)
17
+ end
18
+
19
+ end
20
+ end
@@ -0,0 +1,69 @@
1
+ require 'same_same'
2
+
3
+ describe SameSame::SymmetricalMatrix do
4
+
5
+ describe "#lookup" do
6
+
7
+ context "with no lookup" do
8
+ subject {SameSame::SymmetricalMatrix.new(5)}
9
+
10
+ it "is nil for unspecified elements" do
11
+ expect( subject.lookup(1,2) ).to be_nil
12
+ end
13
+
14
+ it "returns a set value" do
15
+ subject.set(1,2,"value")
16
+ expect( subject.lookup(1,2) ).to eq("value")
17
+ end
18
+
19
+ it "returns the inverse of a set value" do
20
+ subject.set(1,2,"value")
21
+ expect( subject.lookup(2,1) ).to eq("value")
22
+ end
23
+
24
+ it "raises argument error for out of bounds parameters" do
25
+ expect { subject.lookup(5,1) }.to raise_error( ArgumentError)
26
+ expect { subject.lookup(1,5) }.to raise_error( ArgumentError)
27
+ expect { subject.lookup(-1,1) }.to raise_error( ArgumentError)
28
+ expect { subject.lookup(1,-1) }.to raise_error( ArgumentError)
29
+ end
30
+
31
+ it "uses blocks to calulate missing values" do
32
+ expect( subject.lookup(1,2) {|x,y| "#{x}-#{y}"} ).to eq("1-2")
33
+ end
34
+
35
+ it "ignores blocks when value set" do
36
+ subject.set(1,2,"value")
37
+ expect( subject.lookup(1,2) {|x,y| "#{x}-#{y}"} ).to eq("value")
38
+ end
39
+
40
+ end
41
+
42
+ context "with a lookup proc" do
43
+ let( :proc ) {double("proc")}
44
+ subject {SameSame::SymmetricalMatrix.new(5, proc)}
45
+
46
+ it "is calculated for unspecified elements" do
47
+ proc.should_receive( :call ).with(1,2).and_return "1-2"
48
+ expect( subject.lookup(1,2) ).to eq("1-2")
49
+ end
50
+
51
+ it "is only calls the proc once" do
52
+ proc.should_receive( :call ).with(1,2).and_return "1-2"
53
+ expect( subject.lookup(1,2) ).to eq("1-2")
54
+ expect( subject.lookup(1,2) ).to eq("1-2")
55
+ end
56
+
57
+ it "is only calls the proc once if the inverse is looked up" do
58
+ proc.should_receive( :call ).with(1,2).and_return "1-2"
59
+ expect( subject.lookup(1,2) ).to eq("1-2")
60
+ expect( subject.lookup(2,1) ).to eq("1-2")
61
+ end
62
+
63
+ it "is prefers set values to the lookup" do
64
+ subject.set(1,2,"value")
65
+ expect( subject.lookup(1,2) ).to eq("value")
66
+ end
67
+ end
68
+ end
69
+ end
metadata ADDED
@@ -0,0 +1,144 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: same_same
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: ruby
7
+ authors:
8
+ - Julian Russell
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-06-24 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ version_requirements: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ! '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ none: false
21
+ name: rspec
22
+ type: :development
23
+ prerelease: false
24
+ requirement: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - ! '>='
27
+ - !ruby/object:Gem::Version
28
+ version: '0'
29
+ none: false
30
+ - !ruby/object:Gem::Dependency
31
+ version_requirements: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - ! '>='
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ none: false
37
+ name: pry-debugger
38
+ type: :development
39
+ prerelease: false
40
+ requirement: !ruby/object:Gem::Requirement
41
+ requirements:
42
+ - - ! '>='
43
+ - !ruby/object:Gem::Version
44
+ version: '0'
45
+ none: false
46
+ - !ruby/object:Gem::Dependency
47
+ version_requirements: !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ! '>='
50
+ - !ruby/object:Gem::Version
51
+ version: '0'
52
+ none: false
53
+ name: colored
54
+ type: :development
55
+ prerelease: false
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ! '>='
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ none: false
62
+ description: Implementation of ROCK and DBSCAN clustering algorithms
63
+ email:
64
+ - julian@myfoodlink.com
65
+ executables: []
66
+ extensions: []
67
+ extra_rdoc_files: []
68
+ files:
69
+ - .gitignore
70
+ - Gemfile
71
+ - Gemfile.lock
72
+ - LICENSE.txt
73
+ - README.md
74
+ - Rakefile
75
+ - examples/dbscan_digg.rb
76
+ - examples/dbscan_lines.rb
77
+ - examples/rock_digg.rb
78
+ - examples/rock_lines.rb
79
+ - lib/same_same.rb
80
+ - lib/same_same/cluster.rb
81
+ - lib/same_same/cluster_similarity.rb
82
+ - lib/same_same/cosine_distance.rb
83
+ - lib/same_same/cosine_similarity.rb
84
+ - lib/same_same/data_point.rb
85
+ - lib/same_same/dbscan_algorithm.rb
86
+ - lib/same_same/dbscan_clusters.rb
87
+ - lib/same_same/dbscan_neighborhood.rb
88
+ - lib/same_same/dbscan_numeric_vectors.rb
89
+ - lib/same_same/dbscan_term_frequency_vectors.rb
90
+ - lib/same_same/dendrogram.rb
91
+ - lib/same_same/dendrogram_printer.rb
92
+ - lib/same_same/jaquard_coefficient.rb
93
+ - lib/same_same/link_matrix.rb
94
+ - lib/same_same/merge_goodness_measure.rb
95
+ - lib/same_same/rock_algorithm.rb
96
+ - lib/same_same/rock_clusters.rb
97
+ - lib/same_same/similarity_matrix.rb
98
+ - lib/same_same/symmetrical_matrix.rb
99
+ - lib/same_same/term_frequency_builder.rb
100
+ - lib/same_same/version.rb
101
+ - same_same.gemspec
102
+ - spec/fixtures/digg_stories.csv
103
+ - spec/fixtures/lines.csv
104
+ - spec/same_same/dbscan_algorithm_spec.rb
105
+ - spec/same_same/jaquard_coefficient_spec.rb
106
+ - spec/same_same/link_matrix_spec.rb
107
+ - spec/same_same/merge_goodness_measure_spec.rb
108
+ - spec/same_same/rock_algorithm_spec.rb
109
+ - spec/same_same/similarity_matrix_spec.rb
110
+ - spec/same_same/symmetrical_matrix_spec.rb
111
+ homepage: https://github.com/plusplus/same_same
112
+ licenses: []
113
+ post_install_message:
114
+ rdoc_options: []
115
+ require_paths:
116
+ - lib
117
+ required_ruby_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - ! '>='
120
+ - !ruby/object:Gem::Version
121
+ version: '0'
122
+ none: false
123
+ required_rubygems_version: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - ! '>='
126
+ - !ruby/object:Gem::Version
127
+ version: '0'
128
+ none: false
129
+ requirements: []
130
+ rubyforge_project:
131
+ rubygems_version: 1.8.23
132
+ signing_key:
133
+ specification_version: 3
134
+ summary: Implementation of ROCK and DBSCAN clustering algorithms
135
+ test_files:
136
+ - spec/fixtures/digg_stories.csv
137
+ - spec/fixtures/lines.csv
138
+ - spec/same_same/dbscan_algorithm_spec.rb
139
+ - spec/same_same/jaquard_coefficient_spec.rb
140
+ - spec/same_same/link_matrix_spec.rb
141
+ - spec/same_same/merge_goodness_measure_spec.rb
142
+ - spec/same_same/rock_algorithm_spec.rb
143
+ - spec/same_same/similarity_matrix_spec.rb
144
+ - spec/same_same/symmetrical_matrix_spec.rb