tsuga 0.0.1

Files changed (58)
  1. checksums.yaml +7 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +3 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +12 -0
  6. data/Gemfile +16 -0
  7. data/Gemfile.lock +146 -0
  8. data/Guardfile +8 -0
  9. data/LICENSE.txt +22 -0
  10. data/README.md +161 -0
  11. data/Rakefile +1 -0
  12. data/lib/tsuga.rb +11 -0
  13. data/lib/tsuga/adapter.rb +4 -0
  14. data/lib/tsuga/adapter/active_record/base.rb +61 -0
  15. data/lib/tsuga/adapter/active_record/cluster.rb +52 -0
  16. data/lib/tsuga/adapter/active_record/migration.rb +50 -0
  17. data/lib/tsuga/adapter/active_record/record.rb +15 -0
  18. data/lib/tsuga/adapter/active_record/test.rb +73 -0
  19. data/lib/tsuga/adapter/memory/base.rb +146 -0
  20. data/lib/tsuga/adapter/memory/cluster.rb +32 -0
  21. data/lib/tsuga/adapter/memory/test.rb +27 -0
  22. data/lib/tsuga/adapter/mongoid/base.rb +41 -0
  23. data/lib/tsuga/adapter/mongoid/cluster.rb +29 -0
  24. data/lib/tsuga/adapter/mongoid/record.rb +16 -0
  25. data/lib/tsuga/adapter/mongoid/test.rb +77 -0
  26. data/lib/tsuga/adapter/sequel/base.rb +57 -0
  27. data/lib/tsuga/adapter/sequel/cluster.rb +43 -0
  28. data/lib/tsuga/adapter/sequel/record.rb +15 -0
  29. data/lib/tsuga/adapter/sequel/test.rb +73 -0
  30. data/lib/tsuga/adapter/shared.rb +4 -0
  31. data/lib/tsuga/adapter/shared/cluster.rb +19 -0
  32. data/lib/tsuga/errors.rb +3 -0
  33. data/lib/tsuga/model/cluster.rb +147 -0
  34. data/lib/tsuga/model/point.rb +206 -0
  35. data/lib/tsuga/model/record.rb +20 -0
  36. data/lib/tsuga/model/tile.rb +136 -0
  37. data/lib/tsuga/service/aggregator.rb +175 -0
  38. data/lib/tsuga/service/clusterer.rb +260 -0
  39. data/lib/tsuga/service/labeler.rb +20 -0
  40. data/lib/tsuga/version.rb +3 -0
  41. data/script/benchmark-aggregator.rb +72 -0
  42. data/script/benchmark-clusterer.rb +102 -0
  43. data/spec/adapter/memory/base_spec.rb +174 -0
  44. data/spec/adapter/memory/cluster_spec.rb +39 -0
  45. data/spec/adapter/shared/cluster_spec.rb +56 -0
  46. data/spec/integration/active_record_spec.rb +10 -0
  47. data/spec/integration/memory_spec.rb +10 -0
  48. data/spec/integration/mongoid_spec.rb +10 -0
  49. data/spec/integration/sequel_spec.rb +10 -0
  50. data/spec/integration/shared.rb +50 -0
  51. data/spec/model/point_spec.rb +102 -0
  52. data/spec/model/tile_spec.rb +116 -0
  53. data/spec/service/aggregator_spec.rb +143 -0
  54. data/spec/service/clusterer_spec.rb +84 -0
  55. data/spec/spec_helper.rb +26 -0
  56. data/spec/support/mongoid.yml +17 -0
  57. data/tsuga.gemspec +29 -0
  58. metadata +226 -0
data/lib/tsuga/service/clusterer.rb
@@ -0,0 +1,260 @@
+ require 'tsuga/model/tile'
+ require 'tsuga/service/aggregator'
+
+ require 'ruby-progressbar'
+
+ module Tsuga::Service
+   class Clusterer
+     PROXIMITY_RATIO = 0.15
+     RUN_SANITY_CHECK = false
+     VERBOSE = ENV['VERBOSE']
+     Tile = Tsuga::Model::Tile
+
+     attr_reader :_adapter, :_source, :_queue
+
+     def initialize(source: nil, adapter: nil)
+       @_source = source
+       @_adapter = adapter
+       @_queue = WriteQueue.new(adapter: adapter)
+     end
+
+     def run
+       # delete all clusters
+       _adapter.delete_all
+
+       # create lowest-level clusters
+       _source.find_each do |record|
+         _queue.push _adapter.build_from(Tsuga::MAX_DEPTH, record)
+       end
+       _queue.flush
+
+       # for all depths from MAX_DEPTH-1 down to MIN_DEPTH
+       (Tsuga::MAX_DEPTH-1).downto(Tsuga::MIN_DEPTH) do |depth|
+         progress.log "depth #{depth}" if VERBOSE
+         progress.title = "#{depth}.0" if VERBOSE
+
+         # create clusters at this level from children
+         # TODO: use a save queue, only run saves if > 100 clusters to write
+         cluster_ids = Set.new
+         _adapter.at_depth(depth+1).find_each do |child|
+           _queue.push _adapter.build_from(depth, child)
+         end
+         _queue.flush
+         cluster_ids = MutableSet.new(_adapter.at_depth(depth).collect_ids)
+
+         if cluster_ids.empty?
+           progress.log "nothing to cluster" if VERBOSE
+           break
+         end
+
+         # TODO: group points to cluster by tile, and run on tiles in parallel.
+
+         progress.title = "#{depth}.1" if VERBOSE
+         progress.log "started with #{cluster_ids.length} clusters" if VERBOSE
+         progress.set_phase(depth, 1, cluster_ids.length) if VERBOSE
+         while cluster_ids.any?
+           progress.set_progress(cluster_ids.length) if VERBOSE
+
+           cluster = _adapter.find_by_id(cluster_ids.first)
+           raise 'internal error: cluster was already removed' if cluster.nil?
+           tile = Tile.including(cluster, depth: depth)
+
+           clusters = _adapter.in_tile(*tile.neighbours).to_a
+           processed_cluster_ids = clusters.collect(&:id)
+
+           # clusters we aggregate in this loop iteration
+           # they are _not_ the same as what we pass to the aggregator,
+           # just those inside the fence
+           fenced_cluster_ids = _adapter.in_tile(tile).collect_ids
+           raise RuntimeError, 'no cluster in fence' if fenced_cluster_ids.empty?
+
+           Aggregator.new(clusters: clusters, ratio: PROXIMITY_RATIO, fence: tile).tap do |aggregator|
+             aggregator.run
+
+             if VERBOSE
+               progress.log("aggregator: %4d left, %2d processed, %2d in fence, %2d updated, %2d dropped" % [
+                 cluster_ids.length,
+                 processed_cluster_ids.length,
+                 fenced_cluster_ids.length,
+                 aggregator.updated_clusters.length,
+                 aggregator.dropped_clusters.length])
+               if aggregator.updated_clusters.any?
+                 progress.log("updated: #{aggregator.updated_clusters.collect(&:id).join(', ')}")
+               end
+               if aggregator.dropped_clusters.any?
+                 progress.log("dropped: #{aggregator.dropped_clusters.collect(&:id).join(', ')}")
+               end
+             end
+
+             cluster_ids.remove! fenced_cluster_ids
+             # updated clusters may need to be reprocessed (they might have fallen close enough to tile edges)
+             # TODO: as a further optimisation, do not mark for reprocessing clusters that are still inside the fence
+             cluster_ids.merge! aggregator.updated_clusters.collect(&:id)
+             # destroyed clusters may include some on the outer fringe of the fence tile
+             cluster_ids.remove! aggregator.dropped_clusters.collect(&:id)
+
+             aggregator.dropped_clusters.each(&:destroy)
+             _adapter.mass_update(aggregator.updated_clusters)
+           end
+
+           if RUN_SANITY_CHECK
+             # sanity check: all <cluster_ids> should exist
+             not_removed = cluster_ids - _adapter.at_depth(depth).collect_ids
+             if not_removed.any?
+               raise "cluster_ids contains IDs of deleted clusters: #{not_removed.to_a.join(', ')}"
+             end
+
+             # sanity check: sum of weights should match that of the lower level
+             deeper_weight = _adapter.at_depth(depth+1).sum(:weight)
+             this_weight = _adapter.at_depth(depth).sum(:weight)
+             if deeper_weight != this_weight
+               raise "mismatch between weight at this depth (#{this_weight}) and deeper level (#{deeper_weight})"
+             end
+           end
+         end
+
+         # set parent_id in the whole tree
+         # this is made slightly more complicated by #find_each's scoping
+         progress.title = "#{depth}.2" if VERBOSE
+         child_mappings = {}
+         _adapter.at_depth(depth).find_each do |cluster|
+           cluster.children_ids.each do |child_id|
+             child_mappings[child_id] = cluster.id
+           end
+         end
+         child_mappings.each_pair do |child_id, parent_id|
+           cluster = _adapter.find_by_id(child_id)
+           cluster.parent_id = parent_id
+           _queue.push cluster
+         end
+         _queue.flush
+       end
+       progress.finish if VERBOSE
+     end
+
+     private
+
+     def progress
+       @_progressbar ||= ProgressBar.create.extend(SteppedProgressBar)
+     end
+
+     module SteppedProgressBar
+       def set_phase(depth, phase, count)
+         _compute_totals
+         @current_phase = phase
+         @current_depth = depth
+         @current_count = count
+       end
+
+       def set_progress(count)
+         key = [@current_depth, @current_phase]
+         self.progress = @phase_total[key] -
+                         @phase_subtotal[key] * count / @current_count
+       rescue Exception => e
+         require 'pry' ; require 'pry-nav' ; binding.pry
+       end
+
+       private
+
+       MAX = Tsuga::MAX_DEPTH-1
+       MIN = Tsuga::MIN_DEPTH
+       FACTOR = 0.5
+
+       def _compute_totals
+         return if @phase_total
+         sum = 0
+         @phase_total = {}
+         @phase_subtotal = {}
+         MAX.downto(MIN) do |depth|
+           depth_weight = FACTOR ** (MAX-depth)
+           [1,1,1].each_with_index do |phase_weight, phase_index|
+             phase_subtotal = depth_weight * phase_weight
+             sum += phase_subtotal
+             @phase_total[[depth,phase_index]] = sum
+             @phase_subtotal[[depth,phase_index]] = phase_subtotal
+           end
+         end
+         self.total = sum
+       end
+     end
+
+     # A Set-like structure that supports in-place merging and removal of another enumerable.
+     class MutableSet
+       include Enumerable
+       extend Forwardable
+
+       def initialize(enum = nil)
+         @_data = {}
+         merge!(enum) if enum
+       end
+
+       def -(enum)
+         self.class.new.tap do |result|
+           result.instance_variable_set(:@_data, @_data.dup)
+           result.remove!(enum)
+         end
+       end
+
+       def each
+         @_data.each_key { |k| yield k }
+       end
+
+       def merge!(enum)
+         enum.each { |key| @_data[key] = true }
+       end
+
+       def remove!(enum)
+         enum.each { |key| @_data.delete(key) }
+       end
+
+       def_delegators :@_data, :size, :length, :empty?
+     end
+
+
+     # TODO: extract to a separate file
+     class WriteQueue
+       QUEUE_SIZE = 250
+
+       def initialize(adapter: nil)
+         @_adapter = adapter
+         @_queue = []
+       end
+
+       def push(value)
+         @_queue.push(value)
+         flush if @_queue.size > QUEUE_SIZE
+         nil
+       end
+
+       def flush
+         # separate inserts from updates
+         inserts = _queue.map { |c| c.new_record? ? c : nil }.compact
+         updates = _queue.map { |c| c.new_record? ? nil : c }.compact
+
+         _adapter.mass_create(inserts) if inserts.any?
+         _adapter.mass_update(updates) if updates.any?
+         _queue.clear
+       end
+
+       private
+
+       attr_reader :_queue, :_adapter
+     end
+
+
+     # returns the record IDs used and the clusters built from them
+     def _build_clusters(tile)
+       used_ids = []
+       clusters = []
+
+       _adapter.in_tile(*tile.children).find_each do |child|
+         cluster = _adapter.build_from(tile.depth, child)
+         clusters << cluster
+         used_ids << child.id
+       end
+
+       return [used_ids, clusters]
+     end
+
+
+   end
+ end
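
For orientation, here is a minimal usage sketch of the class above, mirroring the call made in data/script/benchmark-clusterer.rb further down: the clusterer takes a record scope (source) and a cluster adapter (adapter), and rebuilds the cluster tree from Tsuga::MAX_DEPTH up to Tsuga::MIN_DEPTH. The in-memory test adapter and the sample coordinate are used here purely for illustration.

  require 'tsuga/adapter/memory/test'
  require 'tsuga/service/clusterer'

  adapter  = Tsuga::Adapter::Memory::Test
  records  = adapter.records    # source records carrying lat/lng
  clusters = adapter.clusters   # storage for the generated clusters

  # seed at least one record, then build clusters at every depth
  records.new(lat: 41.38, lng: 2.17).persist!
  Tsuga::Service::Clusterer.new(source: records, adapter: clusters).run
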
data/lib/tsuga/service/labeler.rb
@@ -0,0 +1,20 @@
+ module Tsuga::Service
+   # Adds geohashes to records.
+   class Labeler
+     def initialize(adapter)
+       @_adapter = adapter
+     end
+
+     def run
+       _adapter.records.find_each do |record|
+         record.update_geohash
+         record.persist!
+       end
+     end
+
+     private
+
+     attr_reader :_adapter
+
+   end
+ end
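
A minimal invocation sketch, assuming an adapter whose records scope supports find_each and whose records respond to update_geohash and persist!, which is exactly what the class above relies on; the adapter variable stands in for one of the bundled adapters.

  require 'tsuga/service/labeler'

  Tsuga::Service::Labeler.new(adapter).run
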
data/lib/tsuga/version.rb
@@ -0,0 +1,3 @@
+ module Tsuga
+   VERSION = "0.0.1"
+ end
data/script/benchmark-aggregator.rb
@@ -0,0 +1,72 @@
+ #!/usr/bin/env ruby
+ require 'bundler/setup'
+ require 'perftools'
+ require 'benchmark'
+ require 'tsuga/adapter/memory/test'
+ require 'tsuga/adapter/sequel/test'
+ require 'tsuga/adapter/mongoid/test'
+ require 'tsuga/service/aggregator'
+ require 'pry'
+ require 'pry-nav'
+
+ COUNT = ENV.fetch('COUNT', '100').to_i
+ ENV['CPUPROFILE_FREQUENCY'] ||= '500'
+
+ case ENV['ADAPTER']
+ when /memory/i
+   Cluster = Tsuga::Adapter::Memory::Test.clusters
+ when /mysql/i
+   DB = Sequel.connect 'mysql2://root@localhost/tsuga'
+   Cluster = Tsuga::Adapter::Sequel::Test.clusters
+ when /mongo/i
+   Cluster = Tsuga::Adapter::Mongoid::Test.clusters
+ else
+   puts 'specify an ADAPTER'
+   exit 1
+ end
+
+ RAW_PROFILE = "tmp/profile_#{ENV['ADAPTER']}"
+ PDF_PROFILE = "#{RAW_PROFILE}.pdf"
+
+ def new_cluster(depth, lat, lng)
+   Cluster.new.tap do |cluster|
+     cluster.depth = depth
+     cluster.lat = lat
+     cluster.lng = lng
+     cluster.weight = 1
+     cluster.sum_lat = lat
+     cluster.sum_lng = lng
+     cluster.children_ids = []
+     # cluster.persist!
+   end
+ end
+
+
+ PerfTools::CpuProfiler.start(RAW_PROFILE) do
+   begin
+     10.times do |idx|
+       Cluster.delete_all
+       lat_max = 45 - 1e-4
+       lng_max = 90 - 1e-4
+       clusters = (1..COUNT).map { new_cluster(2, rand*lat_max, rand*lng_max) }
+
+       runtime = Benchmark.measure do
+         Tsuga::Service::Aggregator.new(clusters).run
+       end
+       puts "run #{idx}: #{runtime}"
+     end
+   rescue Exception => e
+     puts "caught #{e.class.name} (#{e.message})"
+     if ENV['DEBUG']
+       binding.pry
+     else
+       puts "set DEBUG next time to inspect"
+     end
+     $failure = true
+   end
+ end
+
+ unless $failure
+   system "pprof.rb --pdf #{RAW_PROFILE} > #{PDF_PROFILE}"
+   system "open #{PDF_PROFILE}"
+ end
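
The benchmark is configured entirely through environment variables: ADAPTER picks the backend (memory, mysql or mongo) and COUNT sets how many synthetic clusters are aggregated per run, so a typical invocation would look like ADAPTER=memory COUNT=500 ruby script/benchmark-aggregator.rb (a hypothetical command line). The pprof profile is written under tmp/ and rendered to PDF only when the run succeeds.
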
data/script/benchmark-clusterer.rb
@@ -0,0 +1,102 @@
+ #!/usr/bin/env ruby
+
+ require 'bundler/setup'
+ require 'perftools'
+ require 'benchmark'
+ require 'zlib'
+ require 'yaml'
+ require 'csv'
+ require 'pry'
+ require 'ostruct'
+ require 'tsuga/adapter/memory/test'
+ require 'tsuga/adapter/sequel/test'
+ require 'tsuga/adapter/active_record/test'
+ require 'tsuga/adapter/mongoid/test'
+ require 'tsuga/service/clusterer'
+
+ ENV['CPUPROFILE_FREQUENCY'] ||= '500'
+
+ LIMIT = ENV.fetch('LIMIT', '200').to_i
+ SOURCE = ENV.fetch('SOURCE', 'doc/barcelona.csv.gz')
+ ADAPTER_NAME = ENV.fetch('ADAPTER', 'mysql')
+
+ case ADAPTER_NAME
+ when /memory/i
+   Adapter = Tsuga::Adapter::Memory::Test
+ when /sequel/i
+   DB = Sequel.connect 'mysql2://root@localhost/tsuga'
+   Adapter = Tsuga::Adapter::Sequel::Test
+ when /ar/i
+   ActiveRecord::Base.establish_connection(adapter: 'mysql2', username: 'root', host: 'localhost', database: 'tsuga')
+   ActiveRecord::Base.connection
+   Adapter = Tsuga::Adapter::ActiveRecord::Test
+ when /mongo/i
+   Adapter = Tsuga::Adapter::Mongoid::Test
+ else
+   puts 'specify an ADAPTER'
+   exit 1
+ end
+
+ Clusters = Adapter.clusters
+ Records = Adapter.records
+
+ RAW_PROFILE = "tmp/profile_#{ENV['ADAPTER']}"
+ PDF_PROFILE = "#{RAW_PROFILE}.pdf"
+
+ puts 'loading records...'
+ data = {}
+ Zlib::GzipReader.open(SOURCE) do |io|
+   CSV(io) do |csv|
+     csv.each do |row|
+       id, lng, lat = row
+       data[id] = {lat: lat.to_f, lng: lng.to_f}
+       break if data.size >= LIMIT
+     end
+   end
+ end
+
+ puts 'creating records...'
+ Records.delete_all
+ data.each_pair do |k, v|
+   Records.new(lat: v[:lat], lng: v[:lng]).persist!
+ end
+ puts " #{Records.count} records created"
+
+ puts 'profiling...'
+ PerfTools::CpuProfiler.start(RAW_PROFILE) do
+   begin
+     Tsuga::Service::Clusterer.new(source: Records, adapter: Clusters).run
+     puts "#{Clusters.count} clusters created"
+   rescue Exception => e
+     puts "caught #{e.class.name} (#{e.message})"
+     if ENV['DEBUG']
+       binding.pry
+     else
+       puts "set DEBUG next time to inspect"
+     end
+     $failure = true
+   end
+ end
+
+ unless $failure
+   system "pprof.rb --pdf #{RAW_PROFILE} > #{PDF_PROFILE}"
+   system "open #{PDF_PROFILE}"
+ end
+
+ __END__
+
+ 100,000 random records:
+ real 110m17.156s
+ user 83m0.333s
+ sys 8m34.427s
+
+ 10,000 real records (properties)
+ 122.76 real
+ 92.49 user
+ 7.50 sys
+
+ 20,000 real records (properties)
+ 239.47 real
+ 176.16 user
+ 15.94 sys
+
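
Like the aggregator benchmark, this script is driven by environment variables: ADAPTER (memory, sequel, ar or mongo), LIMIT (how many records to load, default 200) and SOURCE (a gzipped CSV of id,lng,lat rows, default doc/barcelona.csv.gz); a hypothetical run would be ADAPTER=ar LIMIT=10000 ruby script/benchmark-clusterer.rb. The figures after __END__ appear to be timings kept from earlier runs for reference.
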