tsuga 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. checksums.yaml +7 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +3 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +12 -0
  6. data/Gemfile +16 -0
  7. data/Gemfile.lock +146 -0
  8. data/Guardfile +8 -0
  9. data/LICENSE.txt +22 -0
  10. data/README.md +161 -0
  11. data/Rakefile +1 -0
  12. data/lib/tsuga.rb +11 -0
  13. data/lib/tsuga/adapter.rb +4 -0
  14. data/lib/tsuga/adapter/active_record/base.rb +61 -0
  15. data/lib/tsuga/adapter/active_record/cluster.rb +52 -0
  16. data/lib/tsuga/adapter/active_record/migration.rb +50 -0
  17. data/lib/tsuga/adapter/active_record/record.rb +15 -0
  18. data/lib/tsuga/adapter/active_record/test.rb +73 -0
  19. data/lib/tsuga/adapter/memory/base.rb +146 -0
  20. data/lib/tsuga/adapter/memory/cluster.rb +32 -0
  21. data/lib/tsuga/adapter/memory/test.rb +27 -0
  22. data/lib/tsuga/adapter/mongoid/base.rb +41 -0
  23. data/lib/tsuga/adapter/mongoid/cluster.rb +29 -0
  24. data/lib/tsuga/adapter/mongoid/record.rb +16 -0
  25. data/lib/tsuga/adapter/mongoid/test.rb +77 -0
  26. data/lib/tsuga/adapter/sequel/base.rb +57 -0
  27. data/lib/tsuga/adapter/sequel/cluster.rb +43 -0
  28. data/lib/tsuga/adapter/sequel/record.rb +15 -0
  29. data/lib/tsuga/adapter/sequel/test.rb +73 -0
  30. data/lib/tsuga/adapter/shared.rb +4 -0
  31. data/lib/tsuga/adapter/shared/cluster.rb +19 -0
  32. data/lib/tsuga/errors.rb +3 -0
  33. data/lib/tsuga/model/cluster.rb +147 -0
  34. data/lib/tsuga/model/point.rb +206 -0
  35. data/lib/tsuga/model/record.rb +20 -0
  36. data/lib/tsuga/model/tile.rb +136 -0
  37. data/lib/tsuga/service/aggregator.rb +175 -0
  38. data/lib/tsuga/service/clusterer.rb +260 -0
  39. data/lib/tsuga/service/labeler.rb +20 -0
  40. data/lib/tsuga/version.rb +3 -0
  41. data/script/benchmark-aggregator.rb +72 -0
  42. data/script/benchmark-clusterer.rb +102 -0
  43. data/spec/adapter/memory/base_spec.rb +174 -0
  44. data/spec/adapter/memory/cluster_spec.rb +39 -0
  45. data/spec/adapter/shared/cluster_spec.rb +56 -0
  46. data/spec/integration/active_record_spec.rb +10 -0
  47. data/spec/integration/memory_spec.rb +10 -0
  48. data/spec/integration/mongoid_spec.rb +10 -0
  49. data/spec/integration/sequel_spec.rb +10 -0
  50. data/spec/integration/shared.rb +50 -0
  51. data/spec/model/point_spec.rb +102 -0
  52. data/spec/model/tile_spec.rb +116 -0
  53. data/spec/service/aggregator_spec.rb +143 -0
  54. data/spec/service/clusterer_spec.rb +84 -0
  55. data/spec/spec_helper.rb +26 -0
  56. data/spec/support/mongoid.yml +17 -0
  57. data/tsuga.gemspec +29 -0
  58. metadata +226 -0
data/lib/tsuga/service/clusterer.rb
@@ -0,0 +1,260 @@
+ require 'tsuga/model/tile'
+ require 'tsuga/service/aggregator'
+
+ require 'forwardable'
+ require 'ruby-progressbar'
+
+ module Tsuga::Service
+   class Clusterer
+     PROXIMITY_RATIO = 0.15
+     RUN_SANITY_CHECK = false
+     VERBOSE = ENV['VERBOSE']
+     Tile = Tsuga::Model::Tile
+
+     attr_reader :_adapter, :_source, :_queue
+
+     def initialize(source: nil, adapter: nil)
+       @_source = source
+       @_adapter = adapter
+       @_queue = WriteQueue.new(adapter: adapter)
+     end
+
+     def run
+       # delete all clusters
+       _adapter.delete_all
+
+       # create lowest-level clusters
+       _source.find_each do |record|
+         _queue.push _adapter.build_from(Tsuga::MAX_DEPTH, record)
+       end
+       _queue.flush
+
+       # for all depths from MAX_DEPTH-1 down to MIN_DEPTH
+       (Tsuga::MAX_DEPTH-1).downto(Tsuga::MIN_DEPTH) do |depth|
+         progress.log "depth #{depth}" if VERBOSE
+         progress.title = "#{depth}.0" if VERBOSE
+
+         # create clusters at this level from children
+         # TODO: use a save queue, only run saves if > 100 clusters to write
+         _adapter.at_depth(depth+1).find_each do |child|
+           _queue.push _adapter.build_from(depth, child)
+         end
+         _queue.flush
+         cluster_ids = MutableSet.new(_adapter.at_depth(depth).collect_ids)
+
+         if cluster_ids.empty?
+           progress.log "nothing to cluster" if VERBOSE
+           break
+         end
+
+         # TODO: group points to cluster by tile, and run on tiles in parallel.
+
+         progress.title = "#{depth}.1" if VERBOSE
+         progress.log "started with #{cluster_ids.length} clusters" if VERBOSE
+         progress.set_phase(depth, 1, cluster_ids.length) if VERBOSE
+         while cluster_ids.any?
+           progress.set_progress(cluster_ids.length) if VERBOSE
+
+           cluster = _adapter.find_by_id(cluster_ids.first)
+           raise 'internal error: cluster was already removed' if cluster.nil?
+           tile = Tile.including(cluster, depth: depth)
+
+           clusters = _adapter.in_tile(*tile.neighbours).to_a
+           processed_cluster_ids = clusters.collect(&:id)
+
+           # clusters we aggregate in this loop iteration
+           # they are _not_ the same as what we pass to the aggregator,
+           # just those inside the fence
+           fenced_cluster_ids = _adapter.in_tile(tile).collect_ids
+           raise RuntimeError, 'no cluster in fence' if fenced_cluster_ids.empty?
+
+           Aggregator.new(clusters: clusters, ratio: PROXIMITY_RATIO, fence: tile).tap do |aggregator|
+             aggregator.run
+
+             if VERBOSE
+               progress.log("aggregator: %4d left, %2d processed, %2d in fence, %2d updated, %2d dropped" % [
+                 cluster_ids.length,
+                 processed_cluster_ids.length,
+                 fenced_cluster_ids.length,
+                 aggregator.updated_clusters.length,
+                 aggregator.dropped_clusters.length])
+               if aggregator.updated_clusters.any?
+                 progress.log("updated: #{aggregator.updated_clusters.collect(&:id).join(', ')}")
+               end
+               if aggregator.dropped_clusters.any?
+                 progress.log("dropped: #{aggregator.dropped_clusters.collect(&:id).join(', ')}")
+               end
+             end
+
+             cluster_ids.remove! fenced_cluster_ids
+             # updated clusters may need to be reprocessed (they might have fallen close enough to tile edges)
+             # TODO: as a further optimisation, do not mark for reprocessing clusters that are still inside the fence
+             cluster_ids.merge! aggregator.updated_clusters.collect(&:id)
+             # dropped clusters may include some on the outer fringe of the fence tile
+             cluster_ids.remove! aggregator.dropped_clusters.collect(&:id)
+
+             aggregator.dropped_clusters.each(&:destroy)
+             _adapter.mass_update(aggregator.updated_clusters)
+           end
+
+           if RUN_SANITY_CHECK
+             # sanity check: all <cluster_ids> should exist
+             not_removed = cluster_ids - _adapter.at_depth(depth).collect_ids
+             if not_removed.any?
+               raise "cluster_ids contains IDs of deleted clusters: #{not_removed.to_a.join(', ')}"
+             end
+
+             # sanity check: sum of weights should match that of the lower level
+             deeper_weight = _adapter.at_depth(depth+1).sum(:weight)
+             this_weight   = _adapter.at_depth(depth).sum(:weight)
+             if deeper_weight != this_weight
+               raise "mismatch between weight at this depth (#{this_weight}) and deeper level (#{deeper_weight})"
+             end
+           end
+         end
+
+         # set parent_id in the whole tree
+         # this is made slightly more complicated by #find_each's scoping
+         progress.title = "#{depth}.2" if VERBOSE
+         child_mappings = {}
+         _adapter.at_depth(depth).find_each do |cluster|
+           cluster.children_ids.each do |child_id|
+             child_mappings[child_id] = cluster.id
+           end
+         end
+         child_mappings.each_pair do |child_id, parent_id|
+           cluster = _adapter.find_by_id(child_id)
+           cluster.parent_id = parent_id
+           _queue.push cluster
+         end
+         _queue.flush
+       end
+       progress.finish if VERBOSE
+     end
+
+     private
+
+     def progress
+       @_progressbar ||= ProgressBar.create.extend(SteppedProgressBar)
+     end
+
+     module SteppedProgressBar
+       def set_phase(depth, phase, count)
+         _compute_totals
+         @current_phase = phase
+         @current_depth = depth
+         @current_count = count
+       end
+
+       def set_progress(count)
+         key = [@current_depth, @current_phase]
+         self.progress = @phase_total[key] -
+                         @phase_subtotal[key] * count / @current_count
+       end
+
+       private
+
+       MAX = Tsuga::MAX_DEPTH-1
+       MIN = Tsuga::MIN_DEPTH
+       FACTOR = 0.5
+
+       def _compute_totals
+         return if @phase_total
+         sum = 0
+         @phase_total = {}
+         @phase_subtotal = {}
+         MAX.downto(MIN) do |depth|
+           depth_weight = FACTOR ** (MAX-depth)
+           [1,1,1].each_with_index do |phase_weight, phase_index|
+             phase_subtotal = depth_weight * phase_weight
+             sum += phase_subtotal
+             @phase_total[[depth,phase_index]] = sum
+             @phase_subtotal[[depth,phase_index]] = phase_subtotal
+           end
+         end
+         self.total = sum
+       end
+     end
+
+     # A Set-like structure, with in-place merging with, and removal of, another enumerable.
+     class MutableSet
+       include Enumerable
+       extend Forwardable
+
+       def initialize(enum = nil)
+         @_data = {}
+         merge!(enum) if enum
+       end
+
+       def -(enum)
+         self.class.new.tap do |result|
+           result.instance_variable_set(:@_data, @_data.dup)
+           result.remove!(enum)
+         end
+       end
+
+       def each
+         @_data.each_key { |k| yield k }
+       end
+
+       def merge!(enum)
+         enum.each { |key| @_data[key] = true }
+       end
+
+       def remove!(enum)
+         enum.each { |key| @_data.delete(key) }
+       end
+
+       def_delegators :@_data, :size, :length, :empty?
+     end
+
+     # TODO: extract to a separate file
+     class WriteQueue
+       QUEUE_SIZE = 250
+
+       def initialize(adapter: nil)
+         @_adapter = adapter
+         @_queue = []
+       end
+
+       def push(value)
+         @_queue.push(value)
+         flush if @_queue.size > QUEUE_SIZE
+         nil
+       end
+
+       def flush
+         # separate inserts from updates
+         inserts = _queue.map { |c| c.new_record? ? c : nil }.compact
+         updates = _queue.map { |c| c.new_record? ? nil : c }.compact
+
+         _adapter.mass_create(inserts) if inserts.any?
+         _adapter.mass_update(updates) if updates.any?
+         _queue.clear
+       end
+
+       private
+
+       attr_reader :_queue, :_adapter
+     end
+
+     # returns the IDs of the records used, plus the clusters built
+     def _build_clusters(tile)
+       used_ids = []
+       clusters = []
+
+       _adapter.in_tile(*tile.children).find_each do |child|
+         cluster = _adapter.build_from(tile.depth, child)
+         clusters << cluster
+         used_ids << child.id
+       end
+
+       return [used_ids, clusters]
+     end
+
+   end
+ end
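For orientation, here is a minimal sketch of driving the clusterer, modelled on script/benchmark-clusterer.rb later in this diff; the in-memory test adapter and the lat/lng/persist! record API are taken from that script, and the seed coordinates are made up for illustration:

    require 'tsuga/adapter/memory/test'
    require 'tsuga/service/clusterer'

    records  = Tsuga::Adapter::Memory::Test.records
    clusters = Tsuga::Adapter::Memory::Test.clusters

    # seed a couple of points
    records.new(lat: 41.38, lng: 2.17).persist!
    records.new(lat: 41.40, lng: 2.15).persist!

    # build the cluster tree from MAX_DEPTH up to MIN_DEPTH
    Tsuga::Service::Clusterer.new(source: records, adapter: clusters).run
    puts "#{clusters.count} clusters created"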
data/lib/tsuga/service/labeler.rb
@@ -0,0 +1,20 @@
+ module Tsuga::Service
+   # Adds geo hashes to records.
+   class Labeler
+     def initialize(adapter)
+       @_adapter = adapter
+     end
+
+     def run
+       _adapter.records.find_each do |record|
+         record.update_geohash
+         record.persist!
+       end
+     end
+
+     private
+
+     attr_reader :_adapter
+
+   end
+ end
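Usage is a one-liner; a sketch, assuming an adapter that exposes a #records relation whose records implement #update_geohash and #persist! (the contract the run method above relies on, which the gem's test adapters appear to satisfy):

    require 'tsuga/adapter/memory/test'
    require 'tsuga/service/labeler'

    adapter = Tsuga::Adapter::Memory::Test
    adapter.records.new(lat: 41.38, lng: 2.17).persist!

    # recompute and persist the geohash of every record
    Tsuga::Service::Labeler.new(adapter).run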
data/lib/tsuga/version.rb
@@ -0,0 +1,3 @@
+ module Tsuga
+   VERSION = "0.0.1"
+ end
data/script/benchmark-aggregator.rb
@@ -0,0 +1,72 @@
+ #!/usr/bin/env ruby
+ require 'bundler/setup'
+ require 'perftools'
+ require 'benchmark'
+ require 'tsuga/adapter/memory/test'
+ require 'tsuga/adapter/sequel/test'
+ require 'tsuga/adapter/mongoid/test'
+ require 'tsuga/service/aggregator'
+ require 'pry'
+ require 'pry-nav'
+
+ COUNT = ENV.fetch('COUNT', '100').to_i
+ ENV['CPUPROFILE_FREQUENCY'] ||= '500'
+
+ case ENV['ADAPTER']
+ when /memory/i
+   Cluster = Tsuga::Adapter::Memory::Test.clusters
+ when /mysql/i
+   DB = Sequel.connect 'mysql2://root@localhost/tsuga'
+   Cluster = Tsuga::Adapter::Sequel::Test.clusters
+ when /mongo/i
+   Cluster = Tsuga::Adapter::Mongoid::Test.clusters
+ else
+   puts 'specify an ADAPTER'
+   exit 1
+ end
+
+ RAW_PROFILE = "tmp/profile_#{ENV['ADAPTER']}"
+ PDF_PROFILE = "#{RAW_PROFILE}.pdf"
+
+ def new_cluster(depth, lat, lng)
+   Cluster.new.tap do |cluster|
+     cluster.depth = depth
+     cluster.lat = lat
+     cluster.lng = lng
+     cluster.weight = 1
+     cluster.sum_lat = lat
+     cluster.sum_lng = lng
+     cluster.children_ids = []
+     # cluster.persist!
+   end
+ end
+
+ PerfTools::CpuProfiler.start(RAW_PROFILE) do
+   begin
+     10.times do |idx|
+       Cluster.delete_all
+       lat_max = 45 - 1e-4
+       lng_max = 90 - 1e-4
+       clusters = (1..COUNT).map { new_cluster(2, rand*lat_max, rand*lng_max) }
+
+       runtime = Benchmark.measure do
+         Tsuga::Service::Aggregator.new(clusters).run
+       end
+       puts "run #{idx}: #{runtime}"
+     end
+   rescue Exception => e
+     puts "caught #{e.class.name} (#{e.message})"
+     if ENV['DEBUG']
+       binding.pry
+     else
+       puts "set DEBUG next time to inspect"
+     end
+     $failure = true
+   end
+ end
+
+ unless $failure
+   system "pprof.rb --pdf #{RAW_PROFILE} > #{PDF_PROFILE}"
+   system "open #{PDF_PROFILE}"
+ end
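Both benchmark scripts are driven entirely by environment variables rather than command-line flags: ADAPTER selects the storage backend (required here: memory, mysql, or mongo), COUNT sets the number of synthetic clusters (default 100), CPUPROFILE_FREQUENCY tunes perftools' sampling rate, and DEBUG opens a pry session on failure. A typical run would be `ADAPTER=memory COUNT=500 ruby script/benchmark-aggregator.rb`; note the script assumes a tmp/ directory exists for the raw profile, and that google-perftools' pprof.rb and (on macOS) the open command are available for the report step.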
data/script/benchmark-clusterer.rb
@@ -0,0 +1,102 @@
+ #!/usr/bin/env ruby
+
+ require 'bundler/setup'
+ require 'perftools'
+ require 'benchmark'
+ require 'zlib'
+ require 'yaml'
+ require 'csv'
+ require 'pry'
+ require 'ostruct'
+ require 'tsuga/adapter/memory/test'
+ require 'tsuga/adapter/sequel/test'
+ require 'tsuga/adapter/active_record/test'
+ require 'tsuga/adapter/mongoid/test'
+ require 'tsuga/service/clusterer'
+
+ ENV['CPUPROFILE_FREQUENCY'] ||= '500'
+
+ LIMIT        = ENV.fetch('LIMIT', '200').to_i
+ SOURCE       = ENV.fetch('SOURCE', 'doc/barcelona.csv.gz')
+ ADAPTER_NAME = ENV.fetch('ADAPTER', 'mysql')
+
+ case ADAPTER_NAME
+ when /memory/i
+   Adapter = Tsuga::Adapter::Memory::Test
+ when /sequel/i
+   DB = Sequel.connect 'mysql2://root@localhost/tsuga'
+   Adapter = Tsuga::Adapter::Sequel::Test
+ when /ar/i
+   ActiveRecord::Base.establish_connection(adapter: 'mysql2', username: 'root', host: 'localhost', database: 'tsuga')
+   ActiveRecord::Base.connection
+   Adapter = Tsuga::Adapter::ActiveRecord::Test
+ when /mongo/i
+   Adapter = Tsuga::Adapter::Mongoid::Test
+ else
+   puts 'specify an ADAPTER'
+   exit 1
+ end
+
+ Clusters = Adapter.clusters
+ Records  = Adapter.records
+
+ RAW_PROFILE = "tmp/profile_#{ADAPTER_NAME}"
+ PDF_PROFILE = "#{RAW_PROFILE}.pdf"
+
+ puts 'loading records...'
+ data = {}
+ Zlib::GzipReader.open(SOURCE) do |io|
+   CSV(io) do |csv|
+     csv.each do |row|
+       id, lng, lat = row
+       data[id] = { lat: lat.to_f, lng: lng.to_f }
+       break if data.size >= LIMIT
+     end
+   end
+ end
+
+ puts 'creating records...'
+ Records.delete_all
+ data.each_pair do |k, v|
+   Records.new(lat: v[:lat], lng: v[:lng]).persist!
+ end
+ puts " #{Records.count} records created"
+
+ puts 'profiling...'
+ PerfTools::CpuProfiler.start(RAW_PROFILE) do
+   begin
+     Tsuga::Service::Clusterer.new(source: Records, adapter: Clusters).run
+     puts "#{Clusters.count} clusters created"
+   rescue Exception => e
+     puts "caught #{e.class.name} (#{e.message})"
+     if ENV['DEBUG']
+       binding.pry
+     else
+       puts "set DEBUG next time to inspect"
+     end
+     $failure = true
+   end
+ end
+
+ unless $failure
+   system "pprof.rb --pdf #{RAW_PROFILE} > #{PDF_PROFILE}"
+   system "open #{PDF_PROFILE}"
+ end
+
+ __END__
+
+ 100,000 random records:
+ real    110m17.156s
+ user    83m0.333s
+ sys     8m34.427s
+
+ 10,000 real records (properties)
+   122.76 real
+    92.49 user
+     7.50 sys
+
+ 20,000 real records (properties)
+   239.47 real
+   176.16 user
+    15.94 sys