nose 0.1.0pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +7 -0
  2. data/lib/nose/backend/cassandra.rb +390 -0
  3. data/lib/nose/backend/file.rb +185 -0
  4. data/lib/nose/backend/mongo.rb +242 -0
  5. data/lib/nose/backend.rb +557 -0
  6. data/lib/nose/cost/cassandra.rb +33 -0
  7. data/lib/nose/cost/entity_count.rb +27 -0
  8. data/lib/nose/cost/field_size.rb +31 -0
  9. data/lib/nose/cost/request_count.rb +32 -0
  10. data/lib/nose/cost.rb +68 -0
  11. data/lib/nose/debug.rb +45 -0
  12. data/lib/nose/enumerator.rb +199 -0
  13. data/lib/nose/indexes.rb +239 -0
  14. data/lib/nose/loader/csv.rb +99 -0
  15. data/lib/nose/loader/mysql.rb +199 -0
  16. data/lib/nose/loader/random.rb +48 -0
  17. data/lib/nose/loader/sql.rb +105 -0
  18. data/lib/nose/loader.rb +38 -0
  19. data/lib/nose/model/entity.rb +136 -0
  20. data/lib/nose/model/fields.rb +293 -0
  21. data/lib/nose/model.rb +113 -0
  22. data/lib/nose/parser.rb +202 -0
  23. data/lib/nose/plans/execution_plan.rb +282 -0
  24. data/lib/nose/plans/filter.rb +99 -0
  25. data/lib/nose/plans/index_lookup.rb +302 -0
  26. data/lib/nose/plans/limit.rb +42 -0
  27. data/lib/nose/plans/query_planner.rb +361 -0
  28. data/lib/nose/plans/sort.rb +49 -0
  29. data/lib/nose/plans/update.rb +60 -0
  30. data/lib/nose/plans/update_planner.rb +270 -0
  31. data/lib/nose/plans.rb +135 -0
  32. data/lib/nose/proxy/mysql.rb +275 -0
  33. data/lib/nose/proxy.rb +102 -0
  34. data/lib/nose/query_graph.rb +481 -0
  35. data/lib/nose/random/barbasi_albert.rb +48 -0
  36. data/lib/nose/random/watts_strogatz.rb +50 -0
  37. data/lib/nose/random.rb +391 -0
  38. data/lib/nose/schema.rb +89 -0
  39. data/lib/nose/search/constraints.rb +143 -0
  40. data/lib/nose/search/problem.rb +328 -0
  41. data/lib/nose/search/results.rb +200 -0
  42. data/lib/nose/search.rb +266 -0
  43. data/lib/nose/serialize.rb +747 -0
  44. data/lib/nose/statements/connection.rb +160 -0
  45. data/lib/nose/statements/delete.rb +83 -0
  46. data/lib/nose/statements/insert.rb +146 -0
  47. data/lib/nose/statements/query.rb +161 -0
  48. data/lib/nose/statements/update.rb +101 -0
  49. data/lib/nose/statements.rb +645 -0
  50. data/lib/nose/timing.rb +79 -0
  51. data/lib/nose/util.rb +305 -0
  52. data/lib/nose/workload.rb +244 -0
  53. data/lib/nose.rb +37 -0
  54. data/templates/workload.erb +42 -0
  55. metadata +700 -0
@@ -0,0 +1,266 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'search/constraints'
4
+ require_relative 'search/problem'
5
+ require_relative 'search/results'
6
+
7
+ require 'logging'
8
+ require 'ostruct'
9
+ require 'tempfile'
10
+
11
+ module NoSE
12
+ # ILP construction and schema search
13
+ module Search
14
+ # Searches for the optimal indices for a given workload
15
+ class Search
16
# Set up a search over the given workload
# @param workload the workload whose statements are searched over
# @param cost_model the cost model stored for later planning and solving
# @param objective the objective to optimize (Objective::COST by default)
# @param by_id_graph [Boolean] whether indexes are grouped by ID graph
# @raise [RuntimeError] if ID graphs are requested with a non-cost objective
def initialize(workload, cost_model, objective = Objective::COST,
               by_id_graph = false)
  @logger = Logging.logger['nose::search']
  @workload = workload
  @cost_model = cost_model
  @objective = objective
  @by_id_graph = by_id_graph

  # Grouping by ID graph is only supported together with the cost
  # objective for now, although other objectives are still feasible
  unsupported = @by_id_graph && objective != Objective::COST
  raise 'Only cost-based optimization allowed when using ID graphs' \
    if unsupported
end
29
+
30
# Search for optimal indices using an ILP which searches for
# non-overlapping indices
# @param indexes the candidate indexes to select from
# @param max_space the space bound placed in the solver parameters
#   (unbounded by default)
# @return [Results, nil] nil when no candidate indexes are given
def search_overlap(indexes, max_space = Float::INFINITY)
  return if indexes.empty?

  # Gather the cost of every query and update before building the problem
  weights = combine_query_weights(indexes)
  costs, trees = query_costs(weights, indexes)
  costs_of_updates, plans_for_updates = update_costs(trees, indexes)

  log_search_start(costs, weights)

  params = { max_space: max_space,
             costs: costs,
             update_costs: costs_of_updates,
             cost_model: @cost_model,
             by_id_graph: @by_id_graph }
  search_result(weights, indexes, params, trees, plans_for_updates)
end
53
+
54
+ private
55
+
56
# Combine the weights of the support queries the workload reports for the
# given indexes with the weights of the queries in the workload itself
# @param indexes the candidate indexes (replaced by their unique ID graphs
#   when grouping by ID graph)
# @return [Hash] a mapping from each query to its weight
def combine_query_weights(indexes)
  indexes = indexes.map(&:to_id_graph).uniq if @by_id_graph

  # A support query takes the weight recorded for its statement
  combined = {}
  @workload.support_queries(indexes).each do |support|
    combined[support] = @workload.statement_weights[support.statement]
  end

  # Workload queries keep the weight they were given directly
  @workload.statement_weights.each do |statement, weight|
    combined[statement] = weight if statement.is_a? Query
  end

  combined
end
69
+
70
# Log the query costs and a numbered list of queries at debug level before
# the search starts
# @param costs the costs to render with pp_s
# @param query_weights [Hash] only its keys (the queries) are listed
# @return [void]
def log_search_start(costs, query_weights)
  @logger.debug do
    rendered_costs = pp_s(costs)

    # Number the queries by their position in the weights hash
    numbered = query_weights.keys.each_with_index.map do |query, position|
      "#{position} #{query.inspect}"
    end

    "Costs: \n" + rendered_costs + "\nSearch with queries:\n" +
      numbered.join("\n")
  end
end
81
+
82
# Run the solver and attach the workload, plans and cost model to its result
# @param query_weights [Hash] its keys are the queries given to the solver
# @param indexes the candidate indexes
# @param solver_params [Hash] forwarded to solve_mipper
# @param trees handed to the result to produce its query plans
# @param update_plans [Hash] its values are collections of update plans
# @return [Results]
def search_result(query_weights, indexes, solver_params, trees,
                  update_plans)
  # Solve the LP using MIPPeR
  solution = solve_mipper query_weights.keys, indexes, **solver_params

  solution.workload = @workload
  solution.plans_from_trees trees
  solution.cost_model = @cost_model

  # Only update plans for indexes which were selected are relevant
  candidates = update_plans.values.flatten(1)
  relevant = candidates.select do |plan|
    solution.indexes.include? plan.index
  end

  # Let each update plan pick its query plans through the result
  relevant.each do |plan|
    plan.select_query_plans(&solution.method(:select_plan))
  end
  solution.update_plans = relevant

  solution.validate

  solution
end
106
+
107
# Select the cheapest plan for each non-support query given a set of indexes
# @param trees the plan trees, one per query
# @param indexes the indexes the selected plans may use
# @return [Array<Plans::QueryPlan>]
def select_plans(trees, indexes)
  trees.each_with_object([]) do |tree, chosen|
    # Support queries are skipped since they will be in update plans
    next if tree.query.is_a?(SupportQuery)

    # Of the plans usable with these indexes, keep the one of least cost
    cheapest = tree.select_using_indexes(indexes).min_by(&:cost)
    chosen << cheapest unless cheapest.nil?
  end
end
119
+
120
# Solve the index selection problem using MIPPeR
# @param queries the queries the problem is built from
# @param indexes the candidate indexes, used here only to number the
#   selected indexes in the log output
# @param data [Hash] the solver parameters handed to the problem
# @return [Results]
def solve_mipper(queries, indexes, data)
  # Construct and solve the ILP
  ilp = Problem.new(queries, @workload.updates, data, @objective)
  ilp.solve

  # We won't get here if there's no valid solution
  @logger.debug "Found solution with total cost #{ilp.objective_value}"

  # Log each selected index next to its position among the candidates
  chosen = ilp.selected_indexes
  @logger.debug do
    listing = chosen.map do |index|
      "#{indexes.index index} #{index.inspect}"
    end

    "Selected indexes:\n" + listing.join("\n")
  end

  ilp.result
end
142
+
143
# Produce the cost of updates in the workload along with their plans
# @param trees handed to the update planner
# @param indexes the candidate indexes to plan updates for
# @return [Array<Hash>] the update costs followed by the update plans, both
#   keyed by statement
def update_costs(trees, indexes)
  planner = Plans::UpdatePlanner.new(@workload.model, trees, @cost_model,
                                     @by_id_graph)

  # Missing statements default to an empty hash of costs or list of plans
  costs = Hash.new { |hash, statement| hash[statement] = {} }
  plans = Hash.new { |hash, statement| hash[statement] = [] }

  # Queries are not updates, so only the other statements are planned
  @workload.statements.each do |statement|
    populate_update_costs(planner, statement, indexes, costs, plans) \
      unless statement.is_a? Query
  end

  [costs, plans]
end
158
+
159
# Populate the cost of all necessary plans for the given statement
# @param planner the update planner asked for plans for the statement
# @param statement the update statement to plan for
# @param indexes the candidate indexes
# @param update_costs [Hash] updated in place with the weighted cost of each
#   plan, keyed by statement and then by the plan's index
# @param update_plans [Hash] updated in place with the plans per statement
# @return [void]
def populate_update_costs(planner, statement, indexes,
                          update_costs, update_plans)
  plans = planner.find_plans_for_update(statement, indexes)

  # The weight depends only on the statement, so it is looked up once
  # instead of once per plan
  weight = @workload.statement_weights[statement]

  plans.each do |plan|
    update_costs[statement][plan.index] = plan.update_cost * weight
    update_plans[statement] << plan
  end
end
169
+
170
# Get the cost of using each index for each query in a workload
# @param query_weights [Hash] maps each query to its weight
# @param indexes the candidate indexes given to the query planner
# @return [Array] a hash from each query to its costs, followed by the list
#   of plan trees in the same order as the queries
def query_costs(query_weights, indexes)
  planner = Plans::QueryPlanner.new(@workload, indexes, @cost_model)

  # Every outcome is the [costs, tree] pair returned by query_cost
  outcomes = Parallel.map(query_weights) do |query, weight|
    query_cost planner, query, weight
  end

  # Outcomes are matched back to their queries by position
  costs = {}
  query_weights.each_key.with_index do |query, position|
    costs[query] = outcomes[position].first
  end

  [costs, outcomes.map(&:last)]
end
183
+
184
# Get the cost for indices for an individual query
# @param planner the query planner used to find plans for the query
# @param query the query to find plans for
# @param weight the weight applied to the cost of the query's plans
# @return [Array] the costs keyed by index followed by the plan tree
def query_cost(planner, query, weight)
  costs_by_index = {}
  tree = planner.find_plans_for_query(query)

  tree.each do |plan|
    # Start a new group at every index lookup and attach the steps which
    # follow to the most recent group
    grouped = []
    plan.each do |step|
      grouped.push [] if step.is_a? Plans::IndexLookupPlanStep
      grouped.last.push step
    end

    populate_query_costs costs_by_index, grouped, weight, query, tree
  end

  [costs_by_index, tree]
end
204
+
205
# Store the costs and indexes for one plan in the hash of query costs
#
# Each key of query_costs is an index. The value is a two-element array
# holding the plan steps which start with the lookup on that index and the
# weighted cost of those steps (excluding any sort step).
# @param query_costs [Hash] updated in place
# @param steps_by_index [Array<Array>] plan steps grouped so that every
#   group starts with its index lookup step
# @param weight the weight multiplied into every cost
# @param query the query being planned (used only for diagnostics)
# @param tree the plans for the query (used only for diagnostics)
# @raise [RuntimeError] if an index already has a recorded cost which
#   differs from the newly discovered one
# @return [void]
def populate_query_costs(query_costs, steps_by_index, weight,
                         query, tree)
  steps_by_index.each do |steps|
    # The first step in each group is the lookup on the index
    index = steps.first.index

    # Calculate the cost for just these steps in the plan
    cost = steps.sum_by(&:cost) * weight

    # Don't count the cost for sorting at the end
    sort_step = steps.find { |s| s.is_a? Plans::SortPlanStep }
    cost -= sort_step.cost * weight unless sort_step.nil?

    unless query_costs.key? index
      # This index has not been seen yet, so record its steps and cost
      query_costs[index] = [steps, cost]
      next
    end

    # We must always have the same cost (within a tolerance of 1e-5),
    # an existing entry is never replaced
    current_steps = query_costs[index].first
    current_cost = query_costs[index].last
    next unless (current_cost - cost).abs >= 10E-6

    report_cost_mismatch query, tree, index, current_steps, current_cost,
                         steps, cost
    fail "Index #{index.key} does not have equivalent cost " \
         "(current cost: #{current_cost}, discovered cost: #{cost})"
  end
end

# Print the details of two conflicting costs for the same index to stdout
# @return [void]
def report_cost_mismatch(query, tree, index, current_steps, current_cost,
                         steps, cost)
  p query
  puts "Index #{index.key} does not have equivalent cost"
  puts "Current cost: #{current_cost}, discovered cost: #{cost}"

  puts "\nCurrent steps"
  current_steps.each { |s| p s }

  puts "\nDiscovered steps"
  steps.each { |s| p s }
  puts

  # Show every plan using the index, cheapest first, with per-step costs
  puts '======================================='
  tree.sort_by(&:cost).each do |plan|
    next unless plan.indexes.include?(index)
    plan.each do |step|
      print(format('%.3f', step.cost).rjust(7) + ' ')
      p step
    end
    puts "#{format('%.3f', plan.cost).rjust(7)} total"
    puts '======================================='
  end

  puts
  p tree
end
264
+ end
265
+ end
266
+ end