nose 0.1.0pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/nose/backend/cassandra.rb +390 -0
- data/lib/nose/backend/file.rb +185 -0
- data/lib/nose/backend/mongo.rb +242 -0
- data/lib/nose/backend.rb +557 -0
- data/lib/nose/cost/cassandra.rb +33 -0
- data/lib/nose/cost/entity_count.rb +27 -0
- data/lib/nose/cost/field_size.rb +31 -0
- data/lib/nose/cost/request_count.rb +32 -0
- data/lib/nose/cost.rb +68 -0
- data/lib/nose/debug.rb +45 -0
- data/lib/nose/enumerator.rb +199 -0
- data/lib/nose/indexes.rb +239 -0
- data/lib/nose/loader/csv.rb +99 -0
- data/lib/nose/loader/mysql.rb +199 -0
- data/lib/nose/loader/random.rb +48 -0
- data/lib/nose/loader/sql.rb +105 -0
- data/lib/nose/loader.rb +38 -0
- data/lib/nose/model/entity.rb +136 -0
- data/lib/nose/model/fields.rb +293 -0
- data/lib/nose/model.rb +113 -0
- data/lib/nose/parser.rb +202 -0
- data/lib/nose/plans/execution_plan.rb +282 -0
- data/lib/nose/plans/filter.rb +99 -0
- data/lib/nose/plans/index_lookup.rb +302 -0
- data/lib/nose/plans/limit.rb +42 -0
- data/lib/nose/plans/query_planner.rb +361 -0
- data/lib/nose/plans/sort.rb +49 -0
- data/lib/nose/plans/update.rb +60 -0
- data/lib/nose/plans/update_planner.rb +270 -0
- data/lib/nose/plans.rb +135 -0
- data/lib/nose/proxy/mysql.rb +275 -0
- data/lib/nose/proxy.rb +102 -0
- data/lib/nose/query_graph.rb +481 -0
- data/lib/nose/random/barbasi_albert.rb +48 -0
- data/lib/nose/random/watts_strogatz.rb +50 -0
- data/lib/nose/random.rb +391 -0
- data/lib/nose/schema.rb +89 -0
- data/lib/nose/search/constraints.rb +143 -0
- data/lib/nose/search/problem.rb +328 -0
- data/lib/nose/search/results.rb +200 -0
- data/lib/nose/search.rb +266 -0
- data/lib/nose/serialize.rb +747 -0
- data/lib/nose/statements/connection.rb +160 -0
- data/lib/nose/statements/delete.rb +83 -0
- data/lib/nose/statements/insert.rb +146 -0
- data/lib/nose/statements/query.rb +161 -0
- data/lib/nose/statements/update.rb +101 -0
- data/lib/nose/statements.rb +645 -0
- data/lib/nose/timing.rb +79 -0
- data/lib/nose/util.rb +305 -0
- data/lib/nose/workload.rb +244 -0
- data/lib/nose.rb +37 -0
- data/templates/workload.erb +42 -0
- metadata +700 -0
data/lib/nose/search.rb
ADDED
@@ -0,0 +1,266 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'search/constraints'
|
4
|
+
require_relative 'search/problem'
|
5
|
+
require_relative 'search/results'
|
6
|
+
|
7
|
+
require 'logging'
|
8
|
+
require 'ostruct'
|
9
|
+
require 'tempfile'
|
10
|
+
|
11
|
+
module NoSE
|
12
|
+
# ILP construction and schema search
|
13
|
+
module Search
|
14
|
+
# Searches for the optimal indices for a given workload
|
15
|
+
class Search
|
16
|
+
def initialize(workload, cost_model, objective = Objective::COST,
|
17
|
+
by_id_graph = false)
|
18
|
+
@logger = Logging.logger['nose::search']
|
19
|
+
@workload = workload
|
20
|
+
@cost_model = cost_model
|
21
|
+
@objective = objective
|
22
|
+
@by_id_graph = by_id_graph
|
23
|
+
|
24
|
+
# For now we only support optimization based on cost when grouping by
|
25
|
+
# ID graphs, but support for other objectives is still feasible
|
26
|
+
fail 'Only cost-based optimization allowed when using ID graphs' \
|
27
|
+
if @by_id_graph && objective != Objective::COST
|
28
|
+
end
|
29
|
+
|
30
|
+
# Search for optimal indices using an ILP which searches for
|
31
|
+
# non-overlapping indices
|
32
|
+
# @return [Results]
|
33
|
+
def search_overlap(indexes, max_space = Float::INFINITY)
|
34
|
+
return if indexes.empty?
|
35
|
+
|
36
|
+
# Get the costs of all queries and updates
|
37
|
+
query_weights = combine_query_weights indexes
|
38
|
+
costs, trees = query_costs query_weights, indexes
|
39
|
+
update_costs, update_plans = update_costs trees, indexes
|
40
|
+
|
41
|
+
log_search_start costs, query_weights
|
42
|
+
|
43
|
+
solver_params = {
|
44
|
+
max_space: max_space,
|
45
|
+
costs: costs,
|
46
|
+
update_costs: update_costs,
|
47
|
+
cost_model: @cost_model,
|
48
|
+
by_id_graph: @by_id_graph
|
49
|
+
}
|
50
|
+
search_result query_weights, indexes, solver_params, trees,
|
51
|
+
update_plans
|
52
|
+
end
|
53
|
+
|
54
|
+
private
|
55
|
+
|
56
|
+
# Combine the weights of queries and statements
|
57
|
+
# @return [void]
|
58
|
+
def combine_query_weights(indexes)
|
59
|
+
indexes = indexes.map(&:to_id_graph).uniq if @by_id_graph
|
60
|
+
query_weights = Hash[@workload.support_queries(indexes).map do |query|
|
61
|
+
[query, @workload.statement_weights[query.statement]]
|
62
|
+
end]
|
63
|
+
query_weights.merge!(@workload.statement_weights.select do |stmt, _|
|
64
|
+
stmt.is_a? Query
|
65
|
+
end.to_h)
|
66
|
+
|
67
|
+
query_weights
|
68
|
+
end
|
69
|
+
|
70
|
+
# Produce a useful log message before starting the search
|
71
|
+
# @return [void]
|
72
|
+
def log_search_start(costs, query_weights)
|
73
|
+
@logger.debug do
|
74
|
+
"Costs: \n" + pp_s(costs) + "\n" \
|
75
|
+
"Search with queries:\n" + \
|
76
|
+
query_weights.each_key.map.with_index do |query, i|
|
77
|
+
"#{i} #{query.inspect}"
|
78
|
+
end.join("\n")
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
# Run the solver and get the results of search
|
83
|
+
# @return [Results]
|
84
|
+
def search_result(query_weights, indexes, solver_params, trees,
|
85
|
+
update_plans)
|
86
|
+
# Solve the LP using MIPPeR
|
87
|
+
result = solve_mipper query_weights.keys, indexes, **solver_params
|
88
|
+
|
89
|
+
result.workload = @workload
|
90
|
+
result.plans_from_trees trees
|
91
|
+
result.cost_model = @cost_model
|
92
|
+
|
93
|
+
# Select the relevant update plans
|
94
|
+
update_plans = update_plans.values.flatten(1).select do |plan|
|
95
|
+
result.indexes.include? plan.index
|
96
|
+
end
|
97
|
+
update_plans.each do |plan|
|
98
|
+
plan.select_query_plans(&result.method(:select_plan))
|
99
|
+
end
|
100
|
+
result.update_plans = update_plans
|
101
|
+
|
102
|
+
result.validate
|
103
|
+
|
104
|
+
result
|
105
|
+
end
|
106
|
+
|
107
|
+
# Select the plans to use for a given set of indexes
|
108
|
+
# @return [Array<Plans::QueryPlan>]
|
109
|
+
def select_plans(trees, indexes)
|
110
|
+
trees.map do |tree|
|
111
|
+
# Exclude support queries since they will be in update plans
|
112
|
+
query = tree.query
|
113
|
+
next if query.is_a?(SupportQuery)
|
114
|
+
|
115
|
+
# Select the exact plan to use for these indexes
|
116
|
+
tree.select_using_indexes(indexes).min_by(&:cost)
|
117
|
+
end.compact
|
118
|
+
end
|
119
|
+
|
120
|
+
# Solve the index selection problem using MIPPeR
|
121
|
+
# @return [Results]
|
122
|
+
def solve_mipper(queries, indexes, data)
|
123
|
+
# Construct and solve the ILP
|
124
|
+
problem = Problem.new queries, @workload.updates, data, @objective
|
125
|
+
problem.solve
|
126
|
+
|
127
|
+
# We won't get here if there's no valdi solution
|
128
|
+
@logger.debug 'Found solution with total cost ' \
|
129
|
+
"#{problem.objective_value}"
|
130
|
+
|
131
|
+
# Return the selected indices
|
132
|
+
selected_indexes = problem.selected_indexes
|
133
|
+
|
134
|
+
@logger.debug do
|
135
|
+
"Selected indexes:\n" + selected_indexes.map do |index|
|
136
|
+
"#{indexes.index index} #{index.inspect}"
|
137
|
+
end.join("\n")
|
138
|
+
end
|
139
|
+
|
140
|
+
problem.result
|
141
|
+
end
|
142
|
+
|
143
|
+
# Produce the cost of updates in the workload
|
144
|
+
def update_costs(trees, indexes)
|
145
|
+
planner = Plans::UpdatePlanner.new @workload.model, trees, @cost_model,
|
146
|
+
@by_id_graph
|
147
|
+
update_costs = Hash.new { |h, k| h[k] = {} }
|
148
|
+
update_plans = Hash.new { |h, k| h[k] = [] }
|
149
|
+
@workload.statements.each do |statement|
|
150
|
+
next if statement.is_a? Query
|
151
|
+
|
152
|
+
populate_update_costs planner, statement, indexes,
|
153
|
+
update_costs, update_plans
|
154
|
+
end
|
155
|
+
|
156
|
+
[update_costs, update_plans]
|
157
|
+
end
|
158
|
+
|
159
|
+
# Populate the cost of all necessary plans for the given satement
|
160
|
+
# @return [void]
|
161
|
+
def populate_update_costs(planner, statement, indexes,
|
162
|
+
update_costs, update_plans)
|
163
|
+
planner.find_plans_for_update(statement, indexes).each do |plan|
|
164
|
+
weight = @workload.statement_weights[statement]
|
165
|
+
update_costs[statement][plan.index] = plan.update_cost * weight
|
166
|
+
update_plans[statement] << plan
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
# Get the cost of using each index for each query in a workload
|
171
|
+
def query_costs(query_weights, indexes)
|
172
|
+
planner = Plans::QueryPlanner.new @workload, indexes, @cost_model
|
173
|
+
|
174
|
+
results = Parallel.map(query_weights) do |query, weight|
|
175
|
+
query_cost planner, query, weight
|
176
|
+
end
|
177
|
+
costs = Hash[query_weights.each_key.map.with_index do |query, q|
|
178
|
+
[query, results[q].first]
|
179
|
+
end]
|
180
|
+
|
181
|
+
[costs, results.map(&:last)]
|
182
|
+
end
|
183
|
+
|
184
|
+
# Get the cost for indices for an individual query
|
185
|
+
def query_cost(planner, query, weight)
|
186
|
+
query_costs = {}
|
187
|
+
|
188
|
+
tree = planner.find_plans_for_query(query)
|
189
|
+
tree.each do |plan|
|
190
|
+
steps_by_index = []
|
191
|
+
plan.each do |step|
|
192
|
+
if step.is_a? Plans::IndexLookupPlanStep
|
193
|
+
steps_by_index.push [step]
|
194
|
+
else
|
195
|
+
steps_by_index.last.push step
|
196
|
+
end
|
197
|
+
end
|
198
|
+
|
199
|
+
populate_query_costs query_costs, steps_by_index, weight, query, tree
|
200
|
+
end
|
201
|
+
|
202
|
+
[query_costs, tree]
|
203
|
+
end
|
204
|
+
|
205
|
+
# Store the costs and indexes for this plan in a nested hash
|
206
|
+
# @return [void]
|
207
|
+
def populate_query_costs(query_costs, steps_by_index, weight,
|
208
|
+
query, tree)
|
209
|
+
# The first key is the query and the second is the index
|
210
|
+
#
|
211
|
+
# The value is a two-element array with the indices which are
|
212
|
+
# jointly used to answer a step in the query plan along with
|
213
|
+
# the cost of all plan steps for the part of the query graph
|
214
|
+
steps_by_index.each do |steps|
|
215
|
+
# Get the indexes for these plan steps
|
216
|
+
index_step = steps.first
|
217
|
+
|
218
|
+
# Calculate the cost for just these steps in the plan
|
219
|
+
cost = steps.sum_by(&:cost) * weight
|
220
|
+
|
221
|
+
# Don't count the cost for sorting at the end
|
222
|
+
sort_step = steps.find { |s| s.is_a? Plans::SortPlanStep }
|
223
|
+
cost -= sort_step.cost * weight unless sort_step.nil?
|
224
|
+
|
225
|
+
if query_costs.key? index_step.index
|
226
|
+
current_cost = query_costs[index_step.index].last
|
227
|
+
|
228
|
+
# We must always have the same cost
|
229
|
+
if (current_cost - cost).abs >= 10E-6
|
230
|
+
index = index_step.index
|
231
|
+
p query
|
232
|
+
puts "Index #{index.key} does not have equivalent cost"
|
233
|
+
puts "Current cost: #{current_cost}, discovered cost: #{cost}"
|
234
|
+
|
235
|
+
puts "\nCurrent steps"
|
236
|
+
query_costs[index_step.index].first.each { |s| p s }
|
237
|
+
|
238
|
+
puts "\nDiscovered steps"
|
239
|
+
steps.each { |s| p s }
|
240
|
+
puts
|
241
|
+
|
242
|
+
puts '======================================='
|
243
|
+
tree.sort_by(&:cost).each do |plan|
|
244
|
+
next unless plan.indexes.include?(index_step.index)
|
245
|
+
plan.each do |step|
|
246
|
+
print(format('%.3f', step.cost).rjust(7) + ' ')
|
247
|
+
p step
|
248
|
+
end
|
249
|
+
puts "#{format('%.3f', plan.cost).rjust(7)} total"
|
250
|
+
puts '======================================='
|
251
|
+
end
|
252
|
+
|
253
|
+
puts
|
254
|
+
p tree
|
255
|
+
|
256
|
+
fail
|
257
|
+
end
|
258
|
+
else
|
259
|
+
# We either found a new plan or something cheaper
|
260
|
+
query_costs[index_step.index] = [steps, cost]
|
261
|
+
end
|
262
|
+
end
|
263
|
+
end
|
264
|
+
end
|
265
|
+
end
|
266
|
+
end
|