nose 0.1.0pre
- checksums.yaml +7 -0
- data/lib/nose/backend/cassandra.rb +390 -0
- data/lib/nose/backend/file.rb +185 -0
- data/lib/nose/backend/mongo.rb +242 -0
- data/lib/nose/backend.rb +557 -0
- data/lib/nose/cost/cassandra.rb +33 -0
- data/lib/nose/cost/entity_count.rb +27 -0
- data/lib/nose/cost/field_size.rb +31 -0
- data/lib/nose/cost/request_count.rb +32 -0
- data/lib/nose/cost.rb +68 -0
- data/lib/nose/debug.rb +45 -0
- data/lib/nose/enumerator.rb +199 -0
- data/lib/nose/indexes.rb +239 -0
- data/lib/nose/loader/csv.rb +99 -0
- data/lib/nose/loader/mysql.rb +199 -0
- data/lib/nose/loader/random.rb +48 -0
- data/lib/nose/loader/sql.rb +105 -0
- data/lib/nose/loader.rb +38 -0
- data/lib/nose/model/entity.rb +136 -0
- data/lib/nose/model/fields.rb +293 -0
- data/lib/nose/model.rb +113 -0
- data/lib/nose/parser.rb +202 -0
- data/lib/nose/plans/execution_plan.rb +282 -0
- data/lib/nose/plans/filter.rb +99 -0
- data/lib/nose/plans/index_lookup.rb +302 -0
- data/lib/nose/plans/limit.rb +42 -0
- data/lib/nose/plans/query_planner.rb +361 -0
- data/lib/nose/plans/sort.rb +49 -0
- data/lib/nose/plans/update.rb +60 -0
- data/lib/nose/plans/update_planner.rb +270 -0
- data/lib/nose/plans.rb +135 -0
- data/lib/nose/proxy/mysql.rb +275 -0
- data/lib/nose/proxy.rb +102 -0
- data/lib/nose/query_graph.rb +481 -0
- data/lib/nose/random/barbasi_albert.rb +48 -0
- data/lib/nose/random/watts_strogatz.rb +50 -0
- data/lib/nose/random.rb +391 -0
- data/lib/nose/schema.rb +89 -0
- data/lib/nose/search/constraints.rb +143 -0
- data/lib/nose/search/problem.rb +328 -0
- data/lib/nose/search/results.rb +200 -0
- data/lib/nose/search.rb +266 -0
- data/lib/nose/serialize.rb +747 -0
- data/lib/nose/statements/connection.rb +160 -0
- data/lib/nose/statements/delete.rb +83 -0
- data/lib/nose/statements/insert.rb +146 -0
- data/lib/nose/statements/query.rb +161 -0
- data/lib/nose/statements/update.rb +101 -0
- data/lib/nose/statements.rb +645 -0
- data/lib/nose/timing.rb +79 -0
- data/lib/nose/util.rb +305 -0
- data/lib/nose/workload.rb +244 -0
- data/lib/nose.rb +37 -0
- data/templates/workload.erb +42 -0
- metadata +700 -0
data/lib/nose/cost.rb
ADDED
@@ -0,0 +1,68 @@
# frozen_string_literal: true

module NoSE
  # Cost models for steps of backend statement execution
  module Cost
    # Cost model for a backend database
    class Cost
      include Supertype

      def initialize(**options)
        @options = options
      end

      # The cost of filtering intermediate results
      # @return [Fixnum]
      def filter_cost(_step)
        # Assume this has no cost and the cost is captured in the fact that we
        # have to retrieve more data earlier. All this does is skip records.
        0
      end

      # The cost of limiting a result set
      # @return [Fixnum]
      def limit_cost(_step)
        # This is basically free since we just discard data
        0
      end

      # The cost of sorting a set of results
      # @return [Fixnum]
      def sort_cost(_step)
        # TODO: Find some estimate of sort cost
        # This could be partially captured by the fact that sort + limit
        # effectively removes the limit
        1
      end

      # The cost of performing a lookup via an index
      # @return [Fixnum]
      def index_lookup_cost(_step)
        fail NotImplementedError, 'Must be implemented in a subclass'
      end

      # The cost of performing a deletion from an index
      # @return [Fixnum]
      def delete_cost(_step)
        fail NotImplementedError, 'Must be implemented in a subclass'
      end

      # The cost of performing an insert into an index
      # @return [Fixnum]
      def insert_cost(_step)
        fail NotImplementedError, 'Must be implemented in a subclass'
      end

      # This is here for debugging purposes because we need a cost
      # @return [Fixnum]
      def pruned_cost(_step)
        0
      end
    end
  end
end

require_relative 'cost/cassandra'
require_relative 'cost/entity_count'
require_relative 'cost/field_size'
require_relative 'cost/request_count'
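As a sketch of the extension point this class defines, a backend-specific model only needs to override the methods that raise NotImplementedError. The subclass name and the step's cardinality attribute below are hypothetical, for illustration only:

# Hypothetical cost model charging one unit per row a lookup is expected
# to return; assumes the step object exposes an estimated cardinality.
class UnitRowCost < NoSE::Cost::Cost
  def index_lookup_cost(step)
    step.cardinality
  end

  def delete_cost(_step)
    1
  end

  def insert_cost(_step)
    1
  end
end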
data/lib/nose/debug.rb
ADDED
@@ -0,0 +1,45 @@
# frozen_string_literal: true

# rubocop:disable Lint/HandleExceptions
begin
  require 'binding_of_caller'
  require 'pry'
rescue LoadError
  # Ignore in case we are not in development mode
end
# rubocop:enable Lint/HandleExceptions

module NoSE
  # Various helpful debugging snippets
  module Debug
    # Convenience method to break in IndexLookupStep when
    # a particular set of indexes is reached when planning
    # @return [void]
    def self.break_on_indexes(*index_keys)
      apply = binding.of_caller(1)
      parent = apply.eval 'parent'
      index = apply.eval 'index'
      current_keys = parent.parent_steps.indexes.map(&:key) << index.key

      # rubocop:disable Lint/Debugger
      binding.pry if current_keys == index_keys
      # rubocop:enable Lint/Debugger
    end

    # Export entities in a model as global
    # variables for easier access when debugging
    # @return [void]
    def self.export_model(model)
      model.entities.each do |name, entity|
        # rubocop:disable Lint/Eval
        eval("$#{name} = entity")
        # rubocop:enable Lint/Eval

        entity.fields.merge(entity.foreign_keys).each do |field_name, field|
          entity.define_singleton_method field_name.to_sym, -> { field }
        end
      end

      nil
    end
  end
end
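A minimal sketch of how these helpers might be invoked; the index keys and the model object below are hypothetical:

# Called from planner code where `parent` and `index` are in scope (see the
# binding.of_caller lookup above); drops into pry for this exact key chain.
NoSE::Debug.break_on_indexes 'i1234567890', 'i0987654321'

# In a console session, expose each entity of a model as a global variable
# (e.g. $users), assuming `model` is a previously loaded NoSE model.
NoSE::Debug.export_model model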
data/lib/nose/enumerator.rb
ADDED
@@ -0,0 +1,199 @@
# frozen_string_literal: true

require 'logging'

module NoSE
  # Produces potential indices to be used in schemas
  class IndexEnumerator
    def initialize(workload)
      @logger = Logging.logger['nose::enumerator']

      @workload = workload
    end

    # Produce all possible indices for a given query
    # @return [Array<Index>]
    def indexes_for_query(query)
      @logger.debug "Enumerating indexes for query #{query.text}"

      range = if query.range_field.nil?
                query.order
              else
                [query.range_field] + query.order
              end

      eq = query.eq_fields.group_by(&:parent)
      eq.default_proc = ->(*) { [] }

      range = range.group_by(&:parent)
      range.default_proc = ->(*) { [] }

      query.graph.subgraphs.flat_map do |graph|
        indexes_for_graph graph, query.select, eq, range
      end.uniq << query.materialize_view
    end

    # Produce all possible indices for a given workload
    # @return [Set<Index>]
    def indexes_for_workload(additional_indexes = [], by_id_graph = false)
      queries = @workload.queries
      indexes = Parallel.map(queries) do |query|
        indexes_for_query(query).to_a
      end.inject(additional_indexes, &:+)

      # Add indexes generated for support queries
      supporting = support_indexes indexes, by_id_graph
      supporting += support_indexes supporting, by_id_graph
      indexes += supporting

      # Deduplicate indexes, combine them and deduplicate again
      indexes.uniq!
      combine_indexes indexes
      indexes.uniq!

      @logger.debug do
        "Indexes for workload:\n" + indexes.map.with_index do |index, i|
          "#{i} #{index.inspect}"
        end.join("\n")
      end

      indexes
    end

    private

    # Produce the indexes necessary for support queries for these indexes
    # @return [Array<Index>]
    def support_indexes(indexes, by_id_graph)
      # If indexes are grouped by ID graph, convert them before updating
      # since other updates will be managed automatically by index maintenance
      indexes = indexes.map(&:to_id_graph).uniq if by_id_graph

      # Collect all possible support queries
      queries = indexes.flat_map do |index|
        @workload.updates.flat_map do |update|
          update.support_queries(index)
        end
      end

      # Enumerate indexes for each support query
      queries.uniq!
      queries.flat_map do |query|
        indexes_for_query(query).to_a
      end
    end

    # Combine the data of indices based on matching hash fields
    def combine_indexes(indexes)
      no_order_indexes = indexes.select do |index|
        index.order_fields.empty?
      end
      no_order_indexes = no_order_indexes.group_by do |index|
        [index.hash_fields, index.graph]
      end

      no_order_indexes.each do |(hash_fields, graph), hash_indexes|
        extra_choices = hash_indexes.map(&:extra).uniq

        # XXX More combos?
        combos = extra_choices.combination(2)

        combos.map do |combo|
          indexes << Index.new(hash_fields, [], combo.inject(Set.new, &:+),
                               graph)
          @logger.debug "Enumerated combined index #{indexes.last.inspect}"
        end
      end
    end

    # Get all possible choices of fields to use for equality
    # @return [Array<Array>]
    def eq_choices(graph, eq)
      entity_choices = graph.entities.flat_map do |entity|
        # Get the fields for the entity and add in the IDs
        entity_fields = eq[entity] << entity.id_field
        entity_fields.uniq!
        1.upto(entity_fields.count).flat_map do |n|
          entity_fields.permutation(n).to_a
        end
      end

      2.upto(graph.entities.length).flat_map do |n|
        entity_choices.permutation(n).map(&:flatten).to_a
      end + entity_choices
    end

    # Get fields which should be included in an index for the given graph
    # @return [Array<Array>]
    def extra_choices(graph, select, eq, range)
      choices = eq.values + range.values << select.to_a

      choices.each do |choice|
        choice.select { |field| graph.entities.include?(field.parent) }
      end

      choices.reject(&:empty?) << []
    end

    # Get all possible indices which jump a given piece of a query graph
    # @return [Array<Index>]
    def indexes_for_graph(graph, select, eq, range)
      eq_choices = eq_choices graph, eq
      range_fields = graph.entities.map { |entity| range[entity] }.reduce(&:+)
      range_fields.uniq!
      order_choices = range_fields.prefixes.flat_map do |fields|
        fields.permutation.to_a
      end.uniq << []
      extra_choices = extra_choices graph, select, eq, range
      extra_choices = 1.upto(extra_choices.length).flat_map do |n|
        extra_choices.combination(n).map(&:flatten).map(&:uniq)
      end.uniq

      # Generate all possible indices based on the field choices
      choices = eq_choices.product(extra_choices)
      indexes = choices.map! do |index, extra|
        indexes = []

        order_choices.each do |order|
          # Append the primary key of the entities in the graph if needed
          order += graph.entities.sort_by(&:name).map(&:id_field) -
                   (index + order)

          # Partition into the ordering portion
          index.partitions.each do |index_prefix, order_prefix|
            hash_fields = index_prefix.take_while do |field|
              field.parent == index.first.parent
            end
            order_fields = index_prefix[hash_fields.length..-1] + \
                           order_prefix + order
            extra_fields = extra - hash_fields - order_fields
            next if order_fields.empty? && extra_fields.empty?

            new_index = generate_index hash_fields, order_fields, extra_fields,
                                       graph
            indexes << new_index unless new_index.nil?
          end
        end

        indexes
      end.inject([], &:+)
      indexes.flatten!

      indexes
    end

    # Generate a new index and ignore if invalid
    # @return [Index]
    def generate_index(hash, order, extra, graph)
      begin
        index = Index.new hash, order.uniq, extra, graph
        @logger.debug { "Enumerated #{index.inspect}" }
      rescue InvalidIndexException
        # This combination of fields is not valid, that's ok
        index = nil
      end

      index
    end
  end
end
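For orientation, a rough sketch of driving the enumerator; it assumes `workload` is a previously constructed NoSE::Workload with queries and updates already defined:

# Enumerate every candidate index for the workload, including support
# indexes for updates, and print them for inspection.
enumerator = NoSE::IndexEnumerator.new workload
candidates = enumerator.indexes_for_workload
candidates.each { |index| puts index.inspect }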
data/lib/nose/indexes.rb
ADDED
@@ -0,0 +1,239 @@
# frozen_string_literal: true

module NoSE
  # A representation of materialized views over fields in an entity
  class Index
    attr_reader :hash_fields, :order_fields, :extra, :all_fields, :path,
                :entries, :entry_size, :size, :hash_count, :per_hash_count,
                :graph

    def initialize(hash_fields, order_fields, extra, graph, saved_key = nil)
      order_set = order_fields.to_set
      @hash_fields = hash_fields.to_set
      @order_fields = order_fields.delete_if { |e| hash_fields.include? e }
      @extra = extra.to_set.delete_if do |e|
        @hash_fields.include?(e) || order_set.include?(e)
      end
      @all_fields = Set.new(@hash_fields).merge(order_set).merge(@extra)

      validate_hash_fields

      # Store whether this index is an identity
      @identity = @hash_fields == [
        @hash_fields.first.parent.id_field
      ].to_set && graph.nodes.size == 1

      @graph = graph
      @path = graph.longest_path
      @path = nil unless @path.length == graph.size

      validate_graph

      build_hash saved_key
    end

    # Check if this index maps from the primary key to fields from one entity
    # @return [Boolean]
    def identity?
      @identity
    end

    # A simple key which uniquely identifies the index
    # @return [String]
    def key
      @key ||= "i#{Zlib.crc32 hash_str}"
    end

    # Look up a field in the index based on its ID
    # @return [Fields::Field]
    def [](field_id)
      @all_fields.find { |field| field.id == field_id }
    end

    # Check if this index is an ID graph
    # @return [Boolean]
    def id_graph?
      @hash_fields.all?(&:primary_key?) && @order_fields.all?(&:primary_key?)
    end

    # Produce an index with the same fields but keyed by entities in the graph
    def to_id_graph
      return self if id_graph?

      all_ids = (@hash_fields.to_a + @order_fields + @extra.to_a)
      all_ids.map! { |f| f.parent.id_field }.uniq!

      hash_fields = [all_ids.first]
      order_fields = all_ids[1..-1]
      extra = @all_fields - hash_fields - order_fields

      Index.new hash_fields, order_fields, extra, @graph
    end

    # :nocov:
    def to_color
      fields = [@hash_fields, @order_fields, @extra].map do |field_group|
        '[' + field_group.map(&:inspect).join(', ') + ']'
      end

      "[magenta]#{key}[/] #{fields[0]} #{fields[1]} → #{fields[2]}" \
        " [yellow]$#{size}[/]" \
        " [magenta]#{@graph.inspect}[/]"
    end
    # :nocov:

    # Two indices are equal if they contain the same fields
    # @return [Boolean]
    def ==(other)
      hash == other.hash
    end
    alias eql? ==

    # Hash based on the fields, their keys, and the graph
    # @return [String]
    def hash_str
      @hash_str ||= [
        @hash_fields.map(&:id).sort!,
        @order_fields.map(&:id),
        @extra.map(&:id).sort!,
        @graph.unique_edges.map(&:canonical_params).sort!
      ].to_s.freeze
    end

    def hash
      @hash ||= Zlib.crc32 hash_str
    end

    # Check if the index contains a given field
    # @return [Boolean]
    def contains_field?(field)
      @all_fields.include? field
    end

    private

    # Initialize the hash function and freeze ourselves
    # @return [void]
    def build_hash(saved_key)
      @key = saved_key

      hash
      key
      calculate_size
      freeze
    end

    # Check for valid hash fields in an index
    # @return [void]
    def validate_hash_fields
      fail InvalidIndexException, 'hash fields cannot be empty' \
        if @hash_fields.empty?

      fail InvalidIndexException, 'hash fields can only involve one entity' \
        if @hash_fields.map(&:parent).to_set.size > 1
    end

    # Ensure an index is nonempty
    # @return [void]
    def validate_nonempty
      fail InvalidIndexException, 'must have fields other than hash fields' \
        if @order_fields.empty? && @extra.empty?
    end

    # Ensure an index and its fields correspond to a valid graph
    # @return [void]
    def validate_graph
      validate_graph_entities
      validate_graph_keys
    end

    # Ensure the graph of the index is valid
    # @return [void]
    def validate_graph_entities
      entities = @all_fields.map(&:parent).to_set
      fail InvalidIndexException, 'graph entities do not match index' \
        unless entities == @graph.entities.to_set
    end

    # We must have the primary keys of all the entities in the graph
    # @return [void]
    def validate_graph_keys
      fail InvalidIndexException, 'missing graph entity keys' \
        unless @graph.entities.map(&:id_field).all? do |field|
          @hash_fields.include?(field) || @order_fields.include?(field)
        end
    end

    # Precalculate the size of the index
    # @return [void]
    def calculate_size
      @hash_count = @hash_fields.product_by(&:cardinality)

      # XXX This only works if foreign keys span all possible keys
      # Take the maximum possible count at each join and multiply
      @entries = @graph.entities.map(&:count).max
      @per_hash_count = (@entries * 1.0 / @hash_count)

      @entry_size = @all_fields.sum_by(&:size)
      @size = @entries * @entry_size
    end
  end

  # Thrown when something tries to create an invalid index
  class InvalidIndexException < StandardError
  end

  # Allow entities to create their own indices
  class Entity
    # Create a simple index which maps entity keys to other fields
    # @return [Index]
    def simple_index
      Index.new [id_field], [], fields.values - [id_field],
                QueryGraph::Graph.from_path([id_field]), name
    end
  end

  # Allow statements to materialize views
  class Statement
    # Construct an index which acts as a materialized view for a query
    # @return [Index]
    def materialize_view
      eq = materialized_view_eq join_order.first
      order_fields = materialized_view_order(join_order.first) - eq

      Index.new(eq, order_fields,
                all_fields - (@eq_fields + @order).to_set, @graph)
    end

    private

    # Get the fields used as partition keys for a materialized view
    # based over a given entity
    # @return [Array<Fields::Field>]
    def materialized_view_eq(hash_entity)
      eq = @eq_fields.select { |field| field.parent == hash_entity }
      eq = [join_order.last.id_field] if eq.empty?

      eq
    end

    # Get the ordered keys for a materialized view
    # @return [Array<Fields::Field>]
    def materialized_view_order(hash_entity)
      # Start the ordered fields with the equality predicates
      # on other entities, followed by all of the attributes
      # used in ordering, then the range field
      order_fields = @eq_fields.select do |field|
        field.parent != hash_entity
      end + @order
      if @range_field && !@order.include?(@range_field)
        order_fields << @range_field
      end

      # Ensure we include IDs of the final entity
      order_fields += join_order.map(&:id_field)

      order_fields.uniq
    end
  end
end
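A short sketch of the Entity#simple_index helper defined above; `user` stands in for an entity from a model defined elsewhere:

# Maps the entity's ID field to all of its other fields over a
# single-node query graph.
index = user.simple_index
index.identity?  # => true: keyed on the ID field of a single entity
index.key        # the entity name, since simple_index passes it as saved_key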
data/lib/nose/loader/csv.rb
ADDED
@@ -0,0 +1,99 @@
# frozen_string_literal: true

require 'formatador'
require 'smarter_csv'
require 'zlib'

module NoSE
  module Loader
    # Load data into an index from a set of CSV files
    class CsvLoader < LoaderBase
      def initialize(workload = nil, backend = nil)
        super

        @logger = Logging.logger['nose::loader::csvloader']
      end

      # Load data for all the indexes
      def load(indexes, config, show_progress = false, limit = nil,
               skip_existing = true)
        indexes.map!(&:to_id_graph).uniq! if @backend.by_id_graph

        simple_indexes = find_simple_indexes indexes, skip_existing
        simple_indexes.each do |entity, simple_index_list|
          filename = File.join config[:directory], "#{entity.name}.csv"
          total_rows = (limit || 0) - 1 # account for header row
          File.open(filename) { |file| file.each_line { total_rows += 1 } }

          progress = initialize_progress entity, simple_index_list,
                                         total_rows if show_progress
          load_file_indexes filename, entity, simple_index_list, progress
        end
      end

      private

      # Find the simple indexes we should populate
      # @return [Hash<Entity, Index>]
      def find_simple_indexes(indexes, skip_existing)
        simple_indexes = indexes.select do |index|
          index.graph.size == 1 &&
            !(skip_existing && !@backend.index_empty?(index))
        end

        simple_indexes.group_by do |index|
          index.hash_fields.first.parent
        end
      end

      # Initialize a progress bar to report loading results
      # @return [Formatador::ProgressBar]
      def initialize_progress(entity, simple_index_list, total_rows)
        @logger.info "Loading simple indexes for #{entity.name}"
        @logger.info simple_index_list.map(&:key).join(', ')

        Formatador.new.redisplay_progressbar 0, total_rows
        Formatador::ProgressBar.new total_rows, started_at: Time.now.utc
      end

      # Load all indexes for a given file
      # @return [void]
      def load_file_indexes(filename, entity, simple_index_list, progress)
        SmarterCSV.process(filename,
                           downcase_header: false,
                           chunk_size: 1000,
                           convert_values_to_numeric: false) do |chunk|
          Parallel.each(chunk.each_slice(100),
                        finish: (lambda do |_, _, _|
                          next if progress.nil?
                          inc = [progress.total - progress.current, 100].min
                          progress.increment inc
                        end)) do |minichunk|
            load_simple_chunk minichunk, entity, simple_index_list
          end
        end
      end

      # Load a chunk of data from a simple entity index
      # @return [void]
      def load_simple_chunk(chunk, entity, indexes)
        # Prefix all hash keys with the entity name and convert values
        chunk.map! do |row|
          index_row = {}
          row.each_key do |key|
            field_class = entity[key.to_s].class
            value = field_class.value_from_string row[key]
            index_row["#{entity.name}_#{key}"] = value
          end

          index_row
        end

        # Insert the batch into the index
        indexes.each do |index|
          @backend.index_insert_chunk index, chunk
        end
      end
    end
  end
end
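Finally, a hedged sketch of invoking the loader; the directory path and the `workload`, `backend`, and `indexes` objects are assumed to be set up elsewhere:

# Expects one CSV file per entity, named <entity>.csv, in the directory;
# the third argument enables the progress bar during loading.
loader = NoSE::Loader::CsvLoader.new workload, backend
loader.load indexes, { directory: 'data/csv' }, true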