bud 0.1.0.pre1 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/bud/joins.rb DELETED
@@ -1,526 +0,0 @@
1
- $EMPTY = []
2
- module Bud
3
- class BudJoin < BudCollection
4
- attr_accessor :rels, :origrels, :origpreds # :nodoc: all
5
- attr_reader :hash_tables # :nodoc: all
6
-
7
# Build a join over the collections in +rellist+ as a tree of binary joins.
#
# rellist::      collections to join; any BudJoin in the list is replaced by
#                its original inputs, and its original predicates are added
#                to +preds+.
# bud_instance:: stored in @bud_instance and handed to nested joins.
# preds::        join predicates, passed on to setup_preds.
#
# Raises Bud::CompileError if the same table appears more than twice.
def initialize(rellist, bud_instance, preds=[]) # :nodoc: all
  @origpreds = preds
  @bud_instance = bud_instance
  @localpreds = nil
  @hashpreds = nil
  @selfjoins = []

  # if any elements on rellist are BudJoins, suck up their contents
  flat_rels = []
  rellist.each do |r|
    if r.class <= BudJoin
      flat_rels += r.origrels
      preds += r.origpreds # rebinds the local; @origpreds keeps the caller's list
    else
      flat_rels << r
    end
  end
  rellist = flat_rels
  @origrels = rellist

  # check for self-joins: we currently only handle 2 instances of the same table per rule
  counts = Hash.new(0)
  @origrels.each { |r| counts[r.tabname] += 1 }
  counts.each do |name, cnt|
    raise Bud::CompileError, "#{cnt} instances of #{name} in rule; only one self-join currently allowed per rule" if cnt > 2
    @selfjoins << name if cnt == 2
  end

  # recurse to form a tree of binary BudJoins
  @rels = [rellist[0]]
  @rels << (rellist.length == 2 ? rellist[1] : BudJoin.new(rellist[1..-1], @bud_instance))

  # derive schema: one column for each table.
  # duplicated inputs get distinguishing numeral
  @schema = []
  seen = Hash.new(0)
  rellist.each do |r|
    base = r.tabname.to_s
    @schema << (seen[base] > 0 ? "#{base}_#{seen[base]}" : base).to_sym
    seen[base] += 1
  end

  setup_preds(preds)
  setup_state
end
59
-
60
- public
61
# Identifier for this join's shared state: the hash of the marshaled pair
# (input table names, local predicates).
def state_id # :nodoc: all
  tabnames = @rels.map { |rel| rel.tabname }
  key = [tabnames, @localpreds]
  Marshal.dump(key).hash
end
64
-
65
- # initialize the state for this join to be carried across iterations within a fixpoint
66
- private
67
# Attach this join to its hash-table state in @bud_instance.joinstate,
# creating the entry on first use, and name the join after its state id.
# The state holds one {:storage, :delta} pair of hashes per join side.
def setup_state
  # compute the id once: state_id marshals the table names and predicates
  sid = state_id
  @tabname = "temp_join#{sid}".to_sym
  @bud_instance.joinstate[sid] ||= [{:storage => {}, :delta => {}}, {:storage => {}, :delta => {}}]
  @hash_tables = @bud_instance.joinstate[sid]
end
73
-
74
# Build natural-join predicates: for every pair of distinct tables in +rels+
# (ordered by table name, so each pair is produced once), one predicate per
# column name the two tables share. Column references are obtained by sending
# the table name, then the column name, to +bud_instance+.
#
# NOTE: this stays a public class method; instance methods in this class call
# it with an explicit receiver. (The argument-less +private_class_method+
# that used to precede it had no effect.)
def self.natural_preds(bud_instance, rels)
  preds = []
  rels.each do |r|
    rels.each do |s|
      # skips self-pairs and the mirrored (s, r) ordering
      next if r.tabname.to_s >= s.tabname.to_s
      (r.cols & s.cols).each do |c|
        preds << [bud_instance.send(r.tabname).send(c), bud_instance.send(s.tabname).send(c)]
      end
    end
  end
  preds.uniq
end
87
-
88
# Build position-wise predicates: for every pair of distinct tables in +rels+
# (ordered by table name), equate the i-th column of one with the i-th column
# of the other, for as many positions as the shorter table has.
#
# NOTE: this stays a public class method; instance methods in this class call
# it with an explicit receiver. (The argument-less +private_class_method+
# that used to precede it had no effect.)
def self.positionwise_preds(bud_instance, rels)
  preds = []
  rels.each do |r|
    rels.each do |s|
      # skips self-pairs and the mirrored (s, r) ordering
      next if r.tabname.to_s >= s.tabname.to_s
      [r.cols.length, s.cols.length].min.times do |i|
        preds << [bud_instance.send(r.tabname).send(r.cols[i]), bud_instance.send(s.tabname).send(s.cols[i])]
      end
    end
  end
  preds.uniq
end
100
-
101
- # flatten joined items into arrays, with attribute accessors inherited
102
- # from the input collections, disambiguated via suffix indexes as needed.
103
- # similar to <tt>SELECT * FROM ... WHERE...</tt> block in SQL.
104
- public
105
# Flatten each joined pair into one array and return the result of merging
# those arrays into a fresh 'temp_flatten' BudScratch. The scratch's schema is
# the concatenated column lists of both inputs; a repeated column name gets a
# numeric suffix (_1, _2, ...), and an existing numeric suffix is stripped
# before counting.
#
# preds:: join predicates, passed to setup_preds.
def flatten(*preds)
  setup_preds(preds)
  flat_schema = @rels.map { |r| r.cols }.flatten(1)
  dupfree_schema = []
  # Repeat until a pass yields no duplicate names. The pass always runs at
  # least once and stops as soon as the names are unique, so an empty schema
  # terminates instead of looping forever.
  loop do
    seen = Hash.new(0)
    dupfree_schema = flat_schema.map do |col|
      base, sep, suffix = col.to_s.rpartition("_")
      # strip an existing numeric suffix so "k_1" counts as another "k"
      col = base.to_sym if sep == "_" && (suffix =~ /^\d+$/) == 0
      name = (seen[col] == 0) ? col.to_s : "#{col}_#{seen[col]}"
      seen[col] += 1
      name.to_sym
    end
    flat_schema = dupfree_schema
    break if dupfree_schema.uniq.length == dupfree_schema.length
  end
  retval = BudScratch.new('temp_flatten', bud_instance, dupfree_schema)
  retval.uniquify_tabname
  retval.merge(map { |r, s| r + s }, retval.storage)
end
131
-
132
- undef do_insert
133
-
134
- public
135
- # map each (nested) item in the collection into a string, suitable for placement in stdio
136
# Render the join as a single one-column row: the joined table names
# separated by " * ", followed by the inspected pairs.
# Raises Bud::Error if this join still has more than two inputs.
def inspected
  raise Bud::Error, "join left unconverted to binary" if @rels.length > 2
  tabnames = @origrels.map { |rel| rel.tabname.to_s }.join(" * ")
  rows = map { |left, right| "\n (#{left.inspect}, #{right.inspect})" }
  [["(#{tabnames}): [#{rows}]"]]
end
141
-
142
- public
143
# Delegates to pairs with no predicates, forwarding the block.
def pro(&blk) # :nodoc: all
  no_preds = []
  pairs(*no_preds, &blk)
end
146
-
147
- public
148
# Enumerate joined tuples over the :delta / :storage buffers of both inputs.
#
# mode:: :storage joins storage with storage only; any other value walks all
#        delta/storage combinations, and :both additionally skips the
#        storage-with-storage combination. Forced to :storage when
#        @bud_instance.stratum_first_iter is set.
#
# Uses hash_join when @hashpreds is non-empty, nestloop_join otherwise, and
# always finishes by calling tick_hash_deltas.
def each(mode=:both, &block) # :nodoc: all
  mode = :storage if @bud_instance.stratum_first_iter
  bufs = (mode == :storage) ? [:storage] : [:delta, :storage]

  bufs.product(bufs).each do |left_buf, right_buf|
    next if mode == :both && left_buf == :storage && right_buf == :storage
    if @hashpreds.nil? || @hashpreds.empty?
      nestloop_join(left_buf, right_buf, &block)
    else
      hash_join(left_buf, right_buf, &block)
    end
  end
  tick_hash_deltas
end
168
-
169
- # given a * expression over n collections, form all combinations of items
170
- # subject to an array of predicates, pred
171
- # currently supports two options for equijoin predicates:
172
- # general form: an array of arrays capturing a conjunction of equiv. classes
173
- # [[table1.col1, table2.col2, table3.col3], [table1.col2, table2.col3]]
174
- # common form: a hash capturing equality of a column on left with one on right.
175
- # :col1 => :col2 (same as lefttable.col1 => righttable.col2)
176
- public
177
# Record +preds+ as this join's predicates and rebuild the join state.
# Returns self when no block is given, otherwise the result of mapping the
# block over the joined tuples.
def pairs(*preds, &blk)
  @origpreds = preds
  setup_preds(preds)
  # given new preds, the state for the join will be different. set it up again.
  setup_state if self.class <= Bud::BudJoin
  return self if blk.nil?
  map(&blk)
end
184
-
185
- alias combos pairs
186
-
187
- # the natural join: given a * expression over n collections, form all
188
- # combinations of items that have the same values in matching fields
189
- public
190
# Natural join: derive predicates from the original input collections via
# BudJoin.natural_preds and hand them, with the block, to pairs.
def matches(&blk)
  natural = BudJoin.natural_preds(@bud_instance, @origrels)
  pairs(*natural, &blk)
end
194
-
195
- # given a * expression over 2 collections, form all combinations of items
196
- # that satisfy the predicates +preds+, and project only onto the attributes
197
- # of the first collection
198
- public
199
# Join on +preds+ and keep only the left item of each pair; when a block is
# given, the block's result for that item is returned instead.
def lefts(*preds, &blk)
  setup_preds(preds)
  # given new preds, the state for the join will be different. set it up again.
  setup_state if self.class <= Bud::BudJoin
  map do |left, _right|
    if blk.nil?
      left
    else
      blk.call(left)
    end
  end
end
205
-
206
- # given a * expression over 2 collections, form all combinations of items
207
- # that satisfy the predicates +preds+, and project only onto the attributes
208
- # of the second item
209
- public
210
# Join on +preds+ and keep only the right item of each pair; when a block is
# given, the block's result for that item is returned instead.
def rights(*preds, &blk)
  setup_preds(preds)
  # given new preds, the state for the join will be different. set it up again.
  setup_state if self.class <= Bud::BudJoin
  map do |_left, right|
    if blk.nil?
      right
    else
      blk.call(right)
    end
  end
end
216
-
217
- # given a * expression over 2 collections, form all combos of items that
218
- # satisfy +preds+, and for any item from the 1st collection that has no
219
- # matches in the 2nd, nil-pad it and include it in the output.
220
- public
221
# Outer join: record +preds+, then extend this object with Bud::BudOuterJoin.
# Returns self when no block is given, otherwise the block mapped over the
# result.
def outer(*preds, &blk)
  @origpreds = preds
  setup_preds(preds)
  extend(Bud::BudOuterJoin)
  return self if blk.nil?
  map(&blk)
end
227
-
228
- # AntiJoin
229
- # note: unlike other join methods (e.g. lefts) all we do with the return value
230
- # of block is check whether it's nil. Putting "projection" logic in the block
231
- # has no effect on the output.
232
- public
233
# Anti-join: map the first input so that every tuple that has a match becomes
# nil and every other tuple is kept.
#
# preds:: join predicates. If none are given, there is no block, and both
#         inputs have the same number of columns, position-wise predicates
#         are used.
# blk::   optional; a matched pair excludes its left tuple only when the
#         block returns non-nil. The block's value is otherwise ignored.
#
# Returns [] unless @bud_instance.stratum_first_iter is set.
def anti(*preds, &blk)
  return [] unless @bud_instance.stratum_first_iter
  @origpreds = preds
  # no projection involved here, so we can propagate the schema
  @cols = @rels[0].cols
  if preds == [] and blk.nil? and @cols.length == @rels[1].cols.length
    preds = BudJoin::positionwise_preds(@bud_instance, rels)
  end
  setup_preds(preds)
  setup_state if self.class <= Bud::BudJoin
  if blk.nil?
    if preds == [] # mismatched schemas -- no matches to be excluded
      @exclude = []
    else
      # exclude those tuples of r that have a match
      @exclude = map { |r, s| r }
    end
  else
    # exclude tuples of r that pass the blk call
    @exclude = map { |r, s| r unless blk.call(r, s).nil? }.compact
  end
  # index the excluded tuples once so each membership test is a hash lookup
  # instead of a linear scan of @exclude
  excluded = {}
  @exclude.each { |t| excluded[t] = true }
  @rels[0].map { |r| excluded.key?(r) ? nil : r }
end
257
-
258
- private
259
# Verify that the collection named by pred[0] is among +join_rels+.
# Raises Bud::CompileError otherwise.
def check_join_pred(pred, join_rels)
  return if join_rels.include?(pred[0])
  raise Bud::CompileError, "illegal predicate: collection #{pred[0]} is not being joined"
end
264
-
265
- # extract predicates on rellist[0] and recurse to right side with remainder
266
- protected
267
# Split +preds+ into the predicates handled by this binary join (@hashpreds,
# @localpreds) and the remainder, which is pushed down to the right-hand
# nested join.
#
# Does nothing when +preds+ is empty or reduces to no binary predicates.
# Raises Bud::CompileError for a same-table predicate on a table that is not
# self-joined, or for leftover predicates when the right side is not a join.
def setup_preds(preds) # :nodoc: all
  return if preds.empty?
  allpreds = disambiguate_preds(preds)
  # degenerate input such as [nil] or [{}] comes back as nil; treat it as
  # "no predicates" rather than failing on nil further down
  return if allpreds.nil?
  allpreds = canonicalize_localpreds(@rels, allpreds)
  # check for refs to collections that aren't being joined, Issue 191
  unless @rels[1].class <= Bud::BudJoin
    tabnames = @rels.map { |r| r.tabname }
    allpreds.each do |p|
      check_join_pred(p[0], tabnames)
      check_join_pred(p[1], tabnames)
    end
  end
  @hashpreds = allpreds.select { |p| p[0][0] == @rels[0].tabname }
  @localpreds = @hashpreds

  # only allow preds on the same table name if they're on a self-joined table
  @localpreds.each do |p|
    if p[0][0] == p[1][0] && !@selfjoins.include?(p[0][0])
      raise Bud::CompileError, "single-table predicate on #{p[0][0]} disallowed in joins"
    end
  end

  # += builds a new array, so @hashpreds is left untouched
  @localpreds += allpreds.select do |p|
    p[0][0] == p[1][0] && (p[0][0] == @rels[0].tabname || p[0][0] == @rels[1].tabname)
  end
  otherpreds = allpreds - @localpreds
  unless otherpreds.empty?
    unless @rels[1].class <= Bud::BudJoin
      raise Bud::CompileError, "join predicates don't match collections being joined: #{otherpreds.inspect}"
    end
    @rels[1].setup_preds(otherpreds)
  end
end
300
-
301
- protected
302
# Normalize +preds+ into binary predicates via decomp_preds.
#
# A single Hash argument is read as lhs => rhs pairs: Array keys/values are
# used as-is, Symbol keys/values are resolved with find_attr_match (against
# the left and right input respectively when exactly two collections are
# joined). Anything else is passed to decomp_preds unchanged.
#
# Raises Bud::CompileError when a pair mixes reference styles or uses an
# unsupported one.
def disambiguate_preds(preds) # :nodoc: all
  hash_form = (preds.size == 1 and preds[0].class <= Hash)
  return decomp_preds(*preds) unless hash_form

  predarray = preds[0].map do |k, v|
    if k.class != v.class
      raise Bud::CompileError, "inconsistent attribute ref style #{k.inspect} => #{v.inspect}"
    end

    if k.class <= Array
      [k, v]
    elsif k.class <= Symbol
      if @origrels and @origrels.length == 2
        [find_attr_match(k, @origrels[0]), find_attr_match(v, @origrels[1])]
      else
        [find_attr_match(k), find_attr_match(v)]
      end
    else
      raise Bud::CompileError, "invalid attribute ref in #{k.inspect} => #{v.inspect}"
    end
  end
  decomp_preds(*predarray)
end
324
-
325
- # find element in @origrels that contains this +aname+ method
326
- # if +rel+ is non-nil, only check that collection.
327
- # after found, return the result of invoking +aname+ from chosen collection
328
- protected
329
# Resolve attribute +aname+ against +rel+ (or, when +rel+ is nil, against all
# of @origrels) and return the result of sending +aname+ to the collection
# that responds to it.
#
# Raises Bud::CompileError when two different collections both respond to
# +aname+, or when none does.
def find_attr_match(aname, rel=nil) # :nodoc: all
  candidates = rel.nil? ? @origrels : [rel]
  match = nil
  candidates.each do |r|
    next unless r.respond_to?(aname)
    if match.nil?
      match = r
    elsif match != r
      raise Bud::CompileError, "ambiguous attribute :#{aname} in both #{match.tabname} and #{r.tabname}"
    end
  end
  if match.nil?
    raise Bud::CompileError, "attribute :#{aname} not found in any of #{candidates.map{|t| t.tabname}.inspect}"
  end
  match.send(aname)
end
343
-
344
- protected
345
# Decompose each predicate (a list of references that must all be equal) into
# binary predicates over adjacent references: [a, b, c] => [a, b], [b, c].
# A pair whose second element is nil is dropped.
#
# Returns nil when called with no predicates or with a single nil.
def decomp_preds(*preds) # :nodoc:all
  return nil if preds.empty? || preds == [nil]
  newpreds = []
  preds.each do |p|
    p.each_cons(2) do |lhs, rhs|
      newpreds << [lhs, rhs] unless rhs.nil?
    end
  end
  newpreds
end
356
-
357
- protected
358
# Return +preds+ with each predicate oriented so that a reference to
# rel_list[0] sits on the left-hand side. Predicates whose two sides name the
# same table (self-joins) are left as they are.
def canonicalize_localpreds(rel_list, preds) # :nodoc:all
  left_name = rel_list[0].tabname
  preds.map do |p|
    # reverse if rhs is rel_list[0], *unless* it's a self-join!
    (p[1][0] == left_name && p[1][0] != p[0][0]) ? p.reverse : p
  end
end
364
-
365
- public
366
# Run each once per buffer symbol in +buf_syms+, passing the symbol as the
# mode and forwarding the block.
def each_from_sym(buf_syms, &block) # :nodoc: all
  buf_syms.each { |sym| each(sym, &block) }
end
371
-
372
- private
373
- # r is a tuple
374
- # s is an array (combo) of joined tuples
375
# Check the local predicates against a candidate pair.
#
# r::     tuple from the left input (@rels[0])
# s::     array (combo) of joined tuples from the right side
# skips:: predicates already known to hold; they are not re-checked
#
# Returns true when every remaining predicate holds. Self-join predicates
# (both sides naming the same table) are not checked here.
def test_locals(r, s, *skips)
  # nothing left to check when there are no more predicates than skips
  return true unless @localpreds && @localpreds.length > skips.length

  @localpreds.all? do |pred|
    # skip skips, and self-join preds
    next true if skips.include?(pred) || pred[0][0] == pred[1][0]
    vals = (0..1).map do |i|
      if pred[i][0] == @rels[0].tabname
        r[pred[i][1]]
      else
        ix, off = join_offset(pred[i])
        s[ix][off]
      end
    end
    vals[0] == vals[1]
  end
end
399
-
400
- private
401
# Nested-loop join: pair every tuple of the left input's +left_rel+ buffer
# with every item of the right input's +right_rel+ buffer and yield the
# combinations that pass test_locals, as one flat array [left, *right].
def nestloop_join(left_rel, right_rel, &block)
  @rels[0].each_from_sym([left_rel]) do |left_tup|
    @rels[1].each_from_sym([right_rel]) do |right|
      # in a two-way join the right item is a single tuple; wrap it
      combo = (origrels.length == 2) ? [right] : right
      yield([left_tup] + combo) if test_locals(left_tup, combo)
    end
  end
end
411
-
412
- private
413
- # calculate the position for a field in the result of a join:
414
- # the tuple offset ("subtuple") and the attribute position within it
415
- # ("offset")
416
# Calculate the position for a field in the right-hand side of a join.
#
# entry:: a column reference whose first element is a table name and whose
#         second element is the attribute position within that table
#
# Returns [subtuple, offset]: the index of the first matching table among
# origrels[1..-1] (0 when none matches) and the attribute position.
def join_offset(entry)
  name, offset = entry[0], entry[1]

  # determine which subtuple of the collection contains the table
  # referenced in entry.
  subtuple = origrels[1..-1].index { |t| t.tabname == name } || 0

  return subtuple, offset
end
431
-
432
# For hash_join: fold each side's :delta hash table into its :storage table
# (concatenating the values when a key exists in both), then reset :delta to
# an empty hash. Does nothing when there are no hash tables.
def tick_hash_deltas
  return if @hash_tables.nil?
  (0..1).each do |side|
    tables = @hash_tables[side]
    tables[:storage].merge!(tables[:delta]) { |_key, stored, fresh| stored + fresh }
    tables[:delta] = {}
  end
end
442
-
443
- # semi-naive symmetric hash join on first predicate
444
- private
445
# Hash join on the first predicate in @hashpreds.
#
# left_sym / right_sym select the buffer (:delta or :storage) read on each
# side. When both sides use the same buffer, each side inserts its tuples
# into its own hash table and probes the other's (symmetric join). When the
# buffers differ, only the non-:storage side probes, and nothing is inserted.
# Matches that also pass test_locals are yielded as left + right.
def hash_join(left_sym, right_sym, &block)
  # we know that a hashpred has been canonicalized with @rels[0] in left offset
  left_offset = @hashpreds.first[0][1]
  right_subtuple, right_offset = join_offset(@hashpreds.first[1])

  syms = [left_sym, right_sym]

  syms.each_with_index do |probe_sym, probe_ix|
    other_ix = 1 - probe_ix
    other_sym = syms[other_ix]
    symmetric = (probe_sym == other_sym)

    # in a delta/storage join we do traditional one-sided hash join
    # so don't probe from the storage side.
    next if probe_sym == :storage && !symmetric

    rels[probe_ix].each_from_sym([probe_sym]) do |r|
      # everything but an n-way right side arrives as a bare tuple; wrap it
      r = [r] unless probe_ix == 1 and origrels.length > 2
      attrval = (probe_ix == 0) ? r[0][left_offset] : r[right_subtuple][right_offset]

      # insert into the prober's hashtable only if symmetric
      if symmetric
        own_table = @hash_tables[probe_ix][probe_sym]
        own_table[attrval] ||= []
        own_table[attrval] << r
      end

      # ...and probe the other hashtable
      bucket = @hash_tables[other_ix][other_sym][attrval]
      next if bucket.nil?
      bucket.each do |s_tup|
        left, right = (probe_ix == 0) ? [r, s_tup] : [s_tup, r]
        combined = left + right
        yield combined if test_locals(left[0], right, @hashpreds.first)
      end
    end
  end
end
492
- end
493
-
494
- # intended to be used to extend a BudJoin instance
495
# intended to be used to extend a BudJoin instance
module BudOuterJoin
  public
  # Yield every matching pair (via the extended object's own each), then
  # yield [r, null_tuple] for each tuple r of the left input that has no
  # match among the right input's storage tuples.
  #
  # mode:: forwarded unchanged to the underlying each. Accepting it keeps
  #        this override call-compatible with each(mode), as used by
  #        each_from_sym.
  def each(mode=:both, &block) # :nodoc:all
    super(mode, &block)
    # Previous line finds all the matches. Now its time to ``preserve'' the
    # outer tuples with no matches. Our trick: for each tuple of the outer,
    # generate a singleton relation and join with inner. If result is empty,
    # preserve tuple.
    # XXX: This is totally inefficient: we should fold the identification of
    # non-matches into the join algorithms. Another day.
    @rels[0].each do |r|
      t = @origrels[0].clone_empty
      # need to uniquify the tablename here to avoid sharing join state with original
      t.uniquify_tabname
      t << r
      j = BudJoin.new([t, @origrels[1]], @bud_instance, @origpreds)

      # the following is "next if j.any?" on storage tuples *only*
      any = false
      j.each(:storage) do |_match|
        any = true
        break
      end
      next if any

      nulltup = @origrels[1].null_tuple
      yield [r, nulltup]
    end
  end
end
525
- end
526
-