bud 0.1.0.pre1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/bud/joins.rb DELETED
@@ -1,526 +0,0 @@
1
- $EMPTY = []
2
- module Bud
3
- class BudJoin < BudCollection
4
- attr_accessor :rels, :origrels, :origpreds # :nodoc: all
5
- attr_reader :hash_tables # :nodoc: all
6
-
7
# Builds a join over the collections in +rellist+.
#
# Any BudJoin found in +rellist+ is flattened: its original collections and
# predicates are absorbed into this join. The join is then stored as a
# right-deep tree of binary joins: <tt>@rels[0]</tt> is the first collection
# and <tt>@rels[1]</tt> is either the second collection or a nested BudJoin
# over the remaining ones.
#
# rellist::      collections (and/or BudJoins) to be joined
# bud_instance:: object handed on to nested joins and used by +setup_state+
# preds::        optional join predicates, handed to +setup_preds+
#
# Raises Bud::CompileError if fewer than two collections are being joined,
# or if the same table name appears more than twice.
def initialize(rellist, bud_instance, preds=[]) # :nodoc: all
  @schema = []
  @origpreds = preds
  @bud_instance = bud_instance
  @localpreds = nil
  @hashpreds = nil
  @selfjoins = []

  # if any elements on rellist are BudJoins, suck up their contents
  tmprels = []
  rellist.each do |r|
    if r.class <= BudJoin
      tmprels += r.origrels
      preds += r.origpreds  # builds a new array; @origpreds is left as passed in
    else
      tmprels << r
    end
  end
  rellist = tmprels
  @origrels = rellist

  # Without this guard a short list recursed into BudJoin.new with an empty
  # (and then nil) list and died with an unrelated NoMethodError.
  if rellist.length < 2
    raise Bud::CompileError, "join requires at least two collections, got #{rellist.length}"
  end

  # check for self-joins: we currently only handle 2 instances of the same table per rule
  counts = Hash.new(0)
  @origrels.each { |r| counts[r.tabname] += 1 }
  counts.each do |name, cnt|
    raise Bud::CompileError, "#{cnt} instances of #{name} in rule; only one self-join currently allowed per rule" if cnt > 2
    @selfjoins << name if cnt == 2
  end

  # recurse to form a tree of binary BudJoins
  @rels = [rellist[0]]
  @rels << (rellist.length == 2 ? rellist[1] : BudJoin.new(rellist[1..-1], @bud_instance))

  # derive schema: one column for each table.
  # duplicated inputs get distinguishing numeral
  seen = Hash.new(0)
  @schema = rellist.map do |r|
    name = r.tabname.to_s
    suffix = (seen[name] > 0) ? "_#{seen[name]}" : ""
    seen[name] += 1
    (name + suffix).to_sym
  end

  setup_preds(preds)
  setup_state
end
59
-
60
- public
61
# Identifier for this join's per-fixpoint state: the hash code of the
# marshalled pair (names of the two inputs in @rels, @localpreds).
def state_id # :nodoc: all
  rel_names = @rels.map { |rel| rel.tabname }
  Marshal.dump([rel_names, @localpreds]).hash
end
64
-
65
- # initialize the state for this join to be carried across iterations within a fixpoint
66
- private
67
# Names this join after its state id and attaches the hash tables stored
# under that id in <tt>@bud_instance.joinstate</tt>, creating an empty
# entry (one {:storage, :delta} pair per join input) if none exists yet.
def setup_state
  # compute the id once; it marshals @rels/@localpreds on every call
  sid = state_id
  @tabname = ("temp_join" + sid.to_s).to_sym
  @bud_instance.joinstate[sid] ||= [{:storage => {}, :delta => {}}, {:storage => {}, :delta => {}}]
  @hash_tables = @bud_instance.joinstate[sid]
end
73
-
74
# Builds natural-join predicates: for every pair of collections in +rels+
# that share a column name, one predicate equating that column on both
# sides. Each attribute reference is obtained by sending the table name to
# +bud_instance+ and then the column name to the result.
#
# Intentionally left public: +matches+ invokes it with an explicit receiver
# (<tt>BudJoin::natural_preds</tt>). The bare +private_class_method+ that
# used to precede this definition took no arguments and so changed nothing.
def self.natural_preds(bud_instance, rels)
  preds = []
  rels.each do |r|
    rels.each do |s|
      # visit each pair once, ordered by table name; this also skips
      # pairing a table with itself
      next if r.tabname.to_s >= s.tabname.to_s
      (r.cols & s.cols).each do |c|
        preds << [bud_instance.send(r.tabname).send(c), bud_instance.send(s.tabname).send(c)]
      end
    end
  end
  preds.uniq
end
87
-
88
# Builds position-wise predicates: for every pair of collections in +rels+,
# one predicate per column position equating the i-th column of one with
# the i-th column of the other, up to the shorter column list.
#
# Intentionally left public: +anti+ invokes it with an explicit receiver
# (<tt>BudJoin::positionwise_preds</tt>). The bare +private_class_method+
# that used to precede this definition took no arguments and so changed
# nothing.
def self.positionwise_preds(bud_instance, rels)
  preds = []
  rels.each do |r|
    rels.each do |s|
      # visit each pair once, ordered by table name; this also skips
      # pairing a table with itself
      next if r.tabname.to_s >= s.tabname.to_s
      [r.cols.length, s.cols.length].min.times do |c|
        preds << [bud_instance.send(r.tabname).send(r.cols[c]), bud_instance.send(s.tabname).send(s.cols[c])]
      end
    end
  end
  preds.uniq
end
100
-
101
- # flatten joined items into arrays, with attribute accessors inherited
102
- # from the input collections, disambiguated via suffix indexes as needed.
103
- # similar to <tt>SELECT * FROM ... WHERE...</tt> block in SQL.
104
- public
105
# Flattens each joined pair into a single array and merges the result into
# a fresh 'temp_flatten' BudScratch whose columns are the concatenated
# columns of the two inputs in @rels.
#
# Column names are made unique by stripping any trailing "_<digits>" and
# re-suffixing repeats with "_1", "_2", ... in order of appearance. A
# renamed column can collide with a name that was already present, so the
# renaming pass repeats until no duplicates remain.
#
# preds:: join predicates, handed to +setup_preds+
#
# Returns whatever BudScratch#merge returns.
def flatten(*preds)
  setup_preds(preds)
  dupfree_schema = @rels.map { |r| r.cols }.flatten(1)
  # Always run at least one pass, then stop as soon as the names are unique.
  # (The previous loop condition kept looping while the result was empty, so
  # an empty combined schema never terminated.)
  loop do
    counts = {}
    dupfree_schema = dupfree_schema.map do |col|
      base, sep, suffix = col.to_s.rpartition("_")
      col = base.to_sym if sep == "_" and (suffix =~ /^\d+$/) == 0
      counts[col] ||= 0
      counts[col] += 1
      (counts[col] == 1) ? col.to_s.to_sym : "#{col}_#{counts[col] - 1}".to_sym
    end
    break if dupfree_schema.uniq.length == dupfree_schema.length
  end
  retval = BudScratch.new('temp_flatten', bud_instance, dupfree_schema)
  retval.uniquify_tabname
  retval.merge(map { |r, s| r + s }, retval.storage)
end
131
-
132
- undef do_insert
133
-
134
- public
135
- # map each (nested) item in the collection into a string, suitable for placement in stdio
136
# Renders the whole join as a single one-column row: the joined table names
# separated by " * ", followed by the list of (left, right) pairs.
# Raises Bud::Error if @rels holds more than two inputs.
def inspected
  raise Bud::Error, "join left unconverted to binary" if @rels.length > 2
  tabnames = @origrels.map { |rel| rel.tabname.to_s }.join(" * ")
  rendered = map { |r1, r2| "\n (#{r1.inspect}, #{r2.inspect})" }
  [["(#{tabnames}): [#{rendered}]"]]
end
141
-
142
- public
143
# On a join, +pro+ simply forwards its block to +pairs+ (with no
# predicates) and returns what +pairs+ returns.
def pro(&blk) # :nodoc: all
  pairs(&blk)
end
146
-
147
- public
148
# Runs the join and yields each result to +block+.
#
# mode:: which input buffers to combine. On the first iteration of a
#        stratum the mode is forced to :storage, so only storage x storage
#        is evaluated. In the default :both mode every combination of
#        :delta and :storage is evaluated except storage x storage.
#
# A hash join is used when @hashpreds is non-empty, a nested-loop join
# otherwise. Finishes by calling +tick_hash_deltas+ and returns its result.
def each(mode=:both, &block) # :nodoc: all
  mode = :storage if @bud_instance.stratum_first_iter
  buffers = (mode == :storage) ? [:storage] : [:delta, :storage]

  buffers.each do |left_rel|
    buffers.each do |right_rel|
      next if mode == :both && left_rel == :storage && right_rel == :storage
      if @hashpreds && !@hashpreds.empty?
        hash_join(left_rel, right_rel, &block)
      else
        nestloop_join(left_rel, right_rel, &block)
      end
    end
  end
  tick_hash_deltas
end
168
-
169
- # given a * expression over n collections, form all combinations of items
170
- # subject to an array of predicates, pred
171
- # currently supports two options for equijoin predicates:
172
- # general form: an array of arrays capturing a conjunction of equiv. classes
173
- # [[table1.col1, table2.col2, table3.col3], [table1.col2, table2.col3]]
174
- # common form: a hash capturing equality of a column on left with one on right.
175
- # :col1 => :col2 (same as lefttable.col1 => righttable.col2)
176
- public
177
# Installs +preds+ as this join's predicates and returns the join itself,
# or, when a block is given, the result of mapping the block over the
# joined items.
def pairs(*preds, &blk)
  @origpreds = preds
  setup_preds(preds)
  # new predicates mean different join state, so set it up again
  setup_state if self.class <= Bud::BudJoin
  return self if blk.nil?
  map(&blk)
end
184
-
185
- alias combos pairs
186
-
187
- # the natural join: given a * expression over n collections, form all
188
- # combinations of items that have the same values in matching fields
189
- public
190
# Natural join: derives the predicates with BudJoin::natural_preds from
# the original collections and hands them, plus the block, to +pairs+.
def matches(&blk)
  natural = BudJoin::natural_preds(@bud_instance, @origrels)
  pairs(*natural, &blk)
end
194
-
195
- # given a * expression over 2 collections, form all combinations of items
196
- # that satisfy the predicates +preds+, and project only onto the attributes
197
- # of the first collection
198
- public
199
# Joins under +preds+ and keeps only the left item of each joined pair.
# With a block, returns the block's result for each left item instead.
def lefts(*preds, &blk)
  setup_preds(preds)
  # new predicates mean different join state, so set it up again
  setup_state if self.class <= Bud::BudJoin
  return map { |l, _r| l } if blk.nil?
  map { |l, _r| blk.call(l) }
end
205
-
206
- # given a * expression over 2 collections, form all combinations of items
207
- # that satisfy the predicates +preds+, and project only onto the attributes
208
- # of the second item
209
- public
210
# Joins under +preds+ and keeps only the right item of each joined pair.
# With a block, returns the block's result for each right item instead.
def rights(*preds, &blk)
  setup_preds(preds)
  # new predicates mean different join state, so set it up again
  setup_state if self.class <= Bud::BudJoin
  return map { |_l, r| r } if blk.nil?
  map { |_l, r| blk.call(r) }
end
216
-
217
- # given a * expression over 2 collections, form all combos of items that
218
- # satisfy +preds+, and for any item from the 1st collection that has no
219
- # matches in the 2nd, nil-pad it and include it in the output.
220
- public
221
# Outer join: installs +preds+, extends this join with Bud::BudOuterJoin,
# and returns the join itself, or, when a block is given, the result of
# mapping the block over the joined items.
def outer(*preds, &blk)
  @origpreds = preds
  setup_preds(preds)
  # New predicates mean different join state, so set it up again, exactly
  # as pairs/lefts/rights/anti do. Skipping this left the join attached to
  # the hash tables of the state id computed before the predicates changed.
  setup_state if self.class <= Bud::BudJoin
  self.extend(Bud::BudOuterJoin)
  blk.nil? ? self : map(&blk)
end
227
-
228
- # AntiJoin
229
- # note: unlike other join methods (e.g. lefts) all we do with the return value
230
- # of block is check whether it's nil. Putting "projection" logic in the block
231
- # has no effect on the output.
232
- public
233
# Anti-join: returns the items of the first input, with every item that
# has a match replaced by nil. Returns [] unless this is the first
# iteration of the stratum.
#
# With neither predicates nor a block, position-wise predicates are used
# when both inputs have the same number of columns; otherwise nothing is
# excluded. With a block, an item is excluded when the block returns
# non-nil for some joined pair containing it; the block's value is not
# otherwise used.
def anti(*preds, &blk)
  return [] unless @bud_instance.stratum_first_iter
  @origpreds = preds
  # no projection involved here, so we can propagate the schema
  @cols = @rels[0].cols
  if preds == [] and blk.nil? and @cols.length == @rels[1].cols.length
    preds = BudJoin::positionwise_preds(@bud_instance, rels)
  end
  setup_preds(preds)
  setup_state if self.class <= Bud::BudJoin
  @exclude =
    if !blk.nil?
      # exclude tuples of r that pass the blk call
      map { |r, s| r unless blk.call(r, s).nil? }.compact
    elsif preds == []
      # mismatched schemas -- no matches to be excluded
      []
    else
      # exclude those tuples of r that have a match
      map { |r, _s| r }
    end
  # XXX: @exclude is an Array, which makes include? O(n)
  @rels[0].map { |r| @exclude.include?(r) ? nil : r }
end
257
-
258
- private
259
# Raises Bud::CompileError unless the first element of +pred+ (the
# collection being referenced) is listed in +join_rels+.
def check_join_pred(pred, join_rels)
  return if join_rels.include?(pred[0])
  raise Bud::CompileError, "illegal predicate: collection #{pred[0]} is not being joined"
end
264
-
265
- # extract predicates on rellist[0] and recurse to right side with remainder
266
- protected
267
# Splits +preds+ between this binary join and the nested join on its right.
#
# After normalisation, predicates whose left side names <tt>@rels[0]</tt>
# become @hashpreds; @localpreds is those plus any single-table predicates
# on either input. Whatever remains is passed to <tt>@rels[1]</tt>, which
# must then be a BudJoin. Does nothing when +preds+ is empty.
#
# Raises Bud::CompileError for a single-table predicate on a table that is
# not self-joined, or for leftover predicates with no nested join to take
# them.
def setup_preds(preds) # :nodoc: all
  return if preds.empty?
  allpreds = canonicalize_localpreds(@rels, disambiguate_preds(preds))
  right_is_join = (@rels[1].class <= Bud::BudJoin)

  # check for refs to collections that aren't being joined, Issue 191
  unless right_is_join
    tabnames = @rels.map { |r| r.tabname }
    allpreds.each do |p|
      check_join_pred(p[0], tabnames)
      check_join_pred(p[1], tabnames)
    end
  end

  @hashpreds = allpreds.select { |p| p[0][0] == @rels[0].tabname }
  @localpreds = @hashpreds

  # only allow preds on the same table name if they're on a self-joined table
  @localpreds.each do |p|
    next unless p[0][0] == p[1][0]
    next if @selfjoins.include?(p[0][0])
    raise Bud::CompileError, "single-table predicate on #{p[0][0]} disallowed in joins"
  end

  @localpreds += allpreds.select do |p|
    p[0][0] == p[1][0] && (p[0][0] == @rels[0].tabname || p[0][0] == @rels[1].tabname)
  end

  otherpreds = allpreds - @localpreds
  return if otherpreds.empty?
  unless right_is_join
    raise Bud::CompileError, "join predicates don't match collections being joined: #{otherpreds.inspect}"
  end
  @rels[1].setup_preds(otherpreds)
end
300
-
301
- protected
302
# Normalises the caller's predicates and returns them decomposed by
# +decomp_preds+.
#
# A single Hash argument is the "common form": each key => value entry is
# either a pair of Arrays (used as given) or a pair of Symbols, which are
# resolved with +find_attr_match+: key against the first and value
# against the second original collection when exactly two are joined,
# against all of them otherwise. Anything else is passed to +decomp_preds+
# unchanged.
#
# Raises Bud::CompileError when a key and its value have different classes
# or are neither Arrays nor Symbols.
def disambiguate_preds(preds) # :nodoc: all
  hash_form = (preds.size == 1 && (preds[0].class <= Hash))
  return decomp_preds(*preds) unless hash_form

  two_rels = (@origrels && @origrels.length == 2)
  predarray = preds[0].map do |k, v|
    if k.class != v.class
      raise Bud::CompileError, "inconsistent attribute ref style #{k.inspect} => #{v.inspect}"
    end
    if k.class <= Array
      [k, v]
    elsif k.class <= Symbol
      if two_rels
        [find_attr_match(k, @origrels[0]), find_attr_match(v, @origrels[1])]
      else
        [find_attr_match(k), find_attr_match(v)]
      end
    else
      raise Bud::CompileError, "invalid attribute ref in #{k.inspect} => #{v.inspect}"
    end
  end
  decomp_preds(*predarray)
end
324
-
325
- # find element in @origrels that contains this +aname+ method
326
- # if +rel+ is non-nil, only check that collection.
327
- # after found, return the result of invoking +aname+ from chosen collection
328
- protected
329
# Finds the collection that responds to +aname+ and returns the result of
# sending +aname+ to it. Searches only +rel+ when given, otherwise all of
# @origrels.
#
# Raises Bud::CompileError when two different collections both respond to
# +aname+, or when none does.
def find_attr_match(aname, rel=nil) # :nodoc: all
  candidates = rel.nil? ? @origrels : [rel]
  match = nil
  candidates.each do |r|
    next unless r.respond_to?(aname)
    if match.nil?
      match = r
    elsif match != r
      raise Bud::CompileError, "ambiguous attribute :#{aname} in both #{match.tabname} and #{r.tabname}"
    end
  end
  if match.nil?
    raise Bud::CompileError, "attribute :#{aname} not found in any of #{candidates.map{|t| t.tabname}.inspect}"
  end
  match.send(aname)
end
343
-
344
- protected
345
# Breaks each n-ary predicate [a, b, c, ...] into the chain of binary
# predicates [a, b], [b, c], ... and returns them all in one flat list.
# Pairs whose second element is nil are dropped. Returns nil when called
# with no predicates or with a single nil.
def decomp_preds(*preds) # :nodoc:all
  return nil if preds.empty? or preds == [nil]
  preds.inject([]) do |binary, p|
    links = p.each_with_index.map { |_, i| [p[i], p[i + 1]] }
    binary.concat(links.reject { |_, rhs| rhs.nil? })
  end
end
356
-
357
- protected
358
# Returns +preds+ with every predicate oriented so that a reference to the
# first collection of +rel_list+ sits on the left. A predicate is reversed
# when its right side names that collection, unless both sides name the
# same table (a self-join), in which case it is left as is.
def canonicalize_localpreds(rel_list, preds) # :nodoc:all
  preds.map do |p|
    rhs_is_first = (p[1][0] == rel_list[0].tabname)
    same_table = (p[1][0] == p[0][0])
    (rhs_is_first && !same_table) ? p.reverse : p
  end
end
364
-
365
- public
366
# Calls +each+ once per buffer symbol in +buf_syms+, passing the symbol as
# the mode and forwarding +block+.
def each_from_sym(buf_syms, &block) # :nodoc: all
  buf_syms.each { |sym| each(sym, &block) }
end
371
-
372
- private
373
- # r is a tuple
374
- # s is an array (combo) of joined tuples
375
# Checks the local predicates against one candidate pairing.
#
# r::     a tuple from the first input
# s::     an array (combo) of joined tuples from the right side
# skips:: predicates already known to hold; they are not re-checked
#
# Self-join predicates (both sides naming the same table) are skipped too.
# Returns false as soon as one checked predicate's two sides differ, true
# otherwise, including when there are no more local predicates than skips.
def test_locals(r, s, *skips)
  return true unless @localpreds && skips && @localpreds.length > skips.length

  @localpreds.each do |pred|
    next if skips.include?(pred) || pred[0][0] == pred[1][0]
    lhs, rhs = [pred[0], pred[1]].map do |ref|
      if ref[0] == @rels[0].tabname
        r[ref[1]]
      else
        ix, off = join_offset(ref)
        s[ix][off]
      end
    end
    return false if lhs != rhs
  end
  true
end
399
-
400
- private
401
# Nested-loop join: pairs every item read from the +left_rel+ buffer of the
# first input with every item read from the +right_rel+ buffer of the
# second, and yields [left] + right-combo for each pairing accepted by
# +test_locals+. In a two-collection join the right item is a bare tuple,
# so it is wrapped in an array first.
def nestloop_join(left_rel, right_rel, &block)
  @rels[0].each_from_sym([left_rel]) do |outer_tup|
    @rels[1].each_from_sym([right_rel]) do |inner|
      combo = (origrels.length == 2) ? [inner] : inner
      next unless test_locals(outer_tup, combo)
      yield([outer_tup] + combo)
    end
  end
end
411
-
412
- private
413
- # calculate the position for a field in the result of a join:
414
- # the tuple offset ("subtuple") and the attribute position within it
415
- # ("offset")
416
# Locates an attribute reference within the right-hand combo of a join.
#
# entry:: an attribute reference whose first element is a table name and
#         whose second is the attribute's position in that table
#
# Returns [subtuple, offset]: the index of the referenced table among the
# original collections after the first one, and the attribute position.
# The subtuple is 0 when the table is not found.
def join_offset(entry)
  name, offset = entry[0], entry[1]
  right_side = origrels[1..origrels.length]
  subtuple = right_side.index { |t| t.tabname == name } || 0
  return subtuple, offset
end
431
-
432
# For hash_join: folds each side's :delta hash table into its :storage
# hash table (concatenating the entries of keys present in both) and then
# resets :delta to an empty hash. Does nothing when @hash_tables is nil.
def tick_hash_deltas
  return if @hash_tables.nil?
  (0..1).each do |side|
    tables = @hash_tables[side]
    tables[:storage].merge!(tables[:delta]) { |_key, stored, fresh| stored + fresh }
    tables[:delta] = {}
  end
end
442
-
443
- # semi-naive symmetric hash join on first predicate
444
- private
445
# Hash join on the first predicate in @hashpreds.
#
# left_sym, right_sym:: the buffer (:delta or :storage) to read on each side
#
# Each side in turn acts as the prober. When both sides read the same
# buffer the prober also inserts its items into its own hash table, so the
# join is symmetric. When the buffers differ, only the non-:storage side
# probes, against the other side's existing table. Every probe hit is
# yielded as left-combo + right-combo if +test_locals+ accepts it, with the
# hash predicate itself passed as already checked.
def hash_join(left_sym, right_sym, &block)
  # we know that a hashpred has been canonicalized with @rels[0] in left offset
  first_pred = @hashpreds.first
  left_offset = first_pred[0][1]
  right_subtuple, right_offset = join_offset(first_pred[1])

  syms = [left_sym, right_sym]

  syms.each_with_index do |probe_sym, probe_ix|
    other_ix = 1 - probe_ix # bit-flip
    other_sym = syms[other_ix]

    # in a delta/storage join we do traditional one-sided hash join
    # so don't probe from the storage side.
    # the other side should have been built already!
    next if probe_sym == :storage && probe_sym != other_sym

    rels[probe_ix].each_from_sym([probe_sym]) do |tup|
      # the right side of an n-way join already yields combos (arrays of tuples)
      combo = (probe_ix == 1 && origrels.length > 2) ? tup : [tup]
      key = (probe_ix == 0) ? combo[0][left_offset] : combo[right_subtuple][right_offset]

      # insert into the prober's hashtable only if symmetric
      if probe_sym == other_sym
        @hash_tables[probe_ix][probe_sym][key] ||= []
        @hash_tables[probe_ix][probe_sym][key] << combo
      end

      # ...and probe the other hashtable
      bucket = @hash_tables[other_ix][other_sym][key]
      next if bucket.nil?
      bucket.each do |other|
        left, right = (probe_ix == 0) ? [combo, other] : [other, combo]
        joined = left + right
        yield joined if test_locals(left[0], right, @hashpreds.first)
      end
    end
  end
end
492
- end
493
-
494
- # intended to be used to extend a BudJoin instance
495
# intended to be used to extend a BudJoin instance
module BudOuterJoin
  public
  # Yields every matching pair (via the extended join's own +each+), then
  # preserves the unmatched items of the first input by yielding each one
  # paired with the second collection's null tuple.
  #
  # mode:: forwarded to the underlying +each+. Accepting it keeps this
  #        override call-compatible with BudJoin#each, which callers such
  #        as +each_from_sym+ invoke with a buffer symbol; omitting the
  #        argument behaves exactly as before.
  def each(mode=:both, &block) # :nodoc:all
    super(mode, &block)
    # Previous line finds all the matches. Now it's time to "preserve" the
    # outer tuples with no matches. Our trick: for each tuple of the outer,
    # generate a singleton relation and join with inner. If result is empty,
    # preserve tuple.
    # XXX: This is totally inefficient: we should fold the identification of
    # non-matches into the join algorithms. Another day.
    @rels[0].each do |r|
      t = @origrels[0].clone_empty
      # need to uniquify the tablename here to avoid sharing join state with original
      t.uniquify_tabname
      t << r
      probe = BudJoin.new([t, @origrels[1]], @bud_instance, @origpreds)

      # the following is "next if probe.any?" on storage tuples *only*
      matched = false
      probe.each(:storage) do |_hit|
        matched = true
        break
      end
      next if matched

      yield [r, @origrels[1].null_tuple]
    end
  end
end
525
- end
526
-