fat_core 1.7.1 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,988 +0,0 @@
1
- module FatCore
2
- # A container for a two-dimensional table. All cells in the table must be a
3
- # String, a DateTime (or Date), a Numeric (Bignum, Integer, or BigDecimal), or
4
- # a Boolean (TrueClass or FalseClass). All columns must be of one of those
5
- # types or be a string convertible into one of them. It is considered an error
6
- # if a single column contains cells of different types. Any cell that cannot
7
- # be parsed as one of the Numeric, DateTime, or Boolean types will be treated
8
- # as a String and have to_s applied. Until the column type is determined, it
9
- # will have the type NilClass.
10
- #
11
- # You can initialize a Table in several ways:
12
- #
13
- # 1. with a Nil, which will return an empty table to which rows or columns can
14
- # be added later, 2. with the name of a .csv file, 3. with the name of an
15
- # .org file, 4. with an IO or StringIO object for either type of file, but
16
- # in that case, you need to specify 'csv' or 'org' as the second argument
17
- # to tell it what kind of file format to expect, 5. with an Array of
18
- # Arrays, 6. with an Array of Hashes, all having the same keys, which
19
- # become the names of the column heads, 7. with an Array of any objects
20
- # that respond to .keys and .values methods, 8. with another Table object.
21
- #
22
- # In the resulting Table, the headers are converted into symbols, with all
23
- # spaces converted to underscore and everything down-cased. So, the heading,
24
- # 'Two Words' becomes the hash header :two_words.
25
- class Table
26
- attr_reader :columns
27
-
28
# Start with no columns and no explicit group boundaries.
def initialize
  @boundaries = []
  @columns = []
end
32
-
33
- ###########################################################################
34
- # Constructors
35
- ###########################################################################
36
-
37
- # Construct a Table from the contents of a CSV file. Headers will be taken
38
- # from the first row and converted to symbols.
39
# fname is the path of the CSV file to read. Returns whatever from_csv_io
# builds from the opened file.
def self.from_csv_file(fname)
  # The block form of File.open closes the file once parsing is done.
  File.open(fname, 'r') { |io| from_csv_io(io) }
end
44
-
45
- # Construct a Table from a string, treated as the input from a CSV file.
46
# Wraps str in a StringIO so it can go through the same reader as a file.
def self.from_csv_string(str)
  io = StringIO.new(str)
  from_csv_io(io)
end
49
-
50
- # Construct a Table from the first table found in the given org-mode file.
51
- # Headers are taken from the first row if the second row is an hrule.
52
# fname is the path of the org-mode file to read. Returns whatever
# from_org_io builds from the opened file.
def self.from_org_file(fname)
  # The block form of File.open closes the file once parsing is done.
  File.open(fname, 'r') { |io| from_org_io(io) }
end
57
-
58
- # Construct a Table from a string, treated as the contents of an org-mode
59
- # file.
60
# Wraps str in a StringIO so it can go through the same reader as a file.
def self.from_org_string(str)
  io = StringIO.new(str)
  from_org_io(io)
end
63
-
64
- # Construct a Table from an array of arrays. If the second element is a nil
65
- # or is an array whose first element is a string that looks like a rule
66
- # separator, '|-----------', '+----------', etc., the headers will be taken
67
- # from the first array converted to strings and then to symbols. Any
68
- # following such rows mark a group boundary. Note that this is the form of
69
- # a table used by org-mode src blocks, so it is useful for building Tables
70
- # from the result of a src block.
71
# Public entry point; the work is done by the private class-level helper
# from_array_of_arrays.
def self.from_aoa(aoa)
  from_array_of_arrays(aoa)
end
74
-
75
- # Construct a Table from an array of hashes, or any objects that respond to
76
- # the #to_h method. All hashes must have the same keys, which, when
77
- # converted to symbols will become the headers for the Table.
78
# Only the first element is inspected to decide whether the input is
# usable. Raises ArgumentError when it does not respond to #to_h.
def self.from_aoh(aoh)
  unless aoh.first.respond_to?(:to_h)
    # The message used to interpolate an undefined local (`input`), which
    # turned the intended ArgumentError into a NameError.
    raise ArgumentError,
          "Cannot initialize Table with an array of #{aoh.first.class}"
  end
  from_array_of_hashes(aoh)
end
86
-
87
- # Construct a Table from another Table. Inherit any group boundaries from
88
- # the input table.
89
# Returns the new Table. Previously this method assigned @boundaries on the
# class itself and returned the boundaries array rather than a Table.
def self.from_table(table)
  result = from_aoh(table.rows)
  # The boundaries accessors are protected and so are not callable with an
  # explicit receiver from class-method scope; go through #send. The array
  # is dup'ed so the two tables do not share boundary state.
  result.send(:boundaries=, table.send(:boundaries).dup)
  result
end
93
-
94
- ############################################################################
95
- # Class-level constructor helpers
96
- ############################################################################
97
-
98
- class << self
99
- private
100
-
101
- # Construct table from an array of hashes or an array of any object that can
102
- # respond to #to_h. If an array element is a nil, mark it as a group
103
- # boundary in the Table.
104
# Returns the populated table. A nil element adds no row; it marks a group
# boundary at the table's current last row instead.
def from_array_of_hashes(hashes)
  hashes.each_with_object(new) do |hsh, table|
    if hsh.nil?
      table.mark_boundary
    else
      table << hsh.to_h
    end
  end
end
115
-
116
- # Construct a new table from an array of arrays. If the second element of
117
- # the array is a nil, a string that looks like an hrule, or an array whose
118
- # first element is a string that looks like an hrule, interpret the first
119
- # element of the array as a row of headers. Otherwise, synthesize headers of
120
- # the form "col1", "col2", ... and so forth. The remaining elements are
121
- # taken as the body of the table, except that if an element of the outer
122
- # array is a nil or a string that looks like an hrule, mark the preceding
123
- # row as a boundary.
124
# Returns the populated table. Empty input yields an empty table, and a
# lone row is treated as data with synthesized headers; both cases
# previously crashed on a nil.
def from_array_of_arrays(rows)
  result = new
  return result if rows.empty?

  # Require an actual second element: for a one-row input rows[1] is nil
  # only because it is absent, not because the caller marked a boundary.
  if rows.size > 1 && looks_like_boundary?(rows[1])
    # First row holds the headers; data starts after the rule.
    headers = rows[0].map(&:as_sym)
    body = rows[2..-1]
  else
    # No header row: synthesize :col1, :col2, ...
    headers = (1..rows[0].size).map { |k| "col#{k}".as_sym }
    body = rows
  end
  body.each do |row|
    if looks_like_boundary?(row)
      result.mark_boundary
      next
    end
    cells = row.map { |s| s.to_s.strip }
    result << Hash[headers.zip(cells)]
  end
  result
end
148
-
149
- # Return true if row is nil, a string that matches hrule_re, or is an
150
- # array whose first element matches hrule_re.
151
# Always returns true or false. The earlier version leaked the Integer/nil
# result of =~ to callers.
def looks_like_boundary?(row)
  return true if row.nil?

  hrule_re = /\A\s*[\|+][-]+/
  # For array-like rows only the first cell is examined; anything else is
  # matched through its own string form.
  candidate = row.respond_to?(:first) ? row.first : row
  (candidate.to_s =~ hrule_re) ? true : false
end
162
-
163
# Read CSV from io into a new table. The first line supplies the headers,
# which CSV converts to symbols; blank lines are skipped.
def from_csv_io(io)
  csv = ::CSV.new(io, headers: true, header_converters: :symbol,
                      skip_blanks: true)
  csv.each_with_object(new) { |row, table| table << row.to_h }
end
171
-
172
- # Form rows of table by reading the first table found in the org file.
173
# Lines before the first table line are skipped, and reading stops at the
# first non-table line after the table has started. Each hrule inside the
# table is recorded as nil; an hrule that is the very first table line is
# dropped. The collected rows are handed to from_array_of_arrays.
def from_org_io(io)
  table_re = /\A\s*\|/
  hrule_re = /\A\s*\|[-+]+/
  rows = []
  table_found = false
  io.each do |line|
    unless line =~ table_re
      # Still hunting for the table, or just ran off its end.
      next unless table_found

      break
    end
    if line =~ hrule_re
      rows << nil if table_found
    else
      # Strip the outer pipes, then split the cells on the inner ones.
      cells = line.sub(/\A\s*\|/, '').sub(/\|\s*\z/, '')
      rows << cells.split('|').map(&:clean)
    end
    table_found = true
  end
  from_array_of_arrays(rows)
end
208
- end
209
-
210
- ###########################################################################
211
- # Attributes
212
- ###########################################################################
213
-
214
- # Return the column with the given header.
215
# key is normalized with #as_sym before comparison. Returns nil when no
# column has that header.
def column(key)
  columns.find { |col| col.header == key.as_sym }
end
218
-
219
- # Return the type of the column with the given header
220
# Looks the column up with #column and asks it for its type.
def type(key)
  col = column(key)
  col.type
end
223
-
224
- # Return the array of items of the column with the given header, or if the
225
- # index is an integer, return that row number. So a table's rows can be
226
- # accessed by number, and its columns can be accessed by column header.
227
- # Also, double indexing works in either row-major or column-major order:
228
- # tab[:id][8] returns the 8th item in the column headed :id and so does
229
- # tab[8][:id].
230
# Integer keys are 1-based row numbers; String and Symbol keys name a
# column. Raises a RuntimeError for an out-of-range row, an unknown header,
# or any other kind of key.
def [](key)
  case key
  when Integer
    raise "index '#{key}' out of range" unless (1..size).cover?(key)
    rows[key - 1]
  when String
    # Headers are symbols, so testing the raw String against them never
    # matched; go through column?, which normalizes the key.
    raise "header '#{key}' not in table" unless column?(key)
    column(key).items
  when Symbol
    raise "header ':#{key}' not in table" unless column?(key)
    column(key).items
  else
    raise "cannot index table with a #{key.class}"
  end
end
245
-
246
- # Return true if the table has a column with the given header.
247
# key is normalized with #as_sym before it is tested against the headers.
def column?(key)
  known = headers
  known.include?(key.as_sym)
end
250
-
251
- # Return an array of the Table's column types.
252
# One entry per column, in column order.
def types
  columns.map { |col| col.type }
end
255
-
256
- # Return the headers for the Table as an array of symbols.
257
# One entry per column, in column order.
def headers
  columns.map { |col| col.header }
end
260
-
261
- # Return the number of rows in the Table.
262
# The row count is taken from the first column; a table with no columns
# has zero rows.
def size
  columns.empty? ? 0 : columns.first.size
end
266
-
267
- # Return whether this Table is empty.
268
# True when #size reports zero rows.
def empty?
  row_count = size
  row_count.zero?
end
271
-
272
- # Return the rows of the Table as an array of hashes, keyed by the headers.
273
# Every call builds fresh hashes, one per row, by reading the same index
# out of each column. The row count comes from the first column's items.
def rows
  return [] if columns.empty?

  0.upto(columns.first.items.last_i).map do |rnum|
    columns.each_with_object({}) { |col, row| row[col.header] = col[rnum] }
  end
end
286
-
287
- protected
288
-
289
- # Return the rows from first to last. We could just index #rows, but in a
290
- # large table, that would require that we construct all the rows for a range
291
- # of any size.
292
# first and last are 0-based, inclusive row indexes. Raises ArgumentError
# unless first <= last. Only the requested rows are materialized.
def rows_range(first = 0, last = size - 1)
  raise ArgumentError, 'first must be <= last' unless first <= last
  return [] if columns.empty?

  first.upto(last).map do |rnum|
    columns.each_with_object({}) { |col, row| row[col.header] = col[rnum] }
  end
end
306
-
307
- ## ###########################################################################
308
- ## Group Boundaries
309
- ##
310
- ## Boundaries mark the last row in each "group" within the table. The last
311
- ## row of the table is always an implicit boundary, and having the last row
312
- ## as the sole boundary is the default for new tables unless mentioned
313
- ## otherwise. Resetting the boundaries means to put it back in that default
314
- ## state.
315
- ##
316
- ## Note that tables are for the most part, immutable. That is, the data
317
- ## rows of the table, once set, are never changed by methods on the
318
- ## table. Any transformation of a table results in a new table. Boundaries
319
- ## and footers are exceptions to immutability, but even they only affect
320
- ## the boundary and footer attributes of the table, not the data rows.
321
- ##
322
- ## Boundaries can be added when a table is read in, for example, from the
323
- ## text of an org table in which each hline (other than the one separating
324
- ## the headers from the body) marks a boundary for the row immediately
325
- ## preceding the hline.
326
- ##
327
- ## The #order_by method resets the boundaries then adds boundaries at the
328
- ## last row of each group of rows on which the sort keys were equal as a
329
- ## boundary.
330
- ##
331
- ## The #union_all (but not #union since it deletes duplicates) method adds
332
- ## a boundary between the constituent tables. #union_all also preserves any
333
- ## boundary markers within the constituent tables. In doing so, the
334
- ## boundaries of the second table in the #union_all are increased by the
335
- ## size of the first table so that they refer to rows in the new table.
336
- ##
337
- ## The #select method preserves any boundaries from the parent table
338
- ## without change, since it only selects columns for the output and deletes
339
- ## no rows.
340
- ##
341
- ## Perhaps surprisingly, the #group_by method does /not/ result in any
342
- ## groups in the output table since the result of #group_by is to reduce
343
- ## all groups it finds into a single row, and having a group for each row
344
- ## of the output table would have no use.
345
- ##
346
- ## All the other table-transforming methods reset the boundaries in the new
347
- ## table. For example, #where re-arranges and deletes rows, so the old
348
- ## boundaries would make no sense anyway. Likewise, #union, #intersection,
349
- ## #except, and #join reset the boundaries to their default.
350
- ## ###########################################################################
351
-
352
- public
353
-
354
- # Return an array of an array of row hashes for the groups in this Table.
355
# Boundaries are normalized first, so there is one group per boundary and
# the last group ends at the final row.
def groups
  normalize_boundaries
  Array.new(boundaries.size) { |k| group_rows(k) }
end
363
-
364
- # Mark a boundary at k, and if k is nil, the last row in the table
365
- # as a group boundary.
366
# k is a 0-based row index; without it the current last row (size - 1) is
# used. Returns the boundaries array.
def mark_boundary(k = nil)
  boundaries.push(k || size - 1)
end
373
-
374
- protected
375
-
376
- # Reader for boundaries, but not public.
377
# Returns the array held in @boundaries itself, not a copy.
def boundaries
  @boundaries
end
380
-
381
- # Writer for boundaries, but not public.
382
# Stores bounds as given; no copy is made.
def boundaries=(bounds)
  @boundaries = bounds
end
385
-
386
- # Make sure size - 1 is last boundary and that they are unique and sorted.
387
# Returns the boundaries array. An empty table's boundaries are left
# untouched.
def normalize_boundaries
  return boundaries if empty?

  last_row = size - 1
  boundaries.push(last_row) unless boundaries.include?(last_row)
  self.boundaries = boundaries.uniq.sort
  boundaries
end
394
-
395
- # Concatenate the array of argument bounds to this table's boundaries, but
396
- # increase each of the indexes in bounds by shift. This is used in the
397
- # #union_all method.
398
# Returns the new combined array, which replaces @boundaries.
def append_boundaries(bounds, shift: 0)
  shifted = bounds.map { |k| k + shift }
  @boundaries += shifted
end
401
-
402
- # Return the group number to which row k belongs. Groups, from the user's
403
- # point of view are indexed starting at 1.
404
# k is a 0-based row index. The first boundary at or beyond k decides the
# group; if no boundary qualifies, group 1 is reported.
def row_index_to_group_index(k)
  g_idx = boundaries.index { |b_last| k <= b_last }
  g_idx ? g_idx + 1 : 1
end
410
-
411
# Return the rows of the k-th group (0-based), or [] when there is no such
# group. A group runs from the row after the previous boundary through its
# own boundary row.
def group_rows(k)
  normalize_boundaries
  return [] if k >= boundaries.size

  first =
    if k.zero?
      0
    else
      boundaries[k - 1] + 1
    end
  rows_range(first, boundaries[k])
end
418
-
419
- ############################################################################
420
- # SQL look-alikes. The following methods are based on SQL equivalents and
421
- # all return a new Table object rather than modifying the table in place.
422
- ############################################################################
423
-
424
- public
425
-
426
- # Return a new Table sorting the rows of this Table on the possibly multiple
427
- # keys given in the array of syms in headers. Append a ! to the symbol name
428
- # to indicate reverse sorting on that column. Resets groups.
429
# sort_heads may be given as separate arguments or as one array. Returns a
# new Table; the receiver is not modified.
def order_by(*sort_heads)
  sort_heads = sort_heads.flatten
  # Core String#end_with? replaces the ActiveSupport-only alias ends_with?.
  rev_heads = sort_heads.select { |h| h.to_s.end_with?('!') }
                        .map { |h| h.to_s.sub(/\!\z/, '').to_sym }
  sort_heads = sort_heads.map { |h| h.to_s.sub(/\!\z/, '').to_sym }
  new_rows = rows.sort do |r1, r2|
    # For a reversed column, take the value from the opposite row so that
    # the ordinary <=> comparison sorts it in descending order.
    key1 = sort_heads.map { |h| rev_heads.include?(h) ? r2[h] : r1[h] }
    key2 = sort_heads.map { |h| rev_heads.include?(h) ? r1[h] : r2[h] }
    key1 <=> key2
  end
  # Add the sorted rows, marking a boundary on the row before each change
  # in the sort key.
  new_tab = Table.new
  last_key = nil
  new_rows.each_with_index do |nrow, k|
    new_tab << nrow
    key = nrow.fetch_values(*sort_heads)
    new_tab.mark_boundary(k - 1) if last_key && key != last_key
    last_key = key
  end
  new_tab.normalize_boundaries
  new_tab
end
452
-
453
- # Return a Table having the selected column expressions. Each expression can
454
- # be either a (1) symbol, :old_col, representing a column in the current
455
- # table, (2) a hash of new_col: :old_col to rename an existing :old_col
456
- # column as :new_col, or (3) a hash of new_col: 'expression', to add a new
457
- # column that is computed as an arbitrary ruby expression of the existing
458
- # columns (whether selected for the output table or not) or any new_col
459
- # defined earlier in the argument list. The expression string can also
460
- # access the instance variable @row as the row number of the row being
461
- # evaluated. The bare symbol arguments (1) must precede any hash arguments
462
- # (2) or (3). Each expression results in a column in the resulting Table in
463
- # the order given. The expressions are evaluated in left-to-right order as
464
- # well. The output table preserves any groups present in the input table.
465
# Raises a RuntimeError if a named column does not exist or if a hash value
# is neither a Symbol nor a String. The receiver's rows are not changed,
# but its boundaries are normalized.
def select(*cols, **new_cols)
  result = Table.new
  normalize_boundaries
  ev = Evaluator.new(vars: { row: 0, group: 1 },
                     before: '@row = __row; @group = __group')
  rows.each_with_index do |old_row, old_k|
    # Plain column picks come first.
    new_row = cols.each_with_object({}) do |k, picked|
      h = k.as_sym
      raise "Column '#{h}' in select does not exist" unless column?(h)
      picked[h] = old_row[h]
    end
    new_cols.each_pair do |key, val|
      key = key.as_sym
      # Expressions see the old columns plus any output columns built so
      # far for this row, along with the 1-based row and group numbers.
      vars = old_row.merge(new_row)
      vars[:__row] = old_k + 1
      vars[:__group] = row_index_to_group_index(old_k)
      new_row[key] =
        case val
        when Symbol
          raise "Column '#{val}' in select does not exist" unless vars.key?(val)
          vars[val]
        when String
          ev.evaluate(val, vars: vars)
        else
          raise 'Hash parameters to select must be a symbol or string'
        end
    end
    result << new_row
  end
  result.boundaries = boundaries
  result.normalize_boundaries
  result
end
498
-
499
- # Return a Table containing only rows matching the where expression. Resets
500
- # groups.
501
# expr is converted with #to_s and evaluated once per row; a row is kept
# when the result is truthy. Returns a new Table.
def where(expr)
  expr = expr.to_s
  result = Table.new
  # Normalize first, as #select does, so the group lookup below sees
  # sorted boundaries that end at the last row.
  normalize_boundaries
  ev = Evaluator.new(vars: { row: 0 },
                     before: '@row = __row; @group = __group')
  rows.each_with_index do |row, k|
    # Merge into a copy. Writing __row/__group into the row itself put
    # those keys into the hash that is appended to the result.
    vars = row.merge(__row: k + 1, __group: row_index_to_group_index(k))
    result << row if ev.evaluate(expr, vars: vars)
  end
  result.normalize_boundaries
  result
end
515
-
516
- # Return this table with all duplicate rows eliminated. Resets groups.
517
# Builds a new Table from the unique rows, keeping the first occurrence of
# each. No boundaries are marked on the result.
def distinct
  rows.uniq.each_with_object(Table.new) { |row, tab| tab << row }
end
525
-
526
- # Return this table with all duplicate rows eliminated. Resets groups.
527
# Alias for #distinct.
def uniq
  distinct
end
530
-
531
- # Return a Table that combines this table with another table. In other
532
- # words, return the union of this table with the other. The headers of this
533
- # table are used in the result. There must be the same number of columns of
534
- # the same type in the two tables, or an exception will be thrown.
535
- # Duplicates are eliminated from the result.
536
# Delegates to set_operation with :+ and duplicate elimination turned on.
def union(other)
  set_operation(other, :+, distinct: true, add_boundaries: true)
end
541
-
542
- # Return a Table that combines this table with another table. In other
543
- # words, return the union of this table with the other. The headers of this
544
- # table are used in the result. There must be the same number of columns of
545
- # the same type in the two tables, or an exception will be thrown.
546
- # Duplicates are not eliminated from the result. Adds group boundaries at
547
- # boundaries of the constituent tables. Preserves and adjusts the group
548
- # boundaries of the constituent table.
549
# Delegates to set_operation with :+, keeping duplicates and carrying over
# both tables' boundaries.
def union_all(other)
  set_operation(other, :+, distinct: false,
                           add_boundaries: true,
                           inherit_boundaries: true)
end
555
-
556
- # Return a Table that includes the rows that appear in this table and in
557
- # another table. In other words, return the intersection of this table with
558
- # the other. The headers of this table are used in the result. There must be
559
- # the same number of columns of the same type in the two tables, or an
560
- # exception will be thrown. Duplicates are eliminated from the
561
- # result. Resets groups.
562
# distinct is a keyword parameter of set_operation; passing it as a third
# positional argument raised ArgumentError.
def intersect(other)
  set_operation(other, :intersect, distinct: true)
end
565
-
566
- # Return a Table that includes the rows that appear in this table and in
567
- # another table. In other words, return the intersection of this table with
568
- # the other. The headers of this table are used in the result. There must be
569
- # the same number of columns of the same type in the two tables, or an
570
- # exception will be thrown. Duplicates are not eliminated from the
571
- # result. Resets groups.
572
# distinct is a keyword parameter of set_operation; passing it as a third
# positional argument raised ArgumentError.
def intersect_all(other)
  set_operation(other, :intersect, distinct: false)
end
575
-
576
- # Return a Table that includes the rows of this table except for any rows
577
- # that are the same as those in another table. In other words, return the
578
- # set difference between this table and the other. The headers of this table
579
- # are used in the result. There must be the same number of columns of the
580
- # same type in the two tables, or an exception will be thrown. Duplicates
581
- # are eliminated from the result. Resets groups.
582
# distinct is a keyword parameter of set_operation; passing it as a third
# positional argument raised ArgumentError.
def except(other)
  set_operation(other, :difference, distinct: true)
end
585
-
586
- # Return a Table that includes the rows of this table except for any rows
587
- # that are the same as those in another table. In other words, return the
588
- # set difference between this table and the other. The headers of this table
589
- # are used in the result. There must be the same number of columns of the
590
- # same type in the two tables, or an exception will be thrown. Duplicates
591
- # are not eliminated from the result. Resets groups.
592
# distinct is a keyword parameter of set_operation; passing it as a third
# positional argument raised ArgumentError.
def except_all(other)
  set_operation(other, :difference, distinct: false)
end
595
-
596
- private
597
-
598
- # Apply the set operation given by op between this table and the other table
599
- # given in the first argument. If distinct is true, eliminate duplicates
600
- # from the result.
601
# op names an Array method that is sent to this table's rows with the other
# table's rows as its argument. Raises a RuntimeError when the two tables
# differ in column count or column types.
def set_operation(other, op = :+,
                  distinct: true,
                  add_boundaries: false,
                  inherit_boundaries: false)
  if columns.size != other.columns.size
    raise 'Cannot apply a set operation to tables with a different number of columns.'
  end
  if columns.map(&:type) != other.columns.map(&:type)
    raise 'Cannot apply a set operation to tables with different column types.'
  end

  # Re-key the other table's rows with this table's headers so the row
  # hashes from both tables are comparable.
  other_rows = other.rows.map { |r| r.replace_keys(headers) }
  result = Table.new
  seam = size - 1 # index of this table's last row within the combined rows
  rows.send(op, other_rows).each_with_index do |row, k|
    result << row
    result.mark_boundary if add_boundaries && k == seam
  end
  if inherit_boundaries
    result.boundaries = normalize_boundaries
    other.normalize_boundaries
    # The other table's boundaries index its own rows; shift them past ours.
    result.append_boundaries(other.boundaries, shift: size)
  end
  result.normalize_boundaries
  distinct ? result.distinct : result
end
626
-
627
- public
628
-
629
- # Return a table that joins this table to another based on one or more join
630
- # expressions. There are several possibilities for the join expressions:
631
- #
632
- # 1. If no join expressions are given, the tables will be joined when all
633
- # values with the same name in both tables have the same value, a
634
- # "natural" join. However, if the join type is :cross, the join
635
- # expression will be taken to be 'true'. Otherwise, if there are no
636
- # common column names, an exception will be raised.
637
- #
638
- # 2. If the join expressions are one or more symbols, the join condition
639
- # requires that the values of both tables are equal for all columns named
640
- # by the symbols. A column that appears in both tables can be given
641
- # without modification and will be assumed to require equality on that
642
- # column. If an unmodified symbol is not a name that appears in both
643
- # tables, an exception will be raised. Column names that are unique to
644
- # the first table must have a '_a' appended to the column name and column
645
- # names that are unique to the other table must have a '_b' appended to
646
- # the column name. These disambiguated column names must come in pairs,
647
- # one for the first table and one for the second, and they will imply a
648
- # join condition that the columns must be equal on those columns. Several
649
- # such symbol expressions will require that all such implied pairs are
650
- # equal in order for the join condition to be met.
651
- #
652
- # 3. Finally, a string expression can be given that contains an arbitrary
653
- # ruby expression that will be evaluated for truthiness. Within the
654
- # string, all column names must be disambiguated with the '_a' or '_b'
655
- # modifiers whether they are common to both tables or not. The names of
656
- # the columns in both tables (without the leading ':' for symbols) are
657
- # available as variables within the expression.
658
- #
659
- # The join_type parameter specifies what sort of join is performed, :inner,
660
- # :left, :right, :full, or :cross. The default is an :inner join. The types
661
- # of joins are defined as follows where T1 means this table, the receiver,
662
- # and T2 means other. These descriptions are taken from the Postgresql
663
- # documentation.
664
- #
665
- # - :inner :: For each row R1 of T1, the joined table has a row for each row
666
- # in T2 that satisfies the join condition with R1.
667
- #
668
- # - :left :: First, an inner join is performed. Then, for each row in T1
669
- # that does not satisfy the join condition with any row in T2, a joined
670
- # row is added with null values in columns of T2. Thus, the joined
671
- # table always has at least one row for each row in T1.
672
- #
673
- # - :right :: First, an inner join is performed. Then, for each row in T2
674
- # that does not satisfy the join condition with any row in T1, a joined
675
- # row is added with null values in columns of T1. This is the converse
676
- # of a left join: the result table will always have a row for each row
677
- # in T2.
678
- #
679
- # - :full :: First, an inner join is performed. Then, for each row in T1
680
- # that does not satisfy the join condition with any row in T2, a joined
681
- # row is added with null values in columns of T2. Also, for each row of
682
- # T2 that does not satisfy the join condition with any row in T1, a
683
- # joined row with null values in the columns of T1 is added.
684
- #
685
- # - :cross :: For every possible combination of rows from T1 and T2 (i.e.,
686
- # a Cartesian product), the joined table will contain a row consisting
687
- # of all columns in T1 followed by all columns in T2. If the tables
688
- # have N and M rows respectively, the joined table will have N * M
689
- # rows.
690
- # Resets groups.
691
# The join kinds accepted by #join's join_type argument.
JOIN_TYPES = %i[inner left right full cross].freeze
692
-
693
# Raises ArgumentError unless other is a Table and join_type is one of
# JOIN_TYPES. Returns a new Table.
def join(other, *exps, join_type: :inner)
  raise ArgumentError, 'need other table as first argument to join' unless other.is_a?(Table)
  unless JOIN_TYPES.include?(join_type)
    raise ArgumentError, "join_type may only be: #{JOIN_TYPES.join(', ')}"
  end

  keep_unmatched_self = %i[left full].include?(join_type)
  keep_unmatched_other = %i[right full].include?(join_type)
  # All-nil stand-ins for the missing side of an outer-join row.
  self_row_nils = headers.map { |h| [h, nil] }.to_h
  other_row_nils = other.headers.map { |h| [h, nil] }.to_h
  join_expression, other_common_heads = build_join_expression(exps, other, join_type)
  ev = Evaluator.new
  result = Table.new
  other_rows = other.rows
  # Tracks which rows of other matched at least one row of self.
  other_row_matches = Array.new(other_rows.size, false)
  rows.each do |self_row|
    self_row_matched = false
    other_rows.each_with_index do |other_row, k|
      locals = build_locals_hash(row_a: self_row, row_b: other_row)
      next unless ev.evaluate(join_expression, vars: locals)

      self_row_matched = other_row_matches[k] = true
      result << build_out_row(row_a: self_row, row_b: other_row,
                              common_heads: other_common_heads,
                              type: join_type)
    end
    next if self_row_matched || !keep_unmatched_self

    result << build_out_row(row_a: self_row, row_b: other_row_nils, type: join_type)
  end
  if keep_unmatched_other
    other_rows.each_with_index do |other_row, k|
      next if other_row_matches[k]

      result << build_out_row(row_a: self_row_nils, row_b: other_row, type: join_type)
    end
  end
  result.normalize_boundaries
  result
end
740
-
741
# Shorthand for #join with join_type: :inner.
def inner_join(other, *exps)
  join(other, *exps, join_type: :inner)
end
744
-
745
# Shorthand for #join with join_type: :left.
def left_join(other, *exps)
  join(other, *exps, join_type: :left)
end
748
-
749
# Shorthand for #join with join_type: :right.
def right_join(other, *exps)
  join(other, *exps, join_type: :right)
end
752
-
753
# Shorthand for #join with join_type: :full.
def full_join(other, *exps)
  join(other, *exps, join_type: :full)
end
756
-
757
# Shorthand for #join with join_type: :cross; takes no join expressions.
def cross_join(other)
  join(other, join_type: :cross)
end
760
-
761
- private
762
-
763
- # Return an output row appropriate to the given join type, including all the
764
- # keys of row_a, the non-common keys of row_b for an :inner join, or all the
765
- # keys of row_b for other joins. If any of the row_b keys are also row_a
766
- # keys, change the key name by appending a '_b' so the keys will not repeat.
767
# row_a and row_b are row hashes. common_heads is only consulted for an
# :inner join. Returns a new hash and leaves both inputs unmodified.
def build_out_row(row_a:, row_b:, common_heads: [], type: :inner)
  # For an inner join, drop the row_b keys already matched for equality
  # with row_a so the output does not repeat them.
  b_part = type == :inner ? row_b.reject { |k, _| common_heads.include?(k) } : row_b
  a_heads = row_a.keys
  # Any remaining row_b key that clashes with a row_a key gets a '_b'
  # suffix.
  renamed = {}
  b_part.each do |k, v|
    renamed[a_heads.include?(k) ? "#{k}_b".to_sym : k] = v
  end
  row_a.merge(renamed)
end
781
-
782
- # Return a hash for the local variables of a join expression in which all
783
- # the keys in row_a have an '_a' appended and all the keys in row_b have a
784
- # '_b' appended.
785
# Returns a new hash: row_a's entries with '_a'-suffixed symbol keys,
# followed by row_b's entries with '_b'-suffixed symbol keys.
def build_locals_hash(row_a:, row_b:)
  locals = {}
  row_a.each { |k, v| locals["#{k}_a".to_sym] = v }
  row_b.each { |k, v| locals["#{k}_b".to_sym] = v }
  locals
end
790
-
791
# Return an array of two elements: (1) a String holding a ruby expression
# that is the AND of all the join conditions in exps, and (2) an Array of
# heads of the other table that take part in an equality test.
#
# The expression is meant to be evaluated with local variables named after
# the headers of this table with '_a' appended and the headers of the other
# table with '_b' appended.
#
# - For a :cross join the result is always ['true', []].
# - With no exps, a natural join on all heads common to both tables is
#   built, and those common heads are the second element.
# - Otherwise each element of exps is one of:
#   * a Symbol ending in '_a' or '_b', naming a column of this table or of
#     the other table. Such symbols are consumed in pairs, one from each
#     table, and each pair becomes an equality test;
#   * an unqualified Symbol naming a column present in both tables, which
#     becomes the test "(head_a == head_b)";
#   * a String, which is wrapped in parentheses and used verbatim.
#   The second element then holds every '_b'-qualified head and every
#   unqualified head that was used.
#
# Raises ArgumentError when a named column does not exist, when a non-cross
# join has neither expressions nor common heads, when an element of exps is
# neither a Symbol nor a String, and when a qualified symbol is not paired
# with a qualified symbol from the other table. That last case includes a
# qualified symbol left dangling at the end of exps and two qualified
# symbols from the same table in a row; both used to be accepted silently
# and produced an incomplete or mis-typed condition.
def build_join_expression(exps, other, type)
  return ['true', []] if type == :cross

  a_heads = headers
  b_heads = other.headers
  common_heads = a_heads & b_heads

  if exps.empty?
    if common_heads.empty?
      raise ArgumentError,
            'A non-cross join with no common column names requires join expressions'
    end
    # A natural join on all common heads
    common_heads.each do |h|
      ensure_common_types!(self_h: h, other_h: h, other: other)
    end
    nat_exp = common_heads.map { |h| "(#{h}_a == #{h}_b)" }.join(' && ')
    return [nat_exp, common_heads]
  end

  and_conds = []
  b_common_heads = []
  # [head, side] of a qualified symbol still waiting for its partner, where
  # side is 'a' (this table) or 'b' (the other table); nil when no pair is
  # open.
  pending = nil
  exps.each do |exp|
    case exp
    when Symbol
      qualified = /\A(.*)_([ab])\z/.match(exp.to_s.clean)
      if qualified
        head = qualified[1].to_sym
        side = qualified[2]
        if side == 'a'
          raise ArgumentError, "no column '#{head}' in table" unless a_heads.include?(head)
        else
          raise ArgumentError, "no column '#{head}' in second table" unless b_heads.include?(head)
        end
        if pending
          # Second of a pair: it must come from the table the first did not.
          first_head, first_side = pending
          if first_side == side
            raise ArgumentError,
                  "must follow '#{first_head}' by qualified exp from the other table"
          end
          self_h, other_h = side == 'a' ? [head, first_head] : [first_head, head]
          ensure_common_types!(self_h: self_h, other_h: other_h, other: other)
          and_conds << "(#{first_head}_#{first_side} == #{head}_#{side})"
          pending = nil
        else
          # First of a pair of _a or _b
          pending = [head, side]
        end
        b_common_heads << head if side == 'b'
      else
        # No modifier, so must be one of the common columns
        if pending
          # We were expecting the second of a qualified pair, but got an
          # unqualified symbol instead.
          raise ArgumentError,
                "must follow '#{pending.first}' by qualified exp from the other table"
        end
        unless common_heads.include?(exp)
          raise ArgumentError, "unqualified column '#{exp}' must occur in both tables"
        end
        ensure_common_types!(self_h: exp, other_h: exp, other: other)
        and_conds << "(#{exp}_a == #{exp}_b)"
        b_common_heads << exp
      end
    when String
      # A string expression, in which all column references must already be
      # qualified; it is used as written.
      and_conds << "(#{exp})"
    else
      raise ArgumentError, "invalid join expression '#{exp}' of class #{exp.class}"
    end
  end
  if pending
    # A qualified symbol at the end of exps never received its partner.
    raise ArgumentError,
          "must follow '#{pending.first}' by qualified exp from the other table"
  end
  [and_conds.join(' && '), b_common_heads]
end
886
-
887
# Raise an ArgumentError unless column self_h of this table and column
# other_h of the other table report the same type. When the types agree,
# return self.
def ensure_common_types!(self_h:, other_h:, other:)
  return self if column(self_h).type == other.column(other_h).type

  # Both column names are quoted in the message; the closing quote after
  # other_h was previously missing.
  raise ArgumentError,
        "type of column '#{self_h}' does not match type of column '#{other_h}'"
end
896
-
897
- ###################################################################################
898
- # Group By
899
- ###################################################################################
900
-
901
- public
902
-
903
# Return a new Table holding one row for each group of rows of this table
# in which the values of all the columns named by plain symbols are equal.
# Every other column is reduced to a single value per group by an aggregate
# function (:count, :sum, :min, :max, etc.). The aggregate function for a
# column is chosen with a hash parameter whose key is the column and whose
# value is the symbol of the function; a column that is neither grouped on
# nor given a function is reduced with :first, the value of that column in
# the first row of the group. For example:
#
#   tab.group_by(:date, :code, :price, shares: :sum)
#
# divides the table into groups of rows with equal :date, :code, and :price
# and reports the sum of the :shares values of each group. Because of the
# way Ruby parses the parameters of a method call, all the grouping symbols
# must come before any hash parameters.
def group_by(*group_cols, **agg_cols)
  # Fill in :first for each column that is neither a grouping column nor
  # explicitly aggregated.
  (headers - group_cols - agg_cols.keys).each do |head|
    agg_cols[head] = :first
  end

  # Sort on the grouping columns, then bucket the sorted rows by the values
  # they hold in those columns.
  buckets = order_by(group_cols).rows.group_by do |row|
    group_cols.map { |col| row[col] }
  end

  result = Table.new
  buckets.each_value do |members|
    result << row_from_group(members, group_cols, agg_cols)
  end
  result.normalize_boundaries
  result
end
941
-
942
- private
943
-
944
# Build the single output row, a Hash, for one group of rows. The grouping
# columns grp_cols keep their header and take their value from the first
# row of the group. Each column in agg_cols (a Hash of header => aggregate
# function) is stored under a header formed from the function name, an
# underscore, and the original header, and holds the result of sending the
# aggregate function to a Column built from the group's values.
def row_from_group(rows, grp_cols, agg_cols)
  new_row = {}
  grp_cols.each { |head| new_row[head] = rows.first[head] }
  agg_cols.each_pair do |head, agg_func|
    values = rows.map { |row| row[head] }
    new_row["#{agg_func}_#{head}".as_sym] =
      Column.new(header: head, items: values).send(agg_func)
  end
  new_row
end
957
-
958
- ############################################################################
959
- # Table construction methods.
960
- ############################################################################
961
-
962
- public
963
-
964
# Append a row, given as a Hash keyed on the headers, to the table. A
# column is created for any header the table does not have yet, and each
# value is appended to the column of its header. When mark is true, the
# index of the last row is recorded as a boundary. Returns self. All
# tables should ultimately be built using this method as a primitive.
def add_row(row, mark: false)
  row.each_pair do |head, value|
    target = head.as_sym
    unless column?(head)
      columns << Column.new(header: head)
    end
    column(target) << value
  end
  if mark
    @boundaries << (size - 1)
  end
  self
end
976
-
977
# Append a row to the table without marking it as a boundary; shorthand
# for add_row with mark left false.
def <<(row)
  add_row(row, mark: false)
end
981
-
982
# Append the column col to the table and return self. Raises a
# RuntimeError if the table already has a column with the same header.
def add_column(col)
  if column?(col.header)
    raise "Table already has a column with header '#{col.header}'"
  end
  columns << col
  self
end
987
- end
988
- end