fat_core 1.7.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,988 +0,0 @@
1
- module FatCore
2
- # A container for a two-dimensional table. All cells in the table must be a
3
- # String, a DateTime (or Date), a Numeric (Bignum, Integer, or BigDecimal), or
4
- # a Boolean (TrueClass or FalseClass). All columns must be of one of those
5
- # types or be a string convertible into one of them. It is considered an error
6
- # if a single column contains cells of different types. Any cell that cannot
7
- # be parsed as one of the Numeric, DateTime, or Boolean types will be treated
8
- # as a String and have to_s applied. Until the column type is determined, it
9
- # will have the type NilClass.
10
- #
11
- # You can initialize a Table in several ways:
12
- #
13
- # 1. with a Nil, which will return an empty table to which rows or columns can
14
- # be added later, 2. with the name of a .csv file, 3. with the name of an
15
- # .org file, 4. with an IO or StringIO object for either type of file, but
16
- # in that case, you need to specify 'csv' or 'org' as the second argument
17
- # to tell it what kind of file format to expect, 5. with an Array of
18
- # Arrays, 6. with an Array of Hashes, all having the same keys, which
19
- # become the names of the column heads, 7. with an Array of any objects
20
- # that respond to .keys and .values methods, 8. with another Table object.
21
- #
22
- # In the resulting Table, the headers are converted into symbols, with all
23
- # spaces converted to underscore and everything down-cased. So, the heading,
24
- # 'Two Words' becomes the hash header :two_words.
25
- class Table
26
- attr_reader :columns
27
-
28
# Set up an empty Table: it starts with no columns and with no group
# boundaries marked.
def initialize
  @columns, @boundaries = [], []
end
32
-
33
- ###########################################################################
34
- # Constructors
35
- ###########################################################################
36
-
37
# Build a Table from the CSV file named fname. The file is opened read-only
# and handed to from_csv_io; the block form of File.open closes it afterwards.
def self.from_csv_file(fname)
  File.open(fname, 'r') { |csv_io| from_csv_io(csv_io) }
end
44
-
45
# Build a Table from str, treating the string as the contents of a CSV file.
def self.from_csv_string(str)
  csv_io = StringIO.new(str)
  from_csv_io(csv_io)
end
49
-
50
# Build a Table from the first table found in the org-mode file named fname.
# The file is opened read-only and handed to from_org_io; the block form of
# File.open closes it afterwards.
def self.from_org_file(fname)
  File.open(fname, 'r') { |org_io| from_org_io(org_io) }
end
57
-
58
# Build a Table from str, treating the string as the contents of an org-mode
# file.
def self.from_org_string(str)
  org_io = StringIO.new(str)
  from_org_io(org_io)
end
63
-
64
# Build a Table from an array of arrays by delegating to
# from_array_of_arrays. When the second element is nil or looks like a rule
# separator ('|-----------', '+----------', etc.), the first array supplies
# the headers; later such elements mark group boundaries. This is the shape
# org-mode src blocks produce, so it is handy for building a Table from the
# result of a src block.
def self.from_aoa(aoa)
  from_array_of_arrays(aoa)
end
74
-
75
# Build a Table from an array of hashes, or of any objects that respond to
# #to_h. All elements are expected to share the same keys, which become the
# headers of the Table.
#
# Raises ArgumentError when the first element does not respond to #to_h.
def self.from_aoh(aoh)
  unless aoh.first.respond_to?(:to_h)
    # Report the class of the offending element. The message used to
    # interpolate an undefined local, which turned this into a NameError.
    raise ArgumentError,
          "Cannot initialize Table with an array of #{aoh.first.class}"
  end
  from_array_of_hashes(aoh)
end
86
-
87
# Build a new Table holding the rows of another Table, inheriting the group
# boundaries of the input table. Returns the new Table.
def self.from_table(table)
  result = from_aoh(table.rows)
  # The boundaries reader and writer are protected instance methods, so from
  # class scope they have to be reached through send. The array is copied so
  # that marking boundaries on one table cannot alter the other.
  result.send(:boundaries=, table.send(:boundaries).dup)
  result
end
93
-
94
- ############################################################################
95
- # Class-level constructor helpers
96
- ############################################################################
97
-
98
- class << self
99
- private
100
-
101
# Build a table from an array of hashes, or of any objects that respond to
# #to_h. A nil element is not a row: it marks a group boundary at the
# current end of the table.
def from_array_of_hashes(hashes)
  table = new
  hashes.each do |item|
    if item.nil?
      table.mark_boundary
    else
      table << item.to_h
    end
  end
  table
end
115
-
116
# Build a new table from an array of arrays. If there is a second element
# and it is nil, a string that looks like an hrule, or an array whose first
# element looks like an hrule, the first element is taken as the row of
# headers. Otherwise headers of the form :col1, :col2, ... are synthesized
# and every element is treated as data. In the body, any element that looks
# like a boundary marks a group boundary after the preceding row.
#
# An empty input yields an empty table, and a lone row is kept as data
# instead of being consumed as headers for a table with no rows.
def from_array_of_arrays(rows)
  result = new
  return result if rows.empty?

  # rows[1] is nil when out of range, which would look like a boundary, so
  # require that a second element actually exists.
  if rows.size > 1 && looks_like_boundary?(rows[1])
    headers = rows[0].map(&:as_sym)
    first_data_row = 2
  else
    headers = (1..rows[0].size).map { |k| "col#{k}".as_sym }
    first_data_row = 0
  end
  rows[first_data_row..-1].each do |row|
    if looks_like_boundary?(row)
      result.mark_boundary
      next
    end
    cells = row.map { |s| s.to_s.strip }
    result << headers.zip(cells).to_h
  end
  result
end
148
-
149
# Return true if row is nil, is a string that matches the hrule pattern
# (optional whitespace, then '|' or '+', then one or more '-'), or is a
# collection whose first element matches it; otherwise return false.
def looks_like_boundary?(row)
  hrule_re = /\A\s*[\|+][-]+/
  return true if row.nil?

  # Test a String as a whole: taking #first of it would only be reached if
  # some core extension defines String#first, and would then test one
  # character instead of the rule.
  candidate = row.is_a?(String) || !row.respond_to?(:first) ? row : row.first
  # Return a strict boolean rather than the match index or nil.
  !(candidate.to_s =~ hrule_re).nil?
end
162
-
163
# Build a table from CSV data read from io. The first line supplies the
# headers, which are converted to symbols; blank lines are skipped. Each CSV
# row is appended to the table as a hash.
def from_csv_io(io)
  table = new
  csv = ::CSV.new(io, headers: true, header_converters: :symbol,
                      skip_blanks: true)
  csv.each { |csv_row| table << csv_row.to_h }
  table
end
171
-
172
# Form the rows of a table by reading the first table found in the org-mode
# text on io, then hand them to from_array_of_arrays.
#
# Lines are skipped until one starts with '|'. An hrule on that very first
# table line is dropped; every later hrule is recorded as nil. Reading stops
# at the first line after the table that does not start with '|'.
def from_org_io(io)
  table_re = /\A\s*\|/
  hrule_re = /\A\s*\|[-+]+/
  # Strip the outer pipes of a table line and split it into cleaned cells.
  to_cells = lambda do |line|
    line.sub(/\A\s*\|/, '').sub(/\|\s*\z/, '').split('|').map(&:clean)
  end
  rows = []
  table_found = false
  io.each do |line|
    unless table_found
      # Still looking for the start of the table.
      next unless line =~ table_re

      table_found = true
      rows << to_cells.call(line) unless line =~ hrule_re
      next
    end
    # The table ends at the first non-table line.
    break unless line =~ table_re

    rows << (line =~ hrule_re ? nil : to_cells.call(line))
  end
  from_array_of_arrays(rows)
end
208
- end
209
-
210
- ###########################################################################
211
- # Attributes
212
- ###########################################################################
213
-
214
# Look up the column whose header equals key converted with as_sym. Returns
# nil when no column has that header.
def column(key)
  columns.find { |col| col.header == key.as_sym }
end
218
-
219
# Report the type of the column headed by key.
def type(key)
  col = column(key)
  col.type
end
223
-
224
# Index the table by row number or by column header.
#
# - An Integer key returns that row as a hash; rows are numbered from 1.
# - A String or Symbol key returns the array of items in the column with
#   that header. A String is converted with as_sym before it is compared
#   with the headers, which are symbols.
#
# Double indexing therefore works in either row-major or column-major
# order: tab[:id][8] and tab[8][:id] both work.
#
# Raises a RuntimeError if the index is out of range, the header is not in
# the table, or the key is of an unsupported class.
def [](key)
  case key
  when Integer
    raise "index '#{key}' out of range" unless (1..size).cover?(key)
    rows[key - 1]
  when String
    # Comparing the raw string with the symbol headers could never succeed.
    raise "header '#{key}' not in table" unless headers.include?(key.as_sym)
    column(key).items
  when Symbol
    raise "header ':#{key}' not in table" unless headers.include?(key)
    column(key).items
  else
    raise "cannot index table with a #{key.class}"
  end
end
245
-
246
# Tell whether one of the table's headers equals key converted with as_sym.
def column?(key)
  wanted = key.as_sym
  headers.include?(wanted)
end
250
-
251
# List the type of every column, in column order.
def types
  columns.map { |col| col.type }
end
255
-
256
# List the header of every column, in column order.
def headers
  columns.map { |col| col.header }
end
260
-
261
# Count the rows in the Table: the size of the first column, or 0 when the
# table has no columns.
def size
  columns.empty? ? 0 : columns.first.size
end
266
-
267
# Tell whether the Table has no rows.
def empty?
  size == 0
end
271
-
272
# Assemble every row of the Table as a hash keyed by header and return them
# as an array. A table without columns yields an empty array.
def rows
  return [] if columns.empty?

  (0..columns.first.items.last_i).map do |rnum|
    columns.each_with_object({}) do |col, row|
      row[col.header] = col[rnum]
    end
  end
end
286
-
287
- protected
288
-
289
# Assemble only the rows numbered first through last (zero-based,
# inclusive) as hashes keyed by header. This avoids building every row of a
# large table just to index a small slice of them.
#
# Raises ArgumentError unless first <= last.
def rows_range(first = 0, last = size - 1)
  raise ArgumentError, 'first must be <= last' unless first <= last
  return [] if columns.empty?

  (first..last).map do |rnum|
    columns.each_with_object({}) do |col, row|
      row[col.header] = col[rnum]
    end
  end
end
306
-
307
- ## ###########################################################################
308
- ## Group Boundaries
309
- ##
310
- ## Boundaries mark the last row in each "group" within the table. The last
311
- ## row of the table is always an implicit boundary, and having the last row
312
- ## as the sole boundary is the default for new tables unless mentioned
313
- ## otherwise. Resetting the boundaries means to put it back in that default
314
- ## state.
315
- ##
316
- ## Note that tables are, for the most part, immutable. That is, the data
317
- ## rows of the table, once set, are never changed by methods on the
318
- ## table. Any transformation of a table results in a new table. Boundaries
319
- ## and footers are exceptions to immutability, but even they only affect
320
- ## the boundary and footer attributes of the table, not the data rows.
321
- ##
322
- ## Boundaries can be added when a table is read in, for example, from the
323
- ## text of an org table in which each hline (other than the one separating
324
- ## the headers from the body) marks a boundary for the row immediately
325
- ## preceding the hline.
326
- ##
327
- ## The #order_by method resets the boundaries, then marks the last row of
328
- ## each group of rows on which the sort keys were equal as a group
329
- ## boundary.
330
- ##
331
- ## The #union_all (but not #union since it deletes duplicates) method adds
332
- ## a boundary between the constituent tables. #union_all also preserves any
333
- ## boundary markers within the constituent tables. In doing so, the
334
- ## boundaries of the second table in the #union_all are increased by the
335
- ## size of the first table so that they refer to rows in the new table.
336
- ##
337
- ## The #select method preserves any boundaries from the parent table
338
- ## without change, since it only selects columns for the output and deletes
339
- ## no rows.
340
- ##
341
- ## Perhaps surprisingly, the #group_by method does /not/ result in any
342
- ## groups in the output table since the result of #group_by is to reduce
343
- ## all groups it finds into a single row, and having a group for each row
344
- ## of the output table would have no use.
345
- ##
346
- ## All the other table-transforming methods reset the boundaries in the new
347
- ## table. For example, #where re-arranges and deletes rows, so the old
348
- ## boundaries would make no sense anyway. Likewise, #union, #intersection,
349
- ## #except, and #join reset the boundaries to their default.
350
- ## ###########################################################################
351
-
352
- public
353
-
354
# Collect the groups of this Table: an array with one element per boundary,
# each element being the array of row hashes in that group.
def groups
  normalize_boundaries
  (0..boundaries.size - 1).map { |k| group_rows(k) }
end
363
-
364
# Record a group boundary at row index k. When k is not given, the boundary
# is placed at the current last row of the table.
def mark_boundary(k = nil)
  boundaries.push(k || size - 1)
end
373
-
374
- protected
375
-
376
# Hand back the array of boundary row indexes. It sits in the protected
# section, so only Table instances can call it.
def boundaries
  @boundaries
end
380
-
381
# Replace the array of boundary row indexes with bounds. It sits in the
# protected section, so only Table instances can call it.
def boundaries=(bounds)
  @boundaries = bounds
end
385
-
386
# Put the boundaries into canonical form and return them. For a non-empty
# table the last row index (size - 1) is added if missing, then the
# boundaries are de-duplicated and sorted. An empty table is left as is.
def normalize_boundaries
  return boundaries if empty?

  last_row = size - 1
  boundaries.push(last_row) unless boundaries.include?(last_row)
  self.boundaries = boundaries.uniq.sort
  boundaries
end
394
-
395
# Add the indexes in bounds, each increased by shift, to the end of this
# table's boundaries. #union_all uses this to carry over the boundaries of
# the second table.
def append_boundaries(bounds, shift: 0)
  shifted = bounds.map { |bound| bound + shift }
  @boundaries = @boundaries + shifted
end
401
-
402
# Find the number of the group that row k belongs to: the position, counted
# from 1, of the first boundary at or after k. Returns 1 when no boundary is
# at or after k.
def row_index_to_group_index(k)
  g_num = boundaries.index { |b_last| k <= b_last }
  g_num ? g_num + 1 : 1
end
410
-
411
# Return the row hashes of the k-th group (zero-based), or an empty array
# when there is no such group. A group runs from the row after the previous
# boundary (or row 0 for the first group) through its own boundary row.
def group_rows(k)
  normalize_boundaries
  return [] if k >= boundaries.size

  first =
    if k.zero?
      0
    else
      boundaries[k - 1] + 1
    end
  rows_range(first, boundaries[k])
end
418
-
419
- ############################################################################
420
- # SQL look-alikes. The following methods are based on SQL equivalents and
421
- # all return a new Table object rather than modifying the table in place.
422
- ############################################################################
423
-
424
- public
425
-
426
# Return a new Table with this Table's rows sorted on the given headers,
# which may be passed as separate arguments or as an array. A header whose
# name ends in '!' sorts in reverse order on that column. The new table gets
# a group boundary after each run of rows whose sort keys are equal.
def order_by(*sort_heads)
  given_heads = [sort_heads].flatten
  strip_bang = ->(h) { h.to_s.sub(/\!\z/, '').to_sym }
  rev_heads = given_heads.select { |h| h.to_s.ends_with?('!') }.map(&strip_bang)
  sort_heads = given_heads.map(&strip_bang)
  sorted_rows = rows.sort do |r1, r2|
    # For a reversed column, swap which row supplies the value to each key.
    key1 = []
    key2 = []
    sort_heads.each do |h|
      left, right = rev_heads.include?(h) ? [r2[h], r1[h]] : [r1[h], r2[h]]
      key1 << left
      key2 << right
    end
    key1 <=> key2
  end
  new_tab = Table.new
  prev_key = nil
  sorted_rows.each_with_index do |nrow, k|
    new_tab << nrow
    key = nrow.fetch_values(*sort_heads)
    # The key changed, so the previous row closed a group.
    new_tab.mark_boundary(k - 1) if prev_key && key != prev_key
    prev_key = key
  end
  new_tab.normalize_boundaries
  new_tab
end
452
-
453
# Return a Table made of the selected column expressions, in the order
# given. Each expression is one of:
#
# 1. a bare header, :old_col, copying a column of this table;
# 2. a hash entry new_col: :old_col, copying a column under a new name;
# 3. a hash entry new_col: 'expression', a string handed to the Evaluator
#    with the old row's values, and any columns already built for the new
#    row, as variables.
#
# Bare headers must come before hash entries. For each row the variables
# also include :__row (the row number, from 1) and :__group (the group
# number); the Evaluator's before-hook assigns them to @row and @group. The
# output table takes over this table's boundaries.
#
# Raises a RuntimeError for an unknown column, or for a hash value that is
# neither a Symbol nor a String.
def select(*cols, **new_cols)
  result = Table.new
  normalize_boundaries
  ev = Evaluator.new(vars: { row: 0, group: 1 },
                     before: '@row = __row; @group = __group')
  rows.each_with_index do |old_row, old_k|
    new_row = cols.each_with_object({}) do |col, picked|
      head = col.as_sym
      raise "Column '#{head}' in select does not exist" unless column?(head)
      picked[head] = old_row[head]
    end
    new_cols.each_pair do |new_head, spec|
      new_head = new_head.as_sym
      vars = old_row.merge(new_row)
      vars[:__row] = old_k + 1
      vars[:__group] = row_index_to_group_index(old_k)
      new_row[new_head] =
        if spec.is_a?(Symbol)
          raise "Column '#{spec}' in select does not exist" unless vars.keys.include?(spec)
          vars[spec]
        elsif spec.is_a?(String)
          ev.evaluate(spec, vars: vars)
        else
          raise 'Hash parameters to select must be a symbol or string'
        end
    end
    result << new_row
  end
  result.boundaries = boundaries
  result.normalize_boundaries
  result
end
498
-
499
# Return a Table containing only the rows for which the where expression
# evaluates as truthy. The expression is converted to a string and handed to
# the Evaluator with the row's values as variables, plus :__row (the row
# number, from 1) and :__group (the group number); the Evaluator's
# before-hook assigns them to @row and @group.
def where(expr)
  expr = expr.to_s
  result = Table.new
  ev = Evaluator.new(vars: { row: 0 },
                     before: '@row = __row; @group = __group')
  rows.each_with_index do |row, k|
    # Merge into a copy: writing the bookkeeping keys into row itself would
    # put :__row and :__group into the row appended to the result.
    vars = row.merge(__row: k + 1, __group: row_index_to_group_index(k))
    result << row if ev.evaluate(expr, vars: vars)
  end
  result.normalize_boundaries
  result
end
515
-
516
# Return a new Table holding this table's rows with duplicate rows removed.
# Boundaries are not carried over.
def distinct
  result = Table.new
  rows.uniq.each { |uniq_row| result << uniq_row }
  result
end
525
-
526
# Alias for #distinct: a new Table with duplicate rows removed.
def uniq
  distinct
end
530
-
531
# Return the union of this table with other: the rows of both, with
# duplicates eliminated. The result uses this table's headers. Both tables
# must have the same number of columns with the same types, or an exception
# is raised.
def union(other)
  set_operation(other, :+, distinct: true, add_boundaries: true)
end
541
-
542
# Return the union of this table with other, keeping duplicates. The result
# uses this table's headers. Both tables must have the same number of
# columns with the same types, or an exception is raised. A group boundary
# is added where the two tables meet, and the boundaries inside each table
# are carried over, with those of other shifted to their new row numbers.
def union_all(other)
  set_operation(other, :+,
                inherit_boundaries: true,
                add_boundaries: true,
                distinct: false)
end
555
-
556
# Return the intersection of this table with other: the rows that appear in
# both, with duplicates eliminated. The result uses this table's headers.
# Both tables must have the same number of columns with the same types, or
# an exception is raised. Resets groups.
def intersect(other)
  # distinct is a keyword parameter of set_operation; passing it
  # positionally raised ArgumentError.
  set_operation(other, :intersect, distinct: true)
end
565
-
566
# Return the intersection of this table with other: the rows that appear in
# both, without eliminating duplicates. The result uses this table's
# headers. Both tables must have the same number of columns with the same
# types, or an exception is raised. Resets groups.
def intersect_all(other)
  # distinct is a keyword parameter of set_operation; passing it
  # positionally raised ArgumentError.
  set_operation(other, :intersect, distinct: false)
end
575
-
576
# Return the set difference between this table and other: the rows of this
# table except those that also appear in other, with duplicates eliminated.
# The result uses this table's headers. Both tables must have the same
# number of columns with the same types, or an exception is raised. Resets
# groups.
def except(other)
  # distinct is a keyword parameter of set_operation; passing it
  # positionally raised ArgumentError.
  set_operation(other, :difference, distinct: true)
end
585
-
586
# Return the set difference between this table and other: the rows of this
# table except those that also appear in other, without eliminating
# duplicates. The result uses this table's headers. Both tables must have
# the same number of columns with the same types, or an exception is
# raised. Resets groups.
def except_all(other)
  # distinct is a keyword parameter of set_operation; passing it
  # positionally raised ArgumentError.
  set_operation(other, :difference, distinct: false)
end
595
-
596
- private
597
-
598
# Combine this table's rows with the rows of other by sending op to this
# table's row array with the other table's rows as the argument. The other
# table's rows are first re-keyed to this table's headers.
#
# - distinct: eliminate duplicate rows from the result.
# - add_boundaries: mark a boundary after the row at this table's last row
#   index.
# - inherit_boundaries: carry over the boundaries of both tables, shifting
#   those of other by this table's size.
#
# Raises a RuntimeError if the tables differ in the number of columns or in
# the column types.
def set_operation(other, op = :+,
                  distinct: true,
                  add_boundaries: false,
                  inherit_boundaries: false)
  if columns.size != other.columns.size
    raise 'Cannot apply a set operation to tables with a different number of columns.'
  end
  if columns.map(&:type) != other.columns.map(&:type)
    raise 'Cannot apply a set operation to tables with different column types.'
  end
  rekeyed_rows = other.rows.map { |r| r.replace_keys(headers) }
  result = Table.new
  combined_rows = rows.send(op, rekeyed_rows)
  combined_rows.each_with_index do |row, k|
    result << row
    result.mark_boundary if k == size - 1 && add_boundaries
  end
  if inherit_boundaries
    result.boundaries = normalize_boundaries
    other.normalize_boundaries
    result.append_boundaries(other.boundaries, shift: size)
  end
  result.normalize_boundaries
  return result.distinct if distinct

  result
end
626
-
627
- public
628
-
629
- # Return a table that joins this table to another based on one or more join
630
- # expressions. There are several possibilities for the join expressions:
631
- #
632
- # 1. If no join expressions are given, the tables will be joined when all
633
- # values with the same name in both tables have the same value, a
634
- # "natural" join. However, if the join type is :cross, the join
635
- # expression will be taken to be 'true'. Otherwise, if there are no
636
- # common column names, an exception will be raised.
637
- #
638
- # 2. If the join expressions are one or more symbols, the join condition
639
- # requires that the values of both tables are equal for all columns named
640
- # by the symbols. A column that appears in both tables can be given
641
- # without modification and will be assumed to require equality on that
642
- # column. If an unmodified symbol is not a name that appears in both
643
- # tables, an exception will be raised. Column names that are unique to
644
- # the first table must have a '_a' appended to the column name and column
645
- # names that are unique to the other table must have a '_b' appended to
646
- # the column name. These disambiguated column names must come in pairs,
647
- # one for the first table and one for the second, and they will imply a
648
- # join condition that the columns must be equal on those columns. Several
649
- # such symbol expressions will require that all such implied pairs are
650
- # equal in order for the join condition to be met.
651
- #
652
- # 3. Finally, a string expression can be given that contains an arbitrary
653
- # ruby expression that will be evaluated for truthiness. Within the
654
- # string, all column names must be disambiguated with the '_a' or '_b'
655
- # modifiers whether they are common to both tables or not. The names of
656
- # the columns in both tables (without the leading ':' for symbols) are
657
- # available as variables within the expression.
658
- #
659
- # The join_type parameter specifies what sort of join is performed, :inner,
660
- # :left, :right, :full, or :cross. The default is an :inner join. The types
661
- # of joins are defined as follows where T1 means this table, the receiver,
662
- # and T2 means other. These descriptions are taken from the Postgresql
663
- # documentation.
664
- #
665
- # - :inner :: For each row R1 of T1, the joined table has a row for each row
666
- # in T2 that satisfies the join condition with R1.
667
- #
668
- # - :left :: First, an inner join is performed. Then, for each row in T1
669
- # that does not satisfy the join condition with any row in T2, a joined
670
- # row is added with null values in columns of T2. Thus, the joined
671
- # table always has at least one row for each row in T1.
672
- #
673
- # - :right :: First, an inner join is performed. Then, for each row in T2
674
- # that does not satisfy the join condition with any row in T1, a joined
675
- # row is added with null values in columns of T1. This is the converse
676
- # of a left join: the result table will always have a row for each row
677
- # in T2.
678
- #
679
- # - :full :: First, an inner join is performed. Then, for each row in T1
680
- # that does not satisfy the join condition with any row in T2, a joined
681
- # row is added with null values in columns of T2. Also, for each row of
682
- # T2 that does not satisfy the join condition with any row in T1, a
683
- # joined row with null values in the columns of T1 is added.
684
- #
685
- # - :cross :: For every possible combination of rows from T1 and T2 (i.e.,
686
- # a Cartesian product), the joined table will contain a row consisting
687
- # of all columns in T1 followed by all columns in T2. If the tables
688
- # have N and M rows respectively, the joined table will have N * M
689
- # rows.
690
- # Resets groups.
691
# The join types accepted by #join.
JOIN_TYPES = %i[inner left right full cross].freeze
692
-
693
# Join this table to other using the join expressions exps and the given
# join_type (one of JOIN_TYPES, :inner by default). Every pair of rows for
# which the join expression evaluates as truthy produces an output row. For
# :left and :full joins, rows of this table that matched nothing are added,
# padded with nils for the other table's headers; for :right and :full
# joins, unmatched rows of other are added, padded with nils for this
# table's headers.
#
# Raises ArgumentError if other is not a Table or join_type is not one of
# JOIN_TYPES.
def join(other, *exps, join_type: :inner)
  raise ArgumentError, 'need other table as first argument to join' unless other.is_a?(Table)
  unless JOIN_TYPES.include?(join_type)
    raise ArgumentError, "join_type may only be: #{JOIN_TYPES.join(', ')}"
  end

  # All-nil rows used to pad the unmatched side of an outer join.
  self_row_nils = headers.each_with_object({}) { |h, nils| nils[h] = nil }
  other_row_nils = other.headers.each_with_object({}) { |h, nils| nils[h] = nil }
  join_expression, other_common_heads = build_join_expression(exps, other, join_type)
  ev = Evaluator.new
  result = Table.new
  other_rows = other.rows
  other_row_matches = Array.new(other_rows.size, false)
  pad_unmatched_self = %i[left full].include?(join_type)
  pad_unmatched_other = %i[right full].include?(join_type)
  rows.each do |self_row|
    self_row_matched = false
    other_rows.each_with_index do |other_row, k|
      locals = build_locals_hash(row_a: self_row, row_b: other_row)
      next unless ev.evaluate(join_expression, vars: locals)

      self_row_matched = other_row_matches[k] = true
      result << build_out_row(row_a: self_row, row_b: other_row,
                              common_heads: other_common_heads,
                              type: join_type)
    end
    next if self_row_matched || !pad_unmatched_self

    result << build_out_row(row_a: self_row, row_b: other_row_nils, type: join_type)
  end
  if pad_unmatched_other
    other_rows.each_with_index do |other_row, k|
      next if other_row_matches[k]

      result << build_out_row(row_a: self_row_nils, row_b: other_row, type: join_type)
    end
  end
  result.normalize_boundaries
  result
end
740
-
741
# Join this table to other with #join's default join type, :inner.
def inner_join(other, *exps)
  join(other, *exps)
end
744
-
745
# Join this table to other with a :left join.
def left_join(other, *exps)
  join(other, *exps, join_type: :left)
end
748
-
749
# Join this table to other with a :right join.
def right_join(other, *exps)
  join(other, *exps, join_type: :right)
end
752
-
753
# Join this table to other with a :full join.
def full_join(other, *exps)
  join(other, *exps, join_type: :full)
end
756
-
757
# Join this table to other with a :cross join; no join expressions are
# passed.
def cross_join(other)
  join(other, join_type: :cross)
end
760
-
761
- private
762
-
763
# Assemble the output row of a join from row_a and row_b. All keys of row_a
# are kept. For an :inner join the row_b keys listed in common_heads are
# left out; for other join types every row_b key is kept. A remaining row_b
# key that is also a row_a key gets '_b' appended so that it does not
# overwrite the row_a value.
def build_out_row(row_a:, row_b:, common_heads: [], type: :inner)
  b_pairs = row_b.to_a
  # An inner join need not repeat the heads that were matched for equality.
  b_pairs = b_pairs.reject { |head, _| common_heads.include?(head) } if type == :inner
  a_heads = row_a.keys
  renamed_b = {}
  b_pairs.each do |head, val|
    out_head = a_heads.include?(head) ? "#{head}_b".to_sym : head
    renamed_b[out_head] = val
  end
  row_a.merge(renamed_b)
end
781
-
782
# Build the variables hash for a join expression: every key of row_a with
# '_a' appended, followed by every key of row_b with '_b' appended, each
# mapped to its original value.
def build_locals_hash(row_a:, row_b:)
  locals = {}
  row_a.to_a.each { |head, val| locals["#{head}_a".to_sym] = val }
  row_b.to_a.each { |head, val| locals["#{head}_b".to_sym] = val }
  locals
end
790
-
791
- # Return an array of two elements: (1) a ruby expression that expresses the
792
- # AND of all join conditions as described in the comment to the #join method
793
- # and (2) the heads from other table that (a) are known to be tested for
794
- # equality with a head in self table and (b) have the same name. Assume that
795
- # the expression will be evaluated in the context of a binding in which the
796
- # local variables are all the headers in the self table with '_a' appended
797
- # and all the headers in the other table with '_b' appended.
798
# Parameters:
#   exps  - Array of join conditions. A Symbol ending in '_a' or '_b' names a
#           column of the self or the other table respectively and must be
#           paired with a qualified Symbol from the opposite table; any other
#           Symbol must be a column present in both tables; a String is
#           wrapped in parentheses and used verbatim.
#   other - the other table; must respond to #headers (and to whatever
#           ensure_common_types! needs from it).
#   type  - the join type; :cross short-circuits to an always-true condition.
# Raises ArgumentError for unknown columns, for a qualified Symbol that is not
# completed by a qualified Symbol from the opposite table, for an unqualified
# Symbol not common to both tables, and for an element that is neither a
# Symbol nor a String.
def build_join_expression(exps, other, type)
  return ['true', []] if type == :cross

  a_heads = headers
  b_heads = other.headers
  common_heads = a_heads & b_heads

  if exps.empty?
    if common_heads.empty?
      raise ArgumentError,
            'A non-cross join with no common column names requires join expressions'
    end
    # A natural join on all common heads
    common_heads.each do |h|
      ensure_common_types!(self_h: h, other_h: h, other: other)
    end
    nat_exp = common_heads.map { |h| "(#{h}_a == #{h}_b)" }.join(' && ')
    return [nat_exp, common_heads]
  end

  # We have expressions to evaluate
  and_conds = []
  b_common_heads = []
  partial_result = nil # the open "(x_a == " text while a pair is half-built
  last_sym = nil       # head of the most recent qualified symbol
  last_side = nil      # :a or :b, the table that last_sym came from
  exps.each do |exp|
    case exp
    when Symbol
      case exp.to_s.clean
      when /\A(.*)_([ab])\z/
        head = Regexp.last_match(1).to_sym
        side = Regexp.last_match(2).to_sym
        if side == :a
          raise ArgumentError, "no column '#{head}' in table" unless a_heads.include?(head)
        else
          raise ArgumentError, "no column '#{head}' in second table" unless b_heads.include?(head)
        end
        if partial_result
          # Second of a pair: it has to come from the opposite table,
          # otherwise the type check below would look the first column up
          # in the wrong table.
          if side == last_side
            raise ArgumentError,
                  "must follow '#{last_sym}' by qualified exp from the other table"
          end
          self_h, other_h = side == :a ? [head, last_sym] : [last_sym, head]
          ensure_common_types!(self_h: self_h, other_h: other_h, other: other)
          and_conds << "#{partial_result}#{head}_#{side})"
          partial_result = nil
        else
          # First of a pair of _a or _b
          partial_result = "(#{head}_#{side} == "
        end
        b_common_heads << head if side == :b
        last_sym = head
        last_side = side
      else
        # No modifier, so must be one of the common columns
        unless partial_result.nil?
          # We were expecting the second of a modified pair, but got an
          # unmodified symbol instead.
          raise ArgumentError,
                "must follow '#{last_sym}' by qualified exp from the other table"
        end
        unless common_heads.include?(exp)
          raise ArgumentError, "unqualified column '#{exp}' must occur in both tables"
        end
        ensure_common_types!(self_h: exp, other_h: exp, other: other)
        and_conds << "(#{exp}_a == #{exp}_b)"
        b_common_heads << exp
      end
    when String
      # A string expression in which all column references must be qualified.
      and_conds << "(#{exp})"
    else
      raise ArgumentError, "invalid join expression '#{exp}' of class #{exp.class}"
    end
  end
  # A qualified symbol left without its partner would otherwise be dropped
  # silently, yielding an empty or truncated join condition.
  if partial_result
    raise ArgumentError,
          "must follow '#{last_sym}' by qualified exp from the other table"
  end
  [and_conds.join(' && '), b_common_heads]
end
886
-
887
- # Raise an exception unless self_h in this table and other_h in other table
888
- # have the same types.
889
# Parameters:
#   self_h  - header of the column in this table.
#   other_h - header of the column in +other+.
#   other   - the other table; must respond to #column.
# Returns self when the two columns report equal types; raises ArgumentError
# otherwise.
def ensure_common_types!(self_h:, other_h:, other:)
  return self if column(self_h).type == other.column(other_h).type

  raise ArgumentError,
        "type of column '#{self_h}' does not match type of column '#{other_h}'"
end
896
-
897
- ###################################################################################
898
- # Group By
899
- ###################################################################################
900
-
901
- public
902
-
903
- # Return a Table with a single row for each group of rows in the input table
904
- # where the value of all columns named as simple symbols are equal. All
905
- # other columns are set to the result of aggregating the values of that
906
- # column within the group according to an aggregate function (:count, :sum,
907
- # :min, :max, etc.), which defaults to the :first function, giving the value
908
- # of that column for the first row in the group. You can specify a
909
- # different aggregate function for a column by adding a hash parameter with
910
- # the column as the key and a symbol for the aggregate function as the
911
- # value. For example, consider the following call:
912
- #
913
- # tab.group_by(:date, :code, :price, shares: :sum)
914
- #
915
- # The first three parameters are simple symbols, so the table is divided
916
- # into groups of rows in which the value of :date, :code, and :price are
917
- # equal. The shares: hash parameter is set to the aggregate function :sum,
918
- # so it will appear in the result as the sum of all the :shares values in
919
- # each group. Any non-aggregate columns that have no aggregate function set
920
- # default to using the aggregate function :first. Because of the way Ruby
921
- # parses parameters to a method call, all the grouping symbols must appear
922
- # first in the parameter list before any hash parameters.
923
# group_cols - headers whose values define the groups.
# agg_cols   - header => aggregate-function pairs; every header of this
#              table that is in neither list is added with :first.
# Returns a new Table with one row per group. Note that each aggregated
# column appears in the result under the name built by row_from_group, i.e.
# the aggregate function name, an underscore, and the original header.
def group_by(*group_cols, **agg_cols)
  # Columns the caller did not mention fall back to the :first aggregate.
  (headers - group_cols - agg_cols.keys).each do |head|
    agg_cols[head] = :first
  end

  # Sort on the grouping columns, then bucket the rows by their values in
  # those columns.
  buckets = order_by(group_cols).rows.group_by do |row|
    group_cols.map { |col| row[col] }
  end

  summary = Table.new
  buckets.each_value do |members|
    summary << row_from_group(members, group_cols, agg_cols)
  end
  summary.normalize_boundaries
  summary
end
941
-
942
- private
943
-
944
# Collapse the rows of one group into a single output row (a Hash).
#
# rows     - the rows (indexable by header) belonging to the group.
# grp_cols - grouping headers; their values are copied from the first row
#            under the same header.
# agg_cols - header => aggregate-function pairs; each result is stored under
#            a key built from the function name, an underscore, and the
#            header (e.g. sum + shares gives "sum_shares", passed to as_sym).
def row_from_group(rows, grp_cols, agg_cols)
  summary = grp_cols.each_with_object({}) do |head, acc|
    acc[head] = rows.first[head]
  end
  agg_cols.each_pair do |head, func|
    values = rows.map { |row| row[head] }
    out_head = "#{func}_#{head}".as_sym
    # Let a throw-away Column do the aggregation by calling the method named
    # by func on it.
    summary[out_head] = Column.new(header: head, items: values).send(func)
  end
  summary
end
957
-
958
- ############################################################################
959
- # Table construction methods.
960
- ############################################################################
961
-
962
- public
963
-
964
- # Add a row represented by a Hash having the headers as keys. If mark is
965
- # true, mark this row as a boundary. All tables should be built ultimately
966
- # using this method as a primitive.
967
# row  - an object responding to each_pair that yields header/value pairs.
# mark - when true, size - 1 is recorded in @boundaries after the row's
#        values have been appended (presumably the index of the new row;
#        confirm against #size).
# Returns self so that calls can be chained.
def add_row(row, mark: false)
  row.each_pair do |head, value|
    sym = head.as_sym
    # Create the column on first sight of this header, then append the cell.
    columns << Column.new(header: head) unless column?(head)
    column(sym) << value
  end
  return self unless mark

  @boundaries << (size - 1)
  self
end
976
-
977
- # Add a row without marking.
978
# Shovel operator: same as add_row with marking switched off. Returns
# whatever add_row returns.
def <<(row)
  add_row(row, mark: false)
end
981
-
982
# Append the column object +col+ to this table's columns and return self.
# Raises a RuntimeError if column? reports that a column with col's header
# is already present.
def add_column(col)
  if column?(col.header)
    raise "Table already has a column with header '#{col.header}'"
  end

  columns << col
  self
end
987
- end
988
- end