red_amber 0.2.1 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +15 -0
  3. data/CHANGELOG.md +170 -20
  4. data/Gemfile +4 -2
  5. data/README.md +121 -302
  6. data/benchmark/basic.yml +79 -0
  7. data/benchmark/combine.yml +63 -0
  8. data/benchmark/drop_nil.yml +15 -3
  9. data/benchmark/group.yml +33 -0
  10. data/benchmark/reshape.yml +27 -0
  11. data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
  12. data/benchmark/rover/flights.yml +23 -0
  13. data/benchmark/rover/penguins.yml +23 -0
  14. data/benchmark/rover/planes.yml +23 -0
  15. data/benchmark/rover/weather.yml +23 -0
  16. data/doc/DataFrame.md +611 -318
  17. data/doc/Vector.md +31 -36
  18. data/doc/image/basic_verbs.png +0 -0
  19. data/doc/image/dataframe/assign.png +0 -0
  20. data/doc/image/dataframe/assign_operation.png +0 -0
  21. data/doc/image/dataframe/drop.png +0 -0
  22. data/doc/image/dataframe/join.png +0 -0
  23. data/doc/image/dataframe/pick.png +0 -0
  24. data/doc/image/dataframe/pick_operation.png +0 -0
  25. data/doc/image/dataframe/remove.png +0 -0
  26. data/doc/image/dataframe/rename.png +0 -0
  27. data/doc/image/dataframe/rename_operation.png +0 -0
  28. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  29. data/doc/image/dataframe/set_and_bind.png +0 -0
  30. data/doc/image/dataframe/slice.png +0 -0
  31. data/doc/image/dataframe/slice_operation.png +0 -0
  32. data/doc/image/dataframe_model.png +0 -0
  33. data/doc/image/group_operation.png +0 -0
  34. data/doc/image/replace-if_then.png +0 -0
  35. data/doc/image/reshaping_dataframe.png +0 -0
  36. data/doc/image/screenshot.png +0 -0
  37. data/doc/image/vector/binary_element_wise.png +0 -0
  38. data/doc/image/vector/unary_aggregation.png +0 -0
  39. data/doc/image/vector/unary_aggregation_w_option.png +0 -0
  40. data/doc/image/vector/unary_element_wise.png +0 -0
  41. data/lib/red_amber/data_frame.rb +16 -42
  42. data/lib/red_amber/data_frame_combinable.rb +283 -0
  43. data/lib/red_amber/data_frame_displayable.rb +58 -3
  44. data/lib/red_amber/data_frame_loadsave.rb +36 -0
  45. data/lib/red_amber/data_frame_reshaping.rb +8 -6
  46. data/lib/red_amber/data_frame_selectable.rb +9 -9
  47. data/lib/red_amber/data_frame_variable_operation.rb +27 -21
  48. data/lib/red_amber/group.rb +100 -17
  49. data/lib/red_amber/helper.rb +20 -30
  50. data/lib/red_amber/vector.rb +56 -30
  51. data/lib/red_amber/vector_functions.rb +0 -8
  52. data/lib/red_amber/vector_selectable.rb +9 -1
  53. data/lib/red_amber/vector_updatable.rb +61 -63
  54. data/lib/red_amber/version.rb +1 -1
  55. data/lib/red_amber.rb +2 -0
  56. data/red_amber.gemspec +1 -1
  57. metadata +32 -11
  58. data/doc/examples_of_red_amber.ipynb +0 -8979
data/doc/Vector.md CHANGED
@@ -7,7 +7,7 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
7
7
  ### Create from a column in a DataFrame
8
8
 
9
9
  ```ruby
10
- df = RedAmber::DataFrame.new(x: [1, 2, 3])
10
+ df = DataFrame.new(x: [1, 2, 3])
11
11
  df[:x]
12
12
  # =>
13
13
  #<RedAmber::Vector(:uint8, size=3):0x000000000000f4ec>
@@ -17,13 +17,16 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
17
17
  ### New from an Array
18
18
 
19
19
  ```ruby
20
- vector = RedAmber::Vector.new([1, 2, 3])
20
+ vector = Vector.new([1, 2, 3])
21
21
  # or
22
- vector = RedAmber::Vector.new(1, 2, 3)
22
+ vector = Vector.new(1, 2, 3)
23
23
  # or
24
- vector = RedAmber::Vector.new(1..3)
24
+ vector = Vector.new(1..3)
25
25
  # or
26
- vector = RedAmber::Vector.new(Arrow::Array([1, 2, 3])
26
+ vector = Vector.new(Arrow::Array.new([1, 2, 3]))
27
+ # or
28
+ require 'arrow-numo-narray'
29
+ vector = Vector.new(Numo::Int8[1, 2, 3])
27
30
 
28
31
  # =>
29
32
  #<RedAmber::Vector(:uint8, size=3):0x000000000000f514>
@@ -61,7 +64,7 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
61
64
 
62
65
  ### `type_class`
63
66
 
64
- ### `each`
67
+ ### `each`, `map`, `collect`
65
68
 
66
69
  If block is not given, returns Enumerator.
67
70
 
@@ -78,7 +81,7 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
78
81
  - `limit` sets size limit to display a long array.
79
82
 
80
83
  ```ruby
81
- vector = RedAmber::Vector.new((1..50).to_a)
84
+ vector = Vector.new((1..50).to_a)
82
85
  # =>
83
86
  #<RedAmber::Vector(:uint8, size=50):0x000000000000f528>
84
87
  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, ... ]
@@ -95,8 +98,8 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
95
98
  - Negative index is also OK like the Ruby's primitive Array.
96
99
 
97
100
  ```ruby
98
- array = RedAmber::Vector.new(%w[A B C D E])
99
- indices = RedAmber::Vector.new([0.1, -0.5, -5.1])
101
+ array = Vector.new(%w[A B C D E])
102
+ indices = Vector.new([0.1, -0.5, -5.1])
100
103
  array.take(indices)
101
104
  # or
102
105
  array[indices]
@@ -106,7 +109,7 @@ array[indices]
106
109
  ["A", "E", "A"]
107
110
  ```
108
111
 
109
- ### `filter(booleans)`, `[](booleans)`
112
+ ### `filter(booleans)`, `select(booleans)`, `[](booleans)`
110
113
 
111
114
  - Acceptable class for booleans:
112
115
  - An array of true, false, or nil
@@ -114,7 +117,7 @@ array[indices]
114
117
  - Arrow::BooleanArray
115
118
 
116
119
  ```ruby
117
- array = RedAmber::Vector.new(%w[A B C D E])
120
+ array = Vector.new(%w[A B C D E])
118
121
  booleans = [true, false, nil, false, true]
119
122
  array.filter(booleans)
120
123
  # or
@@ -124,6 +127,7 @@ array[booleans]
124
127
  #<RedAmber::Vector(:string, size=2):0x000000000000f21c>
125
128
  ["A", "E"]
126
129
  ```
130
+ `filter` and `select` also accept a block.
127
131
 
128
132
  ## Functions
129
133
 
@@ -158,7 +162,7 @@ Options can be used as follows.
158
162
  See the [document of C++ function](https://arrow.apache.org/docs/cpp/compute.html) for detail.
159
163
 
160
164
  ```ruby
161
- double = RedAmber::Vector.new([1, 0/0.0, -1/0.0, 1/0.0, nil, ""])
165
+ double = Vector.new([1, 0/0.0, -1/0.0, 1/0.0, nil, ""])
162
166
  #=>
163
167
  #<RedAmber::Vector(:double, size=6):0x000000000000f910>
164
168
  [1.0, NaN, -Infinity, Infinity, nil, 0.0]
@@ -168,7 +172,7 @@ double.count(mode: :only_valid) #=> 5, default
168
172
  double.count(mode: :only_null) #=> 1
169
173
  double.count(mode: :all) #=> 6
170
174
 
171
- boolean = RedAmber::Vector.new([true, true, nil])
175
+ boolean = Vector.new([true, true, nil])
172
176
  #=>
173
177
  #<RedAmber::Vector(:boolean, size=3):0x000000000000f924>
174
178
  [true, true, nil]
@@ -215,7 +219,7 @@ Examples of options for `#round`;
215
219
  - `round_mode` Specify rounding mode.
216
220
 
217
221
  ```ruby
218
- double = RedAmber::Vector.new([15.15, 2.5, 3.5, -4.5, -5.5])
222
+ double = Vector.new([15.15, 2.5, 3.5, -4.5, -5.5])
219
223
  # => [15.15, 2.5, 3.5, -4.5, -5.5]
220
224
  double.round
221
225
  # => [15.0, 2.0, 4.0, -4.0, -6.0]
@@ -293,7 +297,7 @@ double.round(n_digits: -1)
293
297
  array = [0.0/0, Float::NAN]
294
298
  array.tally #=> {NaN=>1, NaN=>1}
295
299
 
296
- vector = RedAmber::Vector.new(array)
300
+ vector = Vector.new(array)
297
301
  vector.tally #=> {NaN=>2}
298
302
  vector.value_counts #=> {NaN=>2}
299
303
  ```
@@ -310,7 +314,7 @@ double.round(n_digits: -1)
310
314
  ## Coerce
311
315
 
312
316
  ```ruby
313
- vector = RedAmber::Vector.new(1,2,3)
317
+ vector = Vector.new(1,2,3)
314
318
  # =>
315
319
  #<RedAmber::Vector(:uint8, size=3):0x00000000000decc4>
316
320
  [1, 2, 3]
@@ -340,12 +344,13 @@ vector * -1
340
344
  - Accepts Scalar, Range of Integer, Vector, Array, Arrow::Array as a specifier
341
345
  - Accepts Scalar, Vector, Array and Arrow::Array as a replacer.
342
346
  - Boolean specifiers specify the position of replacer in true.
347
+ - If booleans.any is false, no replacement happens and self is returned.
343
348
  - Index specifiers specify the position of replacer in indices.
344
349
  - replacer specifies the values to be replaced.
345
350
  - The number of true in booleans must be equal to the length of replacer
346
351
 
347
352
  ```ruby
348
- vector = RedAmber::Vector.new([1, 2, 3])
353
+ vector = Vector.new([1, 2, 3])
349
354
  booleans = [true, false, true]
350
355
  replacer = [4, 5]
351
356
  vector.replace(booleans, replacer)
@@ -379,7 +384,7 @@ vector.replace(booleans, replacer)
379
384
  ```ruby
380
385
  booleans = [true, false, nil]
381
386
  replacer = -1
382
- vec.replace(booleans, replacer)
387
+ vector.replace(booleans, replacer)
383
388
  =>
384
389
  #<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
385
390
  [-1, 2, nil]
@@ -390,17 +395,7 @@ vec.replace(booleans, replacer)
390
395
  ```ruby
391
396
  booleans = [true, false, true]
392
397
  replacer = [nil]
393
- vec.replace(booleans, replacer)
394
- =>
395
- #<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
396
- [nil, 2, nil]
397
- ```
398
-
399
- - If no replacer specified, it is same as to specify nil.
400
-
401
- ```ruby
402
- booleans = [true, false, true]
403
- vec.replace(booleans)
398
+ vector.replace(booleans, replacer)
404
399
  =>
405
400
  #<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
406
401
  [nil, 2, nil]
@@ -409,7 +404,7 @@ vec.replace(booleans)
409
404
  - An example to replace 'NA' to nil.
410
405
 
411
406
  ```ruby
412
- vector = RedAmber::Vector.new(['A', 'B', 'NA'])
407
+ vector = Vector.new(['A', 'B', 'NA'])
413
408
  vector.replace(vector == 'NA', nil)
414
409
  # =>
415
410
  #<RedAmber::Vector(:string, size=3):0x000000000000f8ac>
@@ -421,7 +416,7 @@ vector.replace(vector == 'NA', nil)
421
416
  Specified indices are used 'as sorted'. Position in indices and replacer may not have correspondence.
422
417
 
423
418
  ```ruby
424
- vector = RedAmber::Vector.new([1, 2, 3])
419
+ vector = Vector.new([1, 2, 3])
425
420
  indices = [2, 1]
426
421
  replacer = [4, 5]
427
422
  vector.replace(indices, replacer)
@@ -437,7 +432,7 @@ Propagate the last valid observation forward (or backward).
437
432
  Or preserve nil if all previous values are nil or at the end.
438
433
 
439
434
  ```ruby
440
- integer = RedAmber::Vector.new([0, 1, nil, 3, nil])
435
+ integer = Vector.new([0, 1, nil, 3, nil])
441
436
  integer.fill_nil_forward
442
437
  # =>
443
438
  #<RedAmber::Vector(:uint8, size=5):0x000000000000f960>
@@ -459,7 +454,7 @@ Choose values based on self. Self must be a boolean Vector.
459
454
  This example will normalize negative indices to positive ones.
460
455
 
461
456
  ```ruby
462
- indices = RedAmber::Vector.new([1, -1, 3, -4])
457
+ indices = Vector.new([1, -1, 3, -4])
463
458
  array_size = 10
464
459
  normalized_indices = (indices < 0).if_else(indices + array_size, indices)
465
460
 
@@ -474,7 +469,7 @@ For each element in self, return true if it is found in given `values`, false ot
474
469
  By default, nulls are matched against the value set. (This will be changed in SetLookupOptions: not implemented.)
475
470
 
476
471
  ```ruby
477
- vector = RedAmber::Vector.new %W[A B C D]
472
+ vector = Vector.new %W[A B C D]
478
473
  values = ['A', 'C', 'X']
479
474
  vector.is_in(values)
480
475
 
@@ -486,7 +481,7 @@ vector.is_in(values)
486
481
  `values` are casted to the same Class of Vector.
487
482
 
488
483
  ```ruby
489
- vector = RedAmber::Vector.new([1, 2, 255])
484
+ vector = Vector.new([1, 2, 255])
490
485
  vector.is_in(1, -1)
491
486
 
492
487
  # =>
@@ -499,7 +494,7 @@ vector.is_in(1, -1)
499
494
  Shift vector's values by specified `amount`. Shifted space is filled by value `fill`.
500
495
 
501
496
  ```ruby
502
- vector = RedAmber::Vector.new([1, 2, 3, 4, 5])
497
+ vector = Vector.new([1, 2, 3, 4, 5])
503
498
  vector.shift
504
499
 
505
500
  # =>
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -5,8 +5,10 @@ module RedAmber
5
5
  # Variable @table holds an Arrow::Table object.
6
6
  class DataFrame
7
7
  # mix-in
8
+ include DataFrameCombinable
8
9
  include DataFrameDisplayable
9
10
  include DataFrameIndexable
11
+ include DataFrameLoadSave
10
12
  include DataFrameReshaping
11
13
  include DataFrameSelectable
12
14
  include DataFrameVariableOperation
@@ -37,10 +39,15 @@ module RedAmber
37
39
  # DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
38
40
  # returns empty DataFrame
39
41
  @table = Arrow::Table.new({}, [])
42
+ in [->(x) { x.respond_to?(:to_arrow) } => arrowable]
43
+ table = arrowable.to_arrow
44
+ unless table.is_a?(Arrow::Table)
45
+ raise DataFrameTypeError,
46
+ "to_arrow must return an Arrow::Table but #{table.class}: #{arrowable}"
47
+ end
48
+ @table = table
40
49
  in [Arrow::Table => table]
41
50
  @table = table
42
- in [DataFrame => dataframe]
43
- @table = dataframe.table
44
51
  in [rover_or_hash]
45
52
  begin
46
53
  # Accepts Rover::DataFrame or Hash
@@ -52,10 +59,9 @@ module RedAmber
52
59
  @table = Arrow::Table.new(*args)
53
60
  end
54
61
  name_unnamed_keys
55
- end
56
62
 
57
- def self.load(path, options = {})
58
- DataFrame.new(Arrow::Table.load(path, options))
63
+ duplicated_keys = keys.tally.select { |_k, v| v > 1 }.keys
64
+ raise DataFrameArgumentError, "duplicate keys: #{duplicated_keys}" unless duplicated_keys.empty?
59
65
  end
60
66
 
61
67
  attr_reader :table
@@ -64,18 +70,15 @@ module RedAmber
64
70
  @table
65
71
  end
66
72
 
67
- def save(output, options = {})
68
- @table.save(output, options)
69
- end
70
-
71
73
  # Returns the number of rows.
72
74
  #
73
75
  # @return [Integer] Number of rows.
74
76
  def size
75
77
  @table.n_rows
76
78
  end
77
- alias_method :n_rows, :size
79
+ alias_method :n_records, :size
78
80
  alias_method :n_obs, :size
81
+ alias_method :n_rows, :size
79
82
 
80
83
  # Returns the number of columns.
81
84
  #
@@ -83,8 +86,9 @@ module RedAmber
83
86
  def n_keys
84
87
  @table.n_columns
85
88
  end
86
- alias_method :n_cols, :n_keys
89
+ alias_method :n_variables, :n_keys
87
90
  alias_method :n_vars, :n_keys
91
+ alias_method :n_cols, :n_keys
88
92
 
89
93
  # Returns the numbers of rows and columns.
90
94
  #
@@ -171,7 +175,7 @@ module RedAmber
171
175
  # - indices(1) #=> [1, 2, 3, 4, 5]
172
176
  # - indices('a') #=> ['a', 'b', 'c', 'd', 'e']
173
177
  def indices(start = 0)
174
- (start..).take(size)
178
+ Vector.new((start..).take(size))
175
179
  end
176
180
  alias_method :indexes, :indices
177
181
 
@@ -215,17 +219,6 @@ module RedAmber
215
219
  Rover::DataFrame.new(to_h)
216
220
  end
217
221
 
218
- def to_iruby
219
- require 'iruby'
220
- return ['text/plain', '(empty DataFrame)'] if empty?
221
-
222
- if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table') == 'TDR'
223
- size <= 5 ? ['text/plain', tdr_str(tally: 0)] : ['text/plain', tdr_str]
224
- else
225
- ['text/html', html_table]
226
- end
227
- end
228
-
229
222
  def group(*group_keys, &block)
230
223
  g = Group.new(self, group_keys)
231
224
  g = g.summarize(&block) if block
@@ -260,25 +253,6 @@ module RedAmber
260
253
  ary[%i[variables keys vectors].index(var)]
261
254
  end
262
255
 
263
- def html_table
264
- reduced = size > 8 ? self[0..4, -4..-1] : self
265
-
266
- converted = reduced.assign do
267
- vectors.select.with_object({}) do |vector, assigner|
268
- if vector.has_nil?
269
- assigner[vector.key] = vector.to_a.map do |e|
270
- e = e.nil? ? '<i>(nil)</i>' : e.to_s # nil
271
- e = '""' if e.empty? # empty string
272
- e.sub(/(\s+)/, '"\1"') # blank spaces
273
- end
274
- end
275
- end
276
- end
277
-
278
- html = IRuby::HTML.table(converted.to_h, maxrows: 8, maxcols: 15)
279
- "#{self.class} <#{size} x #{n_keys} vector#{pl(n_keys)}> #{html}"
280
- end
281
-
282
256
  def name_unnamed_keys
283
257
  return unless @table[:'']
284
258
 
@@ -0,0 +1,283 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # mix-in for the class DataFrame
5
+ module DataFrameCombinable
6
+ # Concatenate other dataframe onto the bottom.
7
+ #
8
+ # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
9
+ # DataFrame/Table to concatenate onto the bottom of self.
10
+ # @return [DataFrame]
11
+ # Concatenated dataframe.
12
+ def concatenate(*other)
13
+ case other
14
+ in [] | [nil] | [[]]
15
+ return self
16
+ in [Array => array]
17
+ # Nop
18
+ else
19
+ array = other
20
+ end
21
+
22
+ table_array = array.map do |e|
23
+ case e
24
+ when Arrow::Table
25
+ e
26
+ when DataFrame
27
+ e.table
28
+ else
29
+ raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
30
+ end
31
+ end
32
+
33
+ DataFrame.new(table.concatenate(table_array))
34
+ end
35
+
36
+ alias_method :concat, :concatenate
37
+ alias_method :bind_rows, :concatenate
38
+
39
+ # Merge the columns of other DataFrames or Tables into self.
40
+ # - Self and other must have same size.
41
+ # - Self and other do not share the same key.
42
+ # - If they share any keys, raise Error.
43
+ # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
44
+ # DataFrame/Table to merge.
45
+ # @return [DataFrame]
46
+ # Merged dataframe.
47
+ def merge(*other)
48
+ case other
49
+ in [] | [nil] | [[]]
50
+ return self
51
+ in [Array => array]
52
+ # Nop
53
+ else
54
+ array = other
55
+ end
56
+
57
+ hash = array.each_with_object({}) do |e, h|
58
+ df =
59
+ case e
60
+ when Arrow::Table
61
+ DataFrame.new(e)
62
+ when DataFrame
63
+ e
64
+ else
65
+ raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
66
+ end
67
+
68
+ raise DataFrameArgumentError, "#{e} do not have same size as self" if size != df.size
69
+
70
+ k = keys.intersection(df.keys).any?
71
+ raise DataFrameArgumentError, "There are some shared keys: #{k}" if k
72
+
73
+ h.merge!(df.to_h)
74
+ end
75
+
76
+ assign(hash)
77
+ end
78
+
79
+ alias_method :bind_cols, :merge
80
+
81
+ # Mutating joins
82
+
83
+ # Join data, leaving only the matching records.
84
+ #
85
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
86
+ # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
87
+ # @return [DataFrame] Joined dataframe.
88
+ #
89
+ def inner_join(other, join_keys = nil, suffix: '.1')
90
+ join(other, join_keys, type: :inner, suffix: suffix)
91
+ end
92
+
93
+ # Join data, leaving all records.
94
+ #
95
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
96
+ # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
97
+ # @return [DataFrame] Joined dataframe.
98
+ #
99
+ def full_join(other, join_keys = nil, suffix: '.1')
100
+ join(other, join_keys, type: :full_outer, suffix: suffix)
101
+ end
102
+
103
+ alias_method :outer_join, :full_join
104
+
105
+ # Join matching values to self from other.
106
+ #
107
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
108
+ # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
109
+ # @return [DataFrame] Joined dataframe.
110
+ #
111
+ def left_join(other, join_keys = nil, suffix: '.1')
112
+ join(other, join_keys, type: :left_outer, suffix: suffix)
113
+ end
114
+
115
+ # Join matching values from self to other.
116
+ #
117
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
118
+ # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
119
+ # @return [DataFrame] Joined dataframe.
120
+ #
121
+ def right_join(other, join_keys = nil, suffix: '.1')
122
+ join(other, join_keys, type: :right_outer, suffix: suffix)
123
+ end
124
+
125
+ # Filtering joins
126
+
127
+ # Return records of self that have a match in other.
128
+ #
129
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
130
+ # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
131
+ # @return [DataFrame] Joined dataframe.
132
+ #
133
+ def semi_join(other, join_keys = nil, suffix: '.1')
134
+ join(other, join_keys, type: :left_semi, suffix: suffix)
135
+ end
136
+
137
+ # Return records of self that do not have a match in other.
138
+ #
139
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
140
+ # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
141
+ # @return [DataFrame] Joined dataframe.
142
+ #
143
+ def anti_join(other, join_keys = nil, suffix: '.1')
144
+ join(other, join_keys, type: :left_anti, suffix: suffix)
145
+ end
146
+
147
+ # Set operations
148
+
149
+ # Check if set operation with self and other is possible.
150
+ #
151
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be checked with self.
152
+ # @return [Boolean] true if set operation is possible.
153
+ #
154
+ def set_operable?(other) # rubocop:disable Naming/AccessorMethodName
155
+ other = DataFrame.new(other) if other.is_a?(Arrow::Table)
156
+ keys == other.keys
157
+ end
158
+
159
+ # Select records appearing in both self and other.
160
+ #
161
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
162
+ # @return [DataFrame] Joined dataframe.
163
+ #
164
+ def intersect(other)
165
+ other = DataFrame.new(other) if other.is_a?(Arrow::Table)
166
+ raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
167
+
168
+ join(other, keys, type: :inner)
169
+ end
170
+
171
+ # Select records appearing in self or other.
172
+ #
173
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
174
+ # @return [DataFrame] Joined dataframe.
175
+ #
176
+ def union(other)
177
+ other = DataFrame.new(other) if other.is_a?(Arrow::Table)
178
+ raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
179
+
180
+ join(other, keys, type: :full_outer)
181
+ end
182
+
183
+ # Select records appearing in self but not in other.
184
+ #
185
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
186
+ # @return [DataFrame] Joined dataframe.
187
+ #
188
+ def difference(other)
189
+ other = DataFrame.new(other) if other.is_a?(Arrow::Table)
190
+ raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
191
+
192
+ join(other, keys, type: :left_anti)
193
+ end
194
+
195
+ alias_method :setdiff, :difference
196
+
197
+ # Undocumented. It is preferable to call specific methods.
198
+
199
+ # Join other dataframe
200
+ #
201
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
202
+ # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
203
+ # @return [DataFrame] Joined dataframe.
204
+ #
205
+ # :type is one of
206
+ # :left_semi, :right_semi, :left_anti, :right_anti, :inner, :left_outer, :right_outer, :full_outer.
207
+ def join(other, join_keys = nil, type: :inner, suffix: '.1', left_outputs: nil, right_outputs: nil)
208
+ case other
209
+ when DataFrame
210
+ # Nop
211
+ when Arrow::Table
212
+ other = DataFrame.new(other)
213
+ else
214
+ raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
215
+ end
216
+
217
+ # Support natural keys (implicit common keys)
218
+ natural_keys = keys.intersection(other.keys)
219
+ raise DataFrameArgumentError, "#{join_keys} are not common keys" if natural_keys.empty?
220
+
221
+ join_keys =
222
+ if join_keys
223
+ Array(join_keys).map(&:to_sym)
224
+ else
225
+ natural_keys
226
+ end
227
+ return self if join_keys.empty?
228
+
229
+ # Support partial join_keys (common key other than join_key will be renamed with suffix)
230
+ remainer_keys = natural_keys - join_keys
231
+ unless remainer_keys.empty?
232
+ renamer = remainer_keys.each_with_object({}) do |key, hash|
233
+ new_key = nil
234
+ loop do
235
+ new_key = "#{key}#{suffix}".to_sym
236
+ break unless keys.include?(new_key)
237
+
238
+ s = suffix.succ
239
+ raise DataFrameArgumentError, "suffix #{suffix} is invalid" if s == suffix
240
+
241
+ suffix = s
242
+ end
243
+ hash[key] = new_key
244
+ end
245
+ other = other.rename(renamer)
246
+ end
247
+
248
+ # Red Arrow's #join returns duplicated join_keys from self and other as of v9.0.0 .
249
+ # Temporarily merge key vectors here as a workaround.
250
+ table_output =
251
+ table.join(other.table, join_keys, type: type, left_outputs: left_outputs, right_outputs: right_outputs)
252
+ left_indexes = [*0...n_keys]
253
+ right_indexes = [*((other.keys - join_keys).map { |key| other.keys.index(key) + n_keys })]
254
+
255
+ case type
256
+ when :left_semi, :left_anti, :right_semi, :right_anti
257
+ return DataFrame.new(table_output)
258
+ else
259
+ selected_indexes = left_indexes.concat(right_indexes)
260
+ end
261
+ merged_columns = join_keys.map do |key|
262
+ i = keys.index(key)
263
+ merge_column(table_output[i], table_output[n_keys + i], type)
264
+ end
265
+ DataFrame.new(table_output[selected_indexes])
266
+ .assign(*join_keys) { merged_columns }
267
+ end
268
+
269
+ private
270
+
271
+ def merge_column(column1, column2, type)
272
+ a1 = column1.to_a
273
+ a2 = column2.to_a
274
+ if type == :full_outer
275
+ a1.zip(a2).map { |x, y| x || y }
276
+ elsif type.start_with?('right')
277
+ a2
278
+ else # :inner or :left-*
279
+ a1
280
+ end
281
+ end
282
+ end
283
+ end