red_amber 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +15 -0
  3. data/CHANGELOG.md +170 -20
  4. data/Gemfile +4 -2
  5. data/README.md +121 -302
  6. data/benchmark/basic.yml +79 -0
  7. data/benchmark/combine.yml +63 -0
  8. data/benchmark/drop_nil.yml +15 -3
  9. data/benchmark/group.yml +33 -0
  10. data/benchmark/reshape.yml +27 -0
  11. data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
  12. data/benchmark/rover/flights.yml +23 -0
  13. data/benchmark/rover/penguins.yml +23 -0
  14. data/benchmark/rover/planes.yml +23 -0
  15. data/benchmark/rover/weather.yml +23 -0
  16. data/doc/DataFrame.md +611 -318
  17. data/doc/Vector.md +31 -36
  18. data/doc/image/basic_verbs.png +0 -0
  19. data/doc/image/dataframe/assign.png +0 -0
  20. data/doc/image/dataframe/assign_operation.png +0 -0
  21. data/doc/image/dataframe/drop.png +0 -0
  22. data/doc/image/dataframe/join.png +0 -0
  23. data/doc/image/dataframe/pick.png +0 -0
  24. data/doc/image/dataframe/pick_operation.png +0 -0
  25. data/doc/image/dataframe/remove.png +0 -0
  26. data/doc/image/dataframe/rename.png +0 -0
  27. data/doc/image/dataframe/rename_operation.png +0 -0
  28. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  29. data/doc/image/dataframe/set_and_bind.png +0 -0
  30. data/doc/image/dataframe/slice.png +0 -0
  31. data/doc/image/dataframe/slice_operation.png +0 -0
  32. data/doc/image/dataframe_model.png +0 -0
  33. data/doc/image/group_operation.png +0 -0
  34. data/doc/image/replace-if_then.png +0 -0
  35. data/doc/image/reshaping_dataframe.png +0 -0
  36. data/doc/image/screenshot.png +0 -0
  37. data/doc/image/vector/binary_element_wise.png +0 -0
  38. data/doc/image/vector/unary_aggregation.png +0 -0
  39. data/doc/image/vector/unary_aggregation_w_option.png +0 -0
  40. data/doc/image/vector/unary_element_wise.png +0 -0
  41. data/lib/red_amber/data_frame.rb +16 -42
  42. data/lib/red_amber/data_frame_combinable.rb +283 -0
  43. data/lib/red_amber/data_frame_displayable.rb +58 -3
  44. data/lib/red_amber/data_frame_loadsave.rb +36 -0
  45. data/lib/red_amber/data_frame_reshaping.rb +8 -6
  46. data/lib/red_amber/data_frame_selectable.rb +9 -9
  47. data/lib/red_amber/data_frame_variable_operation.rb +27 -21
  48. data/lib/red_amber/group.rb +100 -17
  49. data/lib/red_amber/helper.rb +20 -30
  50. data/lib/red_amber/vector.rb +56 -30
  51. data/lib/red_amber/vector_functions.rb +0 -8
  52. data/lib/red_amber/vector_selectable.rb +9 -1
  53. data/lib/red_amber/vector_updatable.rb +61 -63
  54. data/lib/red_amber/version.rb +1 -1
  55. data/lib/red_amber.rb +2 -0
  56. data/red_amber.gemspec +1 -1
  57. metadata +32 -11
  58. data/doc/examples_of_red_amber.ipynb +0 -8979
data/doc/Vector.md CHANGED
@@ -7,7 +7,7 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
7
7
  ### Create from a column in a DataFrame
8
8
 
9
9
  ```ruby
10
- df = RedAmber::DataFrame.new(x: [1, 2, 3])
10
+ df = DataFrame.new(x: [1, 2, 3])
11
11
  df[:x]
12
12
  # =>
13
13
  #<RedAmber::Vector(:uint8, size=3):0x000000000000f4ec>
@@ -17,13 +17,16 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
17
17
  ### New from an Array
18
18
 
19
19
  ```ruby
20
- vector = RedAmber::Vector.new([1, 2, 3])
20
+ vector = Vector.new([1, 2, 3])
21
21
  # or
22
- vector = RedAmber::Vector.new(1, 2, 3)
22
+ vector = Vector.new(1, 2, 3)
23
23
  # or
24
- vector = RedAmber::Vector.new(1..3)
24
+ vector = Vector.new(1..3)
25
25
  # or
26
- vector = RedAmber::Vector.new(Arrow::Array([1, 2, 3])
26
+ vector = Vector.new(Arrow::Array.new([1, 2, 3]))
27
+ # or
28
+ require 'arrow-numo-narray'
29
+ vector = Vector.new(Numo::Int8[1, 2, 3])
27
30
 
28
31
  # =>
29
32
  #<RedAmber::Vector(:uint8, size=3):0x000000000000f514>
@@ -61,7 +64,7 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
61
64
 
62
65
  ### `type_class`
63
66
 
64
- ### `each`
67
+ ### `each`, `map`, `collect`
65
68
 
66
69
  If block is not given, returns Enumerator.
67
70
 
@@ -78,7 +81,7 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
78
81
  - `limit` sets size limit to display a long array.
79
82
 
80
83
  ```ruby
81
- vector = RedAmber::Vector.new((1..50).to_a)
84
+ vector = Vector.new((1..50).to_a)
82
85
  # =>
83
86
  #<RedAmber::Vector(:uint8, size=50):0x000000000000f528>
84
87
  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, ... ]
@@ -95,8 +98,8 @@ Class `RedAmber::Vector` represents a series of data in the DataFrame.
95
98
  - Negative indices are also OK, as with Ruby's primitive Array.
96
99
 
97
100
  ```ruby
98
- array = RedAmber::Vector.new(%w[A B C D E])
99
- indices = RedAmber::Vector.new([0.1, -0.5, -5.1])
101
+ array = Vector.new(%w[A B C D E])
102
+ indices = Vector.new([0.1, -0.5, -5.1])
100
103
  array.take(indices)
101
104
  # or
102
105
  array[indices]
@@ -106,7 +109,7 @@ array[indices]
106
109
  ["A", "E", "A"]
107
110
  ```
108
111
 
109
- ### `filter(booleans)`, `[](booleans)`
112
+ ### `filter(booleans)`, `select(booleans)`, `[](booleans)`
110
113
 
111
114
  - Acceptable class for booleans:
112
115
  - An array of true, false, or nil
@@ -114,7 +117,7 @@ array[indices]
114
117
  - Arrow::BooleanArray
115
118
 
116
119
  ```ruby
117
- array = RedAmber::Vector.new(%w[A B C D E])
120
+ array = Vector.new(%w[A B C D E])
118
121
  booleans = [true, false, nil, false, true]
119
122
  array.filter(booleans)
120
123
  # or
@@ -124,6 +127,7 @@ array[booleans]
124
127
  #<RedAmber::Vector(:string, size=2):0x000000000000f21c>
125
128
  ["A", "E"]
126
129
  ```
130
+ `filter` and `select` also accept a block.
127
131
 
128
132
  ## Functions
129
133
 
@@ -158,7 +162,7 @@ Options can be used as follows.
158
162
  See the [document of C++ function](https://arrow.apache.org/docs/cpp/compute.html) for detail.
159
163
 
160
164
  ```ruby
161
- double = RedAmber::Vector.new([1, 0/0.0, -1/0.0, 1/0.0, nil, ""])
165
+ double = Vector.new([1, 0/0.0, -1/0.0, 1/0.0, nil, ""])
162
166
  #=>
163
167
  #<RedAmber::Vector(:double, size=6):0x000000000000f910>
164
168
  [1.0, NaN, -Infinity, Infinity, nil, 0.0]
@@ -168,7 +172,7 @@ double.count(mode: :only_valid) #=> 5, default
168
172
  double.count(mode: :only_null) #=> 1
169
173
  double.count(mode: :all) #=> 6
170
174
 
171
- boolean = RedAmber::Vector.new([true, true, nil])
175
+ boolean = Vector.new([true, true, nil])
172
176
  #=>
173
177
  #<RedAmber::Vector(:boolean, size=3):0x000000000000f924>
174
178
  [true, true, nil]
@@ -215,7 +219,7 @@ Examples of options for `#round`;
215
219
  - `round_mode` Specify rounding mode.
216
220
 
217
221
  ```ruby
218
- double = RedAmber::Vector.new([15.15, 2.5, 3.5, -4.5, -5.5])
222
+ double = Vector.new([15.15, 2.5, 3.5, -4.5, -5.5])
219
223
  # => [15.15, 2.5, 3.5, -4.5, -5.5]
220
224
  double.round
221
225
  # => [15.0, 2.0, 4.0, -4.0, -6.0]
@@ -293,7 +297,7 @@ double.round(n_digits: -1)
293
297
  array = [0.0/0, Float::NAN]
294
298
  array.tally #=> {NaN=>1, NaN=>1}
295
299
 
296
- vector = RedAmber::Vector.new(array)
300
+ vector = Vector.new(array)
297
301
  vector.tally #=> {NaN=>2}
298
302
  vector.value_counts #=> {NaN=>2}
299
303
  ```
@@ -310,7 +314,7 @@ double.round(n_digits: -1)
310
314
  ## Coerce
311
315
 
312
316
  ```ruby
313
- vector = RedAmber::Vector.new(1,2,3)
317
+ vector = Vector.new(1,2,3)
314
318
  # =>
315
319
  #<RedAmber::Vector(:uint8, size=3):0x00000000000decc4>
316
320
  [1, 2, 3]
@@ -340,12 +344,13 @@ vector * -1
340
344
  - Accepts Scalar, Range of Integer, Vector, Array, Arrow::Array as a specifier
341
345
  - Accepts Scalar, Vector, Array and Arrow::Array as a replacer.
342
346
  - Boolean specifiers specify the position of replacer in true.
347
+ - If booleans.any is false, no replacement happens and self is returned.
343
348
  - Index specifiers specify the position of replacer in indices.
344
349
  - replacer specifies the values to be replaced.
345
350
  - The number of true in booleans must be equal to the length of replacer
346
351
 
347
352
  ```ruby
348
- vector = RedAmber::Vector.new([1, 2, 3])
353
+ vector = Vector.new([1, 2, 3])
349
354
  booleans = [true, false, true]
350
355
  replacer = [4, 5]
351
356
  vector.replace(booleans, replacer)
@@ -379,7 +384,7 @@ vector.replace(booleans, replacer)
379
384
  ```ruby
380
385
  booleans = [true, false, nil]
381
386
  replacer = -1
382
- vec.replace(booleans, replacer)
387
+ vector.replace(booleans, replacer)
383
388
  =>
384
389
  #<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
385
390
  [-1, 2, nil]
@@ -390,17 +395,7 @@ vec.replace(booleans, replacer)
390
395
  ```ruby
391
396
  booleans = [true, false, true]
392
397
  replacer = [nil]
393
- vec.replace(booleans, replacer)
394
- =>
395
- #<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
396
- [nil, 2, nil]
397
- ```
398
-
399
- - If no replacer specified, it is same as to specify nil.
400
-
401
- ```ruby
402
- booleans = [true, false, true]
403
- vec.replace(booleans)
398
+ vector.replace(booleans, replacer)
404
399
  =>
405
400
  #<RedAmber::Vector(:int8, size=3):0x00000000000304d0>
406
401
  [nil, 2, nil]
@@ -409,7 +404,7 @@ vec.replace(booleans)
409
404
  - An example to replace 'NA' to nil.
410
405
 
411
406
  ```ruby
412
- vector = RedAmber::Vector.new(['A', 'B', 'NA'])
407
+ vector = Vector.new(['A', 'B', 'NA'])
413
408
  vector.replace(vector == 'NA', nil)
414
409
  # =>
415
410
  #<RedAmber::Vector(:string, size=3):0x000000000000f8ac>
@@ -421,7 +416,7 @@ vector.replace(vector == 'NA', nil)
421
416
  Specified indices are used 'as sorted'. Position in indices and replacer may not have correspondence.
422
417
 
423
418
  ```ruby
424
- vector = RedAmber::Vector.new([1, 2, 3])
419
+ vector = Vector.new([1, 2, 3])
425
420
  indices = [2, 1]
426
421
  replacer = [4, 5]
427
422
  vector.replace(indices, replacer)
@@ -437,7 +432,7 @@ Propagate the last valid observation forward (or backward).
437
432
  Or preserve nil if all previous values are nil or at the end.
438
433
 
439
434
  ```ruby
440
- integer = RedAmber::Vector.new([0, 1, nil, 3, nil])
435
+ integer = Vector.new([0, 1, nil, 3, nil])
441
436
  integer.fill_nil_forward
442
437
  # =>
443
438
  #<RedAmber::Vector(:uint8, size=5):0x000000000000f960>
@@ -459,7 +454,7 @@ Choose values based on self. Self must be a boolean Vector.
459
454
  This example will normalize negative indices to positive ones.
460
455
 
461
456
  ```ruby
462
- indices = RedAmber::Vector.new([1, -1, 3, -4])
457
+ indices = Vector.new([1, -1, 3, -4])
463
458
  array_size = 10
464
459
  normalized_indices = (indices < 0).if_else(indices + array_size, indices)
465
460
 
@@ -474,7 +469,7 @@ For each element in self, return true if it is found in given `values`, false ot
474
469
  By default, nulls are matched against the value set. (This will be changed in SetLookupOptions: not implemented.)
475
470
 
476
471
  ```ruby
477
- vector = RedAmber::Vector.new %W[A B C D]
472
+ vector = Vector.new %W[A B C D]
478
473
  values = ['A', 'C', 'X']
479
474
  vector.is_in(values)
480
475
 
@@ -486,7 +481,7 @@ vector.is_in(values)
486
481
  `values` are cast to the same class as the Vector.
487
482
 
488
483
  ```ruby
489
- vector = RedAmber::Vector.new([1, 2, 255])
484
+ vector = Vector.new([1, 2, 255])
490
485
  vector.is_in(1, -1)
491
486
 
492
487
  # =>
@@ -499,7 +494,7 @@ vector.is_in(1, -1)
499
494
  Shift vector's values by specified `amount`. Shifted space is filled by value `fill`.
500
495
 
501
496
  ```ruby
502
- vector = RedAmber::Vector.new([1, 2, 3, 4, 5])
497
+ vector = Vector.new([1, 2, 3, 4, 5])
503
498
  vector.shift
504
499
 
505
500
  # =>
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -5,8 +5,10 @@ module RedAmber
5
5
  # Variable @table holds an Arrow::Table object.
6
6
  class DataFrame
7
7
  # mix-in
8
+ include DataFrameCombinable
8
9
  include DataFrameDisplayable
9
10
  include DataFrameIndexable
11
+ include DataFrameLoadSave
10
12
  include DataFrameReshaping
11
13
  include DataFrameSelectable
12
14
  include DataFrameVariableOperation
@@ -37,10 +39,15 @@ module RedAmber
37
39
  # DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
38
40
  # returns empty DataFrame
39
41
  @table = Arrow::Table.new({}, [])
42
+ in [->(x) { x.respond_to?(:to_arrow) } => arrowable]
43
+ table = arrowable.to_arrow
44
+ unless table.is_a?(Arrow::Table)
45
+ raise DataFrameTypeError,
46
+ "to_arrow must return an Arrow::Table but #{table.class}: #{arrowable}"
47
+ end
48
+ @table = table
40
49
  in [Arrow::Table => table]
41
50
  @table = table
42
- in [DataFrame => dataframe]
43
- @table = dataframe.table
44
51
  in [rover_or_hash]
45
52
  begin
46
53
  # Accepts Rover::DataFrame or Hash
@@ -52,10 +59,9 @@ module RedAmber
52
59
  @table = Arrow::Table.new(*args)
53
60
  end
54
61
  name_unnamed_keys
55
- end
56
62
 
57
- def self.load(path, options = {})
58
- DataFrame.new(Arrow::Table.load(path, options))
63
+ duplicated_keys = keys.tally.select { |_k, v| v > 1 }.keys
64
+ raise DataFrameArgumentError, "duplicate keys: #{duplicated_keys}" unless duplicated_keys.empty?
59
65
  end
60
66
 
61
67
  attr_reader :table
@@ -64,18 +70,15 @@ module RedAmber
64
70
  @table
65
71
  end
66
72
 
67
- def save(output, options = {})
68
- @table.save(output, options)
69
- end
70
-
71
73
  # Returns the number of rows.
72
74
  #
73
75
  # @return [Integer] Number of rows.
74
76
  def size
75
77
  @table.n_rows
76
78
  end
77
- alias_method :n_rows, :size
79
+ alias_method :n_records, :size
78
80
  alias_method :n_obs, :size
81
+ alias_method :n_rows, :size
79
82
 
80
83
  # Returns the number of columns.
81
84
  #
@@ -83,8 +86,9 @@ module RedAmber
83
86
  def n_keys
84
87
  @table.n_columns
85
88
  end
86
- alias_method :n_cols, :n_keys
89
+ alias_method :n_variables, :n_keys
87
90
  alias_method :n_vars, :n_keys
91
+ alias_method :n_cols, :n_keys
88
92
 
89
93
  # Returns the numbers of rows and columns.
90
94
  #
@@ -171,7 +175,7 @@ module RedAmber
171
175
  # - indices(1) #=> [1, 2, 3, 4, 5]
172
176
  # - indices('a') #=> ['a', 'b', 'c', 'd', 'e']
173
177
  def indices(start = 0)
174
- (start..).take(size)
178
+ Vector.new((start..).take(size))
175
179
  end
176
180
  alias_method :indexes, :indices
177
181
 
@@ -215,17 +219,6 @@ module RedAmber
215
219
  Rover::DataFrame.new(to_h)
216
220
  end
217
221
 
218
- def to_iruby
219
- require 'iruby'
220
- return ['text/plain', '(empty DataFrame)'] if empty?
221
-
222
- if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table') == 'TDR'
223
- size <= 5 ? ['text/plain', tdr_str(tally: 0)] : ['text/plain', tdr_str]
224
- else
225
- ['text/html', html_table]
226
- end
227
- end
228
-
229
222
  def group(*group_keys, &block)
230
223
  g = Group.new(self, group_keys)
231
224
  g = g.summarize(&block) if block
@@ -260,25 +253,6 @@ module RedAmber
260
253
  ary[%i[variables keys vectors].index(var)]
261
254
  end
262
255
 
263
- def html_table
264
- reduced = size > 8 ? self[0..4, -4..-1] : self
265
-
266
- converted = reduced.assign do
267
- vectors.select.with_object({}) do |vector, assigner|
268
- if vector.has_nil?
269
- assigner[vector.key] = vector.to_a.map do |e|
270
- e = e.nil? ? '<i>(nil)</i>' : e.to_s # nil
271
- e = '""' if e.empty? # empty string
272
- e.sub(/(\s+)/, '"\1"') # blank spaces
273
- end
274
- end
275
- end
276
- end
277
-
278
- html = IRuby::HTML.table(converted.to_h, maxrows: 8, maxcols: 15)
279
- "#{self.class} <#{size} x #{n_keys} vector#{pl(n_keys)}> #{html}"
280
- end
281
-
282
256
  def name_unnamed_keys
283
257
  return unless @table[:'']
284
258
 
@@ -0,0 +1,283 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # mix-in for the class DataFrame
5
+ module DataFrameCombinable
6
+ # Concatenate other dataframe onto the bottom.
7
+ #
8
+ # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
9
+ # DataFrame/Table to concatenate onto the bottom of self.
10
+ # @return [DataFrame]
11
+ # Concatenated dataframe.
12
+ def concatenate(*other)
13
+ case other
14
+ in [] | [nil] | [[]]
15
+ return self
16
+ in [Array => array]
17
+ # Nop
18
+ else
19
+ array = other
20
+ end
21
+
22
+ table_array = array.map do |e|
23
+ case e
24
+ when Arrow::Table
25
+ e
26
+ when DataFrame
27
+ e.table
28
+ else
29
+ raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
30
+ end
31
+ end
32
+
33
+ DataFrame.new(table.concatenate(table_array))
34
+ end
35
+
36
+ alias_method :concat, :concatenate
37
+ alias_method :bind_rows, :concatenate
38
+
39
+ # Merge the columns of other DataFrames or Tables into self.
40
+ # - Self and other must have same size.
41
+ # - Self and other must not share any keys.
42
+ # - If they share any keys, raise Error.
43
+ # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
44
+ # DataFrame/Table to merge.
45
+ # @return [DataFrame]
46
+ # Merged dataframe.
47
+ def merge(*other)
48
+ case other
49
+ in [] | [nil] | [[]]
50
+ return self
51
+ in [Array => array]
52
+ # Nop
53
+ else
54
+ array = other
55
+ end
56
+
57
+ hash = array.each_with_object({}) do |e, h|
58
+ df =
59
+ case e
60
+ when Arrow::Table
61
+ DataFrame.new(e)
62
+ when DataFrame
63
+ e
64
+ else
65
+ raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
66
+ end
67
+
68
+ raise DataFrameArgumentError, "#{e} do not have same size as self" if size != df.size
69
+
70
+ k = keys.intersection(df.keys).any?
71
+ raise DataFrameArgumentError, "There are some shared keys: #{k}" if k
72
+
73
+ h.merge!(df.to_h)
74
+ end
75
+
76
+ assign(hash)
77
+ end
78
+
79
+ alias_method :bind_cols, :merge
80
+
81
+ # Mutating joins
82
+
83
+ # Join data, leaving only the matching records.
84
+ #
85
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
86
+ # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
87
+ # @return [DataFrame] Joined dataframe.
88
+ #
89
+ def inner_join(other, join_keys = nil, suffix: '.1')
90
+ join(other, join_keys, type: :inner, suffix: suffix)
91
+ end
92
+
93
+ # Join data, leaving all records.
94
+ #
95
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
96
+ # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
97
+ # @return [DataFrame] Joined dataframe.
98
+ #
99
+ def full_join(other, join_keys = nil, suffix: '.1')
100
+ join(other, join_keys, type: :full_outer, suffix: suffix)
101
+ end
102
+
103
+ alias_method :outer_join, :full_join
104
+
105
+ # Join matching values to self from other.
106
+ #
107
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
108
+ # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
109
+ # @return [DataFrame] Joined dataframe.
110
+ #
111
+ def left_join(other, join_keys = nil, suffix: '.1')
112
+ join(other, join_keys, type: :left_outer, suffix: suffix)
113
+ end
114
+
115
+ # Join matching values from self to other.
116
+ #
117
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
118
+ # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
119
+ # @return [DataFrame] Joined dataframe.
120
+ #
121
+ def right_join(other, join_keys = nil, suffix: '.1')
122
+ join(other, join_keys, type: :right_outer, suffix: suffix)
123
+ end
124
+
125
+ # Filtering joins
126
+
127
+ # Return records of self that have a match in other.
128
+ #
129
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
130
+ # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
131
+ # @return [DataFrame] Joined dataframe.
132
+ #
133
+ def semi_join(other, join_keys = nil, suffix: '.1')
134
+ join(other, join_keys, type: :left_semi, suffix: suffix)
135
+ end
136
+
137
+ # Return records of self that do not have a match in other.
138
+ #
139
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
140
+ # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
141
+ # @return [DataFrame] Joined dataframe.
142
+ #
143
+ def anti_join(other, join_keys = nil, suffix: '.1')
144
+ join(other, join_keys, type: :left_anti, suffix: suffix)
145
+ end
146
+
147
+ # Set operations
148
+
149
+ # Check if set operation with self and other is possible.
150
+ #
151
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be checked with self.
152
+ # @return [Boolean] true if set operation is possible.
153
+ #
154
+ def set_operable?(other) # rubocop:disable Naming/AccessorMethodName
155
+ other = DataFrame.new(other) if other.is_a?(Arrow::Table)
156
+ keys == other.keys
157
+ end
158
+
159
+ # Select records appearing in both self and other.
160
+ #
161
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
162
+ # @return [DataFrame] Joined dataframe.
163
+ #
164
+ def intersect(other)
165
+ other = DataFrame.new(other) if other.is_a?(Arrow::Table)
166
+ raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
167
+
168
+ join(other, keys, type: :inner)
169
+ end
170
+
171
+ # Select records appearing in self or other.
172
+ #
173
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
174
+ # @return [DataFrame] Joined dataframe.
175
+ #
176
+ def union(other)
177
+ other = DataFrame.new(other) if other.is_a?(Arrow::Table)
178
+ raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
179
+
180
+ join(other, keys, type: :full_outer)
181
+ end
182
+
183
+ # Select records appearing in self but not in other.
184
+ #
185
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
186
+ # @return [DataFrame] Joined dataframe.
187
+ #
188
+ def difference(other)
189
+ other = DataFrame.new(other) if other.is_a?(Arrow::Table)
190
+ raise DataFrameArgumentError, 'keys are not same with self and other' unless keys == other.keys
191
+
192
+ join(other, keys, type: :left_anti)
193
+ end
194
+
195
+ alias_method :setdiff, :difference
196
+
197
+ # Undocumented. It is preferable to call specific methods.
198
+
199
+ # Join other dataframe
200
+ #
201
+ # @param other [DataFrame, Arrow::Table] DataFrame/Table to be joined with self.
202
+ # @param join_keys [String, Symbol, ::Array<String, Symbol>] Keys to match.
203
+ # @return [DataFrame] Joined dataframe.
204
+ #
205
+ # :type is one of
206
+ # :left_semi, :right_semi, :left_anti, :right_anti, :inner, :left_outer, :right_outer, :full_outer.
207
+ def join(other, join_keys = nil, type: :inner, suffix: '.1', left_outputs: nil, right_outputs: nil)
208
+ case other
209
+ when DataFrame
210
+ # Nop
211
+ when Arrow::Table
212
+ other = DataFrame.new(other)
213
+ else
214
+ raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
215
+ end
216
+
217
+ # Support natural keys (implicit common keys)
218
+ natural_keys = keys.intersection(other.keys)
219
+ raise DataFrameArgumentError, "#{join_keys} are not common keys" if natural_keys.empty?
220
+
221
+ join_keys =
222
+ if join_keys
223
+ Array(join_keys).map(&:to_sym)
224
+ else
225
+ natural_keys
226
+ end
227
+ return self if join_keys.empty?
228
+
229
+ # Support partial join_keys (common keys other than join_keys will be renamed with suffix)
230
+ remainer_keys = natural_keys - join_keys
231
+ unless remainer_keys.empty?
232
+ renamer = remainer_keys.each_with_object({}) do |key, hash|
233
+ new_key = nil
234
+ loop do
235
+ new_key = "#{key}#{suffix}".to_sym
236
+ break unless keys.include?(new_key)
237
+
238
+ s = suffix.succ
239
+ raise DataFrameArgumentError, "suffix #{suffix} is invalid" if s == suffix
240
+
241
+ suffix = s
242
+ end
243
+ hash[key] = new_key
244
+ end
245
+ other = other.rename(renamer)
246
+ end
247
+
248
+ # Red Arrow's #join returns duplicated join_keys from self and other as of v9.0.0.
249
+ # Temporarily merge key vectors here as a workaround.
250
+ table_output =
251
+ table.join(other.table, join_keys, type: type, left_outputs: left_outputs, right_outputs: right_outputs)
252
+ left_indexes = [*0...n_keys]
253
+ right_indexes = [*((other.keys - join_keys).map { |key| other.keys.index(key) + n_keys })]
254
+
255
+ case type
256
+ when :left_semi, :left_anti, :right_semi, :right_anti
257
+ return DataFrame.new(table_output)
258
+ else
259
+ selected_indexes = left_indexes.concat(right_indexes)
260
+ end
261
+ merged_columns = join_keys.map do |key|
262
+ i = keys.index(key)
263
+ merge_column(table_output[i], table_output[n_keys + i], type)
264
+ end
265
+ DataFrame.new(table_output[selected_indexes])
266
+ .assign(*join_keys) { merged_columns }
267
+ end
268
+
269
+ private
270
+
271
+ def merge_column(column1, column2, type)
272
+ a1 = column1.to_a
273
+ a2 = column2.to_a
274
+ if type == :full_outer
275
+ a1.zip(a2).map { |x, y| x || y }
276
+ elsif type.start_with?('right')
277
+ a2
278
+ else # :inner or :left-*
279
+ a1
280
+ end
281
+ end
282
+ end
283
+ end