red-arrow 10.0.1 → 12.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +3 -3
  3. data/ext/arrow/converters.hpp +45 -41
  4. data/ext/arrow/extconf.rb +14 -2
  5. data/ext/arrow/raw-records.cpp +1 -2
  6. data/ext/arrow/values.cpp +1 -2
  7. data/lib/arrow/array-computable.rb +13 -0
  8. data/lib/arrow/array.rb +5 -0
  9. data/lib/arrow/chunked-array.rb +23 -1
  10. data/lib/arrow/column-containable.rb +9 -0
  11. data/lib/arrow/column.rb +1 -0
  12. data/lib/arrow/data-type.rb +9 -0
  13. data/lib/arrow/dense-union-array-builder.rb +49 -0
  14. data/lib/arrow/dense-union-array.rb +26 -0
  15. data/lib/arrow/half-float-array-builder.rb +32 -0
  16. data/lib/arrow/half-float-array.rb +24 -0
  17. data/lib/arrow/half-float.rb +118 -0
  18. data/lib/arrow/input-referable.rb +29 -0
  19. data/lib/arrow/loader.rb +10 -0
  20. data/lib/arrow/raw-table-converter.rb +7 -5
  21. data/lib/arrow/record-batch-file-reader.rb +2 -0
  22. data/lib/arrow/record-batch-stream-reader.rb +2 -0
  23. data/lib/arrow/record-batch.rb +6 -2
  24. data/lib/arrow/scalar.rb +67 -0
  25. data/lib/arrow/slicer.rb +61 -0
  26. data/lib/arrow/sparse-union-array-builder.rb +56 -0
  27. data/lib/arrow/sparse-union-array.rb +26 -0
  28. data/lib/arrow/struct-array-builder.rb +0 -5
  29. data/lib/arrow/table-loader.rb +4 -4
  30. data/lib/arrow/table-saver.rb +1 -0
  31. data/lib/arrow/table.rb +178 -31
  32. data/lib/arrow/tensor.rb +4 -0
  33. data/lib/arrow/union-array-builder.rb +59 -0
  34. data/lib/arrow/version.rb +1 -1
  35. data/red-arrow.gemspec +1 -1
  36. data/test/raw-records/test-basic-arrays.rb +10 -0
  37. data/test/raw-records/test-dense-union-array.rb +90 -45
  38. data/test/raw-records/test-list-array.rb +28 -10
  39. data/test/raw-records/test-map-array.rb +39 -10
  40. data/test/raw-records/test-sparse-union-array.rb +86 -41
  41. data/test/raw-records/test-struct-array.rb +22 -8
  42. data/test/test-array.rb +7 -0
  43. data/test/test-chunked-array.rb +9 -0
  44. data/test/test-data-type.rb +2 -1
  45. data/test/test-dense-union-array.rb +42 -0
  46. data/test/test-dense-union-data-type.rb +1 -1
  47. data/test/test-function.rb +7 -7
  48. data/test/test-group.rb +58 -58
  49. data/test/test-half-float-array.rb +43 -0
  50. data/test/test-half-float.rb +130 -0
  51. data/test/test-record-batch-file-reader.rb +21 -0
  52. data/test/test-record-batch-stream-reader.rb +129 -0
  53. data/test/test-scalar.rb +65 -0
  54. data/test/test-slicer.rb +194 -129
  55. data/test/test-sparse-union-array.rb +38 -0
  56. data/test/test-table.rb +324 -40
  57. data/test/values/test-basic-arrays.rb +10 -0
  58. data/test/values/test-dense-union-array.rb +88 -45
  59. data/test/values/test-list-array.rb +26 -10
  60. data/test/values/test-map-array.rb +33 -10
  61. data/test/values/test-sparse-union-array.rb +84 -41
  62. data/test/values/test-struct-array.rb +20 -8
  63. metadata +30 -9
data/lib/arrow/table.rb CHANGED
@@ -22,6 +22,7 @@ module Arrow
22
22
  include ColumnContainable
23
23
  include GenericFilterable
24
24
  include GenericTakeable
25
+ include InputReferable
25
26
  include RecordContainable
26
27
 
27
28
  class << self
@@ -188,6 +189,7 @@ module Arrow
188
189
 
189
190
  reader = TableBatchReader.new(self)
190
191
  while record_batch = reader.read_next
192
+ share_input(record_batch)
191
193
  yield(record_batch)
192
194
  end
193
195
  end
@@ -314,8 +316,6 @@ module Arrow
314
316
  end
315
317
  end
316
318
 
317
- filter_options = Arrow::FilterOptions.new
318
- filter_options.null_selection_behavior = :emit_null
319
319
  sliced_tables = []
320
320
  slicers.each do |slicer|
321
321
  slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
@@ -337,7 +337,7 @@ module Arrow
337
337
  to += n_rows if to < 0
338
338
  sliced_tables << slice_by_range(from, to)
339
339
  when ::Array, BooleanArray, ChunkedArray
340
- sliced_tables << filter(slicer, filter_options)
340
+ sliced_tables << filter(slicer)
341
341
  else
342
342
  message = "slicer must be Integer, Range, (from, to), " +
343
343
  "Arrow::ChunkedArray of Arrow::BooleanArray, " +
@@ -346,10 +346,12 @@ module Arrow
346
346
  end
347
347
  end
348
348
  if sliced_tables.size > 1
349
- sliced_tables[0].concatenate(sliced_tables[1..-1])
349
+ sliced_table = sliced_tables[0].concatenate(sliced_tables[1..-1])
350
350
  else
351
- sliced_tables[0]
351
+ sliced_table = sliced_tables[0]
352
352
  end
353
+ share_input(sliced_table)
354
+ sliced_table
353
355
  end
354
356
 
355
357
  # TODO
@@ -401,7 +403,9 @@ module Arrow
401
403
  new_fields << new_column[:field]
402
404
  new_arrays << new_column[:data]
403
405
  end
404
- self.class.new(new_fields, new_arrays)
406
+ table = self.class.new(new_fields, new_arrays)
407
+ share_input(table)
408
+ table
405
409
  end
406
410
 
407
411
  alias_method :remove_column_raw, :remove_column
@@ -423,7 +427,9 @@ module Arrow
423
427
  raise IndexError.new(message)
424
428
  end
425
429
  end
426
- remove_column_raw(index)
430
+ table = remove_column_raw(index)
431
+ share_input(table)
432
+ table
427
433
  end
428
434
 
429
435
  # Experimental
@@ -445,43 +451,69 @@ module Arrow
445
451
  packed_arrays = columns.collect do |column|
446
452
  column.data.pack
447
453
  end
448
- self.class.new(schema, packed_arrays)
454
+ table = self.class.new(schema, packed_arrays)
455
+ share_input(table)
456
+ table
449
457
  end
450
458
 
451
- # @overload join(right, key, type: :inner, left_outputs: nil, right_outputs: nil)
452
- # @!macro join_common_before
453
- # @param right [Arrow::Table] The right table.
459
+ # Join another Table by matching with keys.
460
+ #
461
+ # @!macro join_common_before
462
+ # @param right [Arrow::Table] The right table.
463
+ #
464
+ # Join columns with `right` on join key columns.
454
465
  #
455
- # Join columns with `right` on join key columns.
466
+ # @!macro join_common_after
467
+ # @param type [Arrow::JoinType] How to join.
468
+ # @param left_outputs [::Array<String, Symbol>] Output columns in
469
+ # `self`.
456
470
  #
457
- # @!macro join_common_after
458
- # @param type [Arrow::JoinType] How to join.
459
- # @param left_outputs [::Array<String, Symbol>] Output columns in
460
- # `self`.
471
+ # If both of `left_outputs` and `right_outputs` aren't
472
+ # specified, all columns in `self` and `right` are
473
+ # output.
474
+ # @param right_outputs [::Array<String, Symbol>] Output columns in
475
+ # `right`.
476
+ #
477
+ # If both of `left_outputs` and `right_outputs` aren't
478
+ # specified, all columns in `self` and `right` are
479
+ # output.
480
+ # @return [Arrow::Table]
481
+ # The joined `Arrow::Table`.
461
482
  #
462
- # If both of `left_outputs` and `right_outputs` aren't
463
- # specified, all columns in `self` and `right` are
464
- # outputted.
465
- # @param right_outputs [::Array<String, Symbol>] Output columns in
466
- # `right`.
483
+ # @overload join(right, type: :inner, left_outputs: nil, right_outputs: nil)
484
+ # If key(s) are not supplied, common keys in self and right are used
485
+ # (natural join).
467
486
  #
468
- # If both of `left_outputs` and `right_outputs` aren't
469
- # specified, all columns in `self` and `right` are
470
- # outputted.
471
- # @return [Arrow::Table]
472
- # The joined `Arrow::Table`.
487
+ # Columns used as keys are merged and remain on the left side
488
+ # when both of `left_outputs` and `right_outputs` are `nil`.
489
+ #
490
+ # @macro join_common_before
491
+ # @macro join_common_after
492
+ #
493
+ # @since 11.0.0
494
+ #
495
+ # @overload join(right, key, type: :inner, left_outputs: nil, right_outputs: nil)
496
+ # Join right by a key.
497
+ #
498
+ # Columns used as keys are merged and remain on the left side
499
+ # when both of `left_outputs` and `right_outputs` are `nil`.
473
500
  #
474
501
  # @macro join_common_before
475
502
  # @param key [String, Symbol] A join key.
476
503
  # @macro join_common_after
477
504
  #
478
- # @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
505
+ # @overload join(right, keys, type: :inner, left_suffix: "", right_suffix: "",
506
+ # left_outputs: nil, right_outputs: nil)
507
+ # Join right by keys.
508
+ #
509
+ # Column names can be renamed by appending `left_suffix` or `right_suffix`.
479
510
  #
480
511
  # @macro join_common_before
481
512
  # @param keys [::Array<String, Symbol>] Join keys.
482
513
  # @macro join_common_after
483
514
  #
484
515
  # @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
516
+ # Join right by a key or keys mapped by a hash.
485
517
  #
486
518
  # @macro join_common_before
487
519
  # @param keys [Hash] Specify join keys in `self` and `right` separately.
@@ -492,7 +524,16 @@ module Arrow
492
524
  # @macro join_common_after
493
525
  #
494
526
  # @since 7.0.0
495
- def join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
527
+ def join(right,
528
+ keys=nil,
529
+ type: :inner,
530
+ left_suffix: "",
531
+ right_suffix: "",
532
+ left_outputs: nil,
533
+ right_outputs: nil)
534
+ is_natural_join = keys.nil?
535
+ keys ||= (column_names & right.column_names)
536
+ type = JoinType.try_convert(type) || type
496
537
  plan = ExecutePlan.new
497
538
  left_node = plan.build_source_node(self)
498
539
  right_node = plan.build_source_node(right)
@@ -508,22 +549,46 @@ module Arrow
508
549
  hash_join_node_options = HashJoinNodeOptions.new(type,
509
550
  left_keys,
510
551
  right_keys)
552
+ use_manual_outputs = false
511
553
  unless left_outputs.nil?
512
554
  hash_join_node_options.left_outputs = left_outputs
555
+ use_manual_outputs = true
513
556
  end
514
557
  unless right_outputs.nil?
515
558
  hash_join_node_options.right_outputs = right_outputs
559
+ use_manual_outputs = true
516
560
  end
517
561
  hash_join_node = plan.build_hash_join_node(left_node,
518
562
  right_node,
519
563
  hash_join_node_options)
564
+ type_nick = type.nick
565
+ is_filter_join = (type_nick.end_with?("-semi") or
566
+ type_nick.end_with?("-anti"))
567
+ if use_manual_outputs or is_filter_join
568
+ process_node = hash_join_node
569
+ elsif is_natural_join
570
+ process_node = join_merge_keys(plan, hash_join_node, right, keys)
571
+ elsif keys.is_a?(String) or keys.is_a?(Symbol)
572
+ process_node = join_merge_keys(plan, hash_join_node, right, [keys.to_s])
573
+ elsif !keys.is_a?(Hash) and (left_suffix != "" or right_suffix != "")
574
+ process_node = join_rename_keys(plan,
575
+ hash_join_node,
576
+ right,
577
+ keys,
578
+ left_suffix,
579
+ right_suffix)
580
+ else
581
+ process_node = hash_join_node
582
+ end
520
583
  sink_node_options = SinkNodeOptions.new
521
- plan.build_sink_node(hash_join_node, sink_node_options)
584
+ plan.build_sink_node(process_node, sink_node_options)
522
585
  plan.validate
523
586
  plan.start
524
587
  plan.wait
525
- reader = sink_node_options.get_reader(hash_join_node.output_schema)
526
- reader.read_all
588
+ reader = sink_node_options.get_reader(process_node.output_schema)
589
+ table = reader.read_all
590
+ share_input(table)
591
+ table
527
592
  end
528
593
 
529
594
  alias_method :to_s_raw, :to_s
@@ -593,5 +658,87 @@ module Arrow
593
658
  raise ArgumentError, message
594
659
  end
595
660
  end
661
+
662
# Builds a project node on top of `input_node` that collapses each join
# key into a single output column.
#
# The projected columns follow the order "columns of `self`, then
# columns of `right`". A key column coming from `self` is replaced by
# `if_else(is_null(left), right, left)`; the same-named key column
# coming from `right` is dropped. Every other column is passed through
# by positional reference and keeps its name.
#
# @param plan [Object] receiver of `build_project_node`.
# @param input_node [Object] node whose output is projected.
# @param right [Object] right-hand table; only `column_names` is read.
# @param keys [#each] key names; each is compared via `to_s`.
# @return whatever `plan.build_project_node` returns.
def join_merge_keys(plan, input_node, right, keys)
  key_lookup = {}
  keys.each {|key| key_lookup[key.to_s] = true}

  # One slot per input column, in output-schema order (left, then right).
  slots = []
  # Key name => {left: slot, right: slot}.
  slots_by_key = {}
  right_offset = column_names.size
  column_names.each_with_index do |column_name, position|
    slot = {
      is_key: key_lookup.include?(column_name),
      name: column_name,
      index: position,
      direction: :left,
    }
    slots << slot
    slots_by_key[column_name] = {left: slot} if slot[:is_key]
  end
  right.column_names.each_with_index do |column_name, position|
    slot = {
      is_key: key_lookup.include?(column_name),
      name: column_name,
      # Right columns are addressed after all left columns.
      index: right_offset + position,
      direction: :right,
    }
    slots << slot
    slots_by_key[column_name][:right] = slot if slot[:is_key]
  end

  projected_expressions = []
  projected_names = []
  slots.each do |slot|
    unless slot[:is_key]
      projected_expressions << FieldExpression.new("[#{slot[:index]}]")
      projected_names << slot[:name]
      next
    end
    # The right-hand copy of a key is folded into the left-hand one.
    next if slot[:direction] == :right

    pair = slots_by_key[slot[:name]]
    left_ref = FieldExpression.new("[#{pair[:left][:index]}]")
    right_ref = FieldExpression.new("[#{pair[:right][:index]}]")
    left_missing = CallExpression.new("is_null", [left_ref])
    projected_expressions << CallExpression.new("if_else",
                                                [
                                                  left_missing,
                                                  right_ref,
                                                  left_ref,
                                                ])
    projected_names << slot[:name]
  end
  plan.build_project_node(input_node,
                          ProjectNodeOptions.new(projected_expressions,
                                                 projected_names))
end
709
+
710
# Builds a project node on top of `input_node` that keeps every column
# of `self` and `right` (in that order) but renames key columns by
# appending a side-specific suffix.
#
# @param plan [Object] receiver of `build_project_node`.
# @param input_node [Object] node whose output is projected.
# @param right [Object] right-hand table; only `column_names` is read.
# @param keys [#each] key names; each is compared via `to_s`.
# @param left_suffix [#to_s] appended to key column names from `self`.
# @param right_suffix [#to_s] appended to key column names from `right`.
# @return whatever `plan.build_project_node` returns.
def join_rename_keys(plan,
                     input_node,
                     right,
                     keys,
                     left_suffix,
                     right_suffix)
  key_lookup = {}
  keys.each {|key| key_lookup[key.to_s] = true}

  field_refs = []
  output_names = []
  right_offset = column_names.size
  column_names.each_with_index do |column_name, position|
    field_refs << FieldExpression.new("[#{position}]")
    is_key = key_lookup.include?(column_name)
    output_names << (is_key ? "#{column_name}#{left_suffix}" : column_name)
  end
  right.column_names.each_with_index do |column_name, position|
    # Right columns are addressed after all left columns.
    field_refs << FieldExpression.new("[#{right_offset + position}]")
    is_key = key_lookup.include?(column_name)
    output_names << (is_key ? "#{column_name}#{right_suffix}" : column_name)
  end
  plan.build_project_node(input_node,
                          ProjectNodeOptions.new(field_refs, output_names))
end
596
743
  end
597
744
  end
data/lib/arrow/tensor.rb CHANGED
@@ -160,5 +160,9 @@ module Arrow
160
160
  nil,
161
161
  0)
162
162
  end
163
+
164
# Builds a ChunkedArray whose only chunk is the result of
# `to_arrow_array`.
def to_arrow_chunked_array
  chunks = [to_arrow_array]
  ChunkedArray.new(chunks)
end
163
167
  end
164
168
  end
@@ -0,0 +1,59 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
module Arrow
  # Ruby-side additions to the union array builder.
  #
  # NOTE(review): `append_value`, `append_null`, `append_child`,
  # `value_data_type` and `children` are not defined in this file; they
  # are presumably supplied by the underlying bindings or by subclasses.
  class UnionArrayBuilder
    # Appends several values in order.
    #
    # When `is_valids` is given, it drives the iteration: for each truthy
    # entry the value at the same index is appended with `append_value`,
    # and for each falsy entry `append_null` is called instead. Without
    # `is_valids`, every element of `values` is appended with
    # `append_value`.
    #
    # @param values [#each, #[]] values to append.
    # @param is_valids [#each_with_index, nil] per-index validity flags.
    def append_values(values, is_valids=nil)
      if is_valids
        is_valids.each_with_index do |is_valid, i|
          if is_valid
            append_value(values[i])
          else
            append_null
          end
        end
      else
        values.each do |value|
          append_value(value)
        end
      end
    end

    alias_method :append_child_raw, :append_child
    # Registers a child builder and drops the memoized child lookup so
    # that it is rebuilt on the next `child_infos` call.
    #
    # The parameter was previously misspelled `filed_name` while the body
    # forwarded `field_name`, so the argument was never passed on and the
    # call resolved an undefined name instead.
    #
    # @param builder [Object] child builder forwarded to the raw method.
    # @param field_name [String, nil] field name forwarded to the raw
    #   method.
    def append_child(builder, field_name=nil)
      @child_infos = nil
      append_child_raw(builder, field_name)
    end

    private
    # Memoized result of `create_child_infos`; reset by `append_child`.
    def child_infos
      @child_infos ||= create_child_infos
    end

    # Builds a hash mapping each field name of `value_data_type` to a
    # `{builder:, id:}` entry, pairing the fields positionally with
    # `children` and the data type's `type_codes`.
    def create_child_infos
      infos = {}
      type = value_data_type
      type.fields.zip(children, type.type_codes).each do |field, child, id|
        infos[field.name] = {
          builder: child,
          id: id,
        }
      end
      infos
    end
  end
end
data/lib/arrow/version.rb CHANGED
@@ -16,7 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  module Arrow
19
- VERSION = "10.0.1"
19
+ VERSION = "12.0.0"
20
20
 
21
21
  module Version
22
22
  numbers, TAG = VERSION.split("-")
data/red-arrow.gemspec CHANGED
@@ -47,7 +47,7 @@ Gem::Specification.new do |spec|
47
47
  spec.extensions = ["ext/arrow/extconf.rb"]
48
48
 
49
49
  spec.add_runtime_dependency("bigdecimal", ">= 3.1.0")
50
- spec.add_runtime_dependency("extpp", ">= 0.0.7")
50
+ spec.add_runtime_dependency("extpp", ">= 0.1.1")
51
51
  spec.add_runtime_dependency("gio2", ">= 3.5.0")
52
52
  spec.add_runtime_dependency("native-package-installer")
53
53
  spec.add_runtime_dependency("pkg-config")
@@ -117,6 +117,16 @@ module RawRecordsBasicArraysTests
117
117
  assert_equal(records, target.raw_records)
118
118
  end
119
119
 
120
# Half-float columns must round-trip through `raw_records`, including a
# nil entry.
def test_half_float
  expected = [[-1.5], [nil], [1.5]]
  built = build({column: :half_float}, expected)
  assert_equal(expected, built.raw_records)
end
129
+
120
130
  def test_float
121
131
  records = [
122
132
  [-1.0],