red-arrow 10.0.0 → 16.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (83) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +3 -3
  3. data/ext/arrow/arrow.cpp +31 -0
  4. data/ext/arrow/converters.hpp +45 -41
  5. data/ext/arrow/extconf.rb +16 -4
  6. data/ext/arrow/raw-records.cpp +155 -2
  7. data/ext/arrow/red-arrow.hpp +2 -0
  8. data/ext/arrow/values.cpp +1 -2
  9. data/lib/arrow/array-computable.rb +13 -0
  10. data/lib/arrow/array.rb +6 -1
  11. data/lib/arrow/chunked-array.rb +35 -1
  12. data/lib/arrow/column-containable.rb +9 -0
  13. data/lib/arrow/column.rb +1 -0
  14. data/lib/arrow/data-type.rb +9 -0
  15. data/lib/arrow/dense-union-array-builder.rb +49 -0
  16. data/lib/arrow/dense-union-array.rb +26 -0
  17. data/lib/arrow/expression.rb +6 -2
  18. data/lib/arrow/function.rb +0 -1
  19. data/lib/arrow/half-float-array-builder.rb +32 -0
  20. data/lib/arrow/half-float-array.rb +24 -0
  21. data/lib/arrow/half-float.rb +118 -0
  22. data/lib/arrow/input-referable.rb +29 -0
  23. data/lib/arrow/loader.rb +11 -0
  24. data/lib/arrow/raw-table-converter.rb +7 -5
  25. data/lib/arrow/record-batch-file-reader.rb +2 -0
  26. data/lib/arrow/record-batch-stream-reader.rb +2 -0
  27. data/lib/arrow/record-batch.rb +6 -2
  28. data/lib/arrow/scalar.rb +67 -0
  29. data/lib/arrow/slicer.rb +61 -0
  30. data/lib/arrow/sort-key.rb +3 -3
  31. data/lib/arrow/sparse-union-array-builder.rb +56 -0
  32. data/lib/arrow/sparse-union-array.rb +26 -0
  33. data/lib/arrow/struct-array-builder.rb +0 -5
  34. data/lib/arrow/table-loader.rb +11 -5
  35. data/lib/arrow/table-saver.rb +1 -0
  36. data/lib/arrow/table.rb +180 -33
  37. data/lib/arrow/tensor.rb +4 -0
  38. data/lib/arrow/timestamp-parser.rb +33 -0
  39. data/lib/arrow/union-array-builder.rb +59 -0
  40. data/lib/arrow/version.rb +1 -1
  41. data/red-arrow.gemspec +2 -1
  42. data/test/each-raw-record/test-basic-arrays.rb +411 -0
  43. data/test/each-raw-record/test-dense-union-array.rb +566 -0
  44. data/test/each-raw-record/test-dictionary-array.rb +341 -0
  45. data/test/each-raw-record/test-list-array.rb +628 -0
  46. data/test/each-raw-record/test-map-array.rb +507 -0
  47. data/test/each-raw-record/test-multiple-columns.rb +72 -0
  48. data/test/each-raw-record/test-sparse-union-array.rb +528 -0
  49. data/test/each-raw-record/test-struct-array.rb +529 -0
  50. data/test/each-raw-record/test-table.rb +47 -0
  51. data/test/helper/omittable.rb +13 -0
  52. data/test/helper.rb +1 -0
  53. data/test/raw-records/test-basic-arrays.rb +11 -1
  54. data/test/raw-records/test-dense-union-array.rb +90 -45
  55. data/test/raw-records/test-list-array.rb +28 -10
  56. data/test/raw-records/test-map-array.rb +39 -10
  57. data/test/raw-records/test-sparse-union-array.rb +86 -41
  58. data/test/raw-records/test-struct-array.rb +22 -8
  59. data/test/test-array.rb +7 -0
  60. data/test/test-chunked-array.rb +9 -0
  61. data/test/test-csv-loader.rb +39 -0
  62. data/test/test-data-type.rb +2 -1
  63. data/test/test-dense-union-array.rb +42 -0
  64. data/test/test-dense-union-data-type.rb +1 -1
  65. data/test/test-expression.rb +11 -0
  66. data/test/test-function.rb +7 -7
  67. data/test/test-group.rb +58 -58
  68. data/test/test-half-float-array.rb +43 -0
  69. data/test/test-half-float.rb +130 -0
  70. data/test/test-ractor.rb +34 -0
  71. data/test/test-record-batch-file-reader.rb +21 -0
  72. data/test/test-record-batch-stream-reader.rb +129 -0
  73. data/test/test-scalar.rb +65 -0
  74. data/test/test-slicer.rb +194 -129
  75. data/test/test-sparse-union-array.rb +38 -0
  76. data/test/test-table.rb +356 -40
  77. data/test/values/test-basic-arrays.rb +10 -0
  78. data/test/values/test-dense-union-array.rb +88 -45
  79. data/test/values/test-list-array.rb +26 -10
  80. data/test/values/test-map-array.rb +33 -10
  81. data/test/values/test-sparse-union-array.rb +84 -41
  82. data/test/values/test-struct-array.rb +20 -8
  83. metadata +62 -9
@@ -51,6 +51,7 @@ module Arrow
51
51
  raise ArgumentError, message
52
52
  end
53
53
  __send__(custom_save_method)
54
+ @table
54
55
  end
55
56
 
56
57
  private
data/lib/arrow/table.rb CHANGED
@@ -22,6 +22,7 @@ module Arrow
22
22
  include ColumnContainable
23
23
  include GenericFilterable
24
24
  include GenericTakeable
25
+ include InputReferable
25
26
  include RecordContainable
26
27
 
27
28
  class << self
@@ -126,7 +127,7 @@ module Arrow
126
127
  # You can also specify schema as primitive Ruby objects.
127
128
  # See {Arrow::Schema#initialize} for details.
128
129
  #
129
- # @param arrays [::Array<Arrow::RecordBatch>] The data of the table.
130
+ # @param record_batches [::Array<Arrow::RecordBatch>] The data of the table.
130
131
  #
131
132
  # @example Create a table from schema and record batches
132
133
  # count_field = Arrow::Field.new("count", :uint32)
@@ -144,7 +145,7 @@ module Arrow
144
145
  # You can also specify schema as primitive Ruby objects.
145
146
  # See {Arrow::Schema#initialize} for details.
146
147
  #
147
- # @param arrays [::Array<::Array>] The data of the table as primitive
148
+ # @param raw_records [::Array<::Array>] The data of the table as primitive
148
149
  # Ruby objects.
149
150
  #
150
151
  # @example Create a table from schema and raw records
@@ -188,6 +189,7 @@ module Arrow
188
189
 
189
190
  reader = TableBatchReader.new(self)
190
191
  while record_batch = reader.read_next
192
+ share_input(record_batch)
191
193
  yield(record_batch)
192
194
  end
193
195
  end
@@ -314,8 +316,6 @@ module Arrow
314
316
  end
315
317
  end
316
318
 
317
- filter_options = Arrow::FilterOptions.new
318
- filter_options.null_selection_behavior = :emit_null
319
319
  sliced_tables = []
320
320
  slicers.each do |slicer|
321
321
  slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
@@ -337,7 +337,7 @@ module Arrow
337
337
  to += n_rows if to < 0
338
338
  sliced_tables << slice_by_range(from, to)
339
339
  when ::Array, BooleanArray, ChunkedArray
340
- sliced_tables << filter(slicer, filter_options)
340
+ sliced_tables << filter(slicer)
341
341
  else
342
342
  message = "slicer must be Integer, Range, (from, to), " +
343
343
  "Arrow::ChunkedArray of Arrow::BooleanArray, " +
@@ -346,10 +346,12 @@ module Arrow
346
346
  end
347
347
  end
348
348
  if sliced_tables.size > 1
349
- sliced_tables[0].concatenate(sliced_tables[1..-1])
349
+ sliced_table = sliced_tables[0].concatenate(sliced_tables[1..-1])
350
350
  else
351
- sliced_tables[0]
351
+ sliced_table = sliced_tables[0]
352
352
  end
353
+ share_input(sliced_table)
354
+ sliced_table
353
355
  end
354
356
 
355
357
  # TODO
@@ -401,7 +403,9 @@ module Arrow
401
403
  new_fields << new_column[:field]
402
404
  new_arrays << new_column[:data]
403
405
  end
404
- self.class.new(new_fields, new_arrays)
406
+ table = self.class.new(new_fields, new_arrays)
407
+ share_input(table)
408
+ table
405
409
  end
406
410
 
407
411
  alias_method :remove_column_raw, :remove_column
@@ -423,7 +427,9 @@ module Arrow
423
427
  raise IndexError.new(message)
424
428
  end
425
429
  end
426
- remove_column_raw(index)
430
+ table = remove_column_raw(index)
431
+ share_input(table)
432
+ table
427
433
  end
428
434
 
429
435
  # Experimental
@@ -445,43 +451,69 @@ module Arrow
445
451
  packed_arrays = columns.collect do |column|
446
452
  column.data.pack
447
453
  end
448
- self.class.new(schema, packed_arrays)
454
+ table = self.class.new(schema, packed_arrays)
455
+ share_input(table)
456
+ table
449
457
  end
450
458
 
451
- # @overload join(right, key, type: :inner, left_outputs: nil, right_outputs: nil)
452
- # @!macro join_common_before
453
- # @param right [Arrow::Table] The right table.
459
+ # Join another Table by matching with keys.
460
+ #
461
+ # @!macro join_common_before
462
+ # @param right [Arrow::Table] The right table.
463
+ #
464
+ # Join columns with `right` on join key columns.
454
465
  #
455
- # Join columns with `right` on join key columns.
466
+ # @!macro join_common_after
467
+ # @param type [Arrow::JoinType] How to join.
468
+ # @param left_outputs [::Array<String, Symbol>] Output columns in
469
+ # `self`.
456
470
  #
457
- # @!macro join_common_after
458
- # @param type [Arrow::JoinType] How to join.
459
- # @param left_outputs [::Array<String, Symbol>] Output columns in
460
- # `self`.
471
+ # If both of `left_outputs` and `right_outputs` aren't
472
+ # specified, all columns in `self` and `right` are
473
+ # output.
474
+ # @param right_outputs [::Array<String, Symbol>] Output columns in
475
+ # `right`.
476
+ #
477
+ # If both of `left_outputs` and `right_outputs` aren't
478
+ # specified, all columns in `self` and `right` are
479
+ # output.
480
+ # @return [Arrow::Table]
481
+ # The joined `Arrow::Table`.
461
482
  #
462
- # If both of `left_outputs` and `right_outputs` aren't
463
- # specified, all columns in `self` and `right` are
464
- # outputted.
465
- # @param right_outputs [::Array<String, Symbol>] Output columns in
466
- # `right`.
483
+ # @overload join(right, type: :inner, left_outputs: nil, right_outputs: nil)
484
+ # If key(s) are not supplied, common keys in self and right are used
485
+ # (natural join).
467
486
  #
468
- # If both of `left_outputs` and `right_outputs` aren't
469
- # specified, all columns in `self` and `right` are
470
- # outputted.
471
- # @return [Arrow::Table]
472
- # The joined `Arrow::Table`.
487
+ # Column used as keys are merged and remain in left side
488
+ # when both of `left_outputs` and `right_outputs` are `nil`.
489
+ #
490
+ # @macro join_common_before
491
+ # @macro join_common_after
492
+ #
493
+ # @since 11.0.0
494
+ #
495
+ # @overload join(right, key, type: :inner, left_outputs: nil, right_outputs: nil)
496
+ # Join right by a key.
497
+ #
498
+ # Column used as keys are merged and remain in left side
499
+ # when both of `left_outputs` and `right_outputs` are `nil`.
473
500
  #
474
501
  # @macro join_common_before
475
502
  # @param key [String, Symbol] A join key.
476
503
  # @macro join_common_after
477
504
  #
478
- # @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
505
+ # @overload join(right, keys, type: :inner, left_suffix: "", right_suffix: "",
506
+ # left_outputs: nil, right_outputs: nil)
507
+ # Join right by keys.
508
+ #
509
+ # Column name can be renamed by appending `left_suffix` or `right_suffix`.
479
510
  #
480
511
  # @macro join_common_before
481
512
  # @param keys [::Array<String, Symbol>] Join keys.
482
513
  # @macro join_common_after
483
514
  #
484
515
  # @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
516
+ # Join right by a key or keys mapped by a hash.
485
517
  #
486
518
  # @macro join_common_before
487
519
  # @param keys [Hash] Specify join keys in `self` and `right` separately.
@@ -492,7 +524,16 @@ module Arrow
492
524
  # @macro join_common_after
493
525
  #
494
526
  # @since 7.0.0
495
- def join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
527
+ def join(right,
528
+ keys=nil,
529
+ type: :inner,
530
+ left_suffix: "",
531
+ right_suffix: "",
532
+ left_outputs: nil,
533
+ right_outputs: nil)
534
+ is_natural_join = keys.nil?
535
+ keys ||= (column_names & right.column_names)
536
+ type = JoinType.try_convert(type) || type
496
537
  plan = ExecutePlan.new
497
538
  left_node = plan.build_source_node(self)
498
539
  right_node = plan.build_source_node(right)
@@ -508,22 +549,46 @@ module Arrow
508
549
  hash_join_node_options = HashJoinNodeOptions.new(type,
509
550
  left_keys,
510
551
  right_keys)
552
+ use_manual_outputs = false
511
553
  unless left_outputs.nil?
512
554
  hash_join_node_options.left_outputs = left_outputs
555
+ use_manual_outputs = true
513
556
  end
514
557
  unless right_outputs.nil?
515
558
  hash_join_node_options.right_outputs = right_outputs
559
+ use_manual_outputs = true
516
560
  end
517
561
  hash_join_node = plan.build_hash_join_node(left_node,
518
562
  right_node,
519
563
  hash_join_node_options)
564
+ type_nick = type.nick
565
+ is_filter_join = (type_nick.end_with?("-semi") or
566
+ type_nick.end_with?("-anti"))
567
+ if use_manual_outputs or is_filter_join
568
+ process_node = hash_join_node
569
+ elsif is_natural_join
570
+ process_node = join_merge_keys(plan, hash_join_node, right, keys)
571
+ elsif keys.is_a?(String) or keys.is_a?(Symbol)
572
+ process_node = join_merge_keys(plan, hash_join_node, right, [keys.to_s])
573
+ elsif !keys.is_a?(Hash) and (left_suffix != "" or right_suffix != "")
574
+ process_node = join_rename_keys(plan,
575
+ hash_join_node,
576
+ right,
577
+ keys,
578
+ left_suffix,
579
+ right_suffix)
580
+ else
581
+ process_node = hash_join_node
582
+ end
520
583
  sink_node_options = SinkNodeOptions.new
521
- plan.build_sink_node(hash_join_node, sink_node_options)
584
+ plan.build_sink_node(process_node, sink_node_options)
522
585
  plan.validate
523
586
  plan.start
524
587
  plan.wait
525
- reader = sink_node_options.get_reader(hash_join_node.output_schema)
526
- reader.read_all
588
+ reader = sink_node_options.get_reader(process_node.output_schema)
589
+ table = reader.read_all
590
+ share_input(table)
591
+ table
527
592
  end
528
593
 
529
594
  alias_method :to_s_raw, :to_s
@@ -593,5 +658,87 @@ module Arrow
593
658
  raise ArgumentError, message
594
659
  end
595
660
  end
661
+
662
+ def join_merge_keys(plan, input_node, right, keys)
663
+ expressions = []
664
+ names = []
665
+ normalized_keys = {}
666
+ keys.each do |key|
667
+ normalized_keys[key.to_s] = true
668
+ end
669
+ key_to_outputs = {}
670
+ outputs = []
671
+ left_n_column_names = column_names.size
672
+ column_names.each_with_index do |name, i|
673
+ is_key = normalized_keys.include?(name)
674
+ output = {is_key: is_key, name: name, index: i, direction: :left}
675
+ outputs << output
676
+ key_to_outputs[name] = {left: output} if is_key
677
+ end
678
+ right.column_names.each_with_index do |name, i|
679
+ index = left_n_column_names + i
680
+ is_key = normalized_keys.include?(name)
681
+ output = {is_key: is_key, name: name, index: index, direction: :right}
682
+ outputs << output
683
+ key_to_outputs[name][:right] = output if is_key
684
+ end
685
+
686
+ outputs.each do |output|
687
+ if output[:is_key]
688
+ next if output[:direction] == :right
689
+ left_output = key_to_outputs[output[:name]][:left]
690
+ right_output = key_to_outputs[output[:name]][:right]
691
+ left_field = FieldExpression.new("[#{left_output[:index]}]")
692
+ right_field = FieldExpression.new("[#{right_output[:index]}]")
693
+ is_left_null = CallExpression.new("is_null", [left_field])
694
+ merge_column = CallExpression.new("if_else",
695
+ [
696
+ is_left_null,
697
+ right_field,
698
+ left_field,
699
+ ])
700
+ expressions << merge_column
701
+ else
702
+ expressions << FieldExpression.new("[#{output[:index]}]")
703
+ end
704
+ names << output[:name]
705
+ end
706
+ project_node_options = ProjectNodeOptions.new(expressions, names)
707
+ plan.build_project_node(input_node, project_node_options)
708
+ end
709
+
710
+ def join_rename_keys(plan,
711
+ input_node,
712
+ right,
713
+ keys,
714
+ left_suffix,
715
+ right_suffix)
716
+ expressions = []
717
+ names = []
718
+ normalized_keys = {}
719
+ keys.each do |key|
720
+ normalized_keys[key.to_s] = true
721
+ end
722
+ left_n_column_names = column_names.size
723
+ column_names.each_with_index do |name, i|
724
+ expressions << FieldExpression.new("[#{i}]")
725
+ if normalized_keys.include?(name)
726
+ names << "#{name}#{left_suffix}"
727
+ else
728
+ names << name
729
+ end
730
+ end
731
+ right.column_names.each_with_index do |name, i|
732
+ index = left_n_column_names + i
733
+ expressions << FieldExpression.new("[#{index}]")
734
+ if normalized_keys.include?(name)
735
+ names << "#{name}#{right_suffix}"
736
+ else
737
+ names << name
738
+ end
739
+ end
740
+ project_node_options = ProjectNodeOptions.new(expressions, names)
741
+ plan.build_project_node(input_node, project_node_options)
742
+ end
596
743
  end
597
744
  end
data/lib/arrow/tensor.rb CHANGED
@@ -160,5 +160,9 @@ module Arrow
160
160
  nil,
161
161
  0)
162
162
  end
163
+
164
+ def to_arrow_chunked_array
165
+ ChunkedArray.new([to_arrow_array])
166
+ end
163
167
  end
164
168
  end
@@ -0,0 +1,33 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module Arrow
19
+ class TimestampParser
20
+ class << self
21
+ def try_convert(value)
22
+ case value
23
+ when :iso8601
24
+ ISO8601TimestampParser.new
25
+ when String
26
+ StrptimeTimestampParser.new(value)
27
+ else
28
+ nil
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,59 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module Arrow
19
+ class UnionArrayBuilder
20
+ def append_values(values, is_valids=nil)
21
+ if is_valids
22
+ is_valids.each_with_index do |is_valid, i|
23
+ if is_valid
24
+ append_value(values[i])
25
+ else
26
+ append_null
27
+ end
28
+ end
29
+ else
30
+ values.each do |value|
31
+ append_value(value)
32
+ end
33
+ end
34
+ end
35
+
36
+ alias_method :append_child_raw, :append_child
37
+ def append_child(builder, filed_name=nil)
38
+ @child_infos = nil
39
+ append_child_raw(builder, field_name)
40
+ end
41
+
42
+ private
43
+ def child_infos
44
+ @child_infos ||= create_child_infos
45
+ end
46
+
47
+ def create_child_infos
48
+ infos = {}
49
+ type = value_data_type
50
+ type.fields.zip(children, type.type_codes).each do |field, child, id|
51
+ infos[field.name] = {
52
+ builder: child,
53
+ id: id,
54
+ }
55
+ end
56
+ infos
57
+ end
58
+ end
59
+ end
data/lib/arrow/version.rb CHANGED
@@ -16,7 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  module Arrow
19
- VERSION = "10.0.0"
19
+ VERSION = "16.0.0"
20
20
 
21
21
  module Version
22
22
  numbers, TAG = VERSION.split("-")
data/red-arrow.gemspec CHANGED
@@ -47,7 +47,8 @@ Gem::Specification.new do |spec|
47
47
  spec.extensions = ["ext/arrow/extconf.rb"]
48
48
 
49
49
  spec.add_runtime_dependency("bigdecimal", ">= 3.1.0")
50
- spec.add_runtime_dependency("extpp", ">= 0.0.7")
50
+ spec.add_runtime_dependency("csv")
51
+ spec.add_runtime_dependency("extpp", ">= 0.1.1")
51
52
  spec.add_runtime_dependency("gio2", ">= 3.5.0")
52
53
  spec.add_runtime_dependency("native-package-installer")
53
54
  spec.add_runtime_dependency("pkg-config")