red-arrow 8.0.0 → 24.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +15 -7
  3. data/ext/arrow/arrow.cpp +67 -0
  4. data/ext/arrow/converters.cpp +10 -0
  5. data/ext/arrow/converters.hpp +310 -46
  6. data/ext/arrow/extconf.rb +41 -22
  7. data/ext/arrow/raw-records.cpp +165 -2
  8. data/ext/arrow/red-arrow.hpp +2 -0
  9. data/ext/arrow/values.cpp +6 -2
  10. data/lib/arrow/array-builder.rb +89 -14
  11. data/{test/test-time32-data-type.rb → lib/arrow/array-computable.rb} +24 -16
  12. data/{test/test-buffer.rb → lib/arrow/array-statistics.rb} +19 -24
  13. data/lib/arrow/array.rb +40 -4
  14. data/lib/arrow/chunked-array.rb +56 -1
  15. data/lib/arrow/column-containable.rb +9 -0
  16. data/lib/arrow/column.rb +49 -4
  17. data/{test/test-tensor.rb → lib/arrow/csv-write-options.rb} +28 -31
  18. data/lib/arrow/data-type.rb +17 -3
  19. data/lib/arrow/decimal128-array-builder.rb +16 -6
  20. data/lib/arrow/decimal128.rb +14 -0
  21. data/lib/arrow/decimal256-array-builder.rb +16 -6
  22. data/lib/arrow/decimal256.rb +14 -0
  23. data/{test/test-float-scalar.rb → lib/arrow/dense-union-array-builder.rb} +27 -24
  24. data/{test/test-boolean-scalar.rb → lib/arrow/dense-union-array.rb} +7 -7
  25. data/lib/arrow/duration-array-builder.rb +27 -0
  26. data/lib/arrow/duration-array.rb +24 -0
  27. data/lib/arrow/duration-data-type.rb +32 -0
  28. data/lib/arrow/expression.rb +6 -2
  29. data/lib/arrow/field-containable.rb +1 -1
  30. data/lib/arrow/field.rb +44 -3
  31. data/lib/arrow/fixed-size-list-array-builder.rb +29 -0
  32. data/lib/arrow/fixed-size-list-data-type.rb +118 -0
  33. data/lib/arrow/function.rb +0 -1
  34. data/lib/arrow/half-float-array-builder.rb +32 -0
  35. data/lib/arrow/half-float-array.rb +24 -0
  36. data/lib/arrow/half-float.rb +118 -0
  37. data/{test/helper/fixture.rb → lib/arrow/input-referable.rb} +7 -6
  38. data/lib/arrow/jruby/array-builder.rb +114 -0
  39. data/lib/arrow/jruby/array.rb +109 -0
  40. data/lib/arrow/jruby/chunked-array.rb +36 -0
  41. data/lib/arrow/jruby/compression-type.rb +26 -0
  42. data/lib/arrow/jruby/csv-read-options.rb +32 -0
  43. data/{test/test-map-data-type.rb → lib/arrow/jruby/data-type.rb} +24 -12
  44. data/lib/arrow/jruby/decimal128.rb +28 -0
  45. data/lib/arrow/jruby/decimal256.rb +28 -0
  46. data/{test/fixture/float-integer.csv → lib/arrow/jruby/error.rb} +7 -4
  47. data/lib/arrow/jruby/file-system.rb +24 -0
  48. data/{test/test-null-array.rb → lib/arrow/jruby/function.rb} +5 -4
  49. data/lib/arrow/jruby/record-batch-iterator.rb +24 -0
  50. data/{test/fixture/null-with-double-quote.csv → lib/arrow/jruby/record-batch.rb} +8 -4
  51. data/{test/fixture/integer-float.csv → lib/arrow/jruby/sort-key.rb} +8 -4
  52. data/lib/arrow/jruby/sort-options.rb +24 -0
  53. data/lib/arrow/jruby/stream-listener-raw.rb +25 -0
  54. data/{test/test-rolling-window.rb → lib/arrow/jruby/table.rb} +19 -19
  55. data/lib/arrow/jruby/writable.rb +24 -0
  56. data/lib/arrow/jruby.rb +52 -0
  57. data/{test/test-date32-array.rb → lib/arrow/large-list-array-builder.rb} +10 -5
  58. data/lib/arrow/large-list-data-type.rb +83 -0
  59. data/lib/arrow/libraries.rb +140 -0
  60. data/lib/arrow/list-array-builder.rb +1 -68
  61. data/lib/arrow/list-data-type.rb +3 -38
  62. data/{test/test-dictionary-array.rb → lib/arrow/list-field-resolvable.rb} +26 -17
  63. data/lib/arrow/list-slice-options.rb +76 -0
  64. data/lib/arrow/list-values-appendable.rb +88 -0
  65. data/lib/arrow/loader.rb +15 -96
  66. data/{test/test-decimal128-array.rb → lib/arrow/make-struct-options.rb} +18 -18
  67. data/lib/arrow/raw-table-converter.rb +10 -3
  68. data/lib/arrow/raw-tensor-converter.rb +89 -0
  69. data/lib/arrow/record-batch-file-reader.rb +2 -0
  70. data/lib/arrow/record-batch-stream-reader.rb +2 -0
  71. data/lib/arrow/record-batch.rb +6 -2
  72. data/{test/fixture/null-without-double-quote.csv → lib/arrow/ruby.rb} +5 -4
  73. data/lib/arrow/scalar.rb +67 -0
  74. data/lib/arrow/slicer.rb +61 -0
  75. data/lib/arrow/sort-key.rb +3 -3
  76. data/lib/arrow/sparse-union-array-builder.rb +56 -0
  77. data/lib/arrow/sparse-union-array.rb +26 -0
  78. data/lib/arrow/stream-decoder.rb +29 -0
  79. data/{test/test-decimal256-data-type.rb → lib/arrow/stream-listener.rb} +25 -9
  80. data/lib/arrow/string-array-builder.rb +30 -0
  81. data/lib/arrow/struct-array-builder.rb +0 -5
  82. data/lib/arrow/table-formatter.rb +38 -8
  83. data/lib/arrow/table-list-formatter.rb +3 -3
  84. data/lib/arrow/table-loader.rb +11 -5
  85. data/lib/arrow/table-saver.rb +4 -3
  86. data/lib/arrow/table-table-formatter.rb +7 -0
  87. data/lib/arrow/table.rb +180 -33
  88. data/lib/arrow/tensor.rb +144 -0
  89. data/lib/arrow/time-unit.rb +31 -0
  90. data/lib/arrow/time32-array-builder.rb +2 -14
  91. data/lib/arrow/time32-data-type.rb +9 -38
  92. data/lib/arrow/time64-array-builder.rb +2 -14
  93. data/lib/arrow/time64-data-type.rb +9 -38
  94. data/lib/arrow/timestamp-array-builder.rb +3 -15
  95. data/lib/arrow/timestamp-data-type.rb +9 -34
  96. data/{test/test-date64-array.rb → lib/arrow/timestamp-parser.rb} +14 -6
  97. data/lib/arrow/union-array-builder.rb +59 -0
  98. data/lib/arrow/union-array.rb +26 -0
  99. data/lib/arrow/version.rb +1 -1
  100. data/lib/arrow.rb +2 -7
  101. data/red-arrow.gemspec +74 -11
  102. metadata +85 -210
  103. data/test/fixture/TestOrcFile.test1.orc +0 -0
  104. data/test/fixture/with-header-float.csv +0 -20
  105. data/test/fixture/with-header.csv +0 -20
  106. data/test/fixture/without-header-float.csv +0 -19
  107. data/test/fixture/without-header.csv +0 -19
  108. data/test/helper/omittable.rb +0 -36
  109. data/test/helper.rb +0 -30
  110. data/test/raw-records/test-basic-arrays.rb +0 -395
  111. data/test/raw-records/test-dense-union-array.rb +0 -521
  112. data/test/raw-records/test-list-array.rb +0 -610
  113. data/test/raw-records/test-map-array.rb +0 -478
  114. data/test/raw-records/test-multiple-columns.rb +0 -65
  115. data/test/raw-records/test-sparse-union-array.rb +0 -511
  116. data/test/raw-records/test-struct-array.rb +0 -515
  117. data/test/raw-records/test-table.rb +0 -47
  118. data/test/run-test.rb +0 -71
  119. data/test/test-array-builder.rb +0 -136
  120. data/test/test-array.rb +0 -325
  121. data/test/test-bigdecimal.rb +0 -40
  122. data/test/test-binary-dictionary-array-builder.rb +0 -103
  123. data/test/test-chunked-array.rb +0 -183
  124. data/test/test-column.rb +0 -92
  125. data/test/test-csv-loader.rb +0 -250
  126. data/test/test-data-type.rb +0 -83
  127. data/test/test-decimal128-array-builder.rb +0 -112
  128. data/test/test-decimal128-data-type.rb +0 -31
  129. data/test/test-decimal128.rb +0 -102
  130. data/test/test-decimal256-array-builder.rb +0 -112
  131. data/test/test-decimal256-array.rb +0 -38
  132. data/test/test-decimal256.rb +0 -102
  133. data/test/test-dense-union-data-type.rb +0 -41
  134. data/test/test-dictionary-data-type.rb +0 -40
  135. data/test/test-expression.rb +0 -40
  136. data/test/test-feather.rb +0 -49
  137. data/test/test-field.rb +0 -91
  138. data/test/test-file-output-stream.rb +0 -54
  139. data/test/test-fixed-size-binary-array-builder.rb +0 -92
  140. data/test/test-fixed-size-binary-array.rb +0 -36
  141. data/test/test-function.rb +0 -210
  142. data/test/test-group.rb +0 -180
  143. data/test/test-list-array-builder.rb +0 -79
  144. data/test/test-list-array.rb +0 -32
  145. data/test/test-list-data-type.rb +0 -69
  146. data/test/test-map-array-builder.rb +0 -110
  147. data/test/test-map-array.rb +0 -33
  148. data/test/test-memory-view.rb +0 -434
  149. data/test/test-orc.rb +0 -173
  150. data/test/test-record-batch-builder.rb +0 -125
  151. data/test/test-record-batch-file-reader.rb +0 -115
  152. data/test/test-record-batch-iterator.rb +0 -37
  153. data/test/test-record-batch-reader.rb +0 -46
  154. data/test/test-record-batch.rb +0 -182
  155. data/test/test-schema.rb +0 -134
  156. data/test/test-slicer.rb +0 -487
  157. data/test/test-sort-indices.rb +0 -40
  158. data/test/test-sort-key.rb +0 -81
  159. data/test/test-sort-options.rb +0 -58
  160. data/test/test-sparse-union-data-type.rb +0 -41
  161. data/test/test-string-dictionary-array-builder.rb +0 -103
  162. data/test/test-struct-array-builder.rb +0 -184
  163. data/test/test-struct-array.rb +0 -94
  164. data/test/test-struct-data-type.rb +0 -112
  165. data/test/test-table.rb +0 -1123
  166. data/test/test-time.rb +0 -288
  167. data/test/test-time32-array.rb +0 -81
  168. data/test/test-time64-array.rb +0 -81
  169. data/test/test-time64-data-type.rb +0 -42
  170. data/test/test-timestamp-array.rb +0 -45
  171. data/test/test-timestamp-data-type.rb +0 -42
  172. data/test/values/test-basic-arrays.rb +0 -325
  173. data/test/values/test-dense-union-array.rb +0 -509
  174. data/test/values/test-dictionary-array.rb +0 -295
  175. data/test/values/test-list-array.rb +0 -571
  176. data/test/values/test-map-array.rb +0 -466
  177. data/test/values/test-sparse-union-array.rb +0 -500
  178. data/test/values/test-struct-array.rb +0 -512
data/lib/arrow/table.rb CHANGED
@@ -22,6 +22,7 @@ module Arrow
22
22
  include ColumnContainable
23
23
  include GenericFilterable
24
24
  include GenericTakeable
25
+ include InputReferable
25
26
  include RecordContainable
26
27
 
27
28
  class << self
@@ -126,7 +127,7 @@ module Arrow
126
127
  # You can also specify schema as primitive Ruby objects.
127
128
  # See {Arrow::Schema#initialize} for details.
128
129
  #
129
- # @param arrays [::Array<Arrow::RecordBatch>] The data of the table.
130
+ # @param record_batches [::Array<Arrow::RecordBatch>] The data of the table.
130
131
  #
131
132
  # @example Create a table from schema and record batches
132
133
  # count_field = Arrow::Field.new("count", :uint32)
@@ -144,7 +145,7 @@ module Arrow
144
145
  # You can also specify schema as primitive Ruby objects.
145
146
  # See {Arrow::Schema#initialize} for details.
146
147
  #
147
- # @param arrays [::Array<::Array>] The data of the table as primitive
148
+ # @param raw_records [::Array<::Array>] The data of the table as primitive
148
149
  # Ruby objects.
149
150
  #
150
151
  # @example Create a table from schema and raw records
@@ -188,6 +189,7 @@ module Arrow
188
189
 
189
190
  reader = TableBatchReader.new(self)
190
191
  while record_batch = reader.read_next
192
+ share_input(record_batch)
191
193
  yield(record_batch)
192
194
  end
193
195
  end
@@ -314,8 +316,6 @@ module Arrow
314
316
  end
315
317
  end
316
318
 
317
- filter_options = Arrow::FilterOptions.new
318
- filter_options.null_selection_behavior = :emit_null
319
319
  sliced_tables = []
320
320
  slicers.each do |slicer|
321
321
  slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
@@ -337,7 +337,7 @@ module Arrow
337
337
  to += n_rows if to < 0
338
338
  sliced_tables << slice_by_range(from, to)
339
339
  when ::Array, BooleanArray, ChunkedArray
340
- sliced_tables << filter(slicer, filter_options)
340
+ sliced_tables << filter(slicer)
341
341
  else
342
342
  message = "slicer must be Integer, Range, (from, to), " +
343
343
  "Arrow::ChunkedArray of Arrow::BooleanArray, " +
@@ -346,10 +346,12 @@ module Arrow
346
346
  end
347
347
  end
348
348
  if sliced_tables.size > 1
349
- sliced_tables[0].concatenate(sliced_tables[1..-1])
349
+ sliced_table = sliced_tables[0].concatenate(sliced_tables[1..-1])
350
350
  else
351
- sliced_tables[0]
351
+ sliced_table = sliced_tables[0]
352
352
  end
353
+ share_input(sliced_table)
354
+ sliced_table
353
355
  end
354
356
 
355
357
  # TODO
@@ -401,7 +403,9 @@ module Arrow
401
403
  new_fields << new_column[:field]
402
404
  new_arrays << new_column[:data]
403
405
  end
404
- self.class.new(new_fields, new_arrays)
406
+ table = self.class.new(new_fields, new_arrays)
407
+ share_input(table)
408
+ table
405
409
  end
406
410
 
407
411
  alias_method :remove_column_raw, :remove_column
@@ -423,7 +427,9 @@ module Arrow
423
427
  raise IndexError.new(message)
424
428
  end
425
429
  end
426
- remove_column_raw(index)
430
+ table = remove_column_raw(index)
431
+ share_input(table)
432
+ table
427
433
  end
428
434
 
429
435
  # Experimental
@@ -445,43 +451,69 @@ module Arrow
445
451
  packed_arrays = columns.collect do |column|
446
452
  column.data.pack
447
453
  end
448
- self.class.new(schema, packed_arrays)
454
+ table = self.class.new(schema, packed_arrays)
455
+ share_input(table)
456
+ table
449
457
  end
450
458
 
451
- # @overload join(right, key, type: :inner, left_outputs: nil, right_outputs: nil)
452
- # @!macro join_common_before
453
- # @param right [Arrow::Table] The right table.
459
+ # Join another Table by matching with keys.
460
+ #
461
+ # @!macro join_common_before
462
+ # @param right [Arrow::Table] The right table.
463
+ #
464
+ # Join columns with `right` on join key columns.
454
465
  #
455
- # Join columns with `right` on join key columns.
466
+ # @!macro join_common_after
467
+ # @param type [Arrow::JoinType] How to join.
468
+ # @param left_outputs [::Array<String, Symbol>] Output columns in
469
+ # `self`.
456
470
  #
457
- # @!macro join_common_after
458
- # @param type [Arrow::JoinType] How to join.
459
- # @param left_outputs [::Array<String, Symbol>] Output columns in
460
- # `self`.
471
+ # If both of `left_outputs` and `right_outputs` aren't
472
+ # specified, all columns in `self` and `right` are
473
+ # output.
474
+ # @param right_outputs [::Array<String, Symbol>] Output columns in
475
+ # `right`.
476
+ #
477
+ # If both of `left_outputs` and `right_outputs` aren't
478
+ # specified, all columns in `self` and `right` are
479
+ # output.
480
+ # @return [Arrow::Table]
481
+ # The joined `Arrow::Table`.
461
482
  #
462
- # If both of `left_outputs` and `right_outputs` aren't
463
- # specified, all columns in `self` and `right` are
464
- # outputted.
465
- # @param right_outputs [::Array<String, Symbol>] Output columns in
466
- # `right`.
483
+ # @overload join(right, type: :inner, left_outputs: nil, right_outputs: nil)
484
+ # If key(s) are not supplied, common keys in self and right are used
485
+ # (natural join).
467
486
  #
468
- # If both of `left_outputs` and `right_outputs` aren't
469
- # specified, all columns in `self` and `right` are
470
- # outputted.
471
- # @return [Arrow::Table]
472
- # The joined `Arrow::Table`.
487
+ # Columns used as keys are merged and remain on the left side
488
+ # when both of `left_outputs` and `right_outputs` are `nil`.
489
+ #
490
+ # @macro join_common_before
491
+ # @macro join_common_after
492
+ #
493
+ # @since 11.0.0
494
+ #
495
+ # @overload join(right, key, type: :inner, left_outputs: nil, right_outputs: nil)
496
+ # Join right by a key.
497
+ #
498
+ # Columns used as keys are merged and remain on the left side
499
+ # when both of `left_outputs` and `right_outputs` are `nil`.
473
500
  #
474
501
  # @macro join_common_before
475
502
  # @param key [String, Symbol] A join key.
476
503
  # @macro join_common_after
477
504
  #
478
- # @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
505
+ # @overload join(right, keys, type: :inner, left_suffix: "", right_suffix: "",
506
+ # left_outputs: nil, right_outputs: nil)
507
+ # Join right by keys.
508
+ #
509
+ # Column name can be renamed by appending `left_suffix` or `right_suffix`.
479
510
  #
480
511
  # @macro join_common_before
481
512
  # @param keys [::Array<String, Symbol>] Join keys.
482
513
  # @macro join_common_after
483
514
  #
484
515
  # @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
516
+ # Join right by a key or keys mapped by a hash.
485
517
  #
486
518
  # @macro join_common_before
487
519
  # @param keys [Hash] Specify join keys in `self` and `right` separately.
@@ -492,7 +524,16 @@ module Arrow
492
524
  # @macro join_common_after
493
525
  #
494
526
  # @since 7.0.0
495
- def join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
527
+ def join(right,
528
+ keys=nil,
529
+ type: :inner,
530
+ left_suffix: "",
531
+ right_suffix: "",
532
+ left_outputs: nil,
533
+ right_outputs: nil)
534
+ is_natural_join = keys.nil?
535
+ keys ||= (column_names & right.column_names)
536
+ type = JoinType.try_convert(type) || type
496
537
  plan = ExecutePlan.new
497
538
  left_node = plan.build_source_node(self)
498
539
  right_node = plan.build_source_node(right)
@@ -508,22 +549,46 @@ module Arrow
508
549
  hash_join_node_options = HashJoinNodeOptions.new(type,
509
550
  left_keys,
510
551
  right_keys)
552
+ use_manual_outputs = false
511
553
  unless left_outputs.nil?
512
554
  hash_join_node_options.left_outputs = left_outputs
555
+ use_manual_outputs = true
513
556
  end
514
557
  unless right_outputs.nil?
515
558
  hash_join_node_options.right_outputs = right_outputs
559
+ use_manual_outputs = true
516
560
  end
517
561
  hash_join_node = plan.build_hash_join_node(left_node,
518
562
  right_node,
519
563
  hash_join_node_options)
564
+ type_nick = type.nick
565
+ is_filter_join = (type_nick.end_with?("-semi") or
566
+ type_nick.end_with?("-anti"))
567
+ if use_manual_outputs or is_filter_join
568
+ process_node = hash_join_node
569
+ elsif is_natural_join
570
+ process_node = join_merge_keys(plan, hash_join_node, right, keys)
571
+ elsif keys.is_a?(String) or keys.is_a?(Symbol)
572
+ process_node = join_merge_keys(plan, hash_join_node, right, [keys.to_s])
573
+ elsif !keys.is_a?(Hash) and (left_suffix != "" or right_suffix != "")
574
+ process_node = join_rename_keys(plan,
575
+ hash_join_node,
576
+ right,
577
+ keys,
578
+ left_suffix,
579
+ right_suffix)
580
+ else
581
+ process_node = hash_join_node
582
+ end
520
583
  sink_node_options = SinkNodeOptions.new
521
- plan.build_sink_node(hash_join_node, sink_node_options)
584
+ plan.build_sink_node(process_node, sink_node_options)
522
585
  plan.validate
523
586
  plan.start
524
587
  plan.wait
525
- reader = sink_node_options.get_reader(hash_join_node.output_schema)
526
- reader.read_all
588
+ reader = sink_node_options.get_reader(process_node.output_schema)
589
+ table = reader.read_all
590
+ share_input(table)
591
+ table
527
592
  end
528
593
 
529
594
  alias_method :to_s_raw, :to_s
@@ -593,5 +658,87 @@ module Arrow
593
658
  raise ArgumentError, message
594
659
  end
595
660
  end
661
+
662
+ def join_merge_keys(plan, input_node, right, keys)
663
+ expressions = []
664
+ names = []
665
+ normalized_keys = {}
666
+ keys.each do |key|
667
+ normalized_keys[key.to_s] = true
668
+ end
669
+ key_to_outputs = {}
670
+ outputs = []
671
+ left_n_column_names = column_names.size
672
+ column_names.each_with_index do |name, i|
673
+ is_key = normalized_keys.include?(name)
674
+ output = {is_key: is_key, name: name, index: i, direction: :left}
675
+ outputs << output
676
+ key_to_outputs[name] = {left: output} if is_key
677
+ end
678
+ right.column_names.each_with_index do |name, i|
679
+ index = left_n_column_names + i
680
+ is_key = normalized_keys.include?(name)
681
+ output = {is_key: is_key, name: name, index: index, direction: :right}
682
+ outputs << output
683
+ key_to_outputs[name][:right] = output if is_key
684
+ end
685
+
686
+ outputs.each do |output|
687
+ if output[:is_key]
688
+ next if output[:direction] == :right
689
+ left_output = key_to_outputs[output[:name]][:left]
690
+ right_output = key_to_outputs[output[:name]][:right]
691
+ left_field = FieldExpression.new("[#{left_output[:index]}]")
692
+ right_field = FieldExpression.new("[#{right_output[:index]}]")
693
+ is_left_null = CallExpression.new("is_null", [left_field])
694
+ merge_column = CallExpression.new("if_else",
695
+ [
696
+ is_left_null,
697
+ right_field,
698
+ left_field,
699
+ ])
700
+ expressions << merge_column
701
+ else
702
+ expressions << FieldExpression.new("[#{output[:index]}]")
703
+ end
704
+ names << output[:name]
705
+ end
706
+ project_node_options = ProjectNodeOptions.new(expressions, names)
707
+ plan.build_project_node(input_node, project_node_options)
708
+ end
709
+
710
+ def join_rename_keys(plan,
711
+ input_node,
712
+ right,
713
+ keys,
714
+ left_suffix,
715
+ right_suffix)
716
+ expressions = []
717
+ names = []
718
+ normalized_keys = {}
719
+ keys.each do |key|
720
+ normalized_keys[key.to_s] = true
721
+ end
722
+ left_n_column_names = column_names.size
723
+ column_names.each_with_index do |name, i|
724
+ expressions << FieldExpression.new("[#{i}]")
725
+ if normalized_keys.include?(name)
726
+ names << "#{name}#{left_suffix}"
727
+ else
728
+ names << name
729
+ end
730
+ end
731
+ right.column_names.each_with_index do |name, i|
732
+ index = left_n_column_names + i
733
+ expressions << FieldExpression.new("[#{index}]")
734
+ if normalized_keys.include?(name)
735
+ names << "#{name}#{right_suffix}"
736
+ else
737
+ names << name
738
+ end
739
+ end
740
+ project_node_options = ProjectNodeOptions.new(expressions, names)
741
+ plan.build_project_node(input_node, project_node_options)
742
+ end
596
743
  end
597
744
  end
data/lib/arrow/tensor.rb CHANGED
@@ -15,10 +15,154 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
+ require_relative "raw-tensor-converter"
19
+
18
20
  module Arrow
19
21
  class Tensor
22
+ alias_method :initialize_raw, :initialize
23
+ # Creates a new {Arrow::Tensor}.
24
+ #
25
+ # @overload initialize(raw_tensor, data_type: nil, shape: nil, dimension_names: nil)
26
+ #
27
+ # @param raw_tensor [::Array<Numeric>] The tensor represented as a
28
+ # raw `Array` (not `Arrow::Array`) and `Numeric`s. You can
29
+ # pass a nested `Array` for a multi-dimensional tensor.
30
+ #
31
+ # @param data_type [Arrow::DataType, String, Symbol, ::Array<String>,
32
+ # ::Array<Symbol>, Hash, nil] The element data type of the tensor.
33
+ #
34
+ # If you specify `nil`, data type is guessed from `raw_tensor`.
35
+ #
36
+ # See {Arrow::DataType.resolve} for how to specify data type.
37
+ #
38
+ # @param shape [::Array<Integer>, nil] The array of dimension sizes.
39
+ #
40
+ # If you specify `nil`, shape is guessed from `raw_tensor`.
41
+ #
42
+ # @param dimension_names [::Array<String>, ::Array<Symbol>, nil]
43
+ # The array of the dimension names.
44
+ #
45
+ # If you specify `nil`, all dimensions have empty names.
46
+ #
47
+ # @example Create a tensor from Ruby's Array
48
+ # raw_tensor = [
49
+ # [
50
+ # [1, 2, 3, 4],
51
+ # [5, 6, 7, 8],
52
+ # ],
53
+ # [
54
+ # [9, 10, 11, 12],
55
+ # [13, 14, 15, 16],
56
+ # ],
57
+ # [
58
+ # [17, 18, 19, 20],
59
+ # [21, 22, 23, 24],
60
+ # ],
61
+ # ]
62
+ # Arrow::Tensor.new(raw_tensor)
63
+ #
64
+ # @since 10.0.0
65
+ #
66
+ # @overload initialize(data_type, data, shape, strides, dimension_names)
67
+ #
68
+ # @param data_type [Arrow::DataType, String, Symbol, ::Array<String>,
69
+ # ::Array<Symbol>, Hash] The element data type of the tensor.
70
+ #
71
+ # See {Arrow::DataType.resolve} for how to specify data type.
72
+ #
73
+ # @param data [Arrow::Buffer, String] The data of the tensor.
74
+ #
75
+ # @param shape [::Array<Integer>] The array of dimension sizes.
76
+ #
77
+ # @param strides [::Array<Integer>, nil] The array of strides which
78
+ # is the number of bytes between two adjacent elements in each
79
+ # dimension.
80
+ #
81
+ # If you specify `nil` or an empty `Array`, strides are
82
+ # guessed from `data_type` and `data`.
83
+ #
84
+ # @param dimension_names [::Array<String>, ::Array<Symbol>, nil]
85
+ # The array of the dimension names.
86
+ #
87
+ # If you specify `nil`, all dimensions have empty names.
88
+ #
89
+ # @example Create a tensor from Arrow::Buffer
90
+ # raw_data = [
91
+ # 1, 2,
92
+ # 3, 4,
93
+ #
94
+ # 5, 6,
95
+ # 7, 8,
96
+ #
97
+ # 9, 10,
98
+ # 11, 12,
99
+ # ]
100
+ # data = Arrow::Buffer.new(raw_data.pack("c*").freeze)
101
+ # shape = [3, 2, 2]
102
+ # strides = []
103
+ # names = ["a", "b", "c"]
104
+ # Arrow::Tensor.new(:int8, data, shape, strides, names)
105
+ def initialize(*args,
106
+ data_type: nil,
107
+ data: nil,
108
+ shape: nil,
109
+ strides: nil,
110
+ dimension_names: nil)
111
+ n_args = args.size
112
+ case n_args
113
+ when 1
114
+ converter = RawTensorConverter.new(args[0],
115
+ data_type: data_type,
116
+ shape: shape,
117
+ strides: strides,
118
+ dimension_names: dimension_names)
119
+ data_type = converter.data_type
120
+ data = converter.data
121
+ shape = converter.shape
122
+ strides = converter.strides
123
+ dimension_names = converter.dimension_names
124
+ when 0, 2..5
125
+ data_type = args[0] || data_type
126
+ data = args[1] || data
127
+ shape = args[2] || shape
128
+ strides = args[3] || strides
129
+ dimension_names = args[4] || dimension_names
130
+ if data_type.nil?
131
+ raise ArgumentError, "data_type: is missing: #{data.inspect}"
132
+ end
133
+ else
134
+ message = "wrong number of arguments (given #{n_args}, expected 0..5)"
135
+ raise ArgumentError, message
136
+ end
137
+ initialize_raw(DataType.resolve(data_type),
138
+ data,
139
+ shape,
140
+ strides,
141
+ dimension_names)
142
+ end
143
+
144
+ def dimension_names
145
+ n_dimensions.times.collect do |i|
146
+ get_dimension_name(i)
147
+ end
148
+ end
149
+
20
150
  def to_arrow
21
151
  self
22
152
  end
153
+
154
+ def to_arrow_array
155
+ if n_dimensions != 1
156
+ raise RangeError, "must be 1 dimensional tensor: #{shape.inspect}"
157
+ end
158
+ value_data_type.array_class.new(size,
159
+ buffer,
160
+ nil,
161
+ 0)
162
+ end
163
+
164
+ def to_arrow_chunked_array
165
+ ChunkedArray.new([to_arrow_array])
166
+ end
23
167
  end
24
168
  end
@@ -0,0 +1,31 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module Arrow
19
+ class TimeUnit
20
+ class << self
21
+ # @api private
22
+ def try_convert(value)
23
+ if value.is_a?(Hash) and value.size == 1 and value[:unit]
24
+ super(value[:unit])
25
+ else
26
+ super
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
@@ -18,24 +18,12 @@
18
18
  module Arrow
19
19
  class Time32ArrayBuilder
20
20
  class << self
21
- def build(unit_or_data_type, values)
22
- builder = new(unit_or_data_type)
21
+ def build(data_type, values)
22
+ builder = new(data_type)
23
23
  builder.build(values)
24
24
  end
25
25
  end
26
26
 
27
- alias_method :initialize_raw, :initialize
28
- def initialize(unit_or_data_type)
29
- case unit_or_data_type
30
- when DataType
31
- data_type = unit_or_data_type
32
- else
33
- unit = unit_or_data_type
34
- data_type = Time32DataType.new(unit)
35
- end
36
- initialize_raw(data_type)
37
- end
38
-
39
27
  def unit
40
28
  @unit ||= value_data_type.unit
41
29
  end
@@ -17,45 +17,16 @@
17
17
 
18
18
  module Arrow
19
19
  class Time32DataType
20
- alias_method :initialize_raw, :initialize
21
- private :initialize_raw
22
-
23
- # Creates a new {Arrow::Time32DataType}.
24
- #
25
- # @overload initialize(unit)
26
- #
27
- # @param unit [Arrow::TimeUnit, Symbol] The unit of the
28
- # time32 data type.
29
- #
30
- # The unit must be second or millisecond.
31
- #
32
- # @example Create a time32 data type with Arrow::TimeUnit
33
- # Arrow::Time32DataType.new(Arrow::TimeUnit::MILLI)
34
- #
35
- # @example Create a time32 data type with Symbol
36
- # Arrow::Time32DataType.new(:milli)
37
- #
38
- # @overload initialize(description)
39
- #
40
- # @param description [Hash] The description of the time32 data
41
- # type. It must have `:unit` value.
42
- #
43
- # @option description [Arrow::TimeUnit, Symbol] :unit The unit of
44
- # the time32 data type.
45
- #
46
- # The unit must be second or millisecond.
47
- #
48
- # @example Create a time32 data type with Arrow::TimeUnit
49
- # Arrow::Time32DataType.new(unit: Arrow::TimeUnit::MILLI)
50
- #
51
- # @example Create a time32 data type with Symbol
52
- # Arrow::Time32DataType.new(unit: :milli)
53
- def initialize(unit)
54
- if unit.is_a?(Hash)
55
- description = unit
56
- unit = description[:unit]
20
+ class << self
21
+ # @api private
22
+ def try_convert(value)
23
+ case value
24
+ when Symbol, Arrow::TimeUnit
25
+ new(value)
26
+ else
27
+ super
28
+ end
57
29
  end
58
- initialize_raw(unit)
59
30
  end
60
31
  end
61
32
  end
@@ -18,24 +18,12 @@
18
18
  module Arrow
19
19
  class Time64ArrayBuilder
20
20
  class << self
21
- def build(unit_or_data_type, values)
22
- builder = new(unit_or_data_type)
21
+ def build(data_type, values)
22
+ builder = new(data_type)
23
23
  builder.build(values)
24
24
  end
25
25
  end
26
26
 
27
- alias_method :initialize_raw, :initialize
28
- def initialize(unit_or_data_type)
29
- case unit_or_data_type
30
- when DataType
31
- data_type = unit_or_data_type
32
- else
33
- unit = unit_or_data_type
34
- data_type = Time64DataType.new(unit)
35
- end
36
- initialize_raw(data_type)
37
- end
38
-
39
27
  def unit
40
28
  @unit ||= value_data_type.unit
41
29
  end
@@ -17,45 +17,16 @@
17
17
 
18
18
  module Arrow
19
19
  class Time64DataType
20
- alias_method :initialize_raw, :initialize
21
- private :initialize_raw
22
-
23
- # Creates a new {Arrow::Time64DataType}.
24
- #
25
- # @overload initialize(unit)
26
- #
27
- # @param unit [Arrow::TimeUnit, Symbol] The unit of the
28
- # time64 data type.
29
- #
30
- # The unit must be microsecond or nanosecond.
31
- #
32
- # @example Create a time64 data type with Arrow::TimeUnit
33
- # Arrow::Time64DataType.new(Arrow::TimeUnit::NANO)
34
- #
35
- # @example Create a time64 data type with Symbol
36
- # Arrow::Time64DataType.new(:nano)
37
- #
38
- # @overload initialize(description)
39
- #
40
- # @param description [Hash] The description of the time64 data
41
- # type. It must have `:unit` value.
42
- #
43
- # @option description [Arrow::TimeUnit, Symbol] :unit The unit of
44
- # the time64 data type.
45
- #
46
- # The unit must be microsecond or nanosecond.
47
- #
48
- # @example Create a time64 data type with Arrow::TimeUnit
49
- # Arrow::Time64DataType.new(unit: Arrow::TimeUnit::NANO)
50
- #
51
- # @example Create a time64 data type with Symbol
52
- # Arrow::Time64DataType.new(unit: :nano)
53
- def initialize(unit)
54
- if unit.is_a?(Hash)
55
- description = unit
56
- unit = description[:unit]
20
+ class << self
21
+ # @api private
22
+ def try_convert(value)
23
+ case value
24
+ when Symbol, Arrow::TimeUnit
25
+ new(value)
26
+ else
27
+ super
28
+ end
57
29
  end
58
- initialize_raw(unit)
59
30
  end
60
31
  end
61
32
  end