red-arrow 8.0.0 → 24.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +15 -7
- data/ext/arrow/arrow.cpp +67 -0
- data/ext/arrow/converters.cpp +10 -0
- data/ext/arrow/converters.hpp +310 -46
- data/ext/arrow/extconf.rb +41 -22
- data/ext/arrow/raw-records.cpp +165 -2
- data/ext/arrow/red-arrow.hpp +2 -0
- data/ext/arrow/values.cpp +6 -2
- data/lib/arrow/array-builder.rb +89 -14
- data/{test/test-time32-data-type.rb → lib/arrow/array-computable.rb} +24 -16
- data/{test/test-buffer.rb → lib/arrow/array-statistics.rb} +19 -24
- data/lib/arrow/array.rb +40 -4
- data/lib/arrow/chunked-array.rb +56 -1
- data/lib/arrow/column-containable.rb +9 -0
- data/lib/arrow/column.rb +49 -4
- data/{test/test-tensor.rb → lib/arrow/csv-write-options.rb} +28 -31
- data/lib/arrow/data-type.rb +17 -3
- data/lib/arrow/decimal128-array-builder.rb +16 -6
- data/lib/arrow/decimal128.rb +14 -0
- data/lib/arrow/decimal256-array-builder.rb +16 -6
- data/lib/arrow/decimal256.rb +14 -0
- data/{test/test-float-scalar.rb → lib/arrow/dense-union-array-builder.rb} +27 -24
- data/{test/test-boolean-scalar.rb → lib/arrow/dense-union-array.rb} +7 -7
- data/lib/arrow/duration-array-builder.rb +27 -0
- data/lib/arrow/duration-array.rb +24 -0
- data/lib/arrow/duration-data-type.rb +32 -0
- data/lib/arrow/expression.rb +6 -2
- data/lib/arrow/field-containable.rb +1 -1
- data/lib/arrow/field.rb +44 -3
- data/lib/arrow/fixed-size-list-array-builder.rb +29 -0
- data/lib/arrow/fixed-size-list-data-type.rb +118 -0
- data/lib/arrow/function.rb +0 -1
- data/lib/arrow/half-float-array-builder.rb +32 -0
- data/lib/arrow/half-float-array.rb +24 -0
- data/lib/arrow/half-float.rb +118 -0
- data/{test/helper/fixture.rb → lib/arrow/input-referable.rb} +7 -6
- data/lib/arrow/jruby/array-builder.rb +114 -0
- data/lib/arrow/jruby/array.rb +109 -0
- data/lib/arrow/jruby/chunked-array.rb +36 -0
- data/lib/arrow/jruby/compression-type.rb +26 -0
- data/lib/arrow/jruby/csv-read-options.rb +32 -0
- data/{test/test-map-data-type.rb → lib/arrow/jruby/data-type.rb} +24 -12
- data/lib/arrow/jruby/decimal128.rb +28 -0
- data/lib/arrow/jruby/decimal256.rb +28 -0
- data/{test/fixture/float-integer.csv → lib/arrow/jruby/error.rb} +7 -4
- data/lib/arrow/jruby/file-system.rb +24 -0
- data/{test/test-null-array.rb → lib/arrow/jruby/function.rb} +5 -4
- data/lib/arrow/jruby/record-batch-iterator.rb +24 -0
- data/{test/fixture/null-with-double-quote.csv → lib/arrow/jruby/record-batch.rb} +8 -4
- data/{test/fixture/integer-float.csv → lib/arrow/jruby/sort-key.rb} +8 -4
- data/lib/arrow/jruby/sort-options.rb +24 -0
- data/lib/arrow/jruby/stream-listener-raw.rb +25 -0
- data/{test/test-rolling-window.rb → lib/arrow/jruby/table.rb} +19 -19
- data/lib/arrow/jruby/writable.rb +24 -0
- data/lib/arrow/jruby.rb +52 -0
- data/{test/test-date32-array.rb → lib/arrow/large-list-array-builder.rb} +10 -5
- data/lib/arrow/large-list-data-type.rb +83 -0
- data/lib/arrow/libraries.rb +140 -0
- data/lib/arrow/list-array-builder.rb +1 -68
- data/lib/arrow/list-data-type.rb +3 -38
- data/{test/test-dictionary-array.rb → lib/arrow/list-field-resolvable.rb} +26 -17
- data/lib/arrow/list-slice-options.rb +76 -0
- data/lib/arrow/list-values-appendable.rb +88 -0
- data/lib/arrow/loader.rb +15 -96
- data/{test/test-decimal128-array.rb → lib/arrow/make-struct-options.rb} +18 -18
- data/lib/arrow/raw-table-converter.rb +10 -3
- data/lib/arrow/raw-tensor-converter.rb +89 -0
- data/lib/arrow/record-batch-file-reader.rb +2 -0
- data/lib/arrow/record-batch-stream-reader.rb +2 -0
- data/lib/arrow/record-batch.rb +6 -2
- data/{test/fixture/null-without-double-quote.csv → lib/arrow/ruby.rb} +5 -4
- data/lib/arrow/scalar.rb +67 -0
- data/lib/arrow/slicer.rb +61 -0
- data/lib/arrow/sort-key.rb +3 -3
- data/lib/arrow/sparse-union-array-builder.rb +56 -0
- data/lib/arrow/sparse-union-array.rb +26 -0
- data/lib/arrow/stream-decoder.rb +29 -0
- data/{test/test-decimal256-data-type.rb → lib/arrow/stream-listener.rb} +25 -9
- data/lib/arrow/string-array-builder.rb +30 -0
- data/lib/arrow/struct-array-builder.rb +0 -5
- data/lib/arrow/table-formatter.rb +38 -8
- data/lib/arrow/table-list-formatter.rb +3 -3
- data/lib/arrow/table-loader.rb +11 -5
- data/lib/arrow/table-saver.rb +4 -3
- data/lib/arrow/table-table-formatter.rb +7 -0
- data/lib/arrow/table.rb +180 -33
- data/lib/arrow/tensor.rb +144 -0
- data/lib/arrow/time-unit.rb +31 -0
- data/lib/arrow/time32-array-builder.rb +2 -14
- data/lib/arrow/time32-data-type.rb +9 -38
- data/lib/arrow/time64-array-builder.rb +2 -14
- data/lib/arrow/time64-data-type.rb +9 -38
- data/lib/arrow/timestamp-array-builder.rb +3 -15
- data/lib/arrow/timestamp-data-type.rb +9 -34
- data/{test/test-date64-array.rb → lib/arrow/timestamp-parser.rb} +14 -6
- data/lib/arrow/union-array-builder.rb +59 -0
- data/lib/arrow/union-array.rb +26 -0
- data/lib/arrow/version.rb +1 -1
- data/lib/arrow.rb +2 -7
- data/red-arrow.gemspec +74 -11
- metadata +85 -210
- data/test/fixture/TestOrcFile.test1.orc +0 -0
- data/test/fixture/with-header-float.csv +0 -20
- data/test/fixture/with-header.csv +0 -20
- data/test/fixture/without-header-float.csv +0 -19
- data/test/fixture/without-header.csv +0 -19
- data/test/helper/omittable.rb +0 -36
- data/test/helper.rb +0 -30
- data/test/raw-records/test-basic-arrays.rb +0 -395
- data/test/raw-records/test-dense-union-array.rb +0 -521
- data/test/raw-records/test-list-array.rb +0 -610
- data/test/raw-records/test-map-array.rb +0 -478
- data/test/raw-records/test-multiple-columns.rb +0 -65
- data/test/raw-records/test-sparse-union-array.rb +0 -511
- data/test/raw-records/test-struct-array.rb +0 -515
- data/test/raw-records/test-table.rb +0 -47
- data/test/run-test.rb +0 -71
- data/test/test-array-builder.rb +0 -136
- data/test/test-array.rb +0 -325
- data/test/test-bigdecimal.rb +0 -40
- data/test/test-binary-dictionary-array-builder.rb +0 -103
- data/test/test-chunked-array.rb +0 -183
- data/test/test-column.rb +0 -92
- data/test/test-csv-loader.rb +0 -250
- data/test/test-data-type.rb +0 -83
- data/test/test-decimal128-array-builder.rb +0 -112
- data/test/test-decimal128-data-type.rb +0 -31
- data/test/test-decimal128.rb +0 -102
- data/test/test-decimal256-array-builder.rb +0 -112
- data/test/test-decimal256-array.rb +0 -38
- data/test/test-decimal256.rb +0 -102
- data/test/test-dense-union-data-type.rb +0 -41
- data/test/test-dictionary-data-type.rb +0 -40
- data/test/test-expression.rb +0 -40
- data/test/test-feather.rb +0 -49
- data/test/test-field.rb +0 -91
- data/test/test-file-output-stream.rb +0 -54
- data/test/test-fixed-size-binary-array-builder.rb +0 -92
- data/test/test-fixed-size-binary-array.rb +0 -36
- data/test/test-function.rb +0 -210
- data/test/test-group.rb +0 -180
- data/test/test-list-array-builder.rb +0 -79
- data/test/test-list-array.rb +0 -32
- data/test/test-list-data-type.rb +0 -69
- data/test/test-map-array-builder.rb +0 -110
- data/test/test-map-array.rb +0 -33
- data/test/test-memory-view.rb +0 -434
- data/test/test-orc.rb +0 -173
- data/test/test-record-batch-builder.rb +0 -125
- data/test/test-record-batch-file-reader.rb +0 -115
- data/test/test-record-batch-iterator.rb +0 -37
- data/test/test-record-batch-reader.rb +0 -46
- data/test/test-record-batch.rb +0 -182
- data/test/test-schema.rb +0 -134
- data/test/test-slicer.rb +0 -487
- data/test/test-sort-indices.rb +0 -40
- data/test/test-sort-key.rb +0 -81
- data/test/test-sort-options.rb +0 -58
- data/test/test-sparse-union-data-type.rb +0 -41
- data/test/test-string-dictionary-array-builder.rb +0 -103
- data/test/test-struct-array-builder.rb +0 -184
- data/test/test-struct-array.rb +0 -94
- data/test/test-struct-data-type.rb +0 -112
- data/test/test-table.rb +0 -1123
- data/test/test-time.rb +0 -288
- data/test/test-time32-array.rb +0 -81
- data/test/test-time64-array.rb +0 -81
- data/test/test-time64-data-type.rb +0 -42
- data/test/test-timestamp-array.rb +0 -45
- data/test/test-timestamp-data-type.rb +0 -42
- data/test/values/test-basic-arrays.rb +0 -325
- data/test/values/test-dense-union-array.rb +0 -509
- data/test/values/test-dictionary-array.rb +0 -295
- data/test/values/test-list-array.rb +0 -571
- data/test/values/test-map-array.rb +0 -466
- data/test/values/test-sparse-union-array.rb +0 -500
- data/test/values/test-struct-array.rb +0 -512
data/lib/arrow/table.rb
CHANGED
|
@@ -22,6 +22,7 @@ module Arrow
|
|
|
22
22
|
include ColumnContainable
|
|
23
23
|
include GenericFilterable
|
|
24
24
|
include GenericTakeable
|
|
25
|
+
include InputReferable
|
|
25
26
|
include RecordContainable
|
|
26
27
|
|
|
27
28
|
class << self
|
|
@@ -126,7 +127,7 @@ module Arrow
|
|
|
126
127
|
# You can also specify schema as primitive Ruby objects.
|
|
127
128
|
# See {Arrow::Schema#initialize} for details.
|
|
128
129
|
#
|
|
129
|
-
# @param
|
|
130
|
+
# @param record_batches [::Array<Arrow::RecordBatch>] The data of the table.
|
|
130
131
|
#
|
|
131
132
|
# @example Create a table from schema and record batches
|
|
132
133
|
# count_field = Arrow::Field.new("count", :uint32)
|
|
@@ -144,7 +145,7 @@ module Arrow
|
|
|
144
145
|
# You can also specify schema as primitive Ruby objects.
|
|
145
146
|
# See {Arrow::Schema#initialize} for details.
|
|
146
147
|
#
|
|
147
|
-
# @param
|
|
148
|
+
# @param raw_records [::Array<::Array>] The data of the table as primitive
|
|
148
149
|
# Ruby objects.
|
|
149
150
|
#
|
|
150
151
|
# @example Create a table from schema and raw records
|
|
@@ -188,6 +189,7 @@ module Arrow
|
|
|
188
189
|
|
|
189
190
|
reader = TableBatchReader.new(self)
|
|
190
191
|
while record_batch = reader.read_next
|
|
192
|
+
share_input(record_batch)
|
|
191
193
|
yield(record_batch)
|
|
192
194
|
end
|
|
193
195
|
end
|
|
@@ -314,8 +316,6 @@ module Arrow
|
|
|
314
316
|
end
|
|
315
317
|
end
|
|
316
318
|
|
|
317
|
-
filter_options = Arrow::FilterOptions.new
|
|
318
|
-
filter_options.null_selection_behavior = :emit_null
|
|
319
319
|
sliced_tables = []
|
|
320
320
|
slicers.each do |slicer|
|
|
321
321
|
slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
|
|
@@ -337,7 +337,7 @@ module Arrow
|
|
|
337
337
|
to += n_rows if to < 0
|
|
338
338
|
sliced_tables << slice_by_range(from, to)
|
|
339
339
|
when ::Array, BooleanArray, ChunkedArray
|
|
340
|
-
sliced_tables << filter(slicer
|
|
340
|
+
sliced_tables << filter(slicer)
|
|
341
341
|
else
|
|
342
342
|
message = "slicer must be Integer, Range, (from, to), " +
|
|
343
343
|
"Arrow::ChunkedArray of Arrow::BooleanArray, " +
|
|
@@ -346,10 +346,12 @@ module Arrow
|
|
|
346
346
|
end
|
|
347
347
|
end
|
|
348
348
|
if sliced_tables.size > 1
|
|
349
|
-
sliced_tables[0].concatenate(sliced_tables[1..-1])
|
|
349
|
+
sliced_table = sliced_tables[0].concatenate(sliced_tables[1..-1])
|
|
350
350
|
else
|
|
351
|
-
sliced_tables[0]
|
|
351
|
+
sliced_table = sliced_tables[0]
|
|
352
352
|
end
|
|
353
|
+
share_input(sliced_table)
|
|
354
|
+
sliced_table
|
|
353
355
|
end
|
|
354
356
|
|
|
355
357
|
# TODO
|
|
@@ -401,7 +403,9 @@ module Arrow
|
|
|
401
403
|
new_fields << new_column[:field]
|
|
402
404
|
new_arrays << new_column[:data]
|
|
403
405
|
end
|
|
404
|
-
self.class.new(new_fields, new_arrays)
|
|
406
|
+
table = self.class.new(new_fields, new_arrays)
|
|
407
|
+
share_input(table)
|
|
408
|
+
table
|
|
405
409
|
end
|
|
406
410
|
|
|
407
411
|
alias_method :remove_column_raw, :remove_column
|
|
@@ -423,7 +427,9 @@ module Arrow
|
|
|
423
427
|
raise IndexError.new(message)
|
|
424
428
|
end
|
|
425
429
|
end
|
|
426
|
-
remove_column_raw(index)
|
|
430
|
+
table = remove_column_raw(index)
|
|
431
|
+
share_input(table)
|
|
432
|
+
table
|
|
427
433
|
end
|
|
428
434
|
|
|
429
435
|
# Experimental
|
|
@@ -445,43 +451,69 @@ module Arrow
|
|
|
445
451
|
packed_arrays = columns.collect do |column|
|
|
446
452
|
column.data.pack
|
|
447
453
|
end
|
|
448
|
-
self.class.new(schema, packed_arrays)
|
|
454
|
+
table = self.class.new(schema, packed_arrays)
|
|
455
|
+
share_input(table)
|
|
456
|
+
table
|
|
449
457
|
end
|
|
450
458
|
|
|
451
|
-
#
|
|
452
|
-
#
|
|
453
|
-
#
|
|
459
|
+
# Join another Table by matching with keys.
|
|
460
|
+
#
|
|
461
|
+
# @!macro join_common_before
|
|
462
|
+
# @param right [Arrow::Table] The right table.
|
|
463
|
+
#
|
|
464
|
+
# Join columns with `right` on join key columns.
|
|
454
465
|
#
|
|
455
|
-
#
|
|
466
|
+
# @!macro join_common_after
|
|
467
|
+
# @param type [Arrow::JoinType] How to join.
|
|
468
|
+
# @param left_outputs [::Array<String, Symbol>] Output columns in
|
|
469
|
+
# `self`.
|
|
456
470
|
#
|
|
457
|
-
#
|
|
458
|
-
#
|
|
459
|
-
#
|
|
460
|
-
#
|
|
471
|
+
# If both of `left_outputs` and `right_outputs` aren't
|
|
472
|
+
# specified, all columns in `self` and `right` are
|
|
473
|
+
# output.
|
|
474
|
+
# @param right_outputs [::Array<String, Symbol>] Output columns in
|
|
475
|
+
# `right`.
|
|
476
|
+
#
|
|
477
|
+
# If both of `left_outputs` and `right_outputs` aren't
|
|
478
|
+
# specified, all columns in `self` and `right` are
|
|
479
|
+
# output.
|
|
480
|
+
# @return [Arrow::Table]
|
|
481
|
+
# The joined `Arrow::Table`.
|
|
461
482
|
#
|
|
462
|
-
#
|
|
463
|
-
#
|
|
464
|
-
#
|
|
465
|
-
# @param right_outputs [::Array<String, Symbol>] Output columns in
|
|
466
|
-
# `right`.
|
|
483
|
+
# @overload join(right, type: :inner, left_outputs: nil, right_outputs: nil)
|
|
484
|
+
# If key(s) are not supplied, common keys in self and right are used
|
|
485
|
+
# (natural join).
|
|
467
486
|
#
|
|
468
|
-
#
|
|
469
|
-
#
|
|
470
|
-
#
|
|
471
|
-
#
|
|
472
|
-
#
|
|
487
|
+
# Columns used as keys are merged and remain in left side
|
|
488
|
+
# when both of `left_outputs` and `right_outputs` are `nil`.
|
|
489
|
+
#
|
|
490
|
+
# @macro join_common_before
|
|
491
|
+
# @macro join_common_after
|
|
492
|
+
#
|
|
493
|
+
# @since 11.0.0
|
|
494
|
+
#
|
|
495
|
+
# @overload join(right, key, type: :inner, left_outputs: nil, right_outputs: nil)
|
|
496
|
+
# Join right by a key.
|
|
497
|
+
#
|
|
498
|
+
# Columns used as keys are merged and remain in left side
|
|
499
|
+
# when both of `left_outputs` and `right_outputs` are `nil`.
|
|
473
500
|
#
|
|
474
501
|
# @macro join_common_before
|
|
475
502
|
# @param key [String, Symbol] A join key.
|
|
476
503
|
# @macro join_common_after
|
|
477
504
|
#
|
|
478
|
-
# @overload join(right, keys, type: :inner,
|
|
505
|
+
# @overload join(right, keys, type: :inner, left_suffix: "", right_suffix: "",
|
|
506
|
+
# left_outputs: nil, right_outputs: nil)
|
|
507
|
+
# Join right by keys.
|
|
508
|
+
#
|
|
509
|
+
# Column name can be renamed by appending `left_suffix` or `right_suffix`.
|
|
479
510
|
#
|
|
480
511
|
# @macro join_common_before
|
|
481
512
|
# @param keys [::Array<String, Symbol>] Join keys.
|
|
482
513
|
# @macro join_common_after
|
|
483
514
|
#
|
|
484
515
|
# @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
|
|
516
|
+
# Join right by a key or keys mapped by a hash.
|
|
485
517
|
#
|
|
486
518
|
# @macro join_common_before
|
|
487
519
|
# @param keys [Hash] Specify join keys in `self` and `right` separately.
|
|
@@ -492,7 +524,16 @@ module Arrow
|
|
|
492
524
|
# @macro join_common_after
|
|
493
525
|
#
|
|
494
526
|
# @since 7.0.0
|
|
495
|
-
def join(right,
|
|
527
|
+
def join(right,
|
|
528
|
+
keys=nil,
|
|
529
|
+
type: :inner,
|
|
530
|
+
left_suffix: "",
|
|
531
|
+
right_suffix: "",
|
|
532
|
+
left_outputs: nil,
|
|
533
|
+
right_outputs: nil)
|
|
534
|
+
is_natural_join = keys.nil?
|
|
535
|
+
keys ||= (column_names & right.column_names)
|
|
536
|
+
type = JoinType.try_convert(type) || type
|
|
496
537
|
plan = ExecutePlan.new
|
|
497
538
|
left_node = plan.build_source_node(self)
|
|
498
539
|
right_node = plan.build_source_node(right)
|
|
@@ -508,22 +549,46 @@ module Arrow
|
|
|
508
549
|
hash_join_node_options = HashJoinNodeOptions.new(type,
|
|
509
550
|
left_keys,
|
|
510
551
|
right_keys)
|
|
552
|
+
use_manual_outputs = false
|
|
511
553
|
unless left_outputs.nil?
|
|
512
554
|
hash_join_node_options.left_outputs = left_outputs
|
|
555
|
+
use_manual_outputs = true
|
|
513
556
|
end
|
|
514
557
|
unless right_outputs.nil?
|
|
515
558
|
hash_join_node_options.right_outputs = right_outputs
|
|
559
|
+
use_manual_outputs = true
|
|
516
560
|
end
|
|
517
561
|
hash_join_node = plan.build_hash_join_node(left_node,
|
|
518
562
|
right_node,
|
|
519
563
|
hash_join_node_options)
|
|
564
|
+
type_nick = type.nick
|
|
565
|
+
is_filter_join = (type_nick.end_with?("-semi") or
|
|
566
|
+
type_nick.end_with?("-anti"))
|
|
567
|
+
if use_manual_outputs or is_filter_join
|
|
568
|
+
process_node = hash_join_node
|
|
569
|
+
elsif is_natural_join
|
|
570
|
+
process_node = join_merge_keys(plan, hash_join_node, right, keys)
|
|
571
|
+
elsif keys.is_a?(String) or keys.is_a?(Symbol)
|
|
572
|
+
process_node = join_merge_keys(plan, hash_join_node, right, [keys.to_s])
|
|
573
|
+
elsif !keys.is_a?(Hash) and (left_suffix != "" or right_suffix != "")
|
|
574
|
+
process_node = join_rename_keys(plan,
|
|
575
|
+
hash_join_node,
|
|
576
|
+
right,
|
|
577
|
+
keys,
|
|
578
|
+
left_suffix,
|
|
579
|
+
right_suffix)
|
|
580
|
+
else
|
|
581
|
+
process_node = hash_join_node
|
|
582
|
+
end
|
|
520
583
|
sink_node_options = SinkNodeOptions.new
|
|
521
|
-
plan.build_sink_node(
|
|
584
|
+
plan.build_sink_node(process_node, sink_node_options)
|
|
522
585
|
plan.validate
|
|
523
586
|
plan.start
|
|
524
587
|
plan.wait
|
|
525
|
-
reader = sink_node_options.get_reader(
|
|
526
|
-
reader.read_all
|
|
588
|
+
reader = sink_node_options.get_reader(process_node.output_schema)
|
|
589
|
+
table = reader.read_all
|
|
590
|
+
share_input(table)
|
|
591
|
+
table
|
|
527
592
|
end
|
|
528
593
|
|
|
529
594
|
alias_method :to_s_raw, :to_s
|
|
@@ -593,5 +658,87 @@ module Arrow
|
|
|
593
658
|
raise ArgumentError, message
|
|
594
659
|
end
|
|
595
660
|
end
|
|
661
|
+
|
|
662
|
+
def join_merge_keys(plan, input_node, right, keys)
|
|
663
|
+
expressions = []
|
|
664
|
+
names = []
|
|
665
|
+
normalized_keys = {}
|
|
666
|
+
keys.each do |key|
|
|
667
|
+
normalized_keys[key.to_s] = true
|
|
668
|
+
end
|
|
669
|
+
key_to_outputs = {}
|
|
670
|
+
outputs = []
|
|
671
|
+
left_n_column_names = column_names.size
|
|
672
|
+
column_names.each_with_index do |name, i|
|
|
673
|
+
is_key = normalized_keys.include?(name)
|
|
674
|
+
output = {is_key: is_key, name: name, index: i, direction: :left}
|
|
675
|
+
outputs << output
|
|
676
|
+
key_to_outputs[name] = {left: output} if is_key
|
|
677
|
+
end
|
|
678
|
+
right.column_names.each_with_index do |name, i|
|
|
679
|
+
index = left_n_column_names + i
|
|
680
|
+
is_key = normalized_keys.include?(name)
|
|
681
|
+
output = {is_key: is_key, name: name, index: index, direction: :right}
|
|
682
|
+
outputs << output
|
|
683
|
+
key_to_outputs[name][:right] = output if is_key
|
|
684
|
+
end
|
|
685
|
+
|
|
686
|
+
outputs.each do |output|
|
|
687
|
+
if output[:is_key]
|
|
688
|
+
next if output[:direction] == :right
|
|
689
|
+
left_output = key_to_outputs[output[:name]][:left]
|
|
690
|
+
right_output = key_to_outputs[output[:name]][:right]
|
|
691
|
+
left_field = FieldExpression.new("[#{left_output[:index]}]")
|
|
692
|
+
right_field = FieldExpression.new("[#{right_output[:index]}]")
|
|
693
|
+
is_left_null = CallExpression.new("is_null", [left_field])
|
|
694
|
+
merge_column = CallExpression.new("if_else",
|
|
695
|
+
[
|
|
696
|
+
is_left_null,
|
|
697
|
+
right_field,
|
|
698
|
+
left_field,
|
|
699
|
+
])
|
|
700
|
+
expressions << merge_column
|
|
701
|
+
else
|
|
702
|
+
expressions << FieldExpression.new("[#{output[:index]}]")
|
|
703
|
+
end
|
|
704
|
+
names << output[:name]
|
|
705
|
+
end
|
|
706
|
+
project_node_options = ProjectNodeOptions.new(expressions, names)
|
|
707
|
+
plan.build_project_node(input_node, project_node_options)
|
|
708
|
+
end
|
|
709
|
+
|
|
710
|
+
def join_rename_keys(plan,
|
|
711
|
+
input_node,
|
|
712
|
+
right,
|
|
713
|
+
keys,
|
|
714
|
+
left_suffix,
|
|
715
|
+
right_suffix)
|
|
716
|
+
expressions = []
|
|
717
|
+
names = []
|
|
718
|
+
normalized_keys = {}
|
|
719
|
+
keys.each do |key|
|
|
720
|
+
normalized_keys[key.to_s] = true
|
|
721
|
+
end
|
|
722
|
+
left_n_column_names = column_names.size
|
|
723
|
+
column_names.each_with_index do |name, i|
|
|
724
|
+
expressions << FieldExpression.new("[#{i}]")
|
|
725
|
+
if normalized_keys.include?(name)
|
|
726
|
+
names << "#{name}#{left_suffix}"
|
|
727
|
+
else
|
|
728
|
+
names << name
|
|
729
|
+
end
|
|
730
|
+
end
|
|
731
|
+
right.column_names.each_with_index do |name, i|
|
|
732
|
+
index = left_n_column_names + i
|
|
733
|
+
expressions << FieldExpression.new("[#{index}]")
|
|
734
|
+
if normalized_keys.include?(name)
|
|
735
|
+
names << "#{name}#{right_suffix}"
|
|
736
|
+
else
|
|
737
|
+
names << name
|
|
738
|
+
end
|
|
739
|
+
end
|
|
740
|
+
project_node_options = ProjectNodeOptions.new(expressions, names)
|
|
741
|
+
plan.build_project_node(input_node, project_node_options)
|
|
742
|
+
end
|
|
596
743
|
end
|
|
597
744
|
end
|
data/lib/arrow/tensor.rb
CHANGED
|
@@ -15,10 +15,154 @@
|
|
|
15
15
|
# specific language governing permissions and limitations
|
|
16
16
|
# under the License.
|
|
17
17
|
|
|
18
|
+
require_relative "raw-tensor-converter"
|
|
19
|
+
|
|
18
20
|
module Arrow
|
|
19
21
|
class Tensor
|
|
22
|
+
alias_method :initialize_raw, :initialize
|
|
23
|
+
# Creates a new {Arrow::Tensor}.
|
|
24
|
+
#
|
|
25
|
+
# @overload initialize(raw_tensor, data_type: nil, shape: nil, dimension_names: nil)
|
|
26
|
+
#
|
|
27
|
+
# @param raw_tensor [::Array<Numeric>] The tensor represented as a
|
|
28
|
+
# raw `Array` (not `Arrow::Array`) and `Numeric`s. You can
|
|
29
|
+
# pass a nested `Array` for a multi-dimensional tensor.
|
|
30
|
+
#
|
|
31
|
+
# @param data_type [Arrow::DataType, String, Symbol, ::Array<String>,
|
|
32
|
+
# ::Array<Symbol>, Hash, nil] The element data type of the tensor.
|
|
33
|
+
#
|
|
34
|
+
# If you specify `nil`, data type is guessed from `raw_tensor`.
|
|
35
|
+
#
|
|
36
|
+
# See {Arrow::DataType.resolve} for how to specify data type.
|
|
37
|
+
#
|
|
38
|
+
# @param shape [::Array<Integer>, nil] The array of dimension sizes.
|
|
39
|
+
#
|
|
40
|
+
# If you specify `nil`, shape is guessed from `raw_tensor`.
|
|
41
|
+
#
|
|
42
|
+
# @param dimension_names [::Array<String>, ::Array<Symbol>, nil]
|
|
43
|
+
# The array of the dimension names.
|
|
44
|
+
#
|
|
45
|
+
# If you specify `nil`, all dimensions have empty names.
|
|
46
|
+
#
|
|
47
|
+
# @example Create a tensor from Ruby's Array
|
|
48
|
+
# raw_tensor = [
|
|
49
|
+
# [
|
|
50
|
+
# [1, 2, 3, 4],
|
|
51
|
+
# [5, 6, 7, 8],
|
|
52
|
+
# ],
|
|
53
|
+
# [
|
|
54
|
+
# [9, 10, 11, 12],
|
|
55
|
+
# [13, 14, 15, 16],
|
|
56
|
+
# ],
|
|
57
|
+
# [
|
|
58
|
+
# [17, 18, 19, 20],
|
|
59
|
+
# [21, 22, 23, 24],
|
|
60
|
+
# ],
|
|
61
|
+
# ]
|
|
62
|
+
# Arrow::Tensor.new(raw_tensor)
|
|
63
|
+
#
|
|
64
|
+
# @since 10.0.0
|
|
65
|
+
#
|
|
66
|
+
# @overload initialize(data_type, data, shape, strides, dimension_names)
|
|
67
|
+
#
|
|
68
|
+
# @param data_type [Arrow::DataType, String, Symbol, ::Array<String>,
|
|
69
|
+
# ::Array<Symbol>, Hash] The element data type of the tensor.
|
|
70
|
+
#
|
|
71
|
+
# See {Arrow::DataType.resolve} for how to specify data type.
|
|
72
|
+
#
|
|
73
|
+
# @param data [Arrow::Buffer, String] The data of the tensor.
|
|
74
|
+
#
|
|
75
|
+
# @param shape [::Array<Integer>] The array of dimension sizes.
|
|
76
|
+
#
|
|
77
|
+
# @param strides [::Array<Integer>, nil] The array of strides which
|
|
78
|
+
# is the number of bytes between two adjacent elements in each
|
|
79
|
+
# dimension.
|
|
80
|
+
#
|
|
81
|
+
# If you specify `nil` or an empty `Array`, strides are
|
|
82
|
+
# guessed from `data_type` and `data`.
|
|
83
|
+
#
|
|
84
|
+
# @param dimension_names [::Array<String>, ::Array<Symbol>, nil]
|
|
85
|
+
# The array of the dimension names.
|
|
86
|
+
#
|
|
87
|
+
# If you specify `nil`, all dimensions have empty names.
|
|
88
|
+
#
|
|
89
|
+
# @example Create a tensor from Arrow::Buffer
|
|
90
|
+
# raw_data = [
|
|
91
|
+
# 1, 2,
|
|
92
|
+
# 3, 4,
|
|
93
|
+
#
|
|
94
|
+
# 5, 6,
|
|
95
|
+
# 7, 8,
|
|
96
|
+
#
|
|
97
|
+
# 9, 10,
|
|
98
|
+
# 11, 12,
|
|
99
|
+
# ]
|
|
100
|
+
# data = Arrow::Buffer.new(raw_data.pack("c*").freeze)
|
|
101
|
+
# shape = [3, 2, 2]
|
|
102
|
+
# strides = []
|
|
103
|
+
# names = ["a", "b", "c"]
|
|
104
|
+
# Arrow::Tensor.new(:int8, data, shape, strides, names)
|
|
105
|
+
def initialize(*args,
|
|
106
|
+
data_type: nil,
|
|
107
|
+
data: nil,
|
|
108
|
+
shape: nil,
|
|
109
|
+
strides: nil,
|
|
110
|
+
dimension_names: nil)
|
|
111
|
+
n_args = args.size
|
|
112
|
+
case n_args
|
|
113
|
+
when 1
|
|
114
|
+
converter = RawTensorConverter.new(args[0],
|
|
115
|
+
data_type: data_type,
|
|
116
|
+
shape: shape,
|
|
117
|
+
strides: strides,
|
|
118
|
+
dimension_names: dimension_names)
|
|
119
|
+
data_type = converter.data_type
|
|
120
|
+
data = converter.data
|
|
121
|
+
shape = converter.shape
|
|
122
|
+
strides = converter.strides
|
|
123
|
+
dimension_names = converter.dimension_names
|
|
124
|
+
when 0, 2..5
|
|
125
|
+
data_type = args[0] || data_type
|
|
126
|
+
data = args[1] || data
|
|
127
|
+
shape = args[2] || shape
|
|
128
|
+
strides = args[3] || strides
|
|
129
|
+
dimension_names = args[4] || dimension_names
|
|
130
|
+
if data_type.nil?
|
|
131
|
+
raise ArgumentError, "data_type: is missing: #{data.inspect}"
|
|
132
|
+
end
|
|
133
|
+
else
|
|
134
|
+
message = "wrong number of arguments (given #{n_args}, expected 0..5)"
|
|
135
|
+
raise ArgumentError, message
|
|
136
|
+
end
|
|
137
|
+
initialize_raw(DataType.resolve(data_type),
|
|
138
|
+
data,
|
|
139
|
+
shape,
|
|
140
|
+
strides,
|
|
141
|
+
dimension_names)
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
def dimension_names
|
|
145
|
+
n_dimensions.times.collect do |i|
|
|
146
|
+
get_dimension_name(i)
|
|
147
|
+
end
|
|
148
|
+
end
|
|
149
|
+
|
|
20
150
|
def to_arrow
|
|
21
151
|
self
|
|
22
152
|
end
|
|
153
|
+
|
|
154
|
+
def to_arrow_array
|
|
155
|
+
if n_dimensions != 1
|
|
156
|
+
raise RangeError, "must be 1 dimensional tensor: #{shape.inspect}"
|
|
157
|
+
end
|
|
158
|
+
value_data_type.array_class.new(size,
|
|
159
|
+
buffer,
|
|
160
|
+
nil,
|
|
161
|
+
0)
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
def to_arrow_chunked_array
|
|
165
|
+
ChunkedArray.new([to_arrow_array])
|
|
166
|
+
end
|
|
23
167
|
end
|
|
24
168
|
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
module Arrow
|
|
19
|
+
class TimeUnit
|
|
20
|
+
class << self
|
|
21
|
+
# @api private
|
|
22
|
+
def try_convert(value)
|
|
23
|
+
if value.is_a?(Hash) and value.size == 1 and value[:unit]
|
|
24
|
+
super(value[:unit])
|
|
25
|
+
else
|
|
26
|
+
super
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -18,24 +18,12 @@
|
|
|
18
18
|
module Arrow
|
|
19
19
|
class Time32ArrayBuilder
|
|
20
20
|
class << self
|
|
21
|
-
def build(
|
|
22
|
-
builder = new(
|
|
21
|
+
def build(data_type, values)
|
|
22
|
+
builder = new(data_type)
|
|
23
23
|
builder.build(values)
|
|
24
24
|
end
|
|
25
25
|
end
|
|
26
26
|
|
|
27
|
-
alias_method :initialize_raw, :initialize
|
|
28
|
-
def initialize(unit_or_data_type)
|
|
29
|
-
case unit_or_data_type
|
|
30
|
-
when DataType
|
|
31
|
-
data_type = unit_or_data_type
|
|
32
|
-
else
|
|
33
|
-
unit = unit_or_data_type
|
|
34
|
-
data_type = Time32DataType.new(unit)
|
|
35
|
-
end
|
|
36
|
-
initialize_raw(data_type)
|
|
37
|
-
end
|
|
38
|
-
|
|
39
27
|
def unit
|
|
40
28
|
@unit ||= value_data_type.unit
|
|
41
29
|
end
|
|
@@ -17,45 +17,16 @@
|
|
|
17
17
|
|
|
18
18
|
module Arrow
|
|
19
19
|
class Time32DataType
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
#
|
|
30
|
-
# The unit must be second or millisecond.
|
|
31
|
-
#
|
|
32
|
-
# @example Create a time32 data type with Arrow::TimeUnit
|
|
33
|
-
# Arrow::Time32DataType.new(Arrow::TimeUnit::MILLI)
|
|
34
|
-
#
|
|
35
|
-
# @example Create a time32 data type with Symbol
|
|
36
|
-
# Arrow::Time32DataType.new(:milli)
|
|
37
|
-
#
|
|
38
|
-
# @overload initialize(description)
|
|
39
|
-
#
|
|
40
|
-
# @param description [Hash] The description of the time32 data
|
|
41
|
-
# type. It must have `:unit` value.
|
|
42
|
-
#
|
|
43
|
-
# @option description [Arrow::TimeUnit, Symbol] :unit The unit of
|
|
44
|
-
# the time32 data type.
|
|
45
|
-
#
|
|
46
|
-
# The unit must be second or millisecond.
|
|
47
|
-
#
|
|
48
|
-
# @example Create a time32 data type with Arrow::TimeUnit
|
|
49
|
-
# Arrow::Time32DataType.new(unit: Arrow::TimeUnit::MILLI)
|
|
50
|
-
#
|
|
51
|
-
# @example Create a time32 data type with Symbol
|
|
52
|
-
# Arrow::Time32DataType.new(unit: :milli)
|
|
53
|
-
def initialize(unit)
|
|
54
|
-
if unit.is_a?(Hash)
|
|
55
|
-
description = unit
|
|
56
|
-
unit = description[:unit]
|
|
20
|
+
class << self
|
|
21
|
+
# @api private
|
|
22
|
+
def try_convert(value)
|
|
23
|
+
case value
|
|
24
|
+
when Symbol, Arrow::TimeUnit
|
|
25
|
+
new(value)
|
|
26
|
+
else
|
|
27
|
+
super
|
|
28
|
+
end
|
|
57
29
|
end
|
|
58
|
-
initialize_raw(unit)
|
|
59
30
|
end
|
|
60
31
|
end
|
|
61
32
|
end
|
|
@@ -18,24 +18,12 @@
|
|
|
18
18
|
module Arrow
|
|
19
19
|
class Time64ArrayBuilder
|
|
20
20
|
class << self
|
|
21
|
-
def build(
|
|
22
|
-
builder = new(
|
|
21
|
+
def build(data_type, values)
|
|
22
|
+
builder = new(data_type)
|
|
23
23
|
builder.build(values)
|
|
24
24
|
end
|
|
25
25
|
end
|
|
26
26
|
|
|
27
|
-
alias_method :initialize_raw, :initialize
|
|
28
|
-
def initialize(unit_or_data_type)
|
|
29
|
-
case unit_or_data_type
|
|
30
|
-
when DataType
|
|
31
|
-
data_type = unit_or_data_type
|
|
32
|
-
else
|
|
33
|
-
unit = unit_or_data_type
|
|
34
|
-
data_type = Time64DataType.new(unit)
|
|
35
|
-
end
|
|
36
|
-
initialize_raw(data_type)
|
|
37
|
-
end
|
|
38
|
-
|
|
39
27
|
def unit
|
|
40
28
|
@unit ||= value_data_type.unit
|
|
41
29
|
end
|
|
@@ -17,45 +17,16 @@
|
|
|
17
17
|
|
|
18
18
|
module Arrow
|
|
19
19
|
class Time64DataType
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
#
|
|
30
|
-
# The unit must be microsecond or nanosecond.
|
|
31
|
-
#
|
|
32
|
-
# @example Create a time64 data type with Arrow::TimeUnit
|
|
33
|
-
# Arrow::Time64DataType.new(Arrow::TimeUnit::NANO)
|
|
34
|
-
#
|
|
35
|
-
# @example Create a time64 data type with Symbol
|
|
36
|
-
# Arrow::Time64DataType.new(:nano)
|
|
37
|
-
#
|
|
38
|
-
# @overload initialize(description)
|
|
39
|
-
#
|
|
40
|
-
# @param description [Hash] The description of the time64 data
|
|
41
|
-
# type. It must have `:unit` value.
|
|
42
|
-
#
|
|
43
|
-
# @option description [Arrow::TimeUnit, Symbol] :unit The unit of
|
|
44
|
-
# the time64 data type.
|
|
45
|
-
#
|
|
46
|
-
# The unit must be microsecond or nanosecond.
|
|
47
|
-
#
|
|
48
|
-
# @example Create a time64 data type with Arrow::TimeUnit
|
|
49
|
-
# Arrow::Time64DataType.new(unit: Arrow::TimeUnit::NANO)
|
|
50
|
-
#
|
|
51
|
-
# @example Create a time64 data type with Symbol
|
|
52
|
-
# Arrow::Time64DataType.new(unit: :nano)
|
|
53
|
-
def initialize(unit)
|
|
54
|
-
if unit.is_a?(Hash)
|
|
55
|
-
description = unit
|
|
56
|
-
unit = description[:unit]
|
|
20
|
+
class << self
|
|
21
|
+
# @api private
|
|
22
|
+
def try_convert(value)
|
|
23
|
+
case value
|
|
24
|
+
when Symbol, Arrow::TimeUnit
|
|
25
|
+
new(value)
|
|
26
|
+
else
|
|
27
|
+
super
|
|
28
|
+
end
|
|
57
29
|
end
|
|
58
|
-
initialize_raw(unit)
|
|
59
30
|
end
|
|
60
31
|
end
|
|
61
32
|
end
|