red-arrow 6.0.0 → 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +4 -4
  2. data/Gemfile +10 -0
  3. data/ext/arrow/arrow.cpp +12 -0
  4. data/ext/arrow/converters.hpp +46 -10
  5. data/ext/arrow/extconf.rb +1 -1
  6. data/ext/arrow/raw-records.cpp +3 -2
  7. data/ext/arrow/red-arrow.hpp +7 -0
  8. data/ext/arrow/values.cpp +3 -2
  9. data/lib/arrow/datum.rb +2 -0
  10. data/lib/arrow/day-time-interval-array-builder.rb +29 -0
  11. data/lib/arrow/function.rb +52 -0
  12. data/lib/arrow/loader.rb +16 -0
  13. data/lib/arrow/month-day-nano-interval-array-builder.rb +29 -0
  14. data/lib/arrow/s3-global-options.rb +38 -0
  15. data/lib/arrow/sort-key.rb +61 -55
  16. data/lib/arrow/sort-options.rb +8 -8
  17. data/lib/arrow/table-loader.rb +99 -62
  18. data/lib/arrow/table-saver.rb +7 -2
  19. data/lib/arrow/table.rb +78 -0
  20. data/lib/arrow/version.rb +1 -1
  21. data/red-arrow.gemspec +1 -10
  22. data/test/helper.rb +2 -0
  23. data/test/raw-records/test-basic-arrays.rb +30 -0
  24. data/test/raw-records/test-dense-union-array.rb +27 -0
  25. data/test/raw-records/test-list-array.rb +39 -0
  26. data/test/raw-records/test-map-array.rb +37 -0
  27. data/test/raw-records/test-sparse-union-array.rb +27 -0
  28. data/test/raw-records/test-struct-array.rb +30 -0
  29. data/test/test-function.rb +48 -14
  30. data/test/test-table.rb +204 -6
  31. data/test/values/test-basic-arrays.rb +30 -0
  32. data/test/values/test-dense-union-array.rb +27 -0
  33. data/test/values/test-dictionary-array.rb +295 -0
  34. data/test/values/test-list-array.rb +39 -0
  35. data/test/values/test-map-array.rb +33 -0
  36. data/test/values/test-sparse-union-array.rb +27 -0
  37. data/test/values/test-struct-array.rb +30 -0
  38. metadata +88 -194
@@ -15,7 +15,7 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- require "uri"
18
+ require "open-uri"
19
19
 
20
20
  module Arrow
21
21
  class TableLoader
@@ -34,30 +34,47 @@ module Arrow
34
34
 
35
35
  def load
36
36
  if @input.is_a?(URI)
37
- custom_load_method = "load_from_uri"
37
+ custom_load_method_candidates = []
38
+ if @input.scheme
39
+ custom_load_method_candidates << "load_from_uri_#{@input.scheme}"
40
+ end
41
+ custom_load_method_candidates << "load_from_uri"
38
42
  elsif @input.is_a?(String) and ::File.directory?(@input)
39
- custom_load_method = "load_from_directory"
43
+ custom_load_method_candidates = ["load_from_directory"]
40
44
  else
41
- custom_load_method = "load_from_file"
45
+ custom_load_method_candidates = ["load_from_file"]
42
46
  end
43
- unless respond_to?(custom_load_method, true)
44
- available_schemes = []
45
- (methods(true) | private_methods(true)).each do |name|
46
- match_data = /\Aload_from_/.match(name.to_s)
47
- if match_data
48
- available_schemes << match_data.post_match
49
- end
47
+ custom_load_method_candidates.each do |custom_load_method|
48
+ next unless respond_to?(custom_load_method, true)
49
+ return __send__(custom_load_method)
50
+ end
51
+ available_schemes = []
52
+ (methods(true) | private_methods(true)).each do |name|
53
+ match_data = /\Aload_from_/.match(name.to_s)
54
+ if match_data
55
+ available_schemes << match_data.post_match
50
56
  end
51
- message = "Arrow::Table load source must be one of ["
52
- message << available_schemes.join(", ")
53
- message << "]: #{@input.inspect}"
54
- raise ArgumentError, message
55
57
  end
56
- __send__(custom_load_method)
58
+ message = "Arrow::Table load source must be one of ["
59
+ message << available_schemes.join(", ")
60
+ message << "]: #{@input.inspect}"
61
+ raise ArgumentError, message
57
62
  end
58
63
 
59
64
  private
65
+ def load_from_uri_http
66
+ load_by_reader
67
+ end
68
+
69
+ def load_from_uri_https
70
+ load_by_reader
71
+ end
72
+
60
73
  def load_from_file
74
+ load_by_reader
75
+ end
76
+
77
+ def load_by_reader
61
78
  format = @options[:format]
62
79
  custom_load_method = "load_as_#{format}"
63
80
  unless respond_to?(custom_load_method, true)
@@ -111,10 +128,29 @@ module Arrow
111
128
  end
112
129
 
113
130
  def open_input_stream
114
- if @input.is_a?(Buffer)
115
- BufferInputStream.new(@input)
131
+ case @input
132
+ when Buffer
133
+ yield(BufferInputStream.new(@input))
134
+ when URI
135
+ @input.open do |ruby_input|
136
+ case @options[:format]
137
+ when :stream, :arrow_streaming
138
+ Gio::RubyInputStream.open(ruby_input) do |gio_input|
139
+ GIOInputStream.open(gio_input) do |input|
140
+ yield(input)
141
+ end
142
+ end
143
+ else
144
+ # TODO: We need to consider Ruby's GVL carefully to use
145
+ # Ruby object directly for input with other formats. We
146
+ # read data and use it as Buffer for now.
147
+ data = GLib::Bytes.new(ruby_input.read.freeze)
148
+ buffer = Buffer.new(data)
149
+ yield(BufferInputStream.new(buffer))
150
+ end
151
+ end
116
152
  else
117
- MemoryMappedInputStream.new(@input)
153
+ yield(MemoryMappedInputStream.new(@input))
118
154
  end
119
155
  end
120
156
 
@@ -130,32 +166,19 @@ module Arrow
130
166
  end
131
167
 
132
168
  def load_as_arrow
133
- input = nil
134
- reader = nil
135
- error = nil
136
- reader_class_candidates = [
137
- RecordBatchFileReader,
138
- RecordBatchStreamReader,
139
- ]
140
- reader_class_candidates.each do |reader_class_candidate|
141
- input = open_input_stream
142
- begin
143
- reader = reader_class_candidate.new(input)
144
- rescue Arrow::Error
145
- error = $!
146
- else
147
- break
148
- end
169
+ begin
170
+ load_as_arrow_file
171
+ rescue
172
+ load_as_arrows
149
173
  end
150
- raise error if reader.nil?
151
- load_raw(input, reader)
152
174
  end
153
175
 
154
176
  # @since 1.0.0
155
177
  def load_as_arrow_file
156
- input = open_input_stream
157
- reader = RecordBatchFileReader.new(input)
158
- load_raw(input, reader)
178
+ open_input_stream do |input|
179
+ reader = RecordBatchFileReader.new(input)
180
+ load_raw(input, reader)
181
+ end
159
182
  end
160
183
 
161
184
  # @deprecated Use `format: :arrow_file` instead.
@@ -163,34 +186,46 @@ module Arrow
163
186
  load_as_arrow_file
164
187
  end
165
188
 
189
+ # @since 7.0.0
190
+ def load_as_arrows
191
+ open_input_stream do |input|
192
+ reader = RecordBatchStreamReader.new(input)
193
+ load_raw(input, reader)
194
+ end
195
+ end
196
+
166
197
  # @since 1.0.0
167
198
  def load_as_arrow_streaming
168
- input = open_input_stream
169
- reader = RecordBatchStreamReader.new(input)
170
- load_raw(input, reader)
199
+ load_as_arrows
171
200
  end
172
201
 
173
202
  # @deprecated Use `format: :arrow_streaming` instead.
174
203
  def load_as_stream
175
- load_as_arrow_streaming
204
+ load_as_arrows
176
205
  end
177
206
 
178
207
  if Arrow.const_defined?(:ORCFileReader)
179
208
  def load_as_orc
180
- input = open_input_stream
181
- reader = ORCFileReader.new(input)
182
- field_indexes = @options[:field_indexes]
183
- reader.set_field_indexes(field_indexes) if field_indexes
184
- table = reader.read_stripes
185
- table.instance_variable_set(:@input, input)
186
- table
209
+ open_input_stream do |input|
210
+ reader = ORCFileReader.new(input)
211
+ field_indexes = @options[:field_indexes]
212
+ reader.set_field_indexes(field_indexes) if field_indexes
213
+ table = reader.read_stripes
214
+ table.instance_variable_set(:@input, input)
215
+ table
216
+ end
187
217
  end
188
218
  end
189
219
 
190
220
  def csv_load(options)
191
221
  options.delete(:format)
192
- if @input.is_a?(Buffer)
222
+ case @input
223
+ when Buffer
193
224
  CSVLoader.load(@input.data.to_s, **options)
225
+ when URI
226
+ @input.open do |input|
227
+ CSVLoader.load(input.read, **options)
228
+ end
194
229
  else
195
230
  CSVLoader.load(Pathname.new(@input), **options)
196
231
  end
@@ -207,19 +242,21 @@ module Arrow
207
242
  end
208
243
 
209
244
  def load_as_feather
210
- input = open_input_stream
211
- reader = FeatherFileReader.new(input)
212
- table = reader.read
213
- table.instance_variable_set(:@input, input)
214
- table
245
+ open_input_stream do |input|
246
+ reader = FeatherFileReader.new(input)
247
+ table = reader.read
248
+ table.instance_variable_set(:@input, input)
249
+ table
250
+ end
215
251
  end
216
252
 
217
253
  def load_as_json
218
- input = open_input_stream
219
- reader = JSONReader.new(input)
220
- table = reader.read
221
- table.instance_variable_set(:@input, input)
222
- table
254
+ open_input_stream do |input|
255
+ reader = JSONReader.new(input)
256
+ table = reader.read
257
+ table.instance_variable_set(:@input, input)
258
+ table
259
+ end
223
260
  end
224
261
  end
225
262
  end
@@ -151,14 +151,19 @@ module Arrow
151
151
  save_as_arrow_file
152
152
  end
153
153
 
154
+ # @since 7.0.0
155
+ def save_as_arrows
156
+ save_raw(RecordBatchStreamWriter)
157
+ end
158
+
154
159
  # @since 1.0.0
155
160
  def save_as_arrow_streaming
156
- save_raw(RecordBatchStreamWriter)
161
+ save_as_arrows
157
162
  end
158
163
 
159
164
  # @deprecated Use `format: :arrow_streaming` instead.
160
165
  def save_as_stream
161
- save_as_arrow_streaming
166
+ save_as_arrows
162
167
  end
163
168
 
164
169
  def csv_save(**options)
data/lib/arrow/table.rb CHANGED
@@ -448,6 +448,84 @@ module Arrow
448
448
  self.class.new(schema, packed_arrays)
449
449
  end
450
450
 
451
+ # @overload join(right, key, type: :inner, left_outputs: nil, right_outputs: nil)
452
+ # @!macro join_common_before
453
+ # @param right [Arrow::Table] The right table.
454
+ #
455
+ # Join columns with `right` on join key columns.
456
+ #
457
+ # @!macro join_common_after
458
+ # @param type [Arrow::JoinType] How to join.
459
+ # @param left_outputs [::Array<String, Symbol>] Output columns in
460
+ # `self`.
461
+ #
462
+ # If both of `left_outputs` and `right_outputs` aren't
463
+ # specified, all columns in `self` and `right` are
464
+ # outputted.
465
+ # @param right_outputs [::Array<String, Symbol>] Output columns in
466
+ # `right`.
467
+ #
468
+ # If both of `left_outputs` and `right_outputs` aren't
469
+ # specified, all columns in `self` and `right` are
470
+ # outputted.
471
+ # @return [Arrow::Table]
472
+ # The joined `Arrow::Table`.
473
+ #
474
+ # @macro join_common_before
475
+ # @param key [String, Symbol] A join key.
476
+ # @macro join_common_after
477
+ #
478
+ # @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
479
+ #
480
+ # @macro join_common_before
481
+ # @param keys [::Array<String, Symbol>] Join keys.
482
+ # @macro join_common_after
483
+ #
484
+ # @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
485
+ #
486
+ # @macro join_common_before
487
+ # @param keys [Hash] Specify join keys in `self` and `right` separately.
488
+ # @option keys [String, Symbol, ::Array<String, Symbol>] :left
489
+ # Join keys in `self`.
490
+ # @option keys [String, Symbol, ::Array<String, Symbol>] :right
491
+ # Join keys in `right`.
492
+ # @macro join_common_after
493
+ #
494
+ # @since 7.0.0
495
+ def join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
496
+ plan = ExecutePlan.new
497
+ left_node = plan.build_source_node(self)
498
+ right_node = plan.build_source_node(right)
499
+ if keys.is_a?(Hash)
500
+ left_keys = keys[:left]
501
+ right_keys = keys[:right]
502
+ else
503
+ left_keys = keys
504
+ right_keys = keys
505
+ end
506
+ left_keys = Array(left_keys)
507
+ right_keys = Array(right_keys)
508
+ hash_join_node_options = HashJoinNodeOptions.new(type,
509
+ left_keys,
510
+ right_keys)
511
+ unless left_outputs.nil?
512
+ hash_join_node_options.left_outputs = left_outputs
513
+ end
514
+ unless right_outputs.nil?
515
+ hash_join_node_options.right_outputs = right_outputs
516
+ end
517
+ hash_join_node = plan.build_hash_join_node(left_node,
518
+ right_node,
519
+ hash_join_node_options)
520
+ sink_node_options = SinkNodeOptions.new
521
+ plan.build_sink_node(hash_join_node, sink_node_options)
522
+ plan.validate
523
+ plan.start
524
+ plan.wait
525
+ reader = sink_node_options.get_reader(hash_join_node.output_schema)
526
+ reader.read_all
527
+ end
528
+
451
529
  alias_method :to_s_raw, :to_s
452
530
  def to_s(options={})
453
531
  format = options[:format]
data/lib/arrow/version.rb CHANGED
@@ -16,7 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  module Arrow
19
- VERSION = "6.0.0"
19
+ VERSION = "8.0.0"
20
20
 
21
21
  module Version
22
22
  numbers, TAG = VERSION.split("-")
data/red-arrow.gemspec CHANGED
@@ -48,19 +48,10 @@ Gem::Specification.new do |spec|
48
48
 
49
49
  spec.add_runtime_dependency("bigdecimal", ">= 2.0.3")
50
50
  spec.add_runtime_dependency("extpp", ">= 0.0.7")
51
- spec.add_runtime_dependency("gio2", ">= 3.4.9")
51
+ spec.add_runtime_dependency("gio2", ">= 3.5.0")
52
52
  spec.add_runtime_dependency("native-package-installer")
53
53
  spec.add_runtime_dependency("pkg-config")
54
54
 
55
- spec.add_development_dependency("benchmark-driver")
56
- spec.add_development_dependency("bundler")
57
- spec.add_development_dependency("faker")
58
- spec.add_development_dependency("fiddle", ">= 1.0.9")
59
- spec.add_development_dependency("rake")
60
- spec.add_development_dependency("redcarpet")
61
- spec.add_development_dependency("test-unit")
62
- spec.add_development_dependency("yard")
63
-
64
55
  required_msys2_package_version = version_components[0, 3].join(".")
65
56
  spec.metadata["msys2_mingw_dependencies"] =
66
57
  "arrow>=#{required_msys2_package_version}"
data/test/helper.rb CHANGED
@@ -20,6 +20,8 @@ require "arrow"
20
20
  require "fiddle"
21
21
  require "pathname"
22
22
  require "tempfile"
23
+ require "timeout"
24
+ require "webrick"
23
25
  require "zlib"
24
26
 
25
27
  require "test-unit"
@@ -346,6 +346,36 @@ module RawRecordsBasicArraysTests
346
346
  records)
347
347
  assert_equal(records, target.raw_records)
348
348
  end
349
+
350
+ def test_month_interval
351
+ records = [
352
+ [1],
353
+ [nil],
354
+ [12],
355
+ ]
356
+ target = build({column: :month_interval}, records)
357
+ assert_equal(records, target.raw_records)
358
+ end
359
+
360
+ def test_day_time_interval
361
+ records = [
362
+ [{day: 1, millisecond: 100}],
363
+ [nil],
364
+ [{day: 2, millisecond: 300}],
365
+ ]
366
+ target = build({column: :day_time_interval}, records)
367
+ assert_equal(records, target.raw_records)
368
+ end
369
+
370
+ def test_month_day_nano_interval
371
+ records = [
372
+ [{month: 1, day: 1, nanosecond: 100}],
373
+ [nil],
374
+ [{month: 2, day: 3, nanosecond: 400}],
375
+ ]
376
+ target = build({column: :month_day_nano_interval}, records)
377
+ assert_equal(records, target.raw_records)
378
+ end
349
379
  end
350
380
 
351
381
  class RawRecordsRecordBatchBasicArraysTest < Test::Unit::TestCase
@@ -359,6 +359,33 @@ module RawRecordsDenseUnionArrayTests
359
359
  assert_equal(records, target.raw_records)
360
360
  end
361
361
 
362
+ def test_month_interval
363
+ records = [
364
+ [{"0" => 1}],
365
+ [{"1" => nil}],
366
+ ]
367
+ target = build(:month_interval, records)
368
+ assert_equal(records, target.raw_records)
369
+ end
370
+
371
+ def test_day_time_interval
372
+ records = [
373
+ [{"0" => {day: 1, millisecond: 100}}],
374
+ [{"1" => nil}],
375
+ ]
376
+ target = build(:day_time_interval, records)
377
+ assert_equal(records, target.raw_records)
378
+ end
379
+
380
+ def test_month_day_nano_interval
381
+ records = [
382
+ [{"0" => {month: 1, day: 1, nanosecond: 100}}],
383
+ [{"1" => nil}],
384
+ ]
385
+ target = build(:month_day_nano_interval, records)
386
+ assert_equal(records, target.raw_records)
387
+ end
388
+
362
389
  def test_list
363
390
  records = [
364
391
  [{"0" => [true, nil, false]}],
@@ -399,6 +399,45 @@ module RawRecordsListArrayTests
399
399
  assert_equal(records, target.raw_records)
400
400
  end
401
401
 
402
+ def test_month_interval
403
+ records = [
404
+ [[1, nil, 12]],
405
+ [nil],
406
+ ]
407
+ target = build(:month_interval, records)
408
+ assert_equal(records, target.raw_records)
409
+ end
410
+
411
+ def test_day_time_interval
412
+ records = [
413
+ [
414
+ [
415
+ {day: 1, millisecond: 100},
416
+ nil,
417
+ {day: 2, millisecond: 300},
418
+ ]
419
+ ],
420
+ [nil],
421
+ ]
422
+ target = build(:day_time_interval, records)
423
+ assert_equal(records, target.raw_records)
424
+ end
425
+
426
+ def test_month_day_nano_interval
427
+ records = [
428
+ [
429
+ [
430
+ {month: 1, day: 1, nanosecond: 100},
431
+ nil,
432
+ {month: 2, day: 3, nanosecond: 400},
433
+ ]
434
+ ],
435
+ [nil],
436
+ ]
437
+ target = build(:month_day_nano_interval, records)
438
+ assert_equal(records, target.raw_records)
439
+ end
440
+
402
441
  def test_list
403
442
  records = [
404
443
  [
@@ -310,6 +310,43 @@ module RawRecordsMapArrayTests
310
310
  assert_equal(records, target.raw_records)
311
311
  end
312
312
 
313
+ def test_month_interval
314
+ records = [
315
+ [{"key1" => 1, "key2" => nil}],
316
+ [nil],
317
+ ]
318
+ target = build(:month_interval, records)
319
+ assert_equal(records, target.raw_records)
320
+ end
321
+
322
+ def test_day_time_interval
323
+ records = [
324
+ [
325
+ {
326
+ "key1" => {day: 1, millisecond: 100},
327
+ "key2" => nil,
328
+ },
329
+ ],
330
+ [nil],
331
+ ]
332
+ target = build(:day_time_interval, records)
333
+ assert_equal(records, target.raw_records)
334
+ end
335
+
336
+ def test_month_day_nano_interval
337
+ records = [
338
+ [
339
+ {
340
+ "key1" => {month: 1, day: 1, nanosecond: 100},
341
+ "key2" => nil,
342
+ },
343
+ ],
344
+ [nil],
345
+ ]
346
+ target = build(:month_day_nano_interval, records)
347
+ assert_equal(records, target.raw_records)
348
+ end
349
+
313
350
  def test_list
314
351
  records = [
315
352
  [{"key1" => [true, nil, false], "key2" => nil}],
@@ -349,6 +349,33 @@ module RawRecordsSparseUnionArrayTests
349
349
  assert_equal(records, target.raw_records)
350
350
  end
351
351
 
352
+ def test_month_interval
353
+ records = [
354
+ [{"0" => 1}],
355
+ [{"1" => nil}],
356
+ ]
357
+ target = build(:month_interval, records)
358
+ assert_equal(records, target.raw_records)
359
+ end
360
+
361
+ def test_day_time_interval
362
+ records = [
363
+ [{"0" => {day: 1, millisecond: 100}}],
364
+ [{"1" => nil}],
365
+ ]
366
+ target = build(:day_time_interval, records)
367
+ assert_equal(records, target.raw_records)
368
+ end
369
+
370
+ def test_month_day_nano_interval
371
+ records = [
372
+ [{"0" => {month: 1, day: 1, nanosecond: 100}}],
373
+ [{"1" => nil}],
374
+ ]
375
+ target = build(:month_day_nano_interval, records)
376
+ assert_equal(records, target.raw_records)
377
+ end
378
+
352
379
  def test_list
353
380
  records = [
354
381
  [{"0" => [true, nil, false]}],
@@ -344,6 +344,36 @@ module RawRecordsStructArrayTests
344
344
  assert_equal(records, target.raw_records)
345
345
  end
346
346
 
347
+ def test_month_interval
348
+ records = [
349
+ [{"field" => 1}],
350
+ [nil],
351
+ [{"field" => nil}],
352
+ ]
353
+ target = build(:month_interval, records)
354
+ assert_equal(records, target.raw_records)
355
+ end
356
+
357
+ def test_day_time_interval
358
+ records = [
359
+ [{"field" => {day: 1, millisecond: 100}}],
360
+ [nil],
361
+ [{"field" => nil}],
362
+ ]
363
+ target = build(:day_time_interval, records)
364
+ assert_equal(records, target.raw_records)
365
+ end
366
+
367
+ def test_month_day_nano_interval
368
+ records = [
369
+ [{"field" => {month: 1, day: 1, nanosecond: 100}}],
370
+ [nil],
371
+ [{"field" => nil}],
372
+ ]
373
+ target = build(:month_day_nano_interval, records)
374
+ assert_equal(records, target.raw_records)
375
+ end
376
+
347
377
  def test_list
348
378
  records = [
349
379
  [{"field" => [true, nil, false]}],