red-arrow 6.0.0 → 8.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38)
  1. checksums.yaml +4 -4
  2. data/Gemfile +10 -0
  3. data/ext/arrow/arrow.cpp +12 -0
  4. data/ext/arrow/converters.hpp +46 -10
  5. data/ext/arrow/extconf.rb +1 -1
  6. data/ext/arrow/raw-records.cpp +3 -2
  7. data/ext/arrow/red-arrow.hpp +7 -0
  8. data/ext/arrow/values.cpp +3 -2
  9. data/lib/arrow/datum.rb +2 -0
  10. data/lib/arrow/day-time-interval-array-builder.rb +29 -0
  11. data/lib/arrow/function.rb +52 -0
  12. data/lib/arrow/loader.rb +16 -0
  13. data/lib/arrow/month-day-nano-interval-array-builder.rb +29 -0
  14. data/lib/arrow/s3-global-options.rb +38 -0
  15. data/lib/arrow/sort-key.rb +61 -55
  16. data/lib/arrow/sort-options.rb +8 -8
  17. data/lib/arrow/table-loader.rb +99 -62
  18. data/lib/arrow/table-saver.rb +7 -2
  19. data/lib/arrow/table.rb +78 -0
  20. data/lib/arrow/version.rb +1 -1
  21. data/red-arrow.gemspec +1 -10
  22. data/test/helper.rb +2 -0
  23. data/test/raw-records/test-basic-arrays.rb +30 -0
  24. data/test/raw-records/test-dense-union-array.rb +27 -0
  25. data/test/raw-records/test-list-array.rb +39 -0
  26. data/test/raw-records/test-map-array.rb +37 -0
  27. data/test/raw-records/test-sparse-union-array.rb +27 -0
  28. data/test/raw-records/test-struct-array.rb +30 -0
  29. data/test/test-function.rb +48 -14
  30. data/test/test-table.rb +204 -6
  31. data/test/values/test-basic-arrays.rb +30 -0
  32. data/test/values/test-dense-union-array.rb +27 -0
  33. data/test/values/test-dictionary-array.rb +295 -0
  34. data/test/values/test-list-array.rb +39 -0
  35. data/test/values/test-map-array.rb +33 -0
  36. data/test/values/test-sparse-union-array.rb +27 -0
  37. data/test/values/test-struct-array.rb +30 -0
  38. metadata +88 -194
@@ -15,7 +15,7 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- require "uri"
18
+ require "open-uri"
19
19
 
20
20
  module Arrow
21
21
  class TableLoader
@@ -34,30 +34,47 @@ module Arrow
34
34
 
35
35
  def load
36
36
  if @input.is_a?(URI)
37
- custom_load_method = "load_from_uri"
37
+ custom_load_method_candidates = []
38
+ if @input.scheme
39
+ custom_load_method_candidates << "load_from_uri_#{@input.scheme}"
40
+ end
41
+ custom_load_method_candidates << "load_from_uri"
38
42
  elsif @input.is_a?(String) and ::File.directory?(@input)
39
- custom_load_method = "load_from_directory"
43
+ custom_load_method_candidates = ["load_from_directory"]
40
44
  else
41
- custom_load_method = "load_from_file"
45
+ custom_load_method_candidates = ["load_from_file"]
42
46
  end
43
- unless respond_to?(custom_load_method, true)
44
- available_schemes = []
45
- (methods(true) | private_methods(true)).each do |name|
46
- match_data = /\Aload_from_/.match(name.to_s)
47
- if match_data
48
- available_schemes << match_data.post_match
49
- end
47
+ custom_load_method_candidates.each do |custom_load_method|
48
+ next unless respond_to?(custom_load_method, true)
49
+ return __send__(custom_load_method)
50
+ end
51
+ available_schemes = []
52
+ (methods(true) | private_methods(true)).each do |name|
53
+ match_data = /\Aload_from_/.match(name.to_s)
54
+ if match_data
55
+ available_schemes << match_data.post_match
50
56
  end
51
- message = "Arrow::Table load source must be one of ["
52
- message << available_schemes.join(", ")
53
- message << "]: #{@input.inspect}"
54
- raise ArgumentError, message
55
57
  end
56
- __send__(custom_load_method)
58
+ message = "Arrow::Table load source must be one of ["
59
+ message << available_schemes.join(", ")
60
+ message << "]: #{@input.inspect}"
61
+ raise ArgumentError, message
57
62
  end
58
63
 
59
64
  private
65
+ def load_from_uri_http
66
+ load_by_reader
67
+ end
68
+
69
+ def load_from_uri_https
70
+ load_by_reader
71
+ end
72
+
60
73
  def load_from_file
74
+ load_by_reader
75
+ end
76
+
77
+ def load_by_reader
61
78
  format = @options[:format]
62
79
  custom_load_method = "load_as_#{format}"
63
80
  unless respond_to?(custom_load_method, true)
@@ -111,10 +128,29 @@ module Arrow
111
128
  end
112
129
 
113
130
  def open_input_stream
114
- if @input.is_a?(Buffer)
115
- BufferInputStream.new(@input)
131
+ case @input
132
+ when Buffer
133
+ yield(BufferInputStream.new(@input))
134
+ when URI
135
+ @input.open do |ruby_input|
136
+ case @options[:format]
137
+ when :stream, :arrow_streaming
138
+ Gio::RubyInputStream.open(ruby_input) do |gio_input|
139
+ GIOInputStream.open(gio_input) do |input|
140
+ yield(input)
141
+ end
142
+ end
143
+ else
144
+ # TODO: We need to consider Ruby's GVL carefully to use
145
+ # Ruby object directly for input with other formats. We
146
+ # read data and use it as Buffer for now.
147
+ data = GLib::Bytes.new(ruby_input.read.freeze)
148
+ buffer = Buffer.new(data)
149
+ yield(BufferInputStream.new(buffer))
150
+ end
151
+ end
116
152
  else
117
- MemoryMappedInputStream.new(@input)
153
+ yield(MemoryMappedInputStream.new(@input))
118
154
  end
119
155
  end
120
156
 
@@ -130,32 +166,19 @@ module Arrow
130
166
  end
131
167
 
132
168
  def load_as_arrow
133
- input = nil
134
- reader = nil
135
- error = nil
136
- reader_class_candidates = [
137
- RecordBatchFileReader,
138
- RecordBatchStreamReader,
139
- ]
140
- reader_class_candidates.each do |reader_class_candidate|
141
- input = open_input_stream
142
- begin
143
- reader = reader_class_candidate.new(input)
144
- rescue Arrow::Error
145
- error = $!
146
- else
147
- break
148
- end
169
+ begin
170
+ load_as_arrow_file
171
+ rescue
172
+ load_as_arrows
149
173
  end
150
- raise error if reader.nil?
151
- load_raw(input, reader)
152
174
  end
153
175
 
154
176
  # @since 1.0.0
155
177
  def load_as_arrow_file
156
- input = open_input_stream
157
- reader = RecordBatchFileReader.new(input)
158
- load_raw(input, reader)
178
+ open_input_stream do |input|
179
+ reader = RecordBatchFileReader.new(input)
180
+ load_raw(input, reader)
181
+ end
159
182
  end
160
183
 
161
184
  # @deprecated Use `format: :arrow_file` instead.
@@ -163,34 +186,46 @@ module Arrow
163
186
  load_as_arrow_file
164
187
  end
165
188
 
189
+ # @since 7.0.0
190
+ def load_as_arrows
191
+ open_input_stream do |input|
192
+ reader = RecordBatchStreamReader.new(input)
193
+ load_raw(input, reader)
194
+ end
195
+ end
196
+
166
197
  # @since 1.0.0
167
198
  def load_as_arrow_streaming
168
- input = open_input_stream
169
- reader = RecordBatchStreamReader.new(input)
170
- load_raw(input, reader)
199
+ load_as_arrows
171
200
  end
172
201
 
173
202
  # @deprecated Use `format: :arrow_streaming` instead.
174
203
  def load_as_stream
175
- load_as_arrow_streaming
204
+ load_as_arrows
176
205
  end
177
206
 
178
207
  if Arrow.const_defined?(:ORCFileReader)
179
208
  def load_as_orc
180
- input = open_input_stream
181
- reader = ORCFileReader.new(input)
182
- field_indexes = @options[:field_indexes]
183
- reader.set_field_indexes(field_indexes) if field_indexes
184
- table = reader.read_stripes
185
- table.instance_variable_set(:@input, input)
186
- table
209
+ open_input_stream do |input|
210
+ reader = ORCFileReader.new(input)
211
+ field_indexes = @options[:field_indexes]
212
+ reader.set_field_indexes(field_indexes) if field_indexes
213
+ table = reader.read_stripes
214
+ table.instance_variable_set(:@input, input)
215
+ table
216
+ end
187
217
  end
188
218
  end
189
219
 
190
220
  def csv_load(options)
191
221
  options.delete(:format)
192
- if @input.is_a?(Buffer)
222
+ case @input
223
+ when Buffer
193
224
  CSVLoader.load(@input.data.to_s, **options)
225
+ when URI
226
+ @input.open do |input|
227
+ CSVLoader.load(input.read, **options)
228
+ end
194
229
  else
195
230
  CSVLoader.load(Pathname.new(@input), **options)
196
231
  end
@@ -207,19 +242,21 @@ module Arrow
207
242
  end
208
243
 
209
244
  def load_as_feather
210
- input = open_input_stream
211
- reader = FeatherFileReader.new(input)
212
- table = reader.read
213
- table.instance_variable_set(:@input, input)
214
- table
245
+ open_input_stream do |input|
246
+ reader = FeatherFileReader.new(input)
247
+ table = reader.read
248
+ table.instance_variable_set(:@input, input)
249
+ table
250
+ end
215
251
  end
216
252
 
217
253
  def load_as_json
218
- input = open_input_stream
219
- reader = JSONReader.new(input)
220
- table = reader.read
221
- table.instance_variable_set(:@input, input)
222
- table
254
+ open_input_stream do |input|
255
+ reader = JSONReader.new(input)
256
+ table = reader.read
257
+ table.instance_variable_set(:@input, input)
258
+ table
259
+ end
223
260
  end
224
261
  end
225
262
  end
@@ -151,14 +151,19 @@ module Arrow
151
151
  save_as_arrow_file
152
152
  end
153
153
 
154
+ # @since 7.0.0
155
+ def save_as_arrows
156
+ save_raw(RecordBatchStreamWriter)
157
+ end
158
+
154
159
  # @since 1.0.0
155
160
  def save_as_arrow_streaming
156
- save_raw(RecordBatchStreamWriter)
161
+ save_as_arrows
157
162
  end
158
163
 
159
164
  # @deprecated Use `format: :arrow_streaming` instead.
160
165
  def save_as_stream
161
- save_as_arrow_streaming
166
+ save_as_arrows
162
167
  end
163
168
 
164
169
  def csv_save(**options)
data/lib/arrow/table.rb CHANGED
@@ -448,6 +448,84 @@ module Arrow
448
448
  self.class.new(schema, packed_arrays)
449
449
  end
450
450
 
451
+ # @overload join(right, key, type: :inner, left_outputs: nil, right_outputs: nil)
452
+ # @!macro join_common_before
453
+ # @param right [Arrow::Table] The right table.
454
+ #
455
+ # Join columns with `right` on join key columns.
456
+ #
457
+ # @!macro join_common_after
458
+ # @param type [Arrow::JoinType] How to join.
459
+ # @param left_outputs [::Array<String, Symbol>] Output columns in
460
+ # `self`.
461
+ #
462
+ # If both of `left_outputs` and `right_outputs` aren't
463
+ # specified, all columns in `self` and `right` are
464
+ # outputted.
465
+ # @param right_outputs [::Array<String, Symbol>] Output columns in
466
+ # `right`.
467
+ #
468
+ # If both of `left_outputs` and `right_outputs` aren't
469
+ # specified, all columns in `self` and `right` are
470
+ # outputted.
471
+ # @return [Arrow::Table]
472
+ # The joined `Arrow::Table`.
473
+ #
474
+ # @macro join_common_before
475
+ # @param key [String, Symbol] A join key.
476
+ # @macro join_common_after
477
+ #
478
+ # @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
479
+ #
480
+ # @macro join_common_before
481
+ # @param keys [::Array<String, Symbol>] Join keys.
482
+ # @macro join_common_after
483
+ #
484
+ # @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
485
+ #
486
+ # @macro join_common_before
487
+ # @param keys [Hash] Specify join keys in `self` and `right` separately.
488
+ # @option keys [String, Symbol, ::Array<String, Symbol>] :left
489
+ # Join keys in `self`.
490
+ # @option keys [String, Symbol, ::Array<String, Symbol>] :right
491
+ # Join keys in `right`.
492
+ # @macro join_common_after
493
+ #
494
+ # @since 7.0.0
495
+ def join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
496
+ plan = ExecutePlan.new
497
+ left_node = plan.build_source_node(self)
498
+ right_node = plan.build_source_node(right)
499
+ if keys.is_a?(Hash)
500
+ left_keys = keys[:left]
501
+ right_keys = keys[:right]
502
+ else
503
+ left_keys = keys
504
+ right_keys = keys
505
+ end
506
+ left_keys = Array(left_keys)
507
+ right_keys = Array(right_keys)
508
+ hash_join_node_options = HashJoinNodeOptions.new(type,
509
+ left_keys,
510
+ right_keys)
511
+ unless left_outputs.nil?
512
+ hash_join_node_options.left_outputs = left_outputs
513
+ end
514
+ unless right_outputs.nil?
515
+ hash_join_node_options.right_outputs = right_outputs
516
+ end
517
+ hash_join_node = plan.build_hash_join_node(left_node,
518
+ right_node,
519
+ hash_join_node_options)
520
+ sink_node_options = SinkNodeOptions.new
521
+ plan.build_sink_node(hash_join_node, sink_node_options)
522
+ plan.validate
523
+ plan.start
524
+ plan.wait
525
+ reader = sink_node_options.get_reader(hash_join_node.output_schema)
526
+ reader.read_all
527
+ end
528
+
451
529
  alias_method :to_s_raw, :to_s
452
530
  def to_s(options={})
453
531
  format = options[:format]
data/lib/arrow/version.rb CHANGED
@@ -16,7 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  module Arrow
19
- VERSION = "6.0.0"
19
+ VERSION = "8.0.0"
20
20
 
21
21
  module Version
22
22
  numbers, TAG = VERSION.split("-")
data/red-arrow.gemspec CHANGED
@@ -48,19 +48,10 @@ Gem::Specification.new do |spec|
48
48
 
49
49
  spec.add_runtime_dependency("bigdecimal", ">= 2.0.3")
50
50
  spec.add_runtime_dependency("extpp", ">= 0.0.7")
51
- spec.add_runtime_dependency("gio2", ">= 3.4.9")
51
+ spec.add_runtime_dependency("gio2", ">= 3.5.0")
52
52
  spec.add_runtime_dependency("native-package-installer")
53
53
  spec.add_runtime_dependency("pkg-config")
54
54
 
55
- spec.add_development_dependency("benchmark-driver")
56
- spec.add_development_dependency("bundler")
57
- spec.add_development_dependency("faker")
58
- spec.add_development_dependency("fiddle", ">= 1.0.9")
59
- spec.add_development_dependency("rake")
60
- spec.add_development_dependency("redcarpet")
61
- spec.add_development_dependency("test-unit")
62
- spec.add_development_dependency("yard")
63
-
64
55
  required_msys2_package_version = version_components[0, 3].join(".")
65
56
  spec.metadata["msys2_mingw_dependencies"] =
66
57
  "arrow>=#{required_msys2_package_version}"
data/test/helper.rb CHANGED
@@ -20,6 +20,8 @@ require "arrow"
20
20
  require "fiddle"
21
21
  require "pathname"
22
22
  require "tempfile"
23
+ require "timeout"
24
+ require "webrick"
23
25
  require "zlib"
24
26
 
25
27
  require "test-unit"
@@ -346,6 +346,36 @@ module RawRecordsBasicArraysTests
346
346
  records)
347
347
  assert_equal(records, target.raw_records)
348
348
  end
349
+
350
+ def test_month_interval
351
+ records = [
352
+ [1],
353
+ [nil],
354
+ [12],
355
+ ]
356
+ target = build({column: :month_interval}, records)
357
+ assert_equal(records, target.raw_records)
358
+ end
359
+
360
+ def test_day_time_interval
361
+ records = [
362
+ [{day: 1, millisecond: 100}],
363
+ [nil],
364
+ [{day: 2, millisecond: 300}],
365
+ ]
366
+ target = build({column: :day_time_interval}, records)
367
+ assert_equal(records, target.raw_records)
368
+ end
369
+
370
+ def test_month_day_nano_interval
371
+ records = [
372
+ [{month: 1, day: 1, nanosecond: 100}],
373
+ [nil],
374
+ [{month: 2, day: 3, nanosecond: 400}],
375
+ ]
376
+ target = build({column: :month_day_nano_interval}, records)
377
+ assert_equal(records, target.raw_records)
378
+ end
349
379
  end
350
380
 
351
381
  class RawRecordsRecordBatchBasicArraysTest < Test::Unit::TestCase
@@ -359,6 +359,33 @@ module RawRecordsDenseUnionArrayTests
359
359
  assert_equal(records, target.raw_records)
360
360
  end
361
361
 
362
+ def test_month_interval
363
+ records = [
364
+ [{"0" => 1}],
365
+ [{"1" => nil}],
366
+ ]
367
+ target = build(:month_interval, records)
368
+ assert_equal(records, target.raw_records)
369
+ end
370
+
371
+ def test_day_time_interval
372
+ records = [
373
+ [{"0" => {day: 1, millisecond: 100}}],
374
+ [{"1" => nil}],
375
+ ]
376
+ target = build(:day_time_interval, records)
377
+ assert_equal(records, target.raw_records)
378
+ end
379
+
380
+ def test_month_day_nano_interval
381
+ records = [
382
+ [{"0" => {month: 1, day: 1, nanosecond: 100}}],
383
+ [{"1" => nil}],
384
+ ]
385
+ target = build(:month_day_nano_interval, records)
386
+ assert_equal(records, target.raw_records)
387
+ end
388
+
362
389
  def test_list
363
390
  records = [
364
391
  [{"0" => [true, nil, false]}],
@@ -399,6 +399,45 @@ module RawRecordsListArrayTests
399
399
  assert_equal(records, target.raw_records)
400
400
  end
401
401
 
402
+ def test_month_interval
403
+ records = [
404
+ [[1, nil, 12]],
405
+ [nil],
406
+ ]
407
+ target = build(:month_interval, records)
408
+ assert_equal(records, target.raw_records)
409
+ end
410
+
411
+ def test_day_time_interval
412
+ records = [
413
+ [
414
+ [
415
+ {day: 1, millisecond: 100},
416
+ nil,
417
+ {day: 2, millisecond: 300},
418
+ ]
419
+ ],
420
+ [nil],
421
+ ]
422
+ target = build(:day_time_interval, records)
423
+ assert_equal(records, target.raw_records)
424
+ end
425
+
426
+ def test_month_day_nano_interval
427
+ records = [
428
+ [
429
+ [
430
+ {month: 1, day: 1, nanosecond: 100},
431
+ nil,
432
+ {month: 2, day: 3, nanosecond: 400},
433
+ ]
434
+ ],
435
+ [nil],
436
+ ]
437
+ target = build(:month_day_nano_interval, records)
438
+ assert_equal(records, target.raw_records)
439
+ end
440
+
402
441
  def test_list
403
442
  records = [
404
443
  [
@@ -310,6 +310,43 @@ module RawRecordsMapArrayTests
310
310
  assert_equal(records, target.raw_records)
311
311
  end
312
312
 
313
+ def test_month_interval
314
+ records = [
315
+ [{"key1" => 1, "key2" => nil}],
316
+ [nil],
317
+ ]
318
+ target = build(:month_interval, records)
319
+ assert_equal(records, target.raw_records)
320
+ end
321
+
322
+ def test_day_time_interval
323
+ records = [
324
+ [
325
+ {
326
+ "key1" => {day: 1, millisecond: 100},
327
+ "key2" => nil,
328
+ },
329
+ ],
330
+ [nil],
331
+ ]
332
+ target = build(:day_time_interval, records)
333
+ assert_equal(records, target.raw_records)
334
+ end
335
+
336
+ def test_month_day_nano_interval
337
+ records = [
338
+ [
339
+ {
340
+ "key1" => {month: 1, day: 1, nanosecond: 100},
341
+ "key2" => nil,
342
+ },
343
+ ],
344
+ [nil],
345
+ ]
346
+ target = build(:month_day_nano_interval, records)
347
+ assert_equal(records, target.raw_records)
348
+ end
349
+
313
350
  def test_list
314
351
  records = [
315
352
  [{"key1" => [true, nil, false], "key2" => nil}],
@@ -349,6 +349,33 @@ module RawRecordsSparseUnionArrayTests
349
349
  assert_equal(records, target.raw_records)
350
350
  end
351
351
 
352
+ def test_month_interval
353
+ records = [
354
+ [{"0" => 1}],
355
+ [{"1" => nil}],
356
+ ]
357
+ target = build(:month_interval, records)
358
+ assert_equal(records, target.raw_records)
359
+ end
360
+
361
+ def test_day_time_interval
362
+ records = [
363
+ [{"0" => {day: 1, millisecond: 100}}],
364
+ [{"1" => nil}],
365
+ ]
366
+ target = build(:day_time_interval, records)
367
+ assert_equal(records, target.raw_records)
368
+ end
369
+
370
+ def test_month_day_nano_interval
371
+ records = [
372
+ [{"0" => {month: 1, day: 1, nanosecond: 100}}],
373
+ [{"1" => nil}],
374
+ ]
375
+ target = build(:month_day_nano_interval, records)
376
+ assert_equal(records, target.raw_records)
377
+ end
378
+
352
379
  def test_list
353
380
  records = [
354
381
  [{"0" => [true, nil, false]}],
@@ -344,6 +344,36 @@ module RawRecordsStructArrayTests
344
344
  assert_equal(records, target.raw_records)
345
345
  end
346
346
 
347
+ def test_month_interval
348
+ records = [
349
+ [{"field" => 1}],
350
+ [nil],
351
+ [{"field" => nil}],
352
+ ]
353
+ target = build(:month_interval, records)
354
+ assert_equal(records, target.raw_records)
355
+ end
356
+
357
+ def test_day_time_interval
358
+ records = [
359
+ [{"field" => {day: 1, millisecond: 100}}],
360
+ [nil],
361
+ [{"field" => nil}],
362
+ ]
363
+ target = build(:day_time_interval, records)
364
+ assert_equal(records, target.raw_records)
365
+ end
366
+
367
+ def test_month_day_nano_interval
368
+ records = [
369
+ [{"field" => {month: 1, day: 1, nanosecond: 100}}],
370
+ [nil],
371
+ [{"field" => nil}],
372
+ ]
373
+ target = build(:month_day_nano_interval, records)
374
+ assert_equal(records, target.raw_records)
375
+ end
376
+
347
377
  def test_list
348
378
  records = [
349
379
  [{"field" => [true, nil, false]}],