red-arrow 5.0.0 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -0
  3. data/ext/arrow/converters.cpp +5 -0
  4. data/ext/arrow/converters.hpp +126 -0
  5. data/ext/arrow/extconf.rb +13 -0
  6. data/ext/arrow/raw-records.cpp +1 -0
  7. data/ext/arrow/values.cpp +1 -0
  8. data/lib/arrow/aggregate-node-options.rb +35 -0
  9. data/lib/arrow/aggregation.rb +46 -0
  10. data/lib/arrow/array-builder.rb +5 -0
  11. data/lib/arrow/binary-dictionary-array-builder.rb +27 -0
  12. data/lib/arrow/column-containable.rb +100 -1
  13. data/lib/arrow/datum.rb +2 -0
  14. data/lib/arrow/expression.rb +48 -0
  15. data/lib/arrow/file-system.rb +34 -0
  16. data/lib/arrow/group.rb +116 -124
  17. data/lib/arrow/loader.rb +13 -0
  18. data/lib/arrow/map-array-builder.rb +109 -0
  19. data/lib/arrow/map-array.rb +26 -0
  20. data/lib/arrow/map-data-type.rb +89 -0
  21. data/lib/arrow/path-extension.rb +1 -1
  22. data/lib/arrow/record-batch-reader.rb +41 -0
  23. data/lib/arrow/record-batch.rb +0 -2
  24. data/lib/arrow/slicer.rb +44 -143
  25. data/lib/arrow/source-node-options.rb +32 -0
  26. data/lib/arrow/string-dictionary-array-builder.rb +27 -0
  27. data/lib/arrow/symbol-values-appendable.rb +34 -0
  28. data/lib/arrow/table-concatenate-options.rb +36 -0
  29. data/lib/arrow/table-formatter.rb +141 -17
  30. data/lib/arrow/table-list-formatter.rb +5 -3
  31. data/lib/arrow/table-loader.rb +41 -3
  32. data/lib/arrow/table-saver.rb +29 -3
  33. data/lib/arrow/table-table-formatter.rb +7 -31
  34. data/lib/arrow/table.rb +32 -38
  35. data/lib/arrow/version.rb +1 -1
  36. data/red-arrow.gemspec +1 -1
  37. data/test/raw-records/test-dense-union-array.rb +14 -0
  38. data/test/raw-records/test-list-array.rb +19 -0
  39. data/test/raw-records/test-map-array.rb +441 -0
  40. data/test/raw-records/test-sparse-union-array.rb +14 -0
  41. data/test/raw-records/test-struct-array.rb +15 -0
  42. data/test/test-array-builder.rb +7 -0
  43. data/test/test-binary-dictionary-array-builder.rb +103 -0
  44. data/test/test-csv-loader.rb +8 -8
  45. data/test/test-expression.rb +40 -0
  46. data/test/test-group.rb +75 -51
  47. data/test/test-map-array-builder.rb +110 -0
  48. data/test/test-map-array.rb +33 -0
  49. data/test/test-map-data-type.rb +36 -0
  50. data/test/test-record-batch-reader.rb +46 -0
  51. data/test/test-record-batch.rb +42 -0
  52. data/test/test-slicer.rb +166 -167
  53. data/test/test-string-dictionary-array-builder.rb +103 -0
  54. data/test/test-table.rb +190 -53
  55. data/test/values/test-dense-union-array.rb +14 -0
  56. data/test/values/test-list-array.rb +17 -0
  57. data/test/values/test-map-array.rb +433 -0
  58. data/test/values/test-sparse-union-array.rb +14 -0
  59. data/test/values/test-struct-array.rb +15 -0
  60. metadata +107 -76
@@ -15,6 +15,8 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
+ require "uri"
19
+
18
20
  module Arrow
19
21
  class TableLoader
20
22
  class << self
@@ -31,6 +33,31 @@ module Arrow
31
33
  end
32
34
 
33
35
  def load
36
+ if @input.is_a?(URI)
37
+ custom_load_method = "load_from_uri"
38
+ elsif @input.is_a?(String) and ::File.directory?(@input)
39
+ custom_load_method = "load_from_directory"
40
+ else
41
+ custom_load_method = "load_from_file"
42
+ end
43
+ unless respond_to?(custom_load_method, true)
44
+ available_schemes = []
45
+ (methods(true) | private_methods(true)).each do |name|
46
+ match_data = /\Aload_from_/.match(name.to_s)
47
+ if match_data
48
+ available_schemes << match_data.post_match
49
+ end
50
+ end
51
+ message = "Arrow::Table load source must be one of ["
52
+ message << available_schemes.join(", ")
53
+ message << "]: #{@input.inspect}"
54
+ raise ArgumentError, message
55
+ end
56
+ __send__(custom_load_method)
57
+ end
58
+
59
+ private
60
+ def load_from_file
34
61
  format = @options[:format]
35
62
  custom_load_method = "load_as_#{format}"
36
63
  unless respond_to?(custom_load_method, true)
@@ -56,21 +83,24 @@ module Arrow
56
83
  end
57
84
  end
58
85
 
59
- private
60
86
  def fill_options
61
87
  if @options[:format] and @options.key?(:compression)
62
88
  return
63
89
  end
64
90
 
65
- if @input.is_a?(Buffer)
91
+ case @input
92
+ when Buffer
66
93
  info = {}
94
+ when URI
95
+ extension = PathExtension.new(@input.path)
96
+ info = extension.extract
67
97
  else
68
98
  extension = PathExtension.new(@input)
69
99
  info = extension.extract
70
100
  end
71
101
  format = info[:format]
72
102
  @options = @options.dup
73
- if format and respond_to?("load_as_#{format}", true)
103
+ if format
74
104
  @options[:format] ||= format.to_sym
75
105
  else
76
106
  @options[:format] ||= :arrow
@@ -183,5 +213,13 @@ module Arrow
183
213
  table.instance_variable_set(:@input, input)
184
214
  table
185
215
  end
216
+
217
+ def load_as_json
218
+ input = open_input_stream
219
+ reader = JSONReader.new(input)
220
+ table = reader.read
221
+ table.instance_variable_set(:@input, input)
222
+ table
223
+ end
186
224
  end
187
225
  end
@@ -32,6 +32,29 @@ module Arrow
32
32
  end
33
33
 
34
34
  def save
35
+ if @output.is_a?(URI)
36
+ custom_save_method = "save_to_uri"
37
+ else
38
+ custom_save_method = "save_to_file"
39
+ end
40
+ unless respond_to?(custom_save_method, true)
41
+ available_schemes = []
42
+ (methods(true) | private_methods(true)).each do |name|
43
+ match_data = /\Asave_to_/.match(name.to_s)
44
+ if match_data
45
+ available_schemes << match_data.post_match
46
+ end
47
+ end
48
+ message = "Arrow::Table save source must be one of ["
49
+ message << available_schemes.join(", ")
50
+ message << "]: #{@output.scheme.inspect}"
51
+ raise ArgumentError, message
52
+ end
53
+ __send__(custom_save_method)
54
+ end
55
+
56
+ private
57
+ def save_to_file
35
58
  format = @options[:format]
36
59
  custom_save_method = "save_as_#{format}"
37
60
  unless respond_to?(custom_save_method, true)
@@ -57,21 +80,24 @@ module Arrow
57
80
  end
58
81
  end
59
82
 
60
- private
61
83
  def fill_options
62
84
  if @options[:format] and @options.key?(:compression)
63
85
  return
64
86
  end
65
87
 
66
- if @output.is_a?(Buffer)
88
+ case @output
89
+ when Buffer
67
90
  info = {}
91
+ when URI
92
+ extension = PathExtension.new(@output.path)
93
+ info = extension.extract
68
94
  else
69
95
  extension = PathExtension.new(@output)
70
96
  info = extension.extract
71
97
  end
72
98
  format = info[:format]
73
99
  @options = @options.dup
74
- if format and respond_to?("save_as_#{format}", true)
100
+ if format
75
101
  @options[:format] ||= format.to_sym
76
102
  else
77
103
  @options[:format] ||= :arrow
@@ -21,51 +21,27 @@ module Arrow
21
21
  # TODO: Almost codes should be implemented in Apache Arrow C++.
22
22
  class TableTableFormatter < TableFormatter
23
23
  private
24
- def format_header(text, columns)
25
- columns.each do |column|
24
+ def format_header(text, column_formatters)
25
+ column_formatters.each do |column_formatter|
26
26
  text << "\t"
27
- text << format_column_name(column)
27
+ text << column_formatter.aligned_name
28
28
  end
29
29
  text << "\n"
30
30
  end
31
31
 
32
- FLOAT_N_DIGITS = 10
33
- def format_column_name(column)
34
- case column.data_type
35
- when TimestampDataType
36
- "%*s" % [::Time.now.iso8601.size, column.name]
37
- when FloatDataType, DoubleDataType
38
- "%*s" % [FLOAT_N_DIGITS, column.name]
39
- else
40
- column.name
41
- end
42
- end
43
-
44
- def format_rows(text, columns, rows, n_digits, start_offset)
32
+ def format_rows(text, column_formatters, rows, n_digits, start_offset)
45
33
  rows.each_with_index do |row, nth_row|
46
34
  text << ("%*d" % [n_digits, start_offset + nth_row])
47
35
  row.each_with_index do |column_value, nth_column|
48
36
  text << "\t"
49
- column = columns[nth_column]
50
- text << format_column_value(column, column_value)
37
+ column_formatter = column_formatters[nth_column]
38
+ aligned_name = column_formatter.aligned_name
39
+ text << column_formatter.format_value(column_value, aligned_name.size)
51
40
  end
52
41
  text << "\n"
53
42
  end
54
43
  end
55
44
 
56
- def format_column_value(column, value)
57
- case value
58
- when ::Time
59
- value.iso8601
60
- when Float
61
- "%*f" % [[column.name.size, FLOAT_N_DIGITS].max, value]
62
- when Integer
63
- "%*d" % [column.name.size, value]
64
- else
65
- "%-*s" % [column.name.size, value.to_s]
66
- end
67
- end
68
-
69
45
  def format_ellipsis(text)
70
46
  text << "...\n"
71
47
  end
data/lib/arrow/table.rb CHANGED
@@ -195,8 +195,6 @@ module Arrow
195
195
  alias_method :size, :n_rows
196
196
  alias_method :length, :n_rows
197
197
 
198
- alias_method :[], :find_column
199
-
200
198
  alias_method :slice_raw, :slice
201
199
 
202
200
  # @overload slice(offset, length)
@@ -236,6 +234,12 @@ module Arrow
236
234
  # @return [Arrow::Table]
237
235
  # The sub `Arrow::Table` that covers only rows of the range of indices.
238
236
  #
237
+ # @overload slice(conditions)
238
+ #
239
+ # @param conditions [Hash] The conditions to select records.
240
+ # @return [Arrow::Table]
241
+ # The sub `Arrow::Table` that covers only rows matched by condition
242
+ #
239
243
  # @overload slice
240
244
  #
241
245
  # @yield [slicer] Gives slicer that constructs condition to select records.
@@ -263,12 +267,37 @@ module Arrow
263
267
  expected_n_args = nil
264
268
  case args.size
265
269
  when 1
266
- if args[0].is_a?(Integer)
270
+ case args[0]
271
+ when Integer
267
272
  index = args[0]
268
273
  index += n_rows if index < 0
269
274
  return nil if index < 0
270
275
  return nil if index >= n_rows
271
276
  return Record.new(self, index)
277
+ when Hash
278
+ condition_pairs = args[0]
279
+ slicer = Slicer.new(self)
280
+ conditions = []
281
+ condition_pairs.each do |key, value|
282
+ case value
283
+ when Range
284
+ # TODO: Optimize "begin <= key <= end" case by missing "between" kernel
285
+ # https://issues.apache.org/jira/browse/ARROW-9843
286
+ unless value.begin.nil?
287
+ conditions << (slicer[key] >= value.begin)
288
+ end
289
+ unless value.end.nil?
290
+ if value.exclude_end?
291
+ conditions << (slicer[key] < value.end)
292
+ else
293
+ conditions << (slicer[key] <= value.end)
294
+ end
295
+ end
296
+ else
297
+ conditions << (slicer[key] == value)
298
+ end
299
+ end
300
+ slicers << conditions.inject(:&)
272
301
  else
273
302
  slicers << args[0]
274
303
  end
@@ -397,41 +426,6 @@ module Arrow
397
426
  remove_column_raw(index)
398
427
  end
399
428
 
400
- # TODO
401
- #
402
- # @return [Arrow::Table]
403
- def select_columns(*selectors, &block)
404
- if selectors.empty?
405
- return to_enum(__method__) unless block_given?
406
- selected_columns = columns.select(&block)
407
- else
408
- selected_columns = []
409
- selectors.each do |selector|
410
- case selector
411
- when String, Symbol
412
- column = find_column(selector)
413
- if column.nil?
414
- message = "unknown column: #{selector.inspect}: #{inspect}"
415
- raise KeyError.new(message)
416
- end
417
- selected_columns << column
418
- when Range
419
- selected_columns.concat(columns[selector])
420
- else
421
- column = columns[selector]
422
- if column.nil?
423
- message = "out of index (0..#{n_columns - 1}): " +
424
- "#{selector.inspect}: #{inspect}"
425
- raise IndexError.new(message)
426
- end
427
- selected_columns << column
428
- end
429
- end
430
- selected_columns = selected_columns.select(&block) if block_given?
431
- end
432
- self.class.new(selected_columns)
433
- end
434
-
435
429
  # Experimental
436
430
  def group(*keys)
437
431
  Group.new(self, keys)
data/lib/arrow/version.rb CHANGED
@@ -16,7 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  module Arrow
19
- VERSION = "5.0.0"
19
+ VERSION = "6.0.0"
20
20
 
21
21
  module Version
22
22
  numbers, TAG = VERSION.split("-")
data/red-arrow.gemspec CHANGED
@@ -48,7 +48,7 @@ Gem::Specification.new do |spec|
48
48
 
49
49
  spec.add_runtime_dependency("bigdecimal", ">= 2.0.3")
50
50
  spec.add_runtime_dependency("extpp", ">= 0.0.7")
51
- spec.add_runtime_dependency("gio2", ">= 3.4.5")
51
+ spec.add_runtime_dependency("gio2", ">= 3.4.9")
52
52
  spec.add_runtime_dependency("native-package-installer")
53
53
  spec.add_runtime_dependency("pkg-config")
54
54
 
@@ -394,6 +394,20 @@ module RawRecordsDenseUnionArrayTests
394
394
  assert_equal(records, target.raw_records)
395
395
  end
396
396
 
397
+ def test_map
398
+ records = [
399
+ [{"0" => {"key1" => true, "key2" => nil}}],
400
+ [{"1" => nil}],
401
+ ]
402
+ target = build({
403
+ type: :map,
404
+ key: :string,
405
+ item: :boolean,
406
+ },
407
+ records)
408
+ assert_equal(records, target.raw_records)
409
+ end
410
+
397
411
  def test_sparse_union
398
412
  omit("Need to add support for SparseUnionArrayBuilder")
399
413
  records = [
@@ -451,6 +451,25 @@ module RawRecordsListArrayTests
451
451
  assert_equal(records, target.raw_records)
452
452
  end
453
453
 
454
+ def test_map
455
+ records = [
456
+ [
457
+ [
458
+ {"key1" => true, "key2" => nil},
459
+ nil,
460
+ ],
461
+ ],
462
+ [nil],
463
+ ]
464
+ target = build({
465
+ type: :map,
466
+ key: :string,
467
+ item: :boolean,
468
+ },
469
+ records)
470
+ assert_equal(records, target.raw_records)
471
+ end
472
+
454
473
  def test_sparse
455
474
  omit("Need to add support for SparseUnionArrayBuilder")
456
475
  records = [