red-arrow 5.0.0 → 6.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +23 -0
  3. data/ext/arrow/converters.cpp +5 -0
  4. data/ext/arrow/converters.hpp +126 -0
  5. data/ext/arrow/extconf.rb +13 -0
  6. data/ext/arrow/raw-records.cpp +1 -0
  7. data/ext/arrow/values.cpp +1 -0
  8. data/lib/arrow/aggregate-node-options.rb +35 -0
  9. data/lib/arrow/aggregation.rb +46 -0
  10. data/lib/arrow/array-builder.rb +5 -0
  11. data/lib/arrow/binary-dictionary-array-builder.rb +27 -0
  12. data/lib/arrow/column-containable.rb +100 -1
  13. data/lib/arrow/datum.rb +2 -0
  14. data/lib/arrow/expression.rb +48 -0
  15. data/lib/arrow/file-system.rb +34 -0
  16. data/lib/arrow/group.rb +116 -124
  17. data/lib/arrow/loader.rb +13 -0
  18. data/lib/arrow/map-array-builder.rb +109 -0
  19. data/lib/arrow/map-array.rb +26 -0
  20. data/lib/arrow/map-data-type.rb +89 -0
  21. data/lib/arrow/path-extension.rb +1 -1
  22. data/lib/arrow/record-batch-reader.rb +41 -0
  23. data/lib/arrow/record-batch.rb +0 -2
  24. data/lib/arrow/slicer.rb +44 -143
  25. data/lib/arrow/source-node-options.rb +32 -0
  26. data/lib/arrow/string-dictionary-array-builder.rb +27 -0
  27. data/lib/arrow/symbol-values-appendable.rb +34 -0
  28. data/lib/arrow/table-concatenate-options.rb +36 -0
  29. data/lib/arrow/table-formatter.rb +141 -17
  30. data/lib/arrow/table-list-formatter.rb +5 -3
  31. data/lib/arrow/table-loader.rb +41 -3
  32. data/lib/arrow/table-saver.rb +29 -3
  33. data/lib/arrow/table-table-formatter.rb +7 -31
  34. data/lib/arrow/table.rb +32 -38
  35. data/lib/arrow/version.rb +1 -1
  36. data/red-arrow.gemspec +1 -1
  37. data/test/raw-records/test-dense-union-array.rb +14 -0
  38. data/test/raw-records/test-list-array.rb +19 -0
  39. data/test/raw-records/test-map-array.rb +441 -0
  40. data/test/raw-records/test-sparse-union-array.rb +14 -0
  41. data/test/raw-records/test-struct-array.rb +15 -0
  42. data/test/test-array-builder.rb +7 -0
  43. data/test/test-binary-dictionary-array-builder.rb +103 -0
  44. data/test/test-csv-loader.rb +8 -8
  45. data/test/test-expression.rb +40 -0
  46. data/test/test-group.rb +75 -51
  47. data/test/test-map-array-builder.rb +110 -0
  48. data/test/test-map-array.rb +33 -0
  49. data/test/test-map-data-type.rb +36 -0
  50. data/test/test-record-batch-reader.rb +46 -0
  51. data/test/test-record-batch.rb +42 -0
  52. data/test/test-slicer.rb +166 -167
  53. data/test/test-string-dictionary-array-builder.rb +103 -0
  54. data/test/test-table.rb +190 -53
  55. data/test/values/test-dense-union-array.rb +14 -0
  56. data/test/values/test-list-array.rb +17 -0
  57. data/test/values/test-map-array.rb +433 -0
  58. data/test/values/test-sparse-union-array.rb +14 -0
  59. data/test/values/test-struct-array.rb +15 -0
  60. metadata +107 -76
@@ -15,6 +15,8 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
+ require "uri"
19
+
18
20
  module Arrow
19
21
  class TableLoader
20
22
  class << self
@@ -31,6 +33,31 @@ module Arrow
31
33
  end
32
34
 
33
35
  def load
36
+ if @input.is_a?(URI)
37
+ custom_load_method = "load_from_uri"
38
+ elsif @input.is_a?(String) and ::File.directory?(@input)
39
+ custom_load_method = "load_from_directory"
40
+ else
41
+ custom_load_method = "load_from_file"
42
+ end
43
+ unless respond_to?(custom_load_method, true)
44
+ available_schemes = []
45
+ (methods(true) | private_methods(true)).each do |name|
46
+ match_data = /\Aload_from_/.match(name.to_s)
47
+ if match_data
48
+ available_schemes << match_data.post_match
49
+ end
50
+ end
51
+ message = "Arrow::Table load source must be one of ["
52
+ message << available_schemes.join(", ")
53
+ message << "]: #{@input.inspect}"
54
+ raise ArgumentError, message
55
+ end
56
+ __send__(custom_load_method)
57
+ end
58
+
59
+ private
60
+ def load_from_file
34
61
  format = @options[:format]
35
62
  custom_load_method = "load_as_#{format}"
36
63
  unless respond_to?(custom_load_method, true)
@@ -56,21 +83,24 @@ module Arrow
56
83
  end
57
84
  end
58
85
 
59
- private
60
86
  def fill_options
61
87
  if @options[:format] and @options.key?(:compression)
62
88
  return
63
89
  end
64
90
 
65
- if @input.is_a?(Buffer)
91
+ case @input
92
+ when Buffer
66
93
  info = {}
94
+ when URI
95
+ extension = PathExtension.new(@input.path)
96
+ info = extension.extract
67
97
  else
68
98
  extension = PathExtension.new(@input)
69
99
  info = extension.extract
70
100
  end
71
101
  format = info[:format]
72
102
  @options = @options.dup
73
- if format and respond_to?("load_as_#{format}", true)
103
+ if format
74
104
  @options[:format] ||= format.to_sym
75
105
  else
76
106
  @options[:format] ||= :arrow
@@ -183,5 +213,13 @@ module Arrow
183
213
  table.instance_variable_set(:@input, input)
184
214
  table
185
215
  end
216
+
217
+ def load_as_json
218
+ input = open_input_stream
219
+ reader = JSONReader.new(input)
220
+ table = reader.read
221
+ table.instance_variable_set(:@input, input)
222
+ table
223
+ end
186
224
  end
187
225
  end
@@ -32,6 +32,29 @@ module Arrow
32
32
  end
33
33
 
34
34
  def save
35
+ if @output.is_a?(URI)
36
+ custom_save_method = "save_to_uri"
37
+ else
38
+ custom_save_method = "save_to_file"
39
+ end
40
+ unless respond_to?(custom_save_method, true)
41
+ available_schemes = []
42
+ (methods(true) | private_methods(true)).each do |name|
43
+ match_data = /\Asave_to_/.match(name.to_s)
44
+ if match_data
45
+ available_schemes << match_data.post_match
46
+ end
47
+ end
48
+ message = "Arrow::Table save source must be one of ["
49
+ message << available_schemes.join(", ")
50
+ message << "]: #{@output.scheme.inspect}"
51
+ raise ArgumentError, message
52
+ end
53
+ __send__(custom_save_method)
54
+ end
55
+
56
+ private
57
+ def save_to_file
35
58
  format = @options[:format]
36
59
  custom_save_method = "save_as_#{format}"
37
60
  unless respond_to?(custom_save_method, true)
@@ -57,21 +80,24 @@ module Arrow
57
80
  end
58
81
  end
59
82
 
60
- private
61
83
  def fill_options
62
84
  if @options[:format] and @options.key?(:compression)
63
85
  return
64
86
  end
65
87
 
66
- if @output.is_a?(Buffer)
88
+ case @output
89
+ when Buffer
67
90
  info = {}
91
+ when URI
92
+ extension = PathExtension.new(@output.path)
93
+ info = extension.extract
68
94
  else
69
95
  extension = PathExtension.new(@output)
70
96
  info = extension.extract
71
97
  end
72
98
  format = info[:format]
73
99
  @options = @options.dup
74
- if format and respond_to?("save_as_#{format}", true)
100
+ if format
75
101
  @options[:format] ||= format.to_sym
76
102
  else
77
103
  @options[:format] ||= :arrow
@@ -21,51 +21,27 @@ module Arrow
21
21
  # TODO: Most of this code should be implemented in Apache Arrow C++.
22
22
  class TableTableFormatter < TableFormatter
23
23
  private
24
- def format_header(text, columns)
25
- columns.each do |column|
24
+ def format_header(text, column_formatters)
25
+ column_formatters.each do |column_formatter|
26
26
  text << "\t"
27
- text << format_column_name(column)
27
+ text << column_formatter.aligned_name
28
28
  end
29
29
  text << "\n"
30
30
  end
31
31
 
32
- FLOAT_N_DIGITS = 10
33
- def format_column_name(column)
34
- case column.data_type
35
- when TimestampDataType
36
- "%*s" % [::Time.now.iso8601.size, column.name]
37
- when FloatDataType, DoubleDataType
38
- "%*s" % [FLOAT_N_DIGITS, column.name]
39
- else
40
- column.name
41
- end
42
- end
43
-
44
- def format_rows(text, columns, rows, n_digits, start_offset)
32
+ def format_rows(text, column_formatters, rows, n_digits, start_offset)
45
33
  rows.each_with_index do |row, nth_row|
46
34
  text << ("%*d" % [n_digits, start_offset + nth_row])
47
35
  row.each_with_index do |column_value, nth_column|
48
36
  text << "\t"
49
- column = columns[nth_column]
50
- text << format_column_value(column, column_value)
37
+ column_formatter = column_formatters[nth_column]
38
+ aligned_name = column_formatter.aligned_name
39
+ text << column_formatter.format_value(column_value, aligned_name.size)
51
40
  end
52
41
  text << "\n"
53
42
  end
54
43
  end
55
44
 
56
- def format_column_value(column, value)
57
- case value
58
- when ::Time
59
- value.iso8601
60
- when Float
61
- "%*f" % [[column.name.size, FLOAT_N_DIGITS].max, value]
62
- when Integer
63
- "%*d" % [column.name.size, value]
64
- else
65
- "%-*s" % [column.name.size, value.to_s]
66
- end
67
- end
68
-
69
45
  def format_ellipsis(text)
70
46
  text << "...\n"
71
47
  end
data/lib/arrow/table.rb CHANGED
@@ -195,8 +195,6 @@ module Arrow
195
195
  alias_method :size, :n_rows
196
196
  alias_method :length, :n_rows
197
197
 
198
- alias_method :[], :find_column
199
-
200
198
  alias_method :slice_raw, :slice
201
199
 
202
200
  # @overload slice(offset, length)
@@ -236,6 +234,12 @@ module Arrow
236
234
  # @return [Arrow::Table]
237
235
  # The sub `Arrow::Table` that covers only rows of the range of indices.
238
236
  #
237
+ # @overload slice(conditions)
238
+ #
239
+ # @param conditions [Hash] The conditions to select records.
240
+ # @return [Arrow::Table]
241
+ # The sub `Arrow::Table` that covers only rows matched by the conditions.
242
+ #
239
243
  # @overload slice
240
244
  #
241
245
  # @yield [slicer] Gives slicer that constructs condition to select records.
@@ -263,12 +267,37 @@ module Arrow
263
267
  expected_n_args = nil
264
268
  case args.size
265
269
  when 1
266
- if args[0].is_a?(Integer)
270
+ case args[0]
271
+ when Integer
267
272
  index = args[0]
268
273
  index += n_rows if index < 0
269
274
  return nil if index < 0
270
275
  return nil if index >= n_rows
271
276
  return Record.new(self, index)
277
+ when Hash
278
+ condition_pairs = args[0]
279
+ slicer = Slicer.new(self)
280
+ conditions = []
281
+ condition_pairs.each do |key, value|
282
+ case value
283
+ when Range
284
+ # TODO: Optimize the "begin <= key <= end" case once the missing "between" kernel is available
285
+ # https://issues.apache.org/jira/browse/ARROW-9843
286
+ unless value.begin.nil?
287
+ conditions << (slicer[key] >= value.begin)
288
+ end
289
+ unless value.end.nil?
290
+ if value.exclude_end?
291
+ conditions << (slicer[key] < value.end)
292
+ else
293
+ conditions << (slicer[key] <= value.end)
294
+ end
295
+ end
296
+ else
297
+ conditions << (slicer[key] == value)
298
+ end
299
+ end
300
+ slicers << conditions.inject(:&)
272
301
  else
273
302
  slicers << args[0]
274
303
  end
@@ -397,41 +426,6 @@ module Arrow
397
426
  remove_column_raw(index)
398
427
  end
399
428
 
400
- # TODO
401
- #
402
- # @return [Arrow::Table]
403
- def select_columns(*selectors, &block)
404
- if selectors.empty?
405
- return to_enum(__method__) unless block_given?
406
- selected_columns = columns.select(&block)
407
- else
408
- selected_columns = []
409
- selectors.each do |selector|
410
- case selector
411
- when String, Symbol
412
- column = find_column(selector)
413
- if column.nil?
414
- message = "unknown column: #{selector.inspect}: #{inspect}"
415
- raise KeyError.new(message)
416
- end
417
- selected_columns << column
418
- when Range
419
- selected_columns.concat(columns[selector])
420
- else
421
- column = columns[selector]
422
- if column.nil?
423
- message = "out of index (0..#{n_columns - 1}): " +
424
- "#{selector.inspect}: #{inspect}"
425
- raise IndexError.new(message)
426
- end
427
- selected_columns << column
428
- end
429
- end
430
- selected_columns = selected_columns.select(&block) if block_given?
431
- end
432
- self.class.new(selected_columns)
433
- end
434
-
435
429
  # Experimental
436
430
  def group(*keys)
437
431
  Group.new(self, keys)
data/lib/arrow/version.rb CHANGED
@@ -16,7 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  module Arrow
19
- VERSION = "5.0.0"
19
+ VERSION = "6.0.0"
20
20
 
21
21
  module Version
22
22
  numbers, TAG = VERSION.split("-")
data/red-arrow.gemspec CHANGED
@@ -48,7 +48,7 @@ Gem::Specification.new do |spec|
48
48
 
49
49
  spec.add_runtime_dependency("bigdecimal", ">= 2.0.3")
50
50
  spec.add_runtime_dependency("extpp", ">= 0.0.7")
51
- spec.add_runtime_dependency("gio2", ">= 3.4.5")
51
+ spec.add_runtime_dependency("gio2", ">= 3.4.9")
52
52
  spec.add_runtime_dependency("native-package-installer")
53
53
  spec.add_runtime_dependency("pkg-config")
54
54
 
@@ -394,6 +394,20 @@ module RawRecordsDenseUnionArrayTests
394
394
  assert_equal(records, target.raw_records)
395
395
  end
396
396
 
397
+ def test_map
398
+ records = [
399
+ [{"0" => {"key1" => true, "key2" => nil}}],
400
+ [{"1" => nil}],
401
+ ]
402
+ target = build({
403
+ type: :map,
404
+ key: :string,
405
+ item: :boolean,
406
+ },
407
+ records)
408
+ assert_equal(records, target.raw_records)
409
+ end
410
+
397
411
  def test_sparse_union
398
412
  omit("Need to add support for SparseUnionArrayBuilder")
399
413
  records = [
@@ -451,6 +451,25 @@ module RawRecordsListArrayTests
451
451
  assert_equal(records, target.raw_records)
452
452
  end
453
453
 
454
+ def test_map
455
+ records = [
456
+ [
457
+ [
458
+ {"key1" => true, "key2" => nil},
459
+ nil,
460
+ ],
461
+ ],
462
+ [nil],
463
+ ]
464
+ target = build({
465
+ type: :map,
466
+ key: :string,
467
+ item: :boolean,
468
+ },
469
+ records)
470
+ assert_equal(records, target.raw_records)
471
+ end
472
+
454
473
  def test_sparse
455
474
  omit("Need to add support for SparseUnionArrayBuilder")
456
475
  records = [