red-arrow 5.0.0 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +23 -0
- data/ext/arrow/converters.cpp +5 -0
- data/ext/arrow/converters.hpp +126 -0
- data/ext/arrow/extconf.rb +13 -0
- data/ext/arrow/raw-records.cpp +1 -0
- data/ext/arrow/values.cpp +1 -0
- data/lib/arrow/aggregate-node-options.rb +35 -0
- data/lib/arrow/aggregation.rb +46 -0
- data/lib/arrow/array-builder.rb +5 -0
- data/lib/arrow/binary-dictionary-array-builder.rb +27 -0
- data/lib/arrow/column-containable.rb +100 -1
- data/lib/arrow/datum.rb +2 -0
- data/lib/arrow/expression.rb +48 -0
- data/lib/arrow/file-system.rb +34 -0
- data/lib/arrow/group.rb +116 -124
- data/lib/arrow/loader.rb +13 -0
- data/lib/arrow/map-array-builder.rb +109 -0
- data/lib/arrow/map-array.rb +26 -0
- data/lib/arrow/map-data-type.rb +89 -0
- data/lib/arrow/path-extension.rb +1 -1
- data/lib/arrow/record-batch-reader.rb +41 -0
- data/lib/arrow/record-batch.rb +0 -2
- data/lib/arrow/slicer.rb +44 -143
- data/lib/arrow/source-node-options.rb +32 -0
- data/lib/arrow/string-dictionary-array-builder.rb +27 -0
- data/lib/arrow/symbol-values-appendable.rb +34 -0
- data/lib/arrow/table-concatenate-options.rb +36 -0
- data/lib/arrow/table-formatter.rb +141 -17
- data/lib/arrow/table-list-formatter.rb +5 -3
- data/lib/arrow/table-loader.rb +41 -3
- data/lib/arrow/table-saver.rb +29 -3
- data/lib/arrow/table-table-formatter.rb +7 -31
- data/lib/arrow/table.rb +32 -38
- data/lib/arrow/version.rb +1 -1
- data/red-arrow.gemspec +1 -1
- data/test/raw-records/test-dense-union-array.rb +14 -0
- data/test/raw-records/test-list-array.rb +19 -0
- data/test/raw-records/test-map-array.rb +441 -0
- data/test/raw-records/test-sparse-union-array.rb +14 -0
- data/test/raw-records/test-struct-array.rb +15 -0
- data/test/test-array-builder.rb +7 -0
- data/test/test-binary-dictionary-array-builder.rb +103 -0
- data/test/test-csv-loader.rb +8 -8
- data/test/test-expression.rb +40 -0
- data/test/test-group.rb +75 -51
- data/test/test-map-array-builder.rb +110 -0
- data/test/test-map-array.rb +33 -0
- data/test/test-map-data-type.rb +36 -0
- data/test/test-record-batch-reader.rb +46 -0
- data/test/test-record-batch.rb +42 -0
- data/test/test-slicer.rb +166 -167
- data/test/test-string-dictionary-array-builder.rb +103 -0
- data/test/test-table.rb +190 -53
- data/test/values/test-dense-union-array.rb +14 -0
- data/test/values/test-list-array.rb +17 -0
- data/test/values/test-map-array.rb +433 -0
- data/test/values/test-sparse-union-array.rb +14 -0
- data/test/values/test-struct-array.rb +15 -0
- metadata +107 -76
data/lib/arrow/table-loader.rb
CHANGED
@@ -15,6 +15,8 @@
|
|
15
15
|
# specific language governing permissions and limitations
|
16
16
|
# under the License.
|
17
17
|
|
18
|
+
require "uri"
|
19
|
+
|
18
20
|
module Arrow
|
19
21
|
class TableLoader
|
20
22
|
class << self
|
@@ -31,6 +33,31 @@ module Arrow
|
|
31
33
|
end
|
32
34
|
|
33
35
|
def load
|
36
|
+
if @input.is_a?(URI)
|
37
|
+
custom_load_method = "load_from_uri"
|
38
|
+
elsif @input.is_a?(String) and ::File.directory?(@input)
|
39
|
+
custom_load_method = "load_from_directory"
|
40
|
+
else
|
41
|
+
custom_load_method = "load_from_file"
|
42
|
+
end
|
43
|
+
unless respond_to?(custom_load_method, true)
|
44
|
+
available_schemes = []
|
45
|
+
(methods(true) | private_methods(true)).each do |name|
|
46
|
+
match_data = /\Aload_from_/.match(name.to_s)
|
47
|
+
if match_data
|
48
|
+
available_schemes << match_data.post_match
|
49
|
+
end
|
50
|
+
end
|
51
|
+
message = "Arrow::Table load source must be one of ["
|
52
|
+
message << available_schemes.join(", ")
|
53
|
+
message << "]: #{@input.inspect}"
|
54
|
+
raise ArgumentError, message
|
55
|
+
end
|
56
|
+
__send__(custom_load_method)
|
57
|
+
end
|
58
|
+
|
59
|
+
private
|
60
|
+
def load_from_file
|
34
61
|
format = @options[:format]
|
35
62
|
custom_load_method = "load_as_#{format}"
|
36
63
|
unless respond_to?(custom_load_method, true)
|
@@ -56,21 +83,24 @@ module Arrow
|
|
56
83
|
end
|
57
84
|
end
|
58
85
|
|
59
|
-
private
|
60
86
|
def fill_options
|
61
87
|
if @options[:format] and @options.key?(:compression)
|
62
88
|
return
|
63
89
|
end
|
64
90
|
|
65
|
-
|
91
|
+
case @input
|
92
|
+
when Buffer
|
66
93
|
info = {}
|
94
|
+
when URI
|
95
|
+
extension = PathExtension.new(@input.path)
|
96
|
+
info = extension.extract
|
67
97
|
else
|
68
98
|
extension = PathExtension.new(@input)
|
69
99
|
info = extension.extract
|
70
100
|
end
|
71
101
|
format = info[:format]
|
72
102
|
@options = @options.dup
|
73
|
-
if format
|
103
|
+
if format
|
74
104
|
@options[:format] ||= format.to_sym
|
75
105
|
else
|
76
106
|
@options[:format] ||= :arrow
|
@@ -183,5 +213,13 @@ module Arrow
|
|
183
213
|
table.instance_variable_set(:@input, input)
|
184
214
|
table
|
185
215
|
end
|
216
|
+
|
217
|
+
def load_as_json
|
218
|
+
input = open_input_stream
|
219
|
+
reader = JSONReader.new(input)
|
220
|
+
table = reader.read
|
221
|
+
table.instance_variable_set(:@input, input)
|
222
|
+
table
|
223
|
+
end
|
186
224
|
end
|
187
225
|
end
|
data/lib/arrow/table-saver.rb
CHANGED
@@ -32,6 +32,29 @@ module Arrow
|
|
32
32
|
end
|
33
33
|
|
34
34
|
def save
|
35
|
+
if @output.is_a?(URI)
|
36
|
+
custom_save_method = "save_to_uri"
|
37
|
+
else
|
38
|
+
custom_save_method = "save_to_file"
|
39
|
+
end
|
40
|
+
unless respond_to?(custom_save_method, true)
|
41
|
+
available_schemes = []
|
42
|
+
(methods(true) | private_methods(true)).each do |name|
|
43
|
+
match_data = /\Asave_to_/.match(name.to_s)
|
44
|
+
if match_data
|
45
|
+
available_schemes << match_data.post_match
|
46
|
+
end
|
47
|
+
end
|
48
|
+
message = "Arrow::Table save source must be one of ["
|
49
|
+
message << available_schemes.join(", ")
|
50
|
+
message << "]: #{@output.scheme.inspect}"
|
51
|
+
raise ArgumentError, message
|
52
|
+
end
|
53
|
+
__send__(custom_save_method)
|
54
|
+
end
|
55
|
+
|
56
|
+
private
|
57
|
+
def save_to_file
|
35
58
|
format = @options[:format]
|
36
59
|
custom_save_method = "save_as_#{format}"
|
37
60
|
unless respond_to?(custom_save_method, true)
|
@@ -57,21 +80,24 @@ module Arrow
|
|
57
80
|
end
|
58
81
|
end
|
59
82
|
|
60
|
-
private
|
61
83
|
def fill_options
|
62
84
|
if @options[:format] and @options.key?(:compression)
|
63
85
|
return
|
64
86
|
end
|
65
87
|
|
66
|
-
|
88
|
+
case @output
|
89
|
+
when Buffer
|
67
90
|
info = {}
|
91
|
+
when URI
|
92
|
+
extension = PathExtension.new(@output.path)
|
93
|
+
info = extension.extract
|
68
94
|
else
|
69
95
|
extension = PathExtension.new(@output)
|
70
96
|
info = extension.extract
|
71
97
|
end
|
72
98
|
format = info[:format]
|
73
99
|
@options = @options.dup
|
74
|
-
if format
|
100
|
+
if format
|
75
101
|
@options[:format] ||= format.to_sym
|
76
102
|
else
|
77
103
|
@options[:format] ||= :arrow
|
@@ -21,51 +21,27 @@ module Arrow
|
|
21
21
|
# TODO: Almost codes should be implemented in Apache Arrow C++.
|
22
22
|
class TableTableFormatter < TableFormatter
|
23
23
|
private
|
24
|
-
def format_header(text, columns)
|
25
|
-
|
24
|
+
def format_header(text, column_formatters)
|
25
|
+
column_formatters.each do |column_formatter|
|
26
26
|
text << "\t"
|
27
|
-
text << format_column_name(column)
|
27
|
+
text << column_formatter.aligned_name
|
28
28
|
end
|
29
29
|
text << "\n"
|
30
30
|
end
|
31
31
|
|
32
|
-
|
33
|
-
def format_column_name(column)
|
34
|
-
case column.data_type
|
35
|
-
when TimestampDataType
|
36
|
-
"%*s" % [::Time.now.iso8601.size, column.name]
|
37
|
-
when FloatDataType, DoubleDataType
|
38
|
-
"%*s" % [FLOAT_N_DIGITS, column.name]
|
39
|
-
else
|
40
|
-
column.name
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
|
-
def format_rows(text, columns, rows, n_digits, start_offset)
|
32
|
+
def format_rows(text, column_formatters, rows, n_digits, start_offset)
|
45
33
|
rows.each_with_index do |row, nth_row|
|
46
34
|
text << ("%*d" % [n_digits, start_offset + nth_row])
|
47
35
|
row.each_with_index do |column_value, nth_column|
|
48
36
|
text << "\t"
|
49
|
-
|
50
|
-
|
37
|
+
column_formatter = column_formatters[nth_column]
|
38
|
+
aligned_name = column_formatter.aligned_name
|
39
|
+
text << column_formatter.format_value(column_value, aligned_name.size)
|
51
40
|
end
|
52
41
|
text << "\n"
|
53
42
|
end
|
54
43
|
end
|
55
44
|
|
56
|
-
def format_column_value(column, value)
|
57
|
-
case value
|
58
|
-
when ::Time
|
59
|
-
value.iso8601
|
60
|
-
when Float
|
61
|
-
"%*f" % [[column.name.size, FLOAT_N_DIGITS].max, value]
|
62
|
-
when Integer
|
63
|
-
"%*d" % [column.name.size, value]
|
64
|
-
else
|
65
|
-
"%-*s" % [column.name.size, value.to_s]
|
66
|
-
end
|
67
|
-
end
|
68
|
-
|
69
45
|
def format_ellipsis(text)
|
70
46
|
text << "...\n"
|
71
47
|
end
|
data/lib/arrow/table.rb
CHANGED
@@ -195,8 +195,6 @@ module Arrow
|
|
195
195
|
alias_method :size, :n_rows
|
196
196
|
alias_method :length, :n_rows
|
197
197
|
|
198
|
-
alias_method :[], :find_column
|
199
|
-
|
200
198
|
alias_method :slice_raw, :slice
|
201
199
|
|
202
200
|
# @overload slice(offset, length)
|
@@ -236,6 +234,12 @@ module Arrow
|
|
236
234
|
# @return [Arrow::Table]
|
237
235
|
# The sub `Arrow::Table` that covers only rows of the range of indices.
|
238
236
|
#
|
237
|
+
# @overload slice(conditions)
|
238
|
+
#
|
239
|
+
# @param conditions [Hash] The conditions to select records.
|
240
|
+
# @return [Arrow::Table]
|
241
|
+
# The sub `Arrow::Table` that covers only rows matched by condition
|
242
|
+
#
|
239
243
|
# @overload slice
|
240
244
|
#
|
241
245
|
# @yield [slicer] Gives slicer that constructs condition to select records.
|
@@ -263,12 +267,37 @@ module Arrow
|
|
263
267
|
expected_n_args = nil
|
264
268
|
case args.size
|
265
269
|
when 1
|
266
|
-
|
270
|
+
case args[0]
|
271
|
+
when Integer
|
267
272
|
index = args[0]
|
268
273
|
index += n_rows if index < 0
|
269
274
|
return nil if index < 0
|
270
275
|
return nil if index >= n_rows
|
271
276
|
return Record.new(self, index)
|
277
|
+
when Hash
|
278
|
+
condition_pairs = args[0]
|
279
|
+
slicer = Slicer.new(self)
|
280
|
+
conditions = []
|
281
|
+
condition_pairs.each do |key, value|
|
282
|
+
case value
|
283
|
+
when Range
|
284
|
+
# TODO: Optimize "begin <= key <= end" case by missing "between" kernel
|
285
|
+
# https://issues.apache.org/jira/browse/ARROW-9843
|
286
|
+
unless value.begin.nil?
|
287
|
+
conditions << (slicer[key] >= value.begin)
|
288
|
+
end
|
289
|
+
unless value.end.nil?
|
290
|
+
if value.exclude_end?
|
291
|
+
conditions << (slicer[key] < value.end)
|
292
|
+
else
|
293
|
+
conditions << (slicer[key] <= value.end)
|
294
|
+
end
|
295
|
+
end
|
296
|
+
else
|
297
|
+
conditions << (slicer[key] == value)
|
298
|
+
end
|
299
|
+
end
|
300
|
+
slicers << conditions.inject(:&)
|
272
301
|
else
|
273
302
|
slicers << args[0]
|
274
303
|
end
|
@@ -397,41 +426,6 @@ module Arrow
|
|
397
426
|
remove_column_raw(index)
|
398
427
|
end
|
399
428
|
|
400
|
-
# TODO
|
401
|
-
#
|
402
|
-
# @return [Arrow::Table]
|
403
|
-
def select_columns(*selectors, &block)
|
404
|
-
if selectors.empty?
|
405
|
-
return to_enum(__method__) unless block_given?
|
406
|
-
selected_columns = columns.select(&block)
|
407
|
-
else
|
408
|
-
selected_columns = []
|
409
|
-
selectors.each do |selector|
|
410
|
-
case selector
|
411
|
-
when String, Symbol
|
412
|
-
column = find_column(selector)
|
413
|
-
if column.nil?
|
414
|
-
message = "unknown column: #{selector.inspect}: #{inspect}"
|
415
|
-
raise KeyError.new(message)
|
416
|
-
end
|
417
|
-
selected_columns << column
|
418
|
-
when Range
|
419
|
-
selected_columns.concat(columns[selector])
|
420
|
-
else
|
421
|
-
column = columns[selector]
|
422
|
-
if column.nil?
|
423
|
-
message = "out of index (0..#{n_columns - 1}): " +
|
424
|
-
"#{selector.inspect}: #{inspect}"
|
425
|
-
raise IndexError.new(message)
|
426
|
-
end
|
427
|
-
selected_columns << column
|
428
|
-
end
|
429
|
-
end
|
430
|
-
selected_columns = selected_columns.select(&block) if block_given?
|
431
|
-
end
|
432
|
-
self.class.new(selected_columns)
|
433
|
-
end
|
434
|
-
|
435
429
|
# Experimental
|
436
430
|
def group(*keys)
|
437
431
|
Group.new(self, keys)
|
data/lib/arrow/version.rb
CHANGED
data/red-arrow.gemspec
CHANGED
@@ -48,7 +48,7 @@ Gem::Specification.new do |spec|
|
|
48
48
|
|
49
49
|
spec.add_runtime_dependency("bigdecimal", ">= 2.0.3")
|
50
50
|
spec.add_runtime_dependency("extpp", ">= 0.0.7")
|
51
|
-
spec.add_runtime_dependency("gio2", ">= 3.4.
|
51
|
+
spec.add_runtime_dependency("gio2", ">= 3.4.9")
|
52
52
|
spec.add_runtime_dependency("native-package-installer")
|
53
53
|
spec.add_runtime_dependency("pkg-config")
|
54
54
|
|
@@ -394,6 +394,20 @@ module RawRecordsDenseUnionArrayTests
|
|
394
394
|
assert_equal(records, target.raw_records)
|
395
395
|
end
|
396
396
|
|
397
|
+
def test_map
|
398
|
+
records = [
|
399
|
+
[{"0" => {"key1" => true, "key2" => nil}}],
|
400
|
+
[{"1" => nil}],
|
401
|
+
]
|
402
|
+
target = build({
|
403
|
+
type: :map,
|
404
|
+
key: :string,
|
405
|
+
item: :boolean,
|
406
|
+
},
|
407
|
+
records)
|
408
|
+
assert_equal(records, target.raw_records)
|
409
|
+
end
|
410
|
+
|
397
411
|
def test_sparse_union
|
398
412
|
omit("Need to add support for SparseUnionArrayBuilder")
|
399
413
|
records = [
|
@@ -451,6 +451,25 @@ module RawRecordsListArrayTests
|
|
451
451
|
assert_equal(records, target.raw_records)
|
452
452
|
end
|
453
453
|
|
454
|
+
def test_map
|
455
|
+
records = [
|
456
|
+
[
|
457
|
+
[
|
458
|
+
{"key1" => true, "key2" => nil},
|
459
|
+
nil,
|
460
|
+
],
|
461
|
+
],
|
462
|
+
[nil],
|
463
|
+
]
|
464
|
+
target = build({
|
465
|
+
type: :map,
|
466
|
+
key: :string,
|
467
|
+
item: :boolean,
|
468
|
+
},
|
469
|
+
records)
|
470
|
+
assert_equal(records, target.raw_records)
|
471
|
+
end
|
472
|
+
|
454
473
|
def test_sparse
|
455
474
|
omit("Need to add support for SparseUnionArrayBuilder")
|
456
475
|
records = [
|