red-arrow 4.0.0 → 6.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +23 -0
- data/ext/arrow/arrow.cpp +3 -0
- data/ext/arrow/converters.cpp +5 -0
- data/ext/arrow/converters.hpp +126 -0
- data/ext/arrow/extconf.rb +13 -0
- data/ext/arrow/memory-view.cpp +311 -0
- data/ext/arrow/memory-view.hpp +26 -0
- data/ext/arrow/raw-records.cpp +1 -0
- data/ext/arrow/values.cpp +1 -0
- data/lib/arrow/aggregate-node-options.rb +35 -0
- data/lib/arrow/aggregation.rb +46 -0
- data/lib/arrow/array-builder.rb +5 -0
- data/lib/arrow/array.rb +12 -0
- data/lib/arrow/binary-dictionary-array-builder.rb +27 -0
- data/lib/arrow/buffer.rb +10 -6
- data/lib/arrow/column-containable.rb +100 -1
- data/lib/arrow/constructor-arguments-gc-guardable.rb +25 -0
- data/lib/arrow/datum.rb +100 -0
- data/lib/arrow/equal-options.rb +38 -0
- data/lib/arrow/expression.rb +48 -0
- data/lib/arrow/file-system.rb +34 -0
- data/lib/arrow/group.rb +116 -124
- data/lib/arrow/loader.rb +44 -0
- data/lib/arrow/map-array-builder.rb +109 -0
- data/lib/arrow/map-array.rb +26 -0
- data/lib/arrow/map-data-type.rb +89 -0
- data/lib/arrow/path-extension.rb +1 -1
- data/lib/arrow/record-batch-reader.rb +41 -0
- data/lib/arrow/record-batch.rb +0 -2
- data/lib/arrow/scalar.rb +32 -0
- data/lib/arrow/slicer.rb +44 -143
- data/lib/arrow/source-node-options.rb +32 -0
- data/lib/arrow/string-dictionary-array-builder.rb +27 -0
- data/lib/arrow/symbol-values-appendable.rb +34 -0
- data/lib/arrow/table-concatenate-options.rb +36 -0
- data/lib/arrow/table-formatter.rb +141 -17
- data/lib/arrow/table-list-formatter.rb +5 -3
- data/lib/arrow/table-loader.rb +41 -3
- data/lib/arrow/table-saver.rb +29 -3
- data/lib/arrow/table-table-formatter.rb +7 -31
- data/lib/arrow/table.rb +34 -40
- data/lib/arrow/version.rb +1 -1
- data/red-arrow.gemspec +2 -1
- data/test/helper.rb +1 -0
- data/test/raw-records/test-dense-union-array.rb +14 -0
- data/test/raw-records/test-list-array.rb +19 -0
- data/test/raw-records/test-map-array.rb +441 -0
- data/test/raw-records/test-sparse-union-array.rb +14 -0
- data/test/raw-records/test-struct-array.rb +15 -0
- data/test/test-array-builder.rb +7 -0
- data/test/test-array.rb +34 -0
- data/test/test-binary-dictionary-array-builder.rb +103 -0
- data/test/test-boolean-scalar.rb +26 -0
- data/test/test-csv-loader.rb +8 -8
- data/test/test-expression.rb +40 -0
- data/test/test-float-scalar.rb +46 -0
- data/test/test-function.rb +176 -0
- data/test/test-group.rb +75 -51
- data/test/test-map-array-builder.rb +110 -0
- data/test/test-map-array.rb +33 -0
- data/test/test-map-data-type.rb +36 -0
- data/test/test-memory-view.rb +434 -0
- data/test/test-record-batch-reader.rb +46 -0
- data/test/test-record-batch.rb +42 -0
- data/test/test-slicer.rb +166 -167
- data/test/test-string-dictionary-array-builder.rb +103 -0
- data/test/test-table.rb +190 -53
- data/test/values/test-dense-union-array.rb +14 -0
- data/test/values/test-list-array.rb +17 -0
- data/test/values/test-map-array.rb +433 -0
- data/test/values/test-sparse-union-array.rb +14 -0
- data/test/values/test-struct-array.rb +15 -0
- metadata +132 -73
data/lib/arrow/table-loader.rb
CHANGED
@@ -15,6 +15,8 @@
|
|
15
15
|
# specific language governing permissions and limitations
|
16
16
|
# under the License.
|
17
17
|
|
18
|
+
require "uri"
|
19
|
+
|
18
20
|
module Arrow
|
19
21
|
class TableLoader
|
20
22
|
class << self
|
@@ -31,6 +33,31 @@ module Arrow
|
|
31
33
|
end
|
32
34
|
|
33
35
|
def load
|
36
|
+
if @input.is_a?(URI)
|
37
|
+
custom_load_method = "load_from_uri"
|
38
|
+
elsif @input.is_a?(String) and ::File.directory?(@input)
|
39
|
+
custom_load_method = "load_from_directory"
|
40
|
+
else
|
41
|
+
custom_load_method = "load_from_file"
|
42
|
+
end
|
43
|
+
unless respond_to?(custom_load_method, true)
|
44
|
+
available_schemes = []
|
45
|
+
(methods(true) | private_methods(true)).each do |name|
|
46
|
+
match_data = /\Aload_from_/.match(name.to_s)
|
47
|
+
if match_data
|
48
|
+
available_schemes << match_data.post_match
|
49
|
+
end
|
50
|
+
end
|
51
|
+
message = "Arrow::Table load source must be one of ["
|
52
|
+
message << available_schemes.join(", ")
|
53
|
+
message << "]: #{@input.inspect}"
|
54
|
+
raise ArgumentError, message
|
55
|
+
end
|
56
|
+
__send__(custom_load_method)
|
57
|
+
end
|
58
|
+
|
59
|
+
private
|
60
|
+
def load_from_file
|
34
61
|
format = @options[:format]
|
35
62
|
custom_load_method = "load_as_#{format}"
|
36
63
|
unless respond_to?(custom_load_method, true)
|
@@ -56,21 +83,24 @@ module Arrow
|
|
56
83
|
end
|
57
84
|
end
|
58
85
|
|
59
|
-
private
|
60
86
|
def fill_options
|
61
87
|
if @options[:format] and @options.key?(:compression)
|
62
88
|
return
|
63
89
|
end
|
64
90
|
|
65
|
-
|
91
|
+
case @input
|
92
|
+
when Buffer
|
66
93
|
info = {}
|
94
|
+
when URI
|
95
|
+
extension = PathExtension.new(@input.path)
|
96
|
+
info = extension.extract
|
67
97
|
else
|
68
98
|
extension = PathExtension.new(@input)
|
69
99
|
info = extension.extract
|
70
100
|
end
|
71
101
|
format = info[:format]
|
72
102
|
@options = @options.dup
|
73
|
-
if format
|
103
|
+
if format
|
74
104
|
@options[:format] ||= format.to_sym
|
75
105
|
else
|
76
106
|
@options[:format] ||= :arrow
|
@@ -183,5 +213,13 @@ module Arrow
|
|
183
213
|
table.instance_variable_set(:@input, input)
|
184
214
|
table
|
185
215
|
end
|
216
|
+
|
217
|
+
def load_as_json
|
218
|
+
input = open_input_stream
|
219
|
+
reader = JSONReader.new(input)
|
220
|
+
table = reader.read
|
221
|
+
table.instance_variable_set(:@input, input)
|
222
|
+
table
|
223
|
+
end
|
186
224
|
end
|
187
225
|
end
|
data/lib/arrow/table-saver.rb
CHANGED
@@ -32,6 +32,29 @@ module Arrow
|
|
32
32
|
end
|
33
33
|
|
34
34
|
def save
|
35
|
+
if @output.is_a?(URI)
|
36
|
+
custom_save_method = "save_to_uri"
|
37
|
+
else
|
38
|
+
custom_save_method = "save_to_file"
|
39
|
+
end
|
40
|
+
unless respond_to?(custom_save_method, true)
|
41
|
+
available_schemes = []
|
42
|
+
(methods(true) | private_methods(true)).each do |name|
|
43
|
+
match_data = /\Asave_to_/.match(name.to_s)
|
44
|
+
if match_data
|
45
|
+
available_schemes << match_data.post_match
|
46
|
+
end
|
47
|
+
end
|
48
|
+
message = "Arrow::Table save source must be one of ["
|
49
|
+
message << available_schemes.join(", ")
|
50
|
+
message << "]: #{@output.scheme.inspect}"
|
51
|
+
raise ArgumentError, message
|
52
|
+
end
|
53
|
+
__send__(custom_save_method)
|
54
|
+
end
|
55
|
+
|
56
|
+
private
|
57
|
+
def save_to_file
|
35
58
|
format = @options[:format]
|
36
59
|
custom_save_method = "save_as_#{format}"
|
37
60
|
unless respond_to?(custom_save_method, true)
|
@@ -57,21 +80,24 @@ module Arrow
|
|
57
80
|
end
|
58
81
|
end
|
59
82
|
|
60
|
-
private
|
61
83
|
def fill_options
|
62
84
|
if @options[:format] and @options.key?(:compression)
|
63
85
|
return
|
64
86
|
end
|
65
87
|
|
66
|
-
|
88
|
+
case @output
|
89
|
+
when Buffer
|
67
90
|
info = {}
|
91
|
+
when URI
|
92
|
+
extension = PathExtension.new(@output.path)
|
93
|
+
info = extension.extract
|
68
94
|
else
|
69
95
|
extension = PathExtension.new(@output)
|
70
96
|
info = extension.extract
|
71
97
|
end
|
72
98
|
format = info[:format]
|
73
99
|
@options = @options.dup
|
74
|
-
if format
|
100
|
+
if format
|
75
101
|
@options[:format] ||= format.to_sym
|
76
102
|
else
|
77
103
|
@options[:format] ||= :arrow
|
@@ -21,51 +21,27 @@ module Arrow
|
|
21
21
|
# TODO: Most of this code should be implemented in Apache Arrow C++.
|
22
22
|
class TableTableFormatter < TableFormatter
|
23
23
|
private
|
24
|
-
def format_header(text, columns)
|
25
|
-
|
24
|
+
def format_header(text, column_formatters)
|
25
|
+
column_formatters.each do |column_formatter|
|
26
26
|
text << "\t"
|
27
|
-
text << format_column_name(column)
|
27
|
+
text << column_formatter.aligned_name
|
28
28
|
end
|
29
29
|
text << "\n"
|
30
30
|
end
|
31
31
|
|
32
|
-
|
33
|
-
def format_column_name(column)
|
34
|
-
case column.data_type
|
35
|
-
when TimestampDataType
|
36
|
-
"%*s" % [::Time.now.iso8601.size, column.name]
|
37
|
-
when FloatDataType, DoubleDataType
|
38
|
-
"%*s" % [FLOAT_N_DIGITS, column.name]
|
39
|
-
else
|
40
|
-
column.name
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
|
-
def format_rows(text, columns, rows, n_digits, start_offset)
|
32
|
+
def format_rows(text, column_formatters, rows, n_digits, start_offset)
|
45
33
|
rows.each_with_index do |row, nth_row|
|
46
34
|
text << ("%*d" % [n_digits, start_offset + nth_row])
|
47
35
|
row.each_with_index do |column_value, nth_column|
|
48
36
|
text << "\t"
|
49
|
-
|
50
|
-
|
37
|
+
column_formatter = column_formatters[nth_column]
|
38
|
+
aligned_name = column_formatter.aligned_name
|
39
|
+
text << column_formatter.format_value(column_value, aligned_name.size)
|
51
40
|
end
|
52
41
|
text << "\n"
|
53
42
|
end
|
54
43
|
end
|
55
44
|
|
56
|
-
def format_column_value(column, value)
|
57
|
-
case value
|
58
|
-
when ::Time
|
59
|
-
value.iso8601
|
60
|
-
when Float
|
61
|
-
"%*f" % [[column.name.size, FLOAT_N_DIGITS].max, value]
|
62
|
-
when Integer
|
63
|
-
"%*d" % [column.name.size, value]
|
64
|
-
else
|
65
|
-
"%-*s" % [column.name.size, value.to_s]
|
66
|
-
end
|
67
|
-
end
|
68
|
-
|
69
45
|
def format_ellipsis(text)
|
70
46
|
text << "...\n"
|
71
47
|
end
|
data/lib/arrow/table.rb
CHANGED
@@ -195,8 +195,6 @@ module Arrow
|
|
195
195
|
alias_method :size, :n_rows
|
196
196
|
alias_method :length, :n_rows
|
197
197
|
|
198
|
-
alias_method :[], :find_column
|
199
|
-
|
200
198
|
alias_method :slice_raw, :slice
|
201
199
|
|
202
200
|
# @overload slice(offset, length)
|
@@ -236,6 +234,12 @@ module Arrow
|
|
236
234
|
# @return [Arrow::Table]
|
237
235
|
# The sub `Arrow::Table` that covers only rows of the range of indices.
|
238
236
|
#
|
237
|
+
# @overload slice(conditions)
|
238
|
+
#
|
239
|
+
# @param conditions [Hash] The conditions to select records.
|
240
|
+
# @return [Arrow::Table]
|
241
|
+
# The sub `Arrow::Table` that covers only rows matched by condition
|
242
|
+
#
|
239
243
|
# @overload slice
|
240
244
|
#
|
241
245
|
# @yield [slicer] Gives slicer that constructs condition to select records.
|
@@ -263,12 +267,37 @@ module Arrow
|
|
263
267
|
expected_n_args = nil
|
264
268
|
case args.size
|
265
269
|
when 1
|
266
|
-
|
270
|
+
case args[0]
|
271
|
+
when Integer
|
267
272
|
index = args[0]
|
268
273
|
index += n_rows if index < 0
|
269
274
|
return nil if index < 0
|
270
275
|
return nil if index >= n_rows
|
271
276
|
return Record.new(self, index)
|
277
|
+
when Hash
|
278
|
+
condition_pairs = args[0]
|
279
|
+
slicer = Slicer.new(self)
|
280
|
+
conditions = []
|
281
|
+
condition_pairs.each do |key, value|
|
282
|
+
case value
|
283
|
+
when Range
|
284
|
+
# TODO: Optimize the "begin <= key <= end" case once the missing "between" kernel is available
|
285
|
+
# https://issues.apache.org/jira/browse/ARROW-9843
|
286
|
+
unless value.begin.nil?
|
287
|
+
conditions << (slicer[key] >= value.begin)
|
288
|
+
end
|
289
|
+
unless value.end.nil?
|
290
|
+
if value.exclude_end?
|
291
|
+
conditions << (slicer[key] < value.end)
|
292
|
+
else
|
293
|
+
conditions << (slicer[key] <= value.end)
|
294
|
+
end
|
295
|
+
end
|
296
|
+
else
|
297
|
+
conditions << (slicer[key] == value)
|
298
|
+
end
|
299
|
+
end
|
300
|
+
slicers << conditions.inject(:&)
|
272
301
|
else
|
273
302
|
slicers << args[0]
|
274
303
|
end
|
@@ -397,41 +426,6 @@ module Arrow
|
|
397
426
|
remove_column_raw(index)
|
398
427
|
end
|
399
428
|
|
400
|
-
# TODO
|
401
|
-
#
|
402
|
-
# @return [Arrow::Table]
|
403
|
-
def select_columns(*selectors, &block)
|
404
|
-
if selectors.empty?
|
405
|
-
return to_enum(__method__) unless block_given?
|
406
|
-
selected_columns = columns.select(&block)
|
407
|
-
else
|
408
|
-
selected_columns = []
|
409
|
-
selectors.each do |selector|
|
410
|
-
case selector
|
411
|
-
when String, Symbol
|
412
|
-
column = find_column(selector)
|
413
|
-
if column.nil?
|
414
|
-
message = "unknown column: #{selector.inspect}: #{inspect}"
|
415
|
-
raise KeyError.new(message)
|
416
|
-
end
|
417
|
-
selected_columns << column
|
418
|
-
when Range
|
419
|
-
selected_columns.concat(columns[selector])
|
420
|
-
else
|
421
|
-
column = columns[selector]
|
422
|
-
if column.nil?
|
423
|
-
message = "out of index (0..#{n_columns - 1}): " +
|
424
|
-
"#{selector.inspect}: #{inspect}"
|
425
|
-
raise IndexError.new(message)
|
426
|
-
end
|
427
|
-
selected_columns << column
|
428
|
-
end
|
429
|
-
end
|
430
|
-
selected_columns = selected_columns.select(&block) if block_given?
|
431
|
-
end
|
432
|
-
self.class.new(selected_columns)
|
433
|
-
end
|
434
|
-
|
435
429
|
# Experimental
|
436
430
|
def group(*keys)
|
437
431
|
Group.new(self, keys)
|
@@ -442,8 +436,8 @@ module Arrow
|
|
442
436
|
RollingWindow.new(self, size)
|
443
437
|
end
|
444
438
|
|
445
|
-
def save(
|
446
|
-
saver = TableSaver.new(self,
|
439
|
+
def save(output, options={})
|
440
|
+
saver = TableSaver.new(self, output, options)
|
447
441
|
saver.save
|
448
442
|
end
|
449
443
|
|
data/lib/arrow/version.rb
CHANGED
data/red-arrow.gemspec
CHANGED
@@ -48,13 +48,14 @@ Gem::Specification.new do |spec|
|
|
48
48
|
|
49
49
|
spec.add_runtime_dependency("bigdecimal", ">= 2.0.3")
|
50
50
|
spec.add_runtime_dependency("extpp", ">= 0.0.7")
|
51
|
-
spec.add_runtime_dependency("gio2", ">= 3.
|
51
|
+
spec.add_runtime_dependency("gio2", ">= 3.4.9")
|
52
52
|
spec.add_runtime_dependency("native-package-installer")
|
53
53
|
spec.add_runtime_dependency("pkg-config")
|
54
54
|
|
55
55
|
spec.add_development_dependency("benchmark-driver")
|
56
56
|
spec.add_development_dependency("bundler")
|
57
57
|
spec.add_development_dependency("faker")
|
58
|
+
spec.add_development_dependency("fiddle", ">= 1.0.9")
|
58
59
|
spec.add_development_dependency("rake")
|
59
60
|
spec.add_development_dependency("redcarpet")
|
60
61
|
spec.add_development_dependency("test-unit")
|
data/test/helper.rb
CHANGED
@@ -394,6 +394,20 @@ module RawRecordsDenseUnionArrayTests
|
|
394
394
|
assert_equal(records, target.raw_records)
|
395
395
|
end
|
396
396
|
|
397
|
+
def test_map
|
398
|
+
records = [
|
399
|
+
[{"0" => {"key1" => true, "key2" => nil}}],
|
400
|
+
[{"1" => nil}],
|
401
|
+
]
|
402
|
+
target = build({
|
403
|
+
type: :map,
|
404
|
+
key: :string,
|
405
|
+
item: :boolean,
|
406
|
+
},
|
407
|
+
records)
|
408
|
+
assert_equal(records, target.raw_records)
|
409
|
+
end
|
410
|
+
|
397
411
|
def test_sparse_union
|
398
412
|
omit("Need to add support for SparseUnionArrayBuilder")
|
399
413
|
records = [
|
@@ -451,6 +451,25 @@ module RawRecordsListArrayTests
|
|
451
451
|
assert_equal(records, target.raw_records)
|
452
452
|
end
|
453
453
|
|
454
|
+
def test_map
|
455
|
+
records = [
|
456
|
+
[
|
457
|
+
[
|
458
|
+
{"key1" => true, "key2" => nil},
|
459
|
+
nil,
|
460
|
+
],
|
461
|
+
],
|
462
|
+
[nil],
|
463
|
+
]
|
464
|
+
target = build({
|
465
|
+
type: :map,
|
466
|
+
key: :string,
|
467
|
+
item: :boolean,
|
468
|
+
},
|
469
|
+
records)
|
470
|
+
assert_equal(records, target.raw_records)
|
471
|
+
end
|
472
|
+
|
454
473
|
def test_sparse
|
455
474
|
omit("Need to add support for SparseUnionArrayBuilder")
|
456
475
|
records = [
|