red-arrow 0.15.1 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +28 -16
- data/ext/arrow/converters.hpp +63 -33
- data/ext/arrow/raw-records.cpp +2 -1
- data/ext/arrow/values.cpp +2 -1
- data/lib/arrow/array-builder.rb +101 -52
- data/lib/arrow/array.rb +28 -10
- data/lib/arrow/{binary-array-builder.rb → buffer.rb} +7 -15
- data/lib/arrow/chunked-array.rb +2 -0
- data/lib/arrow/csv-loader.rb +5 -0
- data/lib/arrow/csv-read-options.rb +18 -0
- data/lib/arrow/data-type.rb +35 -2
- data/lib/arrow/decimal128-array-builder.rb +0 -2
- data/lib/arrow/dictionary-array.rb +24 -0
- data/lib/arrow/field.rb +1 -1
- data/lib/arrow/generic-filterable.rb +43 -0
- data/lib/arrow/generic-takeable.rb +38 -0
- data/lib/arrow/list-data-type.rb +58 -8
- data/lib/arrow/loader.rb +12 -1
- data/lib/arrow/null-array-builder.rb +1 -1
- data/lib/arrow/null-array.rb +24 -0
- data/lib/arrow/raw-table-converter.rb +47 -0
- data/lib/arrow/record-batch-iterator.rb +22 -0
- data/lib/arrow/record-batch.rb +8 -3
- data/lib/arrow/schema.rb +5 -2
- data/lib/arrow/struct-array-builder.rb +13 -7
- data/lib/arrow/struct-data-type.rb +0 -2
- data/lib/arrow/table-loader.rb +29 -6
- data/lib/arrow/table-saver.rb +37 -13
- data/lib/arrow/table.rb +20 -73
- data/lib/arrow/version.rb +1 -1
- data/red-arrow.gemspec +3 -1
- data/test/helper.rb +1 -0
- data/test/helper/omittable.rb +36 -0
- data/test/raw-records/test-dense-union-array.rb +1 -34
- data/test/raw-records/test-sparse-union-array.rb +1 -33
- data/test/run-test.rb +14 -3
- data/test/test-array-builder.rb +17 -0
- data/test/test-array.rb +104 -0
- data/test/test-buffer.rb +11 -0
- data/test/test-chunked-array.rb +96 -0
- data/test/test-csv-loader.rb +2 -2
- data/test/test-data-type.rb +11 -0
- data/test/test-dense-union-data-type.rb +2 -2
- data/test/test-dictionary-array.rb +41 -0
- data/test/test-feather.rb +21 -6
- data/test/test-list-data-type.rb +27 -1
- data/test/test-null-array.rb +23 -0
- data/test/test-record-batch-iterator.rb +37 -0
- data/test/test-record-batch.rb +14 -0
- data/test/test-schema.rb +16 -0
- data/test/test-slicer.rb +74 -30
- data/test/test-sparse-union-data-type.rb +2 -2
- data/test/test-struct-array-builder.rb +8 -4
- data/test/test-table.rb +153 -14
- data/test/test-timestamp-array.rb +19 -0
- data/test/values/test-dense-union-array.rb +1 -34
- data/test/values/test-sparse-union-array.rb +1 -33
- metadata +22 -8
@@ -0,0 +1,47 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
module Arrow
|
19
|
+
class RawTableConverter
|
20
|
+
attr_reader :n_rows
|
21
|
+
attr_reader :schema
|
22
|
+
attr_reader :values
|
23
|
+
def initialize(raw_table)
|
24
|
+
@raw_table = raw_table
|
25
|
+
convert
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
def convert
|
30
|
+
if @raw_table.is_a?(::Array) and @raw_table[0].is_a?(Column)
|
31
|
+
fields = @raw_table.collect(&:field)
|
32
|
+
@schema = Schema.new(fields)
|
33
|
+
@values = @raw_table.collect(&:data)
|
34
|
+
else
|
35
|
+
fields = []
|
36
|
+
@values = []
|
37
|
+
@raw_table.each do |name, array|
|
38
|
+
array = ArrayBuilder.build(array) if array.is_a?(::Array)
|
39
|
+
fields << Field.new(name.to_s, array.value_data_type)
|
40
|
+
@values << array
|
41
|
+
end
|
42
|
+
@schema = Schema.new(fields)
|
43
|
+
end
|
44
|
+
@n_rows = @values[0].length
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
module Arrow
|
19
|
+
class RecordBatchIterator
|
20
|
+
alias_method :to_a, :to_list
|
21
|
+
end
|
22
|
+
end
|
data/lib/arrow/record-batch.rb
CHANGED
@@ -15,8 +15,7 @@
|
|
15
15
|
# specific language governing permissions and limitations
|
16
16
|
# under the License.
|
17
17
|
|
18
|
-
require "arrow/
|
19
|
-
require "arrow/record-containable"
|
18
|
+
require "arrow/raw-table-converter"
|
20
19
|
|
21
20
|
module Arrow
|
22
21
|
class RecordBatch
|
@@ -28,13 +27,19 @@ module Arrow
|
|
28
27
|
def new(*args)
|
29
28
|
n_args = args.size
|
30
29
|
case n_args
|
30
|
+
when 1
|
31
|
+
raw_table_converter = RawTableConverter.new(args[0])
|
32
|
+
n_rows = raw_table_converter.n_rows
|
33
|
+
schema = raw_table_converter.schema
|
34
|
+
values = raw_table_converter.values
|
35
|
+
super(schema, n_rows, values)
|
31
36
|
when 2
|
32
37
|
schema, data = args
|
33
38
|
RecordBatchBuilder.build(schema, data)
|
34
39
|
when 3
|
35
40
|
super
|
36
41
|
else
|
37
|
-
message = "wrong number of arguments (given #{n_args}, expected
|
42
|
+
message = "wrong number of arguments (given #{n_args}, expected 1..3)"
|
38
43
|
raise ArgumentError, message
|
39
44
|
end
|
40
45
|
end
|
data/lib/arrow/schema.rb
CHANGED
@@ -15,8 +15,6 @@
|
|
15
15
|
# specific language governing permissions and limitations
|
16
16
|
# under the License.
|
17
17
|
|
18
|
-
require "arrow/field-containable"
|
19
|
-
|
20
18
|
module Arrow
|
21
19
|
class Schema
|
22
20
|
include FieldContainable
|
@@ -93,5 +91,10 @@ module Arrow
|
|
93
91
|
end
|
94
92
|
|
95
93
|
alias_method :[], :find_field
|
94
|
+
|
95
|
+
alias_method :to_s_raw, :to_s
|
96
|
+
def to_s(show_metadata: false)
|
97
|
+
to_string_metadata(show_metadata)
|
98
|
+
end
|
96
99
|
end
|
97
100
|
end
|
@@ -32,7 +32,7 @@ module Arrow
|
|
32
32
|
case index_or_name
|
33
33
|
when String, Symbol
|
34
34
|
name = index_or_name
|
35
|
-
|
35
|
+
cached_name_to_builder[name.to_s]
|
36
36
|
else
|
37
37
|
index = index_or_name
|
38
38
|
cached_field_builders[index]
|
@@ -70,13 +70,18 @@ module Arrow
|
|
70
70
|
append_null
|
71
71
|
when ::Array
|
72
72
|
append_value_raw
|
73
|
-
value
|
74
|
-
|
73
|
+
cached_field_builders.zip(value) do |builder, sub_value|
|
74
|
+
builder.append(sub_value)
|
75
75
|
end
|
76
76
|
when Hash
|
77
77
|
append_value_raw
|
78
|
+
local_name_to_builder = cached_name_to_builder.dup
|
78
79
|
value.each do |name, sub_value|
|
79
|
-
|
80
|
+
builder = local_name_to_builder.delete(name.to_s)
|
81
|
+
builder.append(sub_value)
|
82
|
+
end
|
83
|
+
local_name_to_builder.each do |_, builder|
|
84
|
+
builder.append_null
|
80
85
|
end
|
81
86
|
else
|
82
87
|
message =
|
@@ -108,9 +113,6 @@ module Arrow
|
|
108
113
|
alias_method :append_null_raw, :append_null
|
109
114
|
def append_null
|
110
115
|
append_null_raw
|
111
|
-
cached_field_builders.each do |builder|
|
112
|
-
builder.append_null
|
113
|
-
end
|
114
116
|
end
|
115
117
|
|
116
118
|
# @since 0.12.0
|
@@ -136,5 +138,9 @@ module Arrow
|
|
136
138
|
end
|
137
139
|
name_to_builder
|
138
140
|
end
|
141
|
+
|
142
|
+
def cached_name_to_builder
|
143
|
+
@name_to_builder ||= build_name_to_builder
|
144
|
+
end
|
139
145
|
end
|
140
146
|
end
|
data/lib/arrow/table-loader.rb
CHANGED
@@ -41,6 +41,8 @@ module Arrow
|
|
41
41
|
available_formats << match_data.post_match
|
42
42
|
end
|
43
43
|
end
|
44
|
+
deprecated_formats = ["batch", "stream"]
|
45
|
+
available_formats -= deprecated_formats
|
44
46
|
message = "Arrow::Table load format must be one of ["
|
45
47
|
message << available_formats.join(", ")
|
46
48
|
message << "]: #{format.inspect}"
|
@@ -119,18 +121,30 @@ module Arrow
|
|
119
121
|
load_raw(input, reader)
|
120
122
|
end
|
121
123
|
|
122
|
-
|
124
|
+
# @since 1.0.0
|
125
|
+
def load_as_arrow_file
|
123
126
|
input = open_input_stream
|
124
127
|
reader = RecordBatchFileReader.new(input)
|
125
128
|
load_raw(input, reader)
|
126
129
|
end
|
127
130
|
|
128
|
-
|
131
|
+
# @deprecated Use `format: :arrow_file` instead.
|
132
|
+
def load_as_batch
|
133
|
+
load_as_arrow_file
|
134
|
+
end
|
135
|
+
|
136
|
+
# @since 1.0.0
|
137
|
+
def load_as_arrow_streaming
|
129
138
|
input = open_input_stream
|
130
139
|
reader = RecordBatchStreamReader.new(input)
|
131
140
|
load_raw(input, reader)
|
132
141
|
end
|
133
142
|
|
143
|
+
# @deprecated Use `format: :arrow_streaming` instead.
|
144
|
+
def load_as_stream
|
145
|
+
load_as_arrow_streaming
|
146
|
+
end
|
147
|
+
|
134
148
|
if Arrow.const_defined?(:ORCFileReader)
|
135
149
|
def load_as_orc
|
136
150
|
input = open_input_stream
|
@@ -143,16 +157,25 @@ module Arrow
|
|
143
157
|
end
|
144
158
|
end
|
145
159
|
|
146
|
-
def
|
147
|
-
options = @options.dup
|
160
|
+
def csv_load(options)
|
148
161
|
options.delete(:format)
|
149
162
|
if @input.is_a?(Buffer)
|
150
|
-
CSVLoader.load(@input.data.to_s, options)
|
163
|
+
CSVLoader.load(@input.data.to_s, **options)
|
151
164
|
else
|
152
|
-
CSVLoader.load(Pathname.new(@input), options)
|
165
|
+
CSVLoader.load(Pathname.new(@input), **options)
|
153
166
|
end
|
154
167
|
end
|
155
168
|
|
169
|
+
def load_as_csv
|
170
|
+
csv_load(@options.dup)
|
171
|
+
end
|
172
|
+
|
173
|
+
def load_as_tsv
|
174
|
+
options = @options.dup
|
175
|
+
options[:delimiter] = "\t"
|
176
|
+
csv_load(options.dup)
|
177
|
+
end
|
178
|
+
|
156
179
|
def load_as_feather
|
157
180
|
input = open_input_stream
|
158
181
|
reader = FeatherFileReader.new(input)
|
data/lib/arrow/table-saver.rb
CHANGED
@@ -42,6 +42,8 @@ module Arrow
|
|
42
42
|
available_formats << match_data.post_match
|
43
43
|
end
|
44
44
|
end
|
45
|
+
deprecated_formats = ["batch", "stream"]
|
46
|
+
available_formats -= deprecated_formats
|
45
47
|
message = "Arrow::Table save format must be one of ["
|
46
48
|
message << available_formats.join(", ")
|
47
49
|
message << "]: #{format.inspect}"
|
@@ -110,35 +112,57 @@ module Arrow
|
|
110
112
|
end
|
111
113
|
|
112
114
|
def save_as_arrow
|
113
|
-
|
115
|
+
save_as_arrow_file
|
114
116
|
end
|
115
117
|
|
116
|
-
|
118
|
+
# @since 1.0.0
|
119
|
+
def save_as_arrow_file
|
117
120
|
save_raw(RecordBatchFileWriter)
|
118
121
|
end
|
119
122
|
|
120
|
-
|
123
|
+
# @deprecated Use `format: :arrow_file` instead.
|
124
|
+
def save_as_batch
|
125
|
+
save_as_arrow_file
|
126
|
+
end
|
127
|
+
|
128
|
+
# @since 1.0.0
|
129
|
+
def save_as_arrow_streaming
|
121
130
|
save_raw(RecordBatchStreamWriter)
|
122
131
|
end
|
123
132
|
|
124
|
-
|
133
|
+
# @deprecated Use `format: :arrow_streaming` instead.
|
134
|
+
def save_as_stream
|
135
|
+
save_as_arrow_streaming
|
136
|
+
end
|
137
|
+
|
138
|
+
def csv_save(**options)
|
125
139
|
open_output_stream do |output|
|
126
|
-
csv = CSV.new(output)
|
140
|
+
csv = CSV.new(output, **options)
|
127
141
|
names = @table.schema.fields.collect(&:name)
|
128
142
|
csv << names
|
129
|
-
@table.
|
130
|
-
csv <<
|
131
|
-
record[name]
|
132
|
-
end
|
143
|
+
@table.raw_records.each do |record|
|
144
|
+
csv << record
|
133
145
|
end
|
134
146
|
end
|
135
147
|
end
|
136
148
|
|
149
|
+
def save_as_csv
|
150
|
+
csv_save
|
151
|
+
end
|
152
|
+
|
153
|
+
def save_as_tsv
|
154
|
+
csv_save(col_sep: "\t")
|
155
|
+
end
|
156
|
+
|
137
157
|
def save_as_feather
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
158
|
+
properties = FeatherWriteProperties.new
|
159
|
+
properties.class.properties.each do |name|
|
160
|
+
value = @options[name.to_sym]
|
161
|
+
next if value.nil?
|
162
|
+
properties.__send__("#{name}=", value)
|
163
|
+
end
|
164
|
+
open_raw_output_stream do |output|
|
165
|
+
@table.write_as_feather(output, properties)
|
142
166
|
end
|
143
167
|
end
|
144
168
|
end
|
data/lib/arrow/table.rb
CHANGED
@@ -15,13 +15,13 @@
|
|
15
15
|
# specific language governing permissions and limitations
|
16
16
|
# under the License.
|
17
17
|
|
18
|
-
require "arrow/
|
19
|
-
require "arrow/group"
|
20
|
-
require "arrow/record-containable"
|
18
|
+
require "arrow/raw-table-converter"
|
21
19
|
|
22
20
|
module Arrow
|
23
21
|
class Table
|
24
22
|
include ColumnContainable
|
23
|
+
include GenericFilterable
|
24
|
+
include GenericTakeable
|
25
25
|
include RecordContainable
|
26
26
|
|
27
27
|
class << self
|
@@ -83,14 +83,6 @@ module Arrow
|
|
83
83
|
# `Array`.
|
84
84
|
#
|
85
85
|
# @example Create a table from column name and values
|
86
|
-
# count_chunks = [
|
87
|
-
# Arrow::UInt32Array.new([0, 2]),
|
88
|
-
# Arrow::UInt32Array.new([nil, 4]),
|
89
|
-
# ]
|
90
|
-
# visible_chunks = [
|
91
|
-
# Arrow::BooleanArray.new([true]),
|
92
|
-
# Arrow::BooleanArray.new([nil, nil, false]),
|
93
|
-
# ]
|
94
86
|
# Arrow::Table.new("count" => [0, 2, nil, 4],
|
95
87
|
# "visible" => [true, nil, nil, false])
|
96
88
|
#
|
@@ -171,22 +163,9 @@ module Arrow
|
|
171
163
|
n_args = args.size
|
172
164
|
case n_args
|
173
165
|
when 1
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
values = columns.collect(&:data)
|
178
|
-
schema = Schema.new(fields)
|
179
|
-
else
|
180
|
-
raw_table = args[0]
|
181
|
-
fields = []
|
182
|
-
values = []
|
183
|
-
raw_table.each do |name, array|
|
184
|
-
array = ArrayBuilder.build(array) if array.is_a?(::Array)
|
185
|
-
fields << Field.new(name.to_s, array.value_data_type)
|
186
|
-
values << array
|
187
|
-
end
|
188
|
-
schema = Schema.new(fields)
|
189
|
-
end
|
166
|
+
raw_table_converter = RawTableConverter.new(args[0])
|
167
|
+
schema = raw_table_converter.schema
|
168
|
+
values = raw_table_converter.values
|
190
169
|
when 2
|
191
170
|
schema = args[0]
|
192
171
|
schema = Schema.new(schema) unless schema.is_a?(Schema)
|
@@ -306,13 +285,15 @@ module Arrow
|
|
306
285
|
end
|
307
286
|
end
|
308
287
|
|
309
|
-
|
288
|
+
filter_options = Arrow::FilterOptions.new
|
289
|
+
filter_options.null_selection_behavior = :emit_null
|
290
|
+
sliced_tables = []
|
310
291
|
slicers.each do |slicer|
|
311
292
|
slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
|
312
293
|
case slicer
|
313
294
|
when Integer
|
314
295
|
slicer += n_rows if slicer < 0
|
315
|
-
|
296
|
+
sliced_tables << slice_by_range(slicer, n_rows - 1)
|
316
297
|
when Range
|
317
298
|
original_from = from = slicer.first
|
318
299
|
to = slicer.last
|
@@ -325,17 +306,9 @@ module Arrow
|
|
325
306
|
raise ArgumentError, message
|
326
307
|
end
|
327
308
|
to += n_rows if to < 0
|
328
|
-
|
329
|
-
when ::Array
|
330
|
-
|
331
|
-
when ChunkedArray
|
332
|
-
offset = 0
|
333
|
-
slicer.each_chunk do |array|
|
334
|
-
boolean_array_to_slice_ranges(array, offset, ranges)
|
335
|
-
offset += array.length
|
336
|
-
end
|
337
|
-
when BooleanArray
|
338
|
-
boolean_array_to_slice_ranges(slicer, 0, ranges)
|
309
|
+
sliced_tables << slice_by_range(from, to)
|
310
|
+
when ::Array, BooleanArray, ChunkedArray
|
311
|
+
sliced_tables << filter(slicer, filter_options)
|
339
312
|
else
|
340
313
|
message = "slicer must be Integer, Range, (from, to), " +
|
341
314
|
"Arrow::ChunkedArray of Arrow::BooleanArray, " +
|
@@ -343,7 +316,11 @@ module Arrow
|
|
343
316
|
raise ArgumentError, message
|
344
317
|
end
|
345
318
|
end
|
346
|
-
|
319
|
+
if sliced_tables.size > 1
|
320
|
+
sliced_tables[0].concatenate(sliced_tables[1..-1])
|
321
|
+
else
|
322
|
+
sliced_tables[0]
|
323
|
+
end
|
347
324
|
end
|
348
325
|
|
349
326
|
# TODO
|
@@ -514,38 +491,8 @@ module Arrow
|
|
514
491
|
end
|
515
492
|
|
516
493
|
private
|
517
|
-
def
|
518
|
-
|
519
|
-
target_start = nil
|
520
|
-
array.each_with_index do |is_target, i|
|
521
|
-
if is_target
|
522
|
-
unless in_target
|
523
|
-
target_start = offset + i
|
524
|
-
in_target = true
|
525
|
-
end
|
526
|
-
else
|
527
|
-
if in_target
|
528
|
-
ranges << [target_start, offset + i - 1]
|
529
|
-
target_start = nil
|
530
|
-
in_target = false
|
531
|
-
end
|
532
|
-
end
|
533
|
-
end
|
534
|
-
if in_target
|
535
|
-
ranges << [target_start, offset + array.length - 1]
|
536
|
-
end
|
537
|
-
end
|
538
|
-
|
539
|
-
def slice_by_ranges(ranges)
|
540
|
-
sliced_table = []
|
541
|
-
ranges.each do |from, to|
|
542
|
-
sliced_table << slice_raw(from, to - from + 1)
|
543
|
-
end
|
544
|
-
if sliced_table.size > 1
|
545
|
-
sliced_table[0].concatenate(sliced_table[1..-1])
|
546
|
-
else
|
547
|
-
sliced_table[0]
|
548
|
-
end
|
494
|
+
def slice_by_range(from, to)
|
495
|
+
slice_raw(from, to - from + 1)
|
549
496
|
end
|
550
497
|
|
551
498
|
def ensure_raw_column(name, data)
|