red-arrow 0.11.0 → 0.12.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of red-arrow might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/Rakefile +4 -0
- data/lib/arrow/array-builder.rb +6 -2
- data/lib/arrow/array.rb +6 -2
- data/{test/test-csv-reader.rb → lib/arrow/compression-type.rb} +16 -13
- data/lib/arrow/csv-loader.rb +102 -2
- data/lib/arrow/csv-read-options.rb +25 -0
- data/lib/arrow/data-type.rb +135 -0
- data/lib/arrow/decimal128-array-builder.rb +64 -0
- data/lib/arrow/decimal128-data-type.rb +69 -0
- data/lib/arrow/dense-union-data-type.rb +90 -0
- data/lib/arrow/dictionary-data-type.rb +106 -0
- data/lib/arrow/field-containable.rb +35 -0
- data/lib/arrow/field.rb +92 -8
- data/lib/arrow/file-output-stream.rb +34 -0
- data/lib/arrow/list-array-builder.rb +96 -0
- data/lib/arrow/list-data-type.rb +68 -0
- data/lib/arrow/loader.rb +30 -5
- data/lib/arrow/{csv-reader.rb → path-extension.rb} +19 -28
- data/lib/arrow/record-batch-builder.rb +115 -0
- data/lib/arrow/record-batch.rb +25 -0
- data/lib/arrow/schema.rb +97 -0
- data/lib/arrow/sparse-union-data-type.rb +90 -0
- data/lib/arrow/struct-array-builder.rb +146 -0
- data/lib/arrow/struct-array.rb +34 -0
- data/lib/arrow/struct-data-type.rb +130 -0
- data/lib/arrow/struct.rb +68 -0
- data/lib/arrow/table-loader.rb +65 -25
- data/lib/arrow/table-saver.rb +73 -24
- data/lib/arrow/table.rb +11 -2
- data/lib/arrow/time32-data-type.rb +61 -0
- data/lib/arrow/time64-data-type.rb +61 -0
- data/lib/arrow/timestamp-data-type.rb +57 -0
- data/lib/arrow/version.rb +5 -7
- data/lib/arrow/writable.rb +22 -0
- data/red-arrow.gemspec +8 -4
- data/test/helper.rb +1 -2
- data/test/test-csv-loader.rb +27 -0
- data/test/test-data-type.rb +47 -0
- data/test/test-decimal128-array-builder.rb +95 -0
- data/test/test-decimal128-array.rb +38 -0
- data/test/test-decimal128-data-type.rb +31 -0
- data/test/test-dense-union-data-type.rb +41 -0
- data/test/test-dictionary-data-type.rb +40 -0
- data/test/test-feather.rb +34 -0
- data/test/test-field.rb +71 -0
- data/test/test-file-output-stream.rb +54 -0
- data/test/test-list-array-builder.rb +79 -0
- data/test/test-list-array.rb +32 -0
- data/test/test-list-data-type.rb +43 -0
- data/test/test-record-batch-builder.rb +116 -0
- data/test/test-record-batch.rb +82 -27
- data/test/test-schema.rb +104 -0
- data/test/test-sparse-union-data-type.rb +41 -0
- data/test/test-struct-array-builder.rb +180 -0
- data/test/test-struct-array.rb +60 -15
- data/test/test-struct-data-type.rb +112 -0
- data/test/test-struct.rb +81 -0
- data/test/test-table.rb +165 -29
- data/test/test-time32-data-type.rb +42 -0
- data/test/test-time64-data-type.rb +42 -0
- data/test/test-timestamp-data-type.rb +42 -0
- metadata +99 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 847a4994cc15fb50df335c7231c942d49392f1ec528b85647b2bbe6fb4e82f7b
|
4
|
+
data.tar.gz: 4337680e47dea67107a1fef863d66e936b9c5d20b2cb6698e879b774eada3c88
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c72b26d9b4f488c4d00184ea3243056c69a601fe59fba80a7423075b19034b0db7ad12dbbf5e210ea4b66607acd2ad7123a85057a27266b21665e143961bccb7
|
7
|
+
data.tar.gz: 1fb8c82007a25cac5e99d41f3bb879ed86ad1f9deab320f45470252513fada6c6f972f7ddb0ed3f38ca1dcdf7de3f5662a3fa28536a4981b9e5993dd5bc666a8
|
data/README.md
CHANGED
data/Rakefile
CHANGED
data/lib/arrow/array-builder.rb
CHANGED
@@ -65,6 +65,12 @@ module Arrow
|
|
65
65
|
end
|
66
66
|
|
67
67
|
def build(values)
|
68
|
+
append(*values)
|
69
|
+
finish
|
70
|
+
end
|
71
|
+
|
72
|
+
# @since 0.12.0
|
73
|
+
def append(*values)
|
68
74
|
value_convertable = respond_to?(:convert_to_arrow_value, true)
|
69
75
|
start_index = 0
|
70
76
|
current_index = 0
|
@@ -111,8 +117,6 @@ module Arrow
|
|
111
117
|
append_nulls(current_index - start_index)
|
112
118
|
end
|
113
119
|
end
|
114
|
-
|
115
|
-
finish
|
116
120
|
end
|
117
121
|
|
118
122
|
def append_nulls(n)
|
data/lib/arrow/array.rb
CHANGED
@@ -20,11 +20,15 @@ module Arrow
|
|
20
20
|
include Enumerable
|
21
21
|
|
22
22
|
class << self
|
23
|
-
def new(
|
23
|
+
def new(*args)
|
24
24
|
builder_class_name = "#{name}Builder"
|
25
25
|
if const_defined?(builder_class_name)
|
26
26
|
builder_class = const_get(builder_class_name)
|
27
|
-
builder_class.build
|
27
|
+
if args.size == builder_class.method(:build).arity
|
28
|
+
builder_class.build(*args)
|
29
|
+
else
|
30
|
+
super
|
31
|
+
end
|
28
32
|
else
|
29
33
|
super
|
30
34
|
end
|
data/{test/test-csv-reader.rb → lib/arrow/compression-type.rb}
@@ -15,20 +15,23 @@
|
|
15
15
|
# specific language governing permissions and limitations
|
16
16
|
# under the License.
|
17
17
|
|
18
|
-
|
19
|
-
|
18
|
+
module Arrow
|
19
|
+
class CompressionType
|
20
|
+
EXTENSIONS = {}
|
21
|
+
values.each do |value|
|
22
|
+
case value
|
23
|
+
when UNCOMPRESSED
|
24
|
+
when GZIP
|
25
|
+
EXTENSIONS["gz"] = value
|
26
|
+
else
|
27
|
+
EXTENSIONS[value.nick] = value
|
28
|
+
end
|
29
|
+
end
|
20
30
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
reader = Arrow::CSVReader.new(csv)
|
26
|
-
assert_equal(<<-TABLE, reader.read.to_s)
|
27
|
-
name score
|
28
|
-
0 alice 10
|
29
|
-
1 bob 29
|
30
|
-
2 chris -1
|
31
|
-
TABLE
|
31
|
+
class << self
|
32
|
+
def resolve_extension(extension)
|
33
|
+
EXTENSIONS[extension.to_s]
|
34
|
+
end
|
32
35
|
end
|
33
36
|
end
|
34
37
|
end
|
data/lib/arrow/csv-loader.rb
CHANGED
@@ -30,6 +30,7 @@ module Arrow
|
|
30
30
|
def initialize(path_or_data, **options)
|
31
31
|
@path_or_data = path_or_data
|
32
32
|
@options = options
|
33
|
+
@compression = @options.delete(:compression)
|
33
34
|
end
|
34
35
|
|
35
36
|
def load
|
@@ -60,11 +61,85 @@ module Arrow
|
|
60
61
|
end
|
61
62
|
|
62
63
|
def read_csv(csv)
|
63
|
-
|
64
|
-
|
64
|
+
values_set = []
|
65
|
+
csv.each do |row|
|
66
|
+
if row.is_a?(CSV::Row)
|
67
|
+
row = row.collect(&:last)
|
68
|
+
end
|
69
|
+
row.each_with_index do |value, i|
|
70
|
+
values = (values_set[i] ||= [])
|
71
|
+
values << value
|
72
|
+
end
|
73
|
+
end
|
74
|
+
return nil if values_set.empty?
|
75
|
+
|
76
|
+
arrays = values_set.collect.with_index do |values, i|
|
77
|
+
ArrayBuilder.build(values)
|
78
|
+
end
|
79
|
+
if csv.headers
|
80
|
+
names = csv.headers
|
81
|
+
else
|
82
|
+
names = arrays.size.times.collect(&:to_s)
|
83
|
+
end
|
84
|
+
raw_table = {}
|
85
|
+
names.each_with_index do |name, i|
|
86
|
+
raw_table[name] = arrays[i]
|
87
|
+
end
|
88
|
+
Table.new(raw_table)
|
89
|
+
end
|
90
|
+
|
91
|
+
def reader_options
|
92
|
+
options = CSVReadOptions.new
|
93
|
+
@options.each do |key, value|
|
94
|
+
case key
|
95
|
+
when :headers
|
96
|
+
if value
|
97
|
+
options.n_header_rows = 1
|
98
|
+
else
|
99
|
+
options.n_header_rows = 0
|
100
|
+
end
|
101
|
+
when :column_types
|
102
|
+
value.each do |name, type|
|
103
|
+
options.add_column_type(name, type)
|
104
|
+
end
|
105
|
+
when :schema
|
106
|
+
options.add_schema(value)
|
107
|
+
else
|
108
|
+
setter = "#{key}="
|
109
|
+
if options.respond_to?(setter)
|
110
|
+
options.__send__(setter, value)
|
111
|
+
else
|
112
|
+
return nil
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
116
|
+
options
|
117
|
+
end
|
118
|
+
|
119
|
+
def open_input(raw_input)
|
120
|
+
if @compression
|
121
|
+
codec = Codec.new(@compression)
|
122
|
+
CompressedInputStream.open(codec, raw_input) do |input|
|
123
|
+
yield(input)
|
124
|
+
end
|
125
|
+
else
|
126
|
+
yield(raw_input)
|
127
|
+
end
|
65
128
|
end
|
66
129
|
|
67
130
|
def load_from_path(path)
|
131
|
+
options = reader_options
|
132
|
+
if options
|
133
|
+
begin
|
134
|
+
MemoryMappedInputStream.open(path.to_s) do |raw_input|
|
135
|
+
open_input(raw_input) do |input|
|
136
|
+
return CSVReader.new(input, options).read
|
137
|
+
end
|
138
|
+
end
|
139
|
+
rescue Arrow::Error::Invalid
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
68
143
|
options = update_csv_parse_options(@options, :open_csv, path)
|
69
144
|
open_csv(path, **options) do |csv|
|
70
145
|
read_csv(csv)
|
@@ -72,6 +147,18 @@ module Arrow
|
|
72
147
|
end
|
73
148
|
|
74
149
|
def load_data(data)
|
150
|
+
options = reader_options
|
151
|
+
if options
|
152
|
+
begin
|
153
|
+
BufferInputStream.open(Buffer.new(data)) do |raw_input|
|
154
|
+
open_input(raw_input) do |input|
|
155
|
+
return CSVReader.new(input, options).read
|
156
|
+
end
|
157
|
+
end
|
158
|
+
rescue Arrow::Error::Invalid
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
75
162
|
options = update_csv_parse_options(@options, :parse_csv_data, data)
|
76
163
|
parse_csv_data(data, **options) do |csv|
|
77
164
|
read_csv(csv)
|
@@ -119,6 +206,11 @@ module Arrow
|
|
119
206
|
end
|
120
207
|
end
|
121
208
|
|
209
|
+
AVAILABLE_CSV_PARSE_OPTIONS = {}
|
210
|
+
CSV.instance_method(:initialize).parameters.each do |type, name|
|
211
|
+
AVAILABLE_CSV_PARSE_OPTIONS[name] = true if type == :key
|
212
|
+
end
|
213
|
+
|
122
214
|
def update_csv_parse_options(options, create_csv, *args)
|
123
215
|
if options.key?(:converters)
|
124
216
|
new_options = options.dup
|
@@ -127,6 +219,14 @@ module Arrow
|
|
127
219
|
new_options = options.merge(converters: converters)
|
128
220
|
end
|
129
221
|
|
222
|
+
# TODO: Support :schema and :column_types
|
223
|
+
|
224
|
+
unless AVAILABLE_CSV_PARSE_OPTIONS.empty?
|
225
|
+
new_options.select! do |key, value|
|
226
|
+
AVAILABLE_CSV_PARSE_OPTIONS.key?(key)
|
227
|
+
end
|
228
|
+
end
|
229
|
+
|
130
230
|
unless options.key?(:headers)
|
131
231
|
__send__(create_csv, *args, **new_options) do |csv|
|
132
232
|
new_options[:headers] = have_header?(csv)
|
data/lib/arrow/csv-read-options.rb
@@ -0,0 +1,25 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
module Arrow
|
19
|
+
class CSVReadOptions
|
20
|
+
alias_method :add_column_type_raw, :add_column_type
|
21
|
+
def add_column_type(name, type)
|
22
|
+
add_column_type_raw(name, DataType.resolve(type))
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
data/lib/arrow/data-type.rb
@@ -0,0 +1,135 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
module Arrow
|
19
|
+
class DataType
|
20
|
+
class << self
|
21
|
+
# Creates a new suitable {Arrow::DataType}.
|
22
|
+
#
|
23
|
+
# @overload resolve(data_type)
|
24
|
+
#
|
25
|
+
# Returns the given data type itself. This is convenient to
|
26
|
+
# use this method as {Arrow::DataType} converter.
|
27
|
+
#
|
28
|
+
# @param data_type [Arrow::DataType] The data type.
|
29
|
+
#
|
30
|
+
# @return [Arrow::DataType] The given data type itself.
|
31
|
+
#
|
32
|
+
# @overload resolve(name, *arguments)
|
33
|
+
#
|
34
|
+
# Creates a suitable data type from type name. For example,
|
35
|
+
# you can create {Arrow::BooleanDataType} from `:boolean`.
|
36
|
+
#
|
37
|
+
# @param name [String, Symbol] The type name of the data type.
|
38
|
+
#
|
39
|
+
# @param arguments [::Array] The additional information of the
|
40
|
+
# data type.
|
41
|
+
#
|
42
|
+
# For example, {Arrow::TimestampDataType} needs unit as
|
43
|
+
# additional information.
|
44
|
+
#
|
45
|
+
# @example Create a boolean data type
|
46
|
+
# Arrow::DataType.resolve(:boolean)
|
47
|
+
#
|
48
|
+
# @example Create a milliseconds unit timestamp data type
|
49
|
+
# Arrow::DataType.resolve(:timestamp, :milli)
|
50
|
+
#
|
51
|
+
# @overload resolve(description)
|
52
|
+
#
|
53
|
+
# Creates a suitable data type from data type description.
|
54
|
+
#
|
55
|
+
# Data type description is a raw `Hash`. Data type description
|
56
|
+
# must have `:type` value. `:type` is the type of the data type.
|
57
|
+
#
|
58
|
+
# If the type needs additional information, you need to
|
59
|
+
# specify it. See constructor document what information is
|
60
|
+
# needed. For example, {Arrow::ListDataType#initialize} needs
|
61
|
+
# `:field` value.
|
62
|
+
#
|
63
|
+
# @param description [Hash] The description of the data type.
|
64
|
+
#
|
65
|
+
# @option description [String, Symbol] :type The type name of
|
66
|
+
# the data type.
|
67
|
+
#
|
68
|
+
# @example Create a boolean data type
|
69
|
+
# Arrow::DataType.resolve(type: :boolean)
|
70
|
+
#
|
71
|
+
# @example Create a list data type
|
72
|
+
# Arrow::DataType.resolve(type: :list,
|
73
|
+
# field: {name: "visible", type: :boolean})
|
74
|
+
def resolve(data_type)
|
75
|
+
case data_type
|
76
|
+
when DataType
|
77
|
+
data_type
|
78
|
+
when String, Symbol
|
79
|
+
resolve_class(data_type).new
|
80
|
+
when ::Array
|
81
|
+
type, *arguments = data_type
|
82
|
+
resolve_class(type).new(*arguments)
|
83
|
+
when Hash
|
84
|
+
type = nil
|
85
|
+
description = {}
|
86
|
+
data_type.each do |key, value|
|
87
|
+
key = key.to_sym
|
88
|
+
case key
|
89
|
+
when :type
|
90
|
+
type = value
|
91
|
+
else
|
92
|
+
description[key] = value
|
93
|
+
end
|
94
|
+
end
|
95
|
+
if type.nil?
|
96
|
+
message =
|
97
|
+
"data type description must have :type value: #{data_type.inspect}"
|
98
|
+
raise ArgumentError, message
|
99
|
+
end
|
100
|
+
data_type_class = resolve_class(type)
|
101
|
+
if description.empty?
|
102
|
+
data_type_class.new
|
103
|
+
else
|
104
|
+
data_type_class.new(description)
|
105
|
+
end
|
106
|
+
else
|
107
|
+
message =
|
108
|
+
"data type must be " +
|
109
|
+
"Arrow::DataType, String, Symbol, [String, ...], [Symbol, ...] " +
|
110
|
+
"{type: String, ...} or {type: Symbol, ...}: #{data_type.inspect}"
|
111
|
+
raise ArgumentError, message
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
private
|
116
|
+
def resolve_class(data_type)
|
117
|
+
data_type_name = data_type.to_s.capitalize.gsub(/\AUint/, "UInt")
|
118
|
+
data_type_class_name = "#{data_type_name}DataType"
|
119
|
+
unless Arrow.const_defined?(data_type_class_name)
|
120
|
+
available_types = []
|
121
|
+
Arrow.constants.each do |name|
|
122
|
+
if name.to_s.end_with?("DataType")
|
123
|
+
available_types << name.to_s.gsub(/DataType\z/, "").downcase.to_sym
|
124
|
+
end
|
125
|
+
end
|
126
|
+
message =
|
127
|
+
"unknown type: #{data_type.inspect}: " +
|
128
|
+
"available types: #{available_types.inspect}"
|
129
|
+
raise ArgumentError, message
|
130
|
+
end
|
131
|
+
Arrow.const_get(data_type_class_name)
|
132
|
+
end
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|
data/lib/arrow/decimal128-array-builder.rb
@@ -0,0 +1,64 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
require "bigdecimal"
|
19
|
+
|
20
|
+
module Arrow
|
21
|
+
class Decimal128ArrayBuilder
|
22
|
+
class << self
|
23
|
+
def build(data_type, values)
|
24
|
+
builder = new(data_type)
|
25
|
+
builder.build(values)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
alias_method :append_value_raw, :append_value
|
30
|
+
def append_value(value)
|
31
|
+
case value
|
32
|
+
when nil
|
33
|
+
return append_null
|
34
|
+
when String
|
35
|
+
value = Decimal128.new(value)
|
36
|
+
when Float
|
37
|
+
value = Decimal128.new(value.to_s)
|
38
|
+
when BigDecimal
|
39
|
+
value = Decimal128.new(value.to_s)
|
40
|
+
end
|
41
|
+
append_value_raw(value)
|
42
|
+
end
|
43
|
+
|
44
|
+
def append_values(values, is_valids=nil)
|
45
|
+
if is_valids
|
46
|
+
is_valids.each_with_index do |is_valid, i|
|
47
|
+
if is_valid
|
48
|
+
append_value(values[i])
|
49
|
+
else
|
50
|
+
append_null
|
51
|
+
end
|
52
|
+
end
|
53
|
+
else
|
54
|
+
values.each do |value|
|
55
|
+
if value.nil?
|
56
|
+
append_null
|
57
|
+
else
|
58
|
+
append_value(value)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|