red-arrow 0.15.1 → 0.16.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/arrow/converters.hpp +6 -6
- data/lib/arrow/array-builder.rb +101 -52
- data/lib/arrow/array.rb +28 -10
- data/lib/arrow/chunked-array.rb +2 -0
- data/lib/arrow/csv-loader.rb +5 -0
- data/lib/arrow/csv-read-options.rb +18 -0
- data/lib/arrow/data-type.rb +35 -2
- data/lib/arrow/decimal128-array-builder.rb +0 -2
- data/lib/arrow/field.rb +1 -1
- data/lib/arrow/generic-filterable.rb +43 -0
- data/lib/arrow/generic-takeable.rb +38 -0
- data/lib/arrow/list-data-type.rb +58 -8
- data/lib/arrow/loader.rb +9 -1
- data/lib/arrow/{binary-array-builder.rb → null-array.rb} +3 -15
- data/lib/arrow/record-batch.rb +0 -3
- data/lib/arrow/schema.rb +0 -2
- data/lib/arrow/struct-data-type.rb +0 -2
- data/lib/arrow/table-loader.rb +29 -6
- data/lib/arrow/table-saver.rb +29 -9
- data/lib/arrow/table.rb +14 -50
- data/lib/arrow/version.rb +1 -1
- data/red-arrow.gemspec +3 -1
- data/test/test-array-builder.rb +17 -0
- data/test/test-array.rb +102 -0
- data/test/test-chunked-array.rb +94 -0
- data/test/test-csv-loader.rb +2 -2
- data/test/test-data-type.rb +11 -0
- data/test/test-list-data-type.rb +27 -1
- data/test/test-null-array.rb +23 -0
- data/test/test-slicer.rb +74 -30
- data/test/test-table.rb +147 -14
- data/test/test-timestamp-array.rb +19 -0
- metadata +60 -55
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 64b14ef4120f4ab290e8161020902ec2a22631c519d5a133a63ce383610e8545
|
4
|
+
data.tar.gz: 2f5850520e2dc69568a454cee0d4246909d52f1d49851221b1b9efd3149bc15c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0e19a4da6182437a51f9dad6212436e70b00881674ee6cd29e2a40910fe711fdef4076c81d2778f3ff9e9dd3f45b573c43e90378244cf08d7550b504ad8b53af
|
7
|
+
data.tar.gz: 547eb8b31fd59d9c1d5fc1163bb25da70154b797d103d3a01f6fda70dddc0b3c2cdb5391b9006c4510be37b8b75988195f809aaead8c91e7fb68f762cc5313de
|
data/ext/arrow/converters.hpp
CHANGED
@@ -504,14 +504,14 @@ namespace red_arrow {
|
|
504
504
|
uint8_t compute_child_index(const arrow::UnionArray& array,
|
505
505
|
arrow::UnionType* type,
|
506
506
|
const char* tag) {
|
507
|
-
const auto
|
508
|
-
|
509
|
-
|
510
|
-
if (
|
511
|
-
return
|
507
|
+
const auto type_code = array.raw_type_codes()[index_];
|
508
|
+
if (type_code >= 0 && type_code <= arrow::UnionType::kMaxTypeCode) {
|
509
|
+
const auto child_id = type->child_ids()[type_code];
|
510
|
+
if (child_id >= 0) {
|
511
|
+
return child_id;
|
512
512
|
}
|
513
513
|
}
|
514
|
-
check_status(arrow::Status::Invalid("Unknown type ID: ",
|
514
|
+
check_status(arrow::Status::Invalid("Unknown type ID: ", type_code),
|
515
515
|
tag);
|
516
516
|
return 0;
|
517
517
|
}
|
data/lib/arrow/array-builder.rb
CHANGED
@@ -26,60 +26,13 @@ module Arrow
|
|
26
26
|
return builder.build(values)
|
27
27
|
end
|
28
28
|
|
29
|
-
|
30
|
-
builder_class_arguments = []
|
29
|
+
builder_info = nil
|
31
30
|
values.each do |value|
|
32
|
-
|
33
|
-
|
34
|
-
# Ignore
|
35
|
-
when true, false
|
36
|
-
return BooleanArray.new(values)
|
37
|
-
when String
|
38
|
-
return StringArray.new(values)
|
39
|
-
when Float
|
40
|
-
return DoubleArray.new(values)
|
41
|
-
when Integer
|
42
|
-
if value < 0
|
43
|
-
builder = IntArrayBuilder.new
|
44
|
-
return builder.build(values)
|
45
|
-
else
|
46
|
-
builder_class = UIntArrayBuilder
|
47
|
-
builder_class_arguments = []
|
48
|
-
end
|
49
|
-
when Time
|
50
|
-
data_type = value.data_type
|
51
|
-
case data_type.unit
|
52
|
-
when TimeUnit::SECOND
|
53
|
-
if builder.nil?
|
54
|
-
builder = Time32ArrayBuilder
|
55
|
-
builder_class_arguments = [data_type]
|
56
|
-
end
|
57
|
-
when TimeUnit::MILLI
|
58
|
-
if builder != Time64ArrayBuilder
|
59
|
-
builder = Time32ArrayBuilder
|
60
|
-
builder_class_arguments = [data_type]
|
61
|
-
end
|
62
|
-
when TimeUnit::MICRO
|
63
|
-
builder = Time64ArrayBuilder
|
64
|
-
builder_class_arguments = [data_type]
|
65
|
-
when TimeUnit::NANO
|
66
|
-
builder = Time64ArrayBuilder.new(data_type)
|
67
|
-
return builder.build(values)
|
68
|
-
end
|
69
|
-
when ::Time
|
70
|
-
data_type = TimestampDataType.new(:nano)
|
71
|
-
builder = TimestampArrayBuilder.new(data_type)
|
72
|
-
return builder.build(values)
|
73
|
-
when DateTime
|
74
|
-
return Date64Array.new(values)
|
75
|
-
when Date
|
76
|
-
return Date32Array.new(values)
|
77
|
-
else
|
78
|
-
return StringArray.new(values)
|
79
|
-
end
|
31
|
+
builder_info = detect_builder_info(value, builder_info)
|
32
|
+
break if builder_info and builder_info[:detected]
|
80
33
|
end
|
81
|
-
if
|
82
|
-
builder =
|
34
|
+
if builder_info
|
35
|
+
builder = builder_info[:builder]
|
83
36
|
builder.build(values)
|
84
37
|
else
|
85
38
|
Arrow::StringArray.new(values)
|
@@ -89,6 +42,102 @@ module Arrow
|
|
89
42
|
def buildable?(args)
|
90
43
|
args.size == method(:build).arity
|
91
44
|
end
|
45
|
+
|
46
|
+
private
|
47
|
+
def detect_builder_info(value, builder_info)
|
48
|
+
case value
|
49
|
+
when nil
|
50
|
+
builder_info
|
51
|
+
when true, false
|
52
|
+
{
|
53
|
+
builder: BooleanArrayBuilder.new,
|
54
|
+
detected: true,
|
55
|
+
}
|
56
|
+
when String
|
57
|
+
{
|
58
|
+
builder: StringArrayBuilder.new,
|
59
|
+
detected: true,
|
60
|
+
}
|
61
|
+
when Float
|
62
|
+
{
|
63
|
+
builder: DoubleArrayBuilder.new,
|
64
|
+
detected: true,
|
65
|
+
}
|
66
|
+
when Integer
|
67
|
+
if value < 0
|
68
|
+
{
|
69
|
+
builder: IntArrayBuilder.new,
|
70
|
+
detected: true,
|
71
|
+
}
|
72
|
+
else
|
73
|
+
{
|
74
|
+
builder: UIntArrayBuilder.new,
|
75
|
+
}
|
76
|
+
end
|
77
|
+
when Time
|
78
|
+
data_type = value.data_type
|
79
|
+
case data_type.unit
|
80
|
+
when TimeUnit::SECOND
|
81
|
+
builder_info || {
|
82
|
+
builder: Time32ArrayBuilder.new(data_type)
|
83
|
+
}
|
84
|
+
when TimeUnit::MILLI
|
85
|
+
if builder_info and builder_info[:builder].is_a?(Time64ArrayBuilder)
|
86
|
+
builder_info
|
87
|
+
else
|
88
|
+
{
|
89
|
+
builder: Time32ArrayBuilder.new(data_type),
|
90
|
+
}
|
91
|
+
end
|
92
|
+
when TimeUnit::MICRO
|
93
|
+
{
|
94
|
+
builder: Time64ArrayBuilder.new(data_type),
|
95
|
+
}
|
96
|
+
when TimeUnit::NANO
|
97
|
+
{
|
98
|
+
builder: Time64ArrayBuilder.new(data_type),
|
99
|
+
detected: true
|
100
|
+
}
|
101
|
+
end
|
102
|
+
when ::Time
|
103
|
+
data_type = TimestampDataType.new(:nano)
|
104
|
+
{
|
105
|
+
builder: TimestampArrayBuilder.new(data_type),
|
106
|
+
detected: true,
|
107
|
+
}
|
108
|
+
when DateTime
|
109
|
+
{
|
110
|
+
builder: Date64ArrayBuilder.new,
|
111
|
+
detected: true,
|
112
|
+
}
|
113
|
+
when Date
|
114
|
+
{
|
115
|
+
builder: Date32ArrayBuilder.new,
|
116
|
+
detected: true,
|
117
|
+
}
|
118
|
+
when ::Array
|
119
|
+
sub_builder_info = nil
|
120
|
+
value.each do |sub_value|
|
121
|
+
sub_builder_info = detect_builder_info(sub_value, sub_builder_info)
|
122
|
+
break if sub_builder_info and sub_builder_info[:detected]
|
123
|
+
end
|
124
|
+
if sub_builder_info and sub_builder_info[:detected]
|
125
|
+
sub_value_data_type = sub_builder_info[:builder].value_data_type
|
126
|
+
field = Field.new("item", sub_value_data_type)
|
127
|
+
{
|
128
|
+
builder: ListArrayBuilder.new(ListDataType.new(field)),
|
129
|
+
detected: true,
|
130
|
+
}
|
131
|
+
else
|
132
|
+
builder_info
|
133
|
+
end
|
134
|
+
else
|
135
|
+
{
|
136
|
+
builder: StringArrayBuilder.new,
|
137
|
+
detected: true,
|
138
|
+
}
|
139
|
+
end
|
140
|
+
end
|
92
141
|
end
|
93
142
|
|
94
143
|
def build(values)
|
data/lib/arrow/array.rb
CHANGED
@@ -18,20 +18,21 @@
|
|
18
18
|
module Arrow
|
19
19
|
class Array
|
20
20
|
include Enumerable
|
21
|
+
include GenericFilterable
|
22
|
+
include GenericTakeable
|
21
23
|
|
22
24
|
class << self
|
23
25
|
def new(*args)
|
26
|
+
_builder_class = builder_class
|
27
|
+
return super if _builder_class.nil?
|
28
|
+
return super unless _builder_class.buildable?(args)
|
29
|
+
_builder_class.build(*args)
|
30
|
+
end
|
31
|
+
|
32
|
+
def builder_class
|
24
33
|
builder_class_name = "#{name}Builder"
|
25
|
-
|
26
|
-
|
27
|
-
if builder_class.buildable?(args)
|
28
|
-
builder_class.build(*args)
|
29
|
-
else
|
30
|
-
super
|
31
|
-
end
|
32
|
-
else
|
33
|
-
super
|
34
|
-
end
|
34
|
+
return nil unless const_defined?(builder_class_name)
|
35
|
+
const_get(builder_class_name)
|
35
36
|
end
|
36
37
|
end
|
37
38
|
|
@@ -82,5 +83,22 @@ module Arrow
|
|
82
83
|
def to_a
|
83
84
|
values
|
84
85
|
end
|
86
|
+
|
87
|
+
alias_method :is_in_raw, :is_in
|
88
|
+
def is_in(values)
|
89
|
+
case values
|
90
|
+
when ::Array
|
91
|
+
if self.class.builder_class.buildable?([values])
|
92
|
+
values = self.class.new(values)
|
93
|
+
else
|
94
|
+
values = self.class.new(value_data_type, values)
|
95
|
+
end
|
96
|
+
is_in_raw(values)
|
97
|
+
when ChunkedArray
|
98
|
+
is_in_chunked_array(values)
|
99
|
+
else
|
100
|
+
is_in_raw(values)
|
101
|
+
end
|
102
|
+
end
|
85
103
|
end
|
86
104
|
end
|
data/lib/arrow/chunked-array.rb
CHANGED
data/lib/arrow/csv-loader.rb
CHANGED
@@ -30,6 +30,9 @@ module Arrow
|
|
30
30
|
def initialize(path_or_data, **options)
|
31
31
|
@path_or_data = path_or_data
|
32
32
|
@options = options
|
33
|
+
if @options.key?(:delimiter)
|
34
|
+
@options[:col_sep] = @options.delete(:delimiter)
|
35
|
+
end
|
33
36
|
@compression = @options.delete(:compression)
|
34
37
|
end
|
35
38
|
|
@@ -113,6 +116,8 @@ module Arrow
|
|
113
116
|
options.add_schema(value)
|
114
117
|
when :encoding
|
115
118
|
# process encoding on opening input
|
119
|
+
when :col_sep
|
120
|
+
options.delimiter = value
|
116
121
|
else
|
117
122
|
setter = "#{key}="
|
118
123
|
if options.respond_to?(setter)
|
@@ -21,5 +21,23 @@ module Arrow
|
|
21
21
|
def add_column_type(name, type)
|
22
22
|
add_column_type_raw(name, DataType.resolve(type))
|
23
23
|
end
|
24
|
+
|
25
|
+
alias_method :delimiter_raw, :delimiter
|
26
|
+
def delimiter
|
27
|
+
delimiter_raw.chr
|
28
|
+
end
|
29
|
+
|
30
|
+
alias_method :delimiter_raw=, :delimiter=
|
31
|
+
def delimiter=(delimiter)
|
32
|
+
case delimiter
|
33
|
+
when String
|
34
|
+
if delimiter.bytesize != 1
|
35
|
+
message = "delimiter must be 1 byte character: #{delimiter.inspect}"
|
36
|
+
raise ArgumentError, message
|
37
|
+
end
|
38
|
+
delimiter = delimiter.ord
|
39
|
+
end
|
40
|
+
self.delimiter_raw = delimiter
|
41
|
+
end
|
24
42
|
end
|
25
43
|
end
|
data/lib/arrow/data-type.rb
CHANGED
@@ -121,6 +121,26 @@ module Arrow
|
|
121
121
|
end
|
122
122
|
end
|
123
123
|
|
124
|
+
def sub_types
|
125
|
+
types = {}
|
126
|
+
gtype.children.each do |child|
|
127
|
+
sub_type = child.to_class
|
128
|
+
types[sub_type] = true
|
129
|
+
sub_type.sub_types.each do |sub_sub_type|
|
130
|
+
types[sub_sub_type] = true
|
131
|
+
end
|
132
|
+
end
|
133
|
+
types.keys
|
134
|
+
end
|
135
|
+
|
136
|
+
def try_convert(value)
|
137
|
+
begin
|
138
|
+
resolve(value)
|
139
|
+
rescue ArgumentError
|
140
|
+
nil
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
124
144
|
private
|
125
145
|
def resolve_class(data_type)
|
126
146
|
components = data_type.to_s.split("_").collect(&:capitalize)
|
@@ -137,11 +157,24 @@ module Arrow
|
|
137
157
|
available_types << components.collect(&:downcase).join("_").to_sym
|
138
158
|
end
|
139
159
|
message =
|
140
|
-
"unknown type:
|
160
|
+
"unknown type: <#{data_type.inspect}>: " +
|
141
161
|
"available types: #{available_types.inspect}"
|
142
162
|
raise ArgumentError, message
|
143
163
|
end
|
144
|
-
Arrow.const_get(data_type_class_name)
|
164
|
+
data_type_class = Arrow.const_get(data_type_class_name)
|
165
|
+
if data_type_class.gtype.abstract?
|
166
|
+
not_abstract_types = data_type_class.sub_types.find_all do |sub_type|
|
167
|
+
not sub_type.gtype.abstract?
|
168
|
+
end
|
169
|
+
not_abstract_types = not_abstract_types.sort_by do |type|
|
170
|
+
type.name
|
171
|
+
end
|
172
|
+
message =
|
173
|
+
"abstract type: <#{data_type.inspect}>: " +
|
174
|
+
"use one of not abstract type: #{not_abstract_types.inspect}"
|
175
|
+
raise ArgumentError, message
|
176
|
+
end
|
177
|
+
data_type_class
|
145
178
|
end
|
146
179
|
end
|
147
180
|
|
data/lib/arrow/field.rb
CHANGED
@@ -59,7 +59,7 @@ module Arrow
|
|
59
59
|
# There is a shortcut for convenience. If field description
|
60
60
|
# doesn't have `:data_type`, all keys except `:name` are
|
61
61
|
# processes as data type description. For example, the
|
62
|
-
# following field
|
62
|
+
# following field descriptions are the same:
|
63
63
|
#
|
64
64
|
# ```ruby
|
65
65
|
# {name: "visible", data_type: {type: :boolean}}
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
module Arrow
|
19
|
+
module GenericFilterable
|
20
|
+
class << self
|
21
|
+
def included(base)
|
22
|
+
base.alias_method :filter_raw, :filter
|
23
|
+
base.alias_method :filter, :filter_generic
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def filter_generic(filter)
|
28
|
+
case filter
|
29
|
+
when ::Array
|
30
|
+
filter_raw(BooleanArray.new(filter))
|
31
|
+
when ChunkedArray
|
32
|
+
if respond_to?(:filter_chunked_array)
|
33
|
+
filter_chunked_array(filter)
|
34
|
+
else
|
35
|
+
# TODO: Implement this in C++
|
36
|
+
filter_raw(filter.pack)
|
37
|
+
end
|
38
|
+
else
|
39
|
+
filter_raw(filter)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
module Arrow
|
19
|
+
module GenericTakeable
|
20
|
+
class << self
|
21
|
+
def included(base)
|
22
|
+
base.alias_method :take_raw, :take
|
23
|
+
base.alias_method :take, :take_generic
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def take_generic(indices)
|
28
|
+
case indices
|
29
|
+
when ::Array
|
30
|
+
take_raw(IntArrayBuilder.build(indices))
|
31
|
+
when ChunkedArray
|
32
|
+
take_chunked_array(indices)
|
33
|
+
else
|
34
|
+
take_raw(indices)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|