red-arrow 0.8.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/doc/text/development.md +15 -0
- data/doc/text/news.md +14 -0
- data/lib/arrow/array-builder.rb +60 -5
- data/lib/arrow/csv-loader.rb +206 -0
- data/lib/arrow/csv-reader.rb +6 -117
- data/lib/arrow/date32-array-builder.rb +29 -0
- data/lib/arrow/date32-array.rb +27 -0
- data/lib/arrow/date64-array-builder.rb +30 -0
- data/lib/arrow/date64-array.rb +26 -0
- data/lib/arrow/loader.rb +10 -1
- data/lib/arrow/record-batch.rb +11 -37
- data/lib/arrow/record-containable.rb +70 -0
- data/lib/arrow/record.rb +20 -5
- data/lib/arrow/table-loader.rb +117 -0
- data/lib/arrow/table-saver.rb +93 -0
- data/lib/arrow/table.rb +18 -25
- data/lib/arrow/timestamp-array-builder.rb +59 -0
- data/lib/arrow/version.rb +2 -2
- data/test/test-array-builder.rb +80 -42
- data/test/test-csv-loader.rb +79 -0
- data/test/test-csv-reader.rb +5 -66
- data/test/test-date32-array.rb +21 -0
- data/test/test-date64-array.rb +22 -0
- data/test/test-table.rb +64 -10
- metadata +18 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 59cce19c00fbc436f48f29c2d58aca2685330a63
|
4
|
+
data.tar.gz: 280b1305e60c39d74d4fd101dc43387cd398e770
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3ecba86d0d061186def935c54a999f1c0adf1f109a1687eefd0375792a87f34333e7a46ee7f78fe51b01fbe294166491f300d9e46a8315e19ff47a1cb3f45102
|
7
|
+
data.tar.gz: 49d09794b0d007bf9e65675e133c45cd1a90a3eec748b9c747d2e7d78e4c21d682ba503c696cb61d128fec362fa138ec50b04ab780a0bf554557417ccaf02afd
|
data/doc/text/development.md
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Development
|
2
|
+
|
3
|
+
## Naming convention
|
4
|
+
|
5
|
+
### Reader and Writer
|
6
|
+
|
7
|
+
Reader and Writer require an opened IO stream.
|
8
|
+
|
9
|
+
### Loader and Saver
|
10
|
+
|
11
|
+
Loader and Saver require a path. They are convenience classes.
|
12
|
+
|
13
|
+
Loader opens the path and reads data by Reader.
|
14
|
+
|
15
|
+
Saver opens the path and writes data by Writer.
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,19 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 0.8.1 - 2018-01-05
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* `Arrow::ArrayBuilder.build`: Added generic array build support.
|
8
|
+
|
9
|
+
* `Arrow::Table#save`: Added.
|
10
|
+
|
11
|
+
* `Arrow::Table.load`: Added.
|
12
|
+
|
13
|
+
* `Arrow::CSVLoader`: Added.
|
14
|
+
|
15
|
+
* `Arrow::CSVReader.read`: Removed.
|
16
|
+
|
3
17
|
## 0.8.0 - 2018-01-04
|
4
18
|
|
5
19
|
### Improvements
|
data/lib/arrow/array-builder.rb
CHANGED
@@ -12,16 +12,58 @@
|
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
|
+
require "date"
|
16
|
+
|
15
17
|
module Arrow
|
16
18
|
class ArrayBuilder
|
17
19
|
class << self
|
18
20
|
def build(values)
|
19
|
-
|
20
|
-
|
21
|
+
if self != ArrayBuilder
|
22
|
+
builder = new
|
23
|
+
return builder.build(values)
|
24
|
+
end
|
25
|
+
|
26
|
+
builder_class = nil
|
27
|
+
values.each do |value|
|
28
|
+
case value
|
29
|
+
when nil
|
30
|
+
# Ignore
|
31
|
+
nil
|
32
|
+
when true, false
|
33
|
+
return BooleanArray.new(values)
|
34
|
+
when String
|
35
|
+
return StringArray.new(values)
|
36
|
+
when Float
|
37
|
+
return DoubleArray.new(values)
|
38
|
+
when Integer
|
39
|
+
if value.negative?
|
40
|
+
builder = IntArrayBuilder.new
|
41
|
+
return builder.build(values)
|
42
|
+
else
|
43
|
+
builder_class = UIntArrayBuilder
|
44
|
+
end
|
45
|
+
when Time
|
46
|
+
data_type = TimestampDataType.new(:nano)
|
47
|
+
builder = TimestampArrayBuilder.new(data_type)
|
48
|
+
return builder.build(values)
|
49
|
+
when DateTime
|
50
|
+
return Date64Array.new(values)
|
51
|
+
when Date
|
52
|
+
return Date32Array.new(values)
|
53
|
+
else
|
54
|
+
return StringArray.new(values)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
if builder_class
|
58
|
+
builder_class.new.build(values)
|
59
|
+
else
|
60
|
+
Arrow::StringArray.new(values)
|
61
|
+
end
|
21
62
|
end
|
22
63
|
end
|
23
64
|
|
24
65
|
def build(values)
|
66
|
+
value_convertable = respond_to?(:convert_to_arrow_value, true)
|
25
67
|
if respond_to?(:append_values)
|
26
68
|
start_index = 0
|
27
69
|
current_index = 0
|
@@ -30,7 +72,13 @@ module Arrow
|
|
30
72
|
if value.nil?
|
31
73
|
if status == :value
|
32
74
|
if start_index != current_index
|
33
|
-
|
75
|
+
target_values = values[start_index...current_index]
|
76
|
+
if value_convertable
|
77
|
+
target_values = target_values.collect do |v|
|
78
|
+
convert_to_arrow_value(v)
|
79
|
+
end
|
80
|
+
end
|
81
|
+
append_values(target_values)
|
34
82
|
start_index = current_index
|
35
83
|
end
|
36
84
|
status = :null
|
@@ -47,10 +95,16 @@ module Arrow
|
|
47
95
|
if start_index != current_index
|
48
96
|
if status == :value
|
49
97
|
if start_index == 0 and current_index == values.size
|
50
|
-
|
98
|
+
target_values = values
|
51
99
|
else
|
52
|
-
|
100
|
+
target_values = values[start_index...current_index]
|
101
|
+
end
|
102
|
+
if value_convertable
|
103
|
+
target_values = target_values.collect do |v|
|
104
|
+
convert_to_arrow_value(v)
|
105
|
+
end
|
53
106
|
end
|
107
|
+
append_values(target_values)
|
54
108
|
else
|
55
109
|
append_nulls(current_index - start_index)
|
56
110
|
end
|
@@ -60,6 +114,7 @@ module Arrow
|
|
60
114
|
if value.nil?
|
61
115
|
append_null
|
62
116
|
else
|
117
|
+
value = convert_to_arrow_value(value) if value_convertable
|
63
118
|
append(value)
|
64
119
|
end
|
65
120
|
end
|
data/lib/arrow/csv-loader.rb
ADDED
@@ -0,0 +1,206 @@
|
|
1
|
+
# Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
require "csv"
|
16
|
+
require "pathname"
|
17
|
+
require "time"
|
18
|
+
|
19
|
+
module Arrow
|
20
|
+
class CSVLoader
|
21
|
+
class << self
|
22
|
+
def load(path_or_data, **options)
|
23
|
+
new(path_or_data, **options).load
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def initialize(path_or_data, **options)
|
28
|
+
@path_or_data = path_or_data
|
29
|
+
@options = options
|
30
|
+
end
|
31
|
+
|
32
|
+
def load
|
33
|
+
case @path_or_data
|
34
|
+
when Pathname
|
35
|
+
load_from_path(@path_or_data.to_path)
|
36
|
+
when /\A.+\.csv\z/i
|
37
|
+
load_from_path(@path_or_data)
|
38
|
+
else
|
39
|
+
load_data(@path_or_data)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
private
|
44
|
+
def open_csv(path, **options)
|
45
|
+
CSV.open(path, **options) do |csv|
|
46
|
+
yield(csv)
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
def parse_csv_data(data, **options)
|
51
|
+
csv = CSV.new(data, **options)
|
52
|
+
begin
|
53
|
+
yield(csv)
|
54
|
+
ensure
|
55
|
+
csv.close
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
def read_csv(csv)
|
60
|
+
reader = CSVReader.new(csv)
|
61
|
+
reader.read
|
62
|
+
end
|
63
|
+
|
64
|
+
def load_from_path(path)
|
65
|
+
options = update_csv_parse_options(@options, :open_csv, path)
|
66
|
+
open_csv(path, **options) do |csv|
|
67
|
+
read_csv(csv)
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
def load_data(data)
|
72
|
+
options = update_csv_parse_options(@options, :parse_csv_data, data)
|
73
|
+
parse_csv_data(data, **options) do |csv|
|
74
|
+
read_csv(csv)
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
def selective_converter(target_index)
|
79
|
+
lambda do |field, field_info|
|
80
|
+
if target_index.nil? or field_info.index == target_index
|
81
|
+
yield(field)
|
82
|
+
else
|
83
|
+
field
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
BOOLEAN_CONVERTER = lambda do |field|
|
89
|
+
begin
|
90
|
+
encoded_field = field.encode(CSV::ConverterEncoding)
|
91
|
+
rescue EncodingError
|
92
|
+
field
|
93
|
+
else
|
94
|
+
case encoded_field
|
95
|
+
when "true"
|
96
|
+
true
|
97
|
+
when "false"
|
98
|
+
false
|
99
|
+
else
|
100
|
+
field
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
ISO8601_CONVERTER = lambda do |field|
|
106
|
+
begin
|
107
|
+
encoded_field = field.encode(CSV::ConverterEncoding)
|
108
|
+
rescue EncodingError
|
109
|
+
field
|
110
|
+
else
|
111
|
+
begin
|
112
|
+
Time.iso8601(encoded_field)
|
113
|
+
rescue ArgumentError
|
114
|
+
field
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
def update_csv_parse_options(options, create_csv, *args)
|
120
|
+
return options unless options.empty?
|
121
|
+
|
122
|
+
converters = [:all, BOOLEAN_CONVERTER, ISO8601_CONVERTER]
|
123
|
+
new_options = options.merge(converters: converters)
|
124
|
+
__send__(create_csv, *args, **new_options) do |csv|
|
125
|
+
new_options[:headers] = have_header?(csv)
|
126
|
+
end
|
127
|
+
__send__(create_csv, *args, **new_options) do |csv|
|
128
|
+
new_options[:converters] = detect_robust_converters(csv)
|
129
|
+
end
|
130
|
+
return new_options
|
131
|
+
end
|
132
|
+
|
133
|
+
def have_header?(csv)
|
134
|
+
row1 = csv.shift
|
135
|
+
return false if row1.nil?
|
136
|
+
return false if row1.any?(&:nil?)
|
137
|
+
|
138
|
+
row2 = csv.shift
|
139
|
+
return nil if row2.nil?
|
140
|
+
return true if row2.any?(&:nil?)
|
141
|
+
|
142
|
+
if row1.collect(&:class) != row2.collect(&:class)
|
143
|
+
return true
|
144
|
+
end
|
145
|
+
|
146
|
+
nil
|
147
|
+
end
|
148
|
+
|
149
|
+
def detect_robust_converters(csv)
|
150
|
+
column_types = []
|
151
|
+
csv.each do |row|
|
152
|
+
row.each_with_index do |(_name, value), i|
|
153
|
+
current_column_type = column_types[i]
|
154
|
+
next if current_column_type == :string
|
155
|
+
|
156
|
+
candidate_type = nil
|
157
|
+
case value
|
158
|
+
when nil
|
159
|
+
next
|
160
|
+
when "true", "false", true, false
|
161
|
+
candidate_type = :boolean
|
162
|
+
when Integer
|
163
|
+
candidate_type = :integer
|
164
|
+
when Float
|
165
|
+
candidate_type = :float
|
166
|
+
if current_column_type == :integer
|
167
|
+
column_types[i] = candidate_type
|
168
|
+
end
|
169
|
+
when Time
|
170
|
+
candidate_type = :time
|
171
|
+
when DateTime
|
172
|
+
candidate_type = :date_time
|
173
|
+
when Date
|
174
|
+
candidate_type = :date
|
175
|
+
else
|
176
|
+
candidate_type = :string
|
177
|
+
end
|
178
|
+
|
179
|
+
column_types[i] ||= candidate_type
|
180
|
+
if column_types[i] != candidate_type
|
181
|
+
column_types[i] = :string
|
182
|
+
end
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
186
|
+
converters = []
|
187
|
+
column_types.each_with_index do |type, i|
|
188
|
+
case type
|
189
|
+
when :boolean
|
190
|
+
converters << selective_converter(i, &BOOLEAN_CONVERTER)
|
191
|
+
when :integer
|
192
|
+
converters << selective_converter(i, &CSV::Converters[:integer])
|
193
|
+
when :float
|
194
|
+
converters << selective_converter(i, &CSV::Converters[:float])
|
195
|
+
when :time
|
196
|
+
converters << selective_converter(i, &ISO8601_CONVERTER)
|
197
|
+
when :date_time
|
198
|
+
converters << selective_converter(i, &CSV::Converters[:date_time])
|
199
|
+
when :date
|
200
|
+
converters << selective_converter(i, &CSV::Converters[:date])
|
201
|
+
end
|
202
|
+
end
|
203
|
+
converters
|
204
|
+
end
|
205
|
+
end
|
206
|
+
end
|
data/lib/arrow/csv-reader.rb
CHANGED
@@ -13,150 +13,39 @@
|
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
15
|
require "csv"
|
16
|
-
require "pathname"
|
17
|
-
require "time"
|
18
16
|
|
19
17
|
module Arrow
|
20
18
|
class CSVReader
|
21
|
-
class << self
|
22
|
-
def read(csv, **options)
|
23
|
-
case csv
|
24
|
-
when Pathname
|
25
|
-
path = csv.to_path
|
26
|
-
options = update_csv_parse_options(options, :open_csv, path)
|
27
|
-
open_csv(path, **options) do |_csv|
|
28
|
-
read(_csv)
|
29
|
-
end
|
30
|
-
when /\A.+\.csv\z/i
|
31
|
-
read(Pathname.new(csv), **options)
|
32
|
-
when String
|
33
|
-
options = update_csv_parse_options(options, :parse_csv_data, csv)
|
34
|
-
parse_csv_data(csv, **options) do |_csv|
|
35
|
-
read(_csv)
|
36
|
-
end
|
37
|
-
else
|
38
|
-
new(csv).read
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
private
|
43
|
-
def open_csv(path, **options)
|
44
|
-
CSV.open(path, **options) do |csv|
|
45
|
-
yield(csv)
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
def parse_csv_data(data, **options)
|
50
|
-
csv = CSV.new(data, **options)
|
51
|
-
begin
|
52
|
-
yield(csv)
|
53
|
-
ensure
|
54
|
-
csv.close
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
ISO8601_CONVERTER = lambda do |field|
|
59
|
-
begin
|
60
|
-
encoded_field = field.encode(CSV::ConverterEncoding)
|
61
|
-
rescue EncodingError
|
62
|
-
field
|
63
|
-
else
|
64
|
-
begin
|
65
|
-
Time.iso8601(encoded_field)
|
66
|
-
rescue ArgumentError
|
67
|
-
field
|
68
|
-
end
|
69
|
-
end
|
70
|
-
end
|
71
|
-
|
72
|
-
def update_csv_parse_options(options, create_csv, *args)
|
73
|
-
return options unless options.empty?
|
74
|
-
|
75
|
-
new_options = options.merge(converters: [:all, ISO8601_CONVERTER])
|
76
|
-
__send__(create_csv, *args, **new_options) do |csv|
|
77
|
-
row1 = csv.shift
|
78
|
-
if row1.nil?
|
79
|
-
new_options[:headers] = false
|
80
|
-
return new_options
|
81
|
-
end
|
82
|
-
if row1.any?(&:nil?)
|
83
|
-
new_options[:headers] = false
|
84
|
-
return new_options
|
85
|
-
end
|
86
|
-
|
87
|
-
row2 = csv.shift
|
88
|
-
return new_options if row2.nil?
|
89
|
-
if row2.any?(&:nil?)
|
90
|
-
new_options[:headers] = true
|
91
|
-
return new_options
|
92
|
-
end
|
93
|
-
|
94
|
-
if row1.collect(&:class) != row2.collect(&:class)
|
95
|
-
new_options[:headers] = true
|
96
|
-
return new_options
|
97
|
-
end
|
98
|
-
|
99
|
-
new_options
|
100
|
-
end
|
101
|
-
end
|
102
|
-
end
|
103
|
-
|
104
19
|
def initialize(csv)
|
105
20
|
@csv = csv
|
106
21
|
end
|
107
22
|
|
108
23
|
def read
|
109
|
-
builders = []
|
110
24
|
values_set = []
|
111
25
|
@csv.each do |row|
|
112
26
|
if row.is_a?(CSV::Row)
|
113
27
|
row = row.collect(&:last)
|
114
28
|
end
|
115
29
|
row.each_with_index do |value, i|
|
116
|
-
builders[i] ||= create_builder(value)
|
117
30
|
values = (values_set[i] ||= [])
|
118
|
-
case value
|
119
|
-
when Time
|
120
|
-
value = value.to_i * (10 ** 9) + value.nsec
|
121
|
-
end
|
122
31
|
values << value
|
123
32
|
end
|
124
33
|
end
|
125
34
|
return nil if values_set.empty?
|
126
35
|
|
127
36
|
arrays = values_set.collect.with_index do |values, i|
|
128
|
-
|
37
|
+
ArrayBuilder.build(values)
|
129
38
|
end
|
130
39
|
if @csv.headers
|
131
40
|
names = @csv.headers
|
132
41
|
else
|
133
|
-
names =
|
134
|
-
end
|
135
|
-
fields = names.collect.with_index do |name, i|
|
136
|
-
Arrow::Field.new(name, arrays[i].value_data_type)
|
137
|
-
end
|
138
|
-
schema = Schema.new(fields)
|
139
|
-
columns = arrays.collect.with_index do |array, i|
|
140
|
-
Column.new(fields[i], array)
|
42
|
+
names = arrays.size.times.collect(&:to_s)
|
141
43
|
end
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
private
|
146
|
-
def create_builder(sample_value)
|
147
|
-
case sample_value
|
148
|
-
when Integer
|
149
|
-
IntArrayBuilder.new
|
150
|
-
when Float
|
151
|
-
DoubleArrayBuilder.new
|
152
|
-
when String
|
153
|
-
StringArrayBuilder.new
|
154
|
-
when Time
|
155
|
-
data_type = TimestampDataType.new(:nano)
|
156
|
-
TimestampArrayBuilder.new(data_type)
|
157
|
-
else
|
158
|
-
nil
|
44
|
+
raw_table = {}
|
45
|
+
names.each_with_index do |name, i|
|
46
|
+
raw_table[name] = arrays[i]
|
159
47
|
end
|
48
|
+
Table.new(raw_table)
|
160
49
|
end
|
161
50
|
end
|
162
51
|
end
|