red-arrow 0.8.0 → 0.8.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/doc/text/development.md +15 -0
- data/doc/text/news.md +14 -0
- data/lib/arrow/array-builder.rb +60 -5
- data/lib/arrow/csv-loader.rb +206 -0
- data/lib/arrow/csv-reader.rb +6 -117
- data/lib/arrow/date32-array-builder.rb +29 -0
- data/lib/arrow/date32-array.rb +27 -0
- data/lib/arrow/date64-array-builder.rb +30 -0
- data/lib/arrow/date64-array.rb +26 -0
- data/lib/arrow/loader.rb +10 -1
- data/lib/arrow/record-batch.rb +11 -37
- data/lib/arrow/record-containable.rb +70 -0
- data/lib/arrow/record.rb +20 -5
- data/lib/arrow/table-loader.rb +117 -0
- data/lib/arrow/table-saver.rb +93 -0
- data/lib/arrow/table.rb +18 -25
- data/lib/arrow/timestamp-array-builder.rb +59 -0
- data/lib/arrow/version.rb +2 -2
- data/test/test-array-builder.rb +80 -42
- data/test/test-csv-loader.rb +79 -0
- data/test/test-csv-reader.rb +5 -66
- data/test/test-date32-array.rb +21 -0
- data/test/test-date64-array.rb +22 -0
- data/test/test-table.rb +64 -10
- metadata +18 -2
@@ -0,0 +1,29 @@
|
|
1
|
+
# Copyright 2018 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Arrow
  class Date32ArrayBuilder
    private

    # Reference day for the conversion below: raw values are counted in
    # whole days relative to this date.
    UNIX_EPOCH = Date.new(1970, 1, 1)

    # Converts a Ruby value into the raw integer this builder stores.
    #
    # Anything that responds to +to_date+ is first turned into a Date.
    # A Date becomes the number of days between it and UNIX_EPOCH
    # (negative for earlier dates). Every other value is returned as is.
    def convert_to_arrow_value(value)
      candidate = value.respond_to?(:to_date) ? value.to_date : value
      return candidate unless candidate.is_a?(Date)

      (candidate - UNIX_EPOCH).to_i
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# Copyright 2017 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Arrow
  class Date32Array
    # Returns the i-th element as a Date, built from the raw integer that
    # get_raw_value(i) provides.
    def get_value(i)
      raw_days = get_raw_value(i)
      to_date(raw_days)
    end

    private

    # Julian Day Number of 1970-01-01. Raw values are added to it, so they
    # are day offsets from that date.
    UNIX_EPOCH = 2440588

    # Builds the Date that lies +day_offset+ days away from 1970-01-01.
    def to_date(day_offset)
      julian_day = UNIX_EPOCH + day_offset
      Date.jd(julian_day)
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Copyright 2018 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Arrow
  class Date64ArrayBuilder
    private

    # Converts a Ruby value into the raw integer this builder stores.
    #
    # A non-Time value that responds to +to_time+ is converted to a Time
    # first. A Time becomes whole milliseconds: its integer seconds times
    # 1000 plus its microsecond part divided by 1000 (integer division).
    # Every other value is returned as is.
    def convert_to_arrow_value(value)
      unless value.is_a?(Time)
        value = value.to_time if value.respond_to?(:to_time)
      end
      return value unless value.is_a?(Time)

      whole_seconds = value.to_i
      sub_second_milliseconds = value.usec / 1_000
      whole_seconds * 1_000 + sub_second_milliseconds
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# Copyright 2017 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Arrow
  class Date64Array
    # Returns the i-th element as a DateTime, built from the raw integer
    # that get_raw_value(i) provides.
    def get_value(i)
      to_datetime(get_raw_value(i))
    end

    private

    # Converts a raw value counted in milliseconds since the UNIX epoch
    # into a DateTime.
    #
    # The raw value is split into whole seconds and a millisecond
    # remainder. Time.at interprets its second positional argument as
    # microseconds, so the remainder is scaled by 1000 before being passed;
    # handing over the bare milliseconds would shrink the fractional part
    # by a factor of 1000 (1500 ms would decode as 1.0005 s, not 1.5 s).
    def to_datetime(raw_value)
      seconds, milliseconds = raw_value.divmod(1_000)
      Time.at(seconds, milliseconds * 1_000).to_datetime
    end
  end
end
|
data/lib/arrow/loader.rb
CHANGED
@@ -33,14 +33,23 @@ module Arrow
|
|
33
33
|
require "arrow/buffer"
|
34
34
|
require "arrow/chunked-array"
|
35
35
|
require "arrow/column"
|
36
|
+
require "arrow/csv-loader"
|
36
37
|
require "arrow/csv-reader"
|
38
|
+
require "arrow/date32-array"
|
39
|
+
require "arrow/date32-array-builder"
|
40
|
+
require "arrow/date64-array"
|
41
|
+
require "arrow/date64-array-builder"
|
37
42
|
require "arrow/field"
|
43
|
+
require "arrow/record"
|
38
44
|
require "arrow/record-batch"
|
39
45
|
require "arrow/slicer"
|
40
46
|
require "arrow/table"
|
41
47
|
require "arrow/table-formatter"
|
48
|
+
require "arrow/table-loader"
|
49
|
+
require "arrow/table-saver"
|
42
50
|
require "arrow/tensor"
|
43
51
|
require "arrow/timestamp-array"
|
52
|
+
require "arrow/timestamp-array-builder"
|
44
53
|
|
45
54
|
require "arrow/record-batch-file-reader"
|
46
55
|
require "arrow/record-batch-stream-reader"
|
@@ -67,7 +76,7 @@ module Arrow
|
|
67
76
|
method_name = "get_value"
|
68
77
|
end
|
69
78
|
super(info, klass, method_name)
|
70
|
-
when "Arrow::TimestampArray"
|
79
|
+
when "Arrow::TimestampArray", "Arrow::Date32Array", "Arrow::Date64Array"
|
71
80
|
case method_name
|
72
81
|
when "get_value"
|
73
82
|
method_name = "get_raw_value"
|
data/lib/arrow/record-batch.rb
CHANGED
@@ -12,57 +12,31 @@
|
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
|
-
require "arrow/record"
|
15
|
+
require "arrow/record-containable"
|
16
16
|
|
17
17
|
module Arrow
|
18
18
|
class RecordBatch
|
19
|
+
include RecordContainable
|
19
20
|
include Enumerable
|
20
21
|
|
21
|
-
|
22
|
-
unless block_given?
|
23
|
-
return to_enum(__method__, reuse_record: reuse_record)
|
24
|
-
end
|
25
|
-
|
26
|
-
if reuse_record
|
27
|
-
record = Record.new(self, nil)
|
28
|
-
n_rows.times do |i|
|
29
|
-
record.index = i
|
30
|
-
yield(record)
|
31
|
-
end
|
32
|
-
else
|
33
|
-
n_rows.times do |i|
|
34
|
-
yield(Record.new(self, i))
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
def find_column(name_or_index)
|
40
|
-
case name_or_index
|
41
|
-
when String, Symbol
|
42
|
-
name = name_or_index
|
43
|
-
index = resolve_name(name)
|
44
|
-
else
|
45
|
-
index = name_or_index
|
46
|
-
end
|
47
|
-
columns[index]
|
48
|
-
end
|
22
|
+
alias_method :each, :each_record
|
49
23
|
|
50
24
|
alias_method :columns_raw, :columns
|
51
25
|
def columns
|
52
26
|
@columns ||= columns_raw
|
53
27
|
end
|
54
28
|
|
55
|
-
|
56
|
-
|
57
|
-
|
29
|
+
def respond_to_missing?(name, include_private)
|
30
|
+
return true if find_column(name)
|
31
|
+
super
|
58
32
|
end
|
59
33
|
|
60
|
-
def
|
61
|
-
|
62
|
-
|
63
|
-
|
34
|
+
def method_missing(name, *args, &block)
|
35
|
+
if args.empty?
|
36
|
+
column = find_column(name)
|
37
|
+
return column if column
|
64
38
|
end
|
65
|
-
|
39
|
+
super
|
66
40
|
end
|
67
41
|
end
|
68
42
|
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Arrow
  # Shared row/column access helpers. The including object must provide
  # +columns+, +n_rows+ and +schema+ (whose +fields+ each respond to +name+).
  module RecordContainable
    # Yields every column in order, or returns an Enumerator when no block
    # is given.
    def each_column(&block)
      if block_given?
        columns.each(&block)
      else
        to_enum(__method__)
      end
    end

    # Yields one Record per row, or returns an Enumerator when no block is
    # given.
    #
    # With +reuse_record: true+ a single Record instance is yielded for
    # every row and only its +index+ is updated between iterations, so
    # callers must not keep a reference to it across iterations.
    def each_record(reuse_record: false)
      return to_enum(__method__, reuse_record: reuse_record) unless block_given?

      unless reuse_record
        return n_rows.times { |row_index| yield(Record.new(self, row_index)) }
      end

      shared_record = Record.new(self, nil)
      n_rows.times do |row_index|
        shared_record.index = row_index
        yield(shared_record)
      end
    end

    # Looks a column up by name (String or Symbol) or by position (Integer).
    #
    # Returns nil when no field carries the given name. Raises
    # ArgumentError for any other kind of argument.
    def find_column(name_or_index)
      position =
        case name_or_index
        when Integer
          name_or_index
        when String, Symbol
          resolve_column_name(name_or_index.to_s)
        else
          message = "column name or index must be String, Symbol or Integer"
          raise ArgumentError, message
        end
      return nil if position.nil?

      columns[position]
    end

    private

    # Maps a column name to its position; the lookup table is built on
    # first use and cached in @column_name_to_index.
    def resolve_column_name(name)
      @column_name_to_index ||= build_column_name_resolve_table
      @column_name_to_index[name]
    end

    # Builds a Hash of field name => field position from the schema. When
    # several fields share a name, the last one wins.
    def build_column_name_resolve_table
      schema.fields.each_with_index.each_with_object({}) do |(field, position), lookup|
        lookup[field.name] = position
      end
    end
  end
end
|
data/lib/arrow/record.rb
CHANGED
@@ -15,25 +15,40 @@
|
|
15
15
|
module Arrow
|
16
16
|
class Record
|
17
17
|
attr_accessor :index
|
18
|
-
def initialize(
|
19
|
-
@
|
18
|
+
def initialize(record_container, index)
|
19
|
+
@record_container = record_container
|
20
20
|
@index = index
|
21
21
|
end
|
22
22
|
|
23
23
|
def [](column_name_or_column_index)
|
24
|
-
@
|
24
|
+
column = @record_container.find_column(column_name_or_column_index)
|
25
|
+
return nil if column.nil?
|
26
|
+
column[@index]
|
25
27
|
end
|
26
28
|
|
27
29
|
def columns
|
28
|
-
@
|
30
|
+
@record_container.columns
|
29
31
|
end
|
30
32
|
|
31
33
|
def to_h
|
32
34
|
attributes = {}
|
33
|
-
@
|
35
|
+
@record_container.schema.fields.each_with_index do |field, i|
|
34
36
|
attributes[field.name] = self[i]
|
35
37
|
end
|
36
38
|
attributes
|
37
39
|
end
|
40
|
+
|
41
|
+
def respond_to_missing?(name, include_private)
|
42
|
+
return true if @record_container.find_column(name)
|
43
|
+
super
|
44
|
+
end
|
45
|
+
|
46
|
+
def method_missing(name, *args, &block)
|
47
|
+
if args.empty?
|
48
|
+
column = @record_container.find_column(name)
|
49
|
+
return column[@index] if column
|
50
|
+
end
|
51
|
+
super
|
52
|
+
end
|
38
53
|
end
|
39
54
|
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# Copyright 2018 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Arrow
  # Loads an Arrow::Table from a path, dispatching on a format name to one
  # of the private +load_as_<format>+ methods.
  class TableLoader
    class << self
      # Convenience wrapper: builds a loader for +path+ and runs it.
      def load(path, options={})
        loader = new(path, options)
        loader.load
      end
    end

    # +path+ may be a String or any object responding to +to_path+.
    # +options+ may carry +:format+ to select the loader explicitly.
    def initialize(path, options={})
      @path = path
      @options = options
    end

    # Loads the table.
    #
    # The format is taken from the +:format+ option, else guessed from the
    # file extension, else defaults to +:arrow+. Raises ArgumentError,
    # listing the available formats, when no matching +load_as_<format>+
    # method exists.
    def load
      path = @path.respond_to?(:to_path) ? @path.to_path : @path
      format = @options[:format] || guess_format(path) || :arrow

      loader_name = "load_as_#{format}"
      return __send__(loader_name, path) if respond_to?(loader_name, true)

      prefix = "load_as_"
      method_names = (methods(true) | private_methods(true)).map(&:to_s)
      loader_names = method_names.select { |name| name.start_with?(prefix) }
      available_formats = loader_names.map { |name| name[prefix.size..-1] }
      message = "Arrow::Table load format must be one of [" +
                available_formats.join(", ") +
                "]: #{format.inspect}"
      raise ArgumentError, message
    end

    private

    # Returns the lower-cased file extension (without the dot) when a
    # loader for it exists, nil otherwise.
    def guess_format(path)
      # ::File is spelled with a leading :: so the top-level class is used
      # regardless of what File resolves to inside this namespace.
      extension = ::File.extname(path).sub(/\A\./, "").downcase
      return nil if extension.empty?

      respond_to?("load_as_#{extension}", true) ? extension : nil
    end

    # Reads every record batch from +reader+, groups the arrays by column
    # position and assembles a Table from the reader's schema.
    def load_raw(input, reader)
      schema = reader.schema
      arrays_by_column = []
      reader.each do |record_batch|
        record_batch.columns.each_with_index do |array, column_index|
          (arrays_by_column[column_index] ||= []) << array
        end
      end
      columns = schema.fields.each_with_index.map do |field, column_index|
        Column.new(field, ChunkedArray.new(arrays_by_column[column_index]))
      end
      Table.new(schema, columns).tap do |table|
        # NOTE(review): presumably stored so the input stream stays
        # referenced for as long as the table does -- confirm.
        table.instance_variable_set(:@input, input)
      end
    end

    # Tries the file reader first and the stream reader second, each on a
    # freshly opened input. Re-raises the last Arrow::Error when neither
    # reader can be created.
    def load_as_arrow(path)
      input = nil
      reader = nil
      last_error = nil
      [RecordBatchFileReader, RecordBatchStreamReader].each do |reader_class|
        input = MemoryMappedInputStream.new(path)
        begin
          reader = reader_class.new(input)
          break
        rescue Arrow::Error => error
          last_error = error
        end
      end
      raise last_error if reader.nil?
      load_raw(input, reader)
    end

    # Loads +path+ with RecordBatchFileReader.
    def load_as_batch(path)
      input = MemoryMappedInputStream.new(path)
      load_raw(input, RecordBatchFileReader.new(input))
    end

    # Loads +path+ with RecordBatchStreamReader.
    def load_as_stream(path)
      input = MemoryMappedInputStream.new(path)
      load_raw(input, RecordBatchStreamReader.new(input))
    end

    # Delegates to CSVLoader, passing every option except +:format+.
    def load_as_csv(path)
      csv_options = @options.dup.tap { |options| options.delete(:format) }
      CSVLoader.load(Pathname.new(path), csv_options)
    end
  end
end
|