red-arrow 0.8.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/doc/text/development.md +15 -0
- data/doc/text/news.md +14 -0
- data/lib/arrow/array-builder.rb +60 -5
- data/lib/arrow/csv-loader.rb +206 -0
- data/lib/arrow/csv-reader.rb +6 -117
- data/lib/arrow/date32-array-builder.rb +29 -0
- data/lib/arrow/date32-array.rb +27 -0
- data/lib/arrow/date64-array-builder.rb +30 -0
- data/lib/arrow/date64-array.rb +26 -0
- data/lib/arrow/loader.rb +10 -1
- data/lib/arrow/record-batch.rb +11 -37
- data/lib/arrow/record-containable.rb +70 -0
- data/lib/arrow/record.rb +20 -5
- data/lib/arrow/table-loader.rb +117 -0
- data/lib/arrow/table-saver.rb +93 -0
- data/lib/arrow/table.rb +18 -25
- data/lib/arrow/timestamp-array-builder.rb +59 -0
- data/lib/arrow/version.rb +2 -2
- data/test/test-array-builder.rb +80 -42
- data/test/test-csv-loader.rb +79 -0
- data/test/test-csv-reader.rb +5 -66
- data/test/test-date32-array.rb +21 -0
- data/test/test-date64-array.rb +22 -0
- data/test/test-table.rb +64 -10
- metadata +18 -2
@@ -0,0 +1,29 @@
|
|
1
|
+
# Copyright 2018 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Arrow
  # Value conversion for Arrow::Date32ArrayBuilder: turns Ruby date-like
  # objects into the day counts stored by a date32 array.
  class Date32ArrayBuilder
    private

    # The calendar date that corresponds to raw value 0.
    # NOTE: +private+ does not affect constants, so this stays reachable
    # as Arrow::Date32ArrayBuilder::UNIX_EPOCH.
    UNIX_EPOCH = Date.new(1970, 1, 1)

    # Converts +value+ to the number of days between UNIX_EPOCH and it.
    #
    # Any object that responds to +to_date+ is converted with it first.
    # If the result is not a Date it is returned as is (for example an
    # Integer that is already a day count, or nil).
    def convert_to_arrow_value(value)
      date = value.respond_to?(:to_date) ? value.to_date : value
      return date unless date.is_a?(Date)

      # Date - Date yields a Rational number of days.
      (date - UNIX_EPOCH).to_i
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# Copyright 2017 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Arrow
  # Value access for Arrow::Date32Array that returns Ruby Date objects.
  class Date32Array
    # Returns the element at position +i+ as a Date.
    #
    # The raw stored number comes from +get_raw_value+, which is defined
    # outside this file.
    def get_value(i)
      raw_value = get_raw_value(i)
      to_date(raw_value)
    end

    private

    # Julian Day Number of 1970-01-01, i.e. the day raw value 0 maps to.
    UNIX_EPOCH = 2440588

    # Converts a raw day offset from UNIX_EPOCH into a Date.
    def to_date(raw_value)
      julian_day = UNIX_EPOCH + raw_value
      Date.jd(julian_day)
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Copyright 2018 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Arrow
  # Value conversion for Arrow::Date64ArrayBuilder: turns Ruby time-like
  # objects into millisecond counts.
  class Date64ArrayBuilder
    private

    # Converts +value+ to whole milliseconds since the UNIX epoch.
    #
    # Objects that are not a Time but respond to +to_time+ are converted
    # with it first. If the result is still not a Time it is returned as
    # is (for example an Integer that is already a millisecond count).
    def convert_to_arrow_value(value)
      time = value
      time = value.to_time if !value.is_a?(Time) && value.respond_to?(:to_time)
      return time unless time.is_a?(Time)

      # Whole seconds scaled to milliseconds, plus the sub-second part
      # truncated from microseconds to milliseconds.
      time.to_i * 1_000 + time.usec / 1000
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# Copyright 2017 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Arrow
  # Value access for Arrow::Date64Array that returns Ruby DateTime objects.
  class Date64Array
    # Returns the element at position +i+ as a DateTime.
    #
    # The raw stored number comes from +get_raw_value+, which is defined
    # outside this file.
    def get_value(i)
      to_datetime(get_raw_value(i))
    end

    private

    # Converts a raw count of milliseconds since the UNIX epoch into a
    # DateTime.
    #
    # divmod always yields a non-negative remainder, so values before the
    # epoch decode correctly as well (-1500 => -2 s + 500 ms).
    def to_datetime(raw_value)
      seconds, milliseconds = raw_value.divmod(1_000)
      # Time.at's second positional argument is in microseconds, so the
      # millisecond remainder has to be scaled; passing it unscaled would
      # shrink the sub-second part by a factor of 1000.
      Time.at(seconds, milliseconds * 1_000).to_datetime
    end
  end
end
|
data/lib/arrow/loader.rb
CHANGED
@@ -33,14 +33,23 @@ module Arrow
|
|
33
33
|
require "arrow/buffer"
|
34
34
|
require "arrow/chunked-array"
|
35
35
|
require "arrow/column"
|
36
|
+
require "arrow/csv-loader"
|
36
37
|
require "arrow/csv-reader"
|
38
|
+
require "arrow/date32-array"
|
39
|
+
require "arrow/date32-array-builder"
|
40
|
+
require "arrow/date64-array"
|
41
|
+
require "arrow/date64-array-builder"
|
37
42
|
require "arrow/field"
|
43
|
+
require "arrow/record"
|
38
44
|
require "arrow/record-batch"
|
39
45
|
require "arrow/slicer"
|
40
46
|
require "arrow/table"
|
41
47
|
require "arrow/table-formatter"
|
48
|
+
require "arrow/table-loader"
|
49
|
+
require "arrow/table-saver"
|
42
50
|
require "arrow/tensor"
|
43
51
|
require "arrow/timestamp-array"
|
52
|
+
require "arrow/timestamp-array-builder"
|
44
53
|
|
45
54
|
require "arrow/record-batch-file-reader"
|
46
55
|
require "arrow/record-batch-stream-reader"
|
@@ -67,7 +76,7 @@ module Arrow
|
|
67
76
|
method_name = "get_value"
|
68
77
|
end
|
69
78
|
super(info, klass, method_name)
|
70
|
-
when "Arrow::TimestampArray"
|
79
|
+
when "Arrow::TimestampArray", "Arrow::Date32Array", "Arrow::Date64Array"
|
71
80
|
case method_name
|
72
81
|
when "get_value"
|
73
82
|
method_name = "get_raw_value"
|
data/lib/arrow/record-batch.rb
CHANGED
@@ -12,57 +12,31 @@
|
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
|
-
require "arrow/record"
|
15
|
+
require "arrow/record-containable"
|
16
16
|
|
17
17
|
module Arrow
|
18
18
|
class RecordBatch
|
19
|
+
include RecordContainable
|
19
20
|
include Enumerable
|
20
21
|
|
21
|
-
|
22
|
-
unless block_given?
|
23
|
-
return to_enum(__method__, reuse_record: reuse_record)
|
24
|
-
end
|
25
|
-
|
26
|
-
if reuse_record
|
27
|
-
record = Record.new(self, nil)
|
28
|
-
n_rows.times do |i|
|
29
|
-
record.index = i
|
30
|
-
yield(record)
|
31
|
-
end
|
32
|
-
else
|
33
|
-
n_rows.times do |i|
|
34
|
-
yield(Record.new(self, i))
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
def find_column(name_or_index)
|
40
|
-
case name_or_index
|
41
|
-
when String, Symbol
|
42
|
-
name = name_or_index
|
43
|
-
index = resolve_name(name)
|
44
|
-
else
|
45
|
-
index = name_or_index
|
46
|
-
end
|
47
|
-
columns[index]
|
48
|
-
end
|
22
|
+
alias_method :each, :each_record
|
49
23
|
|
50
24
|
alias_method :columns_raw, :columns
|
51
25
|
def columns
|
52
26
|
@columns ||= columns_raw
|
53
27
|
end
|
54
28
|
|
55
|
-
|
56
|
-
|
57
|
-
|
29
|
+
def respond_to_missing?(name, include_private)
|
30
|
+
return true if find_column(name)
|
31
|
+
super
|
58
32
|
end
|
59
33
|
|
60
|
-
def
|
61
|
-
|
62
|
-
|
63
|
-
|
34
|
+
def method_missing(name, *args, &block)
|
35
|
+
if args.empty?
|
36
|
+
column = find_column(name)
|
37
|
+
return column if column
|
64
38
|
end
|
65
|
-
|
39
|
+
super
|
66
40
|
end
|
67
41
|
end
|
68
42
|
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Arrow
  # Column and record access shared by objects that provide +columns+,
  # +schema+ and +n_rows+ (the including class must define all three).
  module RecordContainable
    # Yields every column in order. Without a block an Enumerator is
    # returned instead.
    def each_column(&block)
      if block_given?
        columns.each(&block)
      else
        to_enum(__method__)
      end
    end

    # Yields one Record per row index, from 0 up to n_rows - 1. Without a
    # block an Enumerator is returned instead.
    #
    # With +reuse_record: true+ a single Record is created and only its
    # index is updated between yields, so the yielded object must not be
    # kept across iterations.
    def each_record(reuse_record: false)
      unless block_given?
        return to_enum(__method__, reuse_record: reuse_record)
      end

      shared_record = reuse_record ? Record.new(self, nil) : nil
      n_rows.times do |row_index|
        if shared_record
          shared_record.index = row_index
          yield(shared_record)
        else
          yield(Record.new(self, row_index))
        end
      end
    end

    # Looks a column up by name (String or Symbol) or by Integer index.
    #
    # Returns nil for an unknown name; an index is passed straight to
    # +columns[]+. Raises ArgumentError for any other argument type.
    def find_column(name_or_index)
      case name_or_index
      when Integer
        columns[name_or_index]
      when String, Symbol
        position = resolve_column_name(name_or_index.to_s)
        position && columns[position]
      else
        raise ArgumentError,
              "column name or index must be String, Symbol or Integer"
      end
    end

    private

    # Maps a column name to its index using a lazily built, memoized
    # lookup table. Returns nil when the name is unknown.
    def resolve_column_name(name)
      @column_name_to_index ||= build_column_name_resolve_table
      @column_name_to_index[name]
    end

    # Builds the name => index table from the schema's fields. When two
    # fields share a name the later index wins.
    def build_column_name_resolve_table
      schema.fields.each_with_index.each_with_object({}) do |(field, i), table|
        table[field.name] = i
      end
    end
  end
end
|
data/lib/arrow/record.rb
CHANGED
@@ -15,25 +15,40 @@
|
|
15
15
|
module Arrow
|
16
16
|
class Record
|
17
17
|
attr_accessor :index
|
18
|
-
def initialize(
|
19
|
-
@
|
18
|
+
def initialize(record_container, index)
|
19
|
+
@record_container = record_container
|
20
20
|
@index = index
|
21
21
|
end
|
22
22
|
|
23
23
|
def [](column_name_or_column_index)
|
24
|
-
@
|
24
|
+
column = @record_container.find_column(column_name_or_column_index)
|
25
|
+
return nil if column.nil?
|
26
|
+
column[@index]
|
25
27
|
end
|
26
28
|
|
27
29
|
def columns
|
28
|
-
@
|
30
|
+
@record_container.columns
|
29
31
|
end
|
30
32
|
|
31
33
|
def to_h
|
32
34
|
attributes = {}
|
33
|
-
@
|
35
|
+
@record_container.schema.fields.each_with_index do |field, i|
|
34
36
|
attributes[field.name] = self[i]
|
35
37
|
end
|
36
38
|
attributes
|
37
39
|
end
|
40
|
+
|
41
|
+
def respond_to_missing?(name, include_private)
|
42
|
+
return true if @record_container.find_column(name)
|
43
|
+
super
|
44
|
+
end
|
45
|
+
|
46
|
+
def method_missing(name, *args, &block)
|
47
|
+
if args.empty?
|
48
|
+
column = @record_container.find_column(name)
|
49
|
+
return column[@index] if column
|
50
|
+
end
|
51
|
+
super
|
52
|
+
end
|
38
53
|
end
|
39
54
|
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# Copyright 2018 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Arrow
  # Loads a table from a path, dispatching on a format name to one of the
  # private +load_as_<format>+ methods.
  class TableLoader
    # Convenience wrapper: builds a loader and runs it.
    def self.load(path, options={})
      new(path, options).load
    end

    # +path+ may be a String or anything responding to +to_path+.
    # +options+ may carry +:format+; the whole hash (minus +:format+) is
    # forwarded to CSVLoader when the CSV format is used.
    def initialize(path, options={})
      @path = path
      @options = options
    end

    # Loads the table.
    #
    # The format is taken from +options[:format]+, else guessed from the
    # file extension, else defaults to +:arrow+. Raises ArgumentError
    # listing the available formats when no matching +load_as_<format>+
    # method exists.
    def load
      path = @path.respond_to?(:to_path) ? @path.to_path : @path
      format = @options[:format] || guess_format(path) || :arrow

      loader_name = "load_as_#{format}"
      return __send__(loader_name, path) if respond_to?(loader_name, true)

      # Every public or private method named load_as_* counts as a format.
      prefix = /\Aload_as_/
      candidates = methods(true) | private_methods(true)
      available_formats = candidates.collect(&:to_s).grep(prefix) do |name|
        name.sub(prefix, "")
      end
      raise ArgumentError,
            "Arrow::Table load format must be one of [" \
            "#{available_formats.join(", ")}]: #{format.inspect}"
    end

    private

    # Returns the lower-cased file extension (without the dot) when a
    # loader method exists for it, nil otherwise.
    def guess_format(path)
      suffix = ::File.extname(path).gsub(/\A\./, "").downcase
      if suffix.empty?
        nil
      elsif respond_to?("load_as_#{suffix}", true)
        suffix
      end
    end

    # Reads every record batch from +reader+ and assembles a Table with
    # one chunked column per schema field.
    def load_raw(input, reader)
      schema = reader.schema
      arrays_per_column = []
      reader.each do |record_batch|
        record_batch.columns.each_with_index do |array, i|
          (arrays_per_column[i] ||= []) << array
        end
      end
      columns = schema.fields.each_with_index.collect do |field, i|
        Column.new(field, ChunkedArray.new(arrays_per_column[i]))
      end
      Table.new(schema, columns).tap do |table|
        # NOTE(review): presumably stored so the input outlives the
        # reader while the table still refers to its data - confirm.
        table.instance_variable_set(:@input, input)
      end
    end

    # Tries the file reader first and falls back to the stream reader,
    # opening a fresh input for each attempt. When both fail, the last
    # Arrow::Error is re-raised.
    def load_as_arrow(path)
      input = nil
      reader = nil
      error = nil
      [RecordBatchFileReader, RecordBatchStreamReader].each do |reader_class|
        input = MemoryMappedInputStream.new(path)
        begin
          reader = reader_class.new(input)
          break
        rescue Arrow::Error => e
          error = e
        end
      end
      raise error if reader.nil?
      load_raw(input, reader)
    end

    # Loads using RecordBatchFileReader only.
    def load_as_batch(path)
      input = MemoryMappedInputStream.new(path)
      load_raw(input, RecordBatchFileReader.new(input))
    end

    # Loads using RecordBatchStreamReader only.
    def load_as_stream(path)
      input = MemoryMappedInputStream.new(path)
      load_raw(input, RecordBatchStreamReader.new(input))
    end

    # Delegates to CSVLoader with a copy of the options that no longer
    # contains +:format+; the caller's hash is left untouched.
    def load_as_csv(path)
      csv_options = @options.dup
      csv_options.delete(:format)
      CSVLoader.load(Pathname.new(path), csv_options)
    end
  end
end
|