red-arrow 0.4.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -2
- data/dependency-check/Rakefile +1 -1
- data/doc/text/news.md +37 -1
- data/image/red-arrow.png +0 -0
- data/lib/arrow/array-builder.rb +38 -34
- data/lib/arrow/array.rb +11 -5
- data/lib/arrow/chunked-array.rb +18 -2
- data/lib/arrow/column.rb +4 -0
- data/lib/arrow/csv-reader.rb +162 -0
- data/lib/arrow/loader.rb +14 -12
- data/lib/arrow/slicer.rb +391 -0
- data/lib/arrow/table-formatter.rb +88 -0
- data/lib/arrow/table.rb +309 -3
- data/lib/arrow/timestamp-array.rb +69 -0
- data/lib/arrow/version.rb +1 -1
- data/red-arrow.gemspec +1 -0
- data/test/fixture/with-header.csv +4 -0
- data/test/fixture/without-header.csv +3 -0
- data/test/helper.rb +3 -0
- data/test/helper/fixture.rb +25 -0
- data/test/test-array.rb +6 -0
- data/test/test-column.rb +2 -2
- data/test/test-csv-reader.rb +90 -0
- data/test/test-slicer.rb +401 -0
- data/test/test-table.rb +321 -13
- data/test/test-timestamp-array.rb +23 -0
- metadata +24 -7
data/lib/arrow/table.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2017 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
4
|
# you may not use this file except in compliance with the License.
|
@@ -14,7 +14,31 @@
|
|
14
14
|
|
15
15
|
module Arrow
|
16
16
|
class Table
|
17
|
-
|
17
|
+
alias_method :initialize_raw, :initialize

# Creates a table from one of three argument shapes.
#
# * +(schema, columns)+: both values are forwarded unchanged to the
#   original constructor (kept as +initialize_raw+).
# * +(columns)+: when the first element is a Column, the schema is built
#   from each column's field.
# * +(raw_table)+: anything else is iterated as (name, array) pairs; each
#   pair becomes a Field (named +name.to_s+, typed by
#   +array.value_data_type+) and a Column wrapping the array.
#
# @param schema_or_raw_table_or_columns the schema, the column list or
#   the name => array mapping, depending on the call shape.
# @param columns the columns matching the schema, or +nil+ to use one of
#   the single-argument shapes.
def initialize(schema_or_raw_table_or_columns, columns=nil)
  unless columns.nil?
    initialize_raw(schema_or_raw_table_or_columns, columns)
    return
  end

  source = schema_or_raw_table_or_columns
  if source[0].is_a?(Column)
    columns = source
    fields = columns.collect(&:field)
  else
    fields = []
    columns = []
    source.each do |name, array|
      field = Field.new(name.to_s, array.value_data_type)
      fields << field
      columns << Column.new(field, array)
    end
  end
  initialize_raw(Schema.new(fields), columns)
end
|
40
|
+
|
41
|
+
def each_column
|
18
42
|
return to_enum(__method__) unless block_given?
|
19
43
|
|
20
44
|
n_columns.times do |i|
|
@@ -23,7 +47,289 @@ module Arrow
|
|
23
47
|
end
|
24
48
|
|
25
49
|
# Returns every column of the table as an Array.
#
# The array is built from +each_column+ on first use and then kept in
# +@columns+, so later calls return the same array object.
#
# @return [::Array] the columns.
def columns
  @columns || (@columns = each_column.to_a)
end
|
52
|
+
|
53
|
+
# Yields the record batches produced by a TableBatchReader over this
# table, one at a time, until the reader returns a falsy value.
#
# @yield [record_batch] each value returned by +reader.read_next+.
# @return [Enumerator, nil] an enumerator when no block is given.
def each_record_batch
  return to_enum(__method__) unless block_given?

  reader = TableBatchReader.new(self)
  record_batch = reader.read_next
  while record_batch
    yield(record_batch)
    record_batch = reader.read_next
  end
end
|
61
|
+
|
62
|
+
# Looks columns up by name.
#
# With exactly one String or Symbol argument, returns the column with
# that name, or +nil+ when no column matches. With any other number of
# arguments, returns a new table made of the named columns in the order
# given.
#
# @param args [::Array<String, Symbol>] the column name(s).
# @return [Arrow::Column, Arrow::Table, nil]
# @raise [ArgumentError] when the single argument is neither a String
#   nor a Symbol, or when one of several names matches no column.
def [](*args)
  if args.size == 1
    case args[0]
    when String, Symbol
      find_column(args[0])
    else
      message = "#{self.class}\#[#{args[0].inspect}]: " +
        "Must be String or Symbol"
      raise ArgumentError, message
    end
  else
    new_columns = args.collect do |column_name|
      column = find_column(column_name)
      if column.nil?
        message = "Unknown column: <#{column_name.inspect}>: #{inspect}"
        raise ArgumentError, message
      end
      column
    end
    # The new table holds only the selected columns, so its schema has to
    # be rebuilt from their fields; reusing the receiver's schema would
    # describe columns the new table does not contain.
    new_fields = new_columns.collect(&:field)
    self.class.new(Schema.new(new_fields), new_columns)
  end
end
|
87
|
+
|
88
|
+
# Builds a new table from the rows selected by the given slicers.
#
# When a block is given it is called with a Slicer for this table; a
# non-nil result is added to the slicers (an ::Array result contributes
# each of its elements). A slicer that responds to +evaluate+ is replaced
# by the result of that call before it is interpreted:
#
# * Integer: that single row.
# * Range: rows from +first+ to +last+ (+last - 1+ when the end is
#   excluded).
# * ::Array: read as +[from, length]+, i.e. rows +from+ through
#   +from + length - 1+.
#   NOTE(review): the error message below words this as "[from, to]".
# * BooleanArray: every run of consecutive truthy entries.
#
# @return [Arrow::Table]
# @raise [ArgumentError] when a slicer is none of the above.
def slice(*slicers)
  if block_given?
    block_slicer = yield(Slicer.new(self))
    case block_slicer
    when nil
      # Nothing to add.
    when ::Array
      slicers.concat(block_slicer)
    else
      slicers << block_slicer
    end
  end

  ranges = slicers.each_with_object([]) do |slicer, collected|
    slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
    case slicer
    when Integer
      collected << [slicer, slicer]
    when Range
      from = slicer.first
      to = slicer.last
      to -= 1 if slicer.exclude_end?
      collected << [from, to]
    when ::Array
      from = slicer[0]
      collected << [from, from + slicer[1] - 1]
    when BooleanArray
      # run_start is the index where the current run of selected rows
      # began, or nil while outside of a run.
      run_start = nil
      slicer.each_with_index do |is_target, i|
        if is_target
          run_start ||= i
        elsif run_start
          collected << [run_start, i - 1]
          run_start = nil
        end
      end
      collected << [run_start, slicer.length - 1] if run_start
    else
      message = "slicer must be Integer, Range, [from, to] or " +
        "Arrow::BooleanArray, Arrow::Slicer::Condition: #{slicer.inspect}"
      raise ArgumentError, message
    end
  end
  slice_by_ranges(ranges)
end
|
146
|
+
|
147
|
+
# Returns a new table combining this table's columns with +other+.
#
# For a Hash, each key is converted with +to_s+; a truthy value is turned
# into a column through +ensure_column+ and a falsy value marks the
# column of that name for removal. For a Table, all of its columns are
# used. A column whose name already exists here replaces the existing
# one in place; the remaining new columns are appended at the end.
#
# @param other [Hash, Arrow::Table] the columns to merge in.
# @return [Arrow::Table]
# @raise [ArgumentError] when +other+ is neither a Hash nor a Table.
def merge(other)
  incoming = {}
  dropped = {}

  case other
  when Hash
    other.each do |name, value|
      key = name.to_s
      if value
        incoming[key] = ensure_column(key, value)
      else
        dropped[key] = true
      end
    end
  when Table
    other.columns.each {|column| incoming[column.name] = column}
  else
    raise ArgumentError,
          "merge target must be Hash or Arrow::Table: " +
          "<#{other.inspect}>: #{inspect}"
  end

  merged = []
  columns.each do |column|
    current_name = column.name
    replacement = incoming.delete(current_name)
    if replacement
      merged << replacement
    elsif !dropped.key?(current_name)
      merged << column
    end
  end
  # Whatever is left in incoming did not replace an existing column.
  merged.concat(incoming.values)

  self.class.new(Schema.new(merged.collect(&:field)), merged)
end
|
194
|
+
|
195
|
+
alias_method :remove_column_raw, :remove_column

# Removes a column identified by name or by position, delegating to the
# original implementation (kept as +remove_column_raw+) with the
# resolved index.
#
# @param name_or_index [String, Symbol, Integer] a column name, or an
#   index; a negative index has +n_columns+ added to it first.
# @return the result of +remove_column_raw+.
# @raise [KeyError] when no column has the given name.
# @raise [IndexError] when the index is outside +0...n_columns+.
def remove_column(name_or_index)
  index =
    case name_or_index
    when String, Symbol
      wanted = name_or_index.to_s
      position = columns.find_index {|column| column.name == wanted}
      if position.nil?
        raise KeyError, "unknown column: #{name_or_index.inspect}: #{inspect}"
      end
      position
    else
      position = name_or_index
      position += n_columns if position < 0
      if position < 0 || position >= n_columns
        raise IndexError,
              "out of index (0..#{n_columns - 1}): " +
              "#{name_or_index.inspect}: #{inspect}"
      end
      position
    end
  remove_column_raw(index)
end
|
216
|
+
|
217
|
+
# Returns a new table holding only the selected columns.
#
# Without selectors, the block decides which columns are kept (an
# enumerator is returned when there is no block either). With selectors,
# a String or Symbol picks a column by name, a Range picks
# +columns[range]+, and any other value is used as an index into
# +columns+; a block, when given, then filters that selection.
#
# @return [Arrow::Table, Enumerator]
# @raise [KeyError] when a name matches no column.
# @raise [IndexError] when an index selects no column.
def select_columns(*selectors, &block)
  if selectors.empty?
    return to_enum(__method__) unless block_given?
    picked = columns.select(&block)
  else
    picked = selectors.each_with_object([]) do |selector, found|
      case selector
      when String, Symbol
        column = find_column(selector)
        if column.nil?
          raise KeyError, "unknown column: #{selector.inspect}: #{inspect}"
        end
        found << column
      when Range
        found.concat(columns[selector])
      else
        column = columns[selector]
        if column.nil?
          raise IndexError,
                "out of index (0..#{n_columns - 1}): " +
                "#{selector.inspect}: #{inspect}"
        end
        found << column
      end
    end
    picked = picked.select(&block) if block_given?
  end
  self.class.new(picked)
end
|
248
|
+
|
249
|
+
# Renders the table as text by delegating to TableFormatter.
#
# @param options passed through to TableFormatter.new.
# @return the result of TableFormatter#format.
def to_s(options={})
  TableFormatter.new(self, options).format
end
|
253
|
+
|
254
|
+
# Returns the inherited inspect text followed, on a new line, by the
# table's +to_s+ output.
#
# @return [String]
def inspect
  inherited = super
  "#{inherited}\n#{to_s}"
end
|
257
|
+
|
258
|
+
# Reports a method name as supported when a column of that name exists,
# and otherwise defers to the inherited answer.
#
# @return [Boolean]
def respond_to_missing?(name, include_private)
  if find_column(name)
    true
  else
    super
  end
end
|
262
|
+
|
263
|
+
# Resolves an unknown, argument-less method call to the column of the
# same name; every other call falls through to the inherited handling.
#
# @return [Arrow::Column] the matching column.
def method_missing(name, *args, &block)
  column = args.empty? ? find_column(name) : nil
  column || super
end
|
270
|
+
|
271
|
+
private

# Finds the first column whose name equals +name.to_s+.
#
# @param name [#to_s] the column name to look for.
# @return [Arrow::Column, nil] the column, or +nil+ when none matches.
def find_column(name)
  wanted = name.to_s
  columns.find {|column| column.name == wanted}
end
|
278
|
+
|
279
|
+
# Builds a new table whose columns hold only the rows covered by
# +ranges+.
#
# Each column's chunks are walked front to back exactly once, so the
# bookkeeping below only ever moves forward.
# NOTE(review): this appears to rely on the ranges being ascending and
# non-overlapping - confirm against the callers.
#
# @param ranges [::Array] pairs of inclusive +[from, to]+ row indexes.
# @return [Arrow::Table]
def slice_by_ranges(ranges)
  sliced_columns = columns.collect do |column|
    pieces = []
    pending = column.data.each_chunk.to_a
    # position: absolute row index reached so far.
    # consumed: rows of the first pending chunk that are already behind
    # position.
    position = 0
    consumed = 0
    ranges.each do |from, to|
      remaining = to - from + 1
      while remaining > 0
        # Drop whole chunks that end before the range starts.
        while position + pending.first.length - consumed < from
          position += pending.first.length - consumed
          pending.shift
          consumed = 0
        end
        # Step over the leading rows of the current chunk, if any.
        if position < from
          gap = from - position
          position += gap
          consumed += gap
        end
        current = pending.first
        current_length = current.length
        unread = current_length - consumed
        if remaining < unread
          # The range ends inside this chunk: take only what is needed.
          pieces << current.slice(consumed, remaining)
          position += remaining
          consumed += remaining
          remaining = 0
        else
          # The range uses up the rest of this chunk.
          pieces << current.slice(consumed, current_length)
          position += unread
          remaining -= unread
          consumed = 0
          pending.shift
        end
      end
    end
    Column.new(column.field, ChunkedArray.new(pieces))
  end

  self.class.new(schema, sliced_columns)
end
|
320
|
+
|
321
|
+
# Turns +data+ into a column named +name+.
#
# An Array (as resolved inside this class) is wrapped in a new Column
# whose Field takes its type from +data.value_data_type+; a Column is
# returned unchanged.
#
# @param name [String] the field name used when wrapping an array.
# @param data the array or column.
# @return [Arrow::Column]
# @raise [ArgumentError] when +data+ is neither of the two.
def ensure_column(name, data)
  if data.is_a?(Array)
    Column.new(Field.new(name, data.value_data_type), data)
  elsif data.is_a?(Column)
    data
  else
    raise ArgumentError,
          "column must be Arrow::Array or Arrow::Column: " +
          "<#{name}>: <#{data.inspect}>: #{inspect}"
  end
end
|
28
334
|
end
|
29
335
|
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# Copyright 2017 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Arrow
  # Ruby-side additions for timestamp arrays: a (unit, values)
  # constructor and conversion of raw integer values into Time objects.
  class TimestampArray
    class << self
      # Builds a timestamp array through TimestampArrayBuilder.
      #
      # @param unit the time unit handed to TimestampDataType.new.
      # @param values the values handed to the builder's +build+.
      # @return the result of TimestampArrayBuilder#build.
      def new(unit, values)
        data_type = TimestampDataType.new(unit)
        builder = TimestampArrayBuilder.new(data_type)
        builder.build(values)
      end
    end

    # Returns the +i+-th value converted to a Time.
    #
    # @param i the index passed to +get_raw_value+.
    # @return [Time]
    def get_value(i)
      to_time(get_raw_value(i))
    end

    # Returns the time unit of this array's data type (memoized).
    def unit
      @unit ||= value_data_type_unit
    end

    private
    # The unit's nick as a Symbol (memoized); +to_time+ branches on
    # :second, :milli and :micro and treats anything else as nanoseconds.
    def unit_id
      @unit_id ||= unit.nick.to_sym
    end

    # Reads the unit from the value data type. When the data type has no
    # +unit+ method, the unit is derived from the "[s]" / "[ms]" / "[us]"
    # suffix of its string form, defaulting to nanoseconds.
    def value_data_type_unit
      data_type = value_data_type
      if data_type.respond_to?(:unit)
        data_type.unit
      else
        data_type_name = data_type.to_s
        if data_type_name.end_with?("[s]")
          TimeUnit::SECOND
        elsif data_type_name.end_with?("[ms]")
          TimeUnit::MILLI
        elsif data_type_name.end_with?("[us]")
          TimeUnit::MICRO
        else
          TimeUnit::NANO
        end
      end
    end

    # Converts a raw integer timestamp, expressed in this array's unit,
    # into a Time.
    #
    # Time.at(seconds, x) reads its second argument as microseconds, so
    # sub-second remainders are converted to microseconds first.
    #
    # @param raw_value [Integer] the raw timestamp value.
    # @return [Time]
    def to_time(raw_value)
      case unit_id
      when :second
        Time.at(raw_value)
      when :milli
        seconds, milliseconds = raw_value.divmod(1_000)
        Time.at(seconds, milliseconds * 1_000)
      when :micro
        Time.at(*raw_value.divmod(1_000_000))
      else
        # A Rational keeps the full nanosecond precision; going through
        # Float division does not.
        seconds, nanoseconds = raw_value.divmod(1_000_000_000)
        Time.at(seconds, Rational(nanoseconds, 1_000))
      end
    end
  end
end
|
data/lib/arrow/version.rb
CHANGED
data/red-arrow.gemspec
CHANGED
@@ -38,6 +38,7 @@ Gem::Specification.new do |spec|
|
|
38
38
|
spec.files = ["README.md", "Rakefile", "Gemfile", "#{spec.name}.gemspec"]
|
39
39
|
spec.files += [".yardopts"]
|
40
40
|
spec.files += Dir.glob("lib/**/*.rb")
|
41
|
+
spec.files += Dir.glob("image/*.*")
|
41
42
|
spec.files += Dir.glob("doc/text/*")
|
42
43
|
spec.test_files += Dir.glob("test/**/*")
|
43
44
|
spec.extensions = ["dependency-check/Rakefile"]
|
data/test/helper.rb
CHANGED
@@ -0,0 +1,25 @@
|
|
1
|
+
# Copyright 2017 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Helper
  # Helpers for locating test fixture files.
  module Fixture
    # Returns the absolute path of the "fixture" directory that sits next
    # to this file's directory.
    #
    # @return [Pathname]
    def fixture_dir
      relative = File.join("..", "fixture")
      Pathname.new(File.expand_path(relative, __dir__))
    end

    # Returns the path of a fixture file.
    #
    # @param components path components below the fixture directory.
    # @return [Pathname]
    def fixture_path(*components)
      base = fixture_dir
      base.join(*components)
    end
  end
end
|