red-arrow 0.4.1 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3 -2
- data/dependency-check/Rakefile +1 -1
- data/doc/text/news.md +37 -1
- data/image/red-arrow.png +0 -0
- data/lib/arrow/array-builder.rb +38 -34
- data/lib/arrow/array.rb +11 -5
- data/lib/arrow/chunked-array.rb +18 -2
- data/lib/arrow/column.rb +4 -0
- data/lib/arrow/csv-reader.rb +162 -0
- data/lib/arrow/loader.rb +14 -12
- data/lib/arrow/slicer.rb +391 -0
- data/lib/arrow/table-formatter.rb +88 -0
- data/lib/arrow/table.rb +309 -3
- data/lib/arrow/timestamp-array.rb +69 -0
- data/lib/arrow/version.rb +1 -1
- data/red-arrow.gemspec +1 -0
- data/test/fixture/with-header.csv +4 -0
- data/test/fixture/without-header.csv +3 -0
- data/test/helper.rb +3 -0
- data/test/helper/fixture.rb +25 -0
- data/test/test-array.rb +6 -0
- data/test/test-column.rb +2 -2
- data/test/test-csv-reader.rb +90 -0
- data/test/test-slicer.rb +401 -0
- data/test/test-table.rb +321 -13
- data/test/test-timestamp-array.rb +23 -0
- metadata +24 -7
data/lib/arrow/table.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2017 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
4
|
# you may not use this file except in compliance with the License.
|
@@ -14,7 +14,31 @@
|
|
14
14
|
|
15
15
|
module Arrow
|
16
16
|
class Table
|
17
|
-
|
17
|
+
alias_method :initialize_raw, :initialize

# Creates a table from one of three argument shapes.
#
# * `(schema, columns)`: both are forwarded unchanged to the original
#   constructor (kept as `initialize_raw`).
# * `(columns)`: used when the first element is an `Arrow::Column`;
#   the schema is built from each column's field.
# * `(raw_table)`: anything else is iterated as `name, array` pairs;
#   a field and a column are created per pair.
#
# @param schema_or_raw_table_or_columns the schema, the column list or
#   the name/array pairs, depending on the shape described above.
# @param columns the columns when a schema is given, `nil` otherwise.
def initialize(schema_or_raw_table_or_columns, columns=nil)
  source = schema_or_raw_table_or_columns
  if !columns.nil?
    schema = source
  elsif source[0].is_a?(Column)
    columns = source
    schema = Schema.new(columns.map(&:field))
  else
    fields = []
    columns = []
    source.each do |name, array|
      new_field = Field.new(name.to_s, array.value_data_type)
      fields.push(new_field)
      columns.push(Column.new(new_field, array))
    end
    schema = Schema.new(fields)
  end
  initialize_raw(schema, columns)
end
|
40
|
+
|
41
|
+
def each_column
|
18
42
|
return to_enum(__method__) unless block_given?
|
19
43
|
|
20
44
|
n_columns.times do |i|
|
@@ -23,7 +47,289 @@ module Arrow
|
|
23
47
|
end
|
24
48
|
|
25
49
|
# Returns every column of this table as an array.
#
# The array is computed from `each_column` on first use and then
# cached in `@columns`, so later calls return the same array object.
def columns
  return @columns if @columns

  @columns = each_column.to_a
end
|
52
|
+
|
53
|
+
# Yields each record batch read from this table via `TableBatchReader`.
#
# Reading stops as soon as `read_next` returns a falsy value.
# Without a block, an enumerator for this method is returned.
def each_record_batch
  unless block_given?
    return to_enum(__method__)
  end

  batch_reader = TableBatchReader.new(self)
  while (batch = batch_reader.read_next)
    yield(batch)
  end
end
|
61
|
+
|
62
|
+
# Looks up columns by name.
#
# With exactly one argument, the argument must be a String or a Symbol
# and the column found by `find_column` is returned (`nil` when no
# column has that name).
#
# With any other number of arguments, every argument is resolved with
# `find_column` and a new table holding those columns, in the given
# order, is returned.
#
# @raise [ArgumentError] when the single argument is neither String
#   nor Symbol, or when one of several names matches no column.
#
# @return [Arrow::Column, Arrow::Table, nil]
def [](*args)
  if args.size == 1
    name = args[0]
    case name
    when String, Symbol
      find_column(name)
    else
      message = "#{self.class}\#[#{name.inspect}]: " +
        "Must be String or Symbol"
      raise ArgumentError, message
    end
  else
    new_columns = args.collect do |column_name|
      column = find_column(column_name)
      if column.nil?
        message = "Unknown column: <#{column_name.inspect}>: #{inspect}"
        raise ArgumentError, message
      end
      column
    end
    # Let the constructor derive the schema from the selected columns.
    # Reusing this table's full schema would describe columns that were
    # not selected (and in the wrong order when names are reordered).
    self.class.new(new_columns)
  end
end
|
87
|
+
|
88
|
+
# Extracts rows and returns them as a new table.
#
# Each slicer is first replaced by `slicer.evaluate` when it responds
# to `evaluate`, and is then turned into inclusive `[from, to]` row
# ranges:
#
# * Integer: that single row.
# * Range: `first` to `last` (`last - 1` when the end is excluded).
# * ::Array: `[from, length]`, i.e. `from` to `from + length - 1`.
# * Arrow::BooleanArray: one range per run of consecutive truthy
#   values.
#
# When a block is given it is called with a new `Slicer` for this
# table; a `nil` result is ignored, an `::Array` result is appended
# element by element, and any other result is appended as one slicer.
#
# @raise [ArgumentError] for any other kind of slicer.
#
# @return [Arrow::Table]
def slice(*slicers)
  if block_given?
    from_block = yield(Slicer.new(self))
    if from_block.is_a?(::Array)
      slicers.concat(from_block)
    elsif !from_block.nil?
      slicers << from_block
    end
  end

  ranges = []
  slicers.each do |slicer|
    slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
    case slicer
    when Integer
      ranges << [slicer, slicer]
    when Range
      last_row = slicer.last
      last_row -= 1 if slicer.exclude_end?
      ranges << [slicer.first, last_row]
    when ::Array
      first_row = slicer[0]
      ranges << [first_row, first_row + slicer[1] - 1]
    when BooleanArray
      # Index where the current run of selected rows began, or nil
      # while outside of a run.
      run_start = nil
      slicer.each_with_index do |selected, index|
        if selected
          run_start ||= index
        elsif run_start
          ranges << [run_start, index - 1]
          run_start = nil
        end
      end
      ranges << [run_start, slicer.length - 1] if run_start
    else
      message = "slicer must be Integer, Range, [from, to] or " +
        "Arrow::BooleanArray, Arrow::Slicer::Condition: #{slicer.inspect}"
      raise ArgumentError, message
    end
  end
  slice_by_ranges(ranges)
end
|
146
|
+
|
147
|
+
# Returns a new table with columns added, replaced or removed.
#
# * Hash: each key is converted with `to_s`. A truthy value is turned
#   into a column by `ensure_column` and added (or replaces the column
#   of the same name); a falsy value removes the column of that name.
# * Arrow::Table: each of its columns is added or replaces the column
#   of the same name.
#
# Existing columns keep their position; columns whose name did not
# exist before are appended after them.
#
# @raise [ArgumentError] when `other` is neither Hash nor Arrow::Table.
#
# @return [Arrow::Table]
def merge(other)
  added_columns = {}
  removed_columns = {}

  case other
  when Hash
    other.each do |key, value|
      column_name = key.to_s
      if value
        added_columns[column_name] = ensure_column(column_name, value)
      else
        removed_columns[column_name] = true
      end
    end
  when Table
    other.columns.each {|column| added_columns[column.name] = column}
  else
    message = "merge target must be Hash or Arrow::Table: " +
      "<#{other.inspect}>: #{inspect}"
    raise ArgumentError, message
  end

  merged_columns = []
  columns.each do |column|
    column_name = column.name
    replacement = added_columns.delete(column_name)
    if replacement
      merged_columns << replacement
    elsif !removed_columns.key?(column_name)
      merged_columns << column
    end
  end
  # Whatever is left was not a replacement, so it goes to the end.
  merged_columns.concat(added_columns.values)

  merged_fields = merged_columns.map(&:field)
  self.class.new(Schema.new(merged_fields), merged_columns)
end
|
194
|
+
|
195
|
+
alias_method :remove_column_raw, :remove_column

# Removes one column, identified by name or by position, through the
# original `remove_column` (kept as `remove_column_raw`).
#
# A String or Symbol is matched against column names. Any other value
# is used as an index; a negative index counts from the end.
#
# @raise [KeyError] when no column has the given name.
# @raise [IndexError] when the index is outside `0..n_columns - 1`.
def remove_column(name_or_index)
  if name_or_index.is_a?(String) || name_or_index.is_a?(Symbol)
    target_name = name_or_index.to_s
    index = columns.index {|column| column.name == target_name}
    unless index
      raise KeyError.new("unknown column: #{name_or_index.inspect}: #{inspect}")
    end
  else
    index = name_or_index
    index += n_columns if index < 0
    out_of_range = (index < 0 || index >= n_columns)
    if out_of_range
      message = "out of index (0..#{n_columns - 1}): " +
        "#{name_or_index.inspect}: #{inspect}"
      raise IndexError.new(message)
    end
  end
  remove_column_raw(index)
end
|
216
|
+
|
217
|
+
# Builds a new table from a selection of this table's columns.
#
# Each selector contributes columns in the order given:
#
# * String or Symbol: the column with that name.
# * Range: the columns at those positions.
# * anything else: used as an index into the column list.
#
# When a block is given, only the selected columns for which it
# returns a truthy value are kept. Without selectors the block filters
# all columns; without selectors and without a block an enumerator for
# this method is returned.
#
# @raise [KeyError] when a name matches no column.
# @raise [IndexError] when an index or a range is out of bounds.
#
# @return [Arrow::Table]
def select_columns(*selectors, &block)
  if selectors.empty?
    return to_enum(__method__) unless block_given?
    selected_columns = columns.select(&block)
  else
    selected_columns = []
    selectors.each do |selector|
      case selector
      when String, Symbol
        column = find_column(selector)
        if column.nil?
          message = "unknown column: #{selector.inspect}: #{inspect}"
          raise KeyError.new(message)
        end
        selected_columns << column
      when Range
        columns_in_range = columns[selector]
        # Indexing with a range that starts past the end returns nil;
        # report it like an out-of-bounds index instead of letting
        # concat(nil) fail with a TypeError.
        if columns_in_range.nil?
          message = "out of index (0..#{n_columns - 1}): " +
            "#{selector.inspect}: #{inspect}"
          raise IndexError.new(message)
        end
        selected_columns.concat(columns_in_range)
      else
        column = columns[selector]
        if column.nil?
          message = "out of index (0..#{n_columns - 1}): " +
            "#{selector.inspect}: #{inspect}"
          raise IndexError.new(message)
        end
        selected_columns << column
      end
    end
    selected_columns = selected_columns.select(&block) if block_given?
  end
  self.class.new(selected_columns)
end
|
248
|
+
|
249
|
+
# Formats this table as text by delegating to `TableFormatter`.
#
# @param options passed through to `TableFormatter.new` unchanged.
def to_s(options={})
  TableFormatter.new(self, options).format
end
|
253
|
+
|
254
|
+
# Returns the inherited `inspect` text followed, on a new line, by the
# formatted table from `to_s`.
def inspect
  summary = super
  "#{summary}\n#{to_s}"
end
|
257
|
+
|
258
|
+
# Reports a column name as a supported method, so that it agrees with
# `method_missing`; every other name is left to the inherited check.
def respond_to_missing?(name, include_private)
  find_column(name) ? true : super
end
|
262
|
+
|
263
|
+
# Lets a column be read as `table.column_name`.
#
# Only calls without arguments are treated as column lookups; when no
# column matches, or arguments were given, the call falls through to
# the inherited `method_missing`.
def method_missing(name, *args, &block)
  column = args.empty? ? find_column(name) : nil
  return column if column

  super
end
|
270
|
+
|
271
|
+
private
|
272
|
+
# Returns the first column whose name equals `name.to_s`, or `nil`
# when there is none.
def find_column(name)
  wanted = name.to_s
  columns.find {|column| column.name == wanted}
end
|
278
|
+
|
279
|
+
# Builds a new table that keeps only the rows covered by `ranges`.
#
# `ranges` is a list of inclusive `[from, to]` row index pairs. Every
# column is walked chunk by chunk with a single forward-only cursor;
# NOTE(review): that appears to require the ranges to be in ascending
# order and non-overlapping - confirm with callers.
def slice_by_ranges(ranges)
  sliced_columns = columns.map do |column|
    pieces = []
    pending = column.data.each_chunk.to_a
    # Row index, in the whole column, of the cursor.
    position = 0
    # Index of the cursor inside the chunk at the head of `pending`.
    cursor = 0
    ranges.each do |from, to|
      remaining = to - from + 1
      until remaining <= 0
        # Drop whole chunks that end before the range starts.
        while position + pending.first.length - cursor < from
          position += pending.first.length - cursor
          pending.shift
          cursor = 0
        end
        # Skip the leading rows of the current chunk up to `from`.
        if position < from
          cursor += from - position
          position = from
        end
        chunk = pending.first
        chunk_length = chunk.length
        available = chunk_length - cursor
        if available <= remaining
          # The rest of this chunk is wanted; move on to the next one.
          pieces << chunk.slice(cursor, chunk_length)
          position += available
          remaining -= available
          cursor = 0
          pending.shift
        else
          # The range ends inside this chunk.
          pieces << chunk.slice(cursor, remaining)
          position += remaining
          cursor += remaining
          remaining = 0
        end
      end
    end
    Column.new(column.field, ChunkedArray.new(pieces))
  end

  self.class.new(schema, sliced_columns)
end
|
320
|
+
|
321
|
+
# Converts `data` to a column named `name`.
#
# An `Arrow::Array` is wrapped in a new column whose field uses the
# array's value data type; an `Arrow::Column` is returned unchanged.
#
# @raise [ArgumentError] for any other kind of `data`.
def ensure_column(name, data)
  if data.is_a?(Array)
    Column.new(Field.new(name, data.value_data_type), data)
  elsif data.is_a?(Column)
    data
  else
    message = "column must be Arrow::Array or Arrow::Column: " +
      "<#{name}>: <#{data.inspect}>: #{inspect}"
    raise ArgumentError, message
  end
end
|
28
334
|
end
|
29
335
|
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# Copyright 2017 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Arrow
|
16
|
+
class TimestampArray
  class << self
    # Builds a timestamp array.
    #
    # @param unit the time unit handed to `TimestampDataType.new`.
    # @param values the values handed to `TimestampArrayBuilder#build`.
    # @return whatever `TimestampArrayBuilder#build` returns.
    def new(unit, values)
      data_type = TimestampDataType.new(unit)
      builder = TimestampArrayBuilder.new(data_type)
      builder.build(values)
    end
  end

  # Returns the value at index `i` converted to a `Time`.
  def get_value(i)
    to_time(get_raw_value(i))
  end

  # Returns the time unit of this array's data type (memoized).
  def unit
    @unit ||= value_data_type_unit
  end

  private
  # The unit's nick as a Symbol (memoized); `to_time` dispatches on
  # :second, :milli and :micro and treats anything else as nanoseconds.
  def unit_id
    @unit_id ||= unit.nick.to_sym
  end

  # Reads the unit from the data type. When the data type has no `unit`
  # method, the unit is guessed from the "[s]" / "[ms]" / "[us]" suffix
  # of its string form, falling back to nanoseconds.
  def value_data_type_unit
    data_type = value_data_type
    if data_type.respond_to?(:unit)
      data_type.unit
    else
      data_type_name = data_type.to_s
      if data_type_name.end_with?("[s]")
        TimeUnit::SECOND
      elsif data_type_name.end_with?("[ms]")
        TimeUnit::MILLI
      elsif data_type_name.end_with?("[us]")
        TimeUnit::MICRO
      else
        TimeUnit::NANO
      end
    end
  end

  # Converts a raw integer timestamp, counted in this array's unit
  # since the epoch, into a `Time`.
  def to_time(raw_value)
    case unit_id
    when :second
      Time.at(raw_value)
    when :milli
      # The second argument of Time.at is in microseconds, so the
      # millisecond remainder has to be scaled before passing it.
      seconds, milliseconds = raw_value.divmod(1_000)
      Time.at(seconds, milliseconds * 1_000)
    when :micro
      Time.at(*raw_value.divmod(1_000_000))
    else
      # Keep the arithmetic exact: a Float cannot hold a present-day
      # epoch value with nanosecond resolution.
      seconds, nanoseconds = raw_value.divmod(1_000_000_000)
      Time.at(seconds, Rational(nanoseconds, 1_000))
    end
  end
end
|
69
|
+
end
|
data/lib/arrow/version.rb
CHANGED
data/red-arrow.gemspec
CHANGED
@@ -38,6 +38,7 @@ Gem::Specification.new do |spec|
|
|
38
38
|
spec.files = ["README.md", "Rakefile", "Gemfile", "#{spec.name}.gemspec"]
|
39
39
|
spec.files += [".yardopts"]
|
40
40
|
spec.files += Dir.glob("lib/**/*.rb")
|
41
|
+
spec.files += Dir.glob("image/*.*")
|
41
42
|
spec.files += Dir.glob("doc/text/*")
|
42
43
|
spec.test_files += Dir.glob("test/**/*")
|
43
44
|
spec.extensions = ["dependency-check/Rakefile"]
|
data/test/helper.rb
CHANGED
@@ -0,0 +1,25 @@
|
|
1
|
+
# Copyright 2017 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
# Pathname is used below; require it here so that this helper does not
# depend on another file having loaded it first.
require "pathname"

module Helper
  # Path helpers for locating test fixture files.
  module Fixture
    # @return [Pathname] the absolute path of the "fixture" directory
    #   located one level above the directory of this file.
    def fixture_dir
      Pathname.new(__dir__).join("..", "fixture").expand_path
    end

    # @param components path components below the fixture directory.
    # @return [Pathname] `fixture_dir` joined with `components`.
    def fixture_path(*components)
      fixture_dir.join(*components)
    end
  end
end
|