red-arrow 0.8.1 → 0.8.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/doc/text/news.md +44 -0
- data/lib/arrow/array-builder.rb +0 -1
- data/lib/arrow/array.rb +9 -1
- data/lib/arrow/chunked-array.rb +43 -7
- data/lib/arrow/column.rb +18 -0
- data/lib/arrow/csv-loader.rb +17 -2
- data/lib/arrow/data-type.rb +81 -0
- data/lib/arrow/group.rb +133 -0
- data/lib/arrow/loader.rb +4 -1
- data/lib/arrow/slicer.rb +76 -19
- data/lib/arrow/table-formatter.rb +21 -47
- data/lib/arrow/table-list-formatter.rb +35 -0
- data/lib/arrow/table-table-formatter.rb +69 -0
- data/lib/arrow/table.rb +62 -40
- data/lib/arrow/version.rb +1 -1
- data/test/fixture/null-with-double-quote.csv +4 -0
- data/test/fixture/null-without-double-quote.csv +4 -0
- data/test/run-test.rb +3 -1
- data/test/test-chunked-array.rb +39 -2
- data/test/test-column.rb +13 -0
- data/test/test-csv-loader.rb +20 -0
- data/test/test-group.rb +111 -0
- data/test/test-slicer.rb +42 -0
- data/test/test-table.rb +52 -10
- metadata +12 -2
data/lib/arrow/loader.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2017 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
4
|
# you may not use this file except in compliance with the License.
|
@@ -35,6 +35,7 @@ module Arrow
|
|
35
35
|
require "arrow/column"
|
36
36
|
require "arrow/csv-loader"
|
37
37
|
require "arrow/csv-reader"
|
38
|
+
require "arrow/data-type"
|
38
39
|
require "arrow/date32-array"
|
39
40
|
require "arrow/date32-array-builder"
|
40
41
|
require "arrow/date64-array"
|
@@ -45,6 +46,8 @@ module Arrow
|
|
45
46
|
require "arrow/slicer"
|
46
47
|
require "arrow/table"
|
47
48
|
require "arrow/table-formatter"
|
49
|
+
require "arrow/table-list-formatter"
|
50
|
+
require "arrow/table-table-formatter"
|
48
51
|
require "arrow/table-loader"
|
49
52
|
require "arrow/table-saver"
|
50
53
|
require "arrow/tensor"
|
data/lib/arrow/slicer.rb
CHANGED
@@ -110,26 +110,19 @@ module Arrow
|
|
110
110
|
|
111
111
|
def evaluate
|
112
112
|
data = @column.data
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
else
|
118
|
-
array.cast(BooleanDataType.new)
|
119
|
-
end
|
113
|
+
|
114
|
+
case @column.data_type
|
115
|
+
when BooleanDataType
|
116
|
+
data
|
120
117
|
else
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
boolean_array = chunk.cast(BooleanDataType.new)
|
127
|
-
end
|
128
|
-
boolean_array.each do |value|
|
129
|
-
raw_array << value
|
118
|
+
if data.n_chunks == 1
|
119
|
+
data.get_chunk(0).cast(BooleanDataType.new)
|
120
|
+
else
|
121
|
+
arrays = data.each_chunk.collect do |chunk|
|
122
|
+
chunk.cast(BooleanDataType.new)
|
130
123
|
end
|
124
|
+
ChunkedArray.new(arrays)
|
131
125
|
end
|
132
|
-
BooleanArray.new(raw_array)
|
133
126
|
end
|
134
127
|
end
|
135
128
|
|
@@ -141,6 +134,10 @@ module Arrow
|
|
141
134
|
self == nil
|
142
135
|
end
|
143
136
|
|
137
|
+
def valid?
|
138
|
+
self != nil
|
139
|
+
end
|
140
|
+
|
144
141
|
def ==(value)
|
145
142
|
EqualCondition.new(@column, value)
|
146
143
|
end
|
@@ -165,6 +162,10 @@ module Arrow
|
|
165
162
|
GreaterEqualCondition.new(@column, value)
|
166
163
|
end
|
167
164
|
|
165
|
+
def in?(values)
|
166
|
+
InCondition.new(@column, values)
|
167
|
+
end
|
168
|
+
|
168
169
|
def select(&block)
|
169
170
|
SelectCondition.new(@column, block)
|
170
171
|
end
|
@@ -245,8 +246,12 @@ module Arrow
|
|
245
246
|
def evaluate
|
246
247
|
case @value
|
247
248
|
when nil
|
248
|
-
|
249
|
-
|
249
|
+
if @column.n_nulls.zero?
|
250
|
+
raw_array = [true] * @column.length
|
251
|
+
else
|
252
|
+
raw_array = @column.length.times.collect do |i|
|
253
|
+
@column.valid?(i)
|
254
|
+
end
|
250
255
|
end
|
251
256
|
BooleanArray.new(raw_array)
|
252
257
|
else
|
@@ -350,6 +355,58 @@ module Arrow
|
|
350
355
|
end
|
351
356
|
end
|
352
357
|
|
358
|
+
class InCondition < Condition
|
359
|
+
def initialize(column, values)
|
360
|
+
@column = column
|
361
|
+
@values = values
|
362
|
+
end
|
363
|
+
|
364
|
+
def !@
|
365
|
+
NotInCondition.new(@column, @values)
|
366
|
+
end
|
367
|
+
|
368
|
+
def evaluate
|
369
|
+
values_index = {}
|
370
|
+
@values.each do |value|
|
371
|
+
values_index[value] = true
|
372
|
+
end
|
373
|
+
raw_array = @column.collect do |value|
|
374
|
+
if value.nil?
|
375
|
+
nil
|
376
|
+
else
|
377
|
+
values_index.key?(value)
|
378
|
+
end
|
379
|
+
end
|
380
|
+
BooleanArray.new(raw_array)
|
381
|
+
end
|
382
|
+
end
|
383
|
+
|
384
|
+
class NotInCondition < Condition
|
385
|
+
def initialize(column, values)
|
386
|
+
@column = column
|
387
|
+
@values = values
|
388
|
+
end
|
389
|
+
|
390
|
+
def !@
|
391
|
+
InCondition.new(@column, @values)
|
392
|
+
end
|
393
|
+
|
394
|
+
def evaluate
|
395
|
+
values_index = {}
|
396
|
+
@values.each do |value|
|
397
|
+
values_index[value] = true
|
398
|
+
end
|
399
|
+
raw_array = @column.collect do |value|
|
400
|
+
if value.nil?
|
401
|
+
nil
|
402
|
+
else
|
403
|
+
not values_index.key?(value)
|
404
|
+
end
|
405
|
+
end
|
406
|
+
BooleanArray.new(raw_array)
|
407
|
+
end
|
408
|
+
end
|
409
|
+
|
353
410
|
class SelectCondition < Condition
|
354
411
|
def initialize(column, block)
|
355
412
|
@column = column
|
data/lib/arrow/table-formatter.rb
CHANGED
@@ -12,8 +12,6 @@
|
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
|
-
require "time"
|
16
|
-
|
17
15
|
module Arrow
|
18
16
|
class TableFormatter
|
19
17
|
def initialize(table, options={})
|
@@ -24,65 +22,41 @@ module Arrow
|
|
24
22
|
def format
|
25
23
|
text = ""
|
26
24
|
columns = @table.columns
|
27
|
-
columns
|
28
|
-
text << "\t"
|
29
|
-
text << format_column_name(column)
|
30
|
-
end
|
31
|
-
text << "\n"
|
25
|
+
format_header(text, columns)
|
32
26
|
|
33
27
|
n_rows = @table.n_rows
|
34
28
|
return text if n_rows.zero?
|
35
29
|
|
36
30
|
border = @options[:border] || 10
|
37
31
|
n_digits = (Math.log10(n_rows) + 1).truncate
|
38
|
-
[border, n_rows].min
|
39
|
-
|
32
|
+
head_limit = [border, n_rows].min
|
33
|
+
head_column_values = columns.collect do |column|
|
34
|
+
column.each.take(head_limit)
|
40
35
|
end
|
36
|
+
format_rows(text,
|
37
|
+
columns,
|
38
|
+
head_column_values.transpose,
|
39
|
+
n_digits,
|
40
|
+
0)
|
41
41
|
return text if n_rows <= border
|
42
42
|
|
43
|
-
|
44
|
-
|
45
|
-
|
43
|
+
tail_start = [border, n_rows - border].max
|
44
|
+
tail_limit = n_rows - tail_start
|
45
|
+
tail_column_values = columns.collect do |column|
|
46
|
+
column.reverse_each.take(tail_limit).reverse
|
46
47
|
end
|
47
48
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
private
|
52
|
-
FLOAT_N_DIGITS = 10
|
53
|
-
def format_column_name(column)
|
54
|
-
case column.data_type
|
55
|
-
when TimestampDataType
|
56
|
-
"%*s" % [Time.now.iso8601.size, column.name]
|
57
|
-
when FloatDataType, DoubleDataType
|
58
|
-
"%*s" % [FLOAT_N_DIGITS, column.name]
|
59
|
-
else
|
60
|
-
column.name
|
49
|
+
if head_limit != tail_start
|
50
|
+
format_ellipsis(text)
|
61
51
|
end
|
62
|
-
end
|
63
52
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
end
|
70
|
-
text << "\n"
|
71
|
-
text
|
72
|
-
end
|
53
|
+
format_rows(text,
|
54
|
+
columns,
|
55
|
+
tail_column_values.transpose,
|
56
|
+
n_digits,
|
57
|
+
tail_start)
|
73
58
|
|
74
|
-
|
75
|
-
value = column[i]
|
76
|
-
case value
|
77
|
-
when Time
|
78
|
-
value.iso8601
|
79
|
-
when Float
|
80
|
-
"%*f" % [[column.name.size, FLOAT_N_DIGITS].max, value]
|
81
|
-
when Integer
|
82
|
-
"%*d" % [column.name.size, value]
|
83
|
-
else
|
84
|
-
"%-*s" % [column.name.size, value.to_s]
|
85
|
-
end
|
59
|
+
text
|
86
60
|
end
|
87
61
|
end
|
88
62
|
end
|
data/lib/arrow/table-list-formatter.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# Copyright 2018 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Arrow
|
16
|
+
class TableListFormatter < TableFormatter
|
17
|
+
private
|
18
|
+
def format_header(text, columns)
|
19
|
+
end
|
20
|
+
|
21
|
+
def format_rows(text, columns, rows, n_digits, start_offset)
|
22
|
+
rows.each_with_index do |row, nth_row|
|
23
|
+
text << ("=" * 20 + " #{start_offset + nth_row} " + "=" * 20 + "\n")
|
24
|
+
row.each_with_index do |column_value, nth_column|
|
25
|
+
column = columns[nth_column]
|
26
|
+
text << "#{column.name}: #{column_value}\n"
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def format_ellipsis(text)
|
32
|
+
text << "...\n"
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
data/lib/arrow/table-table-formatter.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
# Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
require "time"
|
16
|
+
|
17
|
+
module Arrow
|
18
|
+
class TableTableFormatter < TableFormatter
|
19
|
+
private
|
20
|
+
def format_header(text, columns)
|
21
|
+
columns.each do |column|
|
22
|
+
text << "\t"
|
23
|
+
text << format_column_name(column)
|
24
|
+
end
|
25
|
+
text << "\n"
|
26
|
+
end
|
27
|
+
|
28
|
+
FLOAT_N_DIGITS = 10
|
29
|
+
def format_column_name(column)
|
30
|
+
case column.data_type
|
31
|
+
when TimestampDataType
|
32
|
+
"%*s" % [Time.now.iso8601.size, column.name]
|
33
|
+
when FloatDataType, DoubleDataType
|
34
|
+
"%*s" % [FLOAT_N_DIGITS, column.name]
|
35
|
+
else
|
36
|
+
column.name
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
def format_rows(text, columns, rows, n_digits, start_offset)
|
41
|
+
rows.each_with_index do |row, nth_row|
|
42
|
+
text << ("%*d" % [n_digits, start_offset + nth_row])
|
43
|
+
row.each_with_index do |column_value, nth_column|
|
44
|
+
text << "\t"
|
45
|
+
column = columns[nth_column]
|
46
|
+
text << format_column_value(column, column_value)
|
47
|
+
end
|
48
|
+
text << "\n"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
def format_column_value(column, value)
|
53
|
+
case value
|
54
|
+
when Time
|
55
|
+
value.iso8601
|
56
|
+
when Float
|
57
|
+
"%*f" % [[column.name.size, FLOAT_N_DIGITS].max, value]
|
58
|
+
when Integer
|
59
|
+
"%*d" % [column.name.size, value]
|
60
|
+
else
|
61
|
+
"%-*s" % [column.name.size, value.to_s]
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def format_ellipsis(text)
|
66
|
+
text << "...\n"
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
data/lib/arrow/table.rb
CHANGED
@@ -12,6 +12,7 @@
|
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
|
+
require "arrow/group"
|
15
16
|
require "arrow/record-containable"
|
16
17
|
|
17
18
|
module Arrow
|
@@ -61,24 +62,10 @@ module Arrow
|
|
61
62
|
end
|
62
63
|
end
|
63
64
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
if args.size == 1
|
69
|
-
find_column(args[0])
|
70
|
-
else
|
71
|
-
new_columns = args.collect do |column_name|
|
72
|
-
column = find_column(column_name)
|
73
|
-
if column.nil?
|
74
|
-
message = "unknown column: <#{column_name.inspect}>: #{inspect}"
|
75
|
-
raise ArgumentError, message
|
76
|
-
end
|
77
|
-
column
|
78
|
-
end
|
79
|
-
self.class.new(schema, new_columns)
|
80
|
-
end
|
81
|
-
end
|
65
|
+
alias_method :size, :n_rows
|
66
|
+
alias_method :length, :n_rows
|
67
|
+
|
68
|
+
alias_method :[], :find_column
|
82
69
|
|
83
70
|
# TODO
|
84
71
|
#
|
@@ -100,39 +87,32 @@ module Arrow
|
|
100
87
|
slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
|
101
88
|
case slicer
|
102
89
|
when Integer
|
90
|
+
slicer += n_rows if slicer < 0
|
103
91
|
ranges << [slicer, slicer]
|
104
92
|
when Range
|
105
93
|
from = slicer.first
|
106
94
|
to = slicer.last
|
107
95
|
to -= 1 if slicer.exclude_end?
|
96
|
+
from += n_rows if from < 0
|
97
|
+
to += n_rows if to < 0
|
108
98
|
ranges << [from, to]
|
109
99
|
when ::Array
|
110
100
|
from = slicer[0]
|
101
|
+
from += n_rows if from < 0
|
111
102
|
to = from + slicer[1] - 1
|
112
103
|
ranges << [from, to]
|
113
|
-
when
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
unless in_target
|
119
|
-
target_start = i
|
120
|
-
in_target = true
|
121
|
-
end
|
122
|
-
else
|
123
|
-
if in_target
|
124
|
-
ranges << [target_start, i - 1]
|
125
|
-
target_start = nil
|
126
|
-
in_target = false
|
127
|
-
end
|
128
|
-
end
|
129
|
-
end
|
130
|
-
if in_target
|
131
|
-
ranges << [target_start, slicer.length - 1]
|
104
|
+
when ChunkedArray
|
105
|
+
offset = 0
|
106
|
+
slicer.each_chunk do |array|
|
107
|
+
boolean_array_to_slice_ranges(array, offset, ranges)
|
108
|
+
offset += array.length
|
132
109
|
end
|
110
|
+
when BooleanArray
|
111
|
+
boolean_array_to_slice_ranges(slicer, 0, ranges)
|
133
112
|
else
|
134
|
-
message = "slicer must be Integer, Range, [from, to]
|
135
|
-
"Arrow::
|
113
|
+
message = "slicer must be Integer, Range, [from, to], " +
|
114
|
+
"Arrow::ChunkedArray of Arrow::BooleanArray, " +
|
115
|
+
"Arrow::BooleanArray or Arrow::Slicer::Condition: #{slicer.inspect}"
|
136
116
|
raise ArgumentError, message
|
137
117
|
end
|
138
118
|
end
|
@@ -209,6 +189,9 @@ module Arrow
|
|
209
189
|
remove_column_raw(index)
|
210
190
|
end
|
211
191
|
|
192
|
+
# TODO
|
193
|
+
#
|
194
|
+
# @return [Arrow::Table]
|
212
195
|
def select_columns(*selectors, &block)
|
213
196
|
if selectors.empty?
|
214
197
|
return to_enum(__method__) unless block_given?
|
@@ -241,13 +224,30 @@ module Arrow
|
|
241
224
|
self.class.new(selected_columns)
|
242
225
|
end
|
243
226
|
|
227
|
+
def group(*keys)
|
228
|
+
Group.new(self, keys)
|
229
|
+
end
|
230
|
+
|
244
231
|
def save(path, options={})
|
245
232
|
saver = TableSaver.new(self, path, options)
|
246
233
|
saver.save
|
247
234
|
end
|
248
235
|
|
236
|
+
def pack
|
237
|
+
packed_columns = columns.collect do |column|
|
238
|
+
column.pack
|
239
|
+
end
|
240
|
+
self.class.new(schema, packed_columns)
|
241
|
+
end
|
242
|
+
|
249
243
|
def to_s(options={})
|
250
|
-
|
244
|
+
case options[:format]
|
245
|
+
when :list
|
246
|
+
formatter_class = TableListFormatter
|
247
|
+
else
|
248
|
+
formatter_class = TableTableFormatter
|
249
|
+
end
|
250
|
+
formatter = formatter_class.new(self, options)
|
251
251
|
formatter.format
|
252
252
|
end
|
253
253
|
|
@@ -269,6 +269,28 @@ module Arrow
|
|
269
269
|
end
|
270
270
|
|
271
271
|
private
|
272
|
+
def boolean_array_to_slice_ranges(array, offset, ranges)
|
273
|
+
in_target = false
|
274
|
+
target_start = nil
|
275
|
+
array.each_with_index do |is_target, i|
|
276
|
+
if is_target
|
277
|
+
unless in_target
|
278
|
+
target_start = offset + i
|
279
|
+
in_target = true
|
280
|
+
end
|
281
|
+
else
|
282
|
+
if in_target
|
283
|
+
ranges << [target_start, offset + i - 1]
|
284
|
+
target_start = nil
|
285
|
+
in_target = false
|
286
|
+
end
|
287
|
+
end
|
288
|
+
end
|
289
|
+
if in_target
|
290
|
+
ranges << [target_start, offset + array.length - 1]
|
291
|
+
end
|
292
|
+
end
|
293
|
+
|
272
294
|
def slice_by_ranges(ranges)
|
273
295
|
sliced_columns = columns.collect do |column|
|
274
296
|
chunks = []
|