red-arrow 0.8.1 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/doc/text/news.md +44 -0
- data/lib/arrow/array-builder.rb +0 -1
- data/lib/arrow/array.rb +9 -1
- data/lib/arrow/chunked-array.rb +43 -7
- data/lib/arrow/column.rb +18 -0
- data/lib/arrow/csv-loader.rb +17 -2
- data/lib/arrow/data-type.rb +81 -0
- data/lib/arrow/group.rb +133 -0
- data/lib/arrow/loader.rb +4 -1
- data/lib/arrow/slicer.rb +76 -19
- data/lib/arrow/table-formatter.rb +21 -47
- data/lib/arrow/table-list-formatter.rb +35 -0
- data/lib/arrow/table-table-formatter.rb +69 -0
- data/lib/arrow/table.rb +62 -40
- data/lib/arrow/version.rb +1 -1
- data/test/fixture/null-with-double-quote.csv +4 -0
- data/test/fixture/null-without-double-quote.csv +4 -0
- data/test/run-test.rb +3 -1
- data/test/test-chunked-array.rb +39 -2
- data/test/test-column.rb +13 -0
- data/test/test-csv-loader.rb +20 -0
- data/test/test-group.rb +111 -0
- data/test/test-slicer.rb +42 -0
- data/test/test-table.rb +52 -10
- metadata +12 -2
data/lib/arrow/loader.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2017 Kouhei Sutou <kou@clear-code.com>
|
1
|
+
# Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
4
|
# you may not use this file except in compliance with the License.
|
@@ -35,6 +35,7 @@ module Arrow
|
|
35
35
|
require "arrow/column"
|
36
36
|
require "arrow/csv-loader"
|
37
37
|
require "arrow/csv-reader"
|
38
|
+
require "arrow/data-type"
|
38
39
|
require "arrow/date32-array"
|
39
40
|
require "arrow/date32-array-builder"
|
40
41
|
require "arrow/date64-array"
|
@@ -45,6 +46,8 @@ module Arrow
|
|
45
46
|
require "arrow/slicer"
|
46
47
|
require "arrow/table"
|
47
48
|
require "arrow/table-formatter"
|
49
|
+
require "arrow/table-list-formatter"
|
50
|
+
require "arrow/table-table-formatter"
|
48
51
|
require "arrow/table-loader"
|
49
52
|
require "arrow/table-saver"
|
50
53
|
require "arrow/tensor"
|
data/lib/arrow/slicer.rb
CHANGED
@@ -110,26 +110,19 @@ module Arrow
|
|
110
110
|
|
111
111
|
def evaluate
|
112
112
|
data = @column.data
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
else
|
118
|
-
array.cast(BooleanDataType.new)
|
119
|
-
end
|
113
|
+
|
114
|
+
case @column.data_type
|
115
|
+
when BooleanDataType
|
116
|
+
data
|
120
117
|
else
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
boolean_array = chunk.cast(BooleanDataType.new)
|
127
|
-
end
|
128
|
-
boolean_array.each do |value|
|
129
|
-
raw_array << value
|
118
|
+
if data.n_chunks == 1
|
119
|
+
data.get_chunk(0).cast(BooleanDataType.new)
|
120
|
+
else
|
121
|
+
arrays = data.each_chunk.collect do |chunk|
|
122
|
+
chunk.cast(BooleanDataType.new)
|
130
123
|
end
|
124
|
+
ChunkedArray.new(arrays)
|
131
125
|
end
|
132
|
-
BooleanArray.new(raw_array)
|
133
126
|
end
|
134
127
|
end
|
135
128
|
|
@@ -141,6 +134,10 @@ module Arrow
|
|
141
134
|
self == nil
|
142
135
|
end
|
143
136
|
|
137
|
+
# Builds the condition for "this column's value is not null".
#
# The result is whatever this object's `!=` operator returns when it is
# compared with `nil`.
# NOTE(review): `!=` is defined outside this view; presumably it yields
# a not-equal condition object. Confirm before relying on the type.
#
# @return [Object] the result of `self != nil`.
def valid?
  self != nil
end
|
140
|
+
|
144
141
|
def ==(value)
|
145
142
|
EqualCondition.new(@column, value)
|
146
143
|
end
|
@@ -165,6 +162,10 @@ module Arrow
|
|
165
162
|
GreaterEqualCondition.new(@column, value)
|
166
163
|
end
|
167
164
|
|
165
|
+
# Builds a membership condition for this column.
#
# @param values [#each] the candidate values to match against.
#
# @return [InCondition] a condition over `@column` and `values`.
def in?(values)
  InCondition.new(@column, values)
end
|
168
|
+
|
168
169
|
def select(&block)
|
169
170
|
SelectCondition.new(@column, block)
|
170
171
|
end
|
@@ -245,8 +246,12 @@ module Arrow
|
|
245
246
|
def evaluate
|
246
247
|
case @value
|
247
248
|
when nil
|
248
|
-
|
249
|
-
|
249
|
+
if @column.n_nulls.zero?
|
250
|
+
raw_array = [true] * @column.length
|
251
|
+
else
|
252
|
+
raw_array = @column.length.times.collect do |i|
|
253
|
+
@column.valid?(i)
|
254
|
+
end
|
250
255
|
end
|
251
256
|
BooleanArray.new(raw_array)
|
252
257
|
else
|
@@ -350,6 +355,58 @@ module Arrow
|
|
350
355
|
end
|
351
356
|
end
|
352
357
|
|
358
|
+
# Condition that is true for rows whose column value is one of the
# given values.
class InCondition < Condition
  # @param column [#collect] the column whose values are tested.
  # @param values [#each] the accepted values.
  def initialize(column, values)
    @column = column
    @values = values
  end

  # Logical negation.
  #
  # @return [NotInCondition] the opposite condition over the same
  #   column and values.
  def !@
    NotInCondition.new(@column, @values)
  end

  # Evaluates the condition against every value in the column.
  #
  # Null column values stay null (`nil`) in the result instead of
  # becoming `false`.
  #
  # @return [BooleanArray] one entry per column value.
  def evaluate
    # Hash lookup so each membership test doesn't rescan @values.
    accepted = {}
    @values.each { |candidate| accepted[candidate] = true }

    flags = @column.collect do |element|
      next nil if element.nil?

      accepted.key?(element)
    end
    BooleanArray.new(flags)
  end
end
|
383
|
+
|
384
|
+
# Condition that is true for rows whose column value is none of the
# given values.
class NotInCondition < Condition
  # @param column [#collect] the column whose values are tested.
  # @param values [#each] the rejected values.
  def initialize(column, values)
    @column = column
    @values = values
  end

  # Logical negation.
  #
  # @return [InCondition] the opposite condition over the same column
  #   and values.
  def !@
    InCondition.new(@column, @values)
  end

  # Evaluates the condition against every value in the column.
  #
  # Null column values stay null (`nil`) in the result instead of
  # becoming `true`.
  #
  # @return [BooleanArray] one entry per column value.
  def evaluate
    # Hash lookup so each membership test doesn't rescan @values.
    rejected = {}
    @values.each { |candidate| rejected[candidate] = true }

    flags = @column.collect do |element|
      next nil if element.nil?

      !rejected.key?(element)
    end
    BooleanArray.new(flags)
  end
end
|
409
|
+
|
353
410
|
class SelectCondition < Condition
|
354
411
|
def initialize(column, block)
|
355
412
|
@column = column
|
@@ -12,8 +12,6 @@
|
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
|
-
require "time"
|
16
|
-
|
17
15
|
module Arrow
|
18
16
|
class TableFormatter
|
19
17
|
def initialize(table, options={})
|
@@ -24,65 +22,41 @@ module Arrow
|
|
24
22
|
def format
|
25
23
|
text = ""
|
26
24
|
columns = @table.columns
|
27
|
-
columns
|
28
|
-
text << "\t"
|
29
|
-
text << format_column_name(column)
|
30
|
-
end
|
31
|
-
text << "\n"
|
25
|
+
format_header(text, columns)
|
32
26
|
|
33
27
|
n_rows = @table.n_rows
|
34
28
|
return text if n_rows.zero?
|
35
29
|
|
36
30
|
border = @options[:border] || 10
|
37
31
|
n_digits = (Math.log10(n_rows) + 1).truncate
|
38
|
-
[border, n_rows].min
|
39
|
-
|
32
|
+
head_limit = [border, n_rows].min
|
33
|
+
head_column_values = columns.collect do |column|
|
34
|
+
column.each.take(head_limit)
|
40
35
|
end
|
36
|
+
format_rows(text,
|
37
|
+
columns,
|
38
|
+
head_column_values.transpose,
|
39
|
+
n_digits,
|
40
|
+
0)
|
41
41
|
return text if n_rows <= border
|
42
42
|
|
43
|
-
|
44
|
-
|
45
|
-
|
43
|
+
tail_start = [border, n_rows - border].max
|
44
|
+
tail_limit = n_rows - tail_start
|
45
|
+
tail_column_values = columns.collect do |column|
|
46
|
+
column.reverse_each.take(tail_limit).reverse
|
46
47
|
end
|
47
48
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
private
|
52
|
-
FLOAT_N_DIGITS = 10
|
53
|
-
def format_column_name(column)
|
54
|
-
case column.data_type
|
55
|
-
when TimestampDataType
|
56
|
-
"%*s" % [Time.now.iso8601.size, column.name]
|
57
|
-
when FloatDataType, DoubleDataType
|
58
|
-
"%*s" % [FLOAT_N_DIGITS, column.name]
|
59
|
-
else
|
60
|
-
column.name
|
49
|
+
if head_limit != tail_start
|
50
|
+
format_ellipsis(text)
|
61
51
|
end
|
62
|
-
end
|
63
52
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
end
|
70
|
-
text << "\n"
|
71
|
-
text
|
72
|
-
end
|
53
|
+
format_rows(text,
|
54
|
+
columns,
|
55
|
+
tail_column_values.transpose,
|
56
|
+
n_digits,
|
57
|
+
tail_start)
|
73
58
|
|
74
|
-
|
75
|
-
value = column[i]
|
76
|
-
case value
|
77
|
-
when Time
|
78
|
-
value.iso8601
|
79
|
-
when Float
|
80
|
-
"%*f" % [[column.name.size, FLOAT_N_DIGITS].max, value]
|
81
|
-
when Integer
|
82
|
-
"%*d" % [column.name.size, value]
|
83
|
-
else
|
84
|
-
"%-*s" % [column.name.size, value.to_s]
|
85
|
-
end
|
59
|
+
text
|
86
60
|
end
|
87
61
|
end
|
88
62
|
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# Copyright 2018 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
module Arrow
  # Formats a table as a vertical list: each row is introduced by a
  # separator line carrying its row number, followed by one
  # "name: value" line per column.
  class TableListFormatter < TableFormatter
    private

    # The list layout has no header, so nothing is appended.
    def format_header(text, columns)
    end

    # Appends the given rows to `text`.
    #
    # @param text [String] the output buffer; appended to in place.
    # @param columns [Array] columns; only `name` is read from each.
    # @param rows [Array<Array>] row values, ordered like `columns`.
    # @param n_digits [Integer] unused by this layout.
    # @param start_offset [Integer] row number of the first row in `rows`.
    def format_rows(text, columns, rows, n_digits, start_offset)
      rule = "=" * 20
      rows.each_with_index do |row, nth_row|
        text << "#{rule} #{start_offset + nth_row} #{rule}\n"
        row.each_with_index do |cell, nth_column|
          text << "#{columns[nth_column].name}: #{cell}\n"
        end
      end
    end

    # Appends the marker that stands in for omitted rows.
    def format_ellipsis(text)
      text << "...\n"
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
require "time"
|
16
|
+
|
17
|
+
module Arrow
  # Formats a table as tab-separated text: a header line of column
  # names followed by one line per row, each prefixed by its row number.
  class TableTableFormatter < TableFormatter
    private

    # Appends the header line: a tab before every column name, then a
    # newline.
    def format_header(text, columns)
      columns.each do |column|
        text << "\t" << format_column_name(column)
      end
      text << "\n"
    end

    # Minimum width used for float columns, for both names and values.
    FLOAT_N_DIGITS = 10

    # Right-aligns the column name to the width its values will take.
    # Timestamp columns use the width of an ISO 8601 time string and
    # float/double columns use FLOAT_N_DIGITS. Other names are returned
    # unchanged.
    def format_column_name(column)
      width =
        case column.data_type
        when TimestampDataType
          Time.now.iso8601.size
        when FloatDataType, DoubleDataType
          FLOAT_N_DIGITS
        end
      return column.name if width.nil?

      "%*s" % [width, column.name]
    end

    # Appends the given rows to `text`.
    #
    # @param text [String] the output buffer; appended to in place.
    # @param columns [Array] columns, ordered like each row's values.
    # @param rows [Array<Array>] row values.
    # @param n_digits [Integer] width of the row-number field.
    # @param start_offset [Integer] row number of the first row in `rows`.
    def format_rows(text, columns, rows, n_digits, start_offset)
      rows.each_with_index do |row, nth_row|
        # String#% is used deliberately: the formatter classes expose
        # their own #format, which would shadow Kernel#format here.
        text << ("%*d" % [n_digits, start_offset + nth_row])
        row.each_with_index do |cell, nth_column|
          text << "\t" << format_column_value(columns[nth_column], cell)
        end
        text << "\n"
      end
    end

    # Renders one cell. Times are ISO 8601. Numbers are right-aligned
    # and everything else left-aligned, to at least the width of the
    # column name.
    def format_column_value(column, value)
      case value
      when Time    then value.iso8601
      when Float   then "%*f" % [[column.name.size, FLOAT_N_DIGITS].max, value]
      when Integer then "%*d" % [column.name.size, value]
      else              "%-*s" % [column.name.size, value.to_s]
      end
    end

    # Appends the marker that stands in for omitted rows.
    def format_ellipsis(text)
      text << "...\n"
    end
  end
end
|
data/lib/arrow/table.rb
CHANGED
@@ -12,6 +12,7 @@
|
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
|
+
require "arrow/group"
|
15
16
|
require "arrow/record-containable"
|
16
17
|
|
17
18
|
module Arrow
|
@@ -61,24 +62,10 @@ module Arrow
|
|
61
62
|
end
|
62
63
|
end
|
63
64
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
if args.size == 1
|
69
|
-
find_column(args[0])
|
70
|
-
else
|
71
|
-
new_columns = args.collect do |column_name|
|
72
|
-
column = find_column(column_name)
|
73
|
-
if column.nil?
|
74
|
-
message = "unknown column: <#{column_name.inspect}>: #{inspect}"
|
75
|
-
raise ArgumentError, message
|
76
|
-
end
|
77
|
-
column
|
78
|
-
end
|
79
|
-
self.class.new(schema, new_columns)
|
80
|
-
end
|
81
|
-
end
|
65
|
+
alias_method :size, :n_rows
|
66
|
+
alias_method :length, :n_rows
|
67
|
+
|
68
|
+
alias_method :[], :find_column
|
82
69
|
|
83
70
|
# TODO
|
84
71
|
#
|
@@ -100,39 +87,32 @@ module Arrow
|
|
100
87
|
slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
|
101
88
|
case slicer
|
102
89
|
when Integer
|
90
|
+
slicer += n_rows if slicer < 0
|
103
91
|
ranges << [slicer, slicer]
|
104
92
|
when Range
|
105
93
|
from = slicer.first
|
106
94
|
to = slicer.last
|
107
95
|
to -= 1 if slicer.exclude_end?
|
96
|
+
from += n_rows if from < 0
|
97
|
+
to += n_rows if to < 0
|
108
98
|
ranges << [from, to]
|
109
99
|
when ::Array
|
110
100
|
from = slicer[0]
|
101
|
+
from += n_rows if from < 0
|
111
102
|
to = from + slicer[1] - 1
|
112
103
|
ranges << [from, to]
|
113
|
-
when
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
unless in_target
|
119
|
-
target_start = i
|
120
|
-
in_target = true
|
121
|
-
end
|
122
|
-
else
|
123
|
-
if in_target
|
124
|
-
ranges << [target_start, i - 1]
|
125
|
-
target_start = nil
|
126
|
-
in_target = false
|
127
|
-
end
|
128
|
-
end
|
129
|
-
end
|
130
|
-
if in_target
|
131
|
-
ranges << [target_start, slicer.length - 1]
|
104
|
+
when ChunkedArray
|
105
|
+
offset = 0
|
106
|
+
slicer.each_chunk do |array|
|
107
|
+
boolean_array_to_slice_ranges(array, offset, ranges)
|
108
|
+
offset += array.length
|
132
109
|
end
|
110
|
+
when BooleanArray
|
111
|
+
boolean_array_to_slice_ranges(slicer, 0, ranges)
|
133
112
|
else
|
134
|
-
message = "slicer must be Integer, Range, [from, to]
|
135
|
-
"Arrow::
|
113
|
+
message = "slicer must be Integer, Range, [from, to], " +
|
114
|
+
"Arrow::ChunkedArray of Arrow::BooleanArray, " +
|
115
|
+
"Arrow::BooleanArray or Arrow::Slicer::Condition: #{slicer.inspect}"
|
136
116
|
raise ArgumentError, message
|
137
117
|
end
|
138
118
|
end
|
@@ -209,6 +189,9 @@ module Arrow
|
|
209
189
|
remove_column_raw(index)
|
210
190
|
end
|
211
191
|
|
192
|
+
# TODO
|
193
|
+
#
|
194
|
+
# @return [Arrow::Table]
|
212
195
|
def select_columns(*selectors, &block)
|
213
196
|
if selectors.empty?
|
214
197
|
return to_enum(__method__) unless block_given?
|
@@ -241,13 +224,30 @@ module Arrow
|
|
241
224
|
self.class.new(selected_columns)
|
242
225
|
end
|
243
226
|
|
227
|
+
# Starts a grouping over this table.
#
# @param keys [Array] the keys to group by, collected from the
#   arguments and passed to the group unchanged.
#
# @return [Group] a group built from this table and `keys`.
def group(*keys)
  Group.new(self, keys)
end
|
230
|
+
|
244
231
|
def save(path, options={})
|
245
232
|
saver = TableSaver.new(self, path, options)
|
246
233
|
saver.save
|
247
234
|
end
|
248
235
|
|
236
|
+
# Builds a new table of the same class from this table's schema and
# the result of calling `pack` on every column.
# NOTE(review): presumably Column#pack merges a column's chunks into
# one. It is defined outside this view; confirm there.
#
# @return [Arrow::Table] the newly built table.
def pack
  self.class.new(schema, columns.collect(&:pack))
end
|
242
|
+
|
249
243
|
def to_s(options={})
|
250
|
-
|
244
|
+
case options[:format]
|
245
|
+
when :list
|
246
|
+
formatter_class = TableListFormatter
|
247
|
+
else
|
248
|
+
formatter_class = TableTableFormatter
|
249
|
+
end
|
250
|
+
formatter = formatter_class.new(self, options)
|
251
251
|
formatter.format
|
252
252
|
end
|
253
253
|
|
@@ -269,6 +269,28 @@ module Arrow
|
|
269
269
|
end
|
270
270
|
|
271
271
|
private
|
272
|
+
# Converts a boolean mask into inclusive [from, to] index ranges, one
# per run of consecutive truthy entries, and appends them to `ranges`.
#
# Falsy entries (`false` and `nil`) end the current run.
#
# @param array [#each_with_index, #length] the mask to scan.
# @param offset [Integer] added to every index, so a mask that is one
#   chunk of a larger sequence yields positions in the whole sequence.
# @param ranges [Array<Array(Integer, Integer)>] output list; appended
#   to in place.
#
# @return [Array, nil] `ranges` when the mask ends inside a run,
#   otherwise `nil`.
def boolean_array_to_slice_ranges(array, offset, ranges)
  run_start = nil
  array.each_with_index do |is_target, i|
    position = offset + i
    if is_target
      # Indexes are Integers, which are always truthy in Ruby, so
      # `||=` only assigns at the first entry of a run.
      run_start ||= position
    elsif run_start
      ranges << [run_start, position - 1]
      run_start = nil
    end
  end
  # A run that reaches the end of the mask is closed at the last index.
  ranges << [run_start, offset + array.length - 1] if run_start
end
|
293
|
+
|
272
294
|
def slice_by_ranges(ranges)
|
273
295
|
sliced_columns = columns.collect do |column|
|
274
296
|
chunks = []
|