red-arrow 5.0.0 → 6.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +23 -0
  3. data/ext/arrow/converters.cpp +5 -0
  4. data/ext/arrow/converters.hpp +126 -0
  5. data/ext/arrow/extconf.rb +13 -0
  6. data/ext/arrow/raw-records.cpp +1 -0
  7. data/ext/arrow/values.cpp +1 -0
  8. data/lib/arrow/aggregate-node-options.rb +35 -0
  9. data/lib/arrow/aggregation.rb +46 -0
  10. data/lib/arrow/array-builder.rb +5 -0
  11. data/lib/arrow/binary-dictionary-array-builder.rb +27 -0
  12. data/lib/arrow/column-containable.rb +100 -1
  13. data/lib/arrow/datum.rb +2 -0
  14. data/lib/arrow/expression.rb +48 -0
  15. data/lib/arrow/file-system.rb +34 -0
  16. data/lib/arrow/group.rb +116 -124
  17. data/lib/arrow/loader.rb +13 -0
  18. data/lib/arrow/map-array-builder.rb +109 -0
  19. data/lib/arrow/map-array.rb +26 -0
  20. data/lib/arrow/map-data-type.rb +89 -0
  21. data/lib/arrow/path-extension.rb +1 -1
  22. data/lib/arrow/record-batch-reader.rb +41 -0
  23. data/lib/arrow/record-batch.rb +0 -2
  24. data/lib/arrow/slicer.rb +44 -143
  25. data/lib/arrow/source-node-options.rb +32 -0
  26. data/lib/arrow/string-dictionary-array-builder.rb +27 -0
  27. data/lib/arrow/symbol-values-appendable.rb +34 -0
  28. data/lib/arrow/table-concatenate-options.rb +36 -0
  29. data/lib/arrow/table-formatter.rb +141 -17
  30. data/lib/arrow/table-list-formatter.rb +5 -3
  31. data/lib/arrow/table-loader.rb +41 -3
  32. data/lib/arrow/table-saver.rb +29 -3
  33. data/lib/arrow/table-table-formatter.rb +7 -31
  34. data/lib/arrow/table.rb +32 -38
  35. data/lib/arrow/version.rb +1 -1
  36. data/red-arrow.gemspec +1 -1
  37. data/test/raw-records/test-dense-union-array.rb +14 -0
  38. data/test/raw-records/test-list-array.rb +19 -0
  39. data/test/raw-records/test-map-array.rb +441 -0
  40. data/test/raw-records/test-sparse-union-array.rb +14 -0
  41. data/test/raw-records/test-struct-array.rb +15 -0
  42. data/test/test-array-builder.rb +7 -0
  43. data/test/test-binary-dictionary-array-builder.rb +103 -0
  44. data/test/test-csv-loader.rb +8 -8
  45. data/test/test-expression.rb +40 -0
  46. data/test/test-group.rb +75 -51
  47. data/test/test-map-array-builder.rb +110 -0
  48. data/test/test-map-array.rb +33 -0
  49. data/test/test-map-data-type.rb +36 -0
  50. data/test/test-record-batch-reader.rb +46 -0
  51. data/test/test-record-batch.rb +42 -0
  52. data/test/test-slicer.rb +166 -167
  53. data/test/test-string-dictionary-array-builder.rb +103 -0
  54. data/test/test-table.rb +190 -53
  55. data/test/values/test-dense-union-array.rb +14 -0
  56. data/test/values/test-list-array.rb +17 -0
  57. data/test/values/test-map-array.rb +433 -0
  58. data/test/values/test-sparse-union-array.rb +14 -0
  59. data/test/values/test-struct-array.rb +15 -0
  60. metadata +107 -76
data/lib/arrow/slicer.rb CHANGED
@@ -16,9 +16,6 @@
16
16
  # under the License.
17
17
 
18
18
  module Arrow
19
- # Experimental
20
- #
21
- # TODO: Almost codes should be implemented in Apache Arrow C++.
22
19
  class Slicer
23
20
  def initialize(table)
24
21
  @table = table
@@ -43,6 +40,21 @@ module Arrow
43
40
  super
44
41
  end
45
42
 
43
module Helper
  # Returns +column+'s data as boolean values.
  #
  # When the column's data type is an Arrow::BooleanDataType the data is
  # returned as is. Otherwise the data is passed to the "cast" function
  # with Arrow::BooleanDataType as the target type, and the +value+ of
  # the function's result is returned.
  def self.ensure_boolean(column)
    data_type = column.data_type
    return column.data if Arrow::BooleanDataType === data_type

    cast_options = CastOptions.new
    cast_options.to_data_type = Arrow::BooleanDataType.new
    cast_result = Function.find("cast").execute([column.data], cast_options)
    cast_result.value
  end
end
57
+
46
58
  class Condition
47
59
  def evaluate
48
60
  message = "Slicer::Condition must define \#evaluate: #{inspect}"
@@ -69,43 +81,28 @@ module Arrow
69
81
  end
70
82
 
71
83
  def evaluate
72
- values1 = @condition1.evaluate.each
73
- values2 = @condition2.evaluate.each
74
- raw_array = []
75
- begin
76
- loop do
77
- value1 = values1.next
78
- value2 = values2.next
79
- if value1.nil? or value2.nil?
80
- raw_array << nil
81
- else
82
- raw_array << evaluate_value(value1, value2)
83
- end
84
- end
85
- rescue StopIteration
86
- end
87
- BooleanArray.new(raw_array)
84
+ function.execute([@condition1.evaluate, @condition2.evaluate]).value
88
85
  end
89
86
  end
90
87
 
91
88
  class AndCondition < LogicalCondition
92
89
  private
93
- def evaluate_value(value1, value2)
94
- value1 and value2
90
+ def function
91
+ Function.find("and")
95
92
  end
96
93
  end
97
94
 
98
95
  class OrCondition < LogicalCondition
99
96
  private
100
- def evaluate_value(value1, value2)
101
- value1 or value2
97
+ def function
98
+ Function.find("or")
102
99
  end
103
100
  end
104
101
 
105
102
  class XorCondition < LogicalCondition
106
103
  private
107
- def evaluate_value(value1, value2)
108
- value1 ^ value2
104
+ def function
105
+ Function.find("xor")
109
106
  end
110
107
  end
111
108
 
@@ -115,21 +112,7 @@ module Arrow
115
112
  end
116
113
 
117
114
  def evaluate
118
- data = @column.data
119
-
120
- case @column.data_type
121
- when BooleanDataType
122
- data
123
- else
124
- if data.n_chunks == 1
125
- data.get_chunk(0).cast(BooleanDataType.new, nil)
126
- else
127
- arrays = data.each_chunk.collect do |chunk|
128
- chunk.cast(BooleanDataType.new, nil)
129
- end
130
- ChunkedArray.new(arrays)
131
- end
132
- end
115
+ Helper.ensure_boolean(@column)
133
116
  end
134
117
 
135
118
  def !@
@@ -187,23 +170,8 @@ module Arrow
187
170
  end
188
171
 
189
172
  def evaluate
190
- data = @column.data
191
- raw_array = []
192
- data.each_chunk do |chunk|
193
- if chunk.is_a?(BooleanArray)
194
- boolean_array = chunk
195
- else
196
- boolean_array = chunk.cast(BooleanDataType.new, nil)
197
- end
198
- boolean_array.each do |value|
199
- if value.nil?
200
- raw_array << value
201
- else
202
- raw_array << !value
203
- end
204
- end
205
- end
206
- BooleanArray.new(raw_array)
173
+ data = Helper.ensure_boolean(@column)
174
+ Function.find("invert").execute([data]).value
207
175
  end
208
176
 
209
177
  def !@
@@ -222,19 +190,10 @@ module Arrow
222
190
  end
223
191
 
224
192
  def evaluate
225
- case @value
226
- when nil
227
- raw_array = @column.collect(&:nil?)
228
- BooleanArray.new(raw_array)
193
+ if @value.nil?
194
+ Function.find("is_null").execute([@column.data]).value
229
195
  else
230
- raw_array = @column.collect do |value|
231
- if value.nil?
232
- nil
233
- else
234
- @value == value
235
- end
236
- end
237
- BooleanArray.new(raw_array)
196
+ Function.find("equal").execute([@column.data, @value]).value
238
197
  end
239
198
  end
240
199
  end
@@ -250,25 +209,10 @@ module Arrow
250
209
  end
251
210
 
252
211
  def evaluate
253
- case @value
254
- when nil
255
- if @column.n_nulls.zero?
256
- raw_array = [true] * @column.n_rows
257
- else
258
- raw_array = @column.n_rows.times.collect do |i|
259
- @column.valid?(i)
260
- end
261
- end
262
- BooleanArray.new(raw_array)
212
+ if @value.nil?
213
+ Function.find("is_valid").execute([@column.data]).value
263
214
  else
264
- raw_array = @column.collect do |value|
265
- if value.nil?
266
- nil
267
- else
268
- @value != value
269
- end
270
- end
271
- BooleanArray.new(raw_array)
215
+ Function.find("not_equal").execute([@column.data, @value]).value
272
216
  end
273
217
  end
274
218
  end
@@ -284,14 +228,7 @@ module Arrow
284
228
  end
285
229
 
286
230
  def evaluate
287
- raw_array = @column.collect do |value|
288
- if value.nil?
289
- nil
290
- else
291
- @value > value
292
- end
293
- end
294
- BooleanArray.new(raw_array)
231
+ Function.find("less").execute([@column.data, @value]).value
295
232
  end
296
233
  end
297
234
 
@@ -306,14 +243,7 @@ module Arrow
306
243
  end
307
244
 
308
245
  def evaluate
309
- raw_array = @column.collect do |value|
310
- if value.nil?
311
- nil
312
- else
313
- @value >= value
314
- end
315
- end
316
- BooleanArray.new(raw_array)
246
+ Function.find("less_equal").execute([@column.data, @value]).value
317
247
  end
318
248
  end
319
249
 
@@ -328,14 +258,7 @@ module Arrow
328
258
  end
329
259
 
330
260
  def evaluate
331
- raw_array = @column.collect do |value|
332
- if value.nil?
333
- nil
334
- else
335
- @value < value
336
- end
337
- end
338
- BooleanArray.new(raw_array)
261
+ Function.find("greater").execute([@column.data, @value]).value
339
262
  end
340
263
  end
341
264
 
@@ -350,14 +273,7 @@ module Arrow
350
273
  end
351
274
 
352
275
  def evaluate
353
- raw_array = @column.collect do |value|
354
- if value.nil?
355
- nil
356
- else
357
- @value <= value
358
- end
359
- end
360
- BooleanArray.new(raw_array)
276
+ Function.find("greater_equal").execute([@column.data, @value]).value
361
277
  end
362
278
  end
363
279
 
@@ -372,18 +288,10 @@ module Arrow
372
288
  end
373
289
 
374
290
  def evaluate
375
- values_index = {}
376
- @values.each do |value|
377
- values_index[value] = true
378
- end
379
- raw_array = @column.collect do |value|
380
- if value.nil?
381
- nil
382
- else
383
- values_index.key?(value)
384
- end
385
- end
386
- BooleanArray.new(raw_array)
291
+ values = @values
292
+ values = Array.new(values) unless values.is_a?(Array)
293
+ options = SetLookupOptions.new(values)
294
+ Function.find("is_in").execute([@column.data], options).value
387
295
  end
388
296
  end
389
297
 
@@ -398,18 +306,11 @@ module Arrow
398
306
  end
399
307
 
400
308
  def evaluate
401
- values_index = {}
402
- @values.each do |value|
403
- values_index[value] = true
404
- end
405
- raw_array = @column.collect do |value|
406
- if value.nil?
407
- nil
408
- else
409
- not values_index.key?(value)
410
- end
411
- end
412
- BooleanArray.new(raw_array)
309
+ values = @values
310
+ values = Array.new(values) unless values.is_a?(Array)
311
+ options = SetLookupOptions.new(values)
312
+ booleans = Function.find("is_in").execute([@column.data], options).value
313
+ Function.find("invert").execute([booleans]).value
413
314
  end
414
315
  end
415
316
 
@@ -0,0 +1,32 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
module Arrow
  class SourceNodeOptions
    # @api private
    #
    # Wraps +value+ in a new SourceNodeOptions when it is a
    # RecordBatchReader, a RecordBatch or a Table. Returns nil for
    # anything else.
    def self.try_convert(value)
      convertible = RecordBatchReader === value ||
                    RecordBatch === value ||
                    Table === value
      convertible ? new(value) : nil
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
module Arrow
  class StringDictionaryArrayBuilder
    include SymbolValuesAppendable

    # Returns a new StringArrayBuilder. This is the builder that
    # SymbolValuesAppendable#append_values obtains through this hook.
    private def create_values_array_builder
      StringArrayBuilder.new
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
module Arrow
  module SymbolValuesAppendable
    # Appends +values+ after turning every Symbol into its String form.
    #
    # The including class must provide +create_values_array_builder+ and
    # +append_array+: the converted values (together with +is_valids+)
    # are appended to a freshly created values builder, and the array it
    # finishes is handed to +append_array+.
    def append_values(values, is_valids=nil)
      values_builder = create_values_array_builder
      stringified = values.collect do |value|
        Symbol === value ? value.to_s : value
      end
      values_builder.append_values(stringified, is_valids)
      append_array(values_builder.finish)
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
module Arrow
  class TableConcatenateOptions
    class << self
      # @api private
      #
      # Builds a TableConcatenateOptions from a Hash by calling the
      # "#{key}=" writer with the matching value for each entry.
      # Returns nil when +value+ is not a Hash.
      def try_convert(value)
        case value
        when Hash
          options = new
          value.each do |name, option_value|
            # Each writer receives its own entry's value, not the
            # whole Hash.
            options.public_send("#{name}=", option_value)
          end
          options
        else
          nil
        end
      end
    end
  end
end
+ end
@@ -18,6 +18,125 @@
18
18
  module Arrow
19
19
  # TODO: Almost codes should be implemented in Apache Arrow C++.
20
20
  class TableFormatter
21
# @private
#
# Formats the name and the values of one column. The column widths are
# derived from the sampled values (head values followed by tail values).
class ColumnFormatter
  # Minimum width used for float values and for float column names.
  FLOAT_N_DIGITS = 10
  # Text shown for nil values.
  FORMATTED_NULL = "(null)"

  attr_reader :column
  attr_reader :head_values
  attr_reader :tail_values
  attr_reader :sample_values

  def initialize(column, head_values, tail_values)
    @column = column
    @head_values = head_values
    @tail_values = tail_values
    @sample_values = head_values + tail_values
    # Cache of field => width, filled by compute_field_value_width.
    @field_value_widths = {}
  end

  # The column's data type (memoized).
  def data_type
    @data_type ||= @column.data_type
  end

  # The column's name (memoized).
  def name
    @name ||= @column.name
  end

  # The column name right-aligned to the width computed from the data
  # type and the sampled values (memoized).
  def aligned_name
    @aligned_name ||= format_aligned_name(name, data_type, @sample_values)
  end

  # Formats +value+ as text that is at least +width+ characters wide.
  #
  # Time values are formatted with +iso8601+ and are not padded. Floats,
  # Integers and nil are right-aligned; Hashes and everything else are
  # left-aligned. A Hash is rendered as "{name: value, ...}" using the
  # fields of this column's data type.
  def format_value(value, width=0)
    case value
    when ::Time
      value.iso8601
    when Float
      "%*f" % [[width, FLOAT_N_DIGITS].max, value]
    when Integer
      "%*d" % [width, value]
    when Hash
      formatted_values = data_type.fields.collect do |field|
        field_name = field.name
        field_value_width = compute_field_value_width(field, @sample_values)
        formatted_name = format_value(field_name, 0)
        formatted_value = format_value(value[field_name], field_value_width)
        "#{formatted_name}: #{formatted_value}"
      end
      # Built by interpolation so that no string literal is mutated.
      formatted = "{#{formatted_values.join(", ")}}"
      "%-*s" % [width, formatted]
    when nil
      "%*s" % [width, FORMATTED_NULL]
    else
      "%-*s" % [width, value.to_s]
    end
  end

  private
  # Returns the width needed for the values of +field+, computed from
  # the entries stored under the field's name in +sample_values+ (nil
  # samples count as having no entry). The result is cached per field.
  def compute_field_value_width(field, sample_values)
    unless @field_value_widths.key?(field)
      field_name = field.name
      field_sample_values = sample_values.collect do |v|
        (v || {})[field_name]
      end
      # Aligning an empty name yields a string whose size is the width.
      field_aligned_name = format_aligned_name("",
                                               field.data_type,
                                               field_sample_values)
      @field_value_widths[field] = field_aligned_name.size
    end
    @field_value_widths[field]
  end

  # Returns +name+ right-aligned to the width required by +data_type+;
  # data types without a dedicated rule return +name+ unchanged.
  def format_aligned_name(name, data_type, sample_values)
    case data_type
    when TimestampDataType
      "%*s" % [::Time.now.iso8601.size, name]
    when IntegerDataType
      "%*s" % [compute_integer_width(sample_values), name]
    when FloatDataType, DoubleDataType
      "%*s" % [FLOAT_N_DIGITS, name]
    when StructDataType
      field_widths = data_type.fields.collect do |field|
        field_value_width = compute_field_value_width(field, sample_values)
        field.name.size + ": ".size + field_value_width
      end
      width = "{}".size + field_widths.sum
      if field_widths.size > 0
        width += (", ".size * (field_widths.size - 1))
      end
      "%*s" % [width, name]
    else
      name
    end
  end

  # Returns the width needed to print every integer in +sample_values+.
  #
  # Digits are counted on the decimal string of the largest absolute
  # value, which stays exact for integers too large to be represented
  # precisely as a Float.
  def compute_integer_width(sample_values)
    present_values = sample_values.reject(&:nil?)
    if present_values.empty?
      width = 0
    else
      width = present_values.collect(&:abs).max.to_s.size
    end
    width += 1 if present_values.any?(&:negative?) # Need "-"
    if sample_values.any?(&:nil?)
      width = [width, FORMATTED_NULL.size].max
    end
    width
  end
end
139
+
21
140
  def initialize(table, options={})
22
141
  @table = table
23
142
  @options = options
@@ -25,38 +144,43 @@ module Arrow
25
144
 
26
145
  def format
27
146
  text = ""
28
- columns = @table.columns
29
- format_header(text, columns)
30
-
31
147
  n_rows = @table.n_rows
32
- return text if n_rows.zero?
33
-
34
148
  border = @options[:border] || 10
35
- n_digits = (Math.log10(n_rows) + 1).truncate
149
+
36
150
  head_limit = [border, n_rows].min
37
- head_column_values = columns.collect do |column|
38
- column.each.take(head_limit)
151
+
152
+ tail_start = [border, n_rows - border].max
153
+ tail_limit = n_rows - tail_start
154
+
155
+ column_formatters = @table.columns.collect do |column|
156
+ head_values = column.each.take(head_limit)
157
+ if tail_limit > 0
158
+ tail_values = column.reverse_each.take(tail_limit).reverse
159
+ else
160
+ tail_values = []
161
+ end
162
+ ColumnFormatter.new(column, head_values, tail_values)
39
163
  end
164
+
165
+ format_header(text, column_formatters)
166
+ return text if n_rows.zero?
167
+
168
+ n_digits = (Math.log10(n_rows) + 1).truncate
40
169
  format_rows(text,
41
- columns,
42
- head_column_values.transpose,
170
+ column_formatters,
171
+ column_formatters.collect(&:head_values).transpose,
43
172
  n_digits,
44
173
  0)
45
174
  return text if n_rows <= border
46
175
 
47
- tail_start = [border, n_rows - border].max
48
- tail_limit = n_rows - tail_start
49
- tail_column_values = columns.collect do |column|
50
- column.reverse_each.take(tail_limit).reverse
51
- end
52
176
 
53
177
  if head_limit != tail_start
54
178
  format_ellipsis(text)
55
179
  end
56
180
 
57
181
  format_rows(text,
58
- columns,
59
- tail_column_values.transpose,
182
+ column_formatters,
183
+ column_formatters.collect(&:tail_values).transpose,
60
184
  n_digits,
61
185
  tail_start)
62
186
 
@@ -22,12 +22,14 @@ module Arrow
22
22
  def format_header(text, columns)
23
23
  end
24
24
 
25
- def format_rows(text, columns, rows, n_digits, start_offset)
25
+ def format_rows(text, column_formatters, rows, n_digits, start_offset)
26
26
  rows.each_with_index do |row, nth_row|
27
27
  text << ("=" * 20 + " #{start_offset + nth_row} " + "=" * 20 + "\n")
28
28
  row.each_with_index do |column_value, nth_column|
29
- column = columns[nth_column]
30
- text << "#{column.name}: #{column_value}\n"
29
+ column_formatter = column_formatters[nth_column]
30
+ formatted_name = column_formatter.name
31
+ formatted_value = column_formatter.format_value(column_value)
32
+ text << "#{formatted_name}: #{formatted_value}\n"
31
33
  end
32
34
  end
33
35
  end