red-arrow 5.0.0 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +23 -0
  3. data/ext/arrow/converters.cpp +5 -0
  4. data/ext/arrow/converters.hpp +126 -0
  5. data/ext/arrow/extconf.rb +13 -0
  6. data/ext/arrow/raw-records.cpp +1 -0
  7. data/ext/arrow/values.cpp +1 -0
  8. data/lib/arrow/aggregate-node-options.rb +35 -0
  9. data/lib/arrow/aggregation.rb +46 -0
  10. data/lib/arrow/array-builder.rb +5 -0
  11. data/lib/arrow/binary-dictionary-array-builder.rb +27 -0
  12. data/lib/arrow/column-containable.rb +100 -1
  13. data/lib/arrow/datum.rb +2 -0
  14. data/lib/arrow/expression.rb +48 -0
  15. data/lib/arrow/file-system.rb +34 -0
  16. data/lib/arrow/group.rb +116 -124
  17. data/lib/arrow/loader.rb +13 -0
  18. data/lib/arrow/map-array-builder.rb +109 -0
  19. data/lib/arrow/map-array.rb +26 -0
  20. data/lib/arrow/map-data-type.rb +89 -0
  21. data/lib/arrow/path-extension.rb +1 -1
  22. data/lib/arrow/record-batch-reader.rb +41 -0
  23. data/lib/arrow/record-batch.rb +0 -2
  24. data/lib/arrow/slicer.rb +44 -143
  25. data/lib/arrow/source-node-options.rb +32 -0
  26. data/lib/arrow/string-dictionary-array-builder.rb +27 -0
  27. data/lib/arrow/symbol-values-appendable.rb +34 -0
  28. data/lib/arrow/table-concatenate-options.rb +36 -0
  29. data/lib/arrow/table-formatter.rb +141 -17
  30. data/lib/arrow/table-list-formatter.rb +5 -3
  31. data/lib/arrow/table-loader.rb +41 -3
  32. data/lib/arrow/table-saver.rb +29 -3
  33. data/lib/arrow/table-table-formatter.rb +7 -31
  34. data/lib/arrow/table.rb +32 -38
  35. data/lib/arrow/version.rb +1 -1
  36. data/red-arrow.gemspec +1 -1
  37. data/test/raw-records/test-dense-union-array.rb +14 -0
  38. data/test/raw-records/test-list-array.rb +19 -0
  39. data/test/raw-records/test-map-array.rb +441 -0
  40. data/test/raw-records/test-sparse-union-array.rb +14 -0
  41. data/test/raw-records/test-struct-array.rb +15 -0
  42. data/test/test-array-builder.rb +7 -0
  43. data/test/test-binary-dictionary-array-builder.rb +103 -0
  44. data/test/test-csv-loader.rb +8 -8
  45. data/test/test-expression.rb +40 -0
  46. data/test/test-group.rb +75 -51
  47. data/test/test-map-array-builder.rb +110 -0
  48. data/test/test-map-array.rb +33 -0
  49. data/test/test-map-data-type.rb +36 -0
  50. data/test/test-record-batch-reader.rb +46 -0
  51. data/test/test-record-batch.rb +42 -0
  52. data/test/test-slicer.rb +166 -167
  53. data/test/test-string-dictionary-array-builder.rb +103 -0
  54. data/test/test-table.rb +190 -53
  55. data/test/values/test-dense-union-array.rb +14 -0
  56. data/test/values/test-list-array.rb +17 -0
  57. data/test/values/test-map-array.rb +433 -0
  58. data/test/values/test-sparse-union-array.rb +14 -0
  59. data/test/values/test-struct-array.rb +15 -0
  60. metadata +107 -76
data/lib/arrow/slicer.rb CHANGED
@@ -16,9 +16,6 @@
16
16
  # under the License.
17
17
 
18
18
  module Arrow
19
- # Experimental
20
- #
21
- # TODO: Almost codes should be implemented in Apache Arrow C++.
22
19
  class Slicer
23
20
  def initialize(table)
24
21
  @table = table
@@ -43,6 +40,21 @@ module Arrow
43
40
  super
44
41
  end
45
42
 
43
# Shared helpers for slicer conditions.
module Helper
  class << self
    # Returns the data of +column+ as booleans.
    #
    # A column whose data type is already boolean is returned as is;
    # any other column is converted with the "cast" compute function.
    def ensure_boolean(column)
      return column.data if Arrow::BooleanDataType === column.data_type

      cast_options = CastOptions.new
      cast_options.to_data_type = Arrow::BooleanDataType.new
      cast = Function.find("cast")
      cast.execute([column.data], cast_options).value
    end
  end
end
57
+
46
58
  class Condition
47
59
  def evaluate
48
60
  message = "Slicer::Condition must define \#evaluate: #{inspect}"
@@ -69,43 +81,28 @@ module Arrow
69
81
  end
70
82
 
71
83
  def evaluate
72
- values1 = @condition1.evaluate.each
73
- values2 = @condition2.evaluate.each
74
- raw_array = []
75
- begin
76
- loop do
77
- value1 = values1.next
78
- value2 = values2.next
79
- if value1.nil? or value2.nil?
80
- raw_array << nil
81
- else
82
- raw_array << evaluate_value(value1, value2)
83
- end
84
- end
85
- rescue StopIteration
86
- end
87
- BooleanArray.new(raw_array)
84
+ function.execute([@condition1.evaluate, @condition2.evaluate]).value
88
85
  end
89
86
  end
90
87
 
91
88
  class AndCondition < LogicalCondition
92
89
  private
93
- def evaluate_value(value1, value2)
94
- value1 and value2
90
+ def function
91
+ Function.find("and")
95
92
  end
96
93
  end
97
94
 
98
95
  class OrCondition < LogicalCondition
99
96
  private
100
- def evaluate_value(value1, value2)
101
- value1 or value2
97
+ def function
98
+ Function.find("or")
102
99
  end
103
100
  end
104
101
 
105
102
  class XorCondition < LogicalCondition
106
103
  private
107
- def evaluate_value(value1, value2)
108
- value1 ^ value2
104
+ def function
105
+ Function.find("xor")
109
106
  end
110
107
  end
111
108
 
@@ -115,21 +112,7 @@ module Arrow
115
112
  end
116
113
 
117
114
  def evaluate
118
- data = @column.data
119
-
120
- case @column.data_type
121
- when BooleanDataType
122
- data
123
- else
124
- if data.n_chunks == 1
125
- data.get_chunk(0).cast(BooleanDataType.new, nil)
126
- else
127
- arrays = data.each_chunk.collect do |chunk|
128
- chunk.cast(BooleanDataType.new, nil)
129
- end
130
- ChunkedArray.new(arrays)
131
- end
132
- end
115
+ Helper.ensure_boolean(@column)
133
116
  end
134
117
 
135
118
  def !@
@@ -187,23 +170,8 @@ module Arrow
187
170
  end
188
171
 
189
172
  def evaluate
190
- data = @column.data
191
- raw_array = []
192
- data.each_chunk do |chunk|
193
- if chunk.is_a?(BooleanArray)
194
- boolean_array = chunk
195
- else
196
- boolean_array = chunk.cast(BooleanDataType.new, nil)
197
- end
198
- boolean_array.each do |value|
199
- if value.nil?
200
- raw_array << value
201
- else
202
- raw_array << !value
203
- end
204
- end
205
- end
206
- BooleanArray.new(raw_array)
173
+ data = Helper.ensure_boolean(@column)
174
+ Function.find("invert").execute([data]).value
207
175
  end
208
176
 
209
177
  def !@
@@ -222,19 +190,10 @@ module Arrow
222
190
  end
223
191
 
224
192
  def evaluate
225
- case @value
226
- when nil
227
- raw_array = @column.collect(&:nil?)
228
- BooleanArray.new(raw_array)
193
+ if @value.nil?
194
+ Function.find("is_null").execute([@column.data]).value
229
195
  else
230
- raw_array = @column.collect do |value|
231
- if value.nil?
232
- nil
233
- else
234
- @value == value
235
- end
236
- end
237
- BooleanArray.new(raw_array)
196
+ Function.find("equal").execute([@column.data, @value]).value
238
197
  end
239
198
  end
240
199
  end
@@ -250,25 +209,10 @@ module Arrow
250
209
  end
251
210
 
252
211
  def evaluate
253
- case @value
254
- when nil
255
- if @column.n_nulls.zero?
256
- raw_array = [true] * @column.n_rows
257
- else
258
- raw_array = @column.n_rows.times.collect do |i|
259
- @column.valid?(i)
260
- end
261
- end
262
- BooleanArray.new(raw_array)
212
+ if @value.nil?
213
+ Function.find("is_valid").execute([@column.data]).value
263
214
  else
264
- raw_array = @column.collect do |value|
265
- if value.nil?
266
- nil
267
- else
268
- @value != value
269
- end
270
- end
271
- BooleanArray.new(raw_array)
215
+ Function.find("not_equal").execute([@column.data, @value]).value
272
216
  end
273
217
  end
274
218
  end
@@ -284,14 +228,7 @@ module Arrow
284
228
  end
285
229
 
286
230
  def evaluate
287
- raw_array = @column.collect do |value|
288
- if value.nil?
289
- nil
290
- else
291
- @value > value
292
- end
293
- end
294
- BooleanArray.new(raw_array)
231
+ Function.find("less").execute([@column.data, @value]).value
295
232
  end
296
233
  end
297
234
 
@@ -306,14 +243,7 @@ module Arrow
306
243
  end
307
244
 
308
245
  def evaluate
309
- raw_array = @column.collect do |value|
310
- if value.nil?
311
- nil
312
- else
313
- @value >= value
314
- end
315
- end
316
- BooleanArray.new(raw_array)
246
+ Function.find("less_equal").execute([@column.data, @value]).value
317
247
  end
318
248
  end
319
249
 
@@ -328,14 +258,7 @@ module Arrow
328
258
  end
329
259
 
330
260
  def evaluate
331
- raw_array = @column.collect do |value|
332
- if value.nil?
333
- nil
334
- else
335
- @value < value
336
- end
337
- end
338
- BooleanArray.new(raw_array)
261
+ Function.find("greater").execute([@column.data, @value]).value
339
262
  end
340
263
  end
341
264
 
@@ -350,14 +273,7 @@ module Arrow
350
273
  end
351
274
 
352
275
  def evaluate
353
- raw_array = @column.collect do |value|
354
- if value.nil?
355
- nil
356
- else
357
- @value <= value
358
- end
359
- end
360
- BooleanArray.new(raw_array)
276
+ Function.find("greater_equal").execute([@column.data, @value]).value
361
277
  end
362
278
  end
363
279
 
@@ -372,18 +288,10 @@ module Arrow
372
288
  end
373
289
 
374
290
  def evaluate
375
- values_index = {}
376
- @values.each do |value|
377
- values_index[value] = true
378
- end
379
- raw_array = @column.collect do |value|
380
- if value.nil?
381
- nil
382
- else
383
- values_index.key?(value)
384
- end
385
- end
386
- BooleanArray.new(raw_array)
291
+ values = @values
292
+ values = Array.new(values) unless values.is_a?(Array)
293
+ options = SetLookupOptions.new(values)
294
+ Function.find("is_in").execute([@column.data], options).value
387
295
  end
388
296
  end
389
297
 
@@ -398,18 +306,11 @@ module Arrow
398
306
  end
399
307
 
400
308
  def evaluate
401
- values_index = {}
402
- @values.each do |value|
403
- values_index[value] = true
404
- end
405
- raw_array = @column.collect do |value|
406
- if value.nil?
407
- nil
408
- else
409
- not values_index.key?(value)
410
- end
411
- end
412
- BooleanArray.new(raw_array)
309
+ values = @values
310
+ values = Array.new(values) unless values.is_a?(Array)
311
+ options = SetLookupOptions.new(values)
312
+ booleans = Function.find("is_in").execute([@column.data], options).value
313
+ Function.find("invert").execute([booleans]).value
413
314
  end
414
315
  end
415
316
 
@@ -0,0 +1,32 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
module Arrow
  class SourceNodeOptions
    # @api private
    #
    # Wraps a record batch reader, a record batch or a table in a new
    # options object. Any other value yields nil.
    def self.try_convert(value)
      case value
      when RecordBatchReader, RecordBatch, Table then new(value)
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
module Arrow
  # Dictionary array builder whose values are built with a
  # StringArrayBuilder.
  class StringDictionaryArrayBuilder
    include SymbolValuesAppendable

    # Hook called by SymbolValuesAppendable#append_values to obtain the
    # builder for the values being appended.
    private def create_values_array_builder
      StringArrayBuilder.new
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
module Arrow
  # Mixin for builders that accept Symbols wherever a string value is
  # expected. The including class must provide
  # #create_values_array_builder and #append_array.
  module SymbolValuesAppendable
    # Appends +values+ after turning every Symbol into its String form.
    # Other values are passed through untouched. +is_valids+ is handed
    # to the values builder unchanged.
    def append_values(values, is_valids=nil)
      values_builder = create_values_array_builder
      normalized = values.map do |value|
        value.is_a?(Symbol) ? value.to_s : value
      end
      values_builder.append_values(normalized, is_valids)
      append_array(values_builder.finish)
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
module Arrow
  class TableConcatenateOptions
    class << self
      # @api private
      #
      # Builds an options object from a Hash by calling the "<key>="
      # writer for each entry with that entry's value.
      #
      # Returns the new options object for a Hash and nil for any other
      # value.
      def try_convert(value)
        case value
        when Hash
          options = new
          value.each do |k, v|
            # Pass the entry's own value. Passing the whole Hash here
            # assigned the Hash itself to every option.
            options.public_send("#{k}=", v)
          end
          options
        else
          nil
        end
      end
    end
  end
end
@@ -18,6 +18,125 @@
18
18
  module Arrow
19
19
  # TODO: Most of this code should be implemented in Apache Arrow C++.
20
20
  class TableFormatter
21
# @private
#
# Formats the values of one column and computes the padded column name
# whose width matches the formatted sample values.
class ColumnFormatter
  attr_reader :column
  attr_reader :head_values
  attr_reader :tail_values
  attr_reader :sample_values

  # Minimum width of a formatted Float and the width of float/double
  # column names.
  FLOAT_N_DIGITS = 10
  # Text shown in place of a nil value.
  FORMATTED_NULL = "(null)"

  # +column+ must respond to #name and #data_type. +head_values+ and
  # +tail_values+ are the values sampled from both ends of the column;
  # together they drive the width computation.
  def initialize(column, head_values, tail_values)
    @column = column
    @head_values = head_values
    @tail_values = tail_values
    @sample_values = head_values + tail_values
    # Cache of struct field => value width, filled on first use.
    @field_value_widths = {}
  end

  def data_type
    @data_type ||= @column.data_type
  end

  def name
    @name ||= @column.name
  end

  # The column name padded to the width the sampled values need.
  def aligned_name
    @aligned_name ||= format_aligned_name(name, data_type, @sample_values)
  end

  # Formats +value+ as a String padded to at least +width+ characters.
  #
  # Integer, Float and nil are right-aligned; Hash and any other value
  # are left-aligned. Time values are returned in ISO 8601 form and
  # ignore +width+. A Hash is rendered field by field using the fields
  # of this column's data type.
  def format_value(value, width=0)
    case value
    when ::Time
      value.iso8601
    when Float
      "%*f" % [[width, FLOAT_N_DIGITS].max, value]
    when Integer
      "%*d" % [width, value]
    when Hash
      formatted_values = data_type.fields.collect do |field|
        field_name = field.name
        field_value_width = compute_field_value_width(field, @sample_values)
        formatted_name = format_value(field_name, 0)
        formatted_value = format_value(value[field_name], field_value_width)
        "#{formatted_name}: #{formatted_value}"
      end
      formatted = "{#{formatted_values.join(", ")}}"
      "%-*s" % [width, formatted]
    when nil
      "%*s" % [width, FORMATTED_NULL]
    else
      "%-*s" % [width, value.to_s]
    end
  end

  private
  # Width needed for the values of +field+ across +sample_values+.
  # A nil sample is treated as a record without any field value. The
  # result is cached per field.
  def compute_field_value_width(field, sample_values)
    unless @field_value_widths.key?(field)
      field_name = field.name
      field_sample_values = sample_values.collect do |v|
        (v || {})[field_name]
      end
      # Aligning an empty name yields pure padding, so its size is the
      # width of the values.
      field_aligned_name = format_aligned_name("",
                                               field.data_type,
                                               field_sample_values)
      @field_value_widths[field] = field_aligned_name.size
    end
    @field_value_widths[field]
  end

  # Right-aligns +name+ to the width that values of +data_type+ taken
  # from +sample_values+ occupy. Data types without a known width return
  # +name+ unchanged.
  def format_aligned_name(name, data_type, sample_values)
    case data_type
    when TimestampDataType
      "%*s" % [::Time.now.iso8601.size, name]
    when IntegerDataType
      present_values = sample_values.compact
      have_null = present_values.size != sample_values.size
      have_negative = present_values.any?(&:negative?)
      max_value = present_values.collect(&:abs).max
      # Count digits on the integer itself. Math.log10 goes through
      # Float and over-counts for large values just below a power of
      # ten (e.g. 999_999_999_999_999).
      width = max_value.nil? ? 0 : max_value.to_s.size
      width += 1 if have_negative # Need "-"
      width = [width, FORMATTED_NULL.size].max if have_null
      "%*s" % [width, name]
    when FloatDataType, DoubleDataType
      "%*s" % [FLOAT_N_DIGITS, name]
    when StructDataType
      field_widths = data_type.fields.collect do |field|
        field_value_width = compute_field_value_width(field, sample_values)
        field.name.size + ": ".size + field_value_width
      end
      width = "{}".size + field_widths.sum
      if field_widths.size > 0
        width += (", ".size * (field_widths.size - 1))
      end
      "%*s" % [width, name]
    else
      name
    end
  end
end
139
+
21
140
  def initialize(table, options={})
22
141
  @table = table
23
142
  @options = options
@@ -25,38 +144,43 @@ module Arrow
25
144
 
26
145
  def format
27
146
  text = ""
28
- columns = @table.columns
29
- format_header(text, columns)
30
-
31
147
  n_rows = @table.n_rows
32
- return text if n_rows.zero?
33
-
34
148
  border = @options[:border] || 10
35
- n_digits = (Math.log10(n_rows) + 1).truncate
149
+
36
150
  head_limit = [border, n_rows].min
37
- head_column_values = columns.collect do |column|
38
- column.each.take(head_limit)
151
+
152
+ tail_start = [border, n_rows - border].max
153
+ tail_limit = n_rows - tail_start
154
+
155
+ column_formatters = @table.columns.collect do |column|
156
+ head_values = column.each.take(head_limit)
157
+ if tail_limit > 0
158
+ tail_values = column.reverse_each.take(tail_limit).reverse
159
+ else
160
+ tail_values = []
161
+ end
162
+ ColumnFormatter.new(column, head_values, tail_values)
39
163
  end
164
+
165
+ format_header(text, column_formatters)
166
+ return text if n_rows.zero?
167
+
168
+ n_digits = (Math.log10(n_rows) + 1).truncate
40
169
  format_rows(text,
41
- columns,
42
- head_column_values.transpose,
170
+ column_formatters,
171
+ column_formatters.collect(&:head_values).transpose,
43
172
  n_digits,
44
173
  0)
45
174
  return text if n_rows <= border
46
175
 
47
- tail_start = [border, n_rows - border].max
48
- tail_limit = n_rows - tail_start
49
- tail_column_values = columns.collect do |column|
50
- column.reverse_each.take(tail_limit).reverse
51
- end
52
176
 
53
177
  if head_limit != tail_start
54
178
  format_ellipsis(text)
55
179
  end
56
180
 
57
181
  format_rows(text,
58
- columns,
59
- tail_column_values.transpose,
182
+ column_formatters,
183
+ column_formatters.collect(&:tail_values).transpose,
60
184
  n_digits,
61
185
  tail_start)
62
186
 
@@ -22,12 +22,14 @@ module Arrow
22
22
  def format_header(text, columns)
23
23
  end
24
24
 
25
- def format_rows(text, columns, rows, n_digits, start_offset)
25
+ def format_rows(text, column_formatters, rows, n_digits, start_offset)
26
26
  rows.each_with_index do |row, nth_row|
27
27
  text << ("=" * 20 + " #{start_offset + nth_row} " + "=" * 20 + "\n")
28
28
  row.each_with_index do |column_value, nth_column|
29
- column = columns[nth_column]
30
- text << "#{column.name}: #{column_value}\n"
29
+ column_formatter = column_formatters[nth_column]
30
+ formatted_name = column_formatter.name
31
+ formatted_value = column_formatter.format_value(column_value)
32
+ text << "#{formatted_name}: #{formatted_value}\n"
31
33
  end
32
34
  end
33
35
  end