red-arrow 0.14.1 → 0.15.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of red-arrow might be problematic. Click here for more details.

Files changed (61)
  1. checksums.yaml +4 -4
  2. data/ext/arrow/arrow.cpp +34 -0
  3. data/ext/arrow/converters.cpp +42 -0
  4. data/ext/arrow/converters.hpp +626 -0
  5. data/ext/arrow/raw-records.cpp +6 -625
  6. data/ext/arrow/red-arrow.hpp +37 -3
  7. data/ext/arrow/values.cpp +154 -0
  8. data/lib/arrow/array-builder.rb +24 -1
  9. data/lib/arrow/array.rb +9 -0
  10. data/lib/arrow/chunked-array.rb +5 -0
  11. data/lib/arrow/column-containable.rb +48 -0
  12. data/lib/arrow/column.rb +36 -10
  13. data/lib/arrow/csv-loader.rb +2 -2
  14. data/lib/arrow/data-type.rb +22 -5
  15. data/lib/arrow/date64-array-builder.rb +2 -2
  16. data/lib/arrow/date64-array.rb +1 -1
  17. data/lib/arrow/decimal128-array.rb +24 -0
  18. data/lib/arrow/field-containable.rb +3 -0
  19. data/lib/arrow/group.rb +10 -13
  20. data/lib/arrow/loader.rb +20 -1
  21. data/lib/arrow/record-batch.rb +6 -4
  22. data/lib/arrow/record-containable.rb +0 -35
  23. data/lib/arrow/record.rb +12 -9
  24. data/lib/arrow/slicer.rb +2 -2
  25. data/lib/arrow/struct-array-builder.rb +1 -7
  26. data/lib/arrow/struct-array.rb +13 -11
  27. data/lib/arrow/table-loader.rb +3 -9
  28. data/lib/arrow/table-table-formatter.rb +2 -2
  29. data/lib/arrow/table.rb +61 -24
  30. data/lib/arrow/time.rb +159 -0
  31. data/lib/arrow/time32-array-builder.rb +49 -0
  32. data/lib/arrow/time32-array.rb +28 -0
  33. data/lib/arrow/time64-array-builder.rb +49 -0
  34. data/lib/arrow/time64-array.rb +28 -0
  35. data/lib/arrow/timestamp-array-builder.rb +20 -1
  36. data/lib/arrow/timestamp-array.rb +10 -22
  37. data/lib/arrow/version.rb +1 -1
  38. data/red-arrow.gemspec +1 -1
  39. data/test/raw-records/test-basic-arrays.rb +16 -8
  40. data/test/raw-records/test-dense-union-array.rb +12 -5
  41. data/test/raw-records/test-list-array.rb +21 -9
  42. data/test/raw-records/test-sparse-union-array.rb +13 -5
  43. data/test/raw-records/test-struct-array.rb +11 -4
  44. data/test/test-column.rb +56 -31
  45. data/test/test-decimal128-array-builder.rb +11 -11
  46. data/test/test-decimal128-array.rb +4 -4
  47. data/test/test-slicer.rb +1 -3
  48. data/test/test-struct-array-builder.rb +4 -4
  49. data/test/test-struct-array.rb +4 -4
  50. data/test/test-table.rb +17 -8
  51. data/test/test-time.rb +288 -0
  52. data/test/test-time32-array.rb +81 -0
  53. data/test/test-time64-array.rb +81 -0
  54. data/test/values/test-basic-arrays.rb +284 -0
  55. data/test/values/test-dense-union-array.rb +487 -0
  56. data/test/values/test-list-array.rb +497 -0
  57. data/test/values/test-sparse-union-array.rb +477 -0
  58. data/test/values/test-struct-array.rb +452 -0
  59. metadata +78 -54
  60. data/lib/arrow/struct.rb +0 -79
  61. data/test/test-struct.rb +0 -81
@@ -19,11 +19,11 @@ module Arrow
19
19
  class Date64ArrayBuilder
20
20
  private
21
21
  def convert_to_arrow_value(value)
22
- if value.respond_to?(:to_time) and not value.is_a?(Time)
22
+ if value.respond_to?(:to_time) and not value.is_a?(::Time)
23
23
  value = value.to_time
24
24
  end
25
25
 
26
- if value.is_a?(Time)
26
+ if value.is_a?(::Time)
27
27
  value.to_i * 1_000 + value.usec / 1_000
28
28
  else
29
29
  value
@@ -23,7 +23,7 @@ module Arrow
23
23
 
24
24
  private
25
25
  def to_datetime(raw_value)
26
- Time.at(*raw_value.divmod(1_000)).to_datetime
26
+ ::Time.at(*raw_value.divmod(1_000)).to_datetime
27
27
  end
28
28
  end
29
29
  end
@@ -0,0 +1,24 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module Arrow
19
+ class Decimal128Array
20
+ def get_value(i)
21
+ BigDecimal(format_value(i))
22
+ end
23
+ end
24
+ end
@@ -24,6 +24,9 @@ module Arrow
24
24
  get_field_by_name(name)
25
25
  when Integer
26
26
  index = name_or_index
27
+ raise if index < 0
28
+ index += n_fields if index < 0
29
+ return nil if index < 0 or index >= n_fields
27
30
  get_field(index)
28
31
  else
29
32
  message = "field name or index must be String, Symbol or Integer"
@@ -152,24 +152,21 @@ module Arrow
152
152
  end
153
153
 
154
154
  grouped_key_arrays_raw = grouped_keys.transpose
155
- columns = @keys.collect.with_index do |key, i|
155
+ fields = []
156
+ arrays = []
157
+ @keys.each_with_index do |key, i|
156
158
  key_column = @table[key]
157
- key_column_array_class = key_column.data.chunks.first.class
158
- if key_column_array_class == TimestampArray
159
- builder = TimestampArrayBuilder.new(key_column.data_type)
160
- key_column_array = builder.build(grouped_key_arrays_raw[i])
161
- else
162
- key_column_array =
163
- key_column_array_class.new(grouped_key_arrays_raw[i])
164
- end
165
- Column.new(key_column.field, key_column_array)
159
+ key_column_array_raw = grouped_key_arrays_raw[i]
160
+ key_column_array = key_column.data_type.build_array(key_column_array_raw)
161
+ fields << key_column.field
162
+ arrays << key_column_array
166
163
  end
167
164
  target_columns.each_with_index do |column, i|
168
165
  array = ArrayBuilder.build(aggregated_arrays_raw[i])
169
- field = Field.new(column.name, array.value_data_type)
170
- columns << Column.new(field, array)
166
+ arrays << array
167
+ fields << Field.new(column.field.name, array.value_data_type)
171
168
  end
172
- Table.new(columns)
169
+ Table.new(fields, arrays)
173
170
  end
174
171
  end
175
172
  end
@@ -46,6 +46,7 @@ module Arrow
46
46
  require "arrow/date64-array"
47
47
  require "arrow/date64-array-builder"
48
48
  require "arrow/decimal128"
49
+ require "arrow/decimal128-array"
49
50
  require "arrow/decimal128-array-builder"
50
51
  require "arrow/decimal128-data-type"
51
52
  require "arrow/dense-union-data-type"
@@ -75,7 +76,12 @@ module Arrow
75
76
  require "arrow/table-loader"
76
77
  require "arrow/table-saver"
77
78
  require "arrow/tensor"
79
+ require "arrow/time"
80
+ require "arrow/time32-array"
81
+ require "arrow/time32-array-builder"
78
82
  require "arrow/time32-data-type"
83
+ require "arrow/time64-array"
84
+ require "arrow/time64-array-builder"
79
85
  require "arrow/time64-data-type"
80
86
  require "arrow/timestamp-array"
81
87
  require "arrow/timestamp-array-builder"
@@ -97,6 +103,14 @@ module Arrow
97
103
  end
98
104
 
99
105
  def load_method_info(info, klass, method_name)
106
+ case klass.name
107
+ when /Array\z/
108
+ case method_name
109
+ when "values"
110
+ method_name = "values_raw"
111
+ end
112
+ end
113
+
100
114
  case klass.name
101
115
  when /Builder\z/
102
116
  case method_name
@@ -113,7 +127,12 @@ module Arrow
113
127
  method_name = "get_value"
114
128
  end
115
129
  super(info, klass, method_name)
116
- when "Arrow::TimestampArray", "Arrow::Date32Array", "Arrow::Date64Array"
130
+ when "Arrow::Date32Array",
131
+ "Arrow::Date64Array",
132
+ "Arrow::Decimal128Array",
133
+ "Arrow::Time32Array",
134
+ "Arrow::Time64Array",
135
+ "Arrow::TimestampArray"
117
136
  case method_name
118
137
  when "get_value"
119
138
  method_name = "get_raw_value"
@@ -15,10 +15,12 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
+ require "arrow/column-containable"
18
19
  require "arrow/record-containable"
19
20
 
20
21
  module Arrow
21
22
  class RecordBatch
23
+ include ColumnContainable
22
24
  include RecordContainable
23
25
  include Enumerable
24
26
 
@@ -40,10 +42,10 @@ module Arrow
40
42
 
41
43
  alias_method :each, :each_record
42
44
 
43
- alias_method :columns_raw, :columns
44
- def columns
45
- @columns ||= columns_raw
46
- end
45
+ alias_method :size, :n_rows
46
+ alias_method :length, :n_rows
47
+
48
+ alias_method :[], :find_column
47
49
 
48
50
  # Converts the record batch to {Arrow::Table}.
49
51
  #
@@ -17,12 +17,6 @@
17
17
 
18
18
  module Arrow
19
19
  module RecordContainable
20
- def each_column(&block)
21
- return to_enum(__method__) unless block_given?
22
-
23
- columns.each(&block)
24
- end
25
-
26
20
  def each_record(reuse_record: false)
27
21
  unless block_given?
28
22
  return to_enum(__method__, reuse_record: reuse_record)
@@ -40,34 +34,5 @@ module Arrow
40
34
  end
41
35
  end
42
36
  end
43
-
44
- def find_column(name_or_index)
45
- case name_or_index
46
- when String, Symbol
47
- name = name_or_index.to_s
48
- index = resolve_column_name(name)
49
- return nil if index.nil?
50
- columns[index]
51
- when Integer
52
- index = name_or_index
53
- columns[index]
54
- else
55
- message = "column name or index must be String, Symbol or Integer"
56
- raise ArgumentError, message
57
- end
58
- end
59
-
60
- private
61
- def resolve_column_name(name)
62
- (@column_name_to_index ||= build_column_name_resolve_table)[name]
63
- end
64
-
65
- def build_column_name_resolve_table
66
- table = {}
67
- schema.fields.each_with_index do |field, i|
68
- table[field.name] = i
69
- end
70
- table
71
- end
72
37
  end
73
38
  end
@@ -17,38 +17,41 @@
17
17
 
18
18
  module Arrow
19
19
  class Record
20
+ attr_reader :container
20
21
  attr_accessor :index
21
- def initialize(record_container, index)
22
- @record_container = record_container
22
+ def initialize(container, index)
23
+ @container = container
23
24
  @index = index
24
25
  end
25
26
 
26
27
  def [](column_name_or_column_index)
27
- column = @record_container.find_column(column_name_or_column_index)
28
+ column = @container.find_column(column_name_or_column_index)
28
29
  return nil if column.nil?
29
30
  column[@index]
30
31
  end
31
32
 
32
- def columns
33
- @record_container.columns
33
+ def to_a
34
+ @container.columns.collect do |column|
35
+ column[@index]
36
+ end
34
37
  end
35
38
 
36
39
  def to_h
37
40
  attributes = {}
38
- @record_container.schema.fields.each_with_index do |field, i|
39
- attributes[field.name] = self[i]
41
+ @container.columns.each do |column|
42
+ attributes[column.name] = column[@index]
40
43
  end
41
44
  attributes
42
45
  end
43
46
 
44
47
  def respond_to_missing?(name, include_private)
45
- return true if @record_container.find_column(name)
48
+ return true if @container.find_column(name)
46
49
  super
47
50
  end
48
51
 
49
52
  def method_missing(name, *args, &block)
50
53
  if args.empty?
51
- column = @record_container.find_column(name)
54
+ column = @container.find_column(name)
52
55
  return column[@index] if column
53
56
  end
54
57
  super
@@ -253,9 +253,9 @@ module Arrow
253
253
  case @value
254
254
  when nil
255
255
  if @column.n_nulls.zero?
256
- raw_array = [true] * @column.length
256
+ raw_array = [true] * @column.n_rows
257
257
  else
258
- raw_array = @column.length.times.collect do |i|
258
+ raw_array = @column.n_rows.times.collect do |i|
259
259
  @column.valid?(i)
260
260
  end
261
261
  end
@@ -73,11 +73,6 @@ module Arrow
73
73
  value.each_with_index do |sub_value, i|
74
74
  self[i].append(sub_value)
75
75
  end
76
- when Arrow::Struct
77
- append_value_raw
78
- value.values.each_with_index do |sub_value, i|
79
- self[i].append(sub_value)
80
- end
81
76
  when Hash
82
77
  append_value_raw
83
78
  value.each do |name, sub_value|
@@ -85,8 +80,7 @@ module Arrow
85
80
  end
86
81
  else
87
82
  message =
88
- "struct value must be nil, Array, " +
89
- "Arrow::Struct or Hash: #{value.inspect}"
83
+ "struct value must be nil, Array or Hash: #{value.inspect}"
90
84
  raise ArgumentError, message
91
85
  end
92
86
  else
@@ -15,8 +15,6 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- require "arrow/struct"
19
-
20
18
  module Arrow
21
19
  class StructArray
22
20
  # @param i [Integer]
@@ -24,9 +22,13 @@ module Arrow
24
22
  #
25
23
  # You can use {Arrow::Array#[]} for convenient value access.
26
24
  #
27
- # @return [Arrow::Struct] The `i`-th value.
25
+ # @return [Hash] The `i`-th struct.
28
26
  def get_value(i)
29
- Struct.new(self, i)
27
+ value = {}
28
+ value_data_type.fields.zip(fields) do |field, field_array|
29
+ value[field.name] = field_array[i]
30
+ end
31
+ value
30
32
  end
31
33
 
32
34
  # @overload find_field(index)
@@ -45,20 +47,20 @@ module Arrow
45
47
  (@name_to_field ||= build_name_to_field)[name.to_s]
46
48
  else
47
49
  index = index_or_name
48
- cached_fields[index]
50
+ fields[index]
49
51
  end
50
52
  end
51
53
 
52
- private
53
- def cached_fields
54
- @fields ||= fields
54
+ alias_method :fields_raw, :fields
55
+ def fields
56
+ @fields ||= fields_raw
55
57
  end
56
58
 
59
+ private
57
60
  def build_name_to_field
58
61
  name_to_field = {}
59
- field_arrays = cached_fields
60
- value_data_type.fields.each_with_index do |field, i|
61
- name_to_field[field.name] = field_arrays[i]
62
+ value_data_type.fields.zip(fields) do |field, field_array|
63
+ name_to_field[field.name] = field_array
62
64
  end
63
65
  name_to_field
64
66
  end
@@ -88,17 +88,11 @@ module Arrow
88
88
 
89
89
  def load_raw(input, reader)
90
90
  schema = reader.schema
91
- chunked_arrays = []
91
+ record_batches = []
92
92
  reader.each do |record_batch|
93
- record_batch.columns.each_with_index do |array, i|
94
- chunked_array = (chunked_arrays[i] ||= [])
95
- chunked_array << array
96
- end
97
- end
98
- columns = schema.fields.collect.with_index do |field, i|
99
- Column.new(field, ChunkedArray.new(chunked_arrays[i]))
93
+ record_batches << record_batch
100
94
  end
101
- table = Table.new(schema, columns)
95
+ table = Table.new(schema, record_batches)
102
96
  table.instance_variable_set(:@input, input)
103
97
  table
104
98
  end
@@ -33,7 +33,7 @@ module Arrow
33
33
  def format_column_name(column)
34
34
  case column.data_type
35
35
  when TimestampDataType
36
- "%*s" % [Time.now.iso8601.size, column.name]
36
+ "%*s" % [::Time.now.iso8601.size, column.name]
37
37
  when FloatDataType, DoubleDataType
38
38
  "%*s" % [FLOAT_N_DIGITS, column.name]
39
39
  else
@@ -55,7 +55,7 @@ module Arrow
55
55
 
56
56
  def format_column_value(column, value)
57
57
  case value
58
- when Time
58
+ when ::Time
59
59
  value.iso8601
60
60
  when Float
61
61
  "%*f" % [[column.name.size, FLOAT_N_DIGITS].max, value]
@@ -15,11 +15,13 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
+ require "arrow/column-containable"
18
19
  require "arrow/group"
19
20
  require "arrow/record-containable"
20
21
 
21
22
  module Arrow
22
23
  class Table
24
+ include ColumnContainable
23
25
  include RecordContainable
24
26
 
25
27
  class << self
@@ -74,6 +76,24 @@ module Arrow
74
76
  # Arrow::Table.new("count" => Arrow::ChunkedArray.new(count_chunks),
75
77
  # "visible" => Arrow::ChunkedArray.new(visible_chunks))
76
78
  #
79
+ # @overload initialize(raw_table)
80
+ #
81
+ # @param raw_table [Hash<String, ::Array>]
82
+ # The pairs of column name and values of the table. Column values are given as an
83
+ # `Array`.
84
+ #
85
+ # @example Create a table from column name and values
86
+ # count_chunks = [
87
+ # Arrow::UInt32Array.new([0, 2]),
88
+ # Arrow::UInt32Array.new([nil, 4]),
89
+ # ]
90
+ # visible_chunks = [
91
+ # Arrow::BooleanArray.new([true]),
92
+ # Arrow::BooleanArray.new([nil, nil, false]),
93
+ # ]
94
+ # Arrow::Table.new("count" => [0, 2, nil, 4],
95
+ # "visible" => [true, nil, nil, false])
96
+ #
77
97
  # @overload initialize(schema, columns)
78
98
  #
79
99
  # @param schema [Arrow::Schema] The schema of the table.
@@ -152,17 +172,18 @@ module Arrow
152
172
  case n_args
153
173
  when 1
154
174
  if args[0][0].is_a?(Column)
155
- values = args[0]
156
- fields = values.collect(&:field)
175
+ columns = args[0]
176
+ fields = columns.collect(&:field)
177
+ values = columns.collect(&:data)
157
178
  schema = Schema.new(fields)
158
179
  else
159
180
  raw_table = args[0]
160
181
  fields = []
161
182
  values = []
162
183
  raw_table.each do |name, array|
163
- field = Field.new(name.to_s, array.value_data_type)
164
- fields << field
165
- values << Column.new(field, array)
184
+ array = ArrayBuilder.build(array) if array.is_a?(::Array)
185
+ fields << Field.new(name.to_s, array.value_data_type)
186
+ values << array
166
187
  end
167
188
  schema = Schema.new(fields)
168
189
  end
@@ -170,20 +191,19 @@ module Arrow
170
191
  schema = args[0]
171
192
  schema = Schema.new(schema) unless schema.is_a?(Schema)
172
193
  values = args[1]
173
- if values[0].is_a?(::Array)
194
+ case values[0]
195
+ when ::Array
174
196
  values = [RecordBatch.new(schema, values)]
197
+ when Column
198
+ values = values.collect(&:data)
175
199
  end
176
200
  else
177
- message = "wrong number of arguments (given, #{n_args}, expected 1..2)"
201
+ message = "wrong number of arguments (given #{n_args}, expected 1..2)"
178
202
  raise ArgumentError, message
179
203
  end
180
204
  initialize_raw(schema, values)
181
205
  end
182
206
 
183
- def columns
184
- @columns ||= n_columns.times.collect {|i| get_column(i)}
185
- end
186
-
187
207
  def each_record_batch
188
208
  return to_enum(__method__) unless block_given?
189
209
 
@@ -338,7 +358,7 @@ module Arrow
338
358
  other.each do |name, value|
339
359
  name = name.to_s
340
360
  if value
341
- added_columns[name] = ensure_column(name, value)
361
+ added_columns[name] = ensure_raw_column(name, value)
342
362
  else
343
363
  removed_columns[name] = true
344
364
  end
@@ -346,7 +366,8 @@ module Arrow
346
366
  when Table
347
367
  added_columns = {}
348
368
  other.columns.each do |column|
349
- added_columns[column.name] = column
369
+ name = column.name
370
+ added_columns[name] = ensure_raw_column(name, column)
350
371
  end
351
372
  else
352
373
  message = "merge target must be Hash or Arrow::Table: " +
@@ -363,15 +384,18 @@ module Arrow
363
384
  next
364
385
  end
365
386
  next if removed_columns.key?(column_name)
366
- new_columns << column
387
+ new_columns << ensure_raw_column(column_name, column)
367
388
  end
368
389
  added_columns.each do |name, new_column|
369
390
  new_columns << new_column
370
391
  end
371
- new_fields = new_columns.collect do |new_column|
372
- new_column.field
392
+ new_fields = []
393
+ new_arrays = []
394
+ new_columns.each do |new_column|
395
+ new_fields << new_column[:field]
396
+ new_arrays << new_column[:data]
373
397
  end
374
- self.class.new(Schema.new(new_fields), new_columns)
398
+ self.class.new(new_fields, new_arrays)
375
399
  end
376
400
 
377
401
  alias_method :remove_column_raw, :remove_column
@@ -447,10 +471,10 @@ module Arrow
447
471
  end
448
472
 
449
473
  def pack
450
- packed_columns = columns.collect do |column|
451
- column.pack
474
+ packed_arrays = columns.collect do |column|
475
+ column.data.pack
452
476
  end
453
- self.class.new(schema, packed_columns)
477
+ self.class.new(schema, packed_arrays)
454
478
  end
455
479
 
456
480
  alias_method :to_s_raw, :to_s
@@ -524,13 +548,26 @@ module Arrow
524
548
  end
525
549
  end
526
550
 
527
- def ensure_column(name, data)
551
+ def ensure_raw_column(name, data)
528
552
  case data
529
553
  when Array
530
- field = Field.new(name, data.value_data_type)
531
- Column.new(field, data)
554
+ {
555
+ field: Field.new(name, data.value_data_type),
556
+ data: ChunkedArray.new([data]),
557
+ }
558
+ when ChunkedArray
559
+ {
560
+ field: Field.new(name, data.value_data_type),
561
+ data: data,
562
+ }
532
563
  when Column
533
- data
564
+ column = data
565
+ data = column.data
566
+ data = ChunkedArray.new([data]) unless data.is_a?(ChunkedArray)
567
+ {
568
+ field: column.field,
569
+ data: data,
570
+ }
534
571
  else
535
572
  message = "column must be Arrow::Array or Arrow::Column: " +
536
573
  "<#{name}>: <#{data.inspect}>: #{inspect}"