red-arrow 0.14.1 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of red-arrow might be problematic. Click here for more details.

Files changed (61)
  1. checksums.yaml +4 -4
  2. data/ext/arrow/arrow.cpp +34 -0
  3. data/ext/arrow/converters.cpp +42 -0
  4. data/ext/arrow/converters.hpp +626 -0
  5. data/ext/arrow/raw-records.cpp +6 -625
  6. data/ext/arrow/red-arrow.hpp +37 -3
  7. data/ext/arrow/values.cpp +154 -0
  8. data/lib/arrow/array-builder.rb +24 -1
  9. data/lib/arrow/array.rb +9 -0
  10. data/lib/arrow/chunked-array.rb +5 -0
  11. data/lib/arrow/column-containable.rb +48 -0
  12. data/lib/arrow/column.rb +36 -10
  13. data/lib/arrow/csv-loader.rb +2 -2
  14. data/lib/arrow/data-type.rb +22 -5
  15. data/lib/arrow/date64-array-builder.rb +2 -2
  16. data/lib/arrow/date64-array.rb +1 -1
  17. data/lib/arrow/decimal128-array.rb +24 -0
  18. data/lib/arrow/field-containable.rb +3 -0
  19. data/lib/arrow/group.rb +10 -13
  20. data/lib/arrow/loader.rb +20 -1
  21. data/lib/arrow/record-batch.rb +6 -4
  22. data/lib/arrow/record-containable.rb +0 -35
  23. data/lib/arrow/record.rb +12 -9
  24. data/lib/arrow/slicer.rb +2 -2
  25. data/lib/arrow/struct-array-builder.rb +1 -7
  26. data/lib/arrow/struct-array.rb +13 -11
  27. data/lib/arrow/table-loader.rb +3 -9
  28. data/lib/arrow/table-table-formatter.rb +2 -2
  29. data/lib/arrow/table.rb +61 -24
  30. data/lib/arrow/time.rb +159 -0
  31. data/lib/arrow/time32-array-builder.rb +49 -0
  32. data/lib/arrow/time32-array.rb +28 -0
  33. data/lib/arrow/time64-array-builder.rb +49 -0
  34. data/lib/arrow/time64-array.rb +28 -0
  35. data/lib/arrow/timestamp-array-builder.rb +20 -1
  36. data/lib/arrow/timestamp-array.rb +10 -22
  37. data/lib/arrow/version.rb +1 -1
  38. data/red-arrow.gemspec +1 -1
  39. data/test/raw-records/test-basic-arrays.rb +16 -8
  40. data/test/raw-records/test-dense-union-array.rb +12 -5
  41. data/test/raw-records/test-list-array.rb +21 -9
  42. data/test/raw-records/test-sparse-union-array.rb +13 -5
  43. data/test/raw-records/test-struct-array.rb +11 -4
  44. data/test/test-column.rb +56 -31
  45. data/test/test-decimal128-array-builder.rb +11 -11
  46. data/test/test-decimal128-array.rb +4 -4
  47. data/test/test-slicer.rb +1 -3
  48. data/test/test-struct-array-builder.rb +4 -4
  49. data/test/test-struct-array.rb +4 -4
  50. data/test/test-table.rb +17 -8
  51. data/test/test-time.rb +288 -0
  52. data/test/test-time32-array.rb +81 -0
  53. data/test/test-time64-array.rb +81 -0
  54. data/test/values/test-basic-arrays.rb +284 -0
  55. data/test/values/test-dense-union-array.rb +487 -0
  56. data/test/values/test-list-array.rb +497 -0
  57. data/test/values/test-sparse-union-array.rb +477 -0
  58. data/test/values/test-struct-array.rb +452 -0
  59. metadata +78 -54
  60. data/lib/arrow/struct.rb +0 -79
  61. data/test/test-struct.rb +0 -81
@@ -19,11 +19,11 @@ module Arrow
19
19
  class Date64ArrayBuilder
20
20
  private
21
21
  def convert_to_arrow_value(value)
22
- if value.respond_to?(:to_time) and not value.is_a?(Time)
22
+ if value.respond_to?(:to_time) and not value.is_a?(::Time)
23
23
  value = value.to_time
24
24
  end
25
25
 
26
- if value.is_a?(Time)
26
+ if value.is_a?(::Time)
27
27
  value.to_i * 1_000 + value.usec / 1_000
28
28
  else
29
29
  value
@@ -23,7 +23,7 @@ module Arrow
23
23
 
24
24
  private
25
25
  def to_datetime(raw_value)
26
- Time.at(*raw_value.divmod(1_000)).to_datetime
26
+ ::Time.at(*raw_value.divmod(1_000)).to_datetime
27
27
  end
28
28
  end
29
29
  end
@@ -0,0 +1,24 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module Arrow
19
+ class Decimal128Array
20
+ def get_value(i)
21
+ BigDecimal(format_value(i))
22
+ end
23
+ end
24
+ end
@@ -24,6 +24,9 @@ module Arrow
24
24
  get_field_by_name(name)
25
25
  when Integer
26
26
  index = name_or_index
27
+ raise if index < 0
28
+ index += n_fields if index < 0
29
+ return nil if index < 0 or index >= n_fields
27
30
  get_field(index)
28
31
  else
29
32
  message = "field name or index must be String, Symbol or Integer"
@@ -152,24 +152,21 @@ module Arrow
152
152
  end
153
153
 
154
154
  grouped_key_arrays_raw = grouped_keys.transpose
155
- columns = @keys.collect.with_index do |key, i|
155
+ fields = []
156
+ arrays = []
157
+ @keys.each_with_index do |key, i|
156
158
  key_column = @table[key]
157
- key_column_array_class = key_column.data.chunks.first.class
158
- if key_column_array_class == TimestampArray
159
- builder = TimestampArrayBuilder.new(key_column.data_type)
160
- key_column_array = builder.build(grouped_key_arrays_raw[i])
161
- else
162
- key_column_array =
163
- key_column_array_class.new(grouped_key_arrays_raw[i])
164
- end
165
- Column.new(key_column.field, key_column_array)
159
+ key_column_array_raw = grouped_key_arrays_raw[i]
160
+ key_column_array = key_column.data_type.build_array(key_column_array_raw)
161
+ fields << key_column.field
162
+ arrays << key_column_array
166
163
  end
167
164
  target_columns.each_with_index do |column, i|
168
165
  array = ArrayBuilder.build(aggregated_arrays_raw[i])
169
- field = Field.new(column.name, array.value_data_type)
170
- columns << Column.new(field, array)
166
+ arrays << array
167
+ fields << Field.new(column.field.name, array.value_data_type)
171
168
  end
172
- Table.new(columns)
169
+ Table.new(fields, arrays)
173
170
  end
174
171
  end
175
172
  end
@@ -46,6 +46,7 @@ module Arrow
46
46
  require "arrow/date64-array"
47
47
  require "arrow/date64-array-builder"
48
48
  require "arrow/decimal128"
49
+ require "arrow/decimal128-array"
49
50
  require "arrow/decimal128-array-builder"
50
51
  require "arrow/decimal128-data-type"
51
52
  require "arrow/dense-union-data-type"
@@ -75,7 +76,12 @@ module Arrow
75
76
  require "arrow/table-loader"
76
77
  require "arrow/table-saver"
77
78
  require "arrow/tensor"
79
+ require "arrow/time"
80
+ require "arrow/time32-array"
81
+ require "arrow/time32-array-builder"
78
82
  require "arrow/time32-data-type"
83
+ require "arrow/time64-array"
84
+ require "arrow/time64-array-builder"
79
85
  require "arrow/time64-data-type"
80
86
  require "arrow/timestamp-array"
81
87
  require "arrow/timestamp-array-builder"
@@ -97,6 +103,14 @@ module Arrow
97
103
  end
98
104
 
99
105
  def load_method_info(info, klass, method_name)
106
+ case klass.name
107
+ when /Array\z/
108
+ case method_name
109
+ when "values"
110
+ method_name = "values_raw"
111
+ end
112
+ end
113
+
100
114
  case klass.name
101
115
  when /Builder\z/
102
116
  case method_name
@@ -113,7 +127,12 @@ module Arrow
113
127
  method_name = "get_value"
114
128
  end
115
129
  super(info, klass, method_name)
116
- when "Arrow::TimestampArray", "Arrow::Date32Array", "Arrow::Date64Array"
130
+ when "Arrow::Date32Array",
131
+ "Arrow::Date64Array",
132
+ "Arrow::Decimal128Array",
133
+ "Arrow::Time32Array",
134
+ "Arrow::Time64Array",
135
+ "Arrow::TimestampArray"
117
136
  case method_name
118
137
  when "get_value"
119
138
  method_name = "get_raw_value"
@@ -15,10 +15,12 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
+ require "arrow/column-containable"
18
19
  require "arrow/record-containable"
19
20
 
20
21
  module Arrow
21
22
  class RecordBatch
23
+ include ColumnContainable
22
24
  include RecordContainable
23
25
  include Enumerable
24
26
 
@@ -40,10 +42,10 @@ module Arrow
40
42
 
41
43
  alias_method :each, :each_record
42
44
 
43
- alias_method :columns_raw, :columns
44
- def columns
45
- @columns ||= columns_raw
46
- end
45
+ alias_method :size, :n_rows
46
+ alias_method :length, :n_rows
47
+
48
+ alias_method :[], :find_column
47
49
 
48
50
  # Converts the record batch to {Arrow::Table}.
49
51
  #
@@ -17,12 +17,6 @@
17
17
 
18
18
  module Arrow
19
19
  module RecordContainable
20
- def each_column(&block)
21
- return to_enum(__method__) unless block_given?
22
-
23
- columns.each(&block)
24
- end
25
-
26
20
  def each_record(reuse_record: false)
27
21
  unless block_given?
28
22
  return to_enum(__method__, reuse_record: reuse_record)
@@ -40,34 +34,5 @@ module Arrow
40
34
  end
41
35
  end
42
36
  end
43
-
44
- def find_column(name_or_index)
45
- case name_or_index
46
- when String, Symbol
47
- name = name_or_index.to_s
48
- index = resolve_column_name(name)
49
- return nil if index.nil?
50
- columns[index]
51
- when Integer
52
- index = name_or_index
53
- columns[index]
54
- else
55
- message = "column name or index must be String, Symbol or Integer"
56
- raise ArgumentError, message
57
- end
58
- end
59
-
60
- private
61
- def resolve_column_name(name)
62
- (@column_name_to_index ||= build_column_name_resolve_table)[name]
63
- end
64
-
65
- def build_column_name_resolve_table
66
- table = {}
67
- schema.fields.each_with_index do |field, i|
68
- table[field.name] = i
69
- end
70
- table
71
- end
72
37
  end
73
38
  end
@@ -17,38 +17,41 @@
17
17
 
18
18
  module Arrow
19
19
  class Record
20
+ attr_reader :container
20
21
  attr_accessor :index
21
- def initialize(record_container, index)
22
- @record_container = record_container
22
+ def initialize(container, index)
23
+ @container = container
23
24
  @index = index
24
25
  end
25
26
 
26
27
  def [](column_name_or_column_index)
27
- column = @record_container.find_column(column_name_or_column_index)
28
+ column = @container.find_column(column_name_or_column_index)
28
29
  return nil if column.nil?
29
30
  column[@index]
30
31
  end
31
32
 
32
- def columns
33
- @record_container.columns
33
+ def to_a
34
+ @container.columns.collect do |column|
35
+ column[@index]
36
+ end
34
37
  end
35
38
 
36
39
  def to_h
37
40
  attributes = {}
38
- @record_container.schema.fields.each_with_index do |field, i|
39
- attributes[field.name] = self[i]
41
+ @container.columns.each do |column|
42
+ attributes[column.name] = column[@index]
40
43
  end
41
44
  attributes
42
45
  end
43
46
 
44
47
  def respond_to_missing?(name, include_private)
45
- return true if @record_container.find_column(name)
48
+ return true if @container.find_column(name)
46
49
  super
47
50
  end
48
51
 
49
52
  def method_missing(name, *args, &block)
50
53
  if args.empty?
51
- column = @record_container.find_column(name)
54
+ column = @container.find_column(name)
52
55
  return column[@index] if column
53
56
  end
54
57
  super
@@ -253,9 +253,9 @@ module Arrow
253
253
  case @value
254
254
  when nil
255
255
  if @column.n_nulls.zero?
256
- raw_array = [true] * @column.length
256
+ raw_array = [true] * @column.n_rows
257
257
  else
258
- raw_array = @column.length.times.collect do |i|
258
+ raw_array = @column.n_rows.times.collect do |i|
259
259
  @column.valid?(i)
260
260
  end
261
261
  end
@@ -73,11 +73,6 @@ module Arrow
73
73
  value.each_with_index do |sub_value, i|
74
74
  self[i].append(sub_value)
75
75
  end
76
- when Arrow::Struct
77
- append_value_raw
78
- value.values.each_with_index do |sub_value, i|
79
- self[i].append(sub_value)
80
- end
81
76
  when Hash
82
77
  append_value_raw
83
78
  value.each do |name, sub_value|
@@ -85,8 +80,7 @@ module Arrow
85
80
  end
86
81
  else
87
82
  message =
88
- "struct value must be nil, Array, " +
89
- "Arrow::Struct or Hash: #{value.inspect}"
83
+ "struct value must be nil, Array or Hash: #{value.inspect}"
90
84
  raise ArgumentError, message
91
85
  end
92
86
  else
@@ -15,8 +15,6 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- require "arrow/struct"
19
-
20
18
  module Arrow
21
19
  class StructArray
22
20
  # @param i [Integer]
@@ -24,9 +22,13 @@ module Arrow
24
22
  #
25
23
  # You can use {Arrow::Array#[]} for convenient value access.
26
24
  #
27
- # @return [Arrow::Struct] The `i`-th value.
25
+ # @return [Hash] The `i`-th struct.
28
26
  def get_value(i)
29
- Struct.new(self, i)
27
+ value = {}
28
+ value_data_type.fields.zip(fields) do |field, field_array|
29
+ value[field.name] = field_array[i]
30
+ end
31
+ value
30
32
  end
31
33
 
32
34
  # @overload find_field(index)
@@ -45,20 +47,20 @@ module Arrow
45
47
  (@name_to_field ||= build_name_to_field)[name.to_s]
46
48
  else
47
49
  index = index_or_name
48
- cached_fields[index]
50
+ fields[index]
49
51
  end
50
52
  end
51
53
 
52
- private
53
- def cached_fields
54
- @fields ||= fields
54
+ alias_method :fields_raw, :fields
55
+ def fields
56
+ @fields ||= fields_raw
55
57
  end
56
58
 
59
+ private
57
60
  def build_name_to_field
58
61
  name_to_field = {}
59
- field_arrays = cached_fields
60
- value_data_type.fields.each_with_index do |field, i|
61
- name_to_field[field.name] = field_arrays[i]
62
+ value_data_type.fields.zip(fields) do |field, field_array|
63
+ name_to_field[field.name] = field_array
62
64
  end
63
65
  name_to_field
64
66
  end
@@ -88,17 +88,11 @@ module Arrow
88
88
 
89
89
  def load_raw(input, reader)
90
90
  schema = reader.schema
91
- chunked_arrays = []
91
+ record_batches = []
92
92
  reader.each do |record_batch|
93
- record_batch.columns.each_with_index do |array, i|
94
- chunked_array = (chunked_arrays[i] ||= [])
95
- chunked_array << array
96
- end
97
- end
98
- columns = schema.fields.collect.with_index do |field, i|
99
- Column.new(field, ChunkedArray.new(chunked_arrays[i]))
93
+ record_batches << record_batch
100
94
  end
101
- table = Table.new(schema, columns)
95
+ table = Table.new(schema, record_batches)
102
96
  table.instance_variable_set(:@input, input)
103
97
  table
104
98
  end
@@ -33,7 +33,7 @@ module Arrow
33
33
  def format_column_name(column)
34
34
  case column.data_type
35
35
  when TimestampDataType
36
- "%*s" % [Time.now.iso8601.size, column.name]
36
+ "%*s" % [::Time.now.iso8601.size, column.name]
37
37
  when FloatDataType, DoubleDataType
38
38
  "%*s" % [FLOAT_N_DIGITS, column.name]
39
39
  else
@@ -55,7 +55,7 @@ module Arrow
55
55
 
56
56
  def format_column_value(column, value)
57
57
  case value
58
- when Time
58
+ when ::Time
59
59
  value.iso8601
60
60
  when Float
61
61
  "%*f" % [[column.name.size, FLOAT_N_DIGITS].max, value]
@@ -15,11 +15,13 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
+ require "arrow/column-containable"
18
19
  require "arrow/group"
19
20
  require "arrow/record-containable"
20
21
 
21
22
  module Arrow
22
23
  class Table
24
+ include ColumnContainable
23
25
  include RecordContainable
24
26
 
25
27
  class << self
@@ -74,6 +76,24 @@ module Arrow
74
76
  # Arrow::Table.new("count" => Arrow::ChunkedArray.new(count_chunks),
75
77
  # "visible" => Arrow::ChunkedArray.new(visible_chunks))
76
78
  #
79
+ # @overload initialize(raw_table)
80
+ #
81
+ # @param raw_table [Hash<String, ::Array>]
82
+ # The pairs of column name and values of the table. Column values is
83
+ # `Array`.
84
+ #
85
+ # @example Create a table from column name and values
86
+ # count_chunks = [
87
+ # Arrow::UInt32Array.new([0, 2]),
88
+ # Arrow::UInt32Array.new([nil, 4]),
89
+ # ]
90
+ # visible_chunks = [
91
+ # Arrow::BooleanArray.new([true]),
92
+ # Arrow::BooleanArray.new([nil, nil, false]),
93
+ # ]
94
+ # Arrow::Table.new("count" => [0, 2, nil, 4],
95
+ # "visible" => [true, nil, nil, false])
96
+ #
77
97
  # @overload initialize(schema, columns)
78
98
  #
79
99
  # @param schema [Arrow::Schema] The schema of the table.
@@ -152,17 +172,18 @@ module Arrow
152
172
  case n_args
153
173
  when 1
154
174
  if args[0][0].is_a?(Column)
155
- values = args[0]
156
- fields = values.collect(&:field)
175
+ columns = args[0]
176
+ fields = columns.collect(&:field)
177
+ values = columns.collect(&:data)
157
178
  schema = Schema.new(fields)
158
179
  else
159
180
  raw_table = args[0]
160
181
  fields = []
161
182
  values = []
162
183
  raw_table.each do |name, array|
163
- field = Field.new(name.to_s, array.value_data_type)
164
- fields << field
165
- values << Column.new(field, array)
184
+ array = ArrayBuilder.build(array) if array.is_a?(::Array)
185
+ fields << Field.new(name.to_s, array.value_data_type)
186
+ values << array
166
187
  end
167
188
  schema = Schema.new(fields)
168
189
  end
@@ -170,20 +191,19 @@ module Arrow
170
191
  schema = args[0]
171
192
  schema = Schema.new(schema) unless schema.is_a?(Schema)
172
193
  values = args[1]
173
- if values[0].is_a?(::Array)
194
+ case values[0]
195
+ when ::Array
174
196
  values = [RecordBatch.new(schema, values)]
197
+ when Column
198
+ values = values.collect(&:data)
175
199
  end
176
200
  else
177
- message = "wrong number of arguments (given, #{n_args}, expected 1..2)"
201
+ message = "wrong number of arguments (given #{n_args}, expected 1..2)"
178
202
  raise ArgumentError, message
179
203
  end
180
204
  initialize_raw(schema, values)
181
205
  end
182
206
 
183
- def columns
184
- @columns ||= n_columns.times.collect {|i| get_column(i)}
185
- end
186
-
187
207
  def each_record_batch
188
208
  return to_enum(__method__) unless block_given?
189
209
 
@@ -338,7 +358,7 @@ module Arrow
338
358
  other.each do |name, value|
339
359
  name = name.to_s
340
360
  if value
341
- added_columns[name] = ensure_column(name, value)
361
+ added_columns[name] = ensure_raw_column(name, value)
342
362
  else
343
363
  removed_columns[name] = true
344
364
  end
@@ -346,7 +366,8 @@ module Arrow
346
366
  when Table
347
367
  added_columns = {}
348
368
  other.columns.each do |column|
349
- added_columns[column.name] = column
369
+ name = column.name
370
+ added_columns[name] = ensure_raw_column(name, column)
350
371
  end
351
372
  else
352
373
  message = "merge target must be Hash or Arrow::Table: " +
@@ -363,15 +384,18 @@ module Arrow
363
384
  next
364
385
  end
365
386
  next if removed_columns.key?(column_name)
366
- new_columns << column
387
+ new_columns << ensure_raw_column(column_name, column)
367
388
  end
368
389
  added_columns.each do |name, new_column|
369
390
  new_columns << new_column
370
391
  end
371
- new_fields = new_columns.collect do |new_column|
372
- new_column.field
392
+ new_fields = []
393
+ new_arrays = []
394
+ new_columns.each do |new_column|
395
+ new_fields << new_column[:field]
396
+ new_arrays << new_column[:data]
373
397
  end
374
- self.class.new(Schema.new(new_fields), new_columns)
398
+ self.class.new(new_fields, new_arrays)
375
399
  end
376
400
 
377
401
  alias_method :remove_column_raw, :remove_column
@@ -447,10 +471,10 @@ module Arrow
447
471
  end
448
472
 
449
473
  def pack
450
- packed_columns = columns.collect do |column|
451
- column.pack
474
+ packed_arrays = columns.collect do |column|
475
+ column.data.pack
452
476
  end
453
- self.class.new(schema, packed_columns)
477
+ self.class.new(schema, packed_arrays)
454
478
  end
455
479
 
456
480
  alias_method :to_s_raw, :to_s
@@ -524,13 +548,26 @@ module Arrow
524
548
  end
525
549
  end
526
550
 
527
- def ensure_column(name, data)
551
+ def ensure_raw_column(name, data)
528
552
  case data
529
553
  when Array
530
- field = Field.new(name, data.value_data_type)
531
- Column.new(field, data)
554
+ {
555
+ field: Field.new(name, data.value_data_type),
556
+ data: ChunkedArray.new([data]),
557
+ }
558
+ when ChunkedArray
559
+ {
560
+ field: Field.new(name, data.value_data_type),
561
+ data: data,
562
+ }
532
563
  when Column
533
- data
564
+ column = data
565
+ data = column.data
566
+ data = ChunkedArray.new([data]) unless data.is_a?(ChunkedArray)
567
+ {
568
+ field: column.field,
569
+ data: data,
570
+ }
534
571
  else
535
572
  message = "column must be Arrow::Array or Arrow::Column: " +
536
573
  "<#{name}>: <#{data.inspect}>: #{inspect}"