red-arrow 0.15.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. checksums.yaml +4 -4
  2. data/Rakefile +28 -16
  3. data/ext/arrow/converters.hpp +63 -33
  4. data/ext/arrow/raw-records.cpp +2 -1
  5. data/ext/arrow/values.cpp +2 -1
  6. data/lib/arrow/array-builder.rb +101 -52
  7. data/lib/arrow/array.rb +28 -10
  8. data/lib/arrow/{binary-array-builder.rb → buffer.rb} +7 -15
  9. data/lib/arrow/chunked-array.rb +2 -0
  10. data/lib/arrow/csv-loader.rb +15 -3
  11. data/lib/arrow/csv-read-options.rb +18 -0
  12. data/lib/arrow/data-type.rb +35 -2
  13. data/lib/arrow/decimal128-array-builder.rb +0 -2
  14. data/lib/arrow/dictionary-array.rb +24 -0
  15. data/lib/arrow/field.rb +1 -1
  16. data/lib/arrow/generic-filterable.rb +43 -0
  17. data/lib/arrow/generic-takeable.rb +38 -0
  18. data/lib/arrow/list-data-type.rb +58 -8
  19. data/lib/arrow/loader.rb +12 -1
  20. data/lib/arrow/null-array-builder.rb +1 -1
  21. data/lib/arrow/null-array.rb +24 -0
  22. data/lib/arrow/raw-table-converter.rb +47 -0
  23. data/lib/arrow/record-batch-iterator.rb +22 -0
  24. data/lib/arrow/record-batch.rb +8 -3
  25. data/lib/arrow/schema.rb +5 -2
  26. data/lib/arrow/struct-array-builder.rb +13 -7
  27. data/lib/arrow/struct-data-type.rb +0 -2
  28. data/lib/arrow/table-loader.rb +29 -6
  29. data/lib/arrow/table-saver.rb +37 -13
  30. data/lib/arrow/table.rb +20 -73
  31. data/lib/arrow/version.rb +1 -1
  32. data/red-arrow.gemspec +4 -2
  33. data/test/helper.rb +1 -0
  34. data/test/helper/omittable.rb +36 -0
  35. data/test/raw-records/test-dense-union-array.rb +1 -34
  36. data/test/raw-records/test-sparse-union-array.rb +1 -33
  37. data/test/run-test.rb +14 -3
  38. data/test/test-array-builder.rb +17 -0
  39. data/test/test-array.rb +104 -0
  40. data/test/test-buffer.rb +11 -0
  41. data/test/test-chunked-array.rb +96 -0
  42. data/test/test-csv-loader.rb +77 -2
  43. data/test/test-data-type.rb +11 -0
  44. data/test/test-dense-union-data-type.rb +2 -2
  45. data/test/test-dictionary-array.rb +41 -0
  46. data/test/test-feather.rb +21 -6
  47. data/test/test-list-data-type.rb +27 -1
  48. data/test/test-null-array.rb +23 -0
  49. data/test/test-record-batch-iterator.rb +37 -0
  50. data/test/test-record-batch.rb +14 -0
  51. data/test/test-schema.rb +16 -0
  52. data/test/test-slicer.rb +74 -30
  53. data/test/test-sparse-union-data-type.rb +2 -2
  54. data/test/test-struct-array-builder.rb +8 -4
  55. data/test/test-table.rb +153 -14
  56. data/test/test-timestamp-array.rb +19 -0
  57. data/test/values/test-dense-union-array.rb +1 -34
  58. data/test/values/test-sparse-union-array.rb +1 -33
  59. metadata +76 -63
@@ -19,7 +19,7 @@ module Arrow
19
19
  class NullArrayBuilder
20
20
  class << self
21
21
  def buildable?(args)
22
- super and args.collect(&:class) != [Integer]
22
+ super and not (args.size == 1 and args[0].is_a?(Integer))
23
23
  end
24
24
  end
25
25
  end
@@ -0,0 +1,24 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module Arrow
19
+ class NullArray
20
+ def get_value(i)
21
+ nil
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,47 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module Arrow
19
+ class RawTableConverter
20
+ attr_reader :n_rows
21
+ attr_reader :schema
22
+ attr_reader :values
23
+ def initialize(raw_table)
24
+ @raw_table = raw_table
25
+ convert
26
+ end
27
+
28
+ private
29
+ def convert
30
+ if @raw_table.is_a?(::Array) and @raw_table[0].is_a?(Column)
31
+ fields = @raw_table.collect(&:field)
32
+ @schema = Schema.new(fields)
33
+ @values = @raw_table.collect(&:data)
34
+ else
35
+ fields = []
36
+ @values = []
37
+ @raw_table.each do |name, array|
38
+ array = ArrayBuilder.build(array) if array.is_a?(::Array)
39
+ fields << Field.new(name.to_s, array.value_data_type)
40
+ @values << array
41
+ end
42
+ @schema = Schema.new(fields)
43
+ end
44
+ @n_rows = @values[0].length
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,22 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module Arrow
19
+ class RecordBatchIterator
20
+ alias_method :to_a, :to_list
21
+ end
22
+ end
@@ -15,8 +15,7 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- require "arrow/column-containable"
19
- require "arrow/record-containable"
18
+ require "arrow/raw-table-converter"
20
19
 
21
20
  module Arrow
22
21
  class RecordBatch
@@ -28,13 +27,19 @@ module Arrow
28
27
  def new(*args)
29
28
  n_args = args.size
30
29
  case n_args
30
+ when 1
31
+ raw_table_converter = RawTableConverter.new(args[0])
32
+ n_rows = raw_table_converter.n_rows
33
+ schema = raw_table_converter.schema
34
+ values = raw_table_converter.values
35
+ super(schema, n_rows, values)
31
36
  when 2
32
37
  schema, data = args
33
38
  RecordBatchBuilder.build(schema, data)
34
39
  when 3
35
40
  super
36
41
  else
37
- message = "wrong number of arguments (given #{n_args}, expected 2..3)"
42
+ message = "wrong number of arguments (given #{n_args}, expected 1..3)"
38
43
  raise ArgumentError, message
39
44
  end
40
45
  end
@@ -15,8 +15,6 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- require "arrow/field-containable"
19
-
20
18
  module Arrow
21
19
  class Schema
22
20
  include FieldContainable
@@ -93,5 +91,10 @@ module Arrow
93
91
  end
94
92
 
95
93
  alias_method :[], :find_field
94
+
95
+ alias_method :to_s_raw, :to_s
96
+ def to_s(show_metadata: false)
97
+ to_string_metadata(show_metadata)
98
+ end
96
99
  end
97
100
  end
@@ -32,7 +32,7 @@ module Arrow
32
32
  case index_or_name
33
33
  when String, Symbol
34
34
  name = index_or_name
35
- (@name_to_builder ||= build_name_to_builder)[name.to_s]
35
+ cached_name_to_builder[name.to_s]
36
36
  else
37
37
  index = index_or_name
38
38
  cached_field_builders[index]
@@ -70,13 +70,18 @@ module Arrow
70
70
  append_null
71
71
  when ::Array
72
72
  append_value_raw
73
- value.each_with_index do |sub_value, i|
74
- self[i].append(sub_value)
73
+ cached_field_builders.zip(value) do |builder, sub_value|
74
+ builder.append(sub_value)
75
75
  end
76
76
  when Hash
77
77
  append_value_raw
78
+ local_name_to_builder = cached_name_to_builder.dup
78
79
  value.each do |name, sub_value|
79
- self[name].append(sub_value)
80
+ builder = local_name_to_builder.delete(name.to_s)
81
+ builder.append(sub_value)
82
+ end
83
+ local_name_to_builder.each do |_, builder|
84
+ builder.append_null
80
85
  end
81
86
  else
82
87
  message =
@@ -108,9 +113,6 @@ module Arrow
108
113
  alias_method :append_null_raw, :append_null
109
114
  def append_null
110
115
  append_null_raw
111
- cached_field_builders.each do |builder|
112
- builder.append_null
113
- end
114
116
  end
115
117
 
116
118
  # @since 0.12.0
@@ -136,5 +138,9 @@ module Arrow
136
138
  end
137
139
  name_to_builder
138
140
  end
141
+
142
+ def cached_name_to_builder
143
+ @name_to_builder ||= build_name_to_builder
144
+ end
139
145
  end
140
146
  end
@@ -15,8 +15,6 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- require "arrow/field-containable"
19
-
20
18
  module Arrow
21
19
  class StructDataType
22
20
  include FieldContainable
@@ -41,6 +41,8 @@ module Arrow
41
41
  available_formats << match_data.post_match
42
42
  end
43
43
  end
44
+ deprecated_formats = ["batch", "stream"]
45
+ available_formats -= deprecated_formats
44
46
  message = "Arrow::Table load format must be one of ["
45
47
  message << available_formats.join(", ")
46
48
  message << "]: #{format.inspect}"
@@ -119,18 +121,30 @@ module Arrow
119
121
  load_raw(input, reader)
120
122
  end
121
123
 
122
- def load_as_batch
124
+ # @since 1.0.0
125
+ def load_as_arrow_file
123
126
  input = open_input_stream
124
127
  reader = RecordBatchFileReader.new(input)
125
128
  load_raw(input, reader)
126
129
  end
127
130
 
128
- def load_as_stream
131
+ # @deprecated Use `format: :arrow_file` instead.
132
+ def load_as_batch
133
+ load_as_arrow_file
134
+ end
135
+
136
+ # @since 1.0.0
137
+ def load_as_arrow_streaming
129
138
  input = open_input_stream
130
139
  reader = RecordBatchStreamReader.new(input)
131
140
  load_raw(input, reader)
132
141
  end
133
142
 
143
+ # @deprecated Use `format: :arrow_streaming` instead.
144
+ def load_as_stream
145
+ load_as_arrow_streaming
146
+ end
147
+
134
148
  if Arrow.const_defined?(:ORCFileReader)
135
149
  def load_as_orc
136
150
  input = open_input_stream
@@ -143,16 +157,25 @@ module Arrow
143
157
  end
144
158
  end
145
159
 
146
- def load_as_csv
147
- options = @options.dup
160
+ def csv_load(options)
148
161
  options.delete(:format)
149
162
  if @input.is_a?(Buffer)
150
- CSVLoader.load(@input.data.to_s, options)
163
+ CSVLoader.load(@input.data.to_s, **options)
151
164
  else
152
- CSVLoader.load(Pathname.new(@input), options)
165
+ CSVLoader.load(Pathname.new(@input), **options)
153
166
  end
154
167
  end
155
168
 
169
+ def load_as_csv
170
+ csv_load(@options.dup)
171
+ end
172
+
173
+ def load_as_tsv
174
+ options = @options.dup
175
+ options[:delimiter] = "\t"
176
+ csv_load(options.dup)
177
+ end
178
+
156
179
  def load_as_feather
157
180
  input = open_input_stream
158
181
  reader = FeatherFileReader.new(input)
@@ -42,6 +42,8 @@ module Arrow
42
42
  available_formats << match_data.post_match
43
43
  end
44
44
  end
45
+ deprecated_formats = ["batch", "stream"]
46
+ available_formats -= deprecated_formats
45
47
  message = "Arrow::Table save format must be one of ["
46
48
  message << available_formats.join(", ")
47
49
  message << "]: #{format.inspect}"
@@ -110,35 +112,57 @@ module Arrow
110
112
  end
111
113
 
112
114
  def save_as_arrow
113
- save_as_batch
115
+ save_as_arrow_file
114
116
  end
115
117
 
116
- def save_as_batch
118
+ # @since 1.0.0
119
+ def save_as_arrow_file
117
120
  save_raw(RecordBatchFileWriter)
118
121
  end
119
122
 
120
- def save_as_stream
123
+ # @deprecated Use `format: :arrow_file` instead.
124
+ def save_as_batch
125
+ save_as_arrow_file
126
+ end
127
+
128
+ # @since 1.0.0
129
+ def save_as_arrow_streaming
121
130
  save_raw(RecordBatchStreamWriter)
122
131
  end
123
132
 
124
- def save_as_csv
133
+ # @deprecated Use `format: :arrow_streaming` instead.
134
+ def save_as_stream
135
+ save_as_arrow_streaming
136
+ end
137
+
138
+ def csv_save(**options)
125
139
  open_output_stream do |output|
126
- csv = CSV.new(output)
140
+ csv = CSV.new(output, **options)
127
141
  names = @table.schema.fields.collect(&:name)
128
142
  csv << names
129
- @table.each_record(reuse_record: true) do |record|
130
- csv << names.collect do |name|
131
- record[name]
132
- end
143
+ @table.raw_records.each do |record|
144
+ csv << record
133
145
  end
134
146
  end
135
147
  end
136
148
 
149
+ def save_as_csv
150
+ csv_save
151
+ end
152
+
153
+ def save_as_tsv
154
+ csv_save(col_sep: "\t")
155
+ end
156
+
137
157
  def save_as_feather
138
- open_output_stream do |output|
139
- FeatherFileWriter.open(output) do |writer|
140
- writer.write(@table)
141
- end
158
+ properties = FeatherWriteProperties.new
159
+ properties.class.properties.each do |name|
160
+ value = @options[name.to_sym]
161
+ next if value.nil?
162
+ properties.__send__("#{name}=", value)
163
+ end
164
+ open_raw_output_stream do |output|
165
+ @table.write_as_feather(output, properties)
142
166
  end
143
167
  end
144
168
  end
@@ -15,13 +15,13 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- require "arrow/column-containable"
19
- require "arrow/group"
20
- require "arrow/record-containable"
18
+ require "arrow/raw-table-converter"
21
19
 
22
20
  module Arrow
23
21
  class Table
24
22
  include ColumnContainable
23
+ include GenericFilterable
24
+ include GenericTakeable
25
25
  include RecordContainable
26
26
 
27
27
  class << self
@@ -83,14 +83,6 @@ module Arrow
83
83
  # `Array`.
84
84
  #
85
85
  # @example Create a table from column name and values
86
- # count_chunks = [
87
- # Arrow::UInt32Array.new([0, 2]),
88
- # Arrow::UInt32Array.new([nil, 4]),
89
- # ]
90
- # visible_chunks = [
91
- # Arrow::BooleanArray.new([true]),
92
- # Arrow::BooleanArray.new([nil, nil, false]),
93
- # ]
94
86
  # Arrow::Table.new("count" => [0, 2, nil, 4],
95
87
  # "visible" => [true, nil, nil, false])
96
88
  #
@@ -171,22 +163,9 @@ module Arrow
171
163
  n_args = args.size
172
164
  case n_args
173
165
  when 1
174
- if args[0][0].is_a?(Column)
175
- columns = args[0]
176
- fields = columns.collect(&:field)
177
- values = columns.collect(&:data)
178
- schema = Schema.new(fields)
179
- else
180
- raw_table = args[0]
181
- fields = []
182
- values = []
183
- raw_table.each do |name, array|
184
- array = ArrayBuilder.build(array) if array.is_a?(::Array)
185
- fields << Field.new(name.to_s, array.value_data_type)
186
- values << array
187
- end
188
- schema = Schema.new(fields)
189
- end
166
+ raw_table_converter = RawTableConverter.new(args[0])
167
+ schema = raw_table_converter.schema
168
+ values = raw_table_converter.values
190
169
  when 2
191
170
  schema = args[0]
192
171
  schema = Schema.new(schema) unless schema.is_a?(Schema)
@@ -306,13 +285,15 @@ module Arrow
306
285
  end
307
286
  end
308
287
 
309
- ranges = []
288
+ filter_options = Arrow::FilterOptions.new
289
+ filter_options.null_selection_behavior = :emit_null
290
+ sliced_tables = []
310
291
  slicers.each do |slicer|
311
292
  slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
312
293
  case slicer
313
294
  when Integer
314
295
  slicer += n_rows if slicer < 0
315
- ranges << [slicer, n_rows - 1]
296
+ sliced_tables << slice_by_range(slicer, n_rows - 1)
316
297
  when Range
317
298
  original_from = from = slicer.first
318
299
  to = slicer.last
@@ -325,17 +306,9 @@ module Arrow
325
306
  raise ArgumentError, message
326
307
  end
327
308
  to += n_rows if to < 0
328
- ranges << [from, to]
329
- when ::Array
330
- boolean_array_to_slice_ranges(slicer, 0, ranges)
331
- when ChunkedArray
332
- offset = 0
333
- slicer.each_chunk do |array|
334
- boolean_array_to_slice_ranges(array, offset, ranges)
335
- offset += array.length
336
- end
337
- when BooleanArray
338
- boolean_array_to_slice_ranges(slicer, 0, ranges)
309
+ sliced_tables << slice_by_range(from, to)
310
+ when ::Array, BooleanArray, ChunkedArray
311
+ sliced_tables << filter(slicer, filter_options)
339
312
  else
340
313
  message = "slicer must be Integer, Range, (from, to), " +
341
314
  "Arrow::ChunkedArray of Arrow::BooleanArray, " +
@@ -343,7 +316,11 @@ module Arrow
343
316
  raise ArgumentError, message
344
317
  end
345
318
  end
346
- slice_by_ranges(ranges)
319
+ if sliced_tables.size > 1
320
+ sliced_tables[0].concatenate(sliced_tables[1..-1])
321
+ else
322
+ sliced_tables[0]
323
+ end
347
324
  end
348
325
 
349
326
  # TODO
@@ -514,38 +491,8 @@ module Arrow
514
491
  end
515
492
 
516
493
  private
517
- def boolean_array_to_slice_ranges(array, offset, ranges)
518
- in_target = false
519
- target_start = nil
520
- array.each_with_index do |is_target, i|
521
- if is_target
522
- unless in_target
523
- target_start = offset + i
524
- in_target = true
525
- end
526
- else
527
- if in_target
528
- ranges << [target_start, offset + i - 1]
529
- target_start = nil
530
- in_target = false
531
- end
532
- end
533
- end
534
- if in_target
535
- ranges << [target_start, offset + array.length - 1]
536
- end
537
- end
538
-
539
- def slice_by_ranges(ranges)
540
- sliced_table = []
541
- ranges.each do |from, to|
542
- sliced_table << slice_raw(from, to - from + 1)
543
- end
544
- if sliced_table.size > 1
545
- sliced_table[0].concatenate(sliced_table[1..-1])
546
- else
547
- sliced_table[0]
548
- end
494
+ def slice_by_range(from, to)
495
+ slice_raw(from, to - from + 1)
549
496
  end
550
497
 
551
498
  def ensure_raw_column(name, data)