red-arrow 0.15.1 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/Rakefile +28 -16
  3. data/ext/arrow/converters.hpp +63 -33
  4. data/ext/arrow/raw-records.cpp +2 -1
  5. data/ext/arrow/values.cpp +2 -1
  6. data/lib/arrow/array-builder.rb +101 -52
  7. data/lib/arrow/array.rb +28 -10
  8. data/lib/arrow/{binary-array-builder.rb → buffer.rb} +7 -15
  9. data/lib/arrow/chunked-array.rb +2 -0
  10. data/lib/arrow/csv-loader.rb +5 -0
  11. data/lib/arrow/csv-read-options.rb +18 -0
  12. data/lib/arrow/data-type.rb +35 -2
  13. data/lib/arrow/decimal128-array-builder.rb +0 -2
  14. data/lib/arrow/dictionary-array.rb +24 -0
  15. data/lib/arrow/field.rb +1 -1
  16. data/lib/arrow/generic-filterable.rb +43 -0
  17. data/lib/arrow/generic-takeable.rb +38 -0
  18. data/lib/arrow/list-data-type.rb +58 -8
  19. data/lib/arrow/loader.rb +12 -1
  20. data/lib/arrow/null-array-builder.rb +1 -1
  21. data/lib/arrow/null-array.rb +24 -0
  22. data/lib/arrow/raw-table-converter.rb +47 -0
  23. data/lib/arrow/record-batch-iterator.rb +22 -0
  24. data/lib/arrow/record-batch.rb +8 -3
  25. data/lib/arrow/schema.rb +5 -2
  26. data/lib/arrow/struct-array-builder.rb +13 -7
  27. data/lib/arrow/struct-data-type.rb +0 -2
  28. data/lib/arrow/table-loader.rb +29 -6
  29. data/lib/arrow/table-saver.rb +37 -13
  30. data/lib/arrow/table.rb +20 -73
  31. data/lib/arrow/version.rb +1 -1
  32. data/red-arrow.gemspec +3 -1
  33. data/test/helper.rb +1 -0
  34. data/test/helper/omittable.rb +36 -0
  35. data/test/raw-records/test-dense-union-array.rb +1 -34
  36. data/test/raw-records/test-sparse-union-array.rb +1 -33
  37. data/test/run-test.rb +14 -3
  38. data/test/test-array-builder.rb +17 -0
  39. data/test/test-array.rb +104 -0
  40. data/test/test-buffer.rb +11 -0
  41. data/test/test-chunked-array.rb +96 -0
  42. data/test/test-csv-loader.rb +2 -2
  43. data/test/test-data-type.rb +11 -0
  44. data/test/test-dense-union-data-type.rb +2 -2
  45. data/test/test-dictionary-array.rb +41 -0
  46. data/test/test-feather.rb +21 -6
  47. data/test/test-list-data-type.rb +27 -1
  48. data/test/test-null-array.rb +23 -0
  49. data/test/test-record-batch-iterator.rb +37 -0
  50. data/test/test-record-batch.rb +14 -0
  51. data/test/test-schema.rb +16 -0
  52. data/test/test-slicer.rb +74 -30
  53. data/test/test-sparse-union-data-type.rb +2 -2
  54. data/test/test-struct-array-builder.rb +8 -4
  55. data/test/test-table.rb +153 -14
  56. data/test/test-timestamp-array.rb +19 -0
  57. data/test/values/test-dense-union-array.rb +1 -34
  58. data/test/values/test-sparse-union-array.rb +1 -33
  59. metadata +22 -8
@@ -0,0 +1,47 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module Arrow
19
+ class RawTableConverter
20
+ attr_reader :n_rows
21
+ attr_reader :schema
22
+ attr_reader :values
23
+ def initialize(raw_table)
24
+ @raw_table = raw_table
25
+ convert
26
+ end
27
+
28
+ private
29
+ def convert
30
+ if @raw_table.is_a?(::Array) and @raw_table[0].is_a?(Column)
31
+ fields = @raw_table.collect(&:field)
32
+ @schema = Schema.new(fields)
33
+ @values = @raw_table.collect(&:data)
34
+ else
35
+ fields = []
36
+ @values = []
37
+ @raw_table.each do |name, array|
38
+ array = ArrayBuilder.build(array) if array.is_a?(::Array)
39
+ fields << Field.new(name.to_s, array.value_data_type)
40
+ @values << array
41
+ end
42
+ @schema = Schema.new(fields)
43
+ end
44
+ @n_rows = @values[0].length
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,22 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module Arrow
19
+ class RecordBatchIterator
20
+ alias_method :to_a, :to_list
21
+ end
22
+ end
@@ -15,8 +15,7 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- require "arrow/column-containable"
19
- require "arrow/record-containable"
18
+ require "arrow/raw-table-converter"
20
19
 
21
20
  module Arrow
22
21
  class RecordBatch
@@ -28,13 +27,19 @@ module Arrow
28
27
  def new(*args)
29
28
  n_args = args.size
30
29
  case n_args
30
+ when 1
31
+ raw_table_converter = RawTableConverter.new(args[0])
32
+ n_rows = raw_table_converter.n_rows
33
+ schema = raw_table_converter.schema
34
+ values = raw_table_converter.values
35
+ super(schema, n_rows, values)
31
36
  when 2
32
37
  schema, data = args
33
38
  RecordBatchBuilder.build(schema, data)
34
39
  when 3
35
40
  super
36
41
  else
37
- message = "wrong number of arguments (given #{n_args}, expected 2..3)"
42
+ message = "wrong number of arguments (given #{n_args}, expected 1..3)"
38
43
  raise ArgumentError, message
39
44
  end
40
45
  end
@@ -15,8 +15,6 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- require "arrow/field-containable"
19
-
20
18
  module Arrow
21
19
  class Schema
22
20
  include FieldContainable
@@ -93,5 +91,10 @@ module Arrow
93
91
  end
94
92
 
95
93
  alias_method :[], :find_field
94
+
95
+ alias_method :to_s_raw, :to_s
96
+ def to_s(show_metadata: false)
97
+ to_string_metadata(show_metadata)
98
+ end
96
99
  end
97
100
  end
@@ -32,7 +32,7 @@ module Arrow
32
32
  case index_or_name
33
33
  when String, Symbol
34
34
  name = index_or_name
35
- (@name_to_builder ||= build_name_to_builder)[name.to_s]
35
+ cached_name_to_builder[name.to_s]
36
36
  else
37
37
  index = index_or_name
38
38
  cached_field_builders[index]
@@ -70,13 +70,18 @@ module Arrow
70
70
  append_null
71
71
  when ::Array
72
72
  append_value_raw
73
- value.each_with_index do |sub_value, i|
74
- self[i].append(sub_value)
73
+ cached_field_builders.zip(value) do |builder, sub_value|
74
+ builder.append(sub_value)
75
75
  end
76
76
  when Hash
77
77
  append_value_raw
78
+ local_name_to_builder = cached_name_to_builder.dup
78
79
  value.each do |name, sub_value|
79
- self[name].append(sub_value)
80
+ builder = local_name_to_builder.delete(name.to_s)
81
+ builder.append(sub_value)
82
+ end
83
+ local_name_to_builder.each do |_, builder|
84
+ builder.append_null
80
85
  end
81
86
  else
82
87
  message =
@@ -108,9 +113,6 @@ module Arrow
108
113
  alias_method :append_null_raw, :append_null
109
114
  def append_null
110
115
  append_null_raw
111
- cached_field_builders.each do |builder|
112
- builder.append_null
113
- end
114
116
  end
115
117
 
116
118
  # @since 0.12.0
@@ -136,5 +138,9 @@ module Arrow
136
138
  end
137
139
  name_to_builder
138
140
  end
141
+
142
+ def cached_name_to_builder
143
+ @name_to_builder ||= build_name_to_builder
144
+ end
139
145
  end
140
146
  end
@@ -15,8 +15,6 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- require "arrow/field-containable"
19
-
20
18
  module Arrow
21
19
  class StructDataType
22
20
  include FieldContainable
@@ -41,6 +41,8 @@ module Arrow
41
41
  available_formats << match_data.post_match
42
42
  end
43
43
  end
44
+ deprecated_formats = ["batch", "stream"]
45
+ available_formats -= deprecated_formats
44
46
  message = "Arrow::Table load format must be one of ["
45
47
  message << available_formats.join(", ")
46
48
  message << "]: #{format.inspect}"
@@ -119,18 +121,30 @@ module Arrow
119
121
  load_raw(input, reader)
120
122
  end
121
123
 
122
- def load_as_batch
124
+ # @since 1.0.0
125
+ def load_as_arrow_file
123
126
  input = open_input_stream
124
127
  reader = RecordBatchFileReader.new(input)
125
128
  load_raw(input, reader)
126
129
  end
127
130
 
128
- def load_as_stream
131
+ # @deprecated Use `format: :arrow_file` instead.
132
+ def load_as_batch
133
+ load_as_arrow_file
134
+ end
135
+
136
+ # @since 1.0.0
137
+ def load_as_arrow_streaming
129
138
  input = open_input_stream
130
139
  reader = RecordBatchStreamReader.new(input)
131
140
  load_raw(input, reader)
132
141
  end
133
142
 
143
+ # @deprecated Use `format: :arrow_streaming` instead.
144
+ def load_as_stream
145
+ load_as_arrow_streaming
146
+ end
147
+
134
148
  if Arrow.const_defined?(:ORCFileReader)
135
149
  def load_as_orc
136
150
  input = open_input_stream
@@ -143,16 +157,25 @@ module Arrow
143
157
  end
144
158
  end
145
159
 
146
- def load_as_csv
147
- options = @options.dup
160
+ def csv_load(options)
148
161
  options.delete(:format)
149
162
  if @input.is_a?(Buffer)
150
- CSVLoader.load(@input.data.to_s, options)
163
+ CSVLoader.load(@input.data.to_s, **options)
151
164
  else
152
- CSVLoader.load(Pathname.new(@input), options)
165
+ CSVLoader.load(Pathname.new(@input), **options)
153
166
  end
154
167
  end
155
168
 
169
+ def load_as_csv
170
+ csv_load(@options.dup)
171
+ end
172
+
173
+ def load_as_tsv
174
+ options = @options.dup
175
+ options[:delimiter] = "\t"
176
+ csv_load(options.dup)
177
+ end
178
+
156
179
  def load_as_feather
157
180
  input = open_input_stream
158
181
  reader = FeatherFileReader.new(input)
@@ -42,6 +42,8 @@ module Arrow
42
42
  available_formats << match_data.post_match
43
43
  end
44
44
  end
45
+ deprecated_formats = ["batch", "stream"]
46
+ available_formats -= deprecated_formats
45
47
  message = "Arrow::Table save format must be one of ["
46
48
  message << available_formats.join(", ")
47
49
  message << "]: #{format.inspect}"
@@ -110,35 +112,57 @@ module Arrow
110
112
  end
111
113
 
112
114
  def save_as_arrow
113
- save_as_batch
115
+ save_as_arrow_file
114
116
  end
115
117
 
116
- def save_as_batch
118
+ # @since 1.0.0
119
+ def save_as_arrow_file
117
120
  save_raw(RecordBatchFileWriter)
118
121
  end
119
122
 
120
- def save_as_stream
123
+ # @deprecated Use `format: :arrow_file` instead.
124
+ def save_as_batch
125
+ save_as_arrow_file
126
+ end
127
+
128
+ # @since 1.0.0
129
+ def save_as_arrow_streaming
121
130
  save_raw(RecordBatchStreamWriter)
122
131
  end
123
132
 
124
- def save_as_csv
133
+ # @deprecated Use `format: :arrow_streaming` instead.
134
+ def save_as_stream
135
+ save_as_arrow_streaming
136
+ end
137
+
138
+ def csv_save(**options)
125
139
  open_output_stream do |output|
126
- csv = CSV.new(output)
140
+ csv = CSV.new(output, **options)
127
141
  names = @table.schema.fields.collect(&:name)
128
142
  csv << names
129
- @table.each_record(reuse_record: true) do |record|
130
- csv << names.collect do |name|
131
- record[name]
132
- end
143
+ @table.raw_records.each do |record|
144
+ csv << record
133
145
  end
134
146
  end
135
147
  end
136
148
 
149
+ def save_as_csv
150
+ csv_save
151
+ end
152
+
153
+ def save_as_tsv
154
+ csv_save(col_sep: "\t")
155
+ end
156
+
137
157
  def save_as_feather
138
- open_output_stream do |output|
139
- FeatherFileWriter.open(output) do |writer|
140
- writer.write(@table)
141
- end
158
+ properties = FeatherWriteProperties.new
159
+ properties.class.properties.each do |name|
160
+ value = @options[name.to_sym]
161
+ next if value.nil?
162
+ properties.__send__("#{name}=", value)
163
+ end
164
+ open_raw_output_stream do |output|
165
+ @table.write_as_feather(output, properties)
142
166
  end
143
167
  end
144
168
  end
@@ -15,13 +15,13 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- require "arrow/column-containable"
19
- require "arrow/group"
20
- require "arrow/record-containable"
18
+ require "arrow/raw-table-converter"
21
19
 
22
20
  module Arrow
23
21
  class Table
24
22
  include ColumnContainable
23
+ include GenericFilterable
24
+ include GenericTakeable
25
25
  include RecordContainable
26
26
 
27
27
  class << self
@@ -83,14 +83,6 @@ module Arrow
83
83
  # `Array`.
84
84
  #
85
85
  # @example Create a table from column name and values
86
- # count_chunks = [
87
- # Arrow::UInt32Array.new([0, 2]),
88
- # Arrow::UInt32Array.new([nil, 4]),
89
- # ]
90
- # visible_chunks = [
91
- # Arrow::BooleanArray.new([true]),
92
- # Arrow::BooleanArray.new([nil, nil, false]),
93
- # ]
94
86
  # Arrow::Table.new("count" => [0, 2, nil, 4],
95
87
  # "visible" => [true, nil, nil, false])
96
88
  #
@@ -171,22 +163,9 @@ module Arrow
171
163
  n_args = args.size
172
164
  case n_args
173
165
  when 1
174
- if args[0][0].is_a?(Column)
175
- columns = args[0]
176
- fields = columns.collect(&:field)
177
- values = columns.collect(&:data)
178
- schema = Schema.new(fields)
179
- else
180
- raw_table = args[0]
181
- fields = []
182
- values = []
183
- raw_table.each do |name, array|
184
- array = ArrayBuilder.build(array) if array.is_a?(::Array)
185
- fields << Field.new(name.to_s, array.value_data_type)
186
- values << array
187
- end
188
- schema = Schema.new(fields)
189
- end
166
+ raw_table_converter = RawTableConverter.new(args[0])
167
+ schema = raw_table_converter.schema
168
+ values = raw_table_converter.values
190
169
  when 2
191
170
  schema = args[0]
192
171
  schema = Schema.new(schema) unless schema.is_a?(Schema)
@@ -306,13 +285,15 @@ module Arrow
306
285
  end
307
286
  end
308
287
 
309
- ranges = []
288
+ filter_options = Arrow::FilterOptions.new
289
+ filter_options.null_selection_behavior = :emit_null
290
+ sliced_tables = []
310
291
  slicers.each do |slicer|
311
292
  slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
312
293
  case slicer
313
294
  when Integer
314
295
  slicer += n_rows if slicer < 0
315
- ranges << [slicer, n_rows - 1]
296
+ sliced_tables << slice_by_range(slicer, n_rows - 1)
316
297
  when Range
317
298
  original_from = from = slicer.first
318
299
  to = slicer.last
@@ -325,17 +306,9 @@ module Arrow
325
306
  raise ArgumentError, message
326
307
  end
327
308
  to += n_rows if to < 0
328
- ranges << [from, to]
329
- when ::Array
330
- boolean_array_to_slice_ranges(slicer, 0, ranges)
331
- when ChunkedArray
332
- offset = 0
333
- slicer.each_chunk do |array|
334
- boolean_array_to_slice_ranges(array, offset, ranges)
335
- offset += array.length
336
- end
337
- when BooleanArray
338
- boolean_array_to_slice_ranges(slicer, 0, ranges)
309
+ sliced_tables << slice_by_range(from, to)
310
+ when ::Array, BooleanArray, ChunkedArray
311
+ sliced_tables << filter(slicer, filter_options)
339
312
  else
340
313
  message = "slicer must be Integer, Range, (from, to), " +
341
314
  "Arrow::ChunkedArray of Arrow::BooleanArray, " +
@@ -343,7 +316,11 @@ module Arrow
343
316
  raise ArgumentError, message
344
317
  end
345
318
  end
346
- slice_by_ranges(ranges)
319
+ if sliced_tables.size > 1
320
+ sliced_tables[0].concatenate(sliced_tables[1..-1])
321
+ else
322
+ sliced_tables[0]
323
+ end
347
324
  end
348
325
 
349
326
  # TODO
@@ -514,38 +491,8 @@ module Arrow
514
491
  end
515
492
 
516
493
  private
517
- def boolean_array_to_slice_ranges(array, offset, ranges)
518
- in_target = false
519
- target_start = nil
520
- array.each_with_index do |is_target, i|
521
- if is_target
522
- unless in_target
523
- target_start = offset + i
524
- in_target = true
525
- end
526
- else
527
- if in_target
528
- ranges << [target_start, offset + i - 1]
529
- target_start = nil
530
- in_target = false
531
- end
532
- end
533
- end
534
- if in_target
535
- ranges << [target_start, offset + array.length - 1]
536
- end
537
- end
538
-
539
- def slice_by_ranges(ranges)
540
- sliced_table = []
541
- ranges.each do |from, to|
542
- sliced_table << slice_raw(from, to - from + 1)
543
- end
544
- if sliced_table.size > 1
545
- sliced_table[0].concatenate(sliced_table[1..-1])
546
- else
547
- sliced_table[0]
548
- end
494
+ def slice_by_range(from, to)
495
+ slice_raw(from, to - from + 1)
549
496
  end
550
497
 
551
498
  def ensure_raw_column(name, data)