red-arrow 0.14.1 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of red-arrow might be problematic. (The original page links to more details here.)

Files changed (61)
  1. checksums.yaml +4 -4
  2. data/ext/arrow/arrow.cpp +34 -0
  3. data/ext/arrow/converters.cpp +42 -0
  4. data/ext/arrow/converters.hpp +626 -0
  5. data/ext/arrow/raw-records.cpp +6 -625
  6. data/ext/arrow/red-arrow.hpp +37 -3
  7. data/ext/arrow/values.cpp +154 -0
  8. data/lib/arrow/array-builder.rb +24 -1
  9. data/lib/arrow/array.rb +9 -0
  10. data/lib/arrow/chunked-array.rb +5 -0
  11. data/lib/arrow/column-containable.rb +48 -0
  12. data/lib/arrow/column.rb +36 -10
  13. data/lib/arrow/csv-loader.rb +2 -2
  14. data/lib/arrow/data-type.rb +22 -5
  15. data/lib/arrow/date64-array-builder.rb +2 -2
  16. data/lib/arrow/date64-array.rb +1 -1
  17. data/lib/arrow/decimal128-array.rb +24 -0
  18. data/lib/arrow/field-containable.rb +3 -0
  19. data/lib/arrow/group.rb +10 -13
  20. data/lib/arrow/loader.rb +20 -1
  21. data/lib/arrow/record-batch.rb +6 -4
  22. data/lib/arrow/record-containable.rb +0 -35
  23. data/lib/arrow/record.rb +12 -9
  24. data/lib/arrow/slicer.rb +2 -2
  25. data/lib/arrow/struct-array-builder.rb +1 -7
  26. data/lib/arrow/struct-array.rb +13 -11
  27. data/lib/arrow/table-loader.rb +3 -9
  28. data/lib/arrow/table-table-formatter.rb +2 -2
  29. data/lib/arrow/table.rb +61 -24
  30. data/lib/arrow/time.rb +159 -0
  31. data/lib/arrow/time32-array-builder.rb +49 -0
  32. data/lib/arrow/time32-array.rb +28 -0
  33. data/lib/arrow/time64-array-builder.rb +49 -0
  34. data/lib/arrow/time64-array.rb +28 -0
  35. data/lib/arrow/timestamp-array-builder.rb +20 -1
  36. data/lib/arrow/timestamp-array.rb +10 -22
  37. data/lib/arrow/version.rb +1 -1
  38. data/red-arrow.gemspec +1 -1
  39. data/test/raw-records/test-basic-arrays.rb +16 -8
  40. data/test/raw-records/test-dense-union-array.rb +12 -5
  41. data/test/raw-records/test-list-array.rb +21 -9
  42. data/test/raw-records/test-sparse-union-array.rb +13 -5
  43. data/test/raw-records/test-struct-array.rb +11 -4
  44. data/test/test-column.rb +56 -31
  45. data/test/test-decimal128-array-builder.rb +11 -11
  46. data/test/test-decimal128-array.rb +4 -4
  47. data/test/test-slicer.rb +1 -3
  48. data/test/test-struct-array-builder.rb +4 -4
  49. data/test/test-struct-array.rb +4 -4
  50. data/test/test-table.rb +17 -8
  51. data/test/test-time.rb +288 -0
  52. data/test/test-time32-array.rb +81 -0
  53. data/test/test-time64-array.rb +81 -0
  54. data/test/values/test-basic-arrays.rb +284 -0
  55. data/test/values/test-dense-union-array.rb +487 -0
  56. data/test/values/test-list-array.rb +497 -0
  57. data/test/values/test-sparse-union-array.rb +477 -0
  58. data/test/values/test-struct-array.rb +452 -0
  59. metadata +78 -54
  60. data/lib/arrow/struct.rb +0 -79
  61. data/test/test-struct.rb +0 -81
@@ -35,14 +35,25 @@
35
35
  namespace red_arrow {
36
36
  extern VALUE cDate;
37
37
 
38
+ extern VALUE cArrowTime;
39
+
40
+ extern VALUE ArrowTimeUnitSECOND;
41
+ extern VALUE ArrowTimeUnitMILLI;
42
+ extern VALUE ArrowTimeUnitMICRO;
43
+ extern VALUE ArrowTimeUnitNANO;
44
+
38
45
  extern ID id_BigDecimal;
39
46
  extern ID id_jd;
47
+ extern ID id_new;
40
48
  extern ID id_to_datetime;
41
49
 
50
+ VALUE array_values(VALUE obj);
51
+ VALUE chunked_array_values(VALUE obj);
52
+
42
53
  VALUE record_batch_raw_records(VALUE obj);
43
54
  VALUE table_raw_records(VALUE obj);
44
55
 
45
- inline VALUE time_unit_to_scale(arrow::TimeUnit::type unit) {
56
+ inline VALUE time_unit_to_scale(const arrow::TimeUnit::type unit) {
46
57
  switch (unit) {
47
58
  case arrow::TimeUnit::SECOND:
48
59
  return INT2FIX(1);
@@ -54,8 +65,31 @@ namespace red_arrow {
54
65
  // NOTE: INT2FIX works for 1e+9 because: FIXNUM_MAX >= (1<<30) - 1 > 1e+9
55
66
  return INT2FIX(1000 * 1000 * 1000);
56
67
  default:
57
- break; // NOT REACHED
68
+ rb_raise(rb_eArgError, "invalid arrow::TimeUnit: %d", unit);
69
+ return Qnil;
70
+ }
71
+ }
72
+
73
+ inline VALUE time_unit_to_enum(const arrow::TimeUnit::type unit) {
74
+ switch (unit) {
75
+ case arrow::TimeUnit::SECOND:
76
+ return red_arrow::ArrowTimeUnitSECOND;
77
+ case arrow::TimeUnit::MILLI:
78
+ return red_arrow::ArrowTimeUnitMILLI;
79
+ case arrow::TimeUnit::MICRO:
80
+ return red_arrow::ArrowTimeUnitMICRO;
81
+ case arrow::TimeUnit::NANO:
82
+ return red_arrow::ArrowTimeUnitNANO;
83
+ default:
84
+ rb_raise(rb_eArgError, "invalid arrow::TimeUnit: %d", unit);
85
+ return Qnil;
86
+ }
87
+ }
88
+
89
+ inline void check_status(const arrow::Status&& status, const char* context) {
90
+ GError* error = nullptr;
91
+ if (!garrow_error_check(&error, status, context)) {
92
+ RG_RAISE_ERROR(error);
58
93
  }
59
- return Qnil;
60
94
  }
61
95
  }
@@ -0,0 +1,154 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include "converters.hpp"
21
+
22
+ namespace red_arrow {
23
+ namespace {
24
+ class ValuesBuilder : private Converter, public arrow::ArrayVisitor {
25
+ public:
26
+ explicit ValuesBuilder(VALUE values)
27
+ : Converter(),
28
+ values_(values),
29
+ row_offset_(0) {
30
+ }
31
+
32
+ void build(const arrow::Array& array, VALUE rb_array) {
33
+ rb::protect([&] {
34
+ check_status(array.Accept(this),
35
+ "[array][values]");
36
+ return Qnil;
37
+ });
38
+ }
39
+
40
+ void build(const arrow::ChunkedArray& chunked_array,
41
+ VALUE rb_chunked_array) {
42
+ rb::protect([&] {
43
+ for (const auto& array : chunked_array.chunks()) {
44
+ check_status(array->Accept(this),
45
+ "[chunked-array][values]");
46
+ row_offset_ += array->length();
47
+ }
48
+ return Qnil;
49
+ });
50
+ }
51
+
52
+ #define VISIT(TYPE) \
53
+ arrow::Status Visit(const arrow::TYPE ## Array& array) override { \
54
+ convert(array); \
55
+ return arrow::Status::OK(); \
56
+ }
57
+
58
+ VISIT(Null)
59
+ VISIT(Boolean)
60
+ VISIT(Int8)
61
+ VISIT(Int16)
62
+ VISIT(Int32)
63
+ VISIT(Int64)
64
+ VISIT(UInt8)
65
+ VISIT(UInt16)
66
+ VISIT(UInt32)
67
+ VISIT(UInt64)
68
+ // TODO
69
+ // VISIT(HalfFloat)
70
+ VISIT(Float)
71
+ VISIT(Double)
72
+ VISIT(Binary)
73
+ VISIT(String)
74
+ VISIT(FixedSizeBinary)
75
+ VISIT(Date32)
76
+ VISIT(Date64)
77
+ VISIT(Time32)
78
+ VISIT(Time64)
79
+ VISIT(Timestamp)
80
+ // TODO
81
+ // VISIT(Interval)
82
+ VISIT(List)
83
+ VISIT(Struct)
84
+ VISIT(Union)
85
+ VISIT(Dictionary)
86
+ VISIT(Decimal128)
87
+ // TODO
88
+ // VISIT(Extension)
89
+
90
+ #undef VISIT
91
+
92
+ private:
93
+ template <typename ArrayType>
94
+ void convert(const ArrayType& array) {
95
+ const auto n = array.length();
96
+ if (array.null_count() > 0) {
97
+ for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) {
98
+ auto value = Qnil;
99
+ if (!array.IsNull(i)) {
100
+ value = convert_value(array, i);
101
+ }
102
+ rb_ary_store(values_, ii, value);
103
+ }
104
+ } else {
105
+ for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) {
106
+ rb_ary_store(values_, ii, convert_value(array, i));
107
+ }
108
+ }
109
+ }
110
+
111
+ // Destination for converted values.
112
+ VALUE values_;
113
+
114
+ // The current row offset.
115
+ int64_t row_offset_;
116
+ };
117
+ }
118
+
119
+ VALUE
120
+ array_values(VALUE rb_array) {
121
+ auto garrow_array = GARROW_ARRAY(RVAL2GOBJ(rb_array));
122
+ auto array = garrow_array_get_raw(garrow_array).get();
123
+ const auto n_rows = array->length();
124
+ auto values = rb_ary_new_capa(n_rows);
125
+
126
+ try {
127
+ ValuesBuilder builder(values);
128
+ builder.build(*array, rb_array);
129
+ } catch (rb::State& state) {
130
+ state.jump();
131
+ }
132
+
133
+ return values;
134
+ }
135
+
136
+ VALUE
137
+ chunked_array_values(VALUE rb_chunked_array) {
138
+ auto garrow_chunked_array =
139
+ GARROW_CHUNKED_ARRAY(RVAL2GOBJ(rb_chunked_array));
140
+ auto chunked_array =
141
+ garrow_chunked_array_get_raw(garrow_chunked_array).get();
142
+ const auto n_rows = chunked_array->length();
143
+ auto values = rb_ary_new_capa(n_rows);
144
+
145
+ try {
146
+ ValuesBuilder builder(values);
147
+ builder.build(*chunked_array, rb_chunked_array);
148
+ } catch (rb::State& state) {
149
+ state.jump();
150
+ }
151
+
152
+ return values;
153
+ }
154
+ }
@@ -27,6 +27,7 @@ module Arrow
27
27
  end
28
28
 
29
29
  builder_class = nil
30
+ builder_class_arguments = []
30
31
  values.each do |value|
31
32
  case value
32
33
  when nil
@@ -43,8 +44,29 @@ module Arrow
43
44
  return builder.build(values)
44
45
  else
45
46
  builder_class = UIntArrayBuilder
47
+ builder_class_arguments = []
46
48
  end
47
49
  when Time
50
+ data_type = value.data_type
51
+ case data_type.unit
52
+ when TimeUnit::SECOND
53
+ if builder.nil?
54
+ builder = Time32ArrayBuilder
55
+ builder_class_arguments = [data_type]
56
+ end
57
+ when TimeUnit::MILLI
58
+ if builder != Time64ArrayBuilder
59
+ builder = Time32ArrayBuilder
60
+ builder_class_arguments = [data_type]
61
+ end
62
+ when TimeUnit::MICRO
63
+ builder = Time64ArrayBuilder
64
+ builder_class_arguments = [data_type]
65
+ when TimeUnit::NANO
66
+ builder = Time64ArrayBuilder.new(data_type)
67
+ return builder.build(values)
68
+ end
69
+ when ::Time
48
70
  data_type = TimestampDataType.new(:nano)
49
71
  builder = TimestampArrayBuilder.new(data_type)
50
72
  return builder.build(values)
@@ -57,7 +79,8 @@ module Arrow
57
79
  end
58
80
  end
59
81
  if builder_class
60
- builder_class.new.build(values)
82
+ builder = builder_class.new(*builder_class_arguments)
83
+ builder.build(values)
61
84
  else
62
85
  Arrow::StringArray.new(values)
63
86
  end
@@ -73,5 +73,14 @@ module Arrow
73
73
  def to_arrow
74
74
  self
75
75
  end
76
+
77
+ alias_method :value_data_type_raw, :value_data_type
78
+ def value_data_type
79
+ @value_data_type ||= value_data_type_raw
80
+ end
81
+
82
+ def to_a
83
+ values
84
+ end
76
85
  end
77
86
  end
@@ -19,6 +19,11 @@ module Arrow
19
19
  class ChunkedArray
20
20
  include Enumerable
21
21
 
22
+ alias_method :size, :n_rows
23
+ unless method_defined?(:length)
24
+ alias_method :length, :n_rows
25
+ end
26
+
22
27
  alias_method :chunks_raw, :chunks
23
28
  def chunks
24
29
  @chunks ||= chunks_raw
@@ -0,0 +1,48 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module Arrow
19
+ module ColumnContainable
20
+ def columns
21
+ @columns ||= schema.n_fields.times.collect do |i|
22
+ Column.new(self, i)
23
+ end
24
+ end
25
+
26
+ def each_column(&block)
27
+ columns.each(&block)
28
+ end
29
+
30
+ def find_column(name_or_index)
31
+ case name_or_index
32
+ when String, Symbol
33
+ name = name_or_index.to_s
34
+ index = schema.get_field_index(name)
35
+ return nil if index == -1
36
+ Column.new(self, index)
37
+ when Integer
38
+ index = name_or_index
39
+ index += n_columns if index < 0
40
+ return nil if index < 0 or index >= n_columns
41
+ Column.new(self, index)
42
+ else
43
+ message = "column name or index must be String, Symbol or Integer"
44
+ raise ArgumentError, message
45
+ end
46
+ end
47
+ end
48
+ end
@@ -19,32 +19,58 @@ module Arrow
19
19
  class Column
20
20
  include Enumerable
21
21
 
22
+ attr_reader :container
23
+ attr_reader :field
24
+ attr_reader :data
25
+ def initialize(container, index)
26
+ @container = container
27
+ @index = index
28
+ @field = @container.schema[@index]
29
+ @data = @container.get_column_data(@index)
30
+ end
31
+
32
+ def name
33
+ @field.name
34
+ end
35
+
36
+ def data_type
37
+ @field.data_type
38
+ end
39
+
22
40
  def null?(i)
23
- data.null?(i)
41
+ @data.null?(i)
24
42
  end
25
43
 
26
44
  def valid?(i)
27
- data.valid?(i)
45
+ @data.valid?(i)
28
46
  end
29
47
 
30
48
  def [](i)
31
- data[i]
49
+ @data[i]
32
50
  end
33
51
 
34
52
  def each(&block)
35
- return to_enum(__method__) unless block_given?
36
-
37
- data.each(&block)
53
+ @data.each(&block)
38
54
  end
39
55
 
40
56
  def reverse_each(&block)
41
- return to_enum(__method__) unless block_given?
57
+ @data.reverse_each(&block)
58
+ end
59
+
60
+ def n_rows
61
+ @data.n_rows
62
+ end
63
+ alias_method :size, :n_rows
64
+ alias_method :length, :n_rows
42
65
 
43
- data.reverse_each(&block)
66
+ def n_nulls
67
+ @data.n_nulls
44
68
  end
45
69
 
46
- def pack
47
- self.class.new(field, data.pack)
70
+ def ==(other)
71
+ other.is_a?(self.class) and
72
+ @field == other.field and
73
+ @data == other.data
48
74
  end
49
75
  end
50
76
  end
@@ -221,7 +221,7 @@ module Arrow
221
221
  field
222
222
  else
223
223
  begin
224
- Time.iso8601(encoded_field)
224
+ ::Time.iso8601(encoded_field)
225
225
  rescue ArgumentError
226
226
  field
227
227
  end
@@ -317,7 +317,7 @@ module Arrow
317
317
  if current_column_type == :integer
318
318
  column_types[i] = candidate_type
319
319
  end
320
- when Time
320
+ when ::Time
321
321
  candidate_type = :time
322
322
  when DateTime
323
323
  candidate_type = :date_time
@@ -29,24 +29,33 @@ module Arrow
29
29
  #
30
30
  # @return [Arrow::DataType] The given data type itself.
31
31
  #
32
- # @overload resolve(name, *arguments)
32
+ # @overload resolve(name)
33
33
  #
34
34
  # Creates a suitable data type from type name. For example,
35
35
  # you can create {Arrow::BooleanDataType} from `:boolean`.
36
36
  #
37
37
  # @param name [String, Symbol] The type name of the data type.
38
38
  #
39
- # @param arguments [::Array] The additional information of the
40
- # data type.
39
+ # @example Create a boolean data type
40
+ # Arrow::DataType.resolve(:boolean)
41
+ #
42
+ # @overload resolve(name_with_arguments)
43
+ #
44
+ # Creates a suitable data type from type name with arguments.
45
+ #
46
+ # @param name_with_arguments [::Array<String, ...>]
47
+ # The type name of the data type as the first element.
48
+ #
49
+ # The rest elements are additional information of the data type.
41
50
  #
42
51
  # For example, {Arrow::TimestampDataType} needs unit as
43
52
  # additional information.
44
53
  #
45
54
  # @example Create a boolean data type
46
- # Arrow::DataType.resolve(:boolean)
55
+ # Arrow::DataType.resolve([:boolean])
47
56
  #
48
57
  # @example Create a milliseconds unit timestamp data type
49
- # Arrow::DataType.resolve(:timestamp, :milli)
58
+ # Arrow::DataType.resolve([:timestamp, :milli])
50
59
  #
51
60
  # @overload resolve(description)
52
61
  #
@@ -135,5 +144,13 @@ module Arrow
135
144
  Arrow.const_get(data_type_class_name)
136
145
  end
137
146
  end
147
+
148
+ def build_array(values)
149
+ base_name = self.class.name.gsub(/DataType\z/, "")
150
+ builder_class = self.class.const_get("#{base_name}ArrayBuilder")
151
+ args = [values]
152
+ args.unshift(self) unless builder_class.buildable?(args)
153
+ builder_class.build(*args)
154
+ end
138
155
  end
139
156
  end