red-arrow 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of red-arrow might be problematic. Click here for more details.

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a362c35e8eb6b5b93cb69e1ee077acf572af51d89b54e0e9a41282cbdb546cbb
4
- data.tar.gz: db53bb7327021bbb8e45e1b8dbf741becf4ad98d85366fe6c3a475da3650ef4e
3
+ metadata.gz: 97877568cac79133ccac62dcbb51f2e8466c2fb825aaaf58f65976742257fbdc
4
+ data.tar.gz: 5736802a2d3f5539a2a75421d215aa6bd2a4827ce6da6533cf440c88513db006
5
5
  SHA512:
6
- metadata.gz: 153a2ab1e7ccefe6fbe3d3481d8d86cf90f6d583825543b4ae541d3e3704b1aab4debcc2d5aa7148ded9eefbed828354697514e22664ba63aa83e3ad74e41180
7
- data.tar.gz: a8ab623a40a47a073865a13c0eaa1e7c380574d57307068cb5258884cbf04113030b137b00a83347c0c5a3ccc6ca597db25c89297ccf38b20e03f71e9345b1aa
6
+ metadata.gz: 3bf77b7050ee8ad28f6f2bf99186684b33385cbca51bdc39650441875ab7e0701803b05beb34e75cf9a6ab7a6c4b31b6fb053b4b83d14bf6d205018b84e3d246
7
+ data.tar.gz: 9be3f95a220af905a2c611a6a4742db395441d17d2daf20ea975cec2826bf5dc45319f70fce2536672649a4d3d1a3c499fc3b1f45207764b8500dbecc671613a
data/README.md CHANGED
@@ -33,9 +33,7 @@ gobject-introspection gem is a Ruby bindings of GObject Introspection. Red Arrow
33
33
 
34
34
  ## Install
35
35
 
36
- Install Apache Arrow GLib before install Red Arrow. Use [packages.red-data-tools.org](https://github.com/red-data-tools/packages.red-data-tools.org) for installing Apache Arrow GLib.
37
-
38
- Note that the Apache Arrow GLib packages are "unofficial". "Official" packages will be released in the future.
36
+ Install Apache Arrow GLib before installing Red Arrow. See the [Apache Arrow install document](https://arrow.apache.org/install/) for details.
39
37
 
40
38
  Install Red Arrow after you install Apache Arrow GLib:
41
39
 
@@ -34,6 +34,10 @@ extern "C" void Init_arrow() {
34
34
  rb_define_method(cArrowRecordBatch, "raw_records",
35
35
  reinterpret_cast<rb::RawMethod>(red_arrow::record_batch_raw_records),
36
36
  0);
37
+ auto cArrowTable = rb_const_get_at(mArrow, rb_intern("Table"));
38
+ rb_define_method(cArrowTable, "raw_records",
39
+ reinterpret_cast<rb::RawMethod>(red_arrow::table_raw_records),
40
+ 0);
37
41
 
38
42
  red_arrow::cDate = rb_const_get(rb_cObject, rb_intern("Date"));
39
43
 
@@ -642,11 +642,34 @@ namespace red_arrow {
642
642
  auto record = rb_ary_new_capa(n_columns_);
643
643
  rb_ary_push(records_, record);
644
644
  }
645
+ row_offset_ = 0;
645
646
  for (int i = 0; i < n_columns_; ++i) {
646
647
  const auto array = record_batch.column(i).get();
647
648
  column_index_ = i;
648
649
  check_status(array->Accept(this),
649
- "[raw-records]");
650
+ "[record-batch][raw-records]");
651
+ }
652
+ return Qnil;
653
+ });
654
+ }
655
+
656
+ void build(const arrow::Table& table) {
657
+ rb::protect([&] {
658
+ const auto n_rows = table.num_rows();
659
+ for (int64_t i = 0; i < n_rows; ++i) {
660
+ auto record = rb_ary_new_capa(n_columns_);
661
+ rb_ary_push(records_, record);
662
+ }
663
+ for (int i = 0; i < n_columns_; ++i) {
664
+ const auto column = table.column(i).get();
665
+ const auto chunked_array = column->data();
666
+ column_index_ = i;
667
+ row_offset_ = 0;
668
+ for (const auto array : chunked_array->chunks()) {
669
+ check_status(array->Accept(this),
670
+ "[table][raw-records]");
671
+ row_offset_ += array->length();
672
+ }
650
673
  }
651
674
  return Qnil;
652
675
  });
@@ -703,17 +726,17 @@ namespace red_arrow {
703
726
  void convert(const ArrayType& array) {
704
727
  const auto n = array.length();
705
728
  if (array.null_count() > 0) {
706
- for (int64_t i = 0; i < n; ++i) {
729
+ for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) {
707
730
  auto value = Qnil;
708
731
  if (!array.IsNull(i)) {
709
732
  value = convert_value(array, i);
710
733
  }
711
- auto record = rb_ary_entry(records_, i);
734
+ auto record = rb_ary_entry(records_, ii);
712
735
  rb_ary_store(record, column_index_, value);
713
736
  }
714
737
  } else {
715
- for (int64_t i = 0; i < n; ++i) {
716
- auto record = rb_ary_entry(records_, i);
738
+ for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) {
739
+ auto record = rb_ary_entry(records_, ii);
717
740
  rb_ary_store(record, column_index_, convert_value(array, i));
718
741
  }
719
742
  }
@@ -731,6 +754,9 @@ namespace red_arrow {
731
754
  // The current column index.
732
755
  int column_index_;
733
756
 
757
+ // The current row offset.
758
+ int64_t row_offset_;
759
+
734
760
  // The number of columns.
735
761
  const int n_columns_;
736
762
  };
@@ -753,4 +779,22 @@ namespace red_arrow {
753
779
 
754
780
  return records;
755
781
  }
782
+
783
+ VALUE
784
+ table_raw_records(VALUE rb_table) {
785
+ auto garrow_table = GARROW_TABLE(RVAL2GOBJ(rb_table));
786
+ auto table = garrow_table_get_raw(garrow_table).get();
787
+ const auto n_rows = table->num_rows();
788
+ const auto n_columns = table->num_columns();
789
+ auto records = rb_ary_new_capa(n_rows);
790
+
791
+ try {
792
+ RawRecordsBuilder builder(records, n_columns);
793
+ builder.build(*table);
794
+ } catch (rb::State& state) {
795
+ state.jump();
796
+ }
797
+
798
+ return records;
799
+ }
756
800
  }
@@ -40,6 +40,7 @@ namespace red_arrow {
40
40
  extern ID id_to_datetime;
41
41
 
42
42
  VALUE record_batch_raw_records(VALUE obj);
43
+ VALUE table_raw_records(VALUE obj);
43
44
 
44
45
  inline VALUE time_unit_to_scale(arrow::TimeUnit::type unit) {
45
46
  switch (unit) {
@@ -22,7 +22,7 @@ module Arrow
22
22
 
23
23
  # Creates a new {Arrow::DictionaryDataType}.
24
24
  #
25
- # @overload initialize(index_data_type, dictionary, ordered)
25
+ # @overload initialize(index_data_type, value_data_type, ordered)
26
26
  #
27
27
  # @param index_data_type [Arrow::DataType, Hash, String, Symbol]
28
28
  # The index data type of the dictionary data type. It must be
@@ -39,18 +39,23 @@ module Arrow
39
39
  # See {Arrow::DataType.resolve} how to specify data type
40
40
  # description.
41
41
  #
42
- # @param dictionary [Arrow::Array] The real values of the
43
- # dictionary data type.
42
+ # @param value_data_type [Arrow::DataType, Hash, String, Symbol]
43
+ # The value data type of the dictionary data type.
44
+ #
45
+ # You can specify data type as a description by `Hash`.
46
+ #
47
+ # See {Arrow::DataType.resolve} how to specify data type
48
+ # description.
44
49
  #
45
50
  # @param ordered [Boolean] Whether dictionary contents are
46
51
  # ordered or not.
47
52
  #
48
53
  # @example Create a dictionary data type for {0: "Hello", 1: "World"}
49
54
  # index_data_type = :int8
50
- # dictionary = Arrow::StringArray.new(["Hello", "World"])
55
+ # value_data_type = :string
51
56
  # ordered = true
52
57
  # Arrow::DictionaryDataType.new(index_data_type,
53
- # dictionary,
58
+ # value_data_type,
54
59
  # ordered)
55
60
  #
56
61
  # @overload initialize(description)
@@ -74,16 +79,21 @@ module Arrow
74
79
  # See {Arrow::DataType.resolve} how to specify data type
75
80
  # description.
76
81
  #
77
- # @option description [Arrow::Array] :dictionary The real values
78
- # of the dictionary data type.
82
+ # @option description [Arrow::DataType, Hash, String, Symbol]
83
+ # :value_data_type
84
+ # The value data type of the dictionary data type.
85
+ #
86
+ # You can specify data type as a description by `Hash`.
87
+ #
88
+ # See {Arrow::DataType.resolve} how to specify data type
89
+ # description.
79
90
  #
80
91
  # @option description [Boolean] :ordered Whether dictionary
81
92
  # contents are ordered or not.
82
93
  #
83
94
  # @example Create a dictionary data type for {0: "Hello", 1: "World"}
84
- # dictionary = Arrow::StringArray.new(["Hello", "World"])
85
95
  # Arrow::DictionaryDataType.new(index_data_type: :int8,
86
- # dictionary: dictionary,
96
+ # value_data_type: :string,
87
97
  # ordered: true)
88
98
  def initialize(*args)
89
99
  n_args = args.size
@@ -91,16 +101,17 @@ module Arrow
91
101
  when 1
92
102
  description = args[0]
93
103
  index_data_type = description[:index_data_type]
94
- dictionary = description[:dictionary]
104
+ value_data_type = description[:value_data_type]
95
105
  ordered = description[:ordered]
96
106
  when 3
97
- index_data_type, dictionary, ordered = args
107
+ index_data_type, value_data_type, ordered = args
98
108
  else
99
109
  message = "wrong number of arguments (given, #{n_args}, expected 1 or 3)"
100
110
  raise ArgumentError, message
101
111
  end
102
112
  index_data_type = DataType.resolve(index_data_type)
103
- initialize_raw(index_data_type, dictionary, ordered)
113
+ value_data_type = DataType.resolve(value_data_type)
114
+ initialize_raw(index_data_type, value_data_type, ordered)
104
115
  end
105
116
  end
106
117
  end
@@ -30,27 +30,154 @@ module Arrow
30
30
 
31
31
  alias_method :initialize_raw, :initialize
32
32
  private :initialize_raw
33
- def initialize(schema_or_raw_table_or_columns, columns=nil)
34
- if columns.nil?
35
- if schema_or_raw_table_or_columns[0].is_a?(Column)
36
- columns = schema_or_raw_table_or_columns
37
- fields = columns.collect(&:field)
33
+
34
+ # Creates a new {Arrow::Table}.
35
+ #
36
+ # @overload initialize(columns)
37
+ #
38
+ # @param columns [::Array<Arrow::Column>] The columns of the table.
39
+ #
40
+ # @example Create a table from columns
41
+ # count_field = Arrow::Field.new("count", :uint32)
42
+ # count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
43
+ # count_column = Arrow::Column.new(count_field, count_array)
44
+ # visible_field = Arrow::Field.new("visible", :boolean)
45
+ # visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
46
+ # visible_column = Arrow::Column.new(visible_field, visible_array)
47
+ # Arrow::Table.new([count_column, visible_column])
48
+ #
49
+ # @overload initialize(raw_table)
50
+ #
51
+ # @param raw_table [Hash<String, Arrow::Array>]
52
+ # The pairs of column name and values of the table. Column values are
53
+ # `Arrow::Array`.
54
+ #
55
+ # @example Create a table from column name and values
56
+ # Arrow::Table.new("count" => Arrow::UInt32Array.new([0, 2, nil, 4]),
57
+ # "visible" => Arrow::BooleanArray.new([true, nil, nil, false]))
58
+ #
59
+ # @overload initialize(raw_table)
60
+ #
61
+ # @param raw_table [Hash<String, Arrow::ChunkedArray>]
62
+ # The pairs of column name and values of the table. Column values are
63
+ # `Arrow::ChunkedArray`.
64
+ #
65
+ # @example Create a table from column name and values
66
+ # count_chunks = [
67
+ # Arrow::UInt32Array.new([0, 2]),
68
+ # Arrow::UInt32Array.new([nil, 4]),
69
+ # ]
70
+ # visible_chunks = [
71
+ # Arrow::BooleanArray.new([true]),
72
+ # Arrow::BooleanArray.new([nil, nil, false]),
73
+ # ]
74
+ # Arrow::Table.new("count" => Arrow::ChunkedArray.new(count_chunks),
75
+ # "visible" => Arrow::ChunkedArray.new(visible_chunks))
76
+ #
77
+ # @overload initialize(schema, columns)
78
+ #
79
+ # @param schema [Arrow::Schema] The schema of the table.
80
+ # You can also specify schema as primitive Ruby objects.
81
+ # See {Arrow::Schema#initialize} for details.
82
+ #
83
+ # @param columns [::Array<Arrow::Column>] The data of the table.
84
+ #
85
+ # @example Create a table from schema and columns
86
+ # count_field = Arrow::Field.new("count", :uint32)
87
+ # count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
88
+ # count_column = Arrow::Column.new(count_field, count_array)
89
+ # visible_field = Arrow::Field.new("visible", :boolean)
90
+ # visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
91
+ # visible_column = Arrow::Column.new(visible_field, visible_array)
92
+ # Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
93
+ # [count_column, visible_column])
94
+ #
95
+ # @overload initialize(schema, arrays)
96
+ #
97
+ # @param schema [Arrow::Schema] The schema of the table.
98
+ # You can also specify schema as primitive Ruby objects.
99
+ # See {Arrow::Schema#initialize} for details.
100
+ #
101
+ # @param arrays [::Array<Arrow::Array>] The data of the table.
102
+ #
103
+ # @example Create a table from schema and arrays
104
+ # count_field = Arrow::Field.new("count", :uint32)
105
+ # count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
106
+ # visible_field = Arrow::Field.new("visible", :boolean)
107
+ # visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
108
+ # Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
109
+ # [count_array, visible_array])
110
+ #
111
+ # @overload initialize(schema, record_batches)
112
+ #
113
+ # @param schema [Arrow::Schema] The schema of the table.
114
+ # You can also specify schema as primitive Ruby objects.
115
+ # See {Arrow::Schema#initialize} for details.
116
+ #
117
+ # @param record_batches [::Array<Arrow::RecordBatch>] The data of the table.
118
+ #
119
+ # @example Create a table from schema and record batches
120
+ # count_field = Arrow::Field.new("count", :uint32)
121
+ # visible_field = Arrow::Field.new("visible", :boolean)
122
+ # schema = Arrow::Schema.new([count_field, visible_field])
123
+ # record_batches = [
124
+ # Arrow::RecordBatch.new(schema, [[0, true], [2, nil], [nil, nil]]),
125
+ # Arrow::RecordBatch.new(schema, [[4, false]]),
126
+ # ]
127
+ # Arrow::Table.new(schema, record_batches)
128
+ #
129
+ # @overload initialize(schema, raw_records)
130
+ #
131
+ # @param schema [Arrow::Schema] The schema of the table.
132
+ # You can also specify schema as primitive Ruby objects.
133
+ # See {Arrow::Schema#initialize} for details.
134
+ #
135
+ # @param raw_records [::Array<::Array>] The data of the table as primitive
136
+ # Ruby objects.
137
+ #
138
+ # @example Create a table from schema and raw records
139
+ # schema = {
140
+ # count: :uint32,
141
+ # visible: :boolean,
142
+ # }
143
+ # raw_records = [
144
+ # [0, true],
145
+ # [2, nil],
146
+ # [nil, nil],
147
+ # [4, false],
148
+ # ]
149
+ # Arrow::Table.new(schema, raw_records)
150
+ def initialize(*args)
151
+ n_args = args.size
152
+ case n_args
153
+ when 1
154
+ if args[0][0].is_a?(Column)
155
+ values = args[0]
156
+ fields = values.collect(&:field)
38
157
  schema = Schema.new(fields)
39
158
  else
40
- raw_table = schema_or_raw_table_or_columns
159
+ raw_table = args[0]
41
160
  fields = []
42
- columns = []
161
+ values = []
43
162
  raw_table.each do |name, array|
44
163
  field = Field.new(name.to_s, array.value_data_type)
45
164
  fields << field
46
- columns << Column.new(field, array)
165
+ values << Column.new(field, array)
47
166
  end
48
167
  schema = Schema.new(fields)
49
168
  end
169
+ when 2
170
+ schema = args[0]
171
+ schema = Schema.new(schema) unless schema.is_a?(Schema)
172
+ values = args[1]
173
+ if values[0].is_a?(::Array)
174
+ values = [RecordBatch.new(schema, values)]
175
+ end
50
176
  else
51
- schema = schema_or_raw_table_or_columns
177
+ message = "wrong number of arguments (given, #{n_args}, expected 1..2)"
178
+ raise ArgumentError, message
52
179
  end
53
- initialize_raw(schema, columns)
180
+ initialize_raw(schema, values)
54
181
  end
55
182
 
56
183
  def columns
@@ -71,43 +198,92 @@ module Arrow
71
198
 
72
199
  alias_method :[], :find_column
73
200
 
74
- # TODO
201
+ alias_method :slice_raw, :slice
202
+
203
+ # @overload slice(offset, length)
75
204
  #
76
- # @return [Arrow::Table]
205
+ # @param offset [Integer] The offset of sub Arrow::Table.
206
+ # @param length [Integer] The length of sub Arrow::Table.
207
+ # @return [Arrow::Table]
208
+ # The sub `Arrow::Table` that covers only from
209
+ # `offset` to `offset + length` range.
210
+ #
211
+ # @overload slice(index)
212
+ #
213
+ # @param index [Integer] The index in this table.
214
+ # @return [Arrow::Record]
215
+ # The `Arrow::Record` corresponding to index of
216
+ # the table.
217
+ #
218
+ # @overload slice(booleans)
219
+ #
220
+ # @param booleans [::Array<Boolean>]
221
+ # The values indicating the target rows.
222
+ # @return [Arrow::Table]
223
+ # The sub `Arrow::Table` that covers only rows of indices
224
+ # the values of `booleans` is true.
225
+ #
226
+ # @overload slice(boolean_array)
227
+ #
228
+ # @param boolean_array [::Array<Arrow::BooleanArray>]
229
+ # The values indicating the target rows.
230
+ # @return [Arrow::Table]
231
+ # The sub `Arrow::Table` that covers only rows of indices
232
+ # the values of `boolean_array` is true.
233
+ #
234
+ # @overload slice(range)
235
+ #
236
+ # @param range [Range] The range indicating the target rows.
237
+ # @return [Arrow::Table]
238
+ # The sub `Arrow::Table` that covers only rows of the range of indices.
239
+ #
240
+ # @overload slice
241
+ #
242
+ # @yield [slicer] Gives slicer that constructs condition to select records.
243
+ # @yieldparam slicer [Arrow::Slicer] The slicer that helps us to
244
+ # build condition.
245
+ # @yieldreturn [Arrow::Slicer::Condition, ::Array<Arrow::Slicer::Condition>]
246
+ # The condition to select records.
247
+ # @return [Arrow::Table]
248
+ # The sub `Arrow::Table` that covers only rows matched by condition
249
+ # specified by slicer.
77
250
  def slice(*args)
78
251
  slicers = []
79
- expected_n_args = nil
80
- case args.size
81
- when 0
82
- expected_n_args = "1..2" unless block_given?
83
- when 1
84
- slicers << args[0]
85
- when 2
86
- from, to = args
87
- slicers << (from...(from + to))
88
- else
89
- if block_given?
90
- expected_n_args = "0..2"
91
- else
92
- expected_n_args = "1..2"
93
- end
94
- end
95
- if expected_n_args
96
- message = "wrong number of arguments " +
97
- "(given #{args.size}, expected #{expected_n_args})"
98
- raise ArgumentError, message
99
- end
100
-
101
252
  if block_given?
253
+ unless args.empty?
254
+ raise ArgumentError, "must not specify both arguments and block"
255
+ end
102
256
  block_slicer = yield(Slicer.new(self))
103
257
  case block_slicer
104
- when nil
105
- # Ignore
106
258
  when ::Array
107
259
  slicers.concat(block_slicer)
108
260
  else
109
261
  slicers << block_slicer
110
262
  end
263
+ else
264
+ expected_n_args = nil
265
+ case args.size
266
+ when 1
267
+ if args[0].is_a?(Integer)
268
+ index = args[0]
269
+ index += n_rows if index < 0
270
+ return nil if index < 0
271
+ return nil if index >= n_rows
272
+ return Record.new(self, index)
273
+ else
274
+ slicers << args[0]
275
+ end
276
+ when 2
277
+ offset, length = args
278
+ slicers << (offset...(offset + length))
279
+ else
280
+ expected_n_args = "1..2"
281
+ end
282
+ if expected_n_args
283
+ message = "wrong number of arguments " +
284
+ "(given #{args.size}, expected #{expected_n_args})"
285
+ raise ArgumentError, message
286
+ end
111
287
  end
112
288
 
113
289
  ranges = []
@@ -116,12 +292,18 @@ module Arrow
116
292
  case slicer
117
293
  when Integer
118
294
  slicer += n_rows if slicer < 0
119
- ranges << [slicer, slicer]
295
+ ranges << [slicer, n_rows - 1]
120
296
  when Range
121
- from = slicer.first
297
+ original_from = from = slicer.first
122
298
  to = slicer.last
123
299
  to -= 1 if slicer.exclude_end?
124
300
  from += n_rows if from < 0
301
+ if from < 0 or from >= n_rows
302
+ message =
303
+ "offset is out of range (-#{n_rows + 1},#{n_rows}): " +
304
+ "#{original_from}"
305
+ raise ArgumentError, message
306
+ end
125
307
  to += n_rows if to < 0
126
308
  ranges << [from, to]
127
309
  when ::Array
@@ -330,47 +512,16 @@ module Arrow
330
512
  end
331
513
  end
332
514
 
333
- # TODO: Almost codes should be implemented in Apache Arrow C++.
334
515
  def slice_by_ranges(ranges)
335
- sliced_columns = columns.collect do |column|
336
- chunks = []
337
- arrays = column.data.each_chunk.to_a
338
- offset = 0
339
- offset_in_array = 0
340
- ranges.each do |from, to|
341
- range_size = to - from + 1
342
- while range_size > 0
343
- while offset + arrays.first.length - offset_in_array < from
344
- offset += arrays.first.length - offset_in_array
345
- arrays.shift
346
- offset_in_array = 0
347
- end
348
- if offset < from
349
- skipped_size = from - offset
350
- offset += skipped_size
351
- offset_in_array += skipped_size
352
- end
353
- array = arrays.first
354
- array_length = array.length
355
- rest_length = array_length - offset_in_array
356
- if rest_length <= range_size
357
- chunks << array.slice(offset_in_array, array_length)
358
- offset += rest_length
359
- range_size -= rest_length
360
- offset_in_array = 0
361
- arrays.shift
362
- else
363
- chunks << array.slice(offset_in_array, range_size)
364
- offset += range_size
365
- offset_in_array += range_size
366
- range_size = 0
367
- end
368
- end
369
- end
370
- Column.new(column.field, ChunkedArray.new(chunks))
516
+ sliced_table = []
517
+ ranges.each do |from, to|
518
+ sliced_table << slice_raw(from, to - from + 1)
519
+ end
520
+ if sliced_table.size > 1
521
+ sliced_table[0].concatenate(sliced_table[1..-1])
522
+ else
523
+ sliced_table[0]
371
524
  end
372
-
373
- self.class.new(schema, sliced_columns)
374
525
  end
375
526
 
376
527
  def ensure_column(name, data)