red-arrow 0.13.0 → 0.14.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of red-arrow might be problematic. Click here for more details.

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a362c35e8eb6b5b93cb69e1ee077acf572af51d89b54e0e9a41282cbdb546cbb
4
- data.tar.gz: db53bb7327021bbb8e45e1b8dbf741becf4ad98d85366fe6c3a475da3650ef4e
3
+ metadata.gz: 97877568cac79133ccac62dcbb51f2e8466c2fb825aaaf58f65976742257fbdc
4
+ data.tar.gz: 5736802a2d3f5539a2a75421d215aa6bd2a4827ce6da6533cf440c88513db006
5
5
  SHA512:
6
- metadata.gz: 153a2ab1e7ccefe6fbe3d3481d8d86cf90f6d583825543b4ae541d3e3704b1aab4debcc2d5aa7148ded9eefbed828354697514e22664ba63aa83e3ad74e41180
7
- data.tar.gz: a8ab623a40a47a073865a13c0eaa1e7c380574d57307068cb5258884cbf04113030b137b00a83347c0c5a3ccc6ca597db25c89297ccf38b20e03f71e9345b1aa
6
+ metadata.gz: 3bf77b7050ee8ad28f6f2bf99186684b33385cbca51bdc39650441875ab7e0701803b05beb34e75cf9a6ab7a6c4b31b6fb053b4b83d14bf6d205018b84e3d246
7
+ data.tar.gz: 9be3f95a220af905a2c611a6a4742db395441d17d2daf20ea975cec2826bf5dc45319f70fce2536672649a4d3d1a3c499fc3b1f45207764b8500dbecc671613a
data/README.md CHANGED
@@ -33,9 +33,7 @@ gobject-introspection gem is a Ruby bindings of GObject Introspection. Red Arrow
33
33
 
34
34
  ## Install
35
35
 
36
- Install Apache Arrow GLib before install Red Arrow. Use [packages.red-data-tools.org](https://github.com/red-data-tools/packages.red-data-tools.org) for installing Apache Arrow GLib.
37
-
38
- Note that the Apache Arrow GLib packages are "unofficial". "Official" packages will be released in the future.
36
+ Install Apache Arrow GLib before installing Red Arrow. See the [Apache Arrow install document](https://arrow.apache.org/install/) for details.
39
37
 
40
38
  Install Red Arrow after you install Apache Arrow GLib:
41
39
 
@@ -34,6 +34,10 @@ extern "C" void Init_arrow() {
34
34
  rb_define_method(cArrowRecordBatch, "raw_records",
35
35
  reinterpret_cast<rb::RawMethod>(red_arrow::record_batch_raw_records),
36
36
  0);
37
+ auto cArrowTable = rb_const_get_at(mArrow, rb_intern("Table"));
38
+ rb_define_method(cArrowTable, "raw_records",
39
+ reinterpret_cast<rb::RawMethod>(red_arrow::table_raw_records),
40
+ 0);
37
41
 
38
42
  red_arrow::cDate = rb_const_get(rb_cObject, rb_intern("Date"));
39
43
 
@@ -642,11 +642,34 @@ namespace red_arrow {
642
642
  auto record = rb_ary_new_capa(n_columns_);
643
643
  rb_ary_push(records_, record);
644
644
  }
645
+ row_offset_ = 0;
645
646
  for (int i = 0; i < n_columns_; ++i) {
646
647
  const auto array = record_batch.column(i).get();
647
648
  column_index_ = i;
648
649
  check_status(array->Accept(this),
649
- "[raw-records]");
650
+ "[record-batch][raw-records]");
651
+ }
652
+ return Qnil;
653
+ });
654
+ }
655
+
656
+ void build(const arrow::Table& table) {
657
+ rb::protect([&] {
658
+ const auto n_rows = table.num_rows();
659
+ for (int64_t i = 0; i < n_rows; ++i) {
660
+ auto record = rb_ary_new_capa(n_columns_);
661
+ rb_ary_push(records_, record);
662
+ }
663
+ for (int i = 0; i < n_columns_; ++i) {
664
+ const auto column = table.column(i).get();
665
+ const auto chunked_array = column->data();
666
+ column_index_ = i;
667
+ row_offset_ = 0;
668
+ for (const auto array : chunked_array->chunks()) {
669
+ check_status(array->Accept(this),
670
+ "[table][raw-records]");
671
+ row_offset_ += array->length();
672
+ }
650
673
  }
651
674
  return Qnil;
652
675
  });
@@ -703,17 +726,17 @@ namespace red_arrow {
703
726
  void convert(const ArrayType& array) {
704
727
  const auto n = array.length();
705
728
  if (array.null_count() > 0) {
706
- for (int64_t i = 0; i < n; ++i) {
729
+ for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) {
707
730
  auto value = Qnil;
708
731
  if (!array.IsNull(i)) {
709
732
  value = convert_value(array, i);
710
733
  }
711
- auto record = rb_ary_entry(records_, i);
734
+ auto record = rb_ary_entry(records_, ii);
712
735
  rb_ary_store(record, column_index_, value);
713
736
  }
714
737
  } else {
715
- for (int64_t i = 0; i < n; ++i) {
716
- auto record = rb_ary_entry(records_, i);
738
+ for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) {
739
+ auto record = rb_ary_entry(records_, ii);
717
740
  rb_ary_store(record, column_index_, convert_value(array, i));
718
741
  }
719
742
  }
@@ -731,6 +754,9 @@ namespace red_arrow {
731
754
  // The current column index.
732
755
  int column_index_;
733
756
 
757
+ // The current row offset.
758
+ int64_t row_offset_;
759
+
734
760
  // The number of columns.
735
761
  const int n_columns_;
736
762
  };
@@ -753,4 +779,22 @@ namespace red_arrow {
753
779
 
754
780
  return records;
755
781
  }
782
+
783
+ VALUE
784
+ table_raw_records(VALUE rb_table) {
785
+ auto garrow_table = GARROW_TABLE(RVAL2GOBJ(rb_table));
786
+ auto table = garrow_table_get_raw(garrow_table).get();
787
+ const auto n_rows = table->num_rows();
788
+ const auto n_columns = table->num_columns();
789
+ auto records = rb_ary_new_capa(n_rows);
790
+
791
+ try {
792
+ RawRecordsBuilder builder(records, n_columns);
793
+ builder.build(*table);
794
+ } catch (rb::State& state) {
795
+ state.jump();
796
+ }
797
+
798
+ return records;
799
+ }
756
800
  }
@@ -40,6 +40,7 @@ namespace red_arrow {
40
40
  extern ID id_to_datetime;
41
41
 
42
42
  VALUE record_batch_raw_records(VALUE obj);
43
+ VALUE table_raw_records(VALUE obj);
43
44
 
44
45
  inline VALUE time_unit_to_scale(arrow::TimeUnit::type unit) {
45
46
  switch (unit) {
@@ -22,7 +22,7 @@ module Arrow
22
22
 
23
23
  # Creates a new {Arrow::DictionaryDataType}.
24
24
  #
25
- # @overload initialize(index_data_type, dictionary, ordered)
25
+ # @overload initialize(index_data_type, value_data_type, ordered)
26
26
  #
27
27
  # @param index_data_type [Arrow::DataType, Hash, String, Symbol]
28
28
  # The index data type of the dictionary data type. It must be
@@ -39,18 +39,23 @@ module Arrow
39
39
  # See {Arrow::DataType.resolve} how to specify data type
40
40
  # description.
41
41
  #
42
- # @param dictionary [Arrow::Array] The real values of the
43
- # dictionary data type.
42
+ # @param value_data_type [Arrow::DataType, Hash, String, Symbol]
43
+ # The value data type of the dictionary data type.
44
+ #
45
+ # You can specify data type as a description by `Hash`.
46
+ #
47
+ # See {Arrow::DataType.resolve} how to specify data type
48
+ # description.
44
49
  #
45
50
  # @param ordered [Boolean] Whether dictionary contents are
46
51
  # ordered or not.
47
52
  #
48
53
  # @example Create a dictionary data type for {0: "Hello", 1: "World"}
49
54
  # index_data_type = :int8
50
- # dictionary = Arrow::StringArray.new(["Hello", "World"])
55
+ # value_data_type = :string
51
56
  # ordered = true
52
57
  # Arrow::DictionaryDataType.new(index_data_type,
53
- # dictionary,
58
+ # value_data_type,
54
59
  # ordered)
55
60
  #
56
61
  # @overload initialize(description)
@@ -74,16 +79,21 @@ module Arrow
74
79
  # See {Arrow::DataType.resolve} how to specify data type
75
80
  # description.
76
81
  #
77
- # @option description [Arrow::Array] :dictionary The real values
78
- # of the dictionary data type.
82
+ # @option description [Arrow::DataType, Hash, String, Symbol]
83
+ # :value_data_type
84
+ # The value data type of the dictionary data type.
85
+ #
86
+ # You can specify data type as a description by `Hash`.
87
+ #
88
+ # See {Arrow::DataType.resolve} how to specify data type
89
+ # description.
79
90
  #
80
91
  # @option description [Boolean] :ordered Whether dictionary
81
92
  # contents are ordered or not.
82
93
  #
83
94
  # @example Create a dictionary data type for {0: "Hello", 1: "World"}
84
- # dictionary = Arrow::StringArray.new(["Hello", "World"])
85
95
  # Arrow::DictionaryDataType.new(index_data_type: :int8,
86
- # dictionary: dictionary,
96
+ # value_data_type: :string,
87
97
  # ordered: true)
88
98
  def initialize(*args)
89
99
  n_args = args.size
@@ -91,16 +101,17 @@ module Arrow
91
101
  when 1
92
102
  description = args[0]
93
103
  index_data_type = description[:index_data_type]
94
- dictionary = description[:dictionary]
104
+ value_data_type = description[:value_data_type]
95
105
  ordered = description[:ordered]
96
106
  when 3
97
- index_data_type, dictionary, ordered = args
107
+ index_data_type, value_data_type, ordered = args
98
108
  else
99
109
  message = "wrong number of arguments (given, #{n_args}, expected 1 or 3)"
100
110
  raise ArgumentError, message
101
111
  end
102
112
  index_data_type = DataType.resolve(index_data_type)
103
- initialize_raw(index_data_type, dictionary, ordered)
113
+ value_data_type = DataType.resolve(value_data_type)
114
+ initialize_raw(index_data_type, value_data_type, ordered)
104
115
  end
105
116
  end
106
117
  end
@@ -30,27 +30,154 @@ module Arrow
30
30
 
31
31
  alias_method :initialize_raw, :initialize
32
32
  private :initialize_raw
33
- def initialize(schema_or_raw_table_or_columns, columns=nil)
34
- if columns.nil?
35
- if schema_or_raw_table_or_columns[0].is_a?(Column)
36
- columns = schema_or_raw_table_or_columns
37
- fields = columns.collect(&:field)
33
+
34
+ # Creates a new {Arrow::Table}.
35
+ #
36
+ # @overload initialize(columns)
37
+ #
38
+ # @param columns [::Array<Arrow::Column>] The columns of the table.
39
+ #
40
+ # @example Create a table from columns
41
+ # count_field = Arrow::Field.new("count", :uint32)
42
+ # count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
43
+ # count_column = Arrow::Column.new(count_field, count_array)
44
+ # visible_field = Arrow::Field.new("visible", :boolean)
45
+ # visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
46
+ # visible_column = Arrow::Column.new(visible_field, visible_array)
47
+ # Arrow::Table.new([count_column, visible_column])
48
+ #
49
+ # @overload initialize(raw_table)
50
+ #
51
+ # @param raw_table [Hash<String, Arrow::Array>]
52
+ # The pairs of column name and values of the table. Column values are
53
+ # `Arrow::Array`.
54
+ #
55
+ # @example Create a table from column name and values
56
+ # Arrow::Table.new("count" => Arrow::UInt32Array.new([0, 2, nil, 4]),
57
+ # "visible" => Arrow::BooleanArray.new([true, nil, nil, false]))
58
+ #
59
+ # @overload initialize(raw_table)
60
+ #
61
+ # @param raw_table [Hash<String, Arrow::ChunkedArray>]
62
+ # The pairs of column name and values of the table. Column values are
63
+ # `Arrow::ChunkedArray`.
64
+ #
65
+ # @example Create a table from column name and values
66
+ # count_chunks = [
67
+ # Arrow::UInt32Array.new([0, 2]),
68
+ # Arrow::UInt32Array.new([nil, 4]),
69
+ # ]
70
+ # visible_chunks = [
71
+ # Arrow::BooleanArray.new([true]),
72
+ # Arrow::BooleanArray.new([nil, nil, false]),
73
+ # ]
74
+ # Arrow::Table.new("count" => Arrow::ChunkedArray.new(count_chunks),
75
+ # "visible" => Arrow::ChunkedArray.new(visible_chunks))
76
+ #
77
+ # @overload initialize(schema, columns)
78
+ #
79
+ # @param schema [Arrow::Schema] The schema of the table.
80
+ # You can also specify schema as primitive Ruby objects.
81
+ # See {Arrow::Schema#initialize} for details.
82
+ #
83
+ # @param columns [::Array<Arrow::Column>] The data of the table.
84
+ #
85
+ # @example Create a table from schema and columns
86
+ # count_field = Arrow::Field.new("count", :uint32)
87
+ # count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
88
+ # count_column = Arrow::Column.new(count_field, count_array)
89
+ # visible_field = Arrow::Field.new("visible", :boolean)
90
+ # visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
91
+ # visible_column = Arrow::Column.new(visible_field, visible_array)
92
+ # Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
93
+ # [count_column, visible_column])
94
+ #
95
+ # @overload initialize(schema, arrays)
96
+ #
97
+ # @param schema [Arrow::Schema] The schema of the table.
98
+ # You can also specify schema as primitive Ruby objects.
99
+ # See {Arrow::Schema#initialize} for details.
100
+ #
101
+ # @param arrays [::Array<Arrow::Array>] The data of the table.
102
+ #
103
+ # @example Create a table from schema and arrays
104
+ # count_field = Arrow::Field.new("count", :uint32)
105
+ # count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
106
+ # visible_field = Arrow::Field.new("visible", :boolean)
107
+ # visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
108
+ # Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
109
+ # [count_array, visible_array])
110
+ #
111
+ # @overload initialize(schema, record_batches)
112
+ #
113
+ # @param schema [Arrow::Schema] The schema of the table.
114
+ # You can also specify schema as primitive Ruby objects.
115
+ # See {Arrow::Schema#initialize} for details.
116
+ #
117
+ # @param record_batches [::Array<Arrow::RecordBatch>] The data of the table.
118
+ #
119
+ # @example Create a table from schema and record batches
120
+ # count_field = Arrow::Field.new("count", :uint32)
121
+ # visible_field = Arrow::Field.new("visible", :boolean)
122
+ # schema = Arrow::Schema.new([count_field, visible_field])
123
+ # record_batches = [
124
+ # Arrow::RecordBatch.new(schema, [[0, true], [2, nil], [nil, nil]]),
125
+ # Arrow::RecordBatch.new(schema, [[4, false]]),
126
+ # ]
127
+ # Arrow::Table.new(schema, record_batches)
128
+ #
129
+ # @overload initialize(schema, raw_records)
130
+ #
131
+ # @param schema [Arrow::Schema] The schema of the table.
132
+ # You can also specify schema as primitive Ruby objects.
133
+ # See {Arrow::Schema#initialize} for details.
134
+ #
135
+ # @param raw_records [::Array<::Array>] The data of the table as primitive
136
+ # Ruby objects.
137
+ #
138
+ # @example Create a table from schema and raw records
139
+ # schema = {
140
+ # count: :uint32,
141
+ # visible: :boolean,
142
+ # }
143
+ # raw_records = [
144
+ # [0, true],
145
+ # [2, nil],
146
+ # [nil, nil],
147
+ # [4, false],
148
+ # ]
149
+ # Arrow::Table.new(schema, raw_records)
150
+ def initialize(*args)
151
+ n_args = args.size
152
+ case n_args
153
+ when 1
154
+ if args[0][0].is_a?(Column)
155
+ values = args[0]
156
+ fields = values.collect(&:field)
38
157
  schema = Schema.new(fields)
39
158
  else
40
- raw_table = schema_or_raw_table_or_columns
159
+ raw_table = args[0]
41
160
  fields = []
42
- columns = []
161
+ values = []
43
162
  raw_table.each do |name, array|
44
163
  field = Field.new(name.to_s, array.value_data_type)
45
164
  fields << field
46
- columns << Column.new(field, array)
165
+ values << Column.new(field, array)
47
166
  end
48
167
  schema = Schema.new(fields)
49
168
  end
169
+ when 2
170
+ schema = args[0]
171
+ schema = Schema.new(schema) unless schema.is_a?(Schema)
172
+ values = args[1]
173
+ if values[0].is_a?(::Array)
174
+ values = [RecordBatch.new(schema, values)]
175
+ end
50
176
  else
51
- schema = schema_or_raw_table_or_columns
177
+ message = "wrong number of arguments (given, #{n_args}, expected 1..2)"
178
+ raise ArgumentError, message
52
179
  end
53
- initialize_raw(schema, columns)
180
+ initialize_raw(schema, values)
54
181
  end
55
182
 
56
183
  def columns
@@ -71,43 +198,92 @@ module Arrow
71
198
 
72
199
  alias_method :[], :find_column
73
200
 
74
- # TODO
201
+ alias_method :slice_raw, :slice
202
+
203
+ # @overload slice(offset, length)
75
204
  #
76
- # @return [Arrow::Table]
205
+ # @param offset [Integer] The offset of sub Arrow::Table.
206
+ # @param length [Integer] The length of sub Arrow::Table.
207
+ # @return [Arrow::Table]
208
+ # The sub `Arrow::Table` that covers only the rows from
209
+ # `offset` up to, but not including, `offset + length`.
210
+ #
211
+ # @overload slice(index)
212
+ #
213
+ # @param index [Integer] The index in this table.
214
+ # @return [Arrow::Record]
215
+ # The `Arrow::Record` corresponding to index of
216
+ # the table.
217
+ #
218
+ # @overload slice(booleans)
219
+ #
220
+ # @param booleans [::Array<Boolean>]
221
+ # The values indicating the target rows.
222
+ # @return [Arrow::Table]
223
+ # The sub `Arrow::Table` that covers only the rows whose
224
+ # corresponding value in `booleans` is true.
225
+ #
226
+ # @overload slice(boolean_array)
227
+ #
228
+ # @param boolean_array [::Array<Arrow::BooleanArray>]
229
+ # The values indicating the target rows.
230
+ # @return [Arrow::Table]
231
+ # The sub `Arrow::Table` that covers only the rows whose
232
+ # corresponding value in `boolean_array` is true.
233
+ #
234
+ # @overload slice(range)
235
+ #
236
+ # @param range [Range] The range indicating the target rows.
237
+ # @return [Arrow::Table]
238
+ # The sub `Arrow::Table` that covers only rows of the range of indices.
239
+ #
240
+ # @overload slice
241
+ #
242
+ # @yield [slicer] Gives slicer that constructs condition to select records.
243
+ # @yieldparam slicer [Arrow::Slicer] The slicer that helps us to
244
+ # build condition.
245
+ # @yieldreturn [Arrow::Slicer::Condition, ::Array<Arrow::Slicer::Condition>]
246
+ # The condition to select records.
247
+ # @return [Arrow::Table]
248
+ # The sub `Arrow::Table` that covers only rows matched by condition
249
+ # specified by slicer.
77
250
  def slice(*args)
78
251
  slicers = []
79
- expected_n_args = nil
80
- case args.size
81
- when 0
82
- expected_n_args = "1..2" unless block_given?
83
- when 1
84
- slicers << args[0]
85
- when 2
86
- from, to = args
87
- slicers << (from...(from + to))
88
- else
89
- if block_given?
90
- expected_n_args = "0..2"
91
- else
92
- expected_n_args = "1..2"
93
- end
94
- end
95
- if expected_n_args
96
- message = "wrong number of arguments " +
97
- "(given #{args.size}, expected #{expected_n_args})"
98
- raise ArgumentError, message
99
- end
100
-
101
252
  if block_given?
253
+ unless args.empty?
254
+ raise ArgumentError, "must not specify both arguments and block"
255
+ end
102
256
  block_slicer = yield(Slicer.new(self))
103
257
  case block_slicer
104
- when nil
105
- # Ignore
106
258
  when ::Array
107
259
  slicers.concat(block_slicer)
108
260
  else
109
261
  slicers << block_slicer
110
262
  end
263
+ else
264
+ expected_n_args = nil
265
+ case args.size
266
+ when 1
267
+ if args[0].is_a?(Integer)
268
+ index = args[0]
269
+ index += n_rows if index < 0
270
+ return nil if index < 0
271
+ return nil if index >= n_rows
272
+ return Record.new(self, index)
273
+ else
274
+ slicers << args[0]
275
+ end
276
+ when 2
277
+ offset, length = args
278
+ slicers << (offset...(offset + length))
279
+ else
280
+ expected_n_args = "1..2"
281
+ end
282
+ if expected_n_args
283
+ message = "wrong number of arguments " +
284
+ "(given #{args.size}, expected #{expected_n_args})"
285
+ raise ArgumentError, message
286
+ end
111
287
  end
112
288
 
113
289
  ranges = []
@@ -116,12 +292,18 @@ module Arrow
116
292
  case slicer
117
293
  when Integer
118
294
  slicer += n_rows if slicer < 0
119
- ranges << [slicer, slicer]
295
+ ranges << [slicer, n_rows - 1]
120
296
  when Range
121
- from = slicer.first
297
+ original_from = from = slicer.first
122
298
  to = slicer.last
123
299
  to -= 1 if slicer.exclude_end?
124
300
  from += n_rows if from < 0
301
+ if from < 0 or from >= n_rows
302
+ message =
303
+ "offset is out of range (-#{n_rows + 1},#{n_rows}): " +
304
+ "#{original_from}"
305
+ raise ArgumentError, message
306
+ end
125
307
  to += n_rows if to < 0
126
308
  ranges << [from, to]
127
309
  when ::Array
@@ -330,47 +512,16 @@ module Arrow
330
512
  end
331
513
  end
332
514
 
333
- # TODO: Almost codes should be implemented in Apache Arrow C++.
334
515
  def slice_by_ranges(ranges)
335
- sliced_columns = columns.collect do |column|
336
- chunks = []
337
- arrays = column.data.each_chunk.to_a
338
- offset = 0
339
- offset_in_array = 0
340
- ranges.each do |from, to|
341
- range_size = to - from + 1
342
- while range_size > 0
343
- while offset + arrays.first.length - offset_in_array < from
344
- offset += arrays.first.length - offset_in_array
345
- arrays.shift
346
- offset_in_array = 0
347
- end
348
- if offset < from
349
- skipped_size = from - offset
350
- offset += skipped_size
351
- offset_in_array += skipped_size
352
- end
353
- array = arrays.first
354
- array_length = array.length
355
- rest_length = array_length - offset_in_array
356
- if rest_length <= range_size
357
- chunks << array.slice(offset_in_array, array_length)
358
- offset += rest_length
359
- range_size -= rest_length
360
- offset_in_array = 0
361
- arrays.shift
362
- else
363
- chunks << array.slice(offset_in_array, range_size)
364
- offset += range_size
365
- offset_in_array += range_size
366
- range_size = 0
367
- end
368
- end
369
- end
370
- Column.new(column.field, ChunkedArray.new(chunks))
516
+ sliced_table = []
517
+ ranges.each do |from, to|
518
+ sliced_table << slice_raw(from, to - from + 1)
519
+ end
520
+ if sliced_table.size > 1
521
+ sliced_table[0].concatenate(sliced_table[1..-1])
522
+ else
523
+ sliced_table[0]
371
524
  end
372
-
373
- self.class.new(schema, sliced_columns)
374
525
  end
375
526
 
376
527
  def ensure_column(name, data)