red-arrow 13.0.0 → 14.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dcbc004b8b61405c6e1143428ca37aac1be3de4d31d73f0a235f9c2e8d7bf5bb
4
- data.tar.gz: 508ba001225d07eae018b7552987c6102a8b59c98f4d717b4e88ad1a188d84b3
3
+ metadata.gz: d76026332781cece25d5056c7b5eff1d650bfb8b794678ee16fe5e733412d39d
4
+ data.tar.gz: de610617cb966bca7b13b4e8e5d012f1d4ccb9c2e61e1e5db89a7af6f5d46cd4
5
5
  SHA512:
6
- metadata.gz: 5cbeae509369615824b6c5d41d8f97d4b166ca99d9b1dc3fba4207e726ac22a0147027cb531761ba91245df7d71bb02ab370221ec72359470149ace65b2a01f7
7
- data.tar.gz: 2043d58148d1370789ad6b75ed605b7d3a78c1094f2250a09b0857a43d24f37a83a5706b3b96b41dfa7f0dafa22179dde63a4e991e58e85f19002759cfed52b8
6
+ metadata.gz: 13e0adf265f41336b40cea4e3d96be8a142d920c0a6a15bc0ce343d80408ca7c4220edbe8a4a91a2973fed4ea8229820cd762166e5a9ae4d0418ed3eb1c164c1
7
+ data.tar.gz: 9cf84a3bd5527a7d19dfaaf3aa3c922543ec6cfccdedb90bd721d68bec1fb4b9851df20e003df302a663bde921c48d15debf2c37dd0e82bde669382e6cf4343d
data/ext/arrow/arrow.cpp CHANGED
@@ -82,11 +82,17 @@ extern "C" void Init_arrow() {
82
82
  rb_define_method(cArrowRecordBatch, "raw_records",
83
83
  reinterpret_cast<rb::RawMethod>(red_arrow::record_batch_raw_records),
84
84
  0);
85
+ rb_define_method(cArrowRecordBatch, "each_raw_record",
86
+ reinterpret_cast<rb::RawMethod>(red_arrow::record_batch_each_raw_record),
87
+ 0);
85
88
 
86
89
  auto cArrowTable = rb_const_get_at(mArrow, rb_intern("Table"));
87
90
  rb_define_method(cArrowTable, "raw_records",
88
91
  reinterpret_cast<rb::RawMethod>(red_arrow::table_raw_records),
89
92
  0);
93
+ rb_define_method(cArrowTable, "each_raw_record",
94
+ reinterpret_cast<rb::RawMethod>(red_arrow::table_each_raw_record),
95
+ 0);
90
96
 
91
97
  red_arrow::cDate = rb_const_get(rb_cObject, rb_intern("Date"));
92
98
 
@@ -144,6 +144,128 @@ namespace red_arrow {
144
144
  // The number of columns.
145
145
  const int n_columns_;
146
146
  };
147
+
148
+ class RawRecordsProducer : private Converter, public arrow::ArrayVisitor {
149
+ public:
150
+ explicit RawRecordsProducer()
151
+ : Converter(),
152
+ record_(Qnil),
153
+ column_index_(0),
154
+ row_offset_(0) {
155
+ }
156
+
157
+ void produce(const arrow::RecordBatch& record_batch) {
158
+ rb::protect([&] {
159
+ const auto n_columns = record_batch.num_columns();
160
+ const auto n_rows = record_batch.num_rows();
161
+ for (int64_t i = 0; i < n_rows; ++i) {
162
+ record_ = rb_ary_new_capa(n_columns);
163
+ row_offset_ = i;
164
+ for (int i = 0; i < n_columns; ++i) {
165
+ const auto array = record_batch.column(i).get();
166
+ column_index_ = i;
167
+ check_status(array->Accept(this),
168
+ "[record-batch][each-raw-record]");
169
+ }
170
+ rb_yield(record_);
171
+ }
172
+ return Qnil;
173
+ });
174
+ }
175
+
176
+ void produce(const arrow::Table& table) {
177
+ rb::protect([&] {
178
+ const auto n_columns = table.num_columns();
179
+ const auto n_rows = table.num_rows();
180
+ std::vector<int> chunk_indexes(n_columns);
181
+ std::vector<int64_t> row_offsets(n_columns);
182
+ for (int64_t i_row = 0; i_row < n_rows; ++i_row) {
183
+ record_ = rb_ary_new_capa(n_columns);
184
+ for (int i_column = 0; i_column < n_columns; ++i_column) {
185
+ column_index_ = i_column;
186
+ const auto chunked_array = table.column(i_column).get();
187
+ auto& chunk_index = chunk_indexes[i_column];
188
+ auto& row_offset = row_offsets[i_column];
189
+ auto array = chunked_array->chunk(chunk_index).get();
190
+ while (array->length() == row_offset) {
191
+ ++chunk_index;
192
+ row_offset = 0;
193
+ array = chunked_array->chunk(chunk_index).get();
194
+ }
195
+ row_offset_ = row_offset;
196
+ check_status(array->Accept(this),
197
+ "[table][each-raw-record]");
198
+ ++row_offset;
199
+ }
200
+ rb_yield(record_);
201
+ }
202
+
203
+ return Qnil;
204
+ });
205
+ }
206
+
207
+ #define VISIT(TYPE) \
208
+ arrow::Status Visit(const arrow::TYPE ## Array& array) override { \
209
+ convert(array); \
210
+ return arrow::Status::OK(); \
211
+ }
212
+
213
+ VISIT(Null)
214
+ VISIT(Boolean)
215
+ VISIT(Int8)
216
+ VISIT(Int16)
217
+ VISIT(Int32)
218
+ VISIT(Int64)
219
+ VISIT(UInt8)
220
+ VISIT(UInt16)
221
+ VISIT(UInt32)
222
+ VISIT(UInt64)
223
+ VISIT(HalfFloat)
224
+ VISIT(Float)
225
+ VISIT(Double)
226
+ VISIT(Binary)
227
+ VISIT(String)
228
+ VISIT(FixedSizeBinary)
229
+ VISIT(Date32)
230
+ VISIT(Date64)
231
+ VISIT(Time32)
232
+ VISIT(Time64)
233
+ VISIT(Timestamp)
234
+ VISIT(MonthInterval)
235
+ VISIT(DayTimeInterval)
236
+ VISIT(MonthDayNanoInterval)
237
+ VISIT(List)
238
+ VISIT(Struct)
239
+ VISIT(Map)
240
+ VISIT(SparseUnion)
241
+ VISIT(DenseUnion)
242
+ VISIT(Dictionary)
243
+ VISIT(Decimal128)
244
+ VISIT(Decimal256)
245
+ // TODO
246
+ // VISIT(Extension)
247
+
248
+ #undef VISIT
249
+
250
+ private:
251
+ template <typename ArrayType>
252
+ void convert(const ArrayType& array) {
253
+ auto value = Qnil;
254
+ if (!array.IsNull(row_offset_)) {
255
+ value = convert_value(array, row_offset_);
256
+ }
257
+ rb_ary_store(record_, column_index_, value);
258
+ }
259
+
260
+ // Destination for converted record.
261
+ VALUE record_;
262
+
263
+ // The current column index.
264
+ int column_index_;
265
+
266
+ // The current row offset.
267
+ int64_t row_offset_;
268
+ };
147
269
  }
148
270
 
149
271
  VALUE
@@ -181,4 +303,36 @@ namespace red_arrow {
181
303
 
182
304
  return records;
183
305
  }
306
+
307
+ VALUE
308
+ record_batch_each_raw_record(VALUE rb_record_batch) {
309
+ auto garrow_record_batch = GARROW_RECORD_BATCH(RVAL2GOBJ(rb_record_batch));
310
+ auto record_batch = garrow_record_batch_get_raw(garrow_record_batch).get();
311
+ RETURN_SIZED_ENUMERATOR(rb_record_batch, 0, nullptr, record_batch->num_rows());
312
+
313
+ try {
314
+ RawRecordsProducer producer;
315
+ producer.produce(*record_batch);
316
+ } catch (rb::State& state) {
317
+ state.jump();
318
+ }
319
+
320
+ return Qnil;
321
+ }
322
+
323
+ VALUE
324
+ table_each_raw_record(VALUE rb_table) {
325
+ auto garrow_table = GARROW_TABLE(RVAL2GOBJ(rb_table));
326
+ auto table = garrow_table_get_raw(garrow_table).get();
327
+ RETURN_SIZED_ENUMERATOR(rb_table, 0, nullptr, table->num_rows());
328
+
329
+ try {
330
+ RawRecordsProducer producer;
331
+ producer.produce(*table);
332
+ } catch (rb::State& state) {
333
+ state.jump();
334
+ }
335
+
336
+ return Qnil;
337
+ }
184
338
  }
@@ -59,6 +59,8 @@ namespace red_arrow {
59
59
 
60
60
  VALUE record_batch_raw_records(VALUE obj);
61
61
  VALUE table_raw_records(VALUE obj);
62
+ VALUE record_batch_each_raw_record(VALUE obj);
63
+ VALUE table_each_raw_record(VALUE obj);
62
64
 
63
65
  inline VALUE time_unit_to_scale(const arrow::TimeUnit::type unit) {
64
66
  switch (unit) {
data/lib/arrow/table.rb CHANGED
@@ -127,7 +127,7 @@ module Arrow
127
127
  # You can also specify schema as primitive Ruby objects.
128
128
  # See {Arrow::Schema#initialize} for details.
129
129
  #
130
- # @param arrays [::Array<Arrow::RecordBatch>] The data of the table.
130
+ # @param record_batches [::Array<Arrow::RecordBatch>] The data of the table.
131
131
  #
132
132
  # @example Create a table from schema and record batches
133
133
  # count_field = Arrow::Field.new("count", :uint32)
@@ -145,7 +145,7 @@ module Arrow
145
145
  # You can also specify schema as primitive Ruby objects.
146
146
  # See {Arrow::Schema#initialize} for details.
147
147
  #
148
- # @param arrays [::Array<::Array>] The data of the table as primitive
148
+ # @param raw_records [::Array<::Array>] The data of the table as primitive
149
149
  # Ruby objects.
150
150
  #
151
151
  # @example Create a table from schema and raw records
data/lib/arrow/version.rb CHANGED
@@ -16,7 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  module Arrow
19
- VERSION = "13.0.0"
19
+ VERSION = "14.0.0"
20
20
 
21
21
  module Version
22
22
  numbers, TAG = VERSION.split("-")
data/red-arrow.gemspec CHANGED
@@ -47,6 +47,7 @@ Gem::Specification.new do |spec|
47
47
  spec.extensions = ["ext/arrow/extconf.rb"]
48
48
 
49
49
  spec.add_runtime_dependency("bigdecimal", ">= 3.1.0")
50
+ spec.add_runtime_dependency("csv")
50
51
  spec.add_runtime_dependency("extpp", ">= 0.1.1")
51
52
  spec.add_runtime_dependency("gio2", ">= 3.5.0")
52
53
  spec.add_runtime_dependency("native-package-installer")
@@ -0,0 +1,411 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module EachRawRecordBasicArraysTests
19
+ def test_null
20
+ records = [
21
+ [nil],
22
+ [nil],
23
+ [nil],
24
+ ]
25
+ target = build({column: :null}, records)
26
+ assert_equal(records, target.each_raw_record.to_a)
27
+ end
28
+
29
+ def test_boolean
30
+ records = [
31
+ [true],
32
+ [nil],
33
+ [false],
34
+ ]
35
+ target = build({column: :boolean}, records)
36
+ assert_equal(records, target.each_raw_record.to_a)
37
+ end
38
+
39
+ def test_int8
40
+ records = [
41
+ [-(2 ** 7)],
42
+ [nil],
43
+ [(2 ** 7) - 1],
44
+ ]
45
+ target = build({column: :int8}, records)
46
+ assert_equal(records, target.each_raw_record.to_a)
47
+ end
48
+
49
+ def test_uint8
50
+ records = [
51
+ [0],
52
+ [nil],
53
+ [(2 ** 8) - 1],
54
+ ]
55
+ target = build({column: :uint8}, records)
56
+ assert_equal(records, target.each_raw_record.to_a)
57
+ end
58
+
59
+ def test_int16
60
+ records = [
61
+ [-(2 ** 15)],
62
+ [nil],
63
+ [(2 ** 15) - 1],
64
+ ]
65
+ target = build({column: :int16}, records)
66
+ assert_equal(records, target.each_raw_record.to_a)
67
+ end
68
+
69
+ def test_uint16
70
+ records = [
71
+ [0],
72
+ [nil],
73
+ [(2 ** 16) - 1],
74
+ ]
75
+ target = build({column: :uint16}, records)
76
+ assert_equal(records, target.each_raw_record.to_a)
77
+ end
78
+
79
+ def test_int32
80
+ records = [
81
+ [-(2 ** 31)],
82
+ [nil],
83
+ [(2 ** 31) - 1],
84
+ ]
85
+ target = build({column: :int32}, records)
86
+ assert_equal(records, target.each_raw_record.to_a)
87
+ end
88
+
89
+ def test_uint32
90
+ records = [
91
+ [0],
92
+ [nil],
93
+ [(2 ** 32) - 1],
94
+ ]
95
+ target = build({column: :uint32}, records)
96
+ assert_equal(records, target.each_raw_record.to_a)
97
+ end
98
+
99
+ def test_int64
100
+ records = [
101
+ [-(2 ** 63)],
102
+ [nil],
103
+ [(2 ** 63) - 1],
104
+ ]
105
+ target = build({column: :int64}, records)
106
+ assert_equal(records, target.each_raw_record.to_a)
107
+ end
108
+
109
+ def test_uint64
110
+ records = [
111
+ [0],
112
+ [nil],
113
+ [(2 ** 64) - 1],
114
+ ]
115
+ target = build({column: :uint64}, records)
116
+ assert_equal(records, target.each_raw_record.to_a)
117
+ end
118
+
119
+ def test_half_float
120
+ records = [
121
+ [-1.5],
122
+ [nil],
123
+ [1.5],
124
+ ]
125
+ target = build({column: :half_float}, records)
126
+ assert_equal(records, target.each_raw_record.to_a)
127
+ end
128
+
129
+ def test_float
130
+ records = [
131
+ [-1.0],
132
+ [nil],
133
+ [1.0],
134
+ ]
135
+ target = build({column: :float}, records)
136
+ assert_equal(records, target.each_raw_record.to_a)
137
+ end
138
+
139
+ def test_double
140
+ records = [
141
+ [-1.0],
142
+ [nil],
143
+ [1.0],
144
+ ]
145
+ target = build({column: :double}, records)
146
+ assert_equal(records, target.each_raw_record.to_a)
147
+ end
148
+
149
+ def test_binary
150
+ records = [
151
+ ["\x00".b],
152
+ [nil],
153
+ ["\xff".b],
154
+ ]
155
+ target = build({column: :binary}, records)
156
+ assert_equal(records, target.each_raw_record.to_a)
157
+ end
158
+
159
+ def test_string
160
+ records = [
161
+ ["Ruby"],
162
+ [nil],
163
+ ["\u3042"], # U+3042 HIRAGANA LETTER A
164
+ ]
165
+ target = build({column: :string}, records)
166
+ assert_equal(records, target.each_raw_record.to_a)
167
+ end
168
+
169
+ def test_date32
170
+ records = [
171
+ [Date.new(1960, 1, 1)],
172
+ [nil],
173
+ [Date.new(2017, 8, 23)],
174
+ ]
175
+ target = build({column: :date32}, records)
176
+ assert_equal(records, target.each_raw_record.to_a)
177
+ end
178
+
179
+ def test_date64
180
+ records = [
181
+ [DateTime.new(1960, 1, 1, 2, 9, 30)],
182
+ [nil],
183
+ [DateTime.new(2017, 8, 23, 14, 57, 2)],
184
+ ]
185
+ target = build({column: :date64}, records)
186
+ assert_equal(records, target.each_raw_record.to_a)
187
+ end
188
+
189
+ def test_timestamp_second
190
+ records = [
191
+ [Time.parse("1960-01-01T02:09:30Z")],
192
+ [nil],
193
+ [Time.parse("2017-08-23T14:57:02Z")],
194
+ ]
195
+ target = build({
196
+ column: {
197
+ type: :timestamp,
198
+ unit: :second,
199
+ }
200
+ },
201
+ records)
202
+ assert_equal(records, target.each_raw_record.to_a)
203
+ end
204
+
205
+ def test_timestamp_milli
206
+ records = [
207
+ [Time.parse("1960-01-01T02:09:30.123Z")],
208
+ [nil],
209
+ [Time.parse("2017-08-23T14:57:02.987Z")],
210
+ ]
211
+ target = build({
212
+ column: {
213
+ type: :timestamp,
214
+ unit: :milli,
215
+ }
216
+ },
217
+ records)
218
+ assert_equal(records, target.each_raw_record.to_a)
219
+ end
220
+
221
+ def test_timestamp_micro
222
+ records = [
223
+ [Time.parse("1960-01-01T02:09:30.123456Z")],
224
+ [nil],
225
+ [Time.parse("2017-08-23T14:57:02.987654Z")],
226
+ ]
227
+ target = build({
228
+ column: {
229
+ type: :timestamp,
230
+ unit: :micro,
231
+ }
232
+ },
233
+ records)
234
+ assert_equal(records, target.each_raw_record.to_a)
235
+ end
236
+
237
+ def test_timestamp_nano
238
+ records = [
239
+ [Time.parse("1960-01-01T02:09:30.123456789Z")],
240
+ [nil],
241
+ [Time.parse("2017-08-23T14:57:02.987654321Z")],
242
+ ]
243
+ target = build({
244
+ column: {
245
+ type: :timestamp,
246
+ unit: :nano,
247
+ }
248
+ },
249
+ records)
250
+ assert_equal(records, target.each_raw_record.to_a)
251
+ end
252
+
253
+ def test_time32_second
254
+ unit = Arrow::TimeUnit::SECOND
255
+ records = [
256
+ [Arrow::Time.new(unit, 60 * 10)], # 00:10:00
257
+ [nil],
258
+ [Arrow::Time.new(unit, 60 * 60 * 2 + 9)], # 02:00:09
259
+ ]
260
+ target = build({
261
+ column: {
262
+ type: :time32,
263
+ unit: :second,
264
+ }
265
+ },
266
+ records)
267
+ assert_equal(records, target.each_raw_record.to_a)
268
+ end
269
+
270
+ def test_time32_milli
271
+ unit = Arrow::TimeUnit::MILLI
272
+ records = [
273
+ [Arrow::Time.new(unit, (60 * 10) * 1000 + 123)], # 00:10:00.123
274
+ [nil],
275
+ [Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1000 + 987)], # 02:00:09.987
276
+ ]
277
+ target = build({
278
+ column: {
279
+ type: :time32,
280
+ unit: :milli,
281
+ }
282
+ },
283
+ records)
284
+ assert_equal(records, target.each_raw_record.to_a)
285
+ end
286
+
287
+ def test_time64_micro
288
+ unit = Arrow::TimeUnit::MICRO
289
+ records = [
290
+ # 00:10:00.123456
291
+ [Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456)],
292
+ [nil],
293
+ # 02:00:09.987654
294
+ [Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1_000_000 + 987_654)],
295
+ ]
296
+ target = build({
297
+ column: {
298
+ type: :time64,
299
+ unit: :micro,
300
+ }
301
+ },
302
+ records)
303
+ assert_equal(records, target.each_raw_record.to_a)
304
+ end
305
+
306
+ def test_time64_nano
307
+ unit = Arrow::TimeUnit::NANO
308
+ records = [
309
+ # 00:10:00.123456789
310
+ [Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789)],
311
+ [nil],
312
+ # 02:00:09.987654321
313
+ [Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1_000_000_000 + 987_654_321)],
314
+ ]
315
+ target = build({
316
+ column: {
317
+ type: :time64,
318
+ unit: :nano,
319
+ }
320
+ },
321
+ records)
322
+ assert_equal(records, target.each_raw_record.to_a)
323
+ end
324
+
325
+ def test_decimal128
326
+ records = [
327
+ [BigDecimal("92.92")],
328
+ [nil],
329
+ [BigDecimal("29.29")],
330
+ ]
331
+ target = build({
332
+ column: {
333
+ type: :decimal128,
334
+ precision: 8,
335
+ scale: 2,
336
+ }
337
+ },
338
+ records)
339
+ assert_equal(records, target.each_raw_record.to_a)
340
+ end
341
+
342
+ def test_decimal256
343
+ records = [
344
+ [BigDecimal("92.92")],
345
+ [nil],
346
+ [BigDecimal("29.29")],
347
+ ]
348
+ target = build({
349
+ column: {
350
+ type: :decimal256,
351
+ precision: 38,
352
+ scale: 2,
353
+ }
354
+ },
355
+ records)
356
+ assert_equal(records, target.each_raw_record.to_a)
357
+ end
358
+
359
+ def test_month_interval
360
+ records = [
361
+ [1],
362
+ [nil],
363
+ [12],
364
+ ]
365
+ target = build({column: :month_interval}, records)
366
+ assert_equal(records, target.each_raw_record.to_a)
367
+ end
368
+
369
+ def test_day_time_interval
370
+ records = [
371
+ [{day: 1, millisecond: 100}],
372
+ [nil],
373
+ [{day: 2, millisecond: 300}],
374
+ ]
375
+ target = build({column: :day_time_interval}, records)
376
+ assert_equal(records, target.each_raw_record.to_a)
377
+ end
378
+
379
+ def test_month_day_nano_interval
380
+ records = [
381
+ [{month: 1, day: 1, nanosecond: 100}],
382
+ [nil],
383
+ [{month: 2, day: 3, nanosecond: 400}],
384
+ ]
385
+ target = build({column: :month_day_nano_interval}, records)
386
+ assert_equal(records, target.each_raw_record.to_a)
387
+ end
388
+ end
389
+
390
+ class EachRawRecordRecordBatchBasicArraysTest< Test::Unit::TestCase
391
+ include EachRawRecordBasicArraysTests
392
+
393
+ def build(schema, records)
394
+ Arrow::RecordBatch.new(schema, records)
395
+ end
396
+ end
397
+
398
+ class EachRawRecordTableBasicArraysTest < Test::Unit::TestCase
399
+ include EachRawRecordBasicArraysTests
400
+
401
+ def build(schema, records)
402
+ record_batch = Arrow::RecordBatch.new(schema, records)
403
+ # Multiple chunks
404
+ record_batches = [
405
+ record_batch.slice(0, 2),
406
+ record_batch.slice(2, 0), # Empty chunk
407
+ record_batch.slice(2, record_batch.length - 2),
408
+ ]
409
+ Arrow::Table.new(schema, record_batches)
410
+ end
411
+ end