RubyGems - red-arrow - Versions diffs - 0.13.0 → 0.14.0 - Mend

red-arrow 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of red-arrow might be problematic. Click here for more details.

Files changed (23)

checksums.yaml +4 -4
data/README.md +1 -3
data/ext/arrow/arrow.cpp +4 -0
data/ext/arrow/{record-batch.cpp → raw-records.cpp} +49 -5
data/ext/arrow/red-arrow.hpp +1 -0
data/lib/arrow/dictionary-data-type.rb +23 -12
data/lib/arrow/table.rb +228 -77
data/lib/arrow/version.rb +1 -1
data/test/raw-records/test-basic-arrays.rb +340 -0
data/test/raw-records/test-dense-union-array.rb +492 -0
data/test/raw-records/test-list-array.rb +520 -0
data/test/raw-records/{record-batch/test-multiple-columns.rb → test-multiple-columns.rb} +34 -18
data/test/raw-records/test-sparse-union-array.rb +480 -0
data/test/raw-records/test-struct-array.rb +448 -0
data/test/raw-records/test-table.rb +47 -0
data/test/test-dictionary-data-type.rb +3 -3
data/test/test-table.rb +40 -14
metadata +59 -57
data/test/raw-records/record-batch/test-basic-arrays.rb +0 -349
data/test/raw-records/record-batch/test-dense-union-array.rb +0 -486
data/test/raw-records/record-batch/test-list-array.rb +0 -498
data/test/raw-records/record-batch/test-sparse-union-array.rb +0 -474
data/test/raw-records/record-batch/test-struct-array.rb +0 -426

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: a362c35e8eb6b5b93cb69e1ee077acf572af51d89b54e0e9a41282cbdb546cbb
-  data.tar.gz: db53bb7327021bbb8e45e1b8dbf741becf4ad98d85366fe6c3a475da3650ef4e
+  metadata.gz: 97877568cac79133ccac62dcbb51f2e8466c2fb825aaaf58f65976742257fbdc
+  data.tar.gz: 5736802a2d3f5539a2a75421d215aa6bd2a4827ce6da6533cf440c88513db006
 SHA512:
-  metadata.gz: 153a2ab1e7ccefe6fbe3d3481d8d86cf90f6d583825543b4ae541d3e3704b1aab4debcc2d5aa7148ded9eefbed828354697514e22664ba63aa83e3ad74e41180
-  data.tar.gz: a8ab623a40a47a073865a13c0eaa1e7c380574d57307068cb5258884cbf04113030b137b00a83347c0c5a3ccc6ca597db25c89297ccf38b20e03f71e9345b1aa
+  metadata.gz: 3bf77b7050ee8ad28f6f2bf99186684b33385cbca51bdc39650441875ab7e0701803b05beb34e75cf9a6ab7a6c4b31b6fb053b4b83d14bf6d205018b84e3d246
+  data.tar.gz: 9be3f95a220af905a2c611a6a4742db395441d17d2daf20ea975cec2826bf5dc45319f70fce2536672649a4d3d1a3c499fc3b1f45207764b8500dbecc671613a

data/README.md CHANGED

@@ -33,9 +33,7 @@ gobject-introspection gem is a Ruby bindings of GObject Introspection. Red Arrow
 ## Install
-Install Apache Arrow GLib before install Red Arrow. Use [packages.red-data-tools.org](https://github.com/red-data-tools/packages.red-data-tools.org) for installing Apache Arrow GLib.
-Note that the Apache Arrow GLib packages are "unofficial". "Official" packages will be released in the future.
+Install Apache Arrow GLib before install Red Arrow. See [Apache Arrow install document](https://arrow.apache.org/install/) for details.
 Install Red Arrow after you install Apache Arrow GLib:

data/ext/arrow/arrow.cpp CHANGED

@@ -34,6 +34,10 @@ extern "C" void Init_arrow() {
   rb_define_method(cArrowRecordBatch, "raw_records",
                    reinterpret_cast<rb::RawMethod>(red_arrow::record_batch_raw_records),
                    0);
+  auto cArrowTable = rb_const_get_at(mArrow, rb_intern("Table"));
+  rb_define_method(cArrowTable, "raw_records",
+                   reinterpret_cast<rb::RawMethod>(red_arrow::table_raw_records),
+                   0);
   red_arrow::cDate = rb_const_get(rb_cObject, rb_intern("Date"));

data/ext/arrow/{record-batch.cpp → raw-records.cpp} RENAMED

@@ -642,11 +642,34 @@ namespace red_arrow {
             auto record = rb_ary_new_capa(n_columns_);
             rb_ary_push(records_, record);
           }
+          row_offset_ = 0;
           for (int i = 0; i < n_columns_; ++i) {
             const auto array = record_batch.column(i).get();
             column_index_ = i;
             check_status(array->Accept(this),
-                         "[raw-records]");
+                         "[record-batch][raw-records]");
+          }
+          return Qnil;
+        });
+      }
+      void build(const arrow::Table& table) {
+        rb::protect([&] {
+          const auto n_rows = table.num_rows();
+          for (int64_t i = 0; i < n_rows; ++i) {
+            auto record = rb_ary_new_capa(n_columns_);
+            rb_ary_push(records_, record);
+          }
+          for (int i = 0; i < n_columns_; ++i) {
+            const auto column = table.column(i).get();
+            const auto chunked_array = column->data();
+            column_index_ = i;
+            row_offset_ = 0;
+            for (const auto array : chunked_array->chunks()) {
+              check_status(array->Accept(this),
+                           "[table][raw-records]");
+              row_offset_ += array->length();
+            }
           }
           return Qnil;
         });
@@ -703,17 +726,17 @@ namespace red_arrow {
       void convert(const ArrayType& array) {
         const auto n = array.length();
         if (array.null_count() > 0) {
-          for (int64_t i = 0; i < n; ++i) {
+          for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) {
             auto value = Qnil;
             if (!array.IsNull(i)) {
               value = convert_value(array, i);
             }
-            auto record = rb_ary_entry(records_, i);
+            auto record = rb_ary_entry(records_, ii);
             rb_ary_store(record, column_index_, value);
           }
         } else {
-          for (int64_t i = 0; i < n; ++i) {
-            auto record = rb_ary_entry(records_, i);
+          for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) {
+            auto record = rb_ary_entry(records_, ii);
             rb_ary_store(record, column_index_, convert_value(array, i));
           }
         }
@@ -731,6 +754,9 @@ namespace red_arrow {
       // The current column index.
       int column_index_;
+      // The current row offset.
+      int64_t row_offset_;
       // The number of columns.
       const int n_columns_;
     };
@@ -753,4 +779,22 @@ namespace red_arrow {
     return records;
   }
+  VALUE
+  table_raw_records(VALUE rb_table) {
+    auto garrow_table = GARROW_TABLE(RVAL2GOBJ(rb_table));
+    auto table = garrow_table_get_raw(garrow_table).get();
+    const auto n_rows = table->num_rows();
+    const auto n_columns = table->num_columns();
+    auto records = rb_ary_new_capa(n_rows);
+    try {
+      RawRecordsBuilder builder(records, n_columns);
+      builder.build(*table);
+    } catch (rb::State& state) {
+      state.jump();
+    }
+    return records;
+  }
 }

data/ext/arrow/red-arrow.hpp CHANGED

@@ -40,6 +40,7 @@ namespace red_arrow {
   extern ID id_to_datetime;
   VALUE record_batch_raw_records(VALUE obj);
+  VALUE table_raw_records(VALUE obj);
   inline VALUE time_unit_to_scale(arrow::TimeUnit::type unit) {
     switch (unit) {

data/lib/arrow/dictionary-data-type.rb CHANGED

@@ -22,7 +22,7 @@ module Arrow
     # Creates a new {Arrow::DictionaryDataType}.
     #
-    # @overload initialize(index_data_type, dictionary, ordered)
+    # @overload initialize(index_data_type, value_data_type, ordered)
     #
     #   @param index_data_type [Arrow::DataType, Hash, String, Symbol]
     #     The index data type of the dictionary data type. It must be
@@ -39,18 +39,23 @@ module Arrow
     #     See {Arrow::DataType.resolve} how to specify data type
     #     description.
     #
-    #   @param dictionary [Arrow::Array] The real values of the
-    #     dictionary data type.
+    #   @param value_data_type [Arrow::DataType, Hash, String, Symbol]
+    #     The value data type of the dictionary data type.
+    #
+    #     You can specify data type as a description by `Hash`.
+    #
+    #     See {Arrow::DataType.resolve} how to specify data type
+    #     description.
     #
     #   @param ordered [Boolean] Whether dictionary contents are
     #     ordered or not.
     #
     #   @example Create a dictionary data type for {0: "Hello", 1: "World"}
     #     index_data_type = :int8
-    #     dictionary = Arrow::StringArray.new(["Hello", "World"])
+    #     value_data_type = :string
     #     ordered = true
     #     Arrow::DictionaryDataType.new(index_data_type,
-    #                                   dictionary,
+    #                                   value_data_type,
     #                                   ordered)
     #
     # @overload initialize(description)
@@ -74,16 +79,21 @@ module Arrow
     #     See {Arrow::DataType.resolve} how to specify data type
     #     description.
     #
-    #   @option description [Arrow::Array] :dictionary The real values
-    #     of the dictionary data type.
+    #   @option description [Arrow::DataType, Hash, String, Symbol]
+    #     :value_data_type
+    #     The value data type of the dictionary data type.
+    #
+    #     You can specify data type as a description by `Hash`.
+    #
+    #     See {Arrow::DataType.resolve} how to specify data type
+    #     description.
     #
     #   @option description [Boolean] :ordered Whether dictionary
     #     contents are ordered or not.
     #
     #   @example Create a dictionary data type for {0: "Hello", 1: "World"}
-    #     dictionary = Arrow::StringArray.new(["Hello", "World"])
     #     Arrow::DictionaryDataType.new(index_data_type: :int8,
-    #                                   dictionary: dictionary,
+    #                                   value_data_type: :string,
     #                                   ordered: true)
     def initialize(*args)
       n_args = args.size
@@ -91,16 +101,17 @@ module Arrow
       when 1
         description = args[0]
         index_data_type = description[:index_data_type]
-        dictionary = description[:dictionary]
+        value_data_type = description[:value_data_type]
         ordered = description[:ordered]
       when 3
-        index_data_type, dictionary, ordered = args
+        index_data_type, value_data_type, ordered = args
       else
         message = "wrong number of arguments (given, #{n_args}, expected 1 or 3)"
         raise ArgumentError, message
       end
       index_data_type = DataType.resolve(index_data_type)
-      initialize_raw(index_data_type, dictionary, ordered)
+      value_data_type = DataType.resolve(value_data_type)
+      initialize_raw(index_data_type, value_data_type, ordered)
     end
   end
 end

data/lib/arrow/table.rb CHANGED

@@ -30,27 +30,154 @@ module Arrow
     alias_method :initialize_raw, :initialize
     private :initialize_raw
-    def initialize(schema_or_raw_table_or_columns, columns=nil)
-      if columns.nil?
-        if schema_or_raw_table_or_columns[0].is_a?(Column)
-          columns = schema_or_raw_table_or_columns
-          fields = columns.collect(&:field)
+    # Creates a new {Arrow::Table}.
+    #
+    # @overload initialize(columns)
+    #
+    #   @param columns [::Array<Arrow::Column>] The columns of the table.
+    #
+    #   @example Create a table from columns
+    #     count_field = Arrow::Field.new("count", :uint32)
+    #     count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
+    #     count_column = Arrow::Column.new(count_field, count_array)
+    #     visible_field = Arrow::Field.new("visible", :boolean)
+    #     visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
+    #     visible_column = Arrow::Column.new(visible_field, visible_array)
+    #     Arrow::Table.new([count_column, visible_column])
+    #
+    # @overload initialize(raw_table)
+    #
+    #   @param raw_table [Hash<String, Arrow::Array>]
+    #     The pairs of column name and values of the table. Column values is
+    #     `Arrow::Array`.
+    #
+    #   @example Create a table from column name and values
+    #     Arrow::Table.new("count" => Arrow::UInt32Array.new([0, 2, nil, 4]),
+    #                      "visible" => Arrow::BooleanArray.new([true, nil, nil, false]))
+    #
+    # @overload initialize(raw_table)
+    #
+    #   @param raw_table [Hash<String, Arrow::ChunkedArray>]
+    #     The pairs of column name and values of the table. Column values is
+    #     `Arrow::ChunkedArray`.
+    #
+    #   @example Create a table from column name and values
+    #     count_chunks = [
+    #       Arrow::UInt32Array.new([0, 2]),
+    #       Arrow::UInt32Array.new([nil, 4]),
+    #     ]
+    #     visible_chunks = [
+    #       Arrow::BooleanArray.new([true]),
+    #       Arrow::BooleanArray.new([nil, nil, false]),
+    #     ]
+    #     Arrow::Table.new("count" => Arrow::ChunkedArray.new(count_chunks),
+    #                      "visible" => Arrow::ChunkedArray.new(visible_chunks))
+    #
+    # @overload initialize(schema, columns)
+    #
+    #   @param schema [Arrow::Schema] The schema of the table.
+    #     You can also specify schema as primitive Ruby objects.
+    #     See {Arrow::Schema#initialize} for details.
+    #
+    #   @param columns [::Array<Arrow::Column>] The data of the table.
+    #
+    #   @example Create a table from schema and columns
+    #     count_field = Arrow::Field.new("count", :uint32)
+    #     count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
+    #     count_column = Arrow::Column.new(count_field, count_array)
+    #     visible_field = Arrow::Field.new("visible", :boolean)
+    #     visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
+    #     visible_column = Arrow::Column.new(visible_field, visible_array)
+    #     Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
+    #                      [count_column, visible_column])
+    #
+    # @overload initialize(schema, arrays)
+    #
+    #   @param schema [Arrow::Schema] The schema of the table.
+    #     You can also specify schema as primitive Ruby objects.
+    #     See {Arrow::Schema#initialize} for details.
+    #
+    #   @param arrays [::Array<Arrow::Array>] The data of the table.
+    #
+    #   @example Create a table from schema and arrays
+    #     count_field = Arrow::Field.new("count", :uint32)
+    #     count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
+    #     visible_field = Arrow::Field.new("visible", :boolean)
+    #     visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
+    #     Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
+    #                      [count_array, visible_array])
+    #
+    # @overload initialize(schema, record_batches)
+    #
+    #   @param schema [Arrow::Schema] The schema of the table.
+    #     You can also specify schema as primitive Ruby objects.
+    #     See {Arrow::Schema#initialize} for details.
+    #
+    #   @param arrays [::Array<Arrow::RecordBatch>] The data of the table.
+    #
+    #   @example Create a table from schema and record batches
+    #     count_field = Arrow::Field.new("count", :uint32)
+    #     visible_field = Arrow::Field.new("visible", :boolean)
+    #     schema = Arrow::Schema.new([count_field, visible_field])
+    #     record_batches = [
+    #       Arrow::RecordBatch.new(schema, [[0, true], [2, nil], [nil, nil]]),
+    #       Arrow::RecordBatch.new(schema, [[4, false]]),
+    #     ]
+    #     Arrow::Table.new(schema, record_batches)
+    #
+    # @overload initialize(schema, raw_records)
+    #
+    #   @param schema [Arrow::Schema] The schema of the table.
+    #     You can also specify schema as primitive Ruby objects.
+    #     See {Arrow::Schema#initialize} for details.
+    #
+    #   @param arrays [::Array<::Array>] The data of the table as primitive
+    #     Ruby objects.
+    #
+    #   @example Create a table from schema and raw records
+    #     schema = {
+    #       count: :uint32,
+    #       visible: :boolean,
+    #     }
+    #     raw_records = [
+    #       [0, true],
+    #       [2, nil],
+    #       [nil, nil],
+    #       [4, false],
+    #     ]
+    #     Arrow::Table.new(schema, raw_records)
+    def initialize(*args)
+      n_args = args.size
+      case n_args
+      when 1
+        if args[0][0].is_a?(Column)
+          values = args[0]
+          fields = values.collect(&:field)
           schema = Schema.new(fields)
         else
-          raw_table = schema_or_raw_table_or_columns
+          raw_table = args[0]
           fields = []
-          columns = []
+          values = []
           raw_table.each do |name, array|
             field = Field.new(name.to_s, array.value_data_type)
             fields << field
-            columns << Column.new(field, array)
+            values << Column.new(field, array)
           end
           schema = Schema.new(fields)
         end
+      when 2
+        schema = args[0]
+        schema = Schema.new(schema) unless schema.is_a?(Schema)
+        values = args[1]
+        if values[0].is_a?(::Array)
+          values = [RecordBatch.new(schema, values)]
+        end
       else
-        schema = schema_or_raw_table_or_columns
+        message = "wrong number of arguments (given, #{n_args}, expected 1..2)"
+        raise ArgumentError, message
       end
-      initialize_raw(schema, columns)
+      initialize_raw(schema, values)
     end
     def columns
@@ -71,43 +198,92 @@ module Arrow
     alias_method :[], :find_column
-    # TODO
+    alias_method :slice_raw, :slice
+    # @overload slice(offset, length)
     #
-    # @return [Arrow::Table]
+    #   @param offset [Integer] The offset of sub Arrow::Table.
+    #   @param length [Integer] The length of sub Arrow::Table.
+    #   @return [Arrow::Table]
+    #     The sub `Arrow::Table` that covers only from
+    #     `offset` to `offset + length` range.
+    #
+    # @overload slice(index)
+    #
+    #   @param index [Integer] The index in this table.
+    #   @return [Arrow::Record]
+    #     The `Arrow::Record` corresponding to index of
+    #     the table.
+    #
+    # @overload slice(booleans)
+    #
+    #   @param booleans [::Array<Boolean>]
+    #     The values indicating the target rows.
+    #   @return [Arrow::Table]
+    #     The sub `Arrow::Table` that covers only rows of indices
+    #     the values of `booleans` is true.
+    #
+    # @overload slice(boolean_array)
+    #
+    #   @param boolean_array [::Array<Arrow::BooleanArray>]
+    #     The values indicating the target rows.
+    #   @return [Arrow::Table]
+    #     The sub `Arrow::Table` that covers only rows of indices
+    #     the values of `boolean_array` is true.
+    #
+    # @overload slice(range)
+    #
+    #   @param range_included_end [Range] The range indicating the target rows.
+    #   @return [Arrow::Table]
+    #     The sub `Arrow::Table` that covers only rows of the range of indices.
+    #
+    # @overload slice
+    #
+    #   @yield [slicer] Gives slicer that constructs condition to select records.
+    #   @yieldparam slicer [Arrow::Slicer] The slicer that helps us to
+    #     build condition.
+    #   @yieldreturn [Arrow::Slicer::Condition, ::Array<Arrow::Slicer::Condition>]
+    #     The condition to select records.
+    #   @return [Arrow::Table]
+    #     The sub `Arrow::Table` that covers only rows matched by condition
+    #     specified by slicer.
     def slice(*args)
       slicers = []
-      expected_n_args = nil
-      case args.size
-      when 0
-        expected_n_args = "1..2" unless block_given?
-      when 1
-        slicers << args[0]
-      when 2
-        from, to = args
-        slicers << (from...(from + to))
-      else
-        if block_given?
-          expected_n_args = "0..2"
-        else
-          expected_n_args = "1..2"
-        end
-      end
-      if expected_n_args
-        message = "wrong number of arguments " +
-          "(given #{args.size}, expected #{expected_n_args})"
-        raise ArgumentError, message
-      end
       if block_given?
+        unless args.empty?
+          raise ArgumentError, "must not specify both arguments and block"
+        end
         block_slicer = yield(Slicer.new(self))
         case block_slicer
-        when nil
-          # Ignore
         when ::Array
           slicers.concat(block_slicer)
         else
           slicers << block_slicer
         end
+      else
+        expected_n_args = nil
+        case args.size
+        when 1
+          if args[0].is_a?(Integer)
+            index = args[0]
+            index += n_rows if index < 0
+            return nil if index < 0
+            return nil if index >= n_rows
+            return Record.new(self, index)
+          else
+            slicers << args[0]
+          end
+        when 2
+          offset, length = args
+          slicers << (offset...(offset + length))
+        else
+          expected_n_args = "1..2"
+        end
+        if expected_n_args
+          message = "wrong number of arguments " +
+            "(given #{args.size}, expected #{expected_n_args})"
+          raise ArgumentError, message
+        end
       end
       ranges = []
@@ -116,12 +292,18 @@ module Arrow
         case slicer
         when Integer
           slicer += n_rows if slicer < 0
-          ranges << [slicer, slicer]
+          ranges << [slicer, n_rows - 1]
         when Range
-          from = slicer.first
+          original_from = from = slicer.first
           to = slicer.last
           to -= 1 if slicer.exclude_end?
           from += n_rows if from < 0
+          if from < 0 or from >= n_rows
+            message =
+              "offset is out of range (-#{n_rows + 1},#{n_rows}): " +
+              "#{original_from}"
+            raise ArgumentError, message
+          end
           to += n_rows if to < 0
           ranges << [from, to]
         when ::Array
@@ -330,47 +512,16 @@ module Arrow
       end
     end
-    # TODO: Almost codes should be implemented in Apache Arrow C++.
     def slice_by_ranges(ranges)
-      sliced_columns = columns.collect do |column|
-        chunks = []
-        arrays = column.data.each_chunk.to_a
-        offset = 0
-        offset_in_array = 0
-        ranges.each do |from, to|
-          range_size = to - from + 1
-          while range_size > 0
-            while offset + arrays.first.length - offset_in_array < from
-              offset += arrays.first.length - offset_in_array
-              arrays.shift
-              offset_in_array = 0
-            end
-            if offset < from
-              skipped_size = from - offset
-              offset += skipped_size
-              offset_in_array += skipped_size
-            end
-            array = arrays.first
-            array_length = array.length
-            rest_length = array_length - offset_in_array
-            if rest_length <= range_size
-              chunks << array.slice(offset_in_array, array_length)
-              offset += rest_length
-              range_size -= rest_length
-              offset_in_array = 0
-              arrays.shift
-            else
-              chunks << array.slice(offset_in_array, range_size)
-              offset += range_size
-              offset_in_array += range_size
-              range_size = 0
-            end
-          end
-        end
-        Column.new(column.field, ChunkedArray.new(chunks))
+      sliced_table = []
+      ranges.each do |from, to|
+        sliced_table << slice_raw(from, to - from + 1)
+      end
+      if sliced_table.size > 1
+        sliced_table[0].concatenate(sliced_table[1..-1])
+      else
+        sliced_table[0]
       end
-      self.class.new(schema, sliced_columns)
     end
     def ensure_column(name, data)