RubyGems - red_amber - Versions diffs - 0.1.3 → 0.1.6 - Mend

red_amber 0.1.3 → 0.1.6

Files changed (43) hide show

checksums.yaml +4 -4
data/.rubocop.yml +31 -7
data/CHANGELOG.md +214 -10
data/Gemfile +4 -0
data/README.md +117 -342
data/benchmark/csv_load_penguins.yml +15 -0
data/benchmark/drop_nil.yml +11 -0
data/doc/DataFrame.md +854 -0
data/doc/Vector.md +449 -0
data/doc/image/arrow_table_new.png +0 -0
data/doc/image/dataframe/assign.png +0 -0
data/doc/image/dataframe/drop.png +0 -0
data/doc/image/dataframe/pick.png +0 -0
data/doc/image/dataframe/remove.png +0 -0
data/doc/image/dataframe/rename.png +0 -0
data/doc/image/dataframe/slice.png +0 -0
data/doc/image/dataframe_model.png +0 -0
data/doc/image/example_in_red_arrow.png +0 -0
data/doc/image/tdr.png +0 -0
data/doc/image/tdr_and_table.png +0 -0
data/doc/image/tidy_data_in_TDR.png +0 -0
data/doc/image/vector/binary_element_wise.png +0 -0
data/doc/image/vector/unary_aggregation.png +0 -0
data/doc/image/vector/unary_aggregation_w_option.png +0 -0
data/doc/image/vector/unary_element_wise.png +0 -0
data/doc/tdr.md +56 -0
data/doc/tdr_ja.md +56 -0
data/lib/red-amber.rb +27 -0
data/lib/red_amber/data_frame.rb +91 -37
data/lib/red_amber/{data_frame_output.rb → data_frame_displayable.rb} +49 -41
data/lib/red_amber/data_frame_indexable.rb +38 -0
data/lib/red_amber/data_frame_observation_operation.rb +11 -0
data/lib/red_amber/data_frame_selectable.rb +155 -48
data/lib/red_amber/data_frame_variable_operation.rb +137 -0
data/lib/red_amber/helper.rb +61 -0
data/lib/red_amber/vector.rb +69 -16
data/lib/red_amber/vector_functions.rb +80 -45
data/lib/red_amber/vector_selectable.rb +124 -0
data/lib/red_amber/vector_updatable.rb +104 -0
data/lib/red_amber/version.rb +1 -1
data/lib/red_amber.rb +1 -16
data/red_amber.gemspec +3 -6
metadata +38 -9

data/lib/red_amber/data_frame_variable_operation.rb ADDED Viewed

@@ -0,0 +1,137 @@
+# frozen_string_literal: true
+module RedAmber
+  # mix-ins for the class DataFrame
+  module DataFrameVariableOperation
+    # pick up some variables to create sub DataFrame
+    def pick(*args, &block)
+      picker = args
+      if block
+        raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
+        picker = instance_eval(&block)
+      end
+      picker = [picker].flatten
+      return DataFrame.new if picker.empty? || picker == [nil]
+      picker = keys_by_booleans(picker) if booleans?(picker)
+      # DataFrame#[] creates a Vector with single key is specified.
+      # DataFrame#pick creates a DataFrame with single key.
+      return DataFrame.new(@table[picker]) if sym_or_str?(picker)
+      raise DataFrameArgumentError, "Invalid argument #{args}"
+    end
+    # drop some variables to create remainer sub DataFrame
+    def drop(*args, &block)
+      dropper = args
+      if block
+        raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
+        dropper = instance_eval(&block)
+      end
+      dropper = [dropper].flatten
+      dropper = keys_by_booleans(dropper) if booleans?(dropper)
+      picker = keys - dropper
+      return DataFrame.new if picker.empty?
+      # DataFrame#[] creates a Vector with single key is specified.
+      # DataFrame#drop creates a DataFrame with single key.
+      return DataFrame.new(@table[picker]) if sym_or_str?(picker)
+      raise DataFrameArgumentError, "Invalid argument #{args}"
+    end
+    # rename variables to create new DataFrame
+    def rename(*args, &block)
+      renamer = args
+      if block
+        raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless args.empty?
+        renamer = instance_eval(&block)
+      end
+      renamer = [renamer].flatten
+      return self if renamer.empty?
+      return rename_by_hash([renamer].to_h) if renamer.size == 2 && sym_or_str?(renamer) # rename(from, to)
+      return rename_by_hash(renamer[0]) if renamer.one? && renamer[0].is_a?(Hash) # rename({from => to})
+      raise DataFrameArgumentError, "Invalid argument #{args}"
+    end
+    # assign variables to create new DataFrame
+    def assign(*args, &block)
+      assigner = args
+      if block
+        raise DataFrameArgumentError, 'Must not specify both arguments and a block' unless args.empty?
+        assigner = instance_eval(&block)
+      end
+      assigner = [assigner].flatten
+      return self if assigner.empty? || assigner == [nil]
+      raise DataFrameArgumentError, "Invalid argument #{args}" unless assigner.one? && assigner[0].is_a?(Hash)
+      updater = {}
+      appender = {}
+      assigner[0].each do |key, value|
+        if keys.include? key
+          updater[key] = value
+        else
+          appender[key] = value
+        end
+      end
+      fields, arrays = update_fields_and_arrays(updater)
+      append_to_fields_and_arrays(appender, fields, arrays) unless appender.empty?
+      DataFrame.new(Arrow::Table.new(Arrow::Schema.new(fields), arrays))
+    end
+    private
+    def rename_by_hash(key_pairs)
+      fields = keys.map do |key|
+        new_key = key_pairs[key]
+        if new_key
+          Arrow::Field.new(new_key.to_sym, @table[key].data_type)
+        else
+          @table.schema[key]
+        end
+      end
+      schema = Arrow::Schema.new(fields)
+      DataFrame.new(Arrow::Table.new(schema, @table.columns))
+    end
+    def update_fields_and_arrays(updater)
+      fields = @table.columns.map(&:field)
+      arrays = @table.columns.map(&:data) # chunked_arrays
+      keys.each_with_index do |key, i|
+        data = updater[key]
+        next unless data
+        raise DataFrameArgumentError, "Data size mismatch (#{data.size} != #{size})" if data.size != size
+        a = Arrow::Array.new(data.is_a?(Vector) ? data.to_a : data)
+        fields[i] = Arrow::Field.new(key, a.value_data_type)
+        arrays[i] = Arrow::ChunkedArray.new([a])
+      end
+      [fields, arrays]
+    end
+    def append_to_fields_and_arrays(appender, fields, arrays)
+      appender.each do |key, data|
+        raise DataFrameArgumentError, "Data size mismatch (#{data.size} != #{size})" if data.size != size
+        a = Arrow::Array.new(data.is_a?(Vector) ? data.to_a : data)
+        fields << Arrow::Field.new(key.to_sym, a.value_data_type)
+        arrays << Arrow::ChunkedArray.new([a])
+      end
+    end
+    def keys_by_booleans(booleans)
+      keys.select.with_index { |_, i| booleans[i] }
+    end
+  end
+end

data/lib/red_amber/helper.rb ADDED Viewed

@@ -0,0 +1,61 @@
+# frozen_string_literal: true
+module RedAmber
+  # mix-in for the class DataFrame
+  module Helper
+    private
+    def pl(num)
+      num > 1 ? 's' : ''
+    end
+    def out_of_range?(indeces)
+      indeces.max >= size || indeces.min < -size
+    end
+    def integers?(enum)
+      enum.all?(Integer)
+    end
+    def sym_or_str?(enum)
+      enum.all? { |e| e.is_a?(Symbol) || e.is_a?(String) }
+    end
+    def booleans?(enum)
+      enum.all? { |e| e.is_a?(TrueClass) || e.is_a?(FalseClass) || e.is_a?(NilClass) }
+    end
+    def create_dataframe_from_vector(key, vector)
+      DataFrame.new(key => vector.data)
+    end
+    def parse_to_vector(args)
+      a = args.reduce([]) do |accum, elem|
+        accum.concat(normalize_element(elem))
+      end
+      Vector.new(a)
+    end
+    def normalize_element(elem)
+      case elem
+      when Numeric, String, Symbol, TrueClass, FalseClass, NilClass
+        [elem]
+      when Range
+        both_end = [elem.begin, elem.end]
+        both_end[1] -= 1 if elem.exclude_end? && elem.end.is_a?(Integer)
+        if both_end.any?(Integer) || both_end.all?(&:nil?)
+          if both_end.any? { |e| e&.>=(size) || e&.<(-size) }
+            raise DataFrameArgumentError, "Index out of range: #{elem} for 0..#{size - 1}"
+          end
+          (0...size).to_a[elem]
+        else
+          elem.to_a
+        end
+      else
+        Array(elem)
+      end
+    end
+  end
+end

data/lib/red_amber/vector.rb CHANGED Viewed

@@ -1,27 +1,42 @@
 # frozen_string_literal: true
 module RedAmber
-  # Columnar data object
+  # Values in variable (columnar) data object
   #   @data : holds Arrow::ChunkedArray
   class Vector
     # mix-in
     include VectorFunctions
-    # chunked_array may come from column.data
-    def initialize(array)
-      case array
-      when Vector
-        @data = array.data
-      when Arrow::Array, Arrow::ChunkedArray
-        @data = array
-      when Array
-        @data = Arrow::Array.new(array)
+    include VectorUpdatable
+    include VectorSelectable
+    include Helper
+    def initialize(*array)
+      @key = nil # default is 'headless'
+      if array.empty? || array[0].nil?
+        Vector.new([])
       else
-        raise ArgumentError, 'Unknown array in argument'
+        array.flatten!
+        case array[0]
+        when Vector
+          @data = array[0].data
+          return
+        when Arrow::Array, Arrow::ChunkedArray
+          @data = array[0]
+          return
+        when Range
+          @data = Arrow::Array.new(Array(array[0]))
+          return
+        end
+        begin
+          @data = Arrow::Array.new(Array(array))
+        rescue Error
+          raise VectorArgumentError, "Invalid argument: #{array}"
+        end
       end
     end
     attr_reader :data
+    attr_accessor :key
     def to_s
       @data.to_a.inspect
@@ -49,6 +64,16 @@ module RedAmber
     alias_method :to_a, :values
     alias_method :entries, :values
+    def indices
+      (0...size).to_a
+    end
+    alias_method :indexes, :indices
+    alias_method :indeces, :indices
+    # Implicit Array conversion; delegates to to_a.
+    def to_ary
+      to_a
+    end
     def size
       # only defined :length in Arrow?
       @data.length
@@ -57,6 +82,10 @@ module RedAmber
     alias_method :n_rows, :size
     alias_method :nrow, :size
+    # True when size is zero.
+    def empty?
+      size.zero?
+    end
     def type
       @data.value_type.nick.to_sym
     end
@@ -66,15 +95,19 @@ module RedAmber
     end
     def numeric?
-      %i[int8 uint8 int16 uint16 int32 uint32 int64 uint64 float double].member? type
+      type_class < Arrow::NumericDataType
     end
     def string?
       type == :string
     end
-    def data_type
-      @data.value_type
+    def temporal?
+      type_class < Arrow::TemporalDataType
+    end
+    def type_class
+      @data.value_data_type.class
     end
     # def each() end
@@ -90,7 +123,23 @@ module RedAmber
     # def each_chunk() end
     def tally
-      values.tally
+      hash = values.tally
+      if (type_class < Arrow::FloatingPointDataType) && is_nan.any
+        a = 0
+        hash.each do |key, value|
+          if key.is_a?(Float) && key.nan?
+            hash.delete(key)
+            a += value
+          end
+        end
+        hash[Float::NAN] = a
+      end
+      hash
+    end
+    def value_counts
+      values, counts = Arrow::Function.find(:value_counts).execute([data]).value.fields
+      values.zip(counts).to_h
     end
     def n_nulls
@@ -101,5 +150,9 @@ module RedAmber
     def n_nans
       numeric? ? is_nan.to_a.count(true) : 0
     end
+    # Reports whether any element is nil, by aggregating is_nil with any.
+    def has_nil?
+      is_nil.any
+    end
   end
 end

data/lib/red_amber/vector_functions.rb CHANGED Viewed

@@ -12,32 +12,44 @@ module RedAmber
   module VectorFunctions
     # [Unary aggregations]: vector.func => scalar
     unary_aggregations =
-      %i[all any approximate_median count count_distinct max mean min product stddev sum variance]
+      %i[all any approximate_median count count_distinct max mean min min_max product stddev sum variance]
     unary_aggregations.each do |function|
       define_method(function) do |opts: nil|
-        output = exec_func_unary(function, options: opts)
-        take_out_scalar(output)
+        datum = exec_func_unary(function, options: opts)
+        get_scalar(datum)
       end
     end
     alias_method :median, :approximate_median
     alias_method :count_uniq, :count_distinct
+    alias_method :all?, :all
+    alias_method :any?, :any
+    # Variance computed with the option ddof: 1.
+    def unbiased_variance
+      variance(opts: { ddof: 1 })
+    end
+    alias_method :var, :unbiased_variance
+    # Standard deviation (stddev) computed with the option ddof: 1.
+    def sd
+      stddev(opts: { ddof: 1 })
+    end
+    alias_method :std, :sd
     # option(s) required
     # - index
     # Returns other than value
-    # - min_max
     # - mode
     # - quantile
     # - tdigest
     # [Unary element-wise]: vector.func => vector
     unary_element_wise =
-      %i[abs atan bit_wise_not ceil cos floor is_finite is_inf is_nan is_null is_valid sign sin tan trunc]
+      %i[abs array_sort_indices atan bit_wise_not ceil cos fill_null_backward fill_null_forward floor is_finite
+         is_inf is_nan is_null is_valid round round_to_multiple sign sin tan trunc unique]
     unary_element_wise.each do |function|
       define_method(function) do |opts: nil|
-        output = exec_func_unary(function, options: opts)
-        take_out_element_wise(output)
+        datum = exec_func_unary(function, options: opts)
+        Vector.new(datum.value)
       end
     end
     alias_method :is_nil, :is_null
@@ -46,6 +58,14 @@ module RedAmber
       numeric? ? (is_nil | is_nan) : is_nil
     end
+    alias_method :fill_nil_backward, :fill_null_backward
+    alias_method :fill_nil_forward, :fill_null_forward
+    alias_method :sort_indexes, :array_sort_indices
+    alias_method :sort_indices, :array_sort_indices
+    alias_method :uniq, :unique
     # [Unary element-wise with operator]: vector.func => vector, op vector
     unary_element_wise_op = {
       invert: '!',
@@ -53,20 +73,17 @@ module RedAmber
     }
     unary_element_wise_op.each do |function, operator|
       define_method(function) do |opts: nil|
-        output = exec_func_unary(function, options: opts)
-        take_out_element_wise(output)
+        datum = exec_func_unary(function, options: opts)
+        Vector.new(datum.value)
       end
       define_method(operator) do |opts: nil|
-        output = exec_func_unary(function, options: opts)
-        take_out_element_wise(output)
+        datum = exec_func_unary(function, options: opts)
+        Vector.new(datum.value)
       end
     end
     alias_method :not, :invert
-    # option(s) required
-    # - round, round_to_multiple
     # NaN support needed
     # - acos asin ln log10 log1p log2
@@ -79,8 +96,8 @@ module RedAmber
       %i[atan2 and_not and_not_kleene bit_wise_and bit_wise_or bit_wise_xor]
     binary_element_wise.each do |function|
       define_method(function) do |other, opts: nil|
-        output = exec_func_binary(function, other, options: opts)
-        take_out_element_wise(output)
+        datum = exec_func_binary(function, other, options: opts)
+        Vector.new(datum.value)
       end
     end
@@ -95,8 +112,8 @@ module RedAmber
     }
     logical_binary_element_wise.each do |method, function|
       define_method(method) do |other, opts: nil|
-        output = exec_func_binary(function, other, options: opts)
-        take_out_element_wise(output)
+        datum = exec_func_binary(function, other, options: opts)
+        Vector.new(datum.value)
       end
     end
@@ -128,13 +145,13 @@ module RedAmber
     }
     binary_element_wise_op.each do |function, operator|
       define_method(function) do |other, opts: nil|
-        output = exec_func_binary(function, other, options: opts)
-        take_out_element_wise(output)
+        datum = exec_func_binary(function, other, options: opts)
+        Vector.new(datum.value)
       end
       define_method(operator) do |other, opts: nil|
-        output = exec_func_binary(function, other, options: opts)
-        take_out_element_wise(output)
+        datum = exec_func_binary(function, other, options: opts)
+        Vector.new(datum.value)
       end
     end
     alias_method :eq, :equal
@@ -144,14 +161,20 @@ module RedAmber
     alias_method :lt, :less
     alias_method :ne, :not_equal
+    def coerce(other)
+      case other
+      when Vector, Array, Arrow::Array
+        raise VectorArgumentError, "Size unmatch: #{size} != #{other.length}" unless size == other.length
+        [Vector.new(Array(other)), self]
+      end
+      [Vector.new(Array(other) * size), self]
+    end
     # (array functions)
-    # array_filter, array_sort_indices, array_take
-    # dictionary_encode, hash_all, hash_any, hash_approximate_median,
-    # hash_count, hash_count_distinct, hash_distinct, hash_max, hash_mean, hash_min,
-    # hash_min_max, hash_product, hash_stddev, hash_sum, hash_tdigest, hash_variance,
+    # dictionary_encode,
     # partition_nth_indices,
-    # quarter, quarters_between, unique,
-    # value_counts
+    # quarter, quarters_between,
     # (strings)
     # ascii_capitalize, ascii_center, ascii_is_alnum, ascii_is_alpha, ascii_is_decimal,
@@ -180,44 +203,56 @@ module RedAmber
     # strptime, subsecond, us_week, week, weeks_between, year, year_month_day, years_between
     # (conditional)
-    # case_when, cast, if_else
+    # case_when, cast,
     # (indices)
     # choose, index_in, index_in_meta_binary, indices_nonzero
     # (others)
-    # coalesce, drop_null, fill_null_backward, fill_null_forward,
-    # filter, is_in, is_in_meta_binary,
+    # coalesce,
+    # is_in_meta_binary,
     # list_element, list_flatten, list_parent_indices, list_value_length, make_struct,
-    # max_element_wise, min_element_wise, random, replace_with_mask, select_k_unstable,
-    # sort_indices, struct_field, take
+    # max_element_wise, min_element_wise, random, select_k_unstable,
+    # struct_field,
     private # =======
     def exec_func_unary(function, options: nil)
-      func = Arrow::Function.find(function)
-      func.execute([data], options)
+      find(function).execute([data], options)
     end
     def exec_func_binary(function, other, options: nil)
-      func = Arrow::Function.find(function)
       case other
       when Vector
-        func.execute([data, other.data], options)
-      when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric
-        func.execute([data, other], options)
+        find(function).execute([data, other.data], options)
+      when Arrow::Array, Arrow::ChunkedArray, Arrow::Scalar, Array, Numeric, String, TrueClass, FalseClass
+        find(function).execute([data, other], options)
       else
-        raise ArgumentError, "Operand is not supported: #{other.class}"
+        raise VectorArgumentError, "Operand is not supported: #{other.class}"
       end
     end
-    def take_out_scalar(output)
-      output = output.value
-      output.is_a?(Arrow::StringScalar) ? output.to_s : output.value
+    def get_scalar(datum)
+      output = datum.value
+      case output
+      when Arrow::StringScalar then output.to_s
+      when Arrow::StructScalar
+        output.value.map { |s| s.is_a?(Arrow::StringScalar) ? s.to_s : s.value }
+      else
+        output.value
+      end
+    end
+    module_function # ======
+    def find(function_name)
+      Arrow::Function.find(function_name)
     end
-    def take_out_element_wise(output)
-      Vector.new(output.value)
+    # temporary API until RedAmber document prepared.
+    def arrow_doc(function_name)
+      f = find(function_name)
+      "#{f}\n#{'-' * function_name.size}\n#{f.doc.description}"
     end
   end
 end

data/lib/red_amber/vector_selectable.rb ADDED Viewed

@@ -0,0 +1,124 @@
+# frozen_string_literal: true
+# Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)`
+# reference: https://arrow.apache.org/docs/cpp/compute.html
+module RedAmber
+  # mix-ins for class Vector
+  # Functions to select some data.
+  module VectorSelectable
+    def drop_nil
+      datum = find(:drop_null).execute([data])
+      Vector.new(datum.value)
+    end
+    # vector calculation version of selection by indices
+    # TODO: support for option {boundscheck: true}
+    def take(*indices)
+      indices.flatten!
+      return Vector.new([]) if indices.empty?
+      indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric)
+      indices = Vector.new(indices) unless indices.is_a?(Vector)
+      take_by_vector(indices) # returns sub Vector
+    end
+    # TODO: support for option {null_selection_behavior: :drop}
+    def filter(*booleans)
+      booleans.flatten!
+      return Vector.new([]) if booleans.empty?
+      b = booleans[0]
+      boolean_array =
+        case b
+        when Vector
+          raise VectorTypeError, 'Argument is not a boolean.' unless b.boolean?
+          b.data
+        when Arrow::BooleanArray
+          b
+        else
+          raise VectorTypeError, 'Argument is not a boolean.' unless booleans?(booleans)
+          Arrow::BooleanArray.new(booleans)
+        end
+      filter_by_array(boolean_array) # returns sub Vector
+    end
+    #   @param indices
+    #   @param booleans
+    def [](*args)
+      args.flatten!
+      return Vector.new([]) if args.empty?
+      arg = args[0]
+      case arg
+      when Vector
+        return take_by_vector(arg) if arg.numeric?
+        return filter_by_array(arg.data) if arg.boolean?
+        raise VectorTypeError, "Argument must be numeric or boolean: #{arg}"
+      when Arrow::BooleanArray
+        return filter_by_array(arg)
+      when Arrow::Array
+        array = arg
+      else
+        unless arg.is_a?(Numeric) || booleans?([arg])
+          raise VectorArgumentError, "Argument must be numeric or boolean: #{args}"
+        end
+      end
+      array ||= Arrow::Array.new(args)
+      return filter_by_array(array) if array.is_a?(Arrow::BooleanArray)
+      vector = Vector.new(array)
+      return take_by_vector(vector) if vector.numeric?
+      raise VectorArgumentError, "Invalid argument: #{args}"
+    end
+    #   @param values [Array, Arrow::Array, Vector]
+    def is_in(*values)
+      values.flatten!
+      array =
+        case values[0]
+        when Vector
+          values[0].data
+        when Arrow::Array
+          values[0]
+        end
+      array ||= data.class.new(values)
+      Vector.new(data.is_in(array))
+    end
+    # Position of element within to_a, as returned by Array#index.
+    # Done on the Ruby Array for now; Arrow's support is required to do
+    # this natively.
+    def index(element)
+      to_a.index(element)
+    end
+    private
+    # Accepts indices by numeric Vector
+    def take_by_vector(indices)
+      raise VectorTypeError, "Indices must be numeric Vector: #{indices}" unless indices.numeric?
+      raise VectorArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1
+      normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail
+      raise VectorArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size
+      index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array
+      datum = find(:array_take).execute([data, index_array])
+      Vector.new(datum.value)
+    end
+    # Accepts booleans by Arrow::BooleanArray
+    def filter_by_array(boolean_array)
+      raise VectorArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size
+      datum = find(:array_filter).execute([data, boolean_array])
+      Vector.new(datum.value)
+    end
+  end
+end