RubyGems - red_amber - Versions diffs - 0.2.2 → 0.3.0 - Mend

red_amber 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

checksums.yaml +4 -4
data/.rubocop.yml +114 -39
data/CHANGELOG.md +203 -31
data/Gemfile +5 -2
data/README.md +62 -29
data/benchmark/basic.yml +86 -0
data/benchmark/combine.yml +62 -0
data/benchmark/dataframe.yml +62 -0
data/benchmark/drop_nil.yml +15 -3
data/benchmark/group.yml +39 -0
data/benchmark/reshape.yml +31 -0
data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
data/benchmark/rover/flights.yml +23 -0
data/benchmark/rover/penguins.yml +23 -0
data/benchmark/rover/planes.yml +23 -0
data/benchmark/rover/weather.yml +23 -0
data/benchmark/vector.yml +60 -0
data/doc/DataFrame.md +335 -53
data/doc/Vector.md +91 -0
data/doc/image/dataframe/join.png +0 -0
data/doc/image/dataframe/set_and_bind.png +0 -0
data/doc/image/dataframe_model.png +0 -0
data/lib/red_amber/data_frame.rb +167 -51
data/lib/red_amber/data_frame_combinable.rb +486 -0
data/lib/red_amber/data_frame_displayable.rb +6 -4
data/lib/red_amber/data_frame_indexable.rb +2 -2
data/lib/red_amber/data_frame_loadsave.rb +4 -1
data/lib/red_amber/data_frame_reshaping.rb +35 -10
data/lib/red_amber/data_frame_selectable.rb +221 -116
data/lib/red_amber/data_frame_variable_operation.rb +146 -82
data/lib/red_amber/group.rb +108 -18
data/lib/red_amber/helper.rb +53 -43
data/lib/red_amber/refinements.rb +199 -0
data/lib/red_amber/vector.rb +56 -46
data/lib/red_amber/vector_functions.rb +23 -83
data/lib/red_amber/vector_selectable.rb +116 -69
data/lib/red_amber/vector_updatable.rb +189 -65
data/lib/red_amber/version.rb +1 -1
data/lib/red_amber.rb +3 -0
data/red_amber.gemspec +4 -3
metadata +24 -10

data/lib/red_amber/data_frame_combinable.rb ADDED Viewed

@@ -0,0 +1,486 @@
+# frozen_string_literal: true
+module RedAmber
+  # mix-in for the class DataFrame
+  module DataFrameCombinable
+    # Refinements for Arrow::Table
+    using RefineArrowTable
+    # Concatenate other dataframe onto the bottom.
+    #
+    # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
+    #   DataFrame/Table to concatenate onto the bottom of self.
+    # @return [DataFrame]
+    #   Concatenated dataframe.
+    def concatenate(*other)
+      case other
+      in [] | [nil] | [[]]
+        return self
+      in [Array => array]
+        # Nop
+      else
+        array = other
+      end
+      table_array = array.map do |e|
+        case e
+        when Arrow::Table
+          e
+        when DataFrame
+          e.table
+        else
+          raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
+        end
+      end
+      DataFrame.create(table.concatenate(table_array))
+    end
+    alias_method :concat, :concatenate
+    alias_method :bind_rows, :concatenate
+    # Merge other DataFrame or Table from other.
+    # - Self and other must have same size.
+    # - Self and other do not share the same key.
+    #   - If they share any keys, raise Error.
+    # @param other [DataFrame, Arrow::Table, Array<DataFrame, Arrow::Table>]
+    #   DataFrame/Table to concatenate.
+    # @return [DataFrame]
+    #   Merged dataframe.
+    def merge(*other)
+      case other
+      in [] | [nil] | [[]]
+        return self
+      in [Array => array]
+        # Nop
+      else
+        array = other
+      end
+      hash = array.each_with_object({}) do |e, h|
+        df =
+          case e
+          when Arrow::Table
+            DataFrame.create(e)
+          when DataFrame
+            e
+          else
+            raise DataFrameArgumentError, "#{e} is not a Table or a DataFrame"
+          end
+        if size != df.size
+          raise DataFrameArgumentError, "#{e} do not have same size as self"
+        end
+        k = keys.intersection(df.keys).any?
+        raise DataFrameArgumentError, "There are some shared keys: #{k}" if k
+        h.merge!(df.to_h)
+      end
+      assign(hash)
+    end
+    alias_method :bind_cols, :merge
+    # Mutating joins (#inner_join, #full_join, #left_join, #right_join)
+    # Join another DataFrame or Table, leaving only the matching records.
+    # - Same as `#join` with `type: :inner`
+    # - A kind of mutating join.
+    #
+    # @!macro join_before
+    #   @param other [DataFrame, Arrow::Table]
+    #     A DataFrame or a Table to be joined with self.
+    #
+    # @!macro join_after
+    #   @param suffix [#succ]
+    #     a suffix to rename keys when key names conflict as a result of join.
+    #     `suffix` must respond to `#succ`.
+    #   @return [DataFrame]
+    #     Joined dataframe.
+    #
+    # @!macro join_key_in_array
+    #   @param join_keys [String, Symbol, Array<String, Symbol>]
+    #     A key or keys to match.
+    #
+    # @!macro join_key_in_hash
+    #   @param join_key_pairs [Hash]
+    #     Pairs of a key name or key names to match in left and right.
+    #   @option join_key_pairs [String, Symbol, Array<String, Symbol>] :left
+    #     Join keys in `self`.
+    #   @option join_key_pairs [String, Symbol, Array<String, Symbol>] :right
+    #     Join keys in `other`.
+    #
+    # @overload inner_join(other, suffix: '.1')
+    #   If `join_key` is not specified, common keys in self and other are used
+    #   (natural keys). Returns joined dataframe.
+    #
+    #   @macro join_before
+    #   @macro join_after
+    #
+    # @overload inner_join(other, join_keys, suffix: '.1')
+    #
+    #   @macro join_before
+    #   @macro join_key_in_array
+    #   @macro join_after
+    #
+    # @overload inner_join(other, join_key_pairs, suffix: '.1')
+    #
+    #   @macro join_before
+    #   @macro join_key_in_hash
+    #   @macro join_after
+    #
+    def inner_join(other, join_keys = nil, suffix: '.1')
+      join(other, join_keys, type: :inner, suffix: suffix)
+    end
+    # Join another DataFrame or Table, leaving all records.
+    # - Same as `#join` with `type: :full_outer`
+    # - A kind of mutating join.
+    #
+    # @overload full_join(other, suffix: '.1')
+    #   If `join_key` is not specified, common keys in self and other are used
+    #   (natural keys). Returns joined dataframe.
+    #
+    #   @macro join_before
+    #   @macro join_after
+    #
+    # @overload full_join(other, join_keys, suffix: '.1')
+    #
+    #   @macro join_before
+    #   @macro join_key_in_array
+    #   @macro join_after
+    #
+    # @overload full_join(other, join_key_pairs, suffix: '.1')
+    #
+    #   @macro join_before
+    #   @macro join_key_in_hash
+    #   @macro join_after
+    #
+    def full_join(other, join_keys = nil, suffix: '.1')
+      join(other, join_keys, type: :full_outer, suffix: suffix)
+    end
+    alias_method :outer_join, :full_join
+    # Join matching values to self from other.
+    # - Same as `#join` with `type: :left_outer`
+    # - A kind of mutating join.
+    #
+    # @overload left_join(other, suffix: '.1')
+    #   If `join_key` is not specified, common keys in self and other are used
+    #   (natural keys). Returns joined dataframe.
+    #
+    #   @macro join_before
+    #   @macro join_after
+    #
+    # @overload left_join(other, join_keys, suffix: '.1')
+    #
+    #   @macro join_before
+    #   @macro join_key_in_array
+    #   @macro join_after
+    #
+    # @overload left_join(other, join_key_pairs, suffix: '.1')
+    #
+    #   @macro join_before
+    #   @macro join_key_in_hash
+    #   @macro join_after
+    #
+    def left_join(other, join_keys = nil, suffix: '.1')
+      join(other, join_keys, type: :left_outer, suffix: suffix)
+    end
+    # Join matching values from self to other.
+    # - Same as `#join` with `type: :right_outer`
+    # - A kind of mutating join.
+    #
+    # @overload right_join(other, suffix: '.1')
+    #   If `join_key` is not specified, common keys in self and other are used
+    #   (natural keys). Returns joined dataframe.
+    #
+    #   @macro join_before
+    #   @macro join_after
+    #
+    # @overload right_join(other, join_keys, suffix: '.1')
+    #
+    #   @macro join_before
+    #   @macro join_key_in_array
+    #   @macro join_after
+    #
+    # @overload right_join(other, join_key_pairs, suffix: '.1')
+    #
+    #   @macro join_before
+    #   @macro join_key_in_hash
+    #   @macro join_after
+    #
+    def right_join(other, join_keys = nil, suffix: '.1')
+      join(other, join_keys, type: :right_outer, suffix: suffix)
+    end
+    # Filtering joins (#semi_join, #anti_join)
+    # Return records of self that have a match in other.
+    # - Same as `#join` with `type: :left_semi`
+    # - A kind of filtering join.
+    #
+    # @overload semi_join(other, suffix: '.1')
+    #   If `join_key` is not specified, common keys in self and other are used
+    #   (natural keys). Returns joined dataframe.
+    #
+    #   @macro join_before
+    #   @macro join_after
+    #
+    # @overload semi_join(other, join_keys, suffix: '.1')
+    #
+    #   @macro join_before
+    #   @macro join_key_in_array
+    #   @macro join_after
+    #
+    # @overload semi_join(other, join_key_pairs, suffix: '.1')
+    #
+    #   @macro join_before
+    #   @macro join_key_in_hash
+    #   @macro join_after
+    #
+    def semi_join(other, join_keys = nil, suffix: '.1')
+      join(other, join_keys, type: :left_semi, suffix: suffix)
+    end
+    # Return records of self that do not have a match in other.
+    # - Same as `#join` with `type: :left_anti`
+    # - A kind of filtering join.
+    #
+    # @overload anti_join(other, suffix: '.1')
+    #   If `join_key` is not specified, common keys in self and other are used
+    #   (natural keys). Returns joined dataframe.
+    #
+    #   @macro join_before
+    #   @macro join_after
+    #
+    # @overload anti_join(other, join_keys, suffix: '.1')
+    #
+    #   @macro join_before
+    #   @macro join_key_in_array
+    #   @macro join_after
+    #
+    # @overload anti_join(other, join_key_pairs, suffix: '.1')
+    #
+    #   @macro join_before
+    #   @macro join_key_in_hash
+    #   @macro join_after
+    #
+    def anti_join(other, join_keys = nil, suffix: '.1')
+      join(other, join_keys, type: :left_anti, suffix: suffix)
+    end
+    # Set operations (#intersect, #union, #difference, #set_operable?)
+    # Check if set operation with self and other is possible.
+    #
+    # @macro join_before
+    #
+    # @return [Boolean] true if set operation is possible.
+    #
+    def set_operable?(other) # rubocop:disable Naming/AccessorMethodName
+      keys == other.keys.map(&:to_sym)
+    end
+    # Select records appearing in both self and other.
+    # - Same as `#join` with `type: :inner` when keys in self are same with other.
+    # - A kind of set operations.
+    #
+    # @macro join_before
+    #
+    # @return [DataFrame] Joined dataframe.
+    #
+    def intersect(other)
+      unless keys == other.keys.map(&:to_sym)
+        raise DataFrameArgumentError, 'keys are not same with self and other'
+      end
+      join(other, keys, type: :inner)
+    end
+    # Select records appearing in self or other.
+    # - Same as `#join` with `type: :full_outer` when keys in self are same with other.
+    # - A kind of set operations.
+    #
+    # @macro join_before
+    #
+    # @return [DataFrame] Joined dataframe.
+    #
+    def union(other)
+      unless keys == other.keys.map(&:to_sym)
+        raise DataFrameArgumentError, 'keys are not same with self and other'
+      end
+      join(other, keys, type: :full_outer)
+    end
+    # Select records appearing in self but not in other.
+    # - Same as `#join` with `type: :left_anti` when keys in self are same with other.
+    # - A kind of set operations.
+    #
+    # @macro join_before
+    #
+    # @return [DataFrame] Joined dataframe.
+    #
+    def difference(other)
+      unless keys == other.keys.map(&:to_sym)
+        raise DataFrameArgumentError, 'keys are not same with self and other'
+      end
+      join(other, keys, type: :left_anti)
+    end
+    alias_method :setdiff, :difference
+    # Join another DataFrame or Table to self.
+    #
+    # @overload join(other, type: :inner, suffix: '.1')
+    #
+    #   If `join_key` is not specified, common keys in self and other are used
+    #   (natural keys). Returns joined dataframe.
+    #
+    #   @!macro join_common_type
+    #     @param type [:left_semi, :right_semi, :left_anti, :right_anti, :inner,
+    #                  :left_outer, :right_outer, :full_outer] type of join.
+    #
+    #   @macro join_before
+    #   @macro join_common_type
+    #   @macro join_after
+    #
+    # @overload join(other, join_keys, type: :inner, suffix: '.1')
+    #
+    #   @macro join_before
+    #   @macro join_key_in_array
+    #   @macro join_common_type
+    #   @macro join_after
+    #
+    # @overload join(other, join_key_pairs, type: :inner, suffix: '.1')
+    #
+    #   @macro join_before
+    #   @macro join_key_in_hash
+    #   @macro join_common_type
+    #   @macro join_after
+    #
+    def join(other, join_keys = nil, type: :inner, suffix: '.1')
+      # Only a DataFrame or an Arrow::Table is accepted;
+      # a DataFrame is unwrapped to its table.
+      case other
+      when DataFrame
+        other = other.table
+      when Arrow::Table
+        # Nop
+      else
+        raise DataFrameArgumentError, 'other must be a DataFrame or an Arrow::Table'
+      end
+      table_keys = table.keys
+      other_keys = other.keys
+      type = type.to_sym
+      # natural keys (implicit common keys)
+      join_keys ||= table_keys.intersection(other_keys)
+      # This is not necessary if additional procedure is contributed to Red Arrow.
+      # A Hash gives the key names for each side by :left and :right,
+      # otherwise the same keys are used for both sides.
+      if join_keys.is_a?(Hash)
+        left_keys = join_keys[:left]
+        right_keys = join_keys[:right]
+      else
+        left_keys = join_keys
+        right_keys = join_keys
+      end
+      # Normalize to Arrays of Strings.
+      # (presumably keys of Arrow::Table are Strings here - TODO confirm)
+      left_keys = Array(left_keys).map(&:to_s)
+      right_keys = Array(right_keys).map(&:to_s)
+      # Choose the output columns of each side.
+      # - :inner, :left_outer : all of left, right without its join keys.
+      # - :right_outer        : left without its join keys, all of right.
+      # - others              : nil is passed to Arrow::Table#join as is.
+      case type
+      when :full_outer, :left_semi, :left_anti, :right_semi, :right_anti
+        left_outputs = nil
+        right_outputs = nil
+      when :inner, :left_outer
+        left_outputs = table_keys
+        right_outputs = other_keys - right_keys
+      when :right_outer
+        left_outputs = table_keys - left_keys
+        right_outputs = other_keys
+      end
+      # Should we rescue errors in Arrow::Table#join for usability ?
+      joined_table =
+        table.join(other, join_keys,
+                   type: type,
+                   left_outputs: left_outputs,
+                   right_outputs: right_outputs)
+      # Post-process the joined table according to the join type.
+      # NOTE(review): there is no `else` branch, so this method returns nil
+      #   for a type not listed below if Arrow::Table#join did not raise.
+      case type
+      when :inner, :left_outer, :left_semi, :left_anti, :right_semi, :right_anti
+        # Array#uniq! returns nil when nothing was removed,
+        # so it is truthy only if there are duplicate keys to rename.
+        # NOTE(review): n_keys is not defined in this method; presumably it is
+        #   the number of keys in self - confirm.
+        if joined_table.keys.uniq!
+          DataFrame.create(rename_table(joined_table, n_keys, suffix))
+        else
+          DataFrame.create(joined_table)
+        end
+      when :full_outer
+        renamed_table = rename_table(joined_table, n_keys, suffix)
+        renamed_keys = renamed_table.keys
+        dropper = []
+        # Fill nulls in each left key column by the values of its right side
+        # counterpart, then drop the right side key columns.
+        # NOTE(review): the counterpart is picked at
+        #   (index of the left key) + (number of keys in self's table),
+        #   which looks like it assumes the right key is at the same position
+        #   in other as the left key is in self - confirm.
+        DataFrame.create(renamed_table).assign do |df|
+          left_keys.map do |left_key|
+            i_left_key = renamed_keys.index(left_key)
+            right_key = renamed_keys[i_left_key + table_keys.size]
+            dropper << right_key
+            [left_key.to_sym, merge_array(df[left_key].data, df[right_key].data)]
+          end
+        end.drop(dropper)
+      when :right_outer
+        # Keys of the right side start after the left outputs.
+        if joined_table.keys.uniq!
+          DataFrame.create(rename_table(joined_table, left_outputs.size, suffix))
+        else
+          DataFrame.create(joined_table)
+        end.pick do
+          # Put the join keys first, followed by the rest of keys.
+          # NOTE(review): `keys` here presumably refers to the joined DataFrame,
+          #   depending on how #pick evaluates its block - confirm.
+          [right_keys, keys.map(&:to_s) - right_keys]
+        end
+      end
+    end
+    private
+    # Rename duplicate keys by suffix
+    def rename_table(joined_table, n_keys, suffix)
+      joined_keys = joined_table.keys
+      other_keys = joined_keys[n_keys..]
+      dup_keys = joined_keys.tally.select { |_, v| v > 1 }.keys
+      renamed_right_keys =
+        other_keys.map do |key|
+          if dup_keys.include?(key)
+            new_key = nil
+            loop do
+              new_key = "#{key}#{suffix}"
+              break unless joined_keys.include?(new_key)
+              s = suffix.succ
+              raise DataFrameArgumentError, "suffix #{suffix} is invalid" if s == suffix
+              suffix = s
+            end
+            new_key
+          else
+            key
+          end
+        end
+      joined_keys[n_keys..] = renamed_right_keys
+      fields =
+        joined_keys.map.with_index do |k, i|
+          Arrow::Field.new(k, joined_table[i].data_type)
+        end
+      Arrow::Table.new(Arrow::Schema.new(fields), joined_table.columns)
+    end
+    # Merge two Arrow::Arrays
+    def merge_array(array1, array2)
+      t = Arrow::Function.find(:is_null).execute([array1])
+      Arrow::Function.find(:if_else).execute([t, array2, array1]).value
+    end
+  end
+end

data/lib/red_amber/data_frame_displayable.rb CHANGED Viewed

@@ -93,7 +93,8 @@ module RedAmber
       levels = tallys.map(&:size)
       type_groups = @table.columns.map { |column| type_group(column.data_type) }
       quoted_keys = keys.map(&:inspect)
-      headers = { idx: '#', key: 'key', type: 'type', levels: 'level', data: 'data_preview' }
+      headers = { idx: '#', key: 'key', type: 'type', levels: 'level',
+                  data: 'data_preview' }
       header_format = make_header_format(levels, headers, quoted_keys)
       sio = StringIO.new # output string buffer
@@ -174,6 +175,8 @@ module RedAmber
     end
     def format_table(width: 80, head: 5, tail: 3, n_digit: 2)
+      return "  #{keys.join(' ')}\n  (Empty Vectors)\n" if size.zero?
       original = self
       indices = size > head + tail ? [*0..head, *(size - tail)...size] : [*0...size]
       df = slice(indices).assign do
@@ -199,7 +202,8 @@ module RedAmber
         vectors.each_with_object({}) do |v, assigner|
           vec = v.replace(0, v.key == INDEX_KEY ? '' : v.key.to_s)
                  .replace(1, v.key == INDEX_KEY ? '' : "<#{original[v.key].type}>")
-          assigner[v.key] = original.size > head + tail + 1 ? vec.replace(head + 2, ':') : vec
+          assigner[v.key] =
+            original.size > head + tail + 1 ? vec.replace(head + 2, ':') : vec
         end
       end
@@ -263,8 +267,6 @@ module RedAmber
               format('%g', element)
             in Integer
               format('%d', element)
-            else
-              element
             end
           end
         end

data/lib/red_amber/data_frame_indexable.rb CHANGED Viewed

@@ -18,7 +18,7 @@ module RedAmber
     # @return [RedAmber::Vector] Sorted indices in Vector
     def sort_indices(*sort_keys)
       indices = @table.sort_indices(sort_keys.flatten)
-      Vector.new(indices)
+      Vector.create(indices)
     end
     # @return [RedAmber::DataFrame] Sorted DataFrame
@@ -32,7 +32,7 @@ module RedAmber
     def new_dataframe_by(index_array)
       t = Arrow::Function.find(:take).execute([@table, index_array]).value
-      RedAmber::DataFrame.new(t)
+      DataFrame.create(t)
     end
   end
 end

data/lib/red_amber/data_frame_loadsave.rb CHANGED Viewed

@@ -17,14 +17,17 @@ module RedAmber
     end
     # Save DataFrame
+    #
+    # @return [DataFrame] self.
     def save(output, options = {})
       @table.save(output, options)
+      self
     end
     # Save and reload to cast automatically
     #   Via tsv format file temporarily by default
     #
-    #   experimental feature
+    # @note experimental feature
     def auto_cast(format: :tsv)
       return self if empty?

data/lib/red_amber/data_frame_reshaping.rb CHANGED Viewed

@@ -8,11 +8,14 @@ module RedAmber
     # @param key [Symbol] key of the index column
     #   to transpose into keys.
     #   If it is not specified, keys[0] is used.
-    # @param new_key [Symbol] key name of transposed index column.
-    #   If it is not specified, :NAME is used. If it already exists, :NAME1 or :NAME1.succ is used.
+    # @param name [Symbol] key name of transposed index column.
+    #   If it is not specified, :NAME is used.
+    #   If it already exists, :NAME1 or :NAME1.succ is used.
     # @return [DataFrame] transposed DataFrame
     def transpose(key: keys.first, name: :NAME)
-      raise DataFrameArgumentError, "Self does not include: #{key}" unless keys.include?(key)
+      unless keys.include?(key)
+        raise DataFrameArgumentError, "Self does not include: #{key}"
+      end
       # Find unused name
       new_keys = self[key].to_a.map { |e| e.to_s.to_sym }
@@ -35,14 +38,24 @@ module RedAmber
     # @param value [Symbol, String] key of the column which is come **from values**.
     # @return [DataFrame] long DataFrame.
     def to_long(*keep_keys, name: :NAME, value: :VALUE)
+      warn('[Info] No key to keep is specified.') if keep_keys.empty?
       not_included = keep_keys - keys
-      raise DataFrameArgumentError, "Not have keys #{not_included}" unless not_included.empty?
+      unless not_included.empty?
+        raise DataFrameArgumentError, "Not have keys #{not_included}"
+      end
       name = name.to_sym
-      raise DataFrameArgumentError, "Invalid key: #{name}" if keep_keys.include?(name)
+      if keep_keys.include?(name)
+        raise DataFrameArgumentError,
+              "Can't specify the key: #{name} for the column from keys."
+      end
       value = value.to_sym
-      raise DataFrameArgumentError, "Invalid key: #{value}" if keep_keys.include?(value)
+      if keep_keys.include?(value)
+        raise DataFrameArgumentError,
+              "Can't specify the key: #{value} for the column from values."
+      end
       hash = Hash.new { |h, k| h[k] = [] }
       l = keys.size - keep_keys.size
@@ -62,15 +75,27 @@ module RedAmber
     # Reshape long DataFrame to a wide DataFrame.
     #
-    # @param name [Symbol, String] key of the column which will be expanded **to key names**.
-    # @param value [Symbol, String] key of the column which will be expanded **to values**.
+    # @param name [Symbol, String]
+    #   key of the column which will be expanded **to key names**.
+    # @param value [Symbol, String]
+    #   key of the column which will be expanded **to values**.
     # @return [DataFrame] wide DataFrame.
     def to_wide(name: :NAME, value: :VALUE)
       name = name.to_sym
-      raise DataFrameArgumentError, "Invalid key: #{name}" unless keys.include?(name)
+      unless keys.include?(name)
+        raise DataFrameArgumentError,
+              "You are going to keep the key: #{name}. " \
+              'You may need to specify the column name ' \
+              'that gives the new keys by `:name` option.'
+      end
       value = value.to_sym
-      raise DataFrameArgumentError, "Invalid key: #{value}" unless keys.include?(value)
+      unless keys.include?(value)
+        raise DataFrameArgumentError,
+              "You are going to keep the key: #{value}. " \
+              'You may need to specify the column name ' \
+              'that gives the new values by `:value` option.'
+      end
       hash = Hash.new { |h, k| h[k] = {} }
       keep_keys = keys - [name, value]