RubyGems - rover-df - Versions diffs - 0.2.5 → 0.2.8 - Mend

rover-df 0.2.5 → 0.2.8

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ca39a558c3c12103f03fed4cb8f007fbd00a1f8e84b839916fd0010aae4613ba
-  data.tar.gz: 43df8cdc415cc036ac383f30b7c91a35b644067a3cb8ea199abd7452b98298d5
+  metadata.gz: 65d2fda186484e920421543e2f0203635054ccb8a23250bd3fc6a9d8c328725f
+  data.tar.gz: e4cd1e6d69e1e4f340f6692111476a5be9405f348841cfba6f6c431f04d85347
 SHA512:
-  metadata.gz: 2724c7e85ee7921f277be833cf89be638c14cbb37a44411bba86c42cacffe7c0e4b82ea04d4dfb3d694c6429ba41bc8e8c10f7cb40e5d34bf59d14755858735f
-  data.tar.gz: fa860158decbca0a0b35ccb82e6f73d9a513c37b483eca52d140842d5dd255899a2e1ded3ec4375a492b86d3ec09ffa53d4871e05f1fdad39f3d2630215417dc
+  metadata.gz: c720f3bc45178f938c20546ac1b7279ae047affafce5e06cff4f703e1d8ff7a99c1bca94a3f40cb7d26945d770bf136a2adc3477cf6ffc3cdaad9a15aa6090a1
+  data.tar.gz: c44135cc0e70b08b72e1084565ef3479bcb92000bf34662b76a25933e68ad33a584afae071ddebfd5724ad61fe7e7dbc283241d7194c532dd70f36b1358b266d

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,19 @@
+## 0.2.8 (2022-03-15)
+- Added `group` and `stacked` options to `plot`
+- Improved performance of `read_csv` and `parse_csv`
+## 0.2.7 (2022-01-16)
+- Added support for booleans to Parquet methods
+- Added support for creating data frames from `ActiveRecord::Result`
+- Added `types` option to `read_parquet` and `parse_parquet` methods
+## 0.2.6 (2021-10-27)
+- Added support for `nil` headers to `read_csv` and `parse_csv`
+- Added `read_parquet`, `parse_parquet`, and `to_parquet` methods
 ## 0.2.5 (2021-09-25)
 - Fixed column types with joins

data/LICENSE.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-Copyright (c) 2020-2021 Andrew Kane
+Copyright (c) 2020-2022 Andrew Kane
 MIT License

data/README.md CHANGED Viewed

@@ -13,7 +13,7 @@ Simple, powerful data frames for Ruby
 Add this line to your application’s Gemfile:
 ```ruby
-gem 'rover-df'
+gem "rover-df"
 ```
 ## Intro
@@ -61,6 +61,14 @@ Rover.read_csv("file.csv")
 Rover.parse_csv("CSV,data,string")
 ```
+From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
+```ruby
+Rover.read_parquet("file.parquet")
+# or
+Rover.parse_parquet("PAR1...")
+```
 ## Attributes
 Get number of rows
@@ -89,7 +97,7 @@ Select a column
 df[:a]
 ```
-> Note that strings and symbols are different keys, just like hashes
+> Note that strings and symbols are different keys, just like hashes. Creating a data frame from Active Record, a CSV, or Parquet uses strings.
 Select multiple columns
@@ -228,7 +236,7 @@ df.group(:a).max(:b)
 Multiple groups
 ```ruby
-df.group([:a, :b]).count
+df.group(:a, :b).count
 ```
 ## Visualization
@@ -236,7 +244,7 @@ df.group([:a, :b]).count
 Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile:
 ```ruby
-gem 'vega'
+gem "vega"
 ```
 And use:
@@ -251,6 +259,18 @@ Specify the chart type (`line`, `pie`, `column`, `bar`, `area`, or `scatter`)
 df.plot(:a, :b, type: "pie")
 ```
+Group data
+```ruby
+df.plot(:a, :b, group: :c)
+```
+Stacked columns or bars
+```ruby
+df.plot(:a, :b, group: :c, stacked: true)
+```
 ## Updating Data
 Add a new column
@@ -393,6 +413,12 @@ CSV
 df.to_csv
 ```
+Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
+```ruby
+df.to_parquet
+```
 ## Types
 You can specify column types when creating a data frame

data/lib/rover/data_frame.rb CHANGED Viewed

@@ -40,8 +40,8 @@ module Rover
         vectors.each do |k, v|
           @vectors[k] = to_vector(v, type: types[k])
         end
-      elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
-        result = data.connection.select_all(data.all.to_sql)
+      elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base) || data.is_a?(ActiveRecord::Result))
+        result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.all.to_sql)
         result.columns.each_with_index do |k, i|
           @vectors[k] = to_vector(result.rows.map { |r| r[i] }, type: types[k])
         end
@@ -235,6 +235,44 @@ module Rover
       end
     end
+    def to_parquet
+      require "parquet"
+      schema = {}
+      types.each do |name, type|
+        schema[name] =
+          case type
+          when :int
+            :int64
+          when :uint
+            :uint64
+          when :float
+            :double
+          when :float32
+            :float
+          when :bool
+            :boolean
+          when :object
+            if @vectors[name].all? { |v| v.is_a?(String) }
+              :string
+            else
+              raise "Unknown type"
+            end
+          else
+            type
+          end
+      end
+      # TODO improve performance
+      raw_records = []
+      size.times do |i|
+        raw_records << @vectors.map { |_, v| v[i] }
+      end
+      table = Arrow::Table.new(schema, raw_records)
+      buffer = Arrow::ResizableBuffer.new(1024)
+      table.save(buffer, format: :parquet)
+      buffer.data.to_s
+    end
     # for IRuby
     def to_html
       require "iruby"
@@ -363,7 +401,7 @@ module Rover
       keys.all? { |k| self[k].to_numo == other[k].to_numo }
     end
-    def plot(x = nil, y = nil, type: nil)
+    def plot(x = nil, y = nil, type: nil, group: nil, stacked: nil)
       require "vega"
       raise ArgumentError, "Must specify columns" if keys.size != 2 && (!x || !y)
@@ -378,7 +416,7 @@ module Rover
           raise "Cannot determine type. Use the type option."
         end
       end
-      data = self[[x, y]]
+      data = self[group.nil? ? [x, y] : [x, y, group]]
       case type
       when "line", "area"
@@ -392,16 +430,20 @@ module Rover
           end
         scale = x_type == "temporal" ? {type: "utc"} : {}
+        encoding = {
+          x: {field: x, type: x_type, scale: scale},
+          y: {field: y, type: "quantitative"}
+        }
+        encoding[:color] = {field: group} if group
         Vega.lite
           .data(data)
           .mark(type: type, tooltip: true, interpolate: "cardinal", point: {size: 60})
-          .encoding(
-            x: {field: x, type: x_type, scale: scale},
-            y: {field: y, type: "quantitative"}
-          )
+          .encoding(encoding)
           .config(axis: {labelFontSize: 12})
       when "pie"
+        raise ArgumentError, "Cannot use group option with pie chart" unless group.nil?
         Vega.lite
           .data(data)
           .mark(type: "arc", tooltip: true)
@@ -411,34 +453,48 @@ module Rover
           )
           .view(stroke: nil)
       when "column"
+        encoding = {
+          x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
+          y: {field: y, type: "quantitative"}
+        }
+        if group
+          encoding[:color] = {field: group}
+          encoding[:xOffset] = {field: group} unless stacked
+        end
         Vega.lite
           .data(data)
           .mark(type: "bar", tooltip: true)
-          .encoding(
-            # TODO determine label angle
-            x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
-            y: {field: y, type: "quantitative"}
-          )
+          .encoding(encoding)
           .config(axis: {labelFontSize: 12})
       when "bar"
+        encoding = {
+          # TODO determine label angle
+          y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
+          x: {field: y, type: "quantitative"}
+        }
+        if group
+          encoding[:color] = {field: group}
+          encoding[:yOffset] = {field: group} unless stacked
+        end
         Vega.lite
           .data(data)
           .mark(type: "bar", tooltip: true)
-          .encoding(
-            # TODO determine label angle
-            y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
-            x: {field: y, type: "quantitative"}
-          )
+          .encoding(encoding)
           .config(axis: {labelFontSize: 12})
       when "scatter"
+        encoding = {
+          x: {field: x, type: "quantitative", scale: {zero: false}},
+          y: {field: y, type: "quantitative", scale: {zero: false}},
+          size: {value: 60}
+        }
+        encoding[:color] = {field: group} if group
         Vega.lite
           .data(data)
           .mark(type: "circle", tooltip: true)
-          .encoding(
-            x: {field: x, type: "quantitative", scale: {zero: false}},
-            y: {field: y, type: "quantitative", scale: {zero: false}},
-            size: {value: 60}
-          )
+          .encoding(encoding)
           .config(axis: {labelFontSize: 12})
       else
         raise ArgumentError, "Invalid type: #{type}"

data/lib/rover/group.rb CHANGED Viewed

@@ -1,10 +1,12 @@
 module Rover
   class Group
+    # TODO raise ArgumentError for empty columns in 0.3.0
     def initialize(df, columns)
       @df = df
       @columns = columns
     end
+    # TODO raise ArgumentError for empty columns in 0.3.0
     def group(*columns)
       Group.new(@df, @columns + columns.flatten)
     end
@@ -22,6 +24,14 @@ module Rover
       end
     end
+    def plot(*args, **options)
+      raise ArgumentError, "Multiple groups not supported" if @columns.size > 1
+      # same message as Ruby
+      raise ArgumentError, "unknown keyword: :group" if options.key?(:group)
+      @df.plot(*args, **options, group: @columns.first)
+    end
     private
     # TODO make more efficient

data/lib/rover/vector.rb CHANGED Viewed

@@ -359,6 +359,7 @@ module Rover
         data = data.to_a
         if type
+          data = data.map { |v| v || Float::NAN } if [:float, :float32].include?(type)
           data = numo_type.cast(data)
         else
           data =

data/lib/rover/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Rover
-  VERSION = "0.2.5"
+  VERSION = "0.2.8"
 end

data/lib/rover.rb CHANGED Viewed

@@ -9,36 +9,125 @@ require "rover/version"
 module Rover
   class << self
-    def read_csv(path, types: nil, **options)
-      require "csv"
-      csv_to_df(CSV.read(path, **csv_options(options)), types: types, headers: options[:headers])
+    def read_csv(path, **options)
+      csv_to_df(**options) do |csv_options|
+        CSV.read(path, **csv_options)
+      end
     end
-    def parse_csv(str, types: nil, **options)
-      require "csv"
-      csv_to_df(CSV.parse(str, **csv_options(options)), types: types, headers: options[:headers])
+    def parse_csv(str, **options)
+      csv_to_df(**options) do |csv_options|
+        CSV.parse(str, **csv_options)
+      end
     end
-    private
+    def read_parquet(path, **options)
+      parquet_to_df(**options) do
+        Arrow::Table.load(path)
+      end
+    end
-    # TODO use date converter
-    def csv_options(options)
-      options = {headers: true, converters: :numeric}.merge(options)
-      raise ArgumentError, "Must specify headers" unless options[:headers]
-      options
+    def parse_parquet(str, **options)
+      parquet_to_df(**options) do
+        Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet)
+      end
     end
-    def csv_to_df(table, types: nil, headers: nil)
-      if headers && headers.size < table.headers.size
-        raise ArgumentError, "Expected #{table.headers.size} headers, got #{headers.size}"
+    private
+    def csv_to_df(types: nil, headers: nil, **csv_options)
+      require "csv"
+      raise ArgumentError, "Must specify headers" if headers == false
+      # TODO use date converter
+      table = yield({converters: :numeric}.merge(csv_options))
+      headers = nil if headers == true
+      if headers && table.first && headers.size < table.first.size
+        raise ArgumentError, "Expected #{table.first.size} headers, got #{headers.size}"
+      end
+      table_headers = (headers || table.shift || []).dup
+      # keep same behavior as headers: true
+      if table.first
+        while table_headers.size < table.first.size
+          table_headers << nil
+        end
       end
-      table.by_col!
       data = {}
-      table.each do |k, v|
-        data[k] = v
+      keys = table_headers.map { |k| [k, true] }.to_h
+      unnamed_suffix = 1
+      table_headers.each_with_index do |k, i|
+        # TODO do same for empty string in 0.3.0
+        if k.nil?
+          k = "unnamed"
+          while keys.include?(k)
+            unnamed_suffix += 1
+            k = "unnamed#{unnamed_suffix}"
+          end
+          keys[k] = true
+        end
+        table_headers[i] = k
       end
+      table_headers.each_with_index do |k, i|
+        # use first value for duplicate headers like headers: true
+        next if data[k]
+        values = []
+        table.each do |row|
+          values << row[i]
+        end
+        data[k] = values
+      end
       DataFrame.new(data, types: types)
     end
+    PARQUET_TYPE_MAPPING = {
+      "bool" => Numo::Bit,
+      "float" => Numo::SFloat,
+      "double" => Numo::DFloat,
+      "int8" => Numo::Int8,
+      "int16" => Numo::Int16,
+      "int32" => Numo::Int32,
+      "int64" => Numo::Int64,
+      "string" => Numo::RObject,
+      "uint8" => Numo::UInt8,
+      "uint16" => Numo::UInt16,
+      "uint32" => Numo::UInt32,
+      "uint64" => Numo::UInt64
+    }
+    def parquet_to_df(types: nil)
+      require "parquet"
+      table = yield
+      data = {}
+      types ||= {}
+      table.each_column do |column|
+        k = column.field.name
+        if types[k]
+          data[k] = Vector.new(column.data.values, type: types[k])
+        else
+          type = column.field.data_type.to_s
+          numo_type = PARQUET_TYPE_MAPPING[type]
+          raise "Unknown type: #{type}" unless numo_type
+          # TODO automatic conversion?
+          # int => float
+          # bool => object
+          if (type.include?("int") || type == "bool") && column.n_nulls > 0
+            raise "Nulls not supported for #{type} column: #{k}"
+          end
+          # TODO improve performance
+          data[k] = numo_type.cast(column.data.values)
+        end
+      end
+      DataFrame.new(data)
+    end
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rover-df
 version: !ruby/object:Gem::Version
-  version: 0.2.5
+  version: 0.2.8
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-09-25 00:00:00.000000000 Z
+date: 2022-03-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -58,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.2.22
+rubygems_version: 3.3.7
 signing_key:
 specification_version: 4
 summary: Simple, powerful data frames for Ruby