rover-df 0.2.5 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ca39a558c3c12103f03fed4cb8f007fbd00a1f8e84b839916fd0010aae4613ba
4
- data.tar.gz: 43df8cdc415cc036ac383f30b7c91a35b644067a3cb8ea199abd7452b98298d5
3
+ metadata.gz: 65d2fda186484e920421543e2f0203635054ccb8a23250bd3fc6a9d8c328725f
4
+ data.tar.gz: e4cd1e6d69e1e4f340f6692111476a5be9405f348841cfba6f6c431f04d85347
5
5
  SHA512:
6
- metadata.gz: 2724c7e85ee7921f277be833cf89be638c14cbb37a44411bba86c42cacffe7c0e4b82ea04d4dfb3d694c6429ba41bc8e8c10f7cb40e5d34bf59d14755858735f
7
- data.tar.gz: fa860158decbca0a0b35ccb82e6f73d9a513c37b483eca52d140842d5dd255899a2e1ded3ec4375a492b86d3ec09ffa53d4871e05f1fdad39f3d2630215417dc
6
+ metadata.gz: c720f3bc45178f938c20546ac1b7279ae047affafce5e06cff4f703e1d8ff7a99c1bca94a3f40cb7d26945d770bf136a2adc3477cf6ffc3cdaad9a15aa6090a1
7
+ data.tar.gz: c44135cc0e70b08b72e1084565ef3479bcb92000bf34662b76a25933e68ad33a584afae071ddebfd5724ad61fe7e7dbc283241d7194c532dd70f36b1358b266d
data/CHANGELOG.md CHANGED
@@ -1,3 +1,19 @@
1
+ ## 0.2.8 (2022-03-15)
2
+
3
+ - Added `group` and `stacked` options to `plot`
4
+ - Improved performance of `read_csv` and `parse_csv`
5
+
6
+ ## 0.2.7 (2022-01-16)
7
+
8
+ - Added support for booleans to Parquet methods
9
+ - Added support for creating data frames from `ActiveRecord::Result`
10
+ - Added `types` option to `read_parquet` and `parse_parquet` methods
11
+
12
+ ## 0.2.6 (2021-10-27)
13
+
14
+ - Added support for `nil` headers to `read_csv` and `parse_csv`
15
+ - Added `read_parquet`, `parse_parquet`, and `to_parquet` methods
16
+
1
17
  ## 0.2.5 (2021-09-25)
2
18
 
3
19
  - Fixed column types with joins
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2020-2021 Andrew Kane
1
+ Copyright (c) 2020-2022 Andrew Kane
2
2
 
3
3
  MIT License
4
4
 
data/README.md CHANGED
@@ -13,7 +13,7 @@ Simple, powerful data frames for Ruby
13
13
  Add this line to your application’s Gemfile:
14
14
 
15
15
  ```ruby
16
- gem 'rover-df'
16
+ gem "rover-df"
17
17
  ```
18
18
 
19
19
  ## Intro
@@ -61,6 +61,14 @@ Rover.read_csv("file.csv")
61
61
  Rover.parse_csv("CSV,data,string")
62
62
  ```
63
63
 
64
+ From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
65
+
66
+ ```ruby
67
+ Rover.read_parquet("file.parquet")
68
+ # or
69
+ Rover.parse_parquet("PAR1...")
70
+ ```
71
+
64
72
  ## Attributes
65
73
 
66
74
  Get number of rows
@@ -89,7 +97,7 @@ Select a column
89
97
  df[:a]
90
98
  ```
91
99
 
92
- > Note that strings and symbols are different keys, just like hashes
100
+ > Note that strings and symbols are different keys, just like hashes. Creating a data frame from Active Record, a CSV, or Parquet uses strings.
93
101
 
94
102
  Select multiple columns
95
103
 
@@ -228,7 +236,7 @@ df.group(:a).max(:b)
228
236
  Multiple groups
229
237
 
230
238
  ```ruby
231
- df.group([:a, :b]).count
239
+ df.group(:a, :b).count
232
240
  ```
233
241
 
234
242
  ## Visualization
@@ -236,7 +244,7 @@ df.group([:a, :b]).count
236
244
  Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile:
237
245
 
238
246
  ```ruby
239
- gem 'vega'
247
+ gem "vega"
240
248
  ```
241
249
 
242
250
  And use:
@@ -251,6 +259,18 @@ Specify the chart type (`line`, `pie`, `column`, `bar`, `area`, or `scatter`)
251
259
  df.plot(:a, :b, type: "pie")
252
260
  ```
253
261
 
262
+ Group data
263
+
264
+ ```ruby
265
+ df.plot(:a, :b, group: :c)
266
+ ```
267
+
268
+ Stacked columns or bars
269
+
270
+ ```ruby
271
+ df.plot(:a, :b, group: :c, stacked: true)
272
+ ```
273
+
254
274
  ## Updating Data
255
275
 
256
276
  Add a new column
@@ -393,6 +413,12 @@ CSV
393
413
  df.to_csv
394
414
  ```
395
415
 
416
+ Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
417
+
418
+ ```ruby
419
+ df.to_parquet
420
+ ```
421
+
396
422
  ## Types
397
423
 
398
424
  You can specify column types when creating a data frame
data/lib/rover/data_frame.rb CHANGED
@@ -40,8 +40,8 @@ module Rover
40
40
  vectors.each do |k, v|
41
41
  @vectors[k] = to_vector(v, type: types[k])
42
42
  end
43
- elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
44
- result = data.connection.select_all(data.all.to_sql)
43
+ elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base) || data.is_a?(ActiveRecord::Result))
44
+ result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.all.to_sql)
45
45
  result.columns.each_with_index do |k, i|
46
46
  @vectors[k] = to_vector(result.rows.map { |r| r[i] }, type: types[k])
47
47
  end
@@ -235,6 +235,44 @@ module Rover
235
235
  end
236
236
  end
237
237
 
238
+ def to_parquet
239
+ require "parquet"
240
+
241
+ schema = {}
242
+ types.each do |name, type|
243
+ schema[name] =
244
+ case type
245
+ when :int
246
+ :int64
247
+ when :uint
248
+ :uint64
249
+ when :float
250
+ :double
251
+ when :float32
252
+ :float
253
+ when :bool
254
+ :boolean
255
+ when :object
256
+ if @vectors[name].all? { |v| v.is_a?(String) }
257
+ :string
258
+ else
259
+ raise "Unknown type"
260
+ end
261
+ else
262
+ type
263
+ end
264
+ end
265
+ # TODO improve performance
266
+ raw_records = []
267
+ size.times do |i|
268
+ raw_records << @vectors.map { |_, v| v[i] }
269
+ end
270
+ table = Arrow::Table.new(schema, raw_records)
271
+ buffer = Arrow::ResizableBuffer.new(1024)
272
+ table.save(buffer, format: :parquet)
273
+ buffer.data.to_s
274
+ end
275
+
238
276
  # for IRuby
239
277
  def to_html
240
278
  require "iruby"
@@ -363,7 +401,7 @@ module Rover
363
401
  keys.all? { |k| self[k].to_numo == other[k].to_numo }
364
402
  end
365
403
 
366
- def plot(x = nil, y = nil, type: nil)
404
+ def plot(x = nil, y = nil, type: nil, group: nil, stacked: nil)
367
405
  require "vega"
368
406
 
369
407
  raise ArgumentError, "Must specify columns" if keys.size != 2 && (!x || !y)
@@ -378,7 +416,7 @@ module Rover
378
416
  raise "Cannot determine type. Use the type option."
379
417
  end
380
418
  end
381
- data = self[[x, y]]
419
+ data = self[group.nil? ? [x, y] : [x, y, group]]
382
420
 
383
421
  case type
384
422
  when "line", "area"
@@ -392,16 +430,20 @@ module Rover
392
430
  end
393
431
 
394
432
  scale = x_type == "temporal" ? {type: "utc"} : {}
433
+ encoding = {
434
+ x: {field: x, type: x_type, scale: scale},
435
+ y: {field: y, type: "quantitative"}
436
+ }
437
+ encoding[:color] = {field: group} if group
395
438
 
396
439
  Vega.lite
397
440
  .data(data)
398
441
  .mark(type: type, tooltip: true, interpolate: "cardinal", point: {size: 60})
399
- .encoding(
400
- x: {field: x, type: x_type, scale: scale},
401
- y: {field: y, type: "quantitative"}
402
- )
442
+ .encoding(encoding)
403
443
  .config(axis: {labelFontSize: 12})
404
444
  when "pie"
445
+ raise ArgumentError, "Cannot use group option with pie chart" unless group.nil?
446
+
405
447
  Vega.lite
406
448
  .data(data)
407
449
  .mark(type: "arc", tooltip: true)
@@ -411,34 +453,48 @@ module Rover
411
453
  )
412
454
  .view(stroke: nil)
413
455
  when "column"
456
+ encoding = {
457
+ x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
458
+ y: {field: y, type: "quantitative"}
459
+ }
460
+ if group
461
+ encoding[:color] = {field: group}
462
+ encoding[:xOffset] = {field: group} unless stacked
463
+ end
464
+
414
465
  Vega.lite
415
466
  .data(data)
416
467
  .mark(type: "bar", tooltip: true)
417
- .encoding(
418
- # TODO determine label angle
419
- x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
420
- y: {field: y, type: "quantitative"}
421
- )
468
+ .encoding(encoding)
422
469
  .config(axis: {labelFontSize: 12})
423
470
  when "bar"
471
+ encoding = {
472
+ # TODO determine label angle
473
+ y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
474
+ x: {field: y, type: "quantitative"}
475
+ }
476
+ if group
477
+ encoding[:color] = {field: group}
478
+ encoding[:yOffset] = {field: group} unless stacked
479
+ end
480
+
424
481
  Vega.lite
425
482
  .data(data)
426
483
  .mark(type: "bar", tooltip: true)
427
- .encoding(
428
- # TODO determine label angle
429
- y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
430
- x: {field: y, type: "quantitative"}
431
- )
484
+ .encoding(encoding)
432
485
  .config(axis: {labelFontSize: 12})
433
486
  when "scatter"
487
+ encoding = {
488
+ x: {field: x, type: "quantitative", scale: {zero: false}},
489
+ y: {field: y, type: "quantitative", scale: {zero: false}},
490
+ size: {value: 60}
491
+ }
492
+ encoding[:color] = {field: group} if group
493
+
434
494
  Vega.lite
435
495
  .data(data)
436
496
  .mark(type: "circle", tooltip: true)
437
- .encoding(
438
- x: {field: x, type: "quantitative", scale: {zero: false}},
439
- y: {field: y, type: "quantitative", scale: {zero: false}},
440
- size: {value: 60}
441
- )
497
+ .encoding(encoding)
442
498
  .config(axis: {labelFontSize: 12})
443
499
  else
444
500
  raise ArgumentError, "Invalid type: #{type}"
data/lib/rover/group.rb CHANGED
@@ -1,10 +1,12 @@
1
1
  module Rover
2
2
  class Group
3
+ # TODO raise ArgumentError for empty columns in 0.3.0
3
4
  def initialize(df, columns)
4
5
  @df = df
5
6
  @columns = columns
6
7
  end
7
8
 
9
+ # TODO raise ArgumentError for empty columns in 0.3.0
8
10
  def group(*columns)
9
11
  Group.new(@df, @columns + columns.flatten)
10
12
  end
@@ -22,6 +24,14 @@ module Rover
22
24
  end
23
25
  end
24
26
 
27
+ def plot(*args, **options)
28
+ raise ArgumentError, "Multiple groups not supported" if @columns.size > 1
29
+ # same message as Ruby
30
+ raise ArgumentError, "unknown keyword: :group" if options.key?(:group)
31
+
32
+ @df.plot(*args, **options, group: @columns.first)
33
+ end
34
+
25
35
  private
26
36
 
27
37
  # TODO make more efficient
data/lib/rover/vector.rb CHANGED
@@ -359,6 +359,7 @@ module Rover
359
359
  data = data.to_a
360
360
 
361
361
  if type
362
+ data = data.map { |v| v || Float::NAN } if [:float, :float32].include?(type)
362
363
  data = numo_type.cast(data)
363
364
  else
364
365
  data =
data/lib/rover/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Rover
2
- VERSION = "0.2.5"
2
+ VERSION = "0.2.8"
3
3
  end
data/lib/rover.rb CHANGED
@@ -9,36 +9,125 @@ require "rover/version"
9
9
 
10
10
  module Rover
11
11
  class << self
12
- def read_csv(path, types: nil, **options)
13
- require "csv"
14
- csv_to_df(CSV.read(path, **csv_options(options)), types: types, headers: options[:headers])
12
+ def read_csv(path, **options)
13
+ csv_to_df(**options) do |csv_options|
14
+ CSV.read(path, **csv_options)
15
+ end
15
16
  end
16
17
 
17
- def parse_csv(str, types: nil, **options)
18
- require "csv"
19
- csv_to_df(CSV.parse(str, **csv_options(options)), types: types, headers: options[:headers])
18
+ def parse_csv(str, **options)
19
+ csv_to_df(**options) do |csv_options|
20
+ CSV.parse(str, **csv_options)
21
+ end
20
22
  end
21
23
 
22
- private
24
+ def read_parquet(path, **options)
25
+ parquet_to_df(**options) do
26
+ Arrow::Table.load(path)
27
+ end
28
+ end
23
29
 
24
- # TODO use date converter
25
- def csv_options(options)
26
- options = {headers: true, converters: :numeric}.merge(options)
27
- raise ArgumentError, "Must specify headers" unless options[:headers]
28
- options
30
+ def parse_parquet(str, **options)
31
+ parquet_to_df(**options) do
32
+ Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet)
33
+ end
29
34
  end
30
35
 
31
- def csv_to_df(table, types: nil, headers: nil)
32
- if headers && headers.size < table.headers.size
33
- raise ArgumentError, "Expected #{table.headers.size} headers, got #{headers.size}"
36
+ private
37
+
38
+ def csv_to_df(types: nil, headers: nil, **csv_options)
39
+ require "csv"
40
+
41
+ raise ArgumentError, "Must specify headers" if headers == false
42
+
43
+ # TODO use date converter
44
+ table = yield({converters: :numeric}.merge(csv_options))
45
+
46
+ headers = nil if headers == true
47
+ if headers && table.first && headers.size < table.first.size
48
+ raise ArgumentError, "Expected #{table.first.size} headers, got #{headers.size}"
49
+ end
50
+
51
+ table_headers = (headers || table.shift || []).dup
52
+ # keep same behavior as headers: true
53
+ if table.first
54
+ while table_headers.size < table.first.size
55
+ table_headers << nil
56
+ end
34
57
  end
35
58
 
36
- table.by_col!
37
59
  data = {}
38
- table.each do |k, v|
39
- data[k] = v
60
+ keys = table_headers.map { |k| [k, true] }.to_h
61
+ unnamed_suffix = 1
62
+ table_headers.each_with_index do |k, i|
63
+ # TODO do same for empty string in 0.3.0
64
+ if k.nil?
65
+ k = "unnamed"
66
+ while keys.include?(k)
67
+ unnamed_suffix += 1
68
+ k = "unnamed#{unnamed_suffix}"
69
+ end
70
+ keys[k] = true
71
+ end
72
+ table_headers[i] = k
40
73
  end
74
+
75
+ table_headers.each_with_index do |k, i|
76
+ # use first value for duplicate headers like headers: true
77
+ next if data[k]
78
+
79
+ values = []
80
+ table.each do |row|
81
+ values << row[i]
82
+ end
83
+ data[k] = values
84
+ end
85
+
41
86
  DataFrame.new(data, types: types)
42
87
  end
88
+
89
+ PARQUET_TYPE_MAPPING = {
90
+ "bool" => Numo::Bit,
91
+ "float" => Numo::SFloat,
92
+ "double" => Numo::DFloat,
93
+ "int8" => Numo::Int8,
94
+ "int16" => Numo::Int16,
95
+ "int32" => Numo::Int32,
96
+ "int64" => Numo::Int64,
97
+ "string" => Numo::RObject,
98
+ "uint8" => Numo::UInt8,
99
+ "uint16" => Numo::UInt16,
100
+ "uint32" => Numo::UInt32,
101
+ "uint64" => Numo::UInt64
102
+ }
103
+
104
+ def parquet_to_df(types: nil)
105
+ require "parquet"
106
+
107
+ table = yield
108
+ data = {}
109
+ types ||= {}
110
+ table.each_column do |column|
111
+ k = column.field.name
112
+ if types[k]
113
+ data[k] = Vector.new(column.data.values, type: types[k])
114
+ else
115
+ type = column.field.data_type.to_s
116
+ numo_type = PARQUET_TYPE_MAPPING[type]
117
+ raise "Unknown type: #{type}" unless numo_type
118
+
119
+ # TODO automatic conversion?
120
+ # int => float
121
+ # bool => object
122
+ if (type.include?("int") || type == "bool") && column.n_nulls > 0
123
+ raise "Nulls not supported for #{type} column: #{k}"
124
+ end
125
+
126
+ # TODO improve performance
127
+ data[k] = numo_type.cast(column.data.values)
128
+ end
129
+ end
130
+ DataFrame.new(data)
131
+ end
43
132
  end
44
133
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rover-df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 0.2.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-09-25 00:00:00.000000000 Z
11
+ date: 2022-03-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -58,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
58
58
  - !ruby/object:Gem::Version
59
59
  version: '0'
60
60
  requirements: []
61
- rubygems_version: 3.2.22
61
+ rubygems_version: 3.3.7
62
62
  signing_key:
63
63
  specification_version: 4
64
64
  summary: Simple, powerful data frames for Ruby