rover-df 0.2.5 → 0.2.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ca39a558c3c12103f03fed4cb8f007fbd00a1f8e84b839916fd0010aae4613ba
4
- data.tar.gz: 43df8cdc415cc036ac383f30b7c91a35b644067a3cb8ea199abd7452b98298d5
3
+ metadata.gz: 65d2fda186484e920421543e2f0203635054ccb8a23250bd3fc6a9d8c328725f
4
+ data.tar.gz: e4cd1e6d69e1e4f340f6692111476a5be9405f348841cfba6f6c431f04d85347
5
5
  SHA512:
6
- metadata.gz: 2724c7e85ee7921f277be833cf89be638c14cbb37a44411bba86c42cacffe7c0e4b82ea04d4dfb3d694c6429ba41bc8e8c10f7cb40e5d34bf59d14755858735f
7
- data.tar.gz: fa860158decbca0a0b35ccb82e6f73d9a513c37b483eca52d140842d5dd255899a2e1ded3ec4375a492b86d3ec09ffa53d4871e05f1fdad39f3d2630215417dc
6
+ metadata.gz: c720f3bc45178f938c20546ac1b7279ae047affafce5e06cff4f703e1d8ff7a99c1bca94a3f40cb7d26945d770bf136a2adc3477cf6ffc3cdaad9a15aa6090a1
7
+ data.tar.gz: c44135cc0e70b08b72e1084565ef3479bcb92000bf34662b76a25933e68ad33a584afae071ddebfd5724ad61fe7e7dbc283241d7194c532dd70f36b1358b266d
data/CHANGELOG.md CHANGED
@@ -1,3 +1,19 @@
1
+ ## 0.2.8 (2022-03-15)
2
+
3
+ - Added `group` and `stacked` options to `plot`
4
+ - Improved performance of `read_csv` and `parse_csv`
5
+
6
+ ## 0.2.7 (2022-01-16)
7
+
8
+ - Added support for booleans to Parquet methods
9
+ - Added support for creating data frames from `ActiveRecord::Result`
10
+ - Added `types` option to `read_parquet` and `parse_parquet` methods
11
+
12
+ ## 0.2.6 (2021-10-27)
13
+
14
+ - Added support for `nil` headers to `read_csv` and `parse_csv`
15
+ - Added `read_parquet`, `parse_parquet`, and `to_parquet` methods
16
+
1
17
  ## 0.2.5 (2021-09-25)
2
18
 
3
19
  - Fixed column types with joins
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2020-2021 Andrew Kane
1
+ Copyright (c) 2020-2022 Andrew Kane
2
2
 
3
3
  MIT License
4
4
 
data/README.md CHANGED
@@ -13,7 +13,7 @@ Simple, powerful data frames for Ruby
13
13
  Add this line to your application’s Gemfile:
14
14
 
15
15
  ```ruby
16
- gem 'rover-df'
16
+ gem "rover-df"
17
17
  ```
18
18
 
19
19
  ## Intro
@@ -61,6 +61,14 @@ Rover.read_csv("file.csv")
61
61
  Rover.parse_csv("CSV,data,string")
62
62
  ```
63
63
 
64
+ From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
65
+
66
+ ```ruby
67
+ Rover.read_parquet("file.parquet")
68
+ # or
69
+ Rover.parse_parquet("PAR1...")
70
+ ```
71
+
64
72
  ## Attributes
65
73
 
66
74
  Get number of rows
@@ -89,7 +97,7 @@ Select a column
89
97
  df[:a]
90
98
  ```
91
99
 
92
- > Note that strings and symbols are different keys, just like hashes
100
+ > Note that strings and symbols are different keys, just like hashes. Creating a data frame from Active Record, a CSV, or Parquet uses strings.
93
101
 
94
102
  Select multiple columns
95
103
 
@@ -228,7 +236,7 @@ df.group(:a).max(:b)
228
236
  Multiple groups
229
237
 
230
238
  ```ruby
231
- df.group([:a, :b]).count
239
+ df.group(:a, :b).count
232
240
  ```
233
241
 
234
242
  ## Visualization
@@ -236,7 +244,7 @@ df.group([:a, :b]).count
236
244
  Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile:
237
245
 
238
246
  ```ruby
239
- gem 'vega'
247
+ gem "vega"
240
248
  ```
241
249
 
242
250
  And use:
@@ -251,6 +259,18 @@ Specify the chart type (`line`, `pie`, `column`, `bar`, `area`, or `scatter`)
251
259
  df.plot(:a, :b, type: "pie")
252
260
  ```
253
261
 
262
+ Group data
263
+
264
+ ```ruby
265
+ df.plot(:a, :b, group: :c)
266
+ ```
267
+
268
+ Stacked columns or bars
269
+
270
+ ```ruby
271
+ df.plot(:a, :b, group: :c, stacked: true)
272
+ ```
273
+
254
274
  ## Updating Data
255
275
 
256
276
  Add a new column
@@ -393,6 +413,12 @@ CSV
393
413
  df.to_csv
394
414
  ```
395
415
 
416
+ Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
417
+
418
+ ```ruby
419
+ df.to_parquet
420
+ ```
421
+
396
422
  ## Types
397
423
 
398
424
  You can specify column types when creating a data frame
@@ -40,8 +40,8 @@ module Rover
40
40
  vectors.each do |k, v|
41
41
  @vectors[k] = to_vector(v, type: types[k])
42
42
  end
43
- elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
44
- result = data.connection.select_all(data.all.to_sql)
43
+ elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base) || data.is_a?(ActiveRecord::Result))
44
+ result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.all.to_sql)
45
45
  result.columns.each_with_index do |k, i|
46
46
  @vectors[k] = to_vector(result.rows.map { |r| r[i] }, type: types[k])
47
47
  end
@@ -235,6 +235,44 @@ module Rover
235
235
  end
236
236
  end
237
237
 
238
+ def to_parquet
239
+ require "parquet"
240
+
241
+ schema = {}
242
+ types.each do |name, type|
243
+ schema[name] =
244
+ case type
245
+ when :int
246
+ :int64
247
+ when :uint
248
+ :uint64
249
+ when :float
250
+ :double
251
+ when :float32
252
+ :float
253
+ when :bool
254
+ :boolean
255
+ when :object
256
+ if @vectors[name].all? { |v| v.is_a?(String) }
257
+ :string
258
+ else
259
+ raise "Unknown type"
260
+ end
261
+ else
262
+ type
263
+ end
264
+ end
265
+ # TODO improve performance
266
+ raw_records = []
267
+ size.times do |i|
268
+ raw_records << @vectors.map { |_, v| v[i] }
269
+ end
270
+ table = Arrow::Table.new(schema, raw_records)
271
+ buffer = Arrow::ResizableBuffer.new(1024)
272
+ table.save(buffer, format: :parquet)
273
+ buffer.data.to_s
274
+ end
275
+
238
276
  # for IRuby
239
277
  def to_html
240
278
  require "iruby"
@@ -363,7 +401,7 @@ module Rover
363
401
  keys.all? { |k| self[k].to_numo == other[k].to_numo }
364
402
  end
365
403
 
366
- def plot(x = nil, y = nil, type: nil)
404
+ def plot(x = nil, y = nil, type: nil, group: nil, stacked: nil)
367
405
  require "vega"
368
406
 
369
407
  raise ArgumentError, "Must specify columns" if keys.size != 2 && (!x || !y)
@@ -378,7 +416,7 @@ module Rover
378
416
  raise "Cannot determine type. Use the type option."
379
417
  end
380
418
  end
381
- data = self[[x, y]]
419
+ data = self[group.nil? ? [x, y] : [x, y, group]]
382
420
 
383
421
  case type
384
422
  when "line", "area"
@@ -392,16 +430,20 @@ module Rover
392
430
  end
393
431
 
394
432
  scale = x_type == "temporal" ? {type: "utc"} : {}
433
+ encoding = {
434
+ x: {field: x, type: x_type, scale: scale},
435
+ y: {field: y, type: "quantitative"}
436
+ }
437
+ encoding[:color] = {field: group} if group
395
438
 
396
439
  Vega.lite
397
440
  .data(data)
398
441
  .mark(type: type, tooltip: true, interpolate: "cardinal", point: {size: 60})
399
- .encoding(
400
- x: {field: x, type: x_type, scale: scale},
401
- y: {field: y, type: "quantitative"}
402
- )
442
+ .encoding(encoding)
403
443
  .config(axis: {labelFontSize: 12})
404
444
  when "pie"
445
+ raise ArgumentError, "Cannot use group option with pie chart" unless group.nil?
446
+
405
447
  Vega.lite
406
448
  .data(data)
407
449
  .mark(type: "arc", tooltip: true)
@@ -411,34 +453,48 @@ module Rover
411
453
  )
412
454
  .view(stroke: nil)
413
455
  when "column"
456
+ encoding = {
457
+ x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
458
+ y: {field: y, type: "quantitative"}
459
+ }
460
+ if group
461
+ encoding[:color] = {field: group}
462
+ encoding[:xOffset] = {field: group} unless stacked
463
+ end
464
+
414
465
  Vega.lite
415
466
  .data(data)
416
467
  .mark(type: "bar", tooltip: true)
417
- .encoding(
418
- # TODO determine label angle
419
- x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
420
- y: {field: y, type: "quantitative"}
421
- )
468
+ .encoding(encoding)
422
469
  .config(axis: {labelFontSize: 12})
423
470
  when "bar"
471
+ encoding = {
472
+ # TODO determine label angle
473
+ y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
474
+ x: {field: y, type: "quantitative"}
475
+ }
476
+ if group
477
+ encoding[:color] = {field: group}
478
+ encoding[:yOffset] = {field: group} unless stacked
479
+ end
480
+
424
481
  Vega.lite
425
482
  .data(data)
426
483
  .mark(type: "bar", tooltip: true)
427
- .encoding(
428
- # TODO determine label angle
429
- y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
430
- x: {field: y, type: "quantitative"}
431
- )
484
+ .encoding(encoding)
432
485
  .config(axis: {labelFontSize: 12})
433
486
  when "scatter"
487
+ encoding = {
488
+ x: {field: x, type: "quantitative", scale: {zero: false}},
489
+ y: {field: y, type: "quantitative", scale: {zero: false}},
490
+ size: {value: 60}
491
+ }
492
+ encoding[:color] = {field: group} if group
493
+
434
494
  Vega.lite
435
495
  .data(data)
436
496
  .mark(type: "circle", tooltip: true)
437
- .encoding(
438
- x: {field: x, type: "quantitative", scale: {zero: false}},
439
- y: {field: y, type: "quantitative", scale: {zero: false}},
440
- size: {value: 60}
441
- )
497
+ .encoding(encoding)
442
498
  .config(axis: {labelFontSize: 12})
443
499
  else
444
500
  raise ArgumentError, "Invalid type: #{type}"
data/lib/rover/group.rb CHANGED
@@ -1,10 +1,12 @@
1
1
  module Rover
2
2
  class Group
3
+ # TODO raise ArgumentError for empty columns in 0.3.0
3
4
  def initialize(df, columns)
4
5
  @df = df
5
6
  @columns = columns
6
7
  end
7
8
 
9
+ # TODO raise ArgumentError for empty columns in 0.3.0
8
10
  def group(*columns)
9
11
  Group.new(@df, @columns + columns.flatten)
10
12
  end
@@ -22,6 +24,14 @@ module Rover
22
24
  end
23
25
  end
24
26
 
27
+ def plot(*args, **options)
28
+ raise ArgumentError, "Multiple groups not supported" if @columns.size > 1
29
+ # same message as Ruby
30
+ raise ArgumentError, "unknown keyword: :group" if options.key?(:group)
31
+
32
+ @df.plot(*args, **options, group: @columns.first)
33
+ end
34
+
25
35
  private
26
36
 
27
37
  # TODO make more efficient
data/lib/rover/vector.rb CHANGED
@@ -359,6 +359,7 @@ module Rover
359
359
  data = data.to_a
360
360
 
361
361
  if type
362
+ data = data.map { |v| v || Float::NAN } if [:float, :float32].include?(type)
362
363
  data = numo_type.cast(data)
363
364
  else
364
365
  data =
data/lib/rover/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Rover
2
- VERSION = "0.2.5"
2
+ VERSION = "0.2.8"
3
3
  end
data/lib/rover.rb CHANGED
@@ -9,36 +9,125 @@ require "rover/version"
9
9
 
10
10
  module Rover
11
11
  class << self
12
- def read_csv(path, types: nil, **options)
13
- require "csv"
14
- csv_to_df(CSV.read(path, **csv_options(options)), types: types, headers: options[:headers])
12
+ def read_csv(path, **options)
13
+ csv_to_df(**options) do |csv_options|
14
+ CSV.read(path, **csv_options)
15
+ end
15
16
  end
16
17
 
17
- def parse_csv(str, types: nil, **options)
18
- require "csv"
19
- csv_to_df(CSV.parse(str, **csv_options(options)), types: types, headers: options[:headers])
18
+ def parse_csv(str, **options)
19
+ csv_to_df(**options) do |csv_options|
20
+ CSV.parse(str, **csv_options)
21
+ end
20
22
  end
21
23
 
22
- private
24
+ def read_parquet(path, **options)
25
+ parquet_to_df(**options) do
26
+ Arrow::Table.load(path)
27
+ end
28
+ end
23
29
 
24
- # TODO use date converter
25
- def csv_options(options)
26
- options = {headers: true, converters: :numeric}.merge(options)
27
- raise ArgumentError, "Must specify headers" unless options[:headers]
28
- options
30
+ def parse_parquet(str, **options)
31
+ parquet_to_df(**options) do
32
+ Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet)
33
+ end
29
34
  end
30
35
 
31
- def csv_to_df(table, types: nil, headers: nil)
32
- if headers && headers.size < table.headers.size
33
- raise ArgumentError, "Expected #{table.headers.size} headers, got #{headers.size}"
36
+ private
37
+
38
+ def csv_to_df(types: nil, headers: nil, **csv_options)
39
+ require "csv"
40
+
41
+ raise ArgumentError, "Must specify headers" if headers == false
42
+
43
+ # TODO use date converter
44
+ table = yield({converters: :numeric}.merge(csv_options))
45
+
46
+ headers = nil if headers == true
47
+ if headers && table.first && headers.size < table.first.size
48
+ raise ArgumentError, "Expected #{table.first.size} headers, got #{headers.size}"
49
+ end
50
+
51
+ table_headers = (headers || table.shift || []).dup
52
+ # keep same behavior as headers: true
53
+ if table.first
54
+ while table_headers.size < table.first.size
55
+ table_headers << nil
56
+ end
34
57
  end
35
58
 
36
- table.by_col!
37
59
  data = {}
38
- table.each do |k, v|
39
- data[k] = v
60
+ keys = table_headers.map { |k| [k, true] }.to_h
61
+ unnamed_suffix = 1
62
+ table_headers.each_with_index do |k, i|
63
+ # TODO do same for empty string in 0.3.0
64
+ if k.nil?
65
+ k = "unnamed"
66
+ while keys.include?(k)
67
+ unnamed_suffix += 1
68
+ k = "unnamed#{unnamed_suffix}"
69
+ end
70
+ keys[k] = true
71
+ end
72
+ table_headers[i] = k
40
73
  end
74
+
75
+ table_headers.each_with_index do |k, i|
76
+ # use first value for duplicate headers like headers: true
77
+ next if data[k]
78
+
79
+ values = []
80
+ table.each do |row|
81
+ values << row[i]
82
+ end
83
+ data[k] = values
84
+ end
85
+
41
86
  DataFrame.new(data, types: types)
42
87
  end
88
+
89
+ PARQUET_TYPE_MAPPING = {
90
+ "bool" => Numo::Bit,
91
+ "float" => Numo::SFloat,
92
+ "double" => Numo::DFloat,
93
+ "int8" => Numo::Int8,
94
+ "int16" => Numo::Int16,
95
+ "int32" => Numo::Int32,
96
+ "int64" => Numo::Int64,
97
+ "string" => Numo::RObject,
98
+ "uint8" => Numo::UInt8,
99
+ "uint16" => Numo::UInt16,
100
+ "uint32" => Numo::UInt32,
101
+ "uint64" => Numo::UInt64
102
+ }
103
+
104
+ def parquet_to_df(types: nil)
105
+ require "parquet"
106
+
107
+ table = yield
108
+ data = {}
109
+ types ||= {}
110
+ table.each_column do |column|
111
+ k = column.field.name
112
+ if types[k]
113
+ data[k] = Vector.new(column.data.values, type: types[k])
114
+ else
115
+ type = column.field.data_type.to_s
116
+ numo_type = PARQUET_TYPE_MAPPING[type]
117
+ raise "Unknown type: #{type}" unless numo_type
118
+
119
+ # TODO automatic conversion?
120
+ # int => float
121
+ # bool => object
122
+ if (type.include?("int") || type == "bool") && column.n_nulls > 0
123
+ raise "Nulls not supported for #{type} column: #{k}"
124
+ end
125
+
126
+ # TODO improve performance
127
+ data[k] = numo_type.cast(column.data.values)
128
+ end
129
+ end
130
+ DataFrame.new(data)
131
+ end
43
132
  end
44
133
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rover-df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 0.2.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-09-25 00:00:00.000000000 Z
11
+ date: 2022-03-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -58,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
58
58
  - !ruby/object:Gem::Version
59
59
  version: '0'
60
60
  requirements: []
61
- rubygems_version: 3.2.22
61
+ rubygems_version: 3.3.7
62
62
  signing_key:
63
63
  specification_version: 4
64
64
  summary: Simple, powerful data frames for Ruby