rover-df 0.2.7 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c75bed3989211e806e54c296290e5f7b3af236a15742daac876e211e3ca5a76f
4
- data.tar.gz: 5865ff8f1d0036423f18cfee867da63214ee50f79d373b0f0f244853d8efbefa
3
+ metadata.gz: e7acc7d0f290ab905594bac7ab1641572e717346f1d0546b3d8b783bcd1e5aca
4
+ data.tar.gz: 2569b64c237ef836884be2077e8bb8107512d6946631fdb217fb276439412419
5
5
  SHA512:
6
- metadata.gz: 11718bc8ade75a605e92cabe05c29e55c6d4dfe427cd5ada0a8a216db678b32a88f4a43843d1e7dcda7b7a64adb63b76969f1d958e91ca57c4f71989632e14aa
7
- data.tar.gz: 16940236090625bef69cb14d6d9f9f50720314edea1b5892f60443799e5389700ddfb0d79a29ee1e193168097add9d7195799e7f049d85f9c9dc9c443843a678
6
+ metadata.gz: dc46c174fced1c55a96fff28a5c69dfe5a508de527432dac3801cf65375093e290860112191444ceaa23023456452f9331406cdf360a7e689f0db27c289c157c
7
+ data.tar.gz: 770174c6fb1cc8a52cd47ef77eae871cb5a346d752e9c1c13b8cb44fdc5d791b60fea3d5480bcaa4453b3bca1e83e32af57a57ff0c2a319308d7206d620fa75b
data/CHANGELOG.md CHANGED
@@ -1,3 +1,24 @@
1
+ ## 0.3.1 (2022-05-18)
2
+
3
+ - Added `to!` to vectors
4
+ - Fixed error with `nil` and `:float64` type
5
+ - Fixed `:header_converters` option with `read_csv` and `parse_csv`
6
+
7
+ ## 0.3.0 (2022-04-03)
8
+
9
+ - Added `deep_dup` method to data frames
10
+ - Changed `:int` to `:int64`, `:uint` to `:uint64`, and `:float` to `:float64` for type methods
11
+ - Changed missing column to raise `KeyError` instead of `ArgumentError` for aggregate methods
12
+ - Changed passing too many headers to `read_csv` and `parse_csv` to raise `ArgumentError`
13
+ - Changed empty string in CSV headers to match behavior of `nil`
14
+ - Fixed `clone` and `dup` method for vectors
15
+ - Dropped support for Ruby < 2.7
16
+
17
+ ## 0.2.8 (2022-03-15)
18
+
19
+ - Added `group` and `stacked` options to `plot`
20
+ - Improved performance of `read_csv` and `parse_csv`
21
+
1
22
  ## 0.2.7 (2022-01-16)
2
23
 
3
24
  - Added support for booleans to Parquet methods
data/README.md CHANGED
@@ -13,7 +13,7 @@ Simple, powerful data frames for Ruby
13
13
  Add this line to your application’s Gemfile:
14
14
 
15
15
  ```ruby
16
- gem 'rover-df'
16
+ gem "rover-df"
17
17
  ```
18
18
 
19
19
  ## Intro
@@ -236,7 +236,7 @@ df.group(:a).max(:b)
236
236
  Multiple groups
237
237
 
238
238
  ```ruby
239
- df.group([:a, :b]).count
239
+ df.group(:a, :b).count
240
240
  ```
241
241
 
242
242
  ## Visualization
@@ -244,7 +244,7 @@ df.group([:a, :b]).count
244
244
  Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile:
245
245
 
246
246
  ```ruby
247
- gem 'vega'
247
+ gem "vega"
248
248
  ```
249
249
 
250
250
  And use:
@@ -259,6 +259,18 @@ Specify the chart type (`line`, `pie`, `column`, `bar`, `area`, or `scatter`)
259
259
  df.plot(:a, :b, type: "pie")
260
260
  ```
261
261
 
262
+ Group data
263
+
264
+ ```ruby
265
+ df.plot(:a, :b, group: :c)
266
+ ```
267
+
268
+ Stacked columns or bars
269
+
270
+ ```ruby
271
+ df.plot(:a, :b, group: :c, stacked: true)
272
+ ```
273
+
262
274
  ## Updating Data
263
275
 
264
276
  Add a new column
@@ -412,22 +424,22 @@ df.to_parquet
412
424
  You can specify column types when creating a data frame
413
425
 
414
426
  ```ruby
415
- Rover::DataFrame.new(data, types: {"a" => :int, "b" => :float})
427
+ Rover::DataFrame.new(data, types: {"a" => :int64, "b" => :float64})
416
428
  ```
417
429
 
418
430
  Or
419
431
 
420
432
  ```ruby
421
- Rover.read_csv("data.csv", types: {"a" => :int, "b" => :float})
433
+ Rover.read_csv("data.csv", types: {"a" => :int64, "b" => :float64})
422
434
  ```
423
435
 
424
436
  Supported types are:
425
437
 
426
- - boolean - `bool`
427
- - float - `float`, `float32`
428
- - integer - `int`, `int32`, `int16`, `int8`
429
- - unsigned integer - `uint`, `uint32`, `uint16`, `uint8`
430
- - object - `object`
438
+ - boolean - `:bool`
439
+ - float - `:float64`, `:float32`
440
+ - integer - `:int64`, `:int32`, `:int16`, `:int8`
441
+ - unsigned integer - `:uint64`, `:uint32`, `:uint16`, `:uint8`
442
+ - object - `:object`
431
443
 
432
444
  Get column types
433
445
 
@@ -444,7 +456,7 @@ df[:a].type
444
456
  Change the type of a column
445
457
 
446
458
  ```ruby
447
- df[:a] = df[:a].to(:int)
459
+ df[:a] = df[:a].to(:int32)
448
460
  ```
449
461
 
450
462
  ## History
@@ -72,7 +72,7 @@ module Rover
72
72
  # multiple columns
73
73
  df = DataFrame.new
74
74
  where.each do |k|
75
- check_column(k, true)
75
+ check_column(k)
76
76
  df[k] = @vectors[k]
77
77
  end
78
78
  df
@@ -102,7 +102,7 @@ module Rover
102
102
  def []=(k, v)
103
103
  check_key(k)
104
104
  v = to_vector(v, size: size)
105
- raise ArgumentError, "Size mismatch: expected #{size}, got #{v.size}" if @vectors.any? && v.size != size
105
+ raise ArgumentError, "Size mismatch (given #{v.size}, expected #{size})" if @vectors.any? && v.size != size
106
106
  @vectors[k] = v
107
107
  end
108
108
 
@@ -242,11 +242,11 @@ module Rover
242
242
  types.each do |name, type|
243
243
  schema[name] =
244
244
  case type
245
- when :int
245
+ when :int64
246
246
  :int64
247
- when :uint
247
+ when :uint64
248
248
  :uint64
249
- when :float
249
+ when :float64
250
250
  :double
251
251
  when :float32
252
252
  :float
@@ -346,10 +346,10 @@ module Rover
346
346
  end
347
347
  end
348
348
 
349
- def dup
349
+ def deep_dup
350
350
  df = DataFrame.new
351
351
  @vectors.each do |k, v|
352
- df[k] = v
352
+ df[k] = v.dup
353
353
  end
354
354
  df
355
355
  end
@@ -401,7 +401,7 @@ module Rover
401
401
  keys.all? { |k| self[k].to_numo == other[k].to_numo }
402
402
  end
403
403
 
404
- def plot(x = nil, y = nil, type: nil)
404
+ def plot(x = nil, y = nil, type: nil, group: nil, stacked: nil)
405
405
  require "vega"
406
406
 
407
407
  raise ArgumentError, "Must specify columns" if keys.size != 2 && (!x || !y)
@@ -416,7 +416,7 @@ module Rover
416
416
  raise "Cannot determine type. Use the type option."
417
417
  end
418
418
  end
419
- data = self[[x, y]]
419
+ data = self[group.nil? ? [x, y] : [x, y, group]]
420
420
 
421
421
  case type
422
422
  when "line", "area"
@@ -430,16 +430,20 @@ module Rover
430
430
  end
431
431
 
432
432
  scale = x_type == "temporal" ? {type: "utc"} : {}
433
+ encoding = {
434
+ x: {field: x, type: x_type, scale: scale},
435
+ y: {field: y, type: "quantitative"}
436
+ }
437
+ encoding[:color] = {field: group} if group
433
438
 
434
439
  Vega.lite
435
440
  .data(data)
436
441
  .mark(type: type, tooltip: true, interpolate: "cardinal", point: {size: 60})
437
- .encoding(
438
- x: {field: x, type: x_type, scale: scale},
439
- y: {field: y, type: "quantitative"}
440
- )
442
+ .encoding(encoding)
441
443
  .config(axis: {labelFontSize: 12})
442
444
  when "pie"
445
+ raise ArgumentError, "Cannot use group option with pie chart" unless group.nil?
446
+
443
447
  Vega.lite
444
448
  .data(data)
445
449
  .mark(type: "arc", tooltip: true)
@@ -449,34 +453,48 @@ module Rover
449
453
  )
450
454
  .view(stroke: nil)
451
455
  when "column"
456
+ encoding = {
457
+ x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
458
+ y: {field: y, type: "quantitative"}
459
+ }
460
+ if group
461
+ encoding[:color] = {field: group}
462
+ encoding[:xOffset] = {field: group} unless stacked
463
+ end
464
+
452
465
  Vega.lite
453
466
  .data(data)
454
467
  .mark(type: "bar", tooltip: true)
455
- .encoding(
456
- # TODO determine label angle
457
- x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
458
- y: {field: y, type: "quantitative"}
459
- )
468
+ .encoding(encoding)
460
469
  .config(axis: {labelFontSize: 12})
461
470
  when "bar"
471
+ encoding = {
472
+ # TODO determine label angle
473
+ y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
474
+ x: {field: y, type: "quantitative"}
475
+ }
476
+ if group
477
+ encoding[:color] = {field: group}
478
+ encoding[:yOffset] = {field: group} unless stacked
479
+ end
480
+
462
481
  Vega.lite
463
482
  .data(data)
464
483
  .mark(type: "bar", tooltip: true)
465
- .encoding(
466
- # TODO determine label angle
467
- y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
468
- x: {field: y, type: "quantitative"}
469
- )
484
+ .encoding(encoding)
470
485
  .config(axis: {labelFontSize: 12})
471
486
  when "scatter"
487
+ encoding = {
488
+ x: {field: x, type: "quantitative", scale: {zero: false}},
489
+ y: {field: y, type: "quantitative", scale: {zero: false}},
490
+ size: {value: 60}
491
+ }
492
+ encoding[:color] = {field: group} if group
493
+
472
494
  Vega.lite
473
495
  .data(data)
474
496
  .mark(type: "circle", tooltip: true)
475
- .encoding(
476
- x: {field: x, type: "quantitative", scale: {zero: false}},
477
- y: {field: y, type: "quantitative", scale: {zero: false}},
478
- size: {value: 60}
479
- )
497
+ .encoding(encoding)
480
498
  .config(axis: {labelFontSize: 12})
481
499
  else
482
500
  raise ArgumentError, "Invalid type: #{type}"
@@ -485,8 +503,20 @@ module Rover
485
503
 
486
504
  private
487
505
 
506
+ # for clone
507
+ def initialize_clone(_)
508
+ @vectors = @vectors.clone
509
+ super
510
+ end
511
+
512
+ # for dup
513
+ def initialize_dup(_)
514
+ @vectors = @vectors.dup
515
+ super
516
+ end
517
+
488
518
  def check_key(key)
489
- raise ArgumentError, "Key must be a string or symbol, got #{key.inspect}" unless key.is_a?(String) || key.is_a?(Symbol)
519
+ raise ArgumentError, "Key must be a String or Symbol, given #{key.class.name}" unless key.is_a?(String) || key.is_a?(Symbol)
490
520
  end
491
521
 
492
522
  # TODO make more efficient
@@ -547,19 +577,9 @@ module Rover
547
577
  raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
548
578
  end
549
579
 
550
- # TODO in 0.3.0
551
- # always use did_you_mean
552
- def check_column(key, did_you_mean = false)
580
+ def check_column(key)
553
581
  unless include?(key)
554
- if did_you_mean
555
- if RUBY_VERSION.to_f >= 2.6
556
- raise KeyError.new("Missing column: #{key}", receiver: self, key: key)
557
- else
558
- raise KeyError.new("Missing column: #{key}")
559
- end
560
- else
561
- raise ArgumentError, "Missing column: #{key}"
562
- end
582
+ raise KeyError.new("Missing column: #{key}", receiver: self, key: key)
563
583
  end
564
584
  end
565
585
 
data/lib/rover/group.rb CHANGED
@@ -3,6 +3,7 @@ module Rover
3
3
  def initialize(df, columns)
4
4
  @df = df
5
5
  @columns = columns
6
+ check_columns
6
7
  end
7
8
 
8
9
  def group(*columns)
@@ -22,16 +23,20 @@ module Rover
22
23
  end
23
24
  end
24
25
 
26
+ def plot(*args, **options)
27
+ raise ArgumentError, "Multiple groups not supported" if @columns.size > 1
28
+ # same message as Ruby
29
+ raise ArgumentError, "unknown keyword: :group" if options.key?(:group)
30
+
31
+ @df.plot(*args, **options, group: @columns.first)
32
+ end
33
+
25
34
  private
26
35
 
27
36
  # TODO make more efficient
28
37
  def grouped_dfs
29
38
  # cache here so we can reuse for multiple calculations if needed
30
39
  @grouped_dfs ||= begin
31
- raise ArgumentError, "No columns given" if @columns.empty?
32
- missing_keys = @columns - @df.keys
33
- raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
34
-
35
40
  groups = Hash.new { |hash, key| hash[key] = [] }
36
41
  i = 0
37
42
  @df.each_row do |row|
@@ -46,5 +51,12 @@ module Rover
46
51
  result
47
52
  end
48
53
  end
54
+
55
+ def check_columns
56
+ raise ArgumentError, "No columns given" if @columns.empty?
57
+
58
+ missing_keys = @columns - @df.keys
59
+ raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
60
+ end
49
61
  end
50
62
  end
data/lib/rover/vector.rb CHANGED
@@ -1,23 +1,23 @@
1
1
  module Rover
2
2
  class Vector
3
3
  # if a user never specifies types,
4
- # the defaults are bool, float, int, and object
5
- # keep these simple
6
- #
7
- # we could create aliases for float64, int64, uint64
8
- # if so, type should still return the simple type
4
+ # the defaults are bool, float64, int64, and object
9
5
  TYPE_CAST_MAPPING = {
10
6
  bool: Numo::Bit,
11
7
  float32: Numo::SFloat,
12
- float: Numo::DFloat,
8
+ float64: Numo::DFloat,
13
9
  int8: Numo::Int8,
14
10
  int16: Numo::Int16,
15
11
  int32: Numo::Int32,
16
- int: Numo::Int64,
12
+ int64: Numo::Int64,
17
13
  object: Numo::RObject,
18
14
  uint8: Numo::UInt8,
19
15
  uint16: Numo::UInt16,
20
16
  uint32: Numo::UInt32,
17
+ uint64: Numo::UInt64,
18
+ # legacy - must come last
19
+ float: Numo::DFloat,
20
+ int: Numo::Int64,
21
21
  uint: Numo::UInt64
22
22
  }
23
23
 
@@ -31,7 +31,12 @@ module Rover
31
31
  end
32
32
 
33
33
  def to(type)
34
- Vector.new(self, type: type)
34
+ dup.to!(type)
35
+ end
36
+
37
+ def to!(type)
38
+ @data = cast_data(@data, type: type)
39
+ self
35
40
  end
36
41
 
37
42
  def to_numo
@@ -333,6 +338,18 @@ module Rover
333
338
 
334
339
  private
335
340
 
341
+ # for clone
342
+ def initialize_clone(_)
343
+ @data = @data.clone
344
+ super
345
+ end
346
+
347
+ # for dup
348
+ def initialize_dup(_)
349
+ @data = @data.dup
350
+ super
351
+ end
352
+
336
353
  def cast_data(data, type: nil)
337
354
  numo_type = numo_type(type) if type
338
355
 
@@ -359,7 +376,7 @@ module Rover
359
376
  data = data.to_a
360
377
 
361
378
  if type
362
- data = data.map { |v| v || Float::NAN } if [:float, :float32].include?(type)
379
+ data = data.map { |v| v || Float::NAN } if [:float, :float32, :float64].include?(type)
363
380
  data = numo_type.cast(data)
364
381
  else
365
382
  data =
data/lib/rover/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Rover
2
- VERSION = "0.2.7"
2
+ VERSION = "0.3.1"
3
3
  end
data/lib/rover.rb CHANGED
@@ -9,47 +9,64 @@ require "rover/version"
9
9
 
10
10
  module Rover
11
11
  class << self
12
- def read_csv(path, types: nil, **options)
13
- require "csv"
14
- csv_to_df(CSV.read(path, **csv_options(options)), types: types, headers: options[:headers])
12
+ def read_csv(path, **options)
13
+ csv_to_df(**options) do |csv_options|
14
+ CSV.read(path, **csv_options)
15
+ end
15
16
  end
16
17
 
17
- def parse_csv(str, types: nil, **options)
18
- require "csv"
19
- csv_to_df(CSV.parse(str, **csv_options(options)), types: types, headers: options[:headers])
18
+ def parse_csv(str, **options)
19
+ csv_to_df(**options) do |csv_options|
20
+ CSV.parse(str, **csv_options)
21
+ end
20
22
  end
21
23
 
22
- def read_parquet(path, types: nil)
23
- require "parquet"
24
- parquet_to_df(Arrow::Table.load(path), types: types)
24
+ def read_parquet(path, **options)
25
+ parquet_to_df(**options) do
26
+ Arrow::Table.load(path)
27
+ end
25
28
  end
26
29
 
27
- def parse_parquet(str, types: nil)
28
- require "parquet"
29
- parquet_to_df(Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet), types: types)
30
+ def parse_parquet(str, **options)
31
+ parquet_to_df(**options) do
32
+ Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet)
33
+ end
30
34
  end
31
35
 
32
36
  private
33
37
 
34
- # TODO use date converter
35
- def csv_options(options)
36
- options = {headers: true, converters: :numeric}.merge(options)
37
- raise ArgumentError, "Must specify headers" unless options[:headers]
38
- options
39
- end
38
+ def csv_to_df(types: nil, headers: nil, **csv_options)
39
+ require "csv"
40
+
41
+ raise ArgumentError, "Must specify headers" if headers == false
42
+
43
+ # TODO use date converter in 0.4.0 - need to test performance
44
+ table = yield({converters: :numeric}.merge(csv_options))
40
45
 
41
- def csv_to_df(table, types: nil, headers: nil)
42
- if headers && headers.size < table.headers.size
43
- raise ArgumentError, "Expected #{table.headers.size} headers, got #{headers.size}"
46
+ headers = nil if headers == true
47
+ if headers && table.first && headers.size != table.first.size
48
+ raise ArgumentError, "Expected #{table.first.size} headers, given #{headers.size}"
49
+ end
50
+
51
+ table_headers = (headers || table.shift || []).dup
52
+ # keep same behavior as headers: true
53
+ if table.first
54
+ while table_headers.size < table.first.size
55
+ table_headers << nil
56
+ end
57
+ end
58
+ # TODO handle date converters
59
+ table_headers = table_headers.map! { |v| v.nil? ? nil : v.to_s }
60
+
61
+ if csv_options[:header_converters]
62
+ table_headers = CSV.parse(CSV.generate_line(table_headers), headers: true, header_converters: csv_options[:header_converters]).headers
44
63
  end
45
64
 
46
- table.by_col!
47
65
  data = {}
48
- keys = table.map { |k, _| [k, true] }.to_h
66
+ keys = table_headers.map { |k| [k, true] }.to_h
49
67
  unnamed_suffix = 1
50
- table.each do |k, v|
51
- # TODO do same for empty string in 0.3.0
52
- if k.nil?
68
+ table_headers.each_with_index do |k, i|
69
+ if k.nil? || k.empty?
53
70
  k = "unnamed"
54
71
  while keys.include?(k)
55
72
  unnamed_suffix += 1
@@ -57,7 +74,18 @@ module Rover
57
74
  end
58
75
  keys[k] = true
59
76
  end
60
- data[k] = v
77
+ table_headers[i] = k
78
+ end
79
+
80
+ table_headers.each_with_index do |k, i|
81
+ # use first value for duplicate headers like headers: true
82
+ next if data[k]
83
+
84
+ values = []
85
+ table.each do |row|
86
+ values << row[i]
87
+ end
88
+ data[k] = values
61
89
  end
62
90
 
63
91
  DataFrame.new(data, types: types)
@@ -78,7 +106,10 @@ module Rover
78
106
  "uint64" => Numo::UInt64
79
107
  }
80
108
 
81
- def parquet_to_df(table, types: nil)
109
+ def parquet_to_df(types: nil)
110
+ require "parquet"
111
+
112
+ table = yield
82
113
  data = {}
83
114
  types ||= {}
84
115
  table.each_column do |column|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rover-df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.7
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-01-16 00:00:00.000000000 Z
11
+ date: 2022-05-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -51,14 +51,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: '2.4'
54
+ version: '2.7'
55
55
  required_rubygems_version: !ruby/object:Gem::Requirement
56
56
  requirements:
57
57
  - - ">="
58
58
  - !ruby/object:Gem::Version
59
59
  version: '0'
60
60
  requirements: []
61
- rubygems_version: 3.3.3
61
+ rubygems_version: 3.3.7
62
62
  signing_key:
63
63
  specification_version: 4
64
64
  summary: Simple, powerful data frames for Ruby