rover-df 0.2.5 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE.txt +1 -1
- data/README.md +30 -4
- data/lib/rover/data_frame.rb +79 -23
- data/lib/rover/group.rb +10 -0
- data/lib/rover/vector.rb +1 -0
- data/lib/rover/version.rb +1 -1
- data/lib/rover.rb +107 -18
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 65d2fda186484e920421543e2f0203635054ccb8a23250bd3fc6a9d8c328725f
|
4
|
+
data.tar.gz: e4cd1e6d69e1e4f340f6692111476a5be9405f348841cfba6f6c431f04d85347
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c720f3bc45178f938c20546ac1b7279ae047affafce5e06cff4f703e1d8ff7a99c1bca94a3f40cb7d26945d770bf136a2adc3477cf6ffc3cdaad9a15aa6090a1
|
7
|
+
data.tar.gz: c44135cc0e70b08b72e1084565ef3479bcb92000bf34662b76a25933e68ad33a584afae071ddebfd5724ad61fe7e7dbc283241d7194c532dd70f36b1358b266d
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
## 0.2.8 (2022-03-15)
|
2
|
+
|
3
|
+
- Added `group` and `stacked` options to `plot`
|
4
|
+
- Improved performance of `read_csv` and `parse_csv`
|
5
|
+
|
6
|
+
## 0.2.7 (2022-01-16)
|
7
|
+
|
8
|
+
- Added support for booleans to Parquet methods
|
9
|
+
- Added support for creating data frames from `ActiveRecord::Result`
|
10
|
+
- Added `types` option to `read_parquet` and `parse_parquet` methods
|
11
|
+
|
12
|
+
## 0.2.6 (2021-10-27)
|
13
|
+
|
14
|
+
- Added support for `nil` headers to `read_csv` and `parse_csv`
|
15
|
+
- Added `read_parquet`, `parse_parquet`, and `to_parquet` methods
|
16
|
+
|
1
17
|
## 0.2.5 (2021-09-25)
|
2
18
|
|
3
19
|
- Fixed column types with joins
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -13,7 +13,7 @@ Simple, powerful data frames for Ruby
|
|
13
13
|
Add this line to your application’s Gemfile:
|
14
14
|
|
15
15
|
```ruby
|
16
|
-
gem 'rover-df'
|
16
|
+
gem "rover-df"
|
17
17
|
```
|
18
18
|
|
19
19
|
## Intro
|
@@ -61,6 +61,14 @@ Rover.read_csv("file.csv")
|
|
61
61
|
Rover.parse_csv("CSV,data,string")
|
62
62
|
```
|
63
63
|
|
64
|
+
From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
Rover.read_parquet("file.parquet")
|
68
|
+
# or
|
69
|
+
Rover.parse_parquet("PAR1...")
|
70
|
+
```
|
71
|
+
|
64
72
|
## Attributes
|
65
73
|
|
66
74
|
Get number of rows
|
@@ -89,7 +97,7 @@ Select a column
|
|
89
97
|
df[:a]
|
90
98
|
```
|
91
99
|
|
92
|
-
> Note that strings and symbols are different keys, just like hashes
|
100
|
+
> Note that strings and symbols are different keys, just like hashes. Creating a data frame from Active Record, a CSV, or Parquet uses strings.
|
93
101
|
|
94
102
|
Select multiple columns
|
95
103
|
|
@@ -228,7 +236,7 @@ df.group(:a).max(:b)
|
|
228
236
|
Multiple groups
|
229
237
|
|
230
238
|
```ruby
|
231
|
-
df.group([:a, :b]).count
|
239
|
+
df.group(:a, :b).count
|
232
240
|
```
|
233
241
|
|
234
242
|
## Visualization
|
@@ -236,7 +244,7 @@ df.group([:a, :b]).count
|
|
236
244
|
Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile:
|
237
245
|
|
238
246
|
```ruby
|
239
|
-
gem 'vega'
|
247
|
+
gem "vega"
|
240
248
|
```
|
241
249
|
|
242
250
|
And use:
|
@@ -251,6 +259,18 @@ Specify the chart type (`line`, `pie`, `column`, `bar`, `area`, or `scatter`)
|
|
251
259
|
df.plot(:a, :b, type: "pie")
|
252
260
|
```
|
253
261
|
|
262
|
+
Group data
|
263
|
+
|
264
|
+
```ruby
|
265
|
+
df.plot(:a, :b, group: :c)
|
266
|
+
```
|
267
|
+
|
268
|
+
Stacked columns or bars
|
269
|
+
|
270
|
+
```ruby
|
271
|
+
df.plot(:a, :b, group: :c, stacked: true)
|
272
|
+
```
|
273
|
+
|
254
274
|
## Updating Data
|
255
275
|
|
256
276
|
Add a new column
|
@@ -393,6 +413,12 @@ CSV
|
|
393
413
|
df.to_csv
|
394
414
|
```
|
395
415
|
|
416
|
+
Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
|
417
|
+
|
418
|
+
```ruby
|
419
|
+
df.to_parquet
|
420
|
+
```
|
421
|
+
|
396
422
|
## Types
|
397
423
|
|
398
424
|
You can specify column types when creating a data frame
|
data/lib/rover/data_frame.rb
CHANGED
@@ -40,8 +40,8 @@ module Rover
|
|
40
40
|
vectors.each do |k, v|
|
41
41
|
@vectors[k] = to_vector(v, type: types[k])
|
42
42
|
end
|
43
|
-
elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
|
44
|
-
result = data.connection.select_all(data.all.to_sql)
|
43
|
+
elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base) || data.is_a?(ActiveRecord::Result))
|
44
|
+
result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.all.to_sql)
|
45
45
|
result.columns.each_with_index do |k, i|
|
46
46
|
@vectors[k] = to_vector(result.rows.map { |r| r[i] }, type: types[k])
|
47
47
|
end
|
@@ -235,6 +235,44 @@ module Rover
|
|
235
235
|
end
|
236
236
|
end
|
237
237
|
|
238
|
+
def to_parquet
|
239
|
+
require "parquet"
|
240
|
+
|
241
|
+
schema = {}
|
242
|
+
types.each do |name, type|
|
243
|
+
schema[name] =
|
244
|
+
case type
|
245
|
+
when :int
|
246
|
+
:int64
|
247
|
+
when :uint
|
248
|
+
:uint64
|
249
|
+
when :float
|
250
|
+
:double
|
251
|
+
when :float32
|
252
|
+
:float
|
253
|
+
when :bool
|
254
|
+
:boolean
|
255
|
+
when :object
|
256
|
+
if @vectors[name].all? { |v| v.is_a?(String) }
|
257
|
+
:string
|
258
|
+
else
|
259
|
+
raise "Unknown type"
|
260
|
+
end
|
261
|
+
else
|
262
|
+
type
|
263
|
+
end
|
264
|
+
end
|
265
|
+
# TODO improve performance
|
266
|
+
raw_records = []
|
267
|
+
size.times do |i|
|
268
|
+
raw_records << @vectors.map { |_, v| v[i] }
|
269
|
+
end
|
270
|
+
table = Arrow::Table.new(schema, raw_records)
|
271
|
+
buffer = Arrow::ResizableBuffer.new(1024)
|
272
|
+
table.save(buffer, format: :parquet)
|
273
|
+
buffer.data.to_s
|
274
|
+
end
|
275
|
+
|
238
276
|
# for IRuby
|
239
277
|
def to_html
|
240
278
|
require "iruby"
|
@@ -363,7 +401,7 @@ module Rover
|
|
363
401
|
keys.all? { |k| self[k].to_numo == other[k].to_numo }
|
364
402
|
end
|
365
403
|
|
366
|
-
def plot(x = nil, y = nil, type: nil)
|
404
|
+
def plot(x = nil, y = nil, type: nil, group: nil, stacked: nil)
|
367
405
|
require "vega"
|
368
406
|
|
369
407
|
raise ArgumentError, "Must specify columns" if keys.size != 2 && (!x || !y)
|
@@ -378,7 +416,7 @@ module Rover
|
|
378
416
|
raise "Cannot determine type. Use the type option."
|
379
417
|
end
|
380
418
|
end
|
381
|
-
data = self[[x, y]]
|
419
|
+
data = self[group.nil? ? [x, y] : [x, y, group]]
|
382
420
|
|
383
421
|
case type
|
384
422
|
when "line", "area"
|
@@ -392,16 +430,20 @@ module Rover
|
|
392
430
|
end
|
393
431
|
|
394
432
|
scale = x_type == "temporal" ? {type: "utc"} : {}
|
433
|
+
encoding = {
|
434
|
+
x: {field: x, type: x_type, scale: scale},
|
435
|
+
y: {field: y, type: "quantitative"}
|
436
|
+
}
|
437
|
+
encoding[:color] = {field: group} if group
|
395
438
|
|
396
439
|
Vega.lite
|
397
440
|
.data(data)
|
398
441
|
.mark(type: type, tooltip: true, interpolate: "cardinal", point: {size: 60})
|
399
|
-
.encoding(
|
400
|
-
x: {field: x, type: x_type, scale: scale},
|
401
|
-
y: {field: y, type: "quantitative"}
|
402
|
-
)
|
442
|
+
.encoding(encoding)
|
403
443
|
.config(axis: {labelFontSize: 12})
|
404
444
|
when "pie"
|
445
|
+
raise ArgumentError, "Cannot use group option with pie chart" unless group.nil?
|
446
|
+
|
405
447
|
Vega.lite
|
406
448
|
.data(data)
|
407
449
|
.mark(type: "arc", tooltip: true)
|
@@ -411,34 +453,48 @@ module Rover
|
|
411
453
|
)
|
412
454
|
.view(stroke: nil)
|
413
455
|
when "column"
|
456
|
+
encoding = {
|
457
|
+
x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
|
458
|
+
y: {field: y, type: "quantitative"}
|
459
|
+
}
|
460
|
+
if group
|
461
|
+
encoding[:color] = {field: group}
|
462
|
+
encoding[:xOffset] = {field: group} unless stacked
|
463
|
+
end
|
464
|
+
|
414
465
|
Vega.lite
|
415
466
|
.data(data)
|
416
467
|
.mark(type: "bar", tooltip: true)
|
417
|
-
.encoding(
|
418
|
-
# TODO determine label angle
|
419
|
-
x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
|
420
|
-
y: {field: y, type: "quantitative"}
|
421
|
-
)
|
468
|
+
.encoding(encoding)
|
422
469
|
.config(axis: {labelFontSize: 12})
|
423
470
|
when "bar"
|
471
|
+
encoding = {
|
472
|
+
# TODO determine label angle
|
473
|
+
y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
|
474
|
+
x: {field: y, type: "quantitative"}
|
475
|
+
}
|
476
|
+
if group
|
477
|
+
encoding[:color] = {field: group}
|
478
|
+
encoding[:yOffset] = {field: group} unless stacked
|
479
|
+
end
|
480
|
+
|
424
481
|
Vega.lite
|
425
482
|
.data(data)
|
426
483
|
.mark(type: "bar", tooltip: true)
|
427
|
-
.encoding(
|
428
|
-
# TODO determine label angle
|
429
|
-
y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
|
430
|
-
x: {field: y, type: "quantitative"}
|
431
|
-
)
|
484
|
+
.encoding(encoding)
|
432
485
|
.config(axis: {labelFontSize: 12})
|
433
486
|
when "scatter"
|
487
|
+
encoding = {
|
488
|
+
x: {field: x, type: "quantitative", scale: {zero: false}},
|
489
|
+
y: {field: y, type: "quantitative", scale: {zero: false}},
|
490
|
+
size: {value: 60}
|
491
|
+
}
|
492
|
+
encoding[:color] = {field: group} if group
|
493
|
+
|
434
494
|
Vega.lite
|
435
495
|
.data(data)
|
436
496
|
.mark(type: "circle", tooltip: true)
|
437
|
-
.encoding(
|
438
|
-
x: {field: x, type: "quantitative", scale: {zero: false}},
|
439
|
-
y: {field: y, type: "quantitative", scale: {zero: false}},
|
440
|
-
size: {value: 60}
|
441
|
-
)
|
497
|
+
.encoding(encoding)
|
442
498
|
.config(axis: {labelFontSize: 12})
|
443
499
|
else
|
444
500
|
raise ArgumentError, "Invalid type: #{type}"
|
data/lib/rover/group.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
module Rover
|
2
2
|
class Group
|
3
|
+
# TODO raise ArgumentError for empty columns in 0.3.0
|
3
4
|
def initialize(df, columns)
|
4
5
|
@df = df
|
5
6
|
@columns = columns
|
6
7
|
end
|
7
8
|
|
9
|
+
# TODO raise ArgumentError for empty columns in 0.3.0
|
8
10
|
def group(*columns)
|
9
11
|
Group.new(@df, @columns + columns.flatten)
|
10
12
|
end
|
@@ -22,6 +24,14 @@ module Rover
|
|
22
24
|
end
|
23
25
|
end
|
24
26
|
|
27
|
+
def plot(*args, **options)
|
28
|
+
raise ArgumentError, "Multiple groups not supported" if @columns.size > 1
|
29
|
+
# same message as Ruby
|
30
|
+
raise ArgumentError, "unknown keyword: :group" if options.key?(:group)
|
31
|
+
|
32
|
+
@df.plot(*args, **options, group: @columns.first)
|
33
|
+
end
|
34
|
+
|
25
35
|
private
|
26
36
|
|
27
37
|
# TODO make more efficient
|
data/lib/rover/vector.rb
CHANGED
data/lib/rover/version.rb
CHANGED
data/lib/rover.rb
CHANGED
@@ -9,36 +9,125 @@ require "rover/version"
|
|
9
9
|
|
10
10
|
module Rover
|
11
11
|
class << self
|
12
|
-
def read_csv(path,
|
13
|
-
|
14
|
-
|
12
|
+
def read_csv(path, **options)
|
13
|
+
csv_to_df(**options) do |csv_options|
|
14
|
+
CSV.read(path, **csv_options)
|
15
|
+
end
|
15
16
|
end
|
16
17
|
|
17
|
-
def parse_csv(str,
|
18
|
-
|
19
|
-
|
18
|
+
def parse_csv(str, **options)
|
19
|
+
csv_to_df(**options) do |csv_options|
|
20
|
+
CSV.parse(str, **csv_options)
|
21
|
+
end
|
20
22
|
end
|
21
23
|
|
22
|
-
|
24
|
+
def read_parquet(path, **options)
|
25
|
+
parquet_to_df(**options) do
|
26
|
+
Arrow::Table.load(path)
|
27
|
+
end
|
28
|
+
end
|
23
29
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
options
|
30
|
+
def parse_parquet(str, **options)
|
31
|
+
parquet_to_df(**options) do
|
32
|
+
Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet)
|
33
|
+
end
|
29
34
|
end
|
30
35
|
|
31
|
-
|
32
|
-
|
33
|
-
|
36
|
+
private
|
37
|
+
|
38
|
+
def csv_to_df(types: nil, headers: nil, **csv_options)
|
39
|
+
require "csv"
|
40
|
+
|
41
|
+
raise ArgumentError, "Must specify headers" if headers == false
|
42
|
+
|
43
|
+
# TODO use date converter
|
44
|
+
table = yield({converters: :numeric}.merge(csv_options))
|
45
|
+
|
46
|
+
headers = nil if headers == true
|
47
|
+
if headers && table.first && headers.size < table.first.size
|
48
|
+
raise ArgumentError, "Expected #{table.first.size} headers, got #{headers.size}"
|
49
|
+
end
|
50
|
+
|
51
|
+
table_headers = (headers || table.shift || []).dup
|
52
|
+
# keep same behavior as headers: true
|
53
|
+
if table.first
|
54
|
+
while table_headers.size < table.first.size
|
55
|
+
table_headers << nil
|
56
|
+
end
|
34
57
|
end
|
35
58
|
|
36
|
-
table.by_col!
|
37
59
|
data = {}
|
38
|
-
|
39
|
-
|
60
|
+
keys = table_headers.map { |k| [k, true] }.to_h
|
61
|
+
unnamed_suffix = 1
|
62
|
+
table_headers.each_with_index do |k, i|
|
63
|
+
# TODO do same for empty string in 0.3.0
|
64
|
+
if k.nil?
|
65
|
+
k = "unnamed"
|
66
|
+
while keys.include?(k)
|
67
|
+
unnamed_suffix += 1
|
68
|
+
k = "unnamed#{unnamed_suffix}"
|
69
|
+
end
|
70
|
+
keys[k] = true
|
71
|
+
end
|
72
|
+
table_headers[i] = k
|
40
73
|
end
|
74
|
+
|
75
|
+
table_headers.each_with_index do |k, i|
|
76
|
+
# use first value for duplicate headers like headers: true
|
77
|
+
next if data[k]
|
78
|
+
|
79
|
+
values = []
|
80
|
+
table.each do |row|
|
81
|
+
values << row[i]
|
82
|
+
end
|
83
|
+
data[k] = values
|
84
|
+
end
|
85
|
+
|
41
86
|
DataFrame.new(data, types: types)
|
42
87
|
end
|
88
|
+
|
89
|
+
PARQUET_TYPE_MAPPING = {
|
90
|
+
"bool" => Numo::Bit,
|
91
|
+
"float" => Numo::SFloat,
|
92
|
+
"double" => Numo::DFloat,
|
93
|
+
"int8" => Numo::Int8,
|
94
|
+
"int16" => Numo::Int16,
|
95
|
+
"int32" => Numo::Int32,
|
96
|
+
"int64" => Numo::Int64,
|
97
|
+
"string" => Numo::RObject,
|
98
|
+
"uint8" => Numo::UInt8,
|
99
|
+
"uint16" => Numo::UInt16,
|
100
|
+
"uint32" => Numo::UInt32,
|
101
|
+
"uint64" => Numo::UInt64
|
102
|
+
}
|
103
|
+
|
104
|
+
def parquet_to_df(types: nil)
|
105
|
+
require "parquet"
|
106
|
+
|
107
|
+
table = yield
|
108
|
+
data = {}
|
109
|
+
types ||= {}
|
110
|
+
table.each_column do |column|
|
111
|
+
k = column.field.name
|
112
|
+
if types[k]
|
113
|
+
data[k] = Vector.new(column.data.values, type: types[k])
|
114
|
+
else
|
115
|
+
type = column.field.data_type.to_s
|
116
|
+
numo_type = PARQUET_TYPE_MAPPING[type]
|
117
|
+
raise "Unknown type: #{type}" unless numo_type
|
118
|
+
|
119
|
+
# TODO automatic conversion?
|
120
|
+
# int => float
|
121
|
+
# bool => object
|
122
|
+
if (type.include?("int") || type == "bool") && column.n_nulls > 0
|
123
|
+
raise "Nulls not supported for #{type} column: #{k}"
|
124
|
+
end
|
125
|
+
|
126
|
+
# TODO improve performance
|
127
|
+
data[k] = numo_type.cast(column.data.values)
|
128
|
+
end
|
129
|
+
end
|
130
|
+
DataFrame.new(data)
|
131
|
+
end
|
43
132
|
end
|
44
133
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rover-df
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.5
|
4
|
+
version: 0.2.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-09-25 00:00:00.000000000 Z
|
11
|
+
date: 2022-03-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -58,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
58
58
|
- !ruby/object:Gem::Version
|
59
59
|
version: '0'
|
60
60
|
requirements: []
|
61
|
-
rubygems_version: 3.
|
61
|
+
rubygems_version: 3.3.7
|
62
62
|
signing_key:
|
63
63
|
specification_version: 4
|
64
64
|
summary: Simple, powerful data frames for Ruby
|