rover-df 0.2.6 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +21 -0
- data/LICENSE.txt +1 -1
- data/README.md +25 -13
- data/lib/rover/data_frame.rb +65 -43
- data/lib/rover/group.rb +16 -4
- data/lib/rover/vector.rb +20 -7
- data/lib/rover/version.rb +1 -1
- data/lib/rover.rb +72 -33
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 171a06a05afa4ec4bb09efe7fef53b49220a3d4fa5352621112e29f2b70812b9
|
4
|
+
data.tar.gz: 435d8f3d4781f1960236f3c2b7f9fa2c4e38dfc987b53cb9fbe6351a9e8db4e9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9e5baa8cb051f7c7f06bbe0025ac4d923947b34768461a814e533aa78ec5d2d391a12edc6a9a64abc2fa9b1147255211ff26e8094cfaef67c9d70e393e57bcc0
|
7
|
+
data.tar.gz: 55f3438d438326c324c612a92b39a54698123889aeb28bafd93a196f2659208b4b72a1ac02efd407166fc56c8c9f3abff9472804ddfb231220b6adf41ff38df1
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,24 @@
|
|
1
|
+
## 0.3.0 (2022-04-03)
|
2
|
+
|
3
|
+
- Added `deep_dup` method to data frames
|
4
|
+
- Changed `:int` to `:int64`, `:uint` to `:uint64`, and `:float` to `:float64` for type methods
|
5
|
+
- Changed missing column to raise `KeyError` instead of `ArgumentError` for aggregate methods
|
6
|
+
- Changed passing too many headers to `read_csv` and `parse_csv` to raise `ArgumentError`
|
7
|
+
- Changed empty string in CSV headers to match behavior of `nil`
|
8
|
+
- Fixed `clone` and `dup` method for vectors
|
9
|
+
- Dropped support for Ruby < 2.7
|
10
|
+
|
11
|
+
## 0.2.8 (2022-03-15)
|
12
|
+
|
13
|
+
- Added `group` and `stacked` options to `plot`
|
14
|
+
- Improved performance of `read_csv` and `parse_csv`
|
15
|
+
|
16
|
+
## 0.2.7 (2022-01-16)
|
17
|
+
|
18
|
+
- Added support for booleans to Parquet methods
|
19
|
+
- Added support for creating data frames from `ActiveRecord::Result`
|
20
|
+
- Added `types` option to `read_parquet` and `parse_parquet` methods
|
21
|
+
|
1
22
|
## 0.2.6 (2021-10-27)
|
2
23
|
|
3
24
|
- Added support for `nil` headers to `read_csv` and `parse_csv`
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -13,7 +13,7 @@ Simple, powerful data frames for Ruby
|
|
13
13
|
Add this line to your application’s Gemfile:
|
14
14
|
|
15
15
|
```ruby
|
16
|
-
gem
|
16
|
+
gem "rover-df"
|
17
17
|
```
|
18
18
|
|
19
19
|
## Intro
|
@@ -61,7 +61,7 @@ Rover.read_csv("file.csv")
|
|
61
61
|
Rover.parse_csv("CSV,data,string")
|
62
62
|
```
|
63
63
|
|
64
|
-
From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
|
64
|
+
From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
|
65
65
|
|
66
66
|
```ruby
|
67
67
|
Rover.read_parquet("file.parquet")
|
@@ -236,7 +236,7 @@ df.group(:a).max(:b)
|
|
236
236
|
Multiple groups
|
237
237
|
|
238
238
|
```ruby
|
239
|
-
df.group([:a, :b]).count
|
239
|
+
df.group(:a, :b).count
|
240
240
|
```
|
241
241
|
|
242
242
|
## Visualization
|
@@ -244,7 +244,7 @@ df.group([:a, :b]).count
|
|
244
244
|
Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile:
|
245
245
|
|
246
246
|
```ruby
|
247
|
-
gem
|
247
|
+
gem "vega"
|
248
248
|
```
|
249
249
|
|
250
250
|
And use:
|
@@ -259,6 +259,18 @@ Specify the chart type (`line`, `pie`, `column`, `bar`, `area`, or `scatter`)
|
|
259
259
|
df.plot(:a, :b, type: "pie")
|
260
260
|
```
|
261
261
|
|
262
|
+
Group data
|
263
|
+
|
264
|
+
```ruby
|
265
|
+
df.plot(:a, :b, group: :c)
|
266
|
+
```
|
267
|
+
|
268
|
+
Stacked columns or bars
|
269
|
+
|
270
|
+
```ruby
|
271
|
+
df.plot(:a, :b, group: :c, stacked: true)
|
272
|
+
```
|
273
|
+
|
262
274
|
## Updating Data
|
263
275
|
|
264
276
|
Add a new column
|
@@ -401,7 +413,7 @@ CSV
|
|
401
413
|
df.to_csv
|
402
414
|
```
|
403
415
|
|
404
|
-
Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
|
416
|
+
Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
|
405
417
|
|
406
418
|
```ruby
|
407
419
|
df.to_parquet
|
@@ -412,22 +424,22 @@ df.to_parquet
|
|
412
424
|
You can specify column types when creating a data frame
|
413
425
|
|
414
426
|
```ruby
|
415
|
-
Rover::DataFrame.new(data, types: {"a" => :int, "b" => :float})
|
427
|
+
Rover::DataFrame.new(data, types: {"a" => :int64, "b" => :float64})
|
416
428
|
```
|
417
429
|
|
418
430
|
Or
|
419
431
|
|
420
432
|
```ruby
|
421
|
-
Rover.read_csv("data.csv", types: {"a" => :int, "b" => :float})
|
433
|
+
Rover.read_csv("data.csv", types: {"a" => :int64, "b" => :float64})
|
422
434
|
```
|
423
435
|
|
424
436
|
Supported types are:
|
425
437
|
|
426
|
-
- boolean -
|
427
|
-
- float -
|
428
|
-
- integer -
|
429
|
-
- unsigned integer -
|
430
|
-
- object -
|
438
|
+
- boolean - `:bool`
|
439
|
+
- float - `:float64`, `:float32`
|
440
|
+
- integer - `:int64`, `:int32`, `:int16`, `:int8`
|
441
|
+
- unsigned integer - `:uint64`, `:uint32`, `:uint16`, `:uint8`
|
442
|
+
- object - `:object`
|
431
443
|
|
432
444
|
Get column types
|
433
445
|
|
@@ -444,7 +456,7 @@ df[:a].type
|
|
444
456
|
Change the type of a column
|
445
457
|
|
446
458
|
```ruby
|
447
|
-
df[:a] = df[:a].to(:
|
459
|
+
df[:a] = df[:a].to(:int32)
|
448
460
|
```
|
449
461
|
|
450
462
|
## History
|
data/lib/rover/data_frame.rb
CHANGED
@@ -40,8 +40,8 @@ module Rover
|
|
40
40
|
vectors.each do |k, v|
|
41
41
|
@vectors[k] = to_vector(v, type: types[k])
|
42
42
|
end
|
43
|
-
elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
|
44
|
-
result = data.connection.select_all(data.all.to_sql)
|
43
|
+
elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base) || data.is_a?(ActiveRecord::Result))
|
44
|
+
result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.all.to_sql)
|
45
45
|
result.columns.each_with_index do |k, i|
|
46
46
|
@vectors[k] = to_vector(result.rows.map { |r| r[i] }, type: types[k])
|
47
47
|
end
|
@@ -72,7 +72,7 @@ module Rover
|
|
72
72
|
# multiple columns
|
73
73
|
df = DataFrame.new
|
74
74
|
where.each do |k|
|
75
|
-
check_column(k, true)
|
75
|
+
check_column(k)
|
76
76
|
df[k] = @vectors[k]
|
77
77
|
end
|
78
78
|
df
|
@@ -102,7 +102,7 @@ module Rover
|
|
102
102
|
def []=(k, v)
|
103
103
|
check_key(k)
|
104
104
|
v = to_vector(v, size: size)
|
105
|
-
raise ArgumentError, "Size mismatch
|
105
|
+
raise ArgumentError, "Size mismatch (given #{v.size}, expected #{size})" if @vectors.any? && v.size != size
|
106
106
|
@vectors[k] = v
|
107
107
|
end
|
108
108
|
|
@@ -242,14 +242,16 @@ module Rover
|
|
242
242
|
types.each do |name, type|
|
243
243
|
schema[name] =
|
244
244
|
case type
|
245
|
-
when :int
|
245
|
+
when :int64
|
246
246
|
:int64
|
247
|
-
when :uint
|
247
|
+
when :uint64
|
248
248
|
:uint64
|
249
|
-
when :float
|
249
|
+
when :float64
|
250
250
|
:double
|
251
251
|
when :float32
|
252
252
|
:float
|
253
|
+
when :bool
|
254
|
+
:boolean
|
253
255
|
when :object
|
254
256
|
if @vectors[name].all? { |v| v.is_a?(String) }
|
255
257
|
:string
|
@@ -344,10 +346,10 @@ module Rover
|
|
344
346
|
end
|
345
347
|
end
|
346
348
|
|
347
|
-
def
|
349
|
+
def deep_dup
|
348
350
|
df = DataFrame.new
|
349
351
|
@vectors.each do |k, v|
|
350
|
-
df[k] = v
|
352
|
+
df[k] = v.dup
|
351
353
|
end
|
352
354
|
df
|
353
355
|
end
|
@@ -399,7 +401,7 @@ module Rover
|
|
399
401
|
keys.all? { |k| self[k].to_numo == other[k].to_numo }
|
400
402
|
end
|
401
403
|
|
402
|
-
def plot(x = nil, y = nil, type: nil)
|
404
|
+
def plot(x = nil, y = nil, type: nil, group: nil, stacked: nil)
|
403
405
|
require "vega"
|
404
406
|
|
405
407
|
raise ArgumentError, "Must specify columns" if keys.size != 2 && (!x || !y)
|
@@ -414,7 +416,7 @@ module Rover
|
|
414
416
|
raise "Cannot determine type. Use the type option."
|
415
417
|
end
|
416
418
|
end
|
417
|
-
data = self[[x, y]]
|
419
|
+
data = self[group.nil? ? [x, y] : [x, y, group]]
|
418
420
|
|
419
421
|
case type
|
420
422
|
when "line", "area"
|
@@ -428,16 +430,20 @@ module Rover
|
|
428
430
|
end
|
429
431
|
|
430
432
|
scale = x_type == "temporal" ? {type: "utc"} : {}
|
433
|
+
encoding = {
|
434
|
+
x: {field: x, type: x_type, scale: scale},
|
435
|
+
y: {field: y, type: "quantitative"}
|
436
|
+
}
|
437
|
+
encoding[:color] = {field: group} if group
|
431
438
|
|
432
439
|
Vega.lite
|
433
440
|
.data(data)
|
434
441
|
.mark(type: type, tooltip: true, interpolate: "cardinal", point: {size: 60})
|
435
|
-
.encoding(
|
436
|
-
x: {field: x, type: x_type, scale: scale},
|
437
|
-
y: {field: y, type: "quantitative"}
|
438
|
-
)
|
442
|
+
.encoding(encoding)
|
439
443
|
.config(axis: {labelFontSize: 12})
|
440
444
|
when "pie"
|
445
|
+
raise ArgumentError, "Cannot use group option with pie chart" unless group.nil?
|
446
|
+
|
441
447
|
Vega.lite
|
442
448
|
.data(data)
|
443
449
|
.mark(type: "arc", tooltip: true)
|
@@ -447,34 +453,48 @@ module Rover
|
|
447
453
|
)
|
448
454
|
.view(stroke: nil)
|
449
455
|
when "column"
|
456
|
+
encoding = {
|
457
|
+
x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
|
458
|
+
y: {field: y, type: "quantitative"}
|
459
|
+
}
|
460
|
+
if group
|
461
|
+
encoding[:color] = {field: group}
|
462
|
+
encoding[:xOffset] = {field: group} unless stacked
|
463
|
+
end
|
464
|
+
|
450
465
|
Vega.lite
|
451
466
|
.data(data)
|
452
467
|
.mark(type: "bar", tooltip: true)
|
453
|
-
.encoding(
|
454
|
-
# TODO determine label angle
|
455
|
-
x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
|
456
|
-
y: {field: y, type: "quantitative"}
|
457
|
-
)
|
468
|
+
.encoding(encoding)
|
458
469
|
.config(axis: {labelFontSize: 12})
|
459
470
|
when "bar"
|
471
|
+
encoding = {
|
472
|
+
# TODO determine label angle
|
473
|
+
y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
|
474
|
+
x: {field: y, type: "quantitative"}
|
475
|
+
}
|
476
|
+
if group
|
477
|
+
encoding[:color] = {field: group}
|
478
|
+
encoding[:yOffset] = {field: group} unless stacked
|
479
|
+
end
|
480
|
+
|
460
481
|
Vega.lite
|
461
482
|
.data(data)
|
462
483
|
.mark(type: "bar", tooltip: true)
|
463
|
-
.encoding(
|
464
|
-
# TODO determine label angle
|
465
|
-
y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
|
466
|
-
x: {field: y, type: "quantitative"}
|
467
|
-
)
|
484
|
+
.encoding(encoding)
|
468
485
|
.config(axis: {labelFontSize: 12})
|
469
486
|
when "scatter"
|
487
|
+
encoding = {
|
488
|
+
x: {field: x, type: "quantitative", scale: {zero: false}},
|
489
|
+
y: {field: y, type: "quantitative", scale: {zero: false}},
|
490
|
+
size: {value: 60}
|
491
|
+
}
|
492
|
+
encoding[:color] = {field: group} if group
|
493
|
+
|
470
494
|
Vega.lite
|
471
495
|
.data(data)
|
472
496
|
.mark(type: "circle", tooltip: true)
|
473
|
-
.encoding(
|
474
|
-
x: {field: x, type: "quantitative", scale: {zero: false}},
|
475
|
-
y: {field: y, type: "quantitative", scale: {zero: false}},
|
476
|
-
size: {value: 60}
|
477
|
-
)
|
497
|
+
.encoding(encoding)
|
478
498
|
.config(axis: {labelFontSize: 12})
|
479
499
|
else
|
480
500
|
raise ArgumentError, "Invalid type: #{type}"
|
@@ -483,8 +503,20 @@ module Rover
|
|
483
503
|
|
484
504
|
private
|
485
505
|
|
506
|
+
# for clone
|
507
|
+
def initialize_clone(_)
|
508
|
+
@vectors = @vectors.clone
|
509
|
+
super
|
510
|
+
end
|
511
|
+
|
512
|
+
# for dup
|
513
|
+
def initialize_dup(_)
|
514
|
+
@vectors = @vectors.dup
|
515
|
+
super
|
516
|
+
end
|
517
|
+
|
486
518
|
def check_key(key)
|
487
|
-
raise ArgumentError, "Key must be a
|
519
|
+
raise ArgumentError, "Key must be a String or Symbol, given #{key.class.name}" unless key.is_a?(String) || key.is_a?(Symbol)
|
488
520
|
end
|
489
521
|
|
490
522
|
# TODO make more efficient
|
@@ -545,19 +577,9 @@ module Rover
|
|
545
577
|
raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
|
546
578
|
end
|
547
579
|
|
548
|
-
|
549
|
-
# always use did_you_mean
|
550
|
-
def check_column(key, did_you_mean = false)
|
580
|
+
def check_column(key)
|
551
581
|
unless include?(key)
|
552
|
-
|
553
|
-
if RUBY_VERSION.to_f >= 2.6
|
554
|
-
raise KeyError.new("Missing column: #{key}", receiver: self, key: key)
|
555
|
-
else
|
556
|
-
raise KeyError.new("Missing column: #{key}")
|
557
|
-
end
|
558
|
-
else
|
559
|
-
raise ArgumentError, "Missing column: #{key}"
|
560
|
-
end
|
582
|
+
raise KeyError.new("Missing column: #{key}", receiver: self, key: key)
|
561
583
|
end
|
562
584
|
end
|
563
585
|
|
data/lib/rover/group.rb
CHANGED
@@ -3,6 +3,7 @@ module Rover
|
|
3
3
|
def initialize(df, columns)
|
4
4
|
@df = df
|
5
5
|
@columns = columns
|
6
|
+
check_columns
|
6
7
|
end
|
7
8
|
|
8
9
|
def group(*columns)
|
@@ -22,16 +23,20 @@ module Rover
|
|
22
23
|
end
|
23
24
|
end
|
24
25
|
|
26
|
+
def plot(*args, **options)
|
27
|
+
raise ArgumentError, "Multiple groups not supported" if @columns.size > 1
|
28
|
+
# same message as Ruby
|
29
|
+
raise ArgumentError, "unknown keyword: :group" if options.key?(:group)
|
30
|
+
|
31
|
+
@df.plot(*args, **options, group: @columns.first)
|
32
|
+
end
|
33
|
+
|
25
34
|
private
|
26
35
|
|
27
36
|
# TODO make more efficient
|
28
37
|
def grouped_dfs
|
29
38
|
# cache here so we can reuse for multiple calcuations if needed
|
30
39
|
@grouped_dfs ||= begin
|
31
|
-
raise ArgumentError, "No columns given" if @columns.empty?
|
32
|
-
missing_keys = @columns - @df.keys
|
33
|
-
raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
|
34
|
-
|
35
40
|
groups = Hash.new { |hash, key| hash[key] = [] }
|
36
41
|
i = 0
|
37
42
|
@df.each_row do |row|
|
@@ -46,5 +51,12 @@ module Rover
|
|
46
51
|
result
|
47
52
|
end
|
48
53
|
end
|
54
|
+
|
55
|
+
def check_columns
|
56
|
+
raise ArgumentError, "No columns given" if @columns.empty?
|
57
|
+
|
58
|
+
missing_keys = @columns - @df.keys
|
59
|
+
raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
|
60
|
+
end
|
49
61
|
end
|
50
62
|
end
|
data/lib/rover/vector.rb
CHANGED
@@ -1,23 +1,23 @@
|
|
1
1
|
module Rover
|
2
2
|
class Vector
|
3
3
|
# if a user never specifies types,
|
4
|
-
# the defaults are bool, float, int, and object
|
5
|
-
# keep these simple
|
6
|
-
#
|
7
|
-
# we could create aliases for float64, int64, uint64
|
8
|
-
# if so, type should still return the simple type
|
4
|
+
# the defaults are bool, float64, int64, and object
|
9
5
|
TYPE_CAST_MAPPING = {
|
10
6
|
bool: Numo::Bit,
|
11
7
|
float32: Numo::SFloat,
|
12
|
-
|
8
|
+
float64: Numo::DFloat,
|
13
9
|
int8: Numo::Int8,
|
14
10
|
int16: Numo::Int16,
|
15
11
|
int32: Numo::Int32,
|
16
|
-
|
12
|
+
int64: Numo::Int64,
|
17
13
|
object: Numo::RObject,
|
18
14
|
uint8: Numo::UInt8,
|
19
15
|
uint16: Numo::UInt16,
|
20
16
|
uint32: Numo::UInt32,
|
17
|
+
uint64: Numo::UInt64,
|
18
|
+
# legacy - must come last
|
19
|
+
float: Numo::DFloat,
|
20
|
+
int: Numo::Int64,
|
21
21
|
uint: Numo::UInt64
|
22
22
|
}
|
23
23
|
|
@@ -333,6 +333,18 @@ module Rover
|
|
333
333
|
|
334
334
|
private
|
335
335
|
|
336
|
+
# for clone
|
337
|
+
def initialize_clone(_)
|
338
|
+
@data = @data.clone
|
339
|
+
super
|
340
|
+
end
|
341
|
+
|
342
|
+
# for dup
|
343
|
+
def initialize_dup(_)
|
344
|
+
@data = @data.dup
|
345
|
+
super
|
346
|
+
end
|
347
|
+
|
336
348
|
def cast_data(data, type: nil)
|
337
349
|
numo_type = numo_type(type) if type
|
338
350
|
|
@@ -359,6 +371,7 @@ module Rover
|
|
359
371
|
data = data.to_a
|
360
372
|
|
361
373
|
if type
|
374
|
+
data = data.map { |v| v || Float::NAN } if [:float, :float32].include?(type)
|
362
375
|
data = numo_type.cast(data)
|
363
376
|
else
|
364
377
|
data =
|
data/lib/rover/version.rb
CHANGED
data/lib/rover.rb
CHANGED
@@ -9,47 +9,58 @@ require "rover/version"
|
|
9
9
|
|
10
10
|
module Rover
|
11
11
|
class << self
|
12
|
-
def read_csv(path,
|
13
|
-
|
14
|
-
|
12
|
+
def read_csv(path, **options)
|
13
|
+
csv_to_df(**options) do |csv_options|
|
14
|
+
CSV.read(path, **csv_options)
|
15
|
+
end
|
15
16
|
end
|
16
17
|
|
17
|
-
def parse_csv(str,
|
18
|
-
|
19
|
-
|
18
|
+
def parse_csv(str, **options)
|
19
|
+
csv_to_df(**options) do |csv_options|
|
20
|
+
CSV.parse(str, **csv_options)
|
21
|
+
end
|
20
22
|
end
|
21
23
|
|
22
|
-
def read_parquet(path)
|
23
|
-
|
24
|
-
|
24
|
+
def read_parquet(path, **options)
|
25
|
+
parquet_to_df(**options) do
|
26
|
+
Arrow::Table.load(path)
|
27
|
+
end
|
25
28
|
end
|
26
29
|
|
27
|
-
def parse_parquet(str)
|
28
|
-
|
29
|
-
|
30
|
+
def parse_parquet(str, **options)
|
31
|
+
parquet_to_df(**options) do
|
32
|
+
Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet)
|
33
|
+
end
|
30
34
|
end
|
31
35
|
|
32
36
|
private
|
33
37
|
|
34
|
-
|
35
|
-
|
36
|
-
options = {headers: true, converters: :numeric}.merge(options)
|
37
|
-
raise ArgumentError, "Must specify headers" unless options[:headers]
|
38
|
-
options
|
39
|
-
end
|
38
|
+
def csv_to_df(types: nil, headers: nil, **csv_options)
|
39
|
+
require "csv"
|
40
40
|
|
41
|
-
|
42
|
-
|
43
|
-
|
41
|
+
raise ArgumentError, "Must specify headers" if headers == false
|
42
|
+
|
43
|
+
# TODO use date converter in 0.4.0 - need to test performance
|
44
|
+
table = yield({converters: :numeric}.merge(csv_options))
|
45
|
+
|
46
|
+
headers = nil if headers == true
|
47
|
+
if headers && table.first && headers.size != table.first.size
|
48
|
+
raise ArgumentError, "Expected #{table.first.size} headers, given #{headers.size}"
|
49
|
+
end
|
50
|
+
|
51
|
+
table_headers = (headers || table.shift || []).dup
|
52
|
+
# keep same behavior as headers: true
|
53
|
+
if table.first
|
54
|
+
while table_headers.size < table.first.size
|
55
|
+
table_headers << nil
|
56
|
+
end
|
44
57
|
end
|
45
58
|
|
46
|
-
table.by_col!
|
47
59
|
data = {}
|
48
|
-
keys =
|
60
|
+
keys = table_headers.map { |k| [k, true] }.to_h
|
49
61
|
unnamed_suffix = 1
|
50
|
-
|
51
|
-
|
52
|
-
if k.nil?
|
62
|
+
table_headers.each_with_index do |k, i|
|
63
|
+
if k.nil? || k.empty?
|
53
64
|
k = "unnamed"
|
54
65
|
while keys.include?(k)
|
55
66
|
unnamed_suffix += 1
|
@@ -57,13 +68,25 @@ module Rover
|
|
57
68
|
end
|
58
69
|
keys[k] = true
|
59
70
|
end
|
60
|
-
|
71
|
+
table_headers[i] = k
|
72
|
+
end
|
73
|
+
|
74
|
+
table_headers.each_with_index do |k, i|
|
75
|
+
# use first value for duplicate headers like headers: true
|
76
|
+
next if data[k]
|
77
|
+
|
78
|
+
values = []
|
79
|
+
table.each do |row|
|
80
|
+
values << row[i]
|
81
|
+
end
|
82
|
+
data[k] = values
|
61
83
|
end
|
62
84
|
|
63
85
|
DataFrame.new(data, types: types)
|
64
86
|
end
|
65
87
|
|
66
88
|
PARQUET_TYPE_MAPPING = {
|
89
|
+
"bool" => Numo::Bit,
|
67
90
|
"float" => Numo::SFloat,
|
68
91
|
"double" => Numo::DFloat,
|
69
92
|
"int8" => Numo::Int8,
|
@@ -77,15 +100,31 @@ module Rover
|
|
77
100
|
"uint64" => Numo::UInt64
|
78
101
|
}
|
79
102
|
|
80
|
-
def parquet_to_df(
|
103
|
+
def parquet_to_df(types: nil)
|
104
|
+
require "parquet"
|
105
|
+
|
106
|
+
table = yield
|
81
107
|
data = {}
|
108
|
+
types ||= {}
|
82
109
|
table.each_column do |column|
|
83
110
|
k = column.field.name
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
111
|
+
if types[k]
|
112
|
+
data[k] = Vector.new(column.data.values, type: types[k])
|
113
|
+
else
|
114
|
+
type = column.field.data_type.to_s
|
115
|
+
numo_type = PARQUET_TYPE_MAPPING[type]
|
116
|
+
raise "Unknown type: #{type}" unless numo_type
|
117
|
+
|
118
|
+
# TODO automatic conversion?
|
119
|
+
# int => float
|
120
|
+
# bool => object
|
121
|
+
if (type.include?("int") || type == "bool") && column.n_nulls > 0
|
122
|
+
raise "Nulls not supported for #{type} column: #{k}"
|
123
|
+
end
|
124
|
+
|
125
|
+
# TODO improve performance
|
126
|
+
data[k] = numo_type.cast(column.data.values)
|
127
|
+
end
|
89
128
|
end
|
90
129
|
DataFrame.new(data)
|
91
130
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rover-df
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.6
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-04-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -51,14 +51,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
51
51
|
requirements:
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '2.
|
54
|
+
version: '2.7'
|
55
55
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
56
56
|
requirements:
|
57
57
|
- - ">="
|
58
58
|
- !ruby/object:Gem::Version
|
59
59
|
version: '0'
|
60
60
|
requirements: []
|
61
|
-
rubygems_version: 3.
|
61
|
+
rubygems_version: 3.3.7
|
62
62
|
signing_key:
|
63
63
|
specification_version: 4
|
64
64
|
summary: Simple, powerful data frames for Ruby
|