rover-df 0.2.6 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +21 -0
- data/LICENSE.txt +1 -1
- data/README.md +25 -13
- data/lib/rover/data_frame.rb +65 -43
- data/lib/rover/group.rb +16 -4
- data/lib/rover/vector.rb +20 -7
- data/lib/rover/version.rb +1 -1
- data/lib/rover.rb +72 -33
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 171a06a05afa4ec4bb09efe7fef53b49220a3d4fa5352621112e29f2b70812b9
|
4
|
+
data.tar.gz: 435d8f3d4781f1960236f3c2b7f9fa2c4e38dfc987b53cb9fbe6351a9e8db4e9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9e5baa8cb051f7c7f06bbe0025ac4d923947b34768461a814e533aa78ec5d2d391a12edc6a9a64abc2fa9b1147255211ff26e8094cfaef67c9d70e393e57bcc0
|
7
|
+
data.tar.gz: 55f3438d438326c324c612a92b39a54698123889aeb28bafd93a196f2659208b4b72a1ac02efd407166fc56c8c9f3abff9472804ddfb231220b6adf41ff38df1
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,24 @@
|
|
1
|
+
## 0.3.0 (2022-04-03)
|
2
|
+
|
3
|
+
- Added `deep_dup` method to data frames
|
4
|
+
- Changed `:int` to `:int64`, `:uint` to `:uint64`, and `:float` to `:float64` for type methods
|
5
|
+
- Changed missing column to raise `KeyError` instead of `ArgumentError` for aggregate methods
|
6
|
+
- Changed passing too many headers to `read_csv` and `parse_csv` to raise `ArgumentError`
|
7
|
+
- Changed empty string in CSV headers to match behavior of `nil`
|
8
|
+
- Fixed `clone` and `dup` method for vectors
|
9
|
+
- Dropped support for Ruby < 2.7
|
10
|
+
|
11
|
+
## 0.2.8 (2022-03-15)
|
12
|
+
|
13
|
+
- Added `group` and `stacked` options to `plot`
|
14
|
+
- Improved performance of `read_csv` and `parse_csv`
|
15
|
+
|
16
|
+
## 0.2.7 (2022-01-16)
|
17
|
+
|
18
|
+
- Added support for booleans to Parquet methods
|
19
|
+
- Added support for creating data frames from `ActiveRecord::Result`
|
20
|
+
- Added `types` option to `read_parquet` and `parse_parquet` methods
|
21
|
+
|
1
22
|
## 0.2.6 (2021-10-27)
|
2
23
|
|
3
24
|
- Added support for `nil` headers to `read_csv` and `parse_csv`
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -13,7 +13,7 @@ Simple, powerful data frames for Ruby
|
|
13
13
|
Add this line to your application’s Gemfile:
|
14
14
|
|
15
15
|
```ruby
|
16
|
-
gem
|
16
|
+
gem "rover-df"
|
17
17
|
```
|
18
18
|
|
19
19
|
## Intro
|
@@ -61,7 +61,7 @@ Rover.read_csv("file.csv")
|
|
61
61
|
Rover.parse_csv("CSV,data,string")
|
62
62
|
```
|
63
63
|
|
64
|
-
From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
|
64
|
+
From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
|
65
65
|
|
66
66
|
```ruby
|
67
67
|
Rover.read_parquet("file.parquet")
|
@@ -236,7 +236,7 @@ df.group(:a).max(:b)
|
|
236
236
|
Multiple groups
|
237
237
|
|
238
238
|
```ruby
|
239
|
-
df.group([:a, :b]).count
|
239
|
+
df.group(:a, :b).count
|
240
240
|
```
|
241
241
|
|
242
242
|
## Visualization
|
@@ -244,7 +244,7 @@ df.group([:a, :b]).count
|
|
244
244
|
Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile:
|
245
245
|
|
246
246
|
```ruby
|
247
|
-
gem
|
247
|
+
gem "vega"
|
248
248
|
```
|
249
249
|
|
250
250
|
And use:
|
@@ -259,6 +259,18 @@ Specify the chart type (`line`, `pie`, `column`, `bar`, `area`, or `scatter`)
|
|
259
259
|
df.plot(:a, :b, type: "pie")
|
260
260
|
```
|
261
261
|
|
262
|
+
Group data
|
263
|
+
|
264
|
+
```ruby
|
265
|
+
df.plot(:a, :b, group: :c)
|
266
|
+
```
|
267
|
+
|
268
|
+
Stacked columns or bars
|
269
|
+
|
270
|
+
```ruby
|
271
|
+
df.plot(:a, :b, group: :c, stacked: true)
|
272
|
+
```
|
273
|
+
|
262
274
|
## Updating Data
|
263
275
|
|
264
276
|
Add a new column
|
@@ -401,7 +413,7 @@ CSV
|
|
401
413
|
df.to_csv
|
402
414
|
```
|
403
415
|
|
404
|
-
Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
|
416
|
+
Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
|
405
417
|
|
406
418
|
```ruby
|
407
419
|
df.to_parquet
|
@@ -412,22 +424,22 @@ df.to_parquet
|
|
412
424
|
You can specify column types when creating a data frame
|
413
425
|
|
414
426
|
```ruby
|
415
|
-
Rover::DataFrame.new(data, types: {"a" => :int, "b" => :float})
|
427
|
+
Rover::DataFrame.new(data, types: {"a" => :int64, "b" => :float64})
|
416
428
|
```
|
417
429
|
|
418
430
|
Or
|
419
431
|
|
420
432
|
```ruby
|
421
|
-
Rover.read_csv("data.csv", types: {"a" => :int, "b" => :float})
|
433
|
+
Rover.read_csv("data.csv", types: {"a" => :int64, "b" => :float64})
|
422
434
|
```
|
423
435
|
|
424
436
|
Supported types are:
|
425
437
|
|
426
|
-
- boolean -
|
427
|
-
- float -
|
428
|
-
- integer -
|
429
|
-
- unsigned integer -
|
430
|
-
- object -
|
438
|
+
- boolean - `:bool`
|
439
|
+
- float - `:float64`, `:float32`
|
440
|
+
- integer - `:int64`, `:int32`, `:int16`, `:int8`
|
441
|
+
- unsigned integer - `:uint64`, `:uint32`, `:uint16`, `:uint8`
|
442
|
+
- object - `:object`
|
431
443
|
|
432
444
|
Get column types
|
433
445
|
|
@@ -444,7 +456,7 @@ df[:a].type
|
|
444
456
|
Change the type of a column
|
445
457
|
|
446
458
|
```ruby
|
447
|
-
df[:a] = df[:a].to(:
|
459
|
+
df[:a] = df[:a].to(:int32)
|
448
460
|
```
|
449
461
|
|
450
462
|
## History
|
data/lib/rover/data_frame.rb
CHANGED
@@ -40,8 +40,8 @@ module Rover
|
|
40
40
|
vectors.each do |k, v|
|
41
41
|
@vectors[k] = to_vector(v, type: types[k])
|
42
42
|
end
|
43
|
-
elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
|
44
|
-
result = data.connection.select_all(data.all.to_sql)
|
43
|
+
elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base) || data.is_a?(ActiveRecord::Result))
|
44
|
+
result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.all.to_sql)
|
45
45
|
result.columns.each_with_index do |k, i|
|
46
46
|
@vectors[k] = to_vector(result.rows.map { |r| r[i] }, type: types[k])
|
47
47
|
end
|
@@ -72,7 +72,7 @@ module Rover
|
|
72
72
|
# multiple columns
|
73
73
|
df = DataFrame.new
|
74
74
|
where.each do |k|
|
75
|
-
check_column(k
|
75
|
+
check_column(k)
|
76
76
|
df[k] = @vectors[k]
|
77
77
|
end
|
78
78
|
df
|
@@ -102,7 +102,7 @@ module Rover
|
|
102
102
|
def []=(k, v)
|
103
103
|
check_key(k)
|
104
104
|
v = to_vector(v, size: size)
|
105
|
-
raise ArgumentError, "Size mismatch
|
105
|
+
raise ArgumentError, "Size mismatch (given #{v.size}, expected #{size})" if @vectors.any? && v.size != size
|
106
106
|
@vectors[k] = v
|
107
107
|
end
|
108
108
|
|
@@ -242,14 +242,16 @@ module Rover
|
|
242
242
|
types.each do |name, type|
|
243
243
|
schema[name] =
|
244
244
|
case type
|
245
|
-
when :int
|
245
|
+
when :int64
|
246
246
|
:int64
|
247
|
-
when :uint
|
247
|
+
when :uint64
|
248
248
|
:uint64
|
249
|
-
when :float
|
249
|
+
when :float64
|
250
250
|
:double
|
251
251
|
when :float32
|
252
252
|
:float
|
253
|
+
when :bool
|
254
|
+
:boolean
|
253
255
|
when :object
|
254
256
|
if @vectors[name].all? { |v| v.is_a?(String) }
|
255
257
|
:string
|
@@ -344,10 +346,10 @@ module Rover
|
|
344
346
|
end
|
345
347
|
end
|
346
348
|
|
347
|
-
def
|
349
|
+
def deep_dup
|
348
350
|
df = DataFrame.new
|
349
351
|
@vectors.each do |k, v|
|
350
|
-
df[k] = v
|
352
|
+
df[k] = v.dup
|
351
353
|
end
|
352
354
|
df
|
353
355
|
end
|
@@ -399,7 +401,7 @@ module Rover
|
|
399
401
|
keys.all? { |k| self[k].to_numo == other[k].to_numo }
|
400
402
|
end
|
401
403
|
|
402
|
-
def plot(x = nil, y = nil, type: nil)
|
404
|
+
def plot(x = nil, y = nil, type: nil, group: nil, stacked: nil)
|
403
405
|
require "vega"
|
404
406
|
|
405
407
|
raise ArgumentError, "Must specify columns" if keys.size != 2 && (!x || !y)
|
@@ -414,7 +416,7 @@ module Rover
|
|
414
416
|
raise "Cannot determine type. Use the type option."
|
415
417
|
end
|
416
418
|
end
|
417
|
-
data = self[[x, y]]
|
419
|
+
data = self[group.nil? ? [x, y] : [x, y, group]]
|
418
420
|
|
419
421
|
case type
|
420
422
|
when "line", "area"
|
@@ -428,16 +430,20 @@ module Rover
|
|
428
430
|
end
|
429
431
|
|
430
432
|
scale = x_type == "temporal" ? {type: "utc"} : {}
|
433
|
+
encoding = {
|
434
|
+
x: {field: x, type: x_type, scale: scale},
|
435
|
+
y: {field: y, type: "quantitative"}
|
436
|
+
}
|
437
|
+
encoding[:color] = {field: group} if group
|
431
438
|
|
432
439
|
Vega.lite
|
433
440
|
.data(data)
|
434
441
|
.mark(type: type, tooltip: true, interpolate: "cardinal", point: {size: 60})
|
435
|
-
.encoding(
|
436
|
-
x: {field: x, type: x_type, scale: scale},
|
437
|
-
y: {field: y, type: "quantitative"}
|
438
|
-
)
|
442
|
+
.encoding(encoding)
|
439
443
|
.config(axis: {labelFontSize: 12})
|
440
444
|
when "pie"
|
445
|
+
raise ArgumentError, "Cannot use group option with pie chart" unless group.nil?
|
446
|
+
|
441
447
|
Vega.lite
|
442
448
|
.data(data)
|
443
449
|
.mark(type: "arc", tooltip: true)
|
@@ -447,34 +453,48 @@ module Rover
|
|
447
453
|
)
|
448
454
|
.view(stroke: nil)
|
449
455
|
when "column"
|
456
|
+
encoding = {
|
457
|
+
x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
|
458
|
+
y: {field: y, type: "quantitative"}
|
459
|
+
}
|
460
|
+
if group
|
461
|
+
encoding[:color] = {field: group}
|
462
|
+
encoding[:xOffset] = {field: group} unless stacked
|
463
|
+
end
|
464
|
+
|
450
465
|
Vega.lite
|
451
466
|
.data(data)
|
452
467
|
.mark(type: "bar", tooltip: true)
|
453
|
-
.encoding(
|
454
|
-
# TODO determine label angle
|
455
|
-
x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
|
456
|
-
y: {field: y, type: "quantitative"}
|
457
|
-
)
|
468
|
+
.encoding(encoding)
|
458
469
|
.config(axis: {labelFontSize: 12})
|
459
470
|
when "bar"
|
471
|
+
encoding = {
|
472
|
+
# TODO determine label angle
|
473
|
+
y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
|
474
|
+
x: {field: y, type: "quantitative"}
|
475
|
+
}
|
476
|
+
if group
|
477
|
+
encoding[:color] = {field: group}
|
478
|
+
encoding[:yOffset] = {field: group} unless stacked
|
479
|
+
end
|
480
|
+
|
460
481
|
Vega.lite
|
461
482
|
.data(data)
|
462
483
|
.mark(type: "bar", tooltip: true)
|
463
|
-
.encoding(
|
464
|
-
# TODO determine label angle
|
465
|
-
y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
|
466
|
-
x: {field: y, type: "quantitative"}
|
467
|
-
)
|
484
|
+
.encoding(encoding)
|
468
485
|
.config(axis: {labelFontSize: 12})
|
469
486
|
when "scatter"
|
487
|
+
encoding = {
|
488
|
+
x: {field: x, type: "quantitative", scale: {zero: false}},
|
489
|
+
y: {field: y, type: "quantitative", scale: {zero: false}},
|
490
|
+
size: {value: 60}
|
491
|
+
}
|
492
|
+
encoding[:color] = {field: group} if group
|
493
|
+
|
470
494
|
Vega.lite
|
471
495
|
.data(data)
|
472
496
|
.mark(type: "circle", tooltip: true)
|
473
|
-
.encoding(
|
474
|
-
x: {field: x, type: "quantitative", scale: {zero: false}},
|
475
|
-
y: {field: y, type: "quantitative", scale: {zero: false}},
|
476
|
-
size: {value: 60}
|
477
|
-
)
|
497
|
+
.encoding(encoding)
|
478
498
|
.config(axis: {labelFontSize: 12})
|
479
499
|
else
|
480
500
|
raise ArgumentError, "Invalid type: #{type}"
|
@@ -483,8 +503,20 @@ module Rover
|
|
483
503
|
|
484
504
|
private
|
485
505
|
|
506
|
+
# for clone
|
507
|
+
def initialize_clone(_)
|
508
|
+
@vectors = @vectors.clone
|
509
|
+
super
|
510
|
+
end
|
511
|
+
|
512
|
+
# for dup
|
513
|
+
def initialize_dup(_)
|
514
|
+
@vectors = @vectors.dup
|
515
|
+
super
|
516
|
+
end
|
517
|
+
|
486
518
|
def check_key(key)
|
487
|
-
raise ArgumentError, "Key must be a
|
519
|
+
raise ArgumentError, "Key must be a String or Symbol, given #{key.class.name}" unless key.is_a?(String) || key.is_a?(Symbol)
|
488
520
|
end
|
489
521
|
|
490
522
|
# TODO make more efficient
|
@@ -545,19 +577,9 @@ module Rover
|
|
545
577
|
raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
|
546
578
|
end
|
547
579
|
|
548
|
-
|
549
|
-
# always use did_you_mean
|
550
|
-
def check_column(key, did_you_mean = false)
|
580
|
+
def check_column(key)
|
551
581
|
unless include?(key)
|
552
|
-
|
553
|
-
if RUBY_VERSION.to_f >= 2.6
|
554
|
-
raise KeyError.new("Missing column: #{key}", receiver: self, key: key)
|
555
|
-
else
|
556
|
-
raise KeyError.new("Missing column: #{key}")
|
557
|
-
end
|
558
|
-
else
|
559
|
-
raise ArgumentError, "Missing column: #{key}"
|
560
|
-
end
|
582
|
+
raise KeyError.new("Missing column: #{key}", receiver: self, key: key)
|
561
583
|
end
|
562
584
|
end
|
563
585
|
|
data/lib/rover/group.rb
CHANGED
@@ -3,6 +3,7 @@ module Rover
|
|
3
3
|
def initialize(df, columns)
|
4
4
|
@df = df
|
5
5
|
@columns = columns
|
6
|
+
check_columns
|
6
7
|
end
|
7
8
|
|
8
9
|
def group(*columns)
|
@@ -22,16 +23,20 @@ module Rover
|
|
22
23
|
end
|
23
24
|
end
|
24
25
|
|
26
|
+
def plot(*args, **options)
|
27
|
+
raise ArgumentError, "Multiple groups not supported" if @columns.size > 1
|
28
|
+
# same message as Ruby
|
29
|
+
raise ArgumentError, "unknown keyword: :group" if options.key?(:group)
|
30
|
+
|
31
|
+
@df.plot(*args, **options, group: @columns.first)
|
32
|
+
end
|
33
|
+
|
25
34
|
private
|
26
35
|
|
27
36
|
# TODO make more efficient
|
28
37
|
def grouped_dfs
|
29
38
|
# cache here so we can reuse for multiple calcuations if needed
|
30
39
|
@grouped_dfs ||= begin
|
31
|
-
raise ArgumentError, "No columns given" if @columns.empty?
|
32
|
-
missing_keys = @columns - @df.keys
|
33
|
-
raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
|
34
|
-
|
35
40
|
groups = Hash.new { |hash, key| hash[key] = [] }
|
36
41
|
i = 0
|
37
42
|
@df.each_row do |row|
|
@@ -46,5 +51,12 @@ module Rover
|
|
46
51
|
result
|
47
52
|
end
|
48
53
|
end
|
54
|
+
|
55
|
+
def check_columns
|
56
|
+
raise ArgumentError, "No columns given" if @columns.empty?
|
57
|
+
|
58
|
+
missing_keys = @columns - @df.keys
|
59
|
+
raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
|
60
|
+
end
|
49
61
|
end
|
50
62
|
end
|
data/lib/rover/vector.rb
CHANGED
@@ -1,23 +1,23 @@
|
|
1
1
|
module Rover
|
2
2
|
class Vector
|
3
3
|
# if a user never specifies types,
|
4
|
-
# the defaults are bool,
|
5
|
-
# keep these simple
|
6
|
-
#
|
7
|
-
# we could create aliases for float64, int64, uint64
|
8
|
-
# if so, type should still return the simple type
|
4
|
+
# the defaults are bool, float64, int64, and object
|
9
5
|
TYPE_CAST_MAPPING = {
|
10
6
|
bool: Numo::Bit,
|
11
7
|
float32: Numo::SFloat,
|
12
|
-
|
8
|
+
float64: Numo::DFloat,
|
13
9
|
int8: Numo::Int8,
|
14
10
|
int16: Numo::Int16,
|
15
11
|
int32: Numo::Int32,
|
16
|
-
|
12
|
+
int64: Numo::Int64,
|
17
13
|
object: Numo::RObject,
|
18
14
|
uint8: Numo::UInt8,
|
19
15
|
uint16: Numo::UInt16,
|
20
16
|
uint32: Numo::UInt32,
|
17
|
+
uint64: Numo::UInt64,
|
18
|
+
# legacy - must come last
|
19
|
+
float: Numo::DFloat,
|
20
|
+
int: Numo::Int64,
|
21
21
|
uint: Numo::UInt64
|
22
22
|
}
|
23
23
|
|
@@ -333,6 +333,18 @@ module Rover
|
|
333
333
|
|
334
334
|
private
|
335
335
|
|
336
|
+
# for clone
|
337
|
+
def initialize_clone(_)
|
338
|
+
@data = @data.clone
|
339
|
+
super
|
340
|
+
end
|
341
|
+
|
342
|
+
# for dup
|
343
|
+
def initialize_dup(_)
|
344
|
+
@data = @data.dup
|
345
|
+
super
|
346
|
+
end
|
347
|
+
|
336
348
|
def cast_data(data, type: nil)
|
337
349
|
numo_type = numo_type(type) if type
|
338
350
|
|
@@ -359,6 +371,7 @@ module Rover
|
|
359
371
|
data = data.to_a
|
360
372
|
|
361
373
|
if type
|
374
|
+
data = data.map { |v| v || Float::NAN } if [:float, :float32].include?(type)
|
362
375
|
data = numo_type.cast(data)
|
363
376
|
else
|
364
377
|
data =
|
data/lib/rover/version.rb
CHANGED
data/lib/rover.rb
CHANGED
@@ -9,47 +9,58 @@ require "rover/version"
|
|
9
9
|
|
10
10
|
module Rover
|
11
11
|
class << self
|
12
|
-
def read_csv(path,
|
13
|
-
|
14
|
-
|
12
|
+
def read_csv(path, **options)
|
13
|
+
csv_to_df(**options) do |csv_options|
|
14
|
+
CSV.read(path, **csv_options)
|
15
|
+
end
|
15
16
|
end
|
16
17
|
|
17
|
-
def parse_csv(str,
|
18
|
-
|
19
|
-
|
18
|
+
def parse_csv(str, **options)
|
19
|
+
csv_to_df(**options) do |csv_options|
|
20
|
+
CSV.parse(str, **csv_options)
|
21
|
+
end
|
20
22
|
end
|
21
23
|
|
22
|
-
def read_parquet(path)
|
23
|
-
|
24
|
-
|
24
|
+
def read_parquet(path, **options)
|
25
|
+
parquet_to_df(**options) do
|
26
|
+
Arrow::Table.load(path)
|
27
|
+
end
|
25
28
|
end
|
26
29
|
|
27
|
-
def parse_parquet(str)
|
28
|
-
|
29
|
-
|
30
|
+
def parse_parquet(str, **options)
|
31
|
+
parquet_to_df(**options) do
|
32
|
+
Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet)
|
33
|
+
end
|
30
34
|
end
|
31
35
|
|
32
36
|
private
|
33
37
|
|
34
|
-
|
35
|
-
|
36
|
-
options = {headers: true, converters: :numeric}.merge(options)
|
37
|
-
raise ArgumentError, "Must specify headers" unless options[:headers]
|
38
|
-
options
|
39
|
-
end
|
38
|
+
def csv_to_df(types: nil, headers: nil, **csv_options)
|
39
|
+
require "csv"
|
40
40
|
|
41
|
-
|
42
|
-
|
43
|
-
|
41
|
+
raise ArgumentError, "Must specify headers" if headers == false
|
42
|
+
|
43
|
+
# TODO use date converter in 0.4.0 - need to test performance
|
44
|
+
table = yield({converters: :numeric}.merge(csv_options))
|
45
|
+
|
46
|
+
headers = nil if headers == true
|
47
|
+
if headers && table.first && headers.size != table.first.size
|
48
|
+
raise ArgumentError, "Expected #{table.first.size} headers, given #{headers.size}"
|
49
|
+
end
|
50
|
+
|
51
|
+
table_headers = (headers || table.shift || []).dup
|
52
|
+
# keep same behavior as headers: true
|
53
|
+
if table.first
|
54
|
+
while table_headers.size < table.first.size
|
55
|
+
table_headers << nil
|
56
|
+
end
|
44
57
|
end
|
45
58
|
|
46
|
-
table.by_col!
|
47
59
|
data = {}
|
48
|
-
keys =
|
60
|
+
keys = table_headers.map { |k| [k, true] }.to_h
|
49
61
|
unnamed_suffix = 1
|
50
|
-
|
51
|
-
|
52
|
-
if k.nil?
|
62
|
+
table_headers.each_with_index do |k, i|
|
63
|
+
if k.nil? || k.empty?
|
53
64
|
k = "unnamed"
|
54
65
|
while keys.include?(k)
|
55
66
|
unnamed_suffix += 1
|
@@ -57,13 +68,25 @@ module Rover
|
|
57
68
|
end
|
58
69
|
keys[k] = true
|
59
70
|
end
|
60
|
-
|
71
|
+
table_headers[i] = k
|
72
|
+
end
|
73
|
+
|
74
|
+
table_headers.each_with_index do |k, i|
|
75
|
+
# use first value for duplicate headers like headers: true
|
76
|
+
next if data[k]
|
77
|
+
|
78
|
+
values = []
|
79
|
+
table.each do |row|
|
80
|
+
values << row[i]
|
81
|
+
end
|
82
|
+
data[k] = values
|
61
83
|
end
|
62
84
|
|
63
85
|
DataFrame.new(data, types: types)
|
64
86
|
end
|
65
87
|
|
66
88
|
PARQUET_TYPE_MAPPING = {
|
89
|
+
"bool" => Numo::Bit,
|
67
90
|
"float" => Numo::SFloat,
|
68
91
|
"double" => Numo::DFloat,
|
69
92
|
"int8" => Numo::Int8,
|
@@ -77,15 +100,31 @@ module Rover
|
|
77
100
|
"uint64" => Numo::UInt64
|
78
101
|
}
|
79
102
|
|
80
|
-
def parquet_to_df(
|
103
|
+
def parquet_to_df(types: nil)
|
104
|
+
require "parquet"
|
105
|
+
|
106
|
+
table = yield
|
81
107
|
data = {}
|
108
|
+
types ||= {}
|
82
109
|
table.each_column do |column|
|
83
110
|
k = column.field.name
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
111
|
+
if types[k]
|
112
|
+
data[k] = Vector.new(column.data.values, type: types[k])
|
113
|
+
else
|
114
|
+
type = column.field.data_type.to_s
|
115
|
+
numo_type = PARQUET_TYPE_MAPPING[type]
|
116
|
+
raise "Unknown type: #{type}" unless numo_type
|
117
|
+
|
118
|
+
# TODO automatic conversion?
|
119
|
+
# int => float
|
120
|
+
# bool => object
|
121
|
+
if (type.include?("int") || type == "bool") && column.n_nulls > 0
|
122
|
+
raise "Nulls not supported for #{type} column: #{k}"
|
123
|
+
end
|
124
|
+
|
125
|
+
# TODO improve performance
|
126
|
+
data[k] = numo_type.cast(column.data.values)
|
127
|
+
end
|
89
128
|
end
|
90
129
|
DataFrame.new(data)
|
91
130
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rover-df
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.6
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-04-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -51,14 +51,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
51
51
|
requirements:
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '2.
|
54
|
+
version: '2.7'
|
55
55
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
56
56
|
requirements:
|
57
57
|
- - ">="
|
58
58
|
- !ruby/object:Gem::Version
|
59
59
|
version: '0'
|
60
60
|
requirements: []
|
61
|
-
rubygems_version: 3.
|
61
|
+
rubygems_version: 3.3.7
|
62
62
|
signing_key:
|
63
63
|
specification_version: 4
|
64
64
|
summary: Simple, powerful data frames for Ruby
|