rover-df 0.2.3 → 0.2.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/LICENSE.txt +1 -1
- data/README.md +40 -2
- data/lib/rover/data_frame.rb +60 -7
- data/lib/rover/group.rb +1 -1
- data/lib/rover/vector.rb +6 -1
- data/lib/rover/version.rb +1 -1
- data/lib/rover.rb +63 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c75bed3989211e806e54c296290e5f7b3af236a15742daac876e211e3ca5a76f
|
4
|
+
data.tar.gz: 5865ff8f1d0036423f18cfee867da63214ee50f79d373b0f0f244853d8efbefa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 11718bc8ade75a605e92cabe05c29e55c6d4dfe427cd5ada0a8a216db678b32a88f4a43843d1e7dcda7b7a64adb63b76969f1d958e91ca57c4f71989632e14aa
|
7
|
+
data.tar.gz: 16940236090625bef69cb14d6d9f9f50720314edea1b5892f60443799e5389700ddfb0d79a29ee1e193168097add9d7195799e7f049d85f9c9dc9c443843a678
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,25 @@
|
|
1
|
+
## 0.2.7 (2022-01-16)
|
2
|
+
|
3
|
+
- Added support for booleans to Parquet methods
|
4
|
+
- Added support for creating data frames from `ActiveRecord::Result`
|
5
|
+
- Added `types` option to `read_parquet` and `parse_parquet` methods
|
6
|
+
|
7
|
+
## 0.2.6 (2021-10-27)
|
8
|
+
|
9
|
+
- Added support for `nil` headers to `read_csv` and `parse_csv`
|
10
|
+
- Added `read_parquet`, `parse_parquet`, and `to_parquet` methods
|
11
|
+
|
12
|
+
## 0.2.5 (2021-09-25)
|
13
|
+
|
14
|
+
- Fixed column types with joins
|
15
|
+
|
16
|
+
## 0.2.4 (2021-06-03)
|
17
|
+
|
18
|
+
- Added grouping for `std` and `var`
|
19
|
+
- Fixed `==` for data frames
|
20
|
+
- Fixed error with `first` and `last` for data frames
|
21
|
+
- Fixed error with `last` when vector size is smaller than `n`
|
22
|
+
|
1
23
|
## 0.2.3 (2021-02-08)
|
2
24
|
|
3
25
|
- Added `select`, `reject`, and `map!` methods to vectors
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -20,7 +20,7 @@ gem 'rover-df'
|
|
20
20
|
|
21
21
|
A data frame is an in-memory table. It’s a useful data structure for data analysis and machine learning. It uses columnar storage for fast operations on columns.
|
22
22
|
|
23
|
-
Try it out for forecasting by clicking the button below:
|
23
|
+
Try it out for forecasting by clicking the button below (it can take a few minutes to start):
|
24
24
|
|
25
25
|
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ankane/ml-stack/master?filepath=Forecasting.ipynb)
|
26
26
|
|
@@ -61,6 +61,14 @@ Rover.read_csv("file.csv")
|
|
61
61
|
Rover.parse_csv("CSV,data,string")
|
62
62
|
```
|
63
63
|
|
64
|
+
From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
Rover.read_parquet("file.parquet")
|
68
|
+
# or
|
69
|
+
Rover.parse_parquet("PAR1...")
|
70
|
+
```
|
71
|
+
|
64
72
|
## Attributes
|
65
73
|
|
66
74
|
Get number of rows
|
@@ -89,7 +97,7 @@ Select a column
|
|
89
97
|
df[:a]
|
90
98
|
```
|
91
99
|
|
92
|
-
> Note that strings and symbols are different keys, just like hashes
|
100
|
+
> Note that strings and symbols are different keys, just like hashes. Creating a data frame from Active Record, a CSV, or Parquet uses strings.
|
93
101
|
|
94
102
|
Select multiple columns
|
95
103
|
|
@@ -123,6 +131,20 @@ df[1..3]
|
|
123
131
|
df[[1, 4, 5]]
|
124
132
|
```
|
125
133
|
|
134
|
+
Iterate over rows
|
135
|
+
|
136
|
+
```ruby
|
137
|
+
df.each_row { |row| ... }
|
138
|
+
```
|
139
|
+
|
140
|
+
Iterate over a column
|
141
|
+
|
142
|
+
```ruby
|
143
|
+
df[:a].each { |item| ... }
|
144
|
+
# or
|
145
|
+
df[:a].each_with_index { |item, index| ... }
|
146
|
+
```
|
147
|
+
|
126
148
|
## Filtering
|
127
149
|
|
128
150
|
Filter on a condition
|
@@ -181,6 +203,8 @@ df[:a].median
|
|
181
203
|
df[:a].percentile(90)
|
182
204
|
df[:a].min
|
183
205
|
df[:a].max
|
206
|
+
df[:a].std
|
207
|
+
df[:a].var
|
184
208
|
```
|
185
209
|
|
186
210
|
Count occurrences
|
@@ -259,6 +283,14 @@ df[:a][0..2] = 1
|
|
259
283
|
df[:a][0..2] = [1, 2, 3]
|
260
284
|
```
|
261
285
|
|
286
|
+
Update all elements
|
287
|
+
|
288
|
+
```ruby
|
289
|
+
df[:a] = df[:a].map { |v| v.gsub("a", "b") }
|
290
|
+
# or
|
291
|
+
df[:a].map! { |v| v.gsub("a", "b") }
|
292
|
+
```
|
293
|
+
|
262
294
|
Update elements matching a condition
|
263
295
|
|
264
296
|
```ruby
|
@@ -369,6 +401,12 @@ CSV
|
|
369
401
|
df.to_csv
|
370
402
|
```
|
371
403
|
|
404
|
+
Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
|
405
|
+
|
406
|
+
```ruby
|
407
|
+
df.to_parquet
|
408
|
+
```
|
409
|
+
|
372
410
|
## Types
|
373
411
|
|
374
412
|
You can specify column types when creating a data frame
|
data/lib/rover/data_frame.rb
CHANGED
@@ -40,8 +40,8 @@ module Rover
|
|
40
40
|
vectors.each do |k, v|
|
41
41
|
@vectors[k] = to_vector(v, type: types[k])
|
42
42
|
end
|
43
|
-
elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
|
44
|
-
result = data.connection.select_all(data.all.to_sql)
|
43
|
+
elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base) || data.is_a?(ActiveRecord::Result))
|
44
|
+
result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.all.to_sql)
|
45
45
|
result.columns.each_with_index do |k, i|
|
46
46
|
@vectors[k] = to_vector(result.rows.map { |r| r[i] }, type: types[k])
|
47
47
|
end
|
@@ -163,7 +163,7 @@ module Rover
|
|
163
163
|
last(n)
|
164
164
|
end
|
165
165
|
|
166
|
-
def first(n =
|
166
|
+
def first(n = 1)
|
167
167
|
new_vectors = {}
|
168
168
|
@vectors.each do |k, v|
|
169
169
|
new_vectors[k] = v.first(n)
|
@@ -171,7 +171,7 @@ module Rover
|
|
171
171
|
DataFrame.new(new_vectors)
|
172
172
|
end
|
173
173
|
|
174
|
-
def last(n =
|
174
|
+
def last(n = 1)
|
175
175
|
new_vectors = {}
|
176
176
|
@vectors.each do |k, v|
|
177
177
|
new_vectors[k] = v.last(n)
|
@@ -235,6 +235,44 @@ module Rover
|
|
235
235
|
end
|
236
236
|
end
|
237
237
|
|
238
|
+
def to_parquet
|
239
|
+
require "parquet"
|
240
|
+
|
241
|
+
schema = {}
|
242
|
+
types.each do |name, type|
|
243
|
+
schema[name] =
|
244
|
+
case type
|
245
|
+
when :int
|
246
|
+
:int64
|
247
|
+
when :uint
|
248
|
+
:uint64
|
249
|
+
when :float
|
250
|
+
:double
|
251
|
+
when :float32
|
252
|
+
:float
|
253
|
+
when :bool
|
254
|
+
:boolean
|
255
|
+
when :object
|
256
|
+
if @vectors[name].all? { |v| v.is_a?(String) }
|
257
|
+
:string
|
258
|
+
else
|
259
|
+
raise "Unknown type"
|
260
|
+
end
|
261
|
+
else
|
262
|
+
type
|
263
|
+
end
|
264
|
+
end
|
265
|
+
# TODO improve performance
|
266
|
+
raw_records = []
|
267
|
+
size.times do |i|
|
268
|
+
raw_records << @vectors.map { |_, v| v[i] }
|
269
|
+
end
|
270
|
+
table = Arrow::Table.new(schema, raw_records)
|
271
|
+
buffer = Arrow::ResizableBuffer.new(1024)
|
272
|
+
table.save(buffer, format: :parquet)
|
273
|
+
buffer.data.to_s
|
274
|
+
end
|
275
|
+
|
238
276
|
# for IRuby
|
239
277
|
def to_html
|
240
278
|
require "iruby"
|
@@ -301,7 +339,7 @@ module Rover
|
|
301
339
|
Group.new(self, columns.flatten)
|
302
340
|
end
|
303
341
|
|
304
|
-
[:max, :min, :median, :mean, :percentile, :sum].each do |name|
|
342
|
+
[:max, :min, :median, :mean, :percentile, :sum, :std, :var].each do |name|
|
305
343
|
define_method(name) do |column, *args|
|
306
344
|
check_column(column)
|
307
345
|
self[column].send(name, *args)
|
@@ -360,7 +398,7 @@ module Rover
|
|
360
398
|
def ==(other)
|
361
399
|
size == other.size &&
|
362
400
|
keys == other.keys &&
|
363
|
-
keys.all? { |k| self[k] == other[k] }
|
401
|
+
keys.all? { |k| self[k].to_numo == other[k].to_numo }
|
364
402
|
end
|
365
403
|
|
366
404
|
def plot(x = nil, y = nil, type: nil)
|
@@ -475,10 +513,12 @@ module Rover
|
|
475
513
|
|
476
514
|
left = how == "left"
|
477
515
|
|
516
|
+
types = {}
|
478
517
|
vectors = {}
|
479
518
|
keys = (self.keys + other.keys).uniq
|
480
519
|
keys.each do |k|
|
481
520
|
vectors[k] = []
|
521
|
+
types[k] = join_type(self.types[k], other.types[k])
|
482
522
|
end
|
483
523
|
|
484
524
|
each_row do |r|
|
@@ -498,7 +538,7 @@ module Rover
|
|
498
538
|
end
|
499
539
|
end
|
500
540
|
|
501
|
-
DataFrame.new(vectors)
|
541
|
+
DataFrame.new(vectors, types: types)
|
502
542
|
end
|
503
543
|
|
504
544
|
def check_join_keys(df, keys)
|
@@ -523,6 +563,19 @@ module Rover
|
|
523
563
|
end
|
524
564
|
end
|
525
565
|
|
566
|
+
def join_type(a, b)
|
567
|
+
if a.nil?
|
568
|
+
b
|
569
|
+
elsif b.nil?
|
570
|
+
a
|
571
|
+
elsif a == b
|
572
|
+
a
|
573
|
+
else
|
574
|
+
# TODO specify
|
575
|
+
nil
|
576
|
+
end
|
577
|
+
end
|
578
|
+
|
526
579
|
def to_vector(v, size: nil, type: nil)
|
527
580
|
if v.is_a?(Vector)
|
528
581
|
v = v.to(type) if type && v.type != type
|
data/lib/rover/group.rb
CHANGED
@@ -9,7 +9,7 @@ module Rover
|
|
9
9
|
Group.new(@df, @columns + columns.flatten)
|
10
10
|
end
|
11
11
|
|
12
|
-
[:count, :max, :min, :mean, :median, :percentile, :sum].each do |name|
|
12
|
+
[:count, :max, :min, :mean, :median, :percentile, :sum, :std, :var].each do |name|
|
13
13
|
define_method(name) do |*args|
|
14
14
|
n = [name, args.first].compact.join("_")
|
15
15
|
|
data/lib/rover/vector.rb
CHANGED
@@ -263,7 +263,11 @@ module Rover
|
|
263
263
|
end
|
264
264
|
|
265
265
|
def last(n = 1)
|
266
|
-
|
266
|
+
if n >= size
|
267
|
+
Vector.new(@data)
|
268
|
+
else
|
269
|
+
Vector.new(@data[-n..-1])
|
270
|
+
end
|
267
271
|
end
|
268
272
|
|
269
273
|
def take(n)
|
@@ -355,6 +359,7 @@ module Rover
|
|
355
359
|
data = data.to_a
|
356
360
|
|
357
361
|
if type
|
362
|
+
data = data.map { |v| v || Float::NAN } if [:float, :float32].include?(type)
|
358
363
|
data = numo_type.cast(data)
|
359
364
|
else
|
360
365
|
data =
|
data/lib/rover/version.rb
CHANGED
data/lib/rover.rb
CHANGED
@@ -19,6 +19,16 @@ module Rover
|
|
19
19
|
csv_to_df(CSV.parse(str, **csv_options(options)), types: types, headers: options[:headers])
|
20
20
|
end
|
21
21
|
|
22
|
+
def read_parquet(path, types: nil)
|
23
|
+
require "parquet"
|
24
|
+
parquet_to_df(Arrow::Table.load(path), types: types)
|
25
|
+
end
|
26
|
+
|
27
|
+
def parse_parquet(str, types: nil)
|
28
|
+
require "parquet"
|
29
|
+
parquet_to_df(Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet), types: types)
|
30
|
+
end
|
31
|
+
|
22
32
|
private
|
23
33
|
|
24
34
|
# TODO use date converter
|
@@ -35,10 +45,63 @@ module Rover
|
|
35
45
|
|
36
46
|
table.by_col!
|
37
47
|
data = {}
|
48
|
+
keys = table.map { |k, _| [k, true] }.to_h
|
49
|
+
unnamed_suffix = 1
|
38
50
|
table.each do |k, v|
|
51
|
+
# TODO do same for empty string in 0.3.0
|
52
|
+
if k.nil?
|
53
|
+
k = "unnamed"
|
54
|
+
while keys.include?(k)
|
55
|
+
unnamed_suffix += 1
|
56
|
+
k = "unnamed#{unnamed_suffix}"
|
57
|
+
end
|
58
|
+
keys[k] = true
|
59
|
+
end
|
39
60
|
data[k] = v
|
40
61
|
end
|
62
|
+
|
41
63
|
DataFrame.new(data, types: types)
|
42
64
|
end
|
65
|
+
|
66
|
+
PARQUET_TYPE_MAPPING = {
|
67
|
+
"bool" => Numo::Bit,
|
68
|
+
"float" => Numo::SFloat,
|
69
|
+
"double" => Numo::DFloat,
|
70
|
+
"int8" => Numo::Int8,
|
71
|
+
"int16" => Numo::Int16,
|
72
|
+
"int32" => Numo::Int32,
|
73
|
+
"int64" => Numo::Int64,
|
74
|
+
"string" => Numo::RObject,
|
75
|
+
"uint8" => Numo::UInt8,
|
76
|
+
"uint16" => Numo::UInt16,
|
77
|
+
"uint32" => Numo::UInt32,
|
78
|
+
"uint64" => Numo::UInt64
|
79
|
+
}
|
80
|
+
|
81
|
+
def parquet_to_df(table, types: nil)
|
82
|
+
data = {}
|
83
|
+
types ||= {}
|
84
|
+
table.each_column do |column|
|
85
|
+
k = column.field.name
|
86
|
+
if types[k]
|
87
|
+
data[k] = Vector.new(column.data.values, type: types[k])
|
88
|
+
else
|
89
|
+
type = column.field.data_type.to_s
|
90
|
+
numo_type = PARQUET_TYPE_MAPPING[type]
|
91
|
+
raise "Unknown type: #{type}" unless numo_type
|
92
|
+
|
93
|
+
# TODO automatic conversion?
|
94
|
+
# int => float
|
95
|
+
# bool => object
|
96
|
+
if (type.include?("int") || type == "bool") && column.n_nulls > 0
|
97
|
+
raise "Nulls not supported for #{type} column: #{k}"
|
98
|
+
end
|
99
|
+
|
100
|
+
# TODO improve performance
|
101
|
+
data[k] = numo_type.cast(column.data.values)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
DataFrame.new(data)
|
105
|
+
end
|
43
106
|
end
|
44
107
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rover-df
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.2.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-02-08 00:00:00.000000000 Z
|
11
|
+
date: 2022-01-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -58,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
58
58
|
- !ruby/object:Gem::Version
|
59
59
|
version: '0'
|
60
60
|
requirements: []
|
61
|
-
rubygems_version: 3.
|
61
|
+
rubygems_version: 3.3.3
|
62
62
|
signing_key:
|
63
63
|
specification_version: 4
|
64
64
|
summary: Simple, powerful data frames for Ruby
|