rover-df 0.2.3 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/LICENSE.txt +1 -1
- data/README.md +40 -2
- data/lib/rover/data_frame.rb +60 -7
- data/lib/rover/group.rb +1 -1
- data/lib/rover/vector.rb +6 -1
- data/lib/rover/version.rb +1 -1
- data/lib/rover.rb +63 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c75bed3989211e806e54c296290e5f7b3af236a15742daac876e211e3ca5a76f
|
4
|
+
data.tar.gz: 5865ff8f1d0036423f18cfee867da63214ee50f79d373b0f0f244853d8efbefa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 11718bc8ade75a605e92cabe05c29e55c6d4dfe427cd5ada0a8a216db678b32a88f4a43843d1e7dcda7b7a64adb63b76969f1d958e91ca57c4f71989632e14aa
|
7
|
+
data.tar.gz: 16940236090625bef69cb14d6d9f9f50720314edea1b5892f60443799e5389700ddfb0d79a29ee1e193168097add9d7195799e7f049d85f9c9dc9c443843a678
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,25 @@
|
|
1
|
+
## 0.2.7 (2022-01-16)
|
2
|
+
|
3
|
+
- Added support for booleans to Parquet methods
|
4
|
+
- Added support for creating data frames from `ActiveRecord::Result`
|
5
|
+
- Added `types` option to `read_parquet` and `parse_parquet` methods
|
6
|
+
|
7
|
+
## 0.2.6 (2021-10-27)
|
8
|
+
|
9
|
+
- Added support for `nil` headers to `read_csv` and `parse_csv`
|
10
|
+
- Added `read_parquet`, `parse_parquet`, and `to_parquet` methods
|
11
|
+
|
12
|
+
## 0.2.5 (2021-09-25)
|
13
|
+
|
14
|
+
- Fixed column types with joins
|
15
|
+
|
16
|
+
## 0.2.4 (2021-06-03)
|
17
|
+
|
18
|
+
- Added grouping for `std` and `var`
|
19
|
+
- Fixed `==` for data frames
|
20
|
+
- Fixed error with `first` and `last` for data frames
|
21
|
+
- Fixed error with `last` when vector size is smaller than `n`
|
22
|
+
|
1
23
|
## 0.2.3 (2021-02-08)
|
2
24
|
|
3
25
|
- Added `select`, `reject`, and `map!` methods to vectors
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -20,7 +20,7 @@ gem 'rover-df'
|
|
20
20
|
|
21
21
|
A data frame is an in-memory table. It’s a useful data structure for data analysis and machine learning. It uses columnar storage for fast operations on columns.
|
22
22
|
|
23
|
-
Try it out for forecasting by clicking the button below:
|
23
|
+
Try it out for forecasting by clicking the button below (it can take a few minutes to start):
|
24
24
|
|
25
25
|
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ankane/ml-stack/master?filepath=Forecasting.ipynb)
|
26
26
|
|
@@ -61,6 +61,14 @@ Rover.read_csv("file.csv")
|
|
61
61
|
Rover.parse_csv("CSV,data,string")
|
62
62
|
```
|
63
63
|
|
64
|
+
From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
Rover.read_parquet("file.parquet")
|
68
|
+
# or
|
69
|
+
Rover.parse_parquet("PAR1...")
|
70
|
+
```
|
71
|
+
|
64
72
|
## Attributes
|
65
73
|
|
66
74
|
Get number of rows
|
@@ -89,7 +97,7 @@ Select a column
|
|
89
97
|
df[:a]
|
90
98
|
```
|
91
99
|
|
92
|
-
> Note that strings and symbols are different keys, just like hashes
|
100
|
+
> Note that strings and symbols are different keys, just like hashes. Creating a data frame from Active Record, a CSV, or Parquet uses strings.
|
93
101
|
|
94
102
|
Select multiple columns
|
95
103
|
|
@@ -123,6 +131,20 @@ df[1..3]
|
|
123
131
|
df[[1, 4, 5]]
|
124
132
|
```
|
125
133
|
|
134
|
+
Iterate over rows
|
135
|
+
|
136
|
+
```ruby
|
137
|
+
df.each_row { |row| ... }
|
138
|
+
```
|
139
|
+
|
140
|
+
Iterate over a column
|
141
|
+
|
142
|
+
```ruby
|
143
|
+
df[:a].each { |item| ... }
|
144
|
+
# or
|
145
|
+
df[:a].each_with_index { |item, index| ... }
|
146
|
+
```
|
147
|
+
|
126
148
|
## Filtering
|
127
149
|
|
128
150
|
Filter on a condition
|
@@ -181,6 +203,8 @@ df[:a].median
|
|
181
203
|
df[:a].percentile(90)
|
182
204
|
df[:a].min
|
183
205
|
df[:a].max
|
206
|
+
df[:a].std
|
207
|
+
df[:a].var
|
184
208
|
```
|
185
209
|
|
186
210
|
Count occurrences
|
@@ -259,6 +283,14 @@ df[:a][0..2] = 1
|
|
259
283
|
df[:a][0..2] = [1, 2, 3]
|
260
284
|
```
|
261
285
|
|
286
|
+
Update all elements
|
287
|
+
|
288
|
+
```ruby
|
289
|
+
df[:a] = df[:a].map { |v| v.gsub("a", "b") }
|
290
|
+
# or
|
291
|
+
df[:a].map! { |v| v.gsub("a", "b") }
|
292
|
+
```
|
293
|
+
|
262
294
|
Update elements matching a condition
|
263
295
|
|
264
296
|
```ruby
|
@@ -369,6 +401,12 @@ CSV
|
|
369
401
|
df.to_csv
|
370
402
|
```
|
371
403
|
|
404
|
+
Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
|
405
|
+
|
406
|
+
```ruby
|
407
|
+
df.to_parquet
|
408
|
+
```
|
409
|
+
|
372
410
|
## Types
|
373
411
|
|
374
412
|
You can specify column types when creating a data frame
|
data/lib/rover/data_frame.rb
CHANGED
@@ -40,8 +40,8 @@ module Rover
|
|
40
40
|
vectors.each do |k, v|
|
41
41
|
@vectors[k] = to_vector(v, type: types[k])
|
42
42
|
end
|
43
|
-
elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
|
44
|
-
result = data.connection.select_all(data.all.to_sql)
|
43
|
+
elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base) || data.is_a?(ActiveRecord::Result))
|
44
|
+
result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.all.to_sql)
|
45
45
|
result.columns.each_with_index do |k, i|
|
46
46
|
@vectors[k] = to_vector(result.rows.map { |r| r[i] }, type: types[k])
|
47
47
|
end
|
@@ -163,7 +163,7 @@ module Rover
|
|
163
163
|
last(n)
|
164
164
|
end
|
165
165
|
|
166
|
-
def first(n =
|
166
|
+
def first(n = 1)
|
167
167
|
new_vectors = {}
|
168
168
|
@vectors.each do |k, v|
|
169
169
|
new_vectors[k] = v.first(n)
|
@@ -171,7 +171,7 @@ module Rover
|
|
171
171
|
DataFrame.new(new_vectors)
|
172
172
|
end
|
173
173
|
|
174
|
-
def last(n =
|
174
|
+
def last(n = 1)
|
175
175
|
new_vectors = {}
|
176
176
|
@vectors.each do |k, v|
|
177
177
|
new_vectors[k] = v.last(n)
|
@@ -235,6 +235,44 @@ module Rover
|
|
235
235
|
end
|
236
236
|
end
|
237
237
|
|
238
|
+
def to_parquet
|
239
|
+
require "parquet"
|
240
|
+
|
241
|
+
schema = {}
|
242
|
+
types.each do |name, type|
|
243
|
+
schema[name] =
|
244
|
+
case type
|
245
|
+
when :int
|
246
|
+
:int64
|
247
|
+
when :uint
|
248
|
+
:uint64
|
249
|
+
when :float
|
250
|
+
:double
|
251
|
+
when :float32
|
252
|
+
:float
|
253
|
+
when :bool
|
254
|
+
:boolean
|
255
|
+
when :object
|
256
|
+
if @vectors[name].all? { |v| v.is_a?(String) }
|
257
|
+
:string
|
258
|
+
else
|
259
|
+
raise "Unknown type"
|
260
|
+
end
|
261
|
+
else
|
262
|
+
type
|
263
|
+
end
|
264
|
+
end
|
265
|
+
# TODO improve performance
|
266
|
+
raw_records = []
|
267
|
+
size.times do |i|
|
268
|
+
raw_records << @vectors.map { |_, v| v[i] }
|
269
|
+
end
|
270
|
+
table = Arrow::Table.new(schema, raw_records)
|
271
|
+
buffer = Arrow::ResizableBuffer.new(1024)
|
272
|
+
table.save(buffer, format: :parquet)
|
273
|
+
buffer.data.to_s
|
274
|
+
end
|
275
|
+
|
238
276
|
# for IRuby
|
239
277
|
def to_html
|
240
278
|
require "iruby"
|
@@ -301,7 +339,7 @@ module Rover
|
|
301
339
|
Group.new(self, columns.flatten)
|
302
340
|
end
|
303
341
|
|
304
|
-
[:max, :min, :median, :mean, :percentile, :sum].each do |name|
|
342
|
+
[:max, :min, :median, :mean, :percentile, :sum, :std, :var].each do |name|
|
305
343
|
define_method(name) do |column, *args|
|
306
344
|
check_column(column)
|
307
345
|
self[column].send(name, *args)
|
@@ -360,7 +398,7 @@ module Rover
|
|
360
398
|
def ==(other)
|
361
399
|
size == other.size &&
|
362
400
|
keys == other.keys &&
|
363
|
-
keys.all? { |k| self[k] == other[k] }
|
401
|
+
keys.all? { |k| self[k].to_numo == other[k].to_numo }
|
364
402
|
end
|
365
403
|
|
366
404
|
def plot(x = nil, y = nil, type: nil)
|
@@ -475,10 +513,12 @@ module Rover
|
|
475
513
|
|
476
514
|
left = how == "left"
|
477
515
|
|
516
|
+
types = {}
|
478
517
|
vectors = {}
|
479
518
|
keys = (self.keys + other.keys).uniq
|
480
519
|
keys.each do |k|
|
481
520
|
vectors[k] = []
|
521
|
+
types[k] = join_type(self.types[k], other.types[k])
|
482
522
|
end
|
483
523
|
|
484
524
|
each_row do |r|
|
@@ -498,7 +538,7 @@ module Rover
|
|
498
538
|
end
|
499
539
|
end
|
500
540
|
|
501
|
-
DataFrame.new(vectors)
|
541
|
+
DataFrame.new(vectors, types: types)
|
502
542
|
end
|
503
543
|
|
504
544
|
def check_join_keys(df, keys)
|
@@ -523,6 +563,19 @@ module Rover
|
|
523
563
|
end
|
524
564
|
end
|
525
565
|
|
566
|
+
def join_type(a, b)
|
567
|
+
if a.nil?
|
568
|
+
b
|
569
|
+
elsif b.nil?
|
570
|
+
a
|
571
|
+
elsif a == b
|
572
|
+
a
|
573
|
+
else
|
574
|
+
# TODO specify
|
575
|
+
nil
|
576
|
+
end
|
577
|
+
end
|
578
|
+
|
526
579
|
def to_vector(v, size: nil, type: nil)
|
527
580
|
if v.is_a?(Vector)
|
528
581
|
v = v.to(type) if type && v.type != type
|
data/lib/rover/group.rb
CHANGED
@@ -9,7 +9,7 @@ module Rover
|
|
9
9
|
Group.new(@df, @columns + columns.flatten)
|
10
10
|
end
|
11
11
|
|
12
|
-
[:count, :max, :min, :mean, :median, :percentile, :sum].each do |name|
|
12
|
+
[:count, :max, :min, :mean, :median, :percentile, :sum, :std, :var].each do |name|
|
13
13
|
define_method(name) do |*args|
|
14
14
|
n = [name, args.first].compact.join("_")
|
15
15
|
|
data/lib/rover/vector.rb
CHANGED
@@ -263,7 +263,11 @@ module Rover
|
|
263
263
|
end
|
264
264
|
|
265
265
|
def last(n = 1)
|
266
|
-
|
266
|
+
if n >= size
|
267
|
+
Vector.new(@data)
|
268
|
+
else
|
269
|
+
Vector.new(@data[-n..-1])
|
270
|
+
end
|
267
271
|
end
|
268
272
|
|
269
273
|
def take(n)
|
@@ -355,6 +359,7 @@ module Rover
|
|
355
359
|
data = data.to_a
|
356
360
|
|
357
361
|
if type
|
362
|
+
data = data.map { |v| v || Float::NAN } if [:float, :float32].include?(type)
|
358
363
|
data = numo_type.cast(data)
|
359
364
|
else
|
360
365
|
data =
|
data/lib/rover/version.rb
CHANGED
data/lib/rover.rb
CHANGED
@@ -19,6 +19,16 @@ module Rover
|
|
19
19
|
csv_to_df(CSV.parse(str, **csv_options(options)), types: types, headers: options[:headers])
|
20
20
|
end
|
21
21
|
|
22
|
+
def read_parquet(path, types: nil)
|
23
|
+
require "parquet"
|
24
|
+
parquet_to_df(Arrow::Table.load(path), types: types)
|
25
|
+
end
|
26
|
+
|
27
|
+
def parse_parquet(str, types: nil)
|
28
|
+
require "parquet"
|
29
|
+
parquet_to_df(Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet), types: types)
|
30
|
+
end
|
31
|
+
|
22
32
|
private
|
23
33
|
|
24
34
|
# TODO use date converter
|
@@ -35,10 +45,63 @@ module Rover
|
|
35
45
|
|
36
46
|
table.by_col!
|
37
47
|
data = {}
|
48
|
+
keys = table.map { |k, _| [k, true] }.to_h
|
49
|
+
unnamed_suffix = 1
|
38
50
|
table.each do |k, v|
|
51
|
+
# TODO do same for empty string in 0.3.0
|
52
|
+
if k.nil?
|
53
|
+
k = "unnamed"
|
54
|
+
while keys.include?(k)
|
55
|
+
unnamed_suffix += 1
|
56
|
+
k = "unnamed#{unnamed_suffix}"
|
57
|
+
end
|
58
|
+
keys[k] = true
|
59
|
+
end
|
39
60
|
data[k] = v
|
40
61
|
end
|
62
|
+
|
41
63
|
DataFrame.new(data, types: types)
|
42
64
|
end
|
65
|
+
|
66
|
+
PARQUET_TYPE_MAPPING = {
|
67
|
+
"bool" => Numo::Bit,
|
68
|
+
"float" => Numo::SFloat,
|
69
|
+
"double" => Numo::DFloat,
|
70
|
+
"int8" => Numo::Int8,
|
71
|
+
"int16" => Numo::Int16,
|
72
|
+
"int32" => Numo::Int32,
|
73
|
+
"int64" => Numo::Int64,
|
74
|
+
"string" => Numo::RObject,
|
75
|
+
"uint8" => Numo::UInt8,
|
76
|
+
"uint16" => Numo::UInt16,
|
77
|
+
"uint32" => Numo::UInt32,
|
78
|
+
"uint64" => Numo::UInt64
|
79
|
+
}
|
80
|
+
|
81
|
+
def parquet_to_df(table, types: nil)
|
82
|
+
data = {}
|
83
|
+
types ||= {}
|
84
|
+
table.each_column do |column|
|
85
|
+
k = column.field.name
|
86
|
+
if types[k]
|
87
|
+
data[k] = Vector.new(column.data.values, type: types[k])
|
88
|
+
else
|
89
|
+
type = column.field.data_type.to_s
|
90
|
+
numo_type = PARQUET_TYPE_MAPPING[type]
|
91
|
+
raise "Unknown type: #{type}" unless numo_type
|
92
|
+
|
93
|
+
# TODO automatic conversion?
|
94
|
+
# int => float
|
95
|
+
# bool => object
|
96
|
+
if (type.include?("int") || type == "bool") && column.n_nulls > 0
|
97
|
+
raise "Nulls not supported for #{type} column: #{k}"
|
98
|
+
end
|
99
|
+
|
100
|
+
# TODO improve performance
|
101
|
+
data[k] = numo_type.cast(column.data.values)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
DataFrame.new(data)
|
105
|
+
end
|
43
106
|
end
|
44
107
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rover-df
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.2.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-01-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -58,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
58
58
|
- !ruby/object:Gem::Version
|
59
59
|
version: '0'
|
60
60
|
requirements: []
|
61
|
-
rubygems_version: 3.
|
61
|
+
rubygems_version: 3.3.3
|
62
62
|
signing_key:
|
63
63
|
specification_version: 4
|
64
64
|
summary: Simple, powerful data frames for Ruby
|