rover-df 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +99 -5
- data/lib/rover.rb +14 -6
- data/lib/rover/data_frame.rb +83 -9
- data/lib/rover/group.rb +49 -0
- data/lib/rover/vector.rb +129 -21
- data/lib/rover/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b003d311b623fdd38fee4c6fa76129ad4bba042e8193c1872928cb05085daad3
|
4
|
+
data.tar.gz: d0c8c04b2a8aec3ea5b7616cbcda61f03a12c96fc8f9a0c7aa29fc898948b759
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8033d8ae9e5fb8c8e767ba68897d37745cc5d35a7a82cb2847e2b1d2c3adf8eeb406914cd47949d8f4c3f21307617ab550f435e7a5c257fe1187ed47dd943829
|
7
|
+
data.tar.gz: acfdca4ad081e2722c4b5269824de123d26aadab28532837d2bfc717c2ca263f73dc4335963beae654b24d4c17cebbefdcce761a946f391340a306c9ca2a8c9b
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
## 0.1.1 (2020-06-10)
|
2
|
+
|
3
|
+
- Added methods and options for types
|
4
|
+
- Added grouping
|
5
|
+
- Added one-hot encoding
|
6
|
+
- Added `sample` to data frames
|
7
|
+
- Added `tally`, `var`, `std`, `take`, `count`, and `length` to vectors
|
8
|
+
- Improved error message for `read_csv` with no headers
|
9
|
+
|
1
10
|
## 0.1.0 (2020-05-13)
|
2
11
|
|
3
12
|
- First release
|
data/README.md
CHANGED
@@ -2,7 +2,9 @@
|
|
2
2
|
|
3
3
|
Simple, powerful data frames for Ruby
|
4
4
|
|
5
|
-
:mountain: Designed for data exploration and machine learning, and powered by [Numo](https://github.com/ruby-numo/numo-narray)
|
5
|
+
:mountain: Designed for data exploration and machine learning, and powered by [Numo](https://github.com/ruby-numo/numo-narray)
|
6
|
+
|
7
|
+
[![Build Status](https://travis-ci.org/ankane/rover.svg?branch=master)](https://travis-ci.org/ankane/rover)
|
6
8
|
|
7
9
|
## Installation
|
8
10
|
|
@@ -16,12 +18,22 @@ gem 'rover-df'
|
|
16
18
|
|
17
19
|
A data frame is an in-memory table. It’s a useful data structure for data analysis and machine learning. It uses columnar storage for fast operations on columns.
|
18
20
|
|
21
|
+
Try it out for forecasting by clicking the button below:
|
22
|
+
|
23
|
+
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ankane/ml-stack/master?filepath=Forecasting.ipynb)
|
24
|
+
|
25
|
+
Use the `Run` button (or `SHIFT` + `ENTER`) to run each line.
|
26
|
+
|
19
27
|
## Creating Data Frames
|
20
28
|
|
21
29
|
From an array
|
22
30
|
|
23
31
|
```ruby
|
24
|
-
Rover::DataFrame.new([
|
32
|
+
Rover::DataFrame.new([
|
33
|
+
{a: 1, b: "one"},
|
34
|
+
{a: 2, b: "two"},
|
35
|
+
{a: 3, b: "three"}
|
36
|
+
])
|
25
37
|
```
|
26
38
|
|
27
39
|
From a hash
|
@@ -33,7 +45,7 @@ Rover::DataFrame.new({
|
|
33
45
|
})
|
34
46
|
```
|
35
47
|
|
36
|
-
From
|
48
|
+
From Active Record
|
37
49
|
|
38
50
|
```ruby
|
39
51
|
Rover::DataFrame.new(User.all)
|
@@ -75,6 +87,8 @@ Select a column
|
|
75
87
|
df[:a]
|
76
88
|
```
|
77
89
|
|
90
|
+
> Note that strings and symbols are different keys, just like hashes
|
91
|
+
|
78
92
|
Select multiple columns
|
79
93
|
|
80
94
|
```ruby
|
@@ -158,13 +172,39 @@ df[:a].min
|
|
158
172
|
df[:a].max
|
159
173
|
```
|
160
174
|
|
175
|
+
Count occurrences
|
176
|
+
|
177
|
+
```ruby
|
178
|
+
df[:a].tally
|
179
|
+
```
|
180
|
+
|
161
181
|
Cross tabulation
|
162
182
|
|
163
183
|
```ruby
|
164
184
|
df[:a].crosstab(df[:b])
|
165
185
|
```
|
166
186
|
|
167
|
-
##
|
187
|
+
## Grouping
|
188
|
+
|
189
|
+
Group
|
190
|
+
|
191
|
+
```ruby
|
192
|
+
df.group(:a).count
|
193
|
+
```
|
194
|
+
|
195
|
+
Works with all summary statistics
|
196
|
+
|
197
|
+
```ruby
|
198
|
+
df.group(:a).max(:b)
|
199
|
+
```
|
200
|
+
|
201
|
+
Multiple groups
|
202
|
+
|
203
|
+
```ruby
|
204
|
+
df.group([:a, :b]).count
|
205
|
+
```
|
206
|
+
|
207
|
+
## Updating Data
|
168
208
|
|
169
209
|
Add a new column
|
170
210
|
|
@@ -214,7 +254,7 @@ Rename a column
|
|
214
254
|
df[:new_a] = df.delete(:a)
|
215
255
|
```
|
216
256
|
|
217
|
-
Sort
|
257
|
+
Sort rows
|
218
258
|
|
219
259
|
```ruby
|
220
260
|
df.sort_by! { |r| r[:a] }
|
@@ -258,6 +298,20 @@ Left join
|
|
258
298
|
df.left_join(other_df)
|
259
299
|
```
|
260
300
|
|
301
|
+
## Encoding
|
302
|
+
|
303
|
+
One-hot encoding
|
304
|
+
|
305
|
+
```ruby
|
306
|
+
df.one_hot
|
307
|
+
```
|
308
|
+
|
309
|
+
Drop a variable in each category to avoid the dummy variable trap
|
310
|
+
|
311
|
+
```ruby
|
312
|
+
df.one_hot(drop: true)
|
313
|
+
```
|
314
|
+
|
261
315
|
## Conversion
|
262
316
|
|
263
317
|
Array of hashes
|
@@ -284,6 +338,46 @@ CSV
|
|
284
338
|
df.to_csv
|
285
339
|
```
|
286
340
|
|
341
|
+
## Types
|
342
|
+
|
343
|
+
Pass column types when creating a data frame
|
344
|
+
|
345
|
+
```ruby
|
346
|
+
Rover::DataFrame.new(data, types: {"a" => :int, "b" => :float})
|
347
|
+
```
|
348
|
+
|
349
|
+
Or
|
350
|
+
|
351
|
+
```ruby
|
352
|
+
Rover.read_csv("data.csv", types: {"a" => :int, "b" => :float})
|
353
|
+
```
|
354
|
+
|
355
|
+
Supported types are:
|
356
|
+
|
357
|
+
- boolean - `bool`
|
358
|
+
- float - `float`, `float32`
|
359
|
+
- integer - `int`, `int32`, `int16`, `int8`
|
360
|
+
- unsigned integer - `uint`, `uint32`, `uint16`, `uint8`
|
361
|
+
- object - `object`
|
362
|
+
|
363
|
+
Get column types
|
364
|
+
|
365
|
+
```ruby
|
366
|
+
df.types
|
367
|
+
```
|
368
|
+
|
369
|
+
For a specific column
|
370
|
+
|
371
|
+
```ruby
|
372
|
+
df[:a].type
|
373
|
+
```
|
374
|
+
|
375
|
+
Change the type of a column
|
376
|
+
|
377
|
+
```ruby
|
378
|
+
df[:a] = df[:a].to(:int)
|
379
|
+
```
|
380
|
+
|
287
381
|
## History
|
288
382
|
|
289
383
|
View the [changelog](https://github.com/ankane/rover/blob/master/CHANGELOG.md)
|
data/lib/rover.rb
CHANGED
@@ -3,30 +3,38 @@ require "numo/narray"
|
|
3
3
|
|
4
4
|
# modules
|
5
5
|
require "rover/data_frame"
|
6
|
+
require "rover/group"
|
6
7
|
require "rover/vector"
|
7
8
|
require "rover/version"
|
8
9
|
|
9
10
|
module Rover
|
10
11
|
class << self
|
11
|
-
def read_csv(path, **options)
|
12
|
+
def read_csv(path, types: nil, **options)
|
12
13
|
require "csv"
|
13
|
-
csv_to_df(CSV.read(path,
|
14
|
+
csv_to_df(CSV.read(path, **csv_options(options)), types: types)
|
14
15
|
end
|
15
16
|
|
16
|
-
def parse_csv(str, **options)
|
17
|
+
def parse_csv(str, types: nil, **options)
|
17
18
|
require "csv"
|
18
|
-
csv_to_df(CSV.parse(str,
|
19
|
+
csv_to_df(CSV.parse(str, **csv_options(options)), types: types)
|
19
20
|
end
|
20
21
|
|
21
22
|
private
|
22
23
|
|
23
|
-
|
24
|
+
# TODO use date converter
|
25
|
+
def csv_options(options)
|
26
|
+
options = {headers: true, converters: :numeric}.merge(options)
|
27
|
+
raise ArgumentError, "Must specify headers" unless options[:headers]
|
28
|
+
options
|
29
|
+
end
|
30
|
+
|
31
|
+
def csv_to_df(table, types: nil)
|
24
32
|
table.by_col!
|
25
33
|
data = {}
|
26
34
|
table.each do |k, v|
|
27
35
|
data[k] = v
|
28
36
|
end
|
29
|
-
DataFrame.new(data)
|
37
|
+
DataFrame.new(data, types: types)
|
30
38
|
end
|
31
39
|
end
|
32
40
|
end
|
data/lib/rover/data_frame.rb
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
module Rover
|
2
2
|
class DataFrame
|
3
|
-
def initialize(
|
3
|
+
def initialize(*args)
|
4
|
+
data, options = process_args(args)
|
5
|
+
|
4
6
|
@vectors = {}
|
7
|
+
types = options[:types] || {}
|
5
8
|
|
6
9
|
if data.is_a?(DataFrame)
|
7
10
|
data.vectors.each do |k, v|
|
@@ -11,7 +14,7 @@ module Rover
|
|
11
14
|
data.to_h.each do |k, v|
|
12
15
|
@vectors[k] =
|
13
16
|
if v.respond_to?(:to_a)
|
14
|
-
Vector.new(v)
|
17
|
+
Vector.new(v, type: types[k])
|
15
18
|
else
|
16
19
|
v
|
17
20
|
end
|
@@ -20,7 +23,7 @@ module Rover
|
|
20
23
|
# handle scalars
|
21
24
|
size = @vectors.values.find { |v| v.is_a?(Vector) }&.size || 1
|
22
25
|
@vectors.each_key do |k|
|
23
|
-
@vectors[k] = to_vector(@vectors[k], size)
|
26
|
+
@vectors[k] = to_vector(@vectors[k], size: size, type: types[k])
|
24
27
|
end
|
25
28
|
elsif data.is_a?(Array)
|
26
29
|
vectors = {}
|
@@ -35,12 +38,12 @@ module Rover
|
|
35
38
|
end
|
36
39
|
end
|
37
40
|
vectors.each do |k, v|
|
38
|
-
@vectors[k] = to_vector(v)
|
41
|
+
@vectors[k] = to_vector(v, type: types[k])
|
39
42
|
end
|
40
43
|
elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
|
41
44
|
result = data.connection.select_all(data.all.to_sql)
|
42
45
|
result.columns.each_with_index do |k, i|
|
43
|
-
@vectors[k] = to_vector(result.rows.map { |r| r[i] })
|
46
|
+
@vectors[k] = to_vector(result.rows.map { |r| r[i] }, type: types[k])
|
44
47
|
end
|
45
48
|
else
|
46
49
|
raise ArgumentError, "Cannot cast to data frame: #{data.class.name}"
|
@@ -90,9 +93,13 @@ module Rover
|
|
90
93
|
@vectors.dup
|
91
94
|
end
|
92
95
|
|
96
|
+
def types
|
97
|
+
@vectors.map { |k, v| [k, v.type] }.to_h
|
98
|
+
end
|
99
|
+
|
93
100
|
def []=(k, v)
|
94
101
|
check_key(k)
|
95
|
-
v = to_vector(v, size)
|
102
|
+
v = to_vector(v, size: size)
|
96
103
|
raise ArgumentError, "Size mismatch: expected #{size}, got #{v.size}" if @vectors.any? && v.size != size
|
97
104
|
@vectors[k] = v
|
98
105
|
end
|
@@ -170,6 +177,12 @@ module Rover
|
|
170
177
|
DataFrame.new(new_vectors)
|
171
178
|
end
|
172
179
|
|
180
|
+
def sample(*args, **kwargs)
|
181
|
+
# TODO make more efficient
|
182
|
+
indexes = (0...size).to_a.sample(*args, **kwargs)
|
183
|
+
self[indexes]
|
184
|
+
end
|
185
|
+
|
173
186
|
def to_a
|
174
187
|
a = []
|
175
188
|
each_row do |row|
|
@@ -190,6 +203,25 @@ module Rover
|
|
190
203
|
Numo::NArray.column_stack(vectors.values.map(&:to_numo))
|
191
204
|
end
|
192
205
|
|
206
|
+
# TODO raise error when collision
|
207
|
+
def one_hot(drop: false)
|
208
|
+
df = DataFrame.new
|
209
|
+
vectors.each do |k, v|
|
210
|
+
if v.to_numo.is_a?(Numo::RObject)
|
211
|
+
df.merge!(v.one_hot(drop: drop, prefix: "#{k}_"))
|
212
|
+
else
|
213
|
+
df[k] = v
|
214
|
+
end
|
215
|
+
end
|
216
|
+
df
|
217
|
+
rescue ArgumentError => e
|
218
|
+
if e.message == "All elements must be strings"
|
219
|
+
# better error message
|
220
|
+
raise ArgumentError, "All elements must be numeric or strings"
|
221
|
+
end
|
222
|
+
raise e
|
223
|
+
end
|
224
|
+
|
193
225
|
def to_csv
|
194
226
|
require "csv"
|
195
227
|
CSV.generate do |csv|
|
@@ -258,6 +290,17 @@ module Rover
|
|
258
290
|
dup.sort_by!(&block)
|
259
291
|
end
|
260
292
|
|
293
|
+
def group(columns)
|
294
|
+
Group.new(self, columns)
|
295
|
+
end
|
296
|
+
|
297
|
+
[:max, :min, :median, :mean, :percentile, :sum].each do |name|
|
298
|
+
define_method(name) do |column, *args|
|
299
|
+
check_column(column)
|
300
|
+
self[column].send(name, *args)
|
301
|
+
end
|
302
|
+
end
|
303
|
+
|
261
304
|
def dup
|
262
305
|
df = DataFrame.new
|
263
306
|
@vectors.each do |k, v|
|
@@ -375,8 +418,15 @@ module Rover
|
|
375
418
|
raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
|
376
419
|
end
|
377
420
|
|
378
|
-
def
|
379
|
-
|
421
|
+
def check_column(key)
|
422
|
+
raise ArgumentError, "Missing column: #{key}" unless include?(key)
|
423
|
+
end
|
424
|
+
|
425
|
+
def to_vector(v, size: nil, type: nil)
|
426
|
+
if v.is_a?(Vector)
|
427
|
+
v = v.to(type) if type && v.type != type
|
428
|
+
return v
|
429
|
+
end
|
380
430
|
|
381
431
|
if size && !v.respond_to?(:to_a)
|
382
432
|
v =
|
@@ -392,7 +442,31 @@ module Rover
|
|
392
442
|
end
|
393
443
|
end
|
394
444
|
|
395
|
-
Vector.new(v)
|
445
|
+
Vector.new(v, type: type)
|
446
|
+
end
|
447
|
+
|
448
|
+
# can't use data = {} and keyword arguments
|
449
|
+
# as this causes an unknown keyword error when data is passed as
|
450
|
+
# DataFrame.new({a: ..., b: ...})
|
451
|
+
#
|
452
|
+
# at the moment, there doesn't appear to be a way to distinguish between
|
453
|
+
# DataFrame.new({types: ...}) which should set data, and
|
454
|
+
# DataFrame.new(types: ...) which should set options
|
455
|
+
# https://bugs.ruby-lang.org/issues/16891
|
456
|
+
#
|
457
|
+
# there aren't currently options that should be used without data
|
458
|
+
# if this is ever the case, we should still require data
|
459
|
+
# to prevent new options from breaking existing code
|
460
|
+
def process_args(args)
|
461
|
+
data = args[0] || {}
|
462
|
+
options = args.size > 1 && args.last.is_a?(Hash) ? args.pop : {}
|
463
|
+
raise ArgumentError, "wrong number of arguments (given #{args.size}, expected 0..1)" if args.size > 1
|
464
|
+
|
465
|
+
known_keywords = [:types]
|
466
|
+
unknown_keywords = options.keys - known_keywords
|
467
|
+
raise ArgumentError, "unknown keywords: #{unknown_keywords.join(", ")}" if unknown_keywords.any?
|
468
|
+
|
469
|
+
[data, options]
|
396
470
|
end
|
397
471
|
end
|
398
472
|
end
|
data/lib/rover/group.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
module Rover
|
2
|
+
class Group
|
3
|
+
def initialize(df, columns)
|
4
|
+
@df = df
|
5
|
+
@columns = Array(columns)
|
6
|
+
end
|
7
|
+
|
8
|
+
[:count, :max, :min, :mean, :median, :percentile, :sum].each do |name|
|
9
|
+
define_method(name) do |*args|
|
10
|
+
result = {}
|
11
|
+
grouped_dfs.each do |k, df|
|
12
|
+
result[k] = df.send(name, *args)
|
13
|
+
end
|
14
|
+
result
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
# TODO make more efficient
|
21
|
+
def grouped_dfs
|
22
|
+
# cache here so we can reuse for multiple calculations if needed
|
23
|
+
@grouped_dfs ||= begin
|
24
|
+
raise ArgumentError, "No columns given" if @columns.empty?
|
25
|
+
missing_keys = @columns - @df.keys
|
26
|
+
raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
|
27
|
+
|
28
|
+
groups = Hash.new { |hash, key| hash[key] = [] }
|
29
|
+
if @columns.size == 1
|
30
|
+
@df[@columns.first].each_with_index do |v, i|
|
31
|
+
groups[v] << i
|
32
|
+
end
|
33
|
+
else
|
34
|
+
i = 0
|
35
|
+
@df.each_row do |row|
|
36
|
+
groups[@columns.map { |c| row[c] }] << i
|
37
|
+
i += 1
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
result = {}
|
42
|
+
groups.each do |k, indexes|
|
43
|
+
result[k] = @df[indexes]
|
44
|
+
end
|
45
|
+
result
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
data/lib/rover/vector.rb
CHANGED
@@ -1,27 +1,39 @@
|
|
1
1
|
module Rover
|
2
2
|
class Vector
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
3
|
+
# if a user never specifies types,
|
4
|
+
# the defaults are bool, float, int, and object
|
5
|
+
# keep these simple
|
6
|
+
#
|
7
|
+
# we could create aliases for float64, int64, uint64
|
8
|
+
# if so, type should still return the simple type
|
9
|
+
TYPE_CAST_MAPPING = {
|
10
|
+
bool: Numo::Bit,
|
11
|
+
float32: Numo::SFloat,
|
12
|
+
float: Numo::DFloat,
|
13
|
+
int8: Numo::Int8,
|
14
|
+
int16: Numo::Int16,
|
15
|
+
int32: Numo::Int32,
|
16
|
+
int: Numo::Int64,
|
17
|
+
object: Numo::RObject,
|
18
|
+
uint8: Numo::UInt8,
|
19
|
+
uint16: Numo::UInt16,
|
20
|
+
uint32: Numo::UInt32,
|
21
|
+
uint: Numo::UInt64
|
22
|
+
}
|
23
|
+
|
24
|
+
def initialize(data, type: nil)
|
25
|
+
@data = cast_data(data, type: type)
|
22
26
|
raise ArgumentError, "Bad size: #{@data.shape}" unless @data.ndim == 1
|
23
27
|
end
|
24
28
|
|
29
|
+
def type
|
30
|
+
TYPE_CAST_MAPPING.find { |_, v| @data.is_a?(v) }[0]
|
31
|
+
end
|
32
|
+
|
33
|
+
def to(type)
|
34
|
+
Vector.new(self, type: type)
|
35
|
+
end
|
36
|
+
|
25
37
|
def to_numo
|
26
38
|
@data
|
27
39
|
end
|
@@ -35,6 +47,8 @@ module Rover
|
|
35
47
|
def size
|
36
48
|
@data.size
|
37
49
|
end
|
50
|
+
alias_method :length, :size
|
51
|
+
alias_method :count, :size
|
38
52
|
|
39
53
|
def uniq
|
40
54
|
Vector.new(@data.to_a.uniq)
|
@@ -148,6 +162,15 @@ module Rover
|
|
148
162
|
Vector.new(mapped)
|
149
163
|
end
|
150
164
|
|
165
|
+
def tally
|
166
|
+
result = Hash.new(0)
|
167
|
+
@data.each do |v|
|
168
|
+
result[v] += 1
|
169
|
+
end
|
170
|
+
result.default = nil
|
171
|
+
result
|
172
|
+
end
|
173
|
+
|
151
174
|
def sort
|
152
175
|
Vector.new(@data.respond_to?(:sort) ? @data.sort : @data.to_a.sort)
|
153
176
|
end
|
@@ -157,7 +180,11 @@ module Rover
|
|
157
180
|
end
|
158
181
|
|
159
182
|
def each(&block)
|
160
|
-
|
183
|
+
@data.each(&block)
|
184
|
+
end
|
185
|
+
|
186
|
+
def each_with_index(&block)
|
187
|
+
@data.each_with_index(&block)
|
161
188
|
end
|
162
189
|
|
163
190
|
def max
|
@@ -176,7 +203,7 @@ module Rover
|
|
176
203
|
|
177
204
|
def median
|
178
205
|
# need to cast to get correct result
|
179
|
-
#
|
206
|
+
# https://github.com/ruby-numo/numo-narray/issues/165
|
180
207
|
@data.cast_to(Numo::DFloat).median
|
181
208
|
end
|
182
209
|
|
@@ -188,6 +215,16 @@ module Rover
|
|
188
215
|
@data.sum
|
189
216
|
end
|
190
217
|
|
218
|
+
# uses Bessel's correction for now since that's all Numo supports
|
219
|
+
def std
|
220
|
+
@data.cast_to(Numo::DFloat).stddev
|
221
|
+
end
|
222
|
+
|
223
|
+
# uses Bessel's correction for now since that's all Numo supports
|
224
|
+
def var
|
225
|
+
@data.cast_to(Numo::DFloat).var
|
226
|
+
end
|
227
|
+
|
191
228
|
def all?(&block)
|
192
229
|
@data.to_a.all?(&block)
|
193
230
|
end
|
@@ -208,6 +245,11 @@ module Rover
|
|
208
245
|
Vector.new(@data[-n..-1])
|
209
246
|
end
|
210
247
|
|
248
|
+
def take(n)
|
249
|
+
raise ArgumentError, "attempt to take negative size" if n < 0
|
250
|
+
first(n)
|
251
|
+
end
|
252
|
+
|
211
253
|
def crosstab(other)
|
212
254
|
index = uniq.sort
|
213
255
|
index_pos = index.to_a.map.with_index.to_h
|
@@ -231,6 +273,20 @@ module Rover
|
|
231
273
|
last(n)
|
232
274
|
end
|
233
275
|
|
276
|
+
def one_hot(drop: false, prefix: nil)
|
277
|
+
raise ArgumentError, "All elements must be strings" unless all? { |vi| vi.is_a?(String) }
|
278
|
+
|
279
|
+
new_vectors = {}
|
280
|
+
# maybe sort values first
|
281
|
+
values = uniq.to_a
|
282
|
+
values.shift if drop
|
283
|
+
values.each do |v2|
|
284
|
+
# TODO use types
|
285
|
+
new_vectors["#{prefix}#{v2}"] = (self == v2).to_numo.cast_to(Numo::Int64)
|
286
|
+
end
|
287
|
+
DataFrame.new(new_vectors)
|
288
|
+
end
|
289
|
+
|
234
290
|
# TODO add type and size?
|
235
291
|
def inspect
|
236
292
|
elements = first(5).to_a.map(&:inspect)
|
@@ -244,5 +300,57 @@ module Rover
|
|
244
300
|
require "iruby"
|
245
301
|
IRuby::HTML.table(to_a)
|
246
302
|
end
|
303
|
+
|
304
|
+
private
|
305
|
+
|
306
|
+
def cast_data(data, type: nil)
|
307
|
+
numo_type = numo_type(type) if type
|
308
|
+
|
309
|
+
data = data.to_numo if data.is_a?(Vector)
|
310
|
+
|
311
|
+
if data.is_a?(Numo::NArray)
|
312
|
+
raise ArgumentError, "Complex types not supported yet" if data.is_a?(Numo::DComplex) || data.is_a?(Numo::SComplex)
|
313
|
+
|
314
|
+
if type
|
315
|
+
case type
|
316
|
+
when /int/
|
317
|
+
# Numo does not check these when casting
|
318
|
+
raise RangeError, "float NaN out of range of integer" if data.respond_to?(:isnan) && data.isnan.any?
|
319
|
+
raise RangeError, "float Inf out of range of integer" if data.respond_to?(:isinf) && data.isinf.any?
|
320
|
+
|
321
|
+
data = data.to_a.map { |v| v.nil? ? nil : v.to_i } if data.is_a?(Numo::RObject)
|
322
|
+
when /float/
|
323
|
+
data = data.to_a.map { |v| v.nil? ? Float::NAN : v.to_f } if data.is_a?(Numo::RObject)
|
324
|
+
end
|
325
|
+
|
326
|
+
data = numo_type.cast(data)
|
327
|
+
end
|
328
|
+
else
|
329
|
+
data = data.to_a
|
330
|
+
|
331
|
+
if type
|
332
|
+
data = numo_type.cast(data)
|
333
|
+
else
|
334
|
+
data =
|
335
|
+
if data.all? { |v| v.is_a?(Integer) }
|
336
|
+
Numo::Int64.cast(data)
|
337
|
+
elsif data.all? { |v| v.is_a?(Numeric) || v.nil? }
|
338
|
+
Numo::DFloat.cast(data.map { |v| v || Float::NAN })
|
339
|
+
elsif data.all? { |v| v == true || v == false }
|
340
|
+
Numo::Bit.cast(data)
|
341
|
+
else
|
342
|
+
Numo::RObject.cast(data)
|
343
|
+
end
|
344
|
+
end
|
345
|
+
end
|
346
|
+
|
347
|
+
data
|
348
|
+
end
|
349
|
+
|
350
|
+
def numo_type(type)
|
351
|
+
numo_type = TYPE_CAST_MAPPING[type]
|
352
|
+
raise ArgumentError, "Invalid type: #{type}" unless numo_type
|
353
|
+
numo_type
|
354
|
+
end
|
247
355
|
end
|
248
356
|
end
|
data/lib/rover/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rover-df
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-06-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: numo-narray
|
@@ -120,6 +120,7 @@ files:
|
|
120
120
|
- lib/rover-df.rb
|
121
121
|
- lib/rover.rb
|
122
122
|
- lib/rover/data_frame.rb
|
123
|
+
- lib/rover/group.rb
|
123
124
|
- lib/rover/vector.rb
|
124
125
|
- lib/rover/version.rb
|
125
126
|
homepage: https://github.com/ankane/rover
|